10.3.6 Hands-On Practice: Solving a Grid-World Problem with Sampling-Based Planning
The following example applies reinforcement learning and sampling-based planning to a simple grid-world (GridWorld) problem. It implements and demonstrates several algorithms: value iteration, policy iteration, Monte Carlo tree search (MCTS), model predictive control (MPC), PI² (Policy Improvement with Path Integrals), and sampling-based value-function estimation (the Monte Carlo method and temporal-difference learning, TD(0)).
Example 10-1: Solving a grid-world problem with sampling-based planning (source path: codes\10\Gui.py)
The implementation in the file Gui.py proceeds as follows.
(1) Define the class GridWorld, which implements the grid-world environment: its states, actions, goal, obstacles, and reward scheme. It provides the state-transition and reward logic in step(), plus a render_policy() helper for visualizing a policy and a value function.
# dependencies used throughout Gui.py
import random
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt

# ======================== 1. Environment definition ========================
class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.states = range(size * size)
        self.actions = [0, 1, 2, 3]        # up, down, left, right
        self.target = (size - 1, size - 1) # goal cell (4, 4)
        self.walls = [(1, 1), (2, 2)]      # obstacle cells
        self.gamma = 0.9                   # discount factor

    def step(self, state, action):
        x, y = state // self.size, state % self.size
        # apply the action, clipping at the grid border
        if action == 0:
            x = max(x - 1, 0)              # up
        elif action == 1:
            x = min(x + 1, self.size - 1)  # down
        elif action == 2:
            y = max(y - 1, 0)              # left
        elif action == 3:
            y = min(y + 1, self.size - 1)  # right
        next_state = x * self.size + y
        # reward and termination check
        if (x, y) in self.walls:
            return next_state, -1.0, True  # hit an obstacle
        elif (x, y) == self.target:
            return next_state, 1.0, True   # reached the goal
        else:
            return next_state, -0.1, False # ordinary move

    def reset(self):
        return 0                           # initial state, cell (0, 0)

    def render_policy(self, policy, values=None):
        """Visualize a policy and, optionally, a value function."""
        size = self.size
        fig, ax = plt.subplots(figsize=(size, size))
        ax.set_xlim(0, size)               # make sure the full grid is visible
        ax.set_ylim(0, size)
        ax.set_xticks(np.arange(0.5, size, 1))
        ax.set_yticks(np.arange(0.5, size, 1))
        ax.grid(which='both', color='black', linewidth=2)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        # draw the obstacles and the goal
        for (x, y) in self.walls:
            rect = plt.Rectangle((y, size - 1 - x), 1, 1, color='gray')
            ax.add_patch(rect)
        tx, ty = self.target
        rect = plt.Rectangle((ty, size - 1 - tx), 1, 1, color='green')
        ax.add_patch(rect)
        # draw the value function
        if values is not None:
            for i in range(size):
                for j in range(size):
                    state = i * size + j
                    ax.text(j + 0.5, size - 1 - i + 0.2, f'{values[state]:.2f}',
                            ha='center', va='center', fontsize=12)
        # draw the policy as arrows
        arrow_map = {0: '↑', 1: '↓', 2: '←', 3: '→'}
        for i in range(size):
            for j in range(size):
                state = i * size + j
                if (i, j) not in self.walls and (i, j) != self.target:
                    ax.text(j + 0.5, size - 1 - i - 0.2, arrow_map[policy[state]],
                            ha='center', va='center', fontsize=16, color='red')
        plt.show()
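Before moving on, it can help to sanity-check the environment's conventions. The short snippet below is illustrative only (it is not part of Gui.py) and relies solely on the GridWorld class above, whose flattened state index is state = x * size + y; it steps right, right, down, down from the start cell and stops when the episode ends.
env = GridWorld(size=5)
state = env.reset()                       # state 0, i.e. cell (0, 0)
for action in [3, 3, 1, 1]:               # right, right, down, down
    next_state, reward, done = env.step(state, action)
    x, y = next_state // env.size, next_state % env.size
    print(f"action {action}: {state} -> {next_state} (cell ({x}, {y})), reward {reward}, done {done}")
    state = next_state
    if done:                              # the last step lands on the obstacle (2, 2)
        break
The final transition lands on the obstacle at (2, 2), producing the -1.0 reward and done=True that the planners below must learn to avoid.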
(2) Define the functions value_iteration and policy_iteration, which implement the two classic dynamic-programming planners. Value iteration repeatedly applies the Bellman optimality backup to solve for the optimal value function directly and then extracts the greedy policy, while policy iteration alternates between policy evaluation and policy improvement. Both are used to compute the optimal policy and value function for the grid world.
# ======================== 2. Baseline planning: value iteration and policy iteration ========================
def value_iteration(env, max_iter=100, theta=1e-4):
    """Value iteration."""
    values = np.zeros(len(env.states))
    for _ in range(max_iter):
        delta = 0
        for s in env.states:
            old_v = values[s]
            # Bellman optimality backup: evaluate every action once and
            # stop bootstrapping at terminal transitions
            q_values = []
            for a in env.actions:
                next_s, r, done = env.step(s, a)
                q_values.append(r + (0.0 if done else env.gamma * values[next_s]))
            values[s] = max(q_values)
            delta = max(delta, abs(old_v - values[s]))
        if delta < theta:
            break
    # extract the greedy (optimal) policy from the converged values
    policy = np.zeros(len(env.states), dtype=int)
    for s in env.states:
        q_values = []
        for a in env.actions:
            next_s, r, done = env.step(s, a)
            q_values.append(r + (0.0 if done else env.gamma * values[next_s]))
        policy[s] = int(np.argmax(q_values))
    return values, policy

def policy_iteration(env, max_iter=100):
    """Policy iteration."""
    # initialize with a random deterministic policy
    policy = np.random.choice(env.actions, size=len(env.states))
    for _ in range(max_iter):
        # 1. policy evaluation
        values = np.zeros(len(env.states))
        while True:
            delta = 0
            for s in env.states:
                old_v = values[s]
                next_s, r, done = env.step(s, policy[s])
                values[s] = r + (0.0 if done else env.gamma * values[next_s])
                delta = max(delta, abs(old_v - values[s]))
            if delta < 1e-4:
                break
        # 2. policy improvement
        new_policy = np.zeros(len(env.states), dtype=int)
        for s in env.states:
            q_values = []
            for a in env.actions:
                next_s, r, done = env.step(s, a)
                q_values.append(r + (0.0 if done else env.gamma * values[next_s]))
            new_policy[s] = int(np.argmax(q_values))
        if np.all(policy == new_policy):
            break
        policy = new_policy
    return values, policy
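As an optional consistency check (a sketch, not part of the book's source file), the two dynamic-programming planners can be run side by side; on this small deterministic grid they should produce matching greedy policies on nearly every state, up to ties in the argmax, and value functions that are typically close.
env = GridWorld(size=5)
vi_values, vi_policy = value_iteration(env)
pi_values, pi_policy = policy_iteration(env)
agreement = int(np.sum(vi_policy == pi_policy))      # states where both planners pick the same action
print(f"policies agree on {agreement}/{len(env.states)} states")
print(f"largest value difference: {np.max(np.abs(vi_values - pi_values)):.4f}")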
(3) Define the class MCTSNode and the function mcts, which implement Monte Carlo tree search (MCTS). MCTS is a sampling-based planning method that improves its action choice by repeating four phases: selection, expansion, simulation (rollout), and backpropagation. It is well suited to decision making in complex environments and can find a good action within a limited search budget.
# ======================== 3. Sampling-based planning: core MCTS logic ========================
class MCTSNode:
    def __init__(self, state, parent=None):
        self.state = state
        self.parent = parent
        self.children = {}   # action -> MCTSNode
        self.visits = 0
        self.value = 0.0

def mcts(env, root_state, iterations=100, C=1.414):
    """Monte Carlo tree search."""
    root = MCTSNode(root_state)

    def uct(node):
        """Upper Confidence Bound for Trees: mean value plus exploration bonus."""
        if node.visits == 0:
            return float('inf')
        return (node.value / node.visits) + C * np.sqrt(np.log(node.parent.visits) / node.visits)

    def rollout(node):
        """Random simulation (rollout) until the episode terminates."""
        state = node.state
        total_reward = 0
        done = False
        while not done:
            action = random.choice(env.actions)
            state, reward, done = env.step(state, action)
            total_reward += reward
        return total_reward

    for _ in range(iterations):
        node = root
        done = False
        last_reward = 0.0
        # 1. Selection: descend the tree along the highest-UCT children
        while node.children and not done:
            best_action = max(node.children, key=lambda a: uct(node.children[a]))
            _, last_reward, done = env.step(node.state, best_action)
            node = node.children[best_action]
        # 2. Expansion: add children for the unexpanded actions of a leaf
        if not done and not node.children:
            for a in env.actions:
                next_s, _, _ = env.step(node.state, a)
                # skip actions whose successor is already represented by a child
                if next_s not in [child.state for child in node.children.values()]:
                    node.children[a] = MCTSNode(next_s, parent=node)
            # if children were added, pick one and continue from it
            if node.children:
                # prefer an unvisited child, otherwise pick by UCT
                unvisited_children = [a for a, child in node.children.items() if child.visits == 0]
                if unvisited_children:
                    action = random.choice(unvisited_children)
                else:
                    action = max(node.children, key=lambda a: uct(node.children[a]))
                _, last_reward, done = env.step(node.state, action)
                node = node.children[action]
        # 3. Simulation: estimate the leaf's value with a random rollout
        if not done:
            reward = rollout(node)
        else:
            # terminal state: no rollout needed, use the reward of the final transition
            reward = last_reward
        # 4. Backpropagation: propagate the result back up to the root
        while node is not None:
            node.visits += 1
            node.value += reward
            node = node.parent

    # return the action whose child was visited most often
    if root.children:
        return max(root.children, key=lambda a: root.children[a].visits)
    else:
        # the root has no children: fall back to a random action
        return random.choice(env.actions)
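To get a feel for the search output, the following sketch (illustrative only, reusing the mcts function above) asks for a recommended action from a few hand-picked non-terminal states; because the rollouts are random, repeated runs can recommend different but similarly reasonable actions.
env = GridWorld(size=5)
action_names = ['up', 'down', 'left', 'right']
for start in [0, 3, 10]:                  # cells (0, 0), (0, 3) and (2, 0)
    x, y = start // env.size, start % env.size
    recommended = mcts(env, start, iterations=300)
    print(f"MCTS from cell ({x}, {y}) recommends: {action_names[recommended]}")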
(4) Define the class MPC, which implements model predictive control (MPC). MPC is a model-based optimization method: at every step it optimizes a short sequence of future actions against a cost function, executes only the first action, and then re-plans, which lets it adjust the control strategy online in a changing environment.
# ======================== 4. Model predictive control (MPC) demo ========================
class MPC:
    def __init__(self, env, horizon=7):    # planning horizon (number of look-ahead steps)
        self.env = env
        self.horizon = horizon

    def cost(self, actions, initial_state):
        """Cost of an action sequence: negative rewards + distance-to-goal penalty + turning penalty."""
        state = initial_state
        total_cost = 0
        prev_action = None
        for t in range(self.horizon):
            action = actions[t] if t < len(actions) else 0
            next_s, reward, done = self.env.step(state, action)
            # penalize the Manhattan distance to the goal
            x, y = next_s // self.env.size, next_s % self.env.size
            tx, ty = self.env.target
            distance_penalty = (abs(x - tx) + abs(y - ty)) * 0.5
            # penalize changes of direction (discourage frequent turning)
            direction_penalty = 0
            if prev_action is not None and action != prev_action:
                direction_penalty = 0.2
            total_cost += (-reward) + distance_penalty + direction_penalty
            if done:
                break
            state = next_s
            prev_action = action
        return total_cost

    def optimize(self, initial_state):
        """Exhaustively search the action sequences and return the first action of the cheapest one."""
        best_cost = float('inf')
        best_action = 0
        from itertools import product
        # reorder the action space so that promising actions are tried first
        # (cost ties are then broken in favor of moving right/down)
        if initial_state == 0:              # at the start cell (0, 0), prefer right and down
            priority_actions = [3, 1, 0, 2] # right, down, up, left
            action_space = product(priority_actions, repeat=self.horizon)
        else:
            action_space = product(self.env.actions, repeat=self.horizon)
        for actions in action_space:
            current_cost = self.cost(actions, initial_state)
            if current_cost < best_cost:
                best_cost = current_cost
                best_action = actions[0]    # only the first action will be executed
        return best_action
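The optimizer only ever returns the first action of the cheapest sequence, so in use it is wrapped in a receding-horizon loop (plan, act, re-plan), as the main program in step (7) does. As a quick illustration of what the optimizer actually compares, the sketch below (not part of Gui.py; the two plans are hand-picked for illustration) evaluates the cost of two four-step plans from the start cell.
env = GridWorld(size=5)
mpc = MPC(env, horizon=4)
plan_a = (3, 3, 3, 1)    # right, right, right, down -- moves along the top edge
plan_b = (1, 1, 1, 1)    # down, down, down, down    -- moves along the left edge
print("cost of plan A:", round(mpc.cost(plan_a, env.reset()), 2))
print("cost of plan B:", round(mpc.cost(plan_b, env.reset()), 2))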
(5) Define the class PI2, which implements PI² (Policy Improvement with Path Integrals). PI² is a sampling-based policy-optimization method: it samples noisy rollouts of the current plan, weights them by an exponential function of their returns, and uses the weighted statistics to update the plan, which also preserves a degree of exploration in the policy.
# ======================== 5. PI2: sampling-based policy improvement with exploration ========================
class PI2:
    def __init__(self, env, num_samples=20, num_iter=10, horizon=7):
        self.env = env
        self.num_samples = num_samples
        self.num_iter = num_iter
        self.horizon = horizon

    def rollout(self, policy):
        """Execute the current plan with epsilon-greedy exploration and collect the trajectory."""
        state = self.env.reset()
        trajectory = []
        total_reward = 0
        for t in range(self.horizon):
            action = policy[t] if t < len(policy) else 0
            # exploration noise: with 10% probability take a random action
            if random.random() < 0.1:
                action = random.choice(self.env.actions)
            next_s, reward, done = self.env.step(state, action)
            trajectory.append((state, action, reward))
            total_reward += reward
            state = next_s
            if done:
                break
        return trajectory, total_reward

    def update_policy(self, old_policy, trajectories, rewards):
        """Reweight the sampled actions with a softmax of the episode returns to avoid extreme weights."""
        temperature = 1.0
        new_policy = []
        for t in range(self.horizon):
            action_probs = defaultdict(float)
            total = 0
            for i in range(self.num_samples):
                if t < len(trajectories[i]):
                    _, a, r = trajectories[i][t]
                    weight = np.exp(rewards[i] / temperature)
                    action_probs[a] += weight
                    total += weight
            # normalize the weights into a probability distribution
            if total > 0:
                for a in action_probs:
                    action_probs[a] /= total
            # sample the new action from this distribution (instead of taking the greedy one)
            actions = list(action_probs.keys())
            probs = list(action_probs.values())
            best_action = np.random.choice(actions, p=probs) if actions else 0
            new_policy.append(best_action)
        return new_policy

    def train(self):
        """Train the open-loop plan."""
        # initial plan biased towards moving right and down (shorter plans are padded with action 0)
        policy = [3, 1, 3, 1, 3]
        for _ in range(self.num_iter):
            trajectories = []
            rewards = []
            for _ in range(self.num_samples):
                traj, r = self.rollout(policy)
                trajectories.append(traj)
                rewards.append(r)
            policy = self.update_policy(policy, trajectories, rewards)
        return policy
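Since train() returns an open-loop action sequence rather than a state-conditioned policy, a simple way to gauge the result is to replay the plan once without exploration noise and record the return. The sketch below is illustrative (not part of Gui.py); the horizon of 8 is an assumption chosen so that the goal at (4, 4) is reachable from the start in principle.
env = GridWorld(size=5)
agent = PI2(env, num_samples=20, num_iter=10, horizon=8)
plan = [int(a) for a in agent.train()]    # open-loop plan of length 8
state, total_reward = env.reset(), 0.0
for a in plan:                            # replay the plan without exploration noise
    state, reward, done = env.step(state, a)
    total_reward += reward
    if done:
        break
print("optimized plan:", plan, "replay return:", round(total_reward, 2))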
(6) Define the functions epsilon_greedy_policy, mc_value_estimation, and td0_value_estimation, which implement sampling-based value-function estimation. epsilon_greedy_policy adds exploration to a deterministic policy; mc_value_estimation estimates the value function from the returns of complete episodes (first-visit Monte Carlo); td0_value_estimation updates the estimate after every single transition using temporal-difference learning, TD(0). These methods estimate a value function without requiring an explicit model of the environment.
# ======================== 6. Sampling-based value estimation (MC + TD) ========================
def epsilon_greedy_policy(policy, state, epsilon, env):
    """Follow the given deterministic policy, but explore with probability epsilon."""
    if random.random() < epsilon:
        return random.choice(env.actions)
    return policy[state]

def mc_value_estimation(env, policy, episodes=500, epsilon=0.1):
    """First-visit Monte Carlo estimation of the state-value function."""
    returns = defaultdict(list)
    for _ in range(episodes):
        # generate one complete episode under the epsilon-greedy policy
        state = env.reset()
        trajectory = []
        while True:
            action = epsilon_greedy_policy(policy, state, epsilon, env)
            next_s, reward, done = env.step(state, action)
            trajectory.append((state, reward))
            state = next_s
            if done:
                break
        # accumulate discounted returns backwards, keeping the return of the
        # first (earliest) visit to each state
        G = 0.0
        first_visit_return = {}
        for s, r in reversed(trajectory):
            G = r + env.gamma * G
            first_visit_return[s] = G     # overwritten until only the earliest visit remains
        for s, G_s in first_visit_return.items():
            returns[s].append(G_s)
    return {s: np.mean(returns[s]) if returns[s] else 0.0 for s in env.states}

def td0_value_estimation(env, policy, episodes=500, alpha=0.1, epsilon=0.1):
    """TD(0) estimation of the state-value function."""
    V = {s: 0.0 for s in env.states}
    for _ in range(episodes):
        state = env.reset()
        while True:
            action = epsilon_greedy_policy(policy, state, epsilon, env)
            next_s, reward, done = env.step(state, action)
            # bootstrap from the current estimate of the next state
            V[state] += alpha * (reward + env.gamma * V[next_s] - V[state])
            state = next_s
            if done:
                break
    return V
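Both estimators target the value function of the same ε-greedy policy, so their outputs can be compared directly. The check below is an optional sketch (not part of Gui.py); it treats a value of exactly 0.0 as "never visited", which is a crude but serviceable proxy under this reward scheme.
env = GridWorld(size=5)
_, greedy_policy = value_iteration(env)
mc_v = mc_value_estimation(env, greedy_policy, episodes=1000)
td_v = td0_value_estimation(env, greedy_policy, episodes=1000)
visited = [s for s in env.states if mc_v[s] != 0.0]   # states with at least one MC sample
gaps = [abs(mc_v[s] - td_v[s]) for s in visited]
print(f"states visited: {len(visited)}, mean |MC - TD(0)| gap: {np.mean(gaps):.3f}")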
(7) The main program ties all of the demos together: it creates the grid-world environment and then runs value iteration, policy iteration, MCTS, MPC, PI², and the sampling-based value estimators in turn. It visualizes the optimized policies and value functions and prints each algorithm's behavior in the grid world, so their results can be compared and verified.
# ======================== 7. Main program: run all of the demos ========================
if __name__ == "__main__":
    # create the environment
    env = GridWorld(size=5)

    # -------------------- 7.1 Baseline planning: value iteration vs policy iteration --------------------
    print("=== 1. Value iteration ===")
    vi_values, vi_policy = value_iteration(env)
    env.render_policy(vi_policy, vi_values)
    print("=== 2. Policy iteration ===")
    pi_values, pi_policy = policy_iteration(env)
    env.render_policy(pi_policy, pi_values)

    # -------------------- 7.2 Sampling-based planning: MCTS demo --------------------
    print("=== 3. MCTS demo ===")
    root_state = env.reset()
    best_action = mcts(env, root_state, iterations=200)
    print(f"MCTS recommends action {['up', 'down', 'left', 'right'][best_action]} in state {root_state}")

    # -------------------- 7.3 Model predictive control (MPC) demo --------------------
    print("=== 4. MPC demo ===")
    mpc = MPC(env, horizon=5)              # planning horizon of 5 steps
    state = env.reset()
    for _ in range(5):                     # run 5 MPC steps
        action = mpc.optimize(state)
        next_s, _, _ = env.step(state, action)
        print(f"state {state} -> action {action} -> next state {next_s}")
        state = next_s

    # -------------------- 7.4 PI2 demo --------------------
    print("=== 5. PI2 demo ===")
    pi2_agent = PI2(env, num_samples=10, num_iter=5, horizon=5)
    best_policy = pi2_agent.train()
    print("Plan optimized by PI2 (length 5):", best_policy)

    # -------------------- 7.5 Sampling-based value estimation --------------------
    print("=== 6. Sampling-based value estimation ===")
    # evaluate the value-iteration policy with epsilon-greedy exploration
    test_policy = vi_policy
    mc_values = mc_value_estimation(env, test_policy, episodes=500)
    td_values = td0_value_estimation(env, test_policy, alpha=0.1, episodes=500)
    # visualize the two estimates for comparison
    print("Monte Carlo value estimates:")
    env.render_policy(test_policy, mc_values)
    print("TD(0) value estimates:")
    env.render_policy(test_policy, td_values)
When the program runs, it draws the policy and value-function visualizations for value iteration, policy iteration, and the Monte Carlo and TD(0) estimates, and prints the actions chosen by MCTS, MPC, and PI², illustrating how each algorithm optimizes the policy and evaluates states in the grid world.