10.3.6 Hands-On Practice: Solving a Grid-World Problem with Sampling-Based Planning
The following example applies reinforcement learning and sampling-based planning to a simple grid-world (GridWorld) problem. It implements and demonstrates several algorithms: value iteration, policy iteration, Monte Carlo tree search (MCTS), model predictive control (MPC), PI² (Policy Improvement with Path Integrals), and sampling-based value-function estimation (the Monte Carlo method and temporal-difference learning, TD(0)).
Example 10-1: Solving a grid-world problem with sampling-based planning (source path: codes\10\Gui.py)
The implementation in the file Gui.py proceeds as follows.
(1) Define the class GridWorld, which implements the grid-world environment: its states, actions, goal, obstacles, and reward scheme. It provides the state-transition and reward logic in step(), plus a render_policy() helper for visualizing a policy and a value function.
# dependencies used throughout Gui.py
import random
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt

# ======================== 1. Environment definition ========================
class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.states = range(size * size)
        self.actions = [0, 1, 2, 3]        # up, down, left, right
        self.target = (size - 1, size - 1) # goal cell (4, 4)
        self.walls = [(1, 1), (2, 2)]      # obstacle cells
        self.gamma = 0.9                   # discount factor

    def step(self, state, action):
        x, y = state // self.size, state % self.size
        # apply the action, clipping at the grid border
        if action == 0:
            x = max(x - 1, 0)              # up
        elif action == 1:
            x = min(x + 1, self.size - 1)  # down
        elif action == 2:
            y = max(y - 1, 0)              # left
        elif action == 3:
            y = min(y + 1, self.size - 1)  # right
        next_state = x * self.size + y
        # reward and termination check
        if (x, y) in self.walls:
            return next_state, -1.0, True  # hit an obstacle
        elif (x, y) == self.target:
            return next_state, 1.0, True   # reached the goal
        else:
            return next_state, -0.1, False # ordinary move

    def reset(self):
        return 0                           # initial state, cell (0, 0)

    def render_policy(self, policy, values=None):
        """Visualize a policy and, optionally, a value function."""
        size = self.size
        fig, ax = plt.subplots(figsize=(size, size))
        ax.set_xlim(0, size)               # make sure the full grid is visible
        ax.set_ylim(0, size)
        ax.set_xticks(np.arange(0.5, size, 1))
        ax.set_yticks(np.arange(0.5, size, 1))
        ax.grid(which='both', color='black', linewidth=2)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        # draw the obstacles and the goal
        for (x, y) in self.walls:
            rect = plt.Rectangle((y, size - 1 - x), 1, 1, color='gray')
            ax.add_patch(rect)
        tx, ty = self.target
        rect = plt.Rectangle((ty, size - 1 - tx), 1, 1, color='green')
        ax.add_patch(rect)
        # draw the value function
        if values is not None:
            for i in range(size):
                for j in range(size):
                    state = i * size + j
                    ax.text(j + 0.5, size - 1 - i + 0.2, f'{values[state]:.2f}',
                            ha='center', va='center', fontsize=12)
        # draw the policy as arrows
        arrow_map = {0: '↑', 1: '↓', 2: '←', 3: '→'}
        for i in range(size):
            for j in range(size):
                state = i * size + j
                if (i, j) not in self.walls and (i, j) != self.target:
                    ax.text(j + 0.5, size - 1 - i - 0.2, arrow_map[policy[state]],
                            ha='center', va='center', fontsize=16, color='red')
        plt.show()
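Before moving on, it can help to sanity-check the environment's conventions. The short snippet below is illustrative only (it is not part of Gui.py) and relies solely on the GridWorld class above, whose flattened state index is state = x * size + y; it steps right, right, down, down from the start cell and stops when the episode ends.
env = GridWorld(size=5)
state = env.reset()                       # state 0, i.e. cell (0, 0)
for action in [3, 3, 1, 1]:               # right, right, down, down
    next_state, reward, done = env.step(state, action)
    x, y = next_state // env.size, next_state % env.size
    print(f"action {action}: {state} -> {next_state} (cell ({x}, {y})), reward {reward}, done {done}")
    state = next_state
    if done:                              # the last step lands on the obstacle (2, 2)
        break
The final transition lands on the obstacle at (2, 2), producing the -1.0 reward and done=True that the planners below must learn to avoid.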
(2) Define the functions value_iteration and policy_iteration, which implement the two classic dynamic-programming planners. Value iteration repeatedly applies the Bellman optimality backup to solve for the optimal value function directly and then extracts the greedy policy, while policy iteration alternates between policy evaluation and policy improvement. Both are used to compute the optimal policy and value function for the grid world.
# ======================== 2. Baseline planning: value iteration and policy iteration ========================
def value_iteration(env, max_iter=100, theta=1e-4):
    """Value iteration."""
    values = np.zeros(len(env.states))
    for _ in range(max_iter):
        delta = 0
        for s in env.states:
            old_v = values[s]
            # Bellman optimality backup: evaluate every action once and
            # stop bootstrapping at terminal transitions
            q_values = []
            for a in env.actions:
                next_s, r, done = env.step(s, a)
                q_values.append(r + (0.0 if done else env.gamma * values[next_s]))
            values[s] = max(q_values)
            delta = max(delta, abs(old_v - values[s]))
        if delta < theta:
            break
    # extract the greedy (optimal) policy from the converged values
    policy = np.zeros(len(env.states), dtype=int)
    for s in env.states:
        q_values = []
        for a in env.actions:
            next_s, r, done = env.step(s, a)
            q_values.append(r + (0.0 if done else env.gamma * values[next_s]))
        policy[s] = int(np.argmax(q_values))
    return values, policy

def policy_iteration(env, max_iter=100):
    """Policy iteration."""
    # initialize with a random deterministic policy
    policy = np.random.choice(env.actions, size=len(env.states))
    for _ in range(max_iter):
        # 1. policy evaluation
        values = np.zeros(len(env.states))
        while True:
            delta = 0
            for s in env.states:
                old_v = values[s]
                next_s, r, done = env.step(s, policy[s])
                values[s] = r + (0.0 if done else env.gamma * values[next_s])
                delta = max(delta, abs(old_v - values[s]))
            if delta < 1e-4:
                break
        # 2. policy improvement
        new_policy = np.zeros(len(env.states), dtype=int)
        for s in env.states:
            q_values = []
            for a in env.actions:
                next_s, r, done = env.step(s, a)
                q_values.append(r + (0.0 if done else env.gamma * values[next_s]))
            new_policy[s] = int(np.argmax(q_values))
        if np.all(policy == new_policy):
            break
        policy = new_policy
    return values, policy
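As an optional consistency check (a sketch, not part of the book's source file), the two dynamic-programming planners can be run side by side; on this small deterministic grid they should produce matching greedy policies on nearly every state, up to ties in the argmax, and value functions that are typically close.
env = GridWorld(size=5)
vi_values, vi_policy = value_iteration(env)
pi_values, pi_policy = policy_iteration(env)
agreement = int(np.sum(vi_policy == pi_policy))      # states where both planners pick the same action
print(f"policies agree on {agreement}/{len(env.states)} states")
print(f"largest value difference: {np.max(np.abs(vi_values - pi_values)):.4f}")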
(3) Define the class MCTSNode and the function mcts, which implement Monte Carlo tree search (MCTS). MCTS is a sampling-based planning method that improves its action choice by repeating four phases: selection, expansion, simulation (rollout), and backpropagation. It is well suited to decision making in complex environments and can find a good action within a limited search budget.
# ======================== 3. Sampling-based planning: core MCTS logic ========================
class MCTSNode:
    def __init__(self, state, parent=None):
        self.state = state
        self.parent = parent
        self.children = {}   # action -> MCTSNode
        self.visits = 0
        self.value = 0.0

def mcts(env, root_state, iterations=100, C=1.414):
    """Monte Carlo tree search."""
    root = MCTSNode(root_state)

    def uct(node):
        """Upper Confidence Bound for Trees: mean value plus exploration bonus."""
        if node.visits == 0:
            return float('inf')
        return (node.value / node.visits) + C * np.sqrt(np.log(node.parent.visits) / node.visits)

    def rollout(node):
        """Random simulation (rollout) until the episode terminates."""
        state = node.state
        total_reward = 0
        done = False
        while not done:
            action = random.choice(env.actions)
            state, reward, done = env.step(state, action)
            total_reward += reward
        return total_reward

    for _ in range(iterations):
        node = root
        done = False
        last_reward = 0.0
        # 1. Selection: descend the tree along the highest-UCT children
        while node.children and not done:
            best_action = max(node.children, key=lambda a: uct(node.children[a]))
            _, last_reward, done = env.step(node.state, best_action)
            node = node.children[best_action]
        # 2. Expansion: add children for the unexpanded actions of a leaf
        if not done and not node.children:
            for a in env.actions:
                next_s, _, _ = env.step(node.state, a)
                # skip actions whose successor is already represented by a child
                if next_s not in [child.state for child in node.children.values()]:
                    node.children[a] = MCTSNode(next_s, parent=node)
            # if children were added, pick one and continue from it
            if node.children:
                # prefer an unvisited child, otherwise pick by UCT
                unvisited_children = [a for a, child in node.children.items() if child.visits == 0]
                if unvisited_children:
                    action = random.choice(unvisited_children)
                else:
                    action = max(node.children, key=lambda a: uct(node.children[a]))
                _, last_reward, done = env.step(node.state, action)
                node = node.children[action]
        # 3. Simulation: estimate the leaf's value with a random rollout
        if not done:
            reward = rollout(node)
        else:
            # terminal state: no rollout needed, use the reward of the final transition
            reward = last_reward
        # 4. Backpropagation: propagate the result back up to the root
        while node is not None:
            node.visits += 1
            node.value += reward
            node = node.parent

    # return the action whose child was visited most often
    if root.children:
        return max(root.children, key=lambda a: root.children[a].visits)
    else:
        # the root has no children: fall back to a random action
        return random.choice(env.actions)
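To get a feel for the search output, the following sketch (illustrative only, reusing the mcts function above) asks for a recommended action from a few hand-picked non-terminal states; because the rollouts are random, repeated runs can recommend different but similarly reasonable actions.
env = GridWorld(size=5)
action_names = ['up', 'down', 'left', 'right']
for start in [0, 3, 10]:                  # cells (0, 0), (0, 3) and (2, 0)
    x, y = start // env.size, start % env.size
    recommended = mcts(env, start, iterations=300)
    print(f"MCTS from cell ({x}, {y}) recommends: {action_names[recommended]}")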
(4) Define the class MPC, which implements model predictive control (MPC). MPC is a model-based optimization method: at every step it optimizes a short sequence of future actions against a cost function, executes only the first action, and then re-plans, which lets it adjust the control strategy online in a changing environment.
# ======================== 4. Model predictive control (MPC) demo ========================
class MPC:
    def __init__(self, env, horizon=7):    # planning horizon (number of look-ahead steps)
        self.env = env
        self.horizon = horizon

    def cost(self, actions, initial_state):
        """Cost of an action sequence: negative rewards + distance-to-goal penalty + turning penalty."""
        state = initial_state
        total_cost = 0
        prev_action = None
        for t in range(self.horizon):
            action = actions[t] if t < len(actions) else 0
            next_s, reward, done = self.env.step(state, action)
            # penalize the Manhattan distance to the goal
            x, y = next_s // self.env.size, next_s % self.env.size
            tx, ty = self.env.target
            distance_penalty = (abs(x - tx) + abs(y - ty)) * 0.5
            # penalize changes of direction (discourage frequent turning)
            direction_penalty = 0
            if prev_action is not None and action != prev_action:
                direction_penalty = 0.2
            total_cost += (-reward) + distance_penalty + direction_penalty
            if done:
                break
            state = next_s
            prev_action = action
        return total_cost

    def optimize(self, initial_state):
        """Exhaustively search the action sequences and return the first action of the cheapest one."""
        best_cost = float('inf')
        best_action = 0
        from itertools import product
        # reorder the action space so that promising actions are tried first
        # (cost ties are then broken in favor of moving right/down)
        if initial_state == 0:              # at the start cell (0, 0), prefer right and down
            priority_actions = [3, 1, 0, 2] # right, down, up, left
            action_space = product(priority_actions, repeat=self.horizon)
        else:
            action_space = product(self.env.actions, repeat=self.horizon)
        for actions in action_space:
            current_cost = self.cost(actions, initial_state)
            if current_cost < best_cost:
                best_cost = current_cost
                best_action = actions[0]    # only the first action will be executed
        return best_action
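The optimizer only ever returns the first action of the cheapest sequence, so in use it is wrapped in a receding-horizon loop (plan, act, re-plan), as the main program in step (7) does. As a quick illustration of what the optimizer actually compares, the sketch below (not part of Gui.py; the two plans are hand-picked for illustration) evaluates the cost of two four-step plans from the start cell.
env = GridWorld(size=5)
mpc = MPC(env, horizon=4)
plan_a = (3, 3, 3, 1)    # right, right, right, down -- moves along the top edge
plan_b = (1, 1, 1, 1)    # down, down, down, down    -- moves along the left edge
print("cost of plan A:", round(mpc.cost(plan_a, env.reset()), 2))
print("cost of plan B:", round(mpc.cost(plan_b, env.reset()), 2))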
(5) Define the class PI2, which implements PI² (Policy Improvement with Path Integrals). PI² is a sampling-based policy-optimization method: it samples noisy rollouts of the current plan, weights them by an exponential function of their returns, and uses the weighted statistics to update the plan, which also preserves a degree of exploration in the policy.
# ======================== 5. PI2: sampling-based policy improvement with exploration ========================
class PI2:
    def __init__(self, env, num_samples=20, num_iter=10, horizon=7):
        self.env = env
        self.num_samples = num_samples
        self.num_iter = num_iter
        self.horizon = horizon

    def rollout(self, policy):
        """Execute the current plan with epsilon-greedy exploration and collect the trajectory."""
        state = self.env.reset()
        trajectory = []
        total_reward = 0
        for t in range(self.horizon):
            action = policy[t] if t < len(policy) else 0
            # exploration noise: with 10% probability take a random action
            if random.random() < 0.1:
                action = random.choice(self.env.actions)
            next_s, reward, done = self.env.step(state, action)
            trajectory.append((state, action, reward))
            total_reward += reward
            state = next_s
            if done:
                break
        return trajectory, total_reward

    def update_policy(self, old_policy, trajectories, rewards):
        """Reweight the sampled actions with a softmax of the episode returns to avoid extreme weights."""
        temperature = 1.0
        new_policy = []
        for t in range(self.horizon):
            action_probs = defaultdict(float)
            total = 0
            for i in range(self.num_samples):
                if t < len(trajectories[i]):
                    _, a, r = trajectories[i][t]
                    weight = np.exp(rewards[i] / temperature)
                    action_probs[a] += weight
                    total += weight
            # normalize the weights into a probability distribution
            if total > 0:
                for a in action_probs:
                    action_probs[a] /= total
            # sample the new action from this distribution (instead of taking the greedy one)
            actions = list(action_probs.keys())
            probs = list(action_probs.values())
            best_action = np.random.choice(actions, p=probs) if actions else 0
            new_policy.append(best_action)
        return new_policy

    def train(self):
        """Train the open-loop plan."""
        # initial plan biased towards moving right and down (shorter plans are padded with action 0)
        policy = [3, 1, 3, 1, 3]
        for _ in range(self.num_iter):
            trajectories = []
            rewards = []
            for _ in range(self.num_samples):
                traj, r = self.rollout(policy)
                trajectories.append(traj)
                rewards.append(r)
            policy = self.update_policy(policy, trajectories, rewards)
        return policy
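Since train() returns an open-loop action sequence rather than a state-conditioned policy, a simple way to gauge the result is to replay the plan once without exploration noise and record the return. The sketch below is illustrative (not part of Gui.py); the horizon of 8 is an assumption chosen so that the goal at (4, 4) is reachable from the start in principle.
env = GridWorld(size=5)
agent = PI2(env, num_samples=20, num_iter=10, horizon=8)
plan = [int(a) for a in agent.train()]    # open-loop plan of length 8
state, total_reward = env.reset(), 0.0
for a in plan:                            # replay the plan without exploration noise
    state, reward, done = env.step(state, a)
    total_reward += reward
    if done:
        break
print("optimized plan:", plan, "replay return:", round(total_reward, 2))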
(6) Define the functions epsilon_greedy_policy, mc_value_estimation, and td0_value_estimation, which implement sampling-based value-function estimation. epsilon_greedy_policy adds exploration to a deterministic policy; mc_value_estimation estimates the value function from the returns of complete episodes (first-visit Monte Carlo); td0_value_estimation updates the estimate after every single transition using temporal-difference learning, TD(0). These methods estimate a value function without requiring an explicit model of the environment.
# ======================== 6. Sampling-based value estimation (MC + TD) ========================
def epsilon_greedy_policy(policy, state, epsilon, env):
    """Follow the given deterministic policy, but explore with probability epsilon."""
    if random.random() < epsilon:
        return random.choice(env.actions)
    return policy[state]

def mc_value_estimation(env, policy, episodes=500, epsilon=0.1):
    """First-visit Monte Carlo estimation of the state-value function."""
    returns = defaultdict(list)
    for _ in range(episodes):
        # generate one complete episode under the epsilon-greedy policy
        state = env.reset()
        trajectory = []
        while True:
            action = epsilon_greedy_policy(policy, state, epsilon, env)
            next_s, reward, done = env.step(state, action)
            trajectory.append((state, reward))
            state = next_s
            if done:
                break
        # accumulate discounted returns backwards, keeping the return of the
        # first (earliest) visit to each state
        G = 0.0
        first_visit_return = {}
        for s, r in reversed(trajectory):
            G = r + env.gamma * G
            first_visit_return[s] = G     # overwritten until only the earliest visit remains
        for s, G_s in first_visit_return.items():
            returns[s].append(G_s)
    return {s: np.mean(returns[s]) if returns[s] else 0.0 for s in env.states}

def td0_value_estimation(env, policy, episodes=500, alpha=0.1, epsilon=0.1):
    """TD(0) estimation of the state-value function."""
    V = {s: 0.0 for s in env.states}
    for _ in range(episodes):
        state = env.reset()
        while True:
            action = epsilon_greedy_policy(policy, state, epsilon, env)
            next_s, reward, done = env.step(state, action)
            # bootstrap from the current estimate of the next state
            V[state] += alpha * (reward + env.gamma * V[next_s] - V[state])
            state = next_s
            if done:
                break
    return V
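Both estimators target the value function of the same ε-greedy policy, so their outputs can be compared directly. The check below is an optional sketch (not part of Gui.py); it treats a value of exactly 0.0 as "never visited", which is a crude but serviceable proxy under this reward scheme.
env = GridWorld(size=5)
_, greedy_policy = value_iteration(env)
mc_v = mc_value_estimation(env, greedy_policy, episodes=1000)
td_v = td0_value_estimation(env, greedy_policy, episodes=1000)
visited = [s for s in env.states if mc_v[s] != 0.0]   # states with at least one MC sample
gaps = [abs(mc_v[s] - td_v[s]) for s in visited]
print(f"states visited: {len(visited)}, mean |MC - TD(0)| gap: {np.mean(gaps):.3f}")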
(7) The main program ties all of the demos together: it creates the grid-world environment and then runs value iteration, policy iteration, MCTS, MPC, PI², and the sampling-based value estimators in turn. It visualizes the optimized policies and value functions and prints each algorithm's behavior in the grid world, so their results can be compared and verified.
# ======================== 7. Main program: run all of the demos ========================
if __name__ == "__main__":
    # create the environment
    env = GridWorld(size=5)

    # -------------------- 7.1 Baseline planning: value iteration vs policy iteration --------------------
    print("=== 1. Value iteration ===")
    vi_values, vi_policy = value_iteration(env)
    env.render_policy(vi_policy, vi_values)
    print("=== 2. Policy iteration ===")
    pi_values, pi_policy = policy_iteration(env)
    env.render_policy(pi_policy, pi_values)

    # -------------------- 7.2 Sampling-based planning: MCTS demo --------------------
    print("=== 3. MCTS demo ===")
    root_state = env.reset()
    best_action = mcts(env, root_state, iterations=200)
    print(f"MCTS recommends action {['up', 'down', 'left', 'right'][best_action]} in state {root_state}")

    # -------------------- 7.3 Model predictive control (MPC) demo --------------------
    print("=== 4. MPC demo ===")
    mpc = MPC(env, horizon=5)              # planning horizon of 5 steps
    state = env.reset()
    for _ in range(5):                     # run 5 MPC steps
        action = mpc.optimize(state)
        next_s, _, _ = env.step(state, action)
        print(f"state {state} -> action {action} -> next state {next_s}")
        state = next_s

    # -------------------- 7.4 PI2 demo --------------------
    print("=== 5. PI2 demo ===")
    pi2_agent = PI2(env, num_samples=10, num_iter=5, horizon=5)
    best_policy = pi2_agent.train()
    print("Plan optimized by PI2 (length 5):", best_policy)

    # -------------------- 7.5 Sampling-based value estimation --------------------
    print("=== 6. Sampling-based value estimation ===")
    # evaluate the value-iteration policy with epsilon-greedy exploration
    test_policy = vi_policy
    mc_values = mc_value_estimation(env, test_policy, episodes=500)
    td_values = td0_value_estimation(env, test_policy, alpha=0.1, episodes=500)
    # visualize the two estimates for comparison
    print("Monte Carlo value estimates:")
    env.render_policy(test_policy, mc_values)
    print("TD(0) value estimates:")
    env.render_policy(test_policy, td_values)
When the program runs, it draws the policy and value-function visualizations for value iteration, policy iteration, and the Monte Carlo and TD(0) estimates, and prints the actions chosen by MCTS, MPC, and PI², illustrating how each algorithm optimizes the policy and evaluates states in the grid world.