The PPO-based rocket-landing case is a classic example: by describing and controlling the overall procedure, we can optimize the rocket's entire descent. Owing to space constraints, the full rocket-landing code is provided in the book's companion resources; before studying this section, please run that code and make sure you understand the PPO algorithm. This section builds on that classic automatic rocket-recovery example and uses the newer reinforcement-learning algorithm GRPO to tackle a new control task.
Setting up the CartPole reinforcement-learning environment
CartPole is a widely used reinforcement-learning environment. In the CartPole scenario there is a cart, and the agent's task is to keep the pole mounted on the cart upright by moving the cart left or right. An episode ends when the pole tilts beyond a threshold angle, when the cart drifts too far from its starting position, or when the episode reaches its maximum length; in the CartPole-v1 environment the maximum is 500 steps. The CartPole environment is shown in Figure 14-2.
Figure 14-2 The CartPole environment
Below is a short program we wrote to demonstrate the CartPole environment:
import gym


def main():
    env = gym.make('CartPole-v1', render_mode="human")
    for i_episode in range(20):
        observation, _ = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()  # take a random action
            observation, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()


if __name__ == "__main__":
    main()
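The state and action dimensions used later in this chapter (a 4-dimensional observation and 2 discrete actions) can be confirmed directly from the environment itself. A quick check, assuming gym is already installed:

import gym

env = gym.make('CartPole-v1')
print(env.observation_space)  # Box of shape (4,): cart position, cart velocity, pole angle, pole angular velocity
print(env.action_space)       # Discrete(2): push the cart to the left or to the right
env.close()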
Training a CartPole model with GRPO
Next, we use a GRPO-based reinforcement-learning scheme to train the CartPole model. The code is as follows:
# -*- coding: utf-8 -*-
"""
GRPO (Group Relative Policy Optimization) implementation
Environment: CartPole-v1
Goal: train a policy network to balance the pole on the cart
"""
# General-purpose libraries
import time                      # timing
from tqdm import tqdm            # progress bar
import matplotlib.pyplot as plt  # result visualization
# PyTorch and RL-related imports
import torch
from torch.nn import functional as F         # neural-network functions
import gym                                    # reinforcement-learning environments
from torch.distributions import Categorical   # categorical distribution sampling
import numpy as np                            # numerical computation
class PolicyNet(torch.nn.Module):
    """Policy network definition"""

    def __init__(self, state_dim, action_dim):
        """
        Initialize the policy network.
        :param state_dim: state dimension (4 for CartPole)
        :param action_dim: action dimension (2 for CartPole)
        """
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, 128)   # first fully connected layer
        self.fc2 = torch.nn.Linear(128, action_dim)  # second fully connected layer

    def forward(self, state):
        """
        Forward pass that computes action probabilities.
        :param state: input state [batch_size, state_dim]
        :return: action probability distribution [batch_size, action_dim]
        """
        x = F.relu(self.fc1(state))      # ReLU activation
        logits = self.fc2(x)             # unnormalized action scores
        return F.softmax(logits, dim=1)  # convert to a probability distribution
def collect_trajectory_vectorized(envs, policy_net, trajectory_max_steps=500, device="cpu"):
    """
    Collect trajectory data from parallel environments.
    :param envs: vectorized environment object
    :param policy_net: policy network instance
    :param trajectory_max_steps: maximum steps per trajectory
    :param device: computation device (cpu/cuda)
    :return: (trajectory data dict, total reward of each environment)
    """
    group_size = envs.num_envs                            # number of parallel environments
    seed_num = np.random.randint(0, 1000)                 # random seed
    states, _ = envs.reset(seed=[seed_num] * group_size)  # reset environments
    # Initialize storage containers
    all_states = []                                 # state sequence [T, group_size, state_dim]
    all_actions = []                                # action sequence [T, group_size]
    all_log_probs = []                              # log probabilities [T, group_size]
    all_rewards = torch.zeros(group_size)           # cumulative rewards [group_size]
    all_dones = torch.tensor([False] * group_size)  # termination flags [group_size]
    # Trajectory collection loop
    for t in range(trajectory_max_steps):
        # Convert states to a tensor
        states_tensor = torch.tensor(states, dtype=torch.float32, device=device)
        # Compute action probabilities
        probs = policy_net(states_tensor)            # [group_size, action_dim]
        dist = Categorical(probs)                    # build a categorical distribution
        actions = dist.sample()                      # sample actions [group_size]
        log_probs = dist.log_prob(actions).detach()  # log probabilities [group_size]
        # Interact with the environments
        next_states, rewards, terminated, truncated, infos = envs.step(actions.cpu().numpy())
        dones = np.logical_or(terminated, truncated)  # merge termination conditions
        # Store data
        all_states.append(states)
        all_actions.append(actions)
        all_log_probs.append(log_probs)
        all_dones[dones] = True                       # update termination flags
        # Reward shaping: zero rewards of finished environments and add a position penalty
        rewards = rewards.astype(np.float32)
        rewards[all_dones.numpy()] = 0.0              # zero rewards of finished environments
        rewards += -np.abs(next_states[:, 0])         # penalize horizontal drift of the cart
        all_rewards += torch.from_numpy(rewards)      # accumulate rewards
        # Update states
        states = next_states
        # Early exit: all environments finished
        if torch.all(all_dones):
            break
    # Post-processing: normalize rewards and organize data
    normalized_rewards = (all_rewards / trajectory_max_steps).to(device)         # normalize rewards
    all_states = torch.tensor(np.array(all_states)).permute(1, 0, 2).to(device)  # [group_size, T, state_dim]
    all_log_probs = torch.stack(all_log_probs).permute(1, 0).to(device)          # [group_size, T]
    all_actions = torch.stack(all_actions).permute(1, 0).to(device)              # [group_size, T]
    # Pack trajectory data
    trajectories = {
        "all_states": all_states,
        "all_log_probs": all_log_probs,
        "all_actions": all_actions,
        "normalized_rewards": normalized_rewards
    }
    episode_rewards = normalized_rewards * trajectory_max_steps  # recover the actual rewards
    return trajectories, episode_rewards
def calc_advantages_with_grpo(trajectories):
    """
    Compute group-normalized advantages.
    :param trajectories: trajectory data dict
    :return: normalized advantages [group_size]
    """
    rewards = trajectories["normalized_rewards"]       # extract normalized rewards
    mean_reward = torch.mean(rewards)                  # group mean
    std_reward = torch.std(rewards) + 1e-8             # group standard deviation (avoid division by zero)
    advantages = (rewards - mean_reward) / std_reward  # standardize
    return advantages
def grpo_update(trajectories, net, optimizer, n_iterations=20, eps=0.2):
    """
    GRPO policy update.
    :param trajectories: trajectory data dict
    :param net: policy network
    :param optimizer: optimizer
    :param n_iterations: number of policy-update iterations
    :param eps: clipping threshold (as in PPO)
    :return: loss of the final iteration
    """
    # Group-normalized advantages [group_size, 1]
    advantages = calc_advantages_with_grpo(trajectories).unsqueeze(-1)
    # Unpack trajectory data
    all_states = trajectories["all_states"]           # [group_size, T, state_dim]
    all_log_probs = trajectories["all_log_probs"]     # [group_size, T]
    all_chosen_actions = trajectories["all_actions"]  # [group_size, T]
    batch_size = len(all_states)                      # group_size
    # Multiple rounds of policy optimization
    for i_iter in range(n_iterations):
        loss = 0.0
        # Iterate over the trajectory of each parallel environment
        for i in range(batch_size):
            # Extract a single trajectory
            states = all_states[i]                  # [T, state_dim]
            log_probs = all_log_probs[i]            # [T]
            chosen_actions = all_chosen_actions[i]  # [T]
            advantage = advantages[i]               # [1]
            # Log probabilities under the new policy
            new_log_probs = torch.log(net(states).gather(1, chosen_actions.unsqueeze(1)))  # [T, 1]
            # Probability ratio (importance-sampling ratio)
            ratio = torch.exp(new_log_probs - log_probs.unsqueeze(1))  # [T, 1]
            # Surrogate losses
            surr1 = ratio * advantage                                 # unclipped term
            surr2 = torch.clamp(ratio, 1 - eps, 1 + eps) * advantage  # clipped term
            trajectory_loss = torch.mean(-torch.min(surr1, surr2))    # take the minimum
            loss += trajectory_loss                                   # accumulate loss
        # Average the loss over the group
        loss /= batch_size
        # Backpropagate and update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()
if __name__ == '__main__':
    """Main training program"""
    # [1] Environment and network initialization
    group_size = 10           # number of parallel environments (the GRPO group size)
    env_name = 'CartPole-v1'  # environment name
    envs = gym.vector.make(env_name, num_envs=group_size)  # create parallel environments
    # Environment parameters
    state_dim = envs.single_observation_space.shape[0]  # state dimension = 4
    n_actions = envs.single_action_space.n              # number of actions = 2
    # Device configuration
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # Initialize the policy network and optimizer
    policy = PolicyNet(state_dim, n_actions).to(device)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.02)  # learning rate 0.02
    # Training parameters
    episode_num = 50             # number of training rounds
    trajectory_max_steps = 500   # maximum steps per trajectory
    return_list = []             # reward history
    # [2] Main training loop
    start = time.time()
    for i_episode in tqdm(range(episode_num)):
        # [3] Collect trajectory data
        trajectories, episode_rewards = collect_trajectory_vectorized(
            envs, policy, trajectory_max_steps, device=device
        )
        # [4] Policy update
        loss = grpo_update(trajectories, policy, optimizer)
        # [5] Record performance metrics
        avg_reward = sum(episode_rewards) / len(episode_rewards)
        return_list.append(avg_reward.cpu().numpy())
        # Print training information
        print(f'Episode {i_episode}, average reward: {avg_reward:.2f}')
    # [6] Post-training steps
    print("Total time (s): ", time.time() - start)
    # Save the model
    save_path = "./grpo_cartpole_policy_update_final.pth"
    torch.save(policy.state_dict(), save_path)
    print(f"Model saved to: {save_path}")
    # Plot the training curve
    plt.figure(figsize=(10, 6))
    plt.plot(return_list)
    plt.xlabel('train epochs')
    plt.ylabel('avg reward')
    plt.title('GRPO on CartPole-v1')
    plt.grid(True)
    plt.show()
    # Close the environments
    envs.close()
In the code above, we first define a PolicyNet as the policy to be trained. GRPO then collects trajectories from a group of parallel environments, updates the policy according to the group-normalized rewards, and finally saves the trained weights. Readers are encouraged to run the training themselves.
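The core of the update is the group-relative advantage computed in calc_advantages_with_grpo: the total reward of each trajectory in the group is standardized against the group's mean and standard deviation, and that single scalar advantage is shared by every step of the trajectory. The following minimal sketch, using made-up reward values and independent of the training script, illustrates the computation:

import torch

# Hypothetical total rewards of a group of 5 parallel trajectories
group_rewards = torch.tensor([480.0, 500.0, 130.0, 500.0, 260.0])

# Group-relative advantage: standardize within the group
mean_r = group_rewards.mean()
std_r = group_rewards.std() + 1e-8   # avoid division by zero
advantages = (group_rewards - mean_r) / std_r

print(advantages)
# Trajectories above the group mean get positive advantages, those below get
# negative ones; no separate value network (critic) is needed.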
Demonstrating the GRPO-trained CartPole model
After training is complete, we verify it by running the GRPO-trained CartPole model. The code is as follows:
# test_cartpole.py
import gym
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from argparse import ArgumentParser


# Policy network definition (must match the training code exactly)
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, 128)
        self.fc2 = torch.nn.Linear(128, action_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return torch.softmax(self.fc2(x), dim=1)
def load_model(model_path, device='cpu'):
    """Load the trained model."""
    # Build the network structure
    model = PolicyNet(state_dim=4, action_dim=2)
    try:
        # Load the trained weights
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        print(f"Model loaded successfully: {model_path}")
        return model
    except Exception as e:
        print(f"Failed to load model: {str(e)}")
        exit(1)
def run_episode(env, model, max_steps=500, render=True):
    """Run a single test episode."""
    state, _ = env.reset()
    total_reward = 0
    frames = []
    for step in range(max_steps):
        if render:
            frame = env.render()
            if env.render_mode == 'rgb_array':
                frames.append(frame)
        # Select an action with the model (greedy)
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = model(state_tensor)
            action = torch.argmax(action_probs).item()
        # Execute the action
        next_state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        state = next_state
        if terminated or truncated:
            print(f"Episode finished, steps: {step + 1}, total reward: {total_reward:.1f}")
            break
    return total_reward, frames
def save_gif(frames, filename, fps=30):
    """Save the frames as a GIF animation."""
    plt.figure(figsize=(6, 4))
    plt.axis('off')
    ims = [[plt.imshow(frame, animated=True)] for frame in frames]
    ani = animation.ArtistAnimation(plt.gcf(), ims, interval=50, blit=True)
    ani.save(filename, writer='pillow', fps=fps)
    print(f"Animation saved to: {filename}")
def main():
    # Parse command-line arguments
    parser = ArgumentParser(description='CartPole test program')
    parser.add_argument('--model', type=str, default='./grpo_cartpole_policy_update_final.pth',
                        help='model file path (default: ./grpo_cartpole_policy_update_final.pth)')
    parser.add_argument('--episodes', type=int, default=5,
                        help='number of test episodes (default: 5)')
    parser.add_argument('--render', type=str, choices=['human', 'rgb_array'], default='human',
                        help='render mode: human (window display) or rgb_array (frame generation)')
    parser.add_argument('--save_gif', action='store_true',
                        help='save a GIF animation (only valid in rgb_array mode)')
    args = parser.parse_args()
    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Create the environment
    try:
        env = gym.make('CartPole-v1', render_mode=args.render)
    except gym.error.Error as e:
        print(f"Failed to create environment: {str(e)}")
        print("Please check that: 1. the latest gym library is installed 2. the environment name is correct")
        exit(1)
    # Load the model
    model = load_model(args.model, device)
    # Run the tests
    total_rewards = []
    best_frames = []
    max_reward = 0
    for ep in range(args.episodes):
        print(f"\n=== Test episode {ep + 1}/{args.episodes} ===")
        reward, frames = run_episode(env, model)
        total_rewards.append(reward)
        # Keep the frames of the best episode
        if reward > max_reward and args.render == 'rgb_array':
            max_reward = reward
            best_frames = frames
    # Print summary statistics
    print("\n=== Test results ===")
    print(f"Average reward: {np.mean(total_rewards):.1f} ± {np.std(total_rewards):.1f}")
    print(f"Best reward: {max(total_rewards)}")
    print(f"Worst reward: {min(total_rewards)}")
    # Save the best episode as an animation
    if args.save_gif and args.render == 'rgb_array' and len(best_frames) > 0:
        save_gif(best_frames, "cartpole_demo.gif")
    elif args.save_gif and args.render != 'rgb_array':
        print("Warning: --save_gif only takes effect in rgb_array render mode")
    env.close()


if __name__ == '__main__':
    main()
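Assuming the script is saved as test_cartpole.py (as the comment at its top suggests) and the trained weights sit at the default path written by the training script, it can be invoked with the flags defined in its argument parser, for example:

python test_cartpole.py
python test_cartpole.py --episodes 10 --render rgb_array --save_gif

The second form generates frames instead of opening a window and saves the best episode as cartpole_demo.gif.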
Running the trained model, we print the number of steps in each demonstration episode and the reward it obtained, as shown below:
Episode finished, steps: 500, total reward: 500.0
=== Test episode 2/5 ===
Episode finished, steps: 500, total reward: 500.0
=== Test episode 3/5 ===
Episode finished, steps: 500, total reward: 500.0
=== Test episode 4/5 ===
Episode finished, steps: 500, total reward: 500.0
=== Test episode 5/5 ===
Episode finished, steps: 500, total reward: 500.0
=== Test results ===
Average reward: 500.0 ± 0.0
Best reward: 500.0
Worst reward: 500.0
During this process, we can also comment out the line that loads the trained weights and observe how the policy handles CartPole in that untrained state; the resulting behavior is shown in Figure 14-3. Readers are encouraged to run the code themselves; a minimal sketch of this comparison is given after the figure.
Figure 14-3 Operating CartPole
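As a rough way to see what "untrained" looks like, the sketch below (an illustrative example, not one of the book's scripts) evaluates a freshly initialized PolicyNet greedily for a few episodes; uncommenting the load_state_dict line restores the trained behavior:

import gym
import torch


# Same PolicyNet structure as in the training and test scripts
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, 128)
        self.fc2 = torch.nn.Linear(128, action_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return torch.softmax(self.fc2(x), dim=1)


env = gym.make('CartPole-v1')
policy = PolicyNet(state_dim=4, action_dim=2)
# Uncomment the next line to evaluate the trained model instead of random weights:
# policy.load_state_dict(torch.load("./grpo_cartpole_policy_update_final.pth"))
policy.eval()

for ep in range(3):
    state, _ = env.reset()
    total_reward, done = 0.0, False
    while not done:
        with torch.no_grad():
            probs = policy(torch.FloatTensor(state).unsqueeze(0))
        action = torch.argmax(probs).item()  # greedy action
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        done = terminated or truncated
    # With random weights the episode usually ends far earlier than 500 steps
    print(f"Episode {ep + 1}: total reward = {total_reward:.1f}")

env.close()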