Hands-On Automatic Cart-Pole Balancing Control with GRPO


As we know, the PPO-based rocket-recovery example is a classic one. Its implementation shows that, by describing and controlling the whole sequence of actions, we can optimize the entire landing process of the rocket. For reasons of space, the rocket-recovery code is provided in the book's companion resources; before studying this section, readers should run that code and become familiar with the PPO algorithm. This section builds on that classic automatic rocket-recovery example and uses a newer reinforcement learning algorithm, GRPO (Group Relative Policy Optimization), to complete a new reinforcement learning control task.

Setting Up the CartPole Reinforcement Learning Environment

CartPole is a commonly used reinforcement learning environment. In the CartPole scenario there is a cart carrying a pole, and the agent's task is to keep the pole upright by moving the cart left or right. An episode ends when the pole tilts beyond a threshold angle, when the cart drifts too far to the left or right of its starting position, or when the episode reaches the maximum number of frames. In the CartPole-v1 environment the maximum is 500 frames. The CartPole environment is shown in Figure 14-2.

Figure 14-2  The CartPole environment
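Before writing any training code, it can be useful to confirm the environment's interface. The following is a minimal sketch (an optional check, not part of the main example code) that prints the observation space, the action space, and the episode limit of CartPole-v1:

import gym

env = gym.make('CartPole-v1')
print(env.observation_space)        # Box with shape (4,): the state is a 4-dimensional vector
print(env.action_space)             # Discrete(2): 0 = push the cart left, 1 = push the cart right
print(env.spec.max_episode_steps)   # 500 for CartPole-v1
env.close()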

The following is a short program we wrote to demonstrate the CartPole environment:

import gym

def main():
    env = gym.make('CartPole-v1', render_mode="human")
    for i_episode in range(20):
        observation, info = env.reset()          # reset returns (observation, info)
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()   # pick a random action
            observation, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()

if __name__ == "__main__":
    main()
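Each observation printed inside the loop is a 4-element vector describing the physical state of the system. As a quick reference, the short sketch below (an illustrative snippet, with component meanings taken from the standard CartPole documentation) unpacks one observation:

import gym

env = gym.make('CartPole-v1')
observation, info = env.reset()
cart_position, cart_velocity, pole_angle, pole_angular_velocity = observation
print(f"cart position:         {cart_position:.3f}")        # distance from the centre of the track
print(f"cart velocity:         {cart_velocity:.3f}")
print(f"pole angle:            {pole_angle:.3f}")            # radians from vertical
print(f"pole angular velocity: {pole_angular_velocity:.3f}")
env.close()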

Training the CartPole Model with GRPO

Next, we use a GRPO-based reinforcement learning scheme to train the CartPole model. The code is as follows:

# -*- coding: utf-8 -*-
"""
GRPO (Group Relative Policy Optimization) implementation
Environment: CartPole-v1
Purpose: train a policy network to balance the pole on the cart
"""

# Standard utilities
import time  # timing statistics
from tqdm import tqdm  # progress bar
import matplotlib.pyplot as plt  # result visualization

# PyTorch and related libraries
import torch
from torch.nn import functional as F  # neural-network functions
import gym  # reinforcement learning environments
from torch.distributions import Categorical  # sampling from categorical distributions
import numpy as np  # numerical computing

class PolicyNet(torch.nn.Module):
    """Policy network definition"""

    def __init__(self, state_dim, action_dim):
        """
        Initialize the policy network structure
        :param state_dim: state dimension (4 for CartPole)
        :param action_dim: action dimension (2 for CartPole)
        """
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, 128)  # first fully connected layer
        self.fc2 = torch.nn.Linear(128, action_dim)  # second fully connected layer

    def forward(self, state):
        """
        Forward pass computing action probabilities
        :param state: input states [batch_size, state_dim]
        :return: action probability distribution [batch_size, action_dim]
        """
        x = torch.nn.functional.relu(self.fc1(state))  # ReLU activation
        logits = self.fc2(x)  # unnormalized action scores
        return F.softmax(logits, dim=1)  # convert to a probability distribution


def collect_trajectory_vectorized(envs, policy_net, trajectory_max_steps=500, device="cpu"):
    """
    Collect trajectory data from a set of parallel environments
    :param envs: vectorized environment object
    :param policy_net: policy network instance
    :param trajectory_max_steps: maximum length of a single trajectory
    :param device: computing device (cpu/cuda)
    :return: (trajectory data dictionary, total reward of each environment)
    """
    group_size = envs.num_envs  # number of parallel environments
    seed_num = np.random.randint(0, 1000)  # random seed
    states, _ = envs.reset(seed=[seed_num] * group_size)  # reset the environments

    # Initialize storage containers
    all_states = []  # state sequence [T, group_size, state_dim]
    all_actions = []  # action sequence [T, group_size]
    all_log_probs = []  # log probabilities [T, group_size]
    all_rewards = torch.zeros(group_size)  # accumulated rewards [group_size]
    all_dones = torch.tensor([False] * group_size)  # termination flags [group_size]

    # Trajectory collection loop
    for t in range(trajectory_max_steps):
        # Convert states to a tensor
        states_tensor = torch.tensor(states, dtype=torch.float32, device=device)

        # Compute action probabilities
        probs = policy_net(states_tensor)  # [group_size, action_dim]
        dist = Categorical(probs)  # build a categorical distribution
        actions = dist.sample()  # sample actions [group_size]
        log_probs = dist.log_prob(actions).detach()  # log probabilities [group_size]

        # Interact with the environments
        next_states, rewards, terminated, truncated, infos = envs.step(actions.cpu().numpy())
        dones = np.logical_or(terminated, truncated)  # merge the termination conditions

        # Store the data
        all_states.append(states)
        all_actions.append(actions)
        all_log_probs.append(log_probs)
        all_dones[dones] = True  # update the termination flags

        # Reward processing: zero out finished environments + position penalty
        rewards[all_dones.numpy()] = 0  # zero the rewards of finished environments
        rewards += -abs(next_states[:, 0])  # penalize horizontal distance from the centre
        all_rewards += torch.tensor(rewards, dtype=torch.float32)  # accumulate rewards

        # Update the states
        states = next_states

        # Early exit: all environments have terminated
        if torch.all(all_dones):
            break

    # Post-processing: normalize the rewards and organize the data
    normalized_rewards = (all_rewards / trajectory_max_steps).to(device)  # reward normalization
    all_states = torch.tensor(np.array(all_states), dtype=torch.float32).permute(1, 0, 2).to(device)  # [group_size, T, state_dim]
    all_log_probs = torch.stack(all_log_probs).permute(1, 0).to(device)  # [group_size, T]
    all_actions = torch.stack(all_actions).permute(1, 0).to(device)  # [group_size, T]

    # Pack the trajectory data
    trajectories = {
        "all_states": all_states,
        "all_log_probs": all_log_probs,
        "all_actions": all_actions,
        "normalized_rewards": normalized_rewards
    }
    episode_rewards = normalized_rewards * trajectory_max_steps  # recover the actual rewards

    return trajectories, episode_rewards


def calc_advantages_with_grpo(trajectories):
    """
    Compute standardized advantage values
    :param trajectories: trajectory data dictionary
    :return: standardized advantages [group_size]
    """
    rewards = trajectories["normalized_rewards"]  # extract the normalized rewards
    mean_reward = torch.mean(rewards)        # group mean
    std_reward = torch.std(rewards) + 1e-8  # group standard deviation (avoid division by zero)
    advantages = (rewards - mean_reward) / std_reward  # standardize
    return advantages
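
# Note: this function is the core idea of GRPO. Instead of training a separate
# value (critic) network as PPO does, the advantage of each trajectory is
# estimated relative to its own group of parallel rollouts:
#     A_i = (r_i - mean(r_1..r_G)) / (std(r_1..r_G) + 1e-8)
# Every time step of trajectory i then shares this single scalar advantage.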


def grpo_update(trajectories, net, optimizer, n_iterations=20, eps=0.2):
    """
    GRPO policy update
    :param trajectories: trajectory data dictionary
    :param net: policy network
    :param optimizer: optimizer
    :param n_iterations: number of policy update iterations
    :param eps: clipping threshold (as in PPO)
    :return: average loss of the final iteration
    """
    # Compute standardized advantages [group_size, 1]
    advantages = calc_advantages_with_grpo(trajectories).unsqueeze(-1)

    # Unpack the trajectory data
    all_states = trajectories["all_states"]  # [group_size, T, state_dim]
    all_log_probs = trajectories["all_log_probs"]  # [group_size, T]
    all_chosen_actions = trajectories["all_actions"]  # [group_size, T]
    batch_size = len(all_states)  # group_size

    # Multiple rounds of policy optimization
    for i_iter in range(n_iterations):
        loss = 0.0

        # Iterate over the trajectory of each parallel environment
        for i in range(batch_size):
            # Extract the data of a single trajectory
            states = all_states[i]  # [T, state_dim]
            log_probs = all_log_probs[i]  # [T]
            chosen_actions = all_chosen_actions[i]  # [T]
            advantage = advantages[i]  # [1]

            # Log probabilities under the new policy
            new_log_probs = torch.log(net(states).gather(1, chosen_actions.unsqueeze(1)))  # [T, 1]

            # Probability ratio (importance sampling ratio)
            ratio = torch.exp(new_log_probs - log_probs.unsqueeze(1))  # [T, 1]

            # Surrogate losses
            surr1 = ratio * advantage  # unclipped term
            surr2 = torch.clamp(ratio, 1 - eps, 1 + eps) * advantage  # clipped term
            trajectory_loss = torch.mean(-torch.min(surr1, surr2))  # take the minimum

            loss += trajectory_loss  # accumulate the loss

        # Average the loss over trajectories
        loss /= batch_size

        # Backpropagate and update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return loss.item()


if __name__ == '__main__':
    """Main training program"""
    # [1] Environment and network initialization
    group_size = 10  # number of parallel environments
    env_name = 'CartPole-v1'  # environment name
    envs = gym.vector.make(env_name, num_envs=group_size)  # create the parallel environments

    # Environment parameters
    state_dim = envs.single_observation_space.shape[0]  # state dimension = 4
    n_actions = envs.single_action_space.n  # number of actions = 2

    # Device configuration
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Initialize the policy network and optimizer
    policy = PolicyNet(state_dim, n_actions).to(device)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.02)  # learning rate 0.02

    # Training parameters
    episode_num = 50  # number of training rounds
    trajectory_max_steps = 500  # maximum length of a single trajectory
    return_list = []  # reward history

    # [2] Main training loop
    start = time.time()
    for i_episode in tqdm(range(episode_num)):
        # [3] Collect trajectory data
        trajectories, episode_rewards = collect_trajectory_vectorized(
            envs, policy, trajectory_max_steps, device=device
        )

        # [4] Policy update
        loss = grpo_update(trajectories, policy, optimizer)

        # [5] Record performance metrics
        avg_reward = sum(episode_rewards) / len(episode_rewards)
        return_list.append(avg_reward.cpu().numpy())

        # Print training information
        print(f'Round {i_episode}, average reward: {avg_reward:.2f}')

    # [6] Post-training processing
    print("Total time (s): ", time.time() - start)

    # Save the model
    save_path = "./grpo_cartpole_policy_update_final.pth"
    torch.save(policy.state_dict(), save_path)
    print(f"Model saved to: {save_path}")

    # Plot the training curve
    plt.figure(figsize=(10, 6))
    plt.plot(return_list)
    plt.xlabel('train epochs')
    plt.ylabel('avg reward')
    plt.title('GRPO on CartPole-v1')
    plt.grid(True)
    plt.show()

    # Close the environments
    envs.close()

In the code above, we first create a PolicyNet to serve as the policy model being trained. GRPO then learns to act by collecting groups of trajectories, uses the resulting rewards to push the policy toward the task objective, and finally stores the trained weights. Readers are encouraged to run the training process themselves.
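To make the update rule explicit, the loss that grpo_update minimizes can be written out as follows. This is simply a restatement of the code above in standard notation, where $G$ is the number of parallel environments, $T_i$ the length of trajectory $i$, and $\varepsilon = 0.2$ the clipping threshold:

$$
A_i = \frac{r_i - \operatorname{mean}(r_1,\dots,r_G)}{\operatorname{std}(r_1,\dots,r_G) + 10^{-8}},
\qquad
\rho_{i,t} = \frac{\pi_\theta(a_{i,t}\mid s_{i,t})}{\pi_{\theta_{\mathrm{old}}}(a_{i,t}\mid s_{i,t})}
$$

$$
L(\theta) = -\frac{1}{G}\sum_{i=1}^{G}\frac{1}{T_i}\sum_{t=1}^{T_i}
\min\!\bigl(\rho_{i,t}\,A_i,\ \operatorname{clip}(\rho_{i,t},\,1-\varepsilon,\,1+\varepsilon)\,A_i\bigr)
$$

Because the advantage $A_i$ comes from the group statistics of the parallel rollouts rather than from a learned value function, GRPO needs only the single PolicyNet defined above; this is its main practical difference from the PPO setup used in the rocket-recovery example.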

Demonstrating the CartPole Model Trained with GRPO

After training is complete, we need to demonstrate the GRPO-trained CartPole model to verify the training task. The code is as follows:

# test_cartpole.py
import gym
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from argparse import ArgumentParser


# Policy network definition (must match the training code exactly)
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, 128)
        self.fc2 = torch.nn.Linear(128, action_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return torch.softmax(self.fc2(x), dim=1)


def load_model(model_path, device='cpu'):
    """Load the trained model"""
    # Initialize the network structure
    model = PolicyNet(state_dim=4, action_dim=2)

    try:
        # Load the trained weights
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        print(f"Model loaded successfully: {model_path}")
        return model
    except Exception as e:
        print(f"Failed to load model: {str(e)}")
        exit(1)


def run_episode(env, model, max_steps=500, render=True):
    """Run a single test episode"""
    state, _ = env.reset()
    total_reward = 0
    frames = []

    for step in range(max_steps):
        if render:
            frame = env.render()
            if env.render_mode == 'rgb_array':
                frames.append(frame)

        # Select an action with the model
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = model(state_tensor)
        action = torch.argmax(action_probs).item()

        # Execute the action
        next_state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        state = next_state

        if terminated or truncated:
            print(f"Episode finished, steps: {step + 1}, total reward: {total_reward:.1f}")
            break

    return total_reward, frames


def save_gif(frames, filename, fps=30):
    """Save the frames as a GIF animation"""
    plt.figure(figsize=(6, 4))
    plt.axis('off')
    ims = [[plt.imshow(frame, animated=True)] for frame in frames]
    ani = animation.ArtistAnimation(plt.gcf(), ims, interval=50, blit=True)
    ani.save(filename, writer='pillow', fps=fps)
    print(f"Animation saved to: {filename}")


def main():
    # Parse command-line arguments
    parser = ArgumentParser(description='CartPole test program')
    parser.add_argument('--model', type=str, default='./grpo_cartpole_policy_update_final.pth',
                        help='Model file path (default: ./grpo_cartpole_policy_update_final.pth)')
    parser.add_argument('--episodes', type=int, default=5,
                        help='Number of test episodes (default: 5)')
    parser.add_argument('--render', type=str, choices=['human', 'rgb_array'], default='human',
                        help='Render mode: human (window display) or rgb_array (frame generation)')
    parser.add_argument('--save_gif', action='store_true',
                        help='Save a GIF animation (only valid in rgb_array mode)')
    args = parser.parse_args()

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the environment
    try:
        env = gym.make('CartPole-v1', render_mode=args.render)
    except gym.error.Error as e:
        print(f"Failed to create environment: {str(e)}")
        print("Please make sure that: 1. a recent gym version is installed  2. the environment name is correct")
        exit(1)

    # Load the model
    model = load_model(args.model, device)

    # Run the tests
    total_rewards = []
    best_frames = []
    max_reward = 0

    for ep in range(args.episodes):
        print(f"\n=== Test episode {ep + 1}/{args.episodes} ===")
        reward, frames = run_episode(env, model)
        total_rewards.append(reward)

        # Record the best-performing episode
        if reward > max_reward and args.render == 'rgb_array':
            max_reward = reward
            best_frames = frames

    # Print summary statistics
    print("\n=== Test results ===")
    print(f"Average reward: {np.mean(total_rewards):.1f} ± {np.std(total_rewards):.1f}")
    print(f"Best reward: {max(total_rewards)}")
    print(f"Worst reward: {min(total_rewards)}")

    # Save the animation of the best episode
    if args.save_gif and args.render == 'rgb_array' and len(best_frames) > 0:
        save_gif(best_frames, "cartpole_demo.gif")
    elif args.save_gif and args.render != 'rgb_array':
        print("Warning: --save_gif only takes effect in rgb_array render mode")

    env.close()


if __name__ == '__main__':
    main()

Running the trained model in this demonstration, we print the number of steps in each episode and the reward it obtained, as shown below:

Episode finished, steps: 500, total reward: 500.0

=== Test episode 2/5 ===
Episode finished, steps: 500, total reward: 500.0

=== Test episode 3/5 ===
Episode finished, steps: 500, total reward: 500.0

=== Test episode 4/5 ===
Episode finished, steps: 500, total reward: 500.0

=== Test episode 5/5 ===
Episode finished, steps: 500, total reward: 500.0

=== Test results ===
Average reward: 500.0 ± 0.0
Best reward: 500.0
Worst reward: 500.0

During this process, we can comment out the line that loads the trained weights and observe how the model operates the CartPole at different stages of training, as illustrated in Figure 14-3. Readers are encouraged to try running the code themselves.

Figure 14-3  Operating the CartPole
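As a quick way to see that contrast without editing test_cartpole.py, the following is a minimal sketch (an illustrative helper, not part of the book's code) that compares the average episode length of the trained policy with that of a randomly initialized one. It assumes the PolicyNet class and the saved weight file from the previous listings are available:

import gym
import torch

def average_steps(model, episodes=10, max_steps=500):
    """Run greedy rollouts and return the average number of steps survived."""
    env = gym.make('CartPole-v1')
    steps_per_episode = []
    for _ in range(episodes):
        state, _ = env.reset()
        for step in range(max_steps):
            with torch.no_grad():
                probs = model(torch.FloatTensor(state).unsqueeze(0))
            action = torch.argmax(probs).item()
            state, reward, terminated, truncated, _ = env.step(action)
            if terminated or truncated:
                break
        steps_per_episode.append(step + 1)
    env.close()
    return sum(steps_per_episode) / len(steps_per_episode)

untrained = PolicyNet(state_dim=4, action_dim=2)   # random weights, no loading
trained = PolicyNet(state_dim=4, action_dim=2)
trained.load_state_dict(torch.load('./grpo_cartpole_policy_update_final.pth'))
trained.eval()

print("untrained policy, average steps:", average_steps(untrained))
print("trained policy,   average steps:", average_steps(trained))

A randomly initialized policy typically fails within a few dozen steps, while the GRPO-trained policy should reach the 500-step limit, matching the test output shown above.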