2. Policy Gradient Methods
The goal is to keep updating the policy so that it achieves a higher return.
For every sampled trajectory we compute its return and its probability under the policy.
The objective is to make high-return trajectories more probable, so that the expected return of the whole policy also increases.
What is the expected return of a policy?
Run the policy to generate a number of trajectories, collect their returns, and take the average.
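In symbols (standard notation, not spelled out in the original note): if R(τ) denotes the total return of a trajectory τ obtained by running the policy π_θ, then the expected return is

    J(θ) = E_{τ∼π_θ}[ R(τ) ] ≈ (1/N) · Σ_{i=1..N} R(τ_i)

i.e. sample N trajectories with the current policy and average their returns.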
2.1 The main idea behind the policy gradient
# 1. Sample one complete episode
log_probs = []   # stores log π(a_t|s_t) for every (s_t, a_t) pair
rewards = []     # stores the reward r_t at every time step
while not done:
    action_probs = policy_net(state_tensor)        # π(a|s)
    action = sample_action(action_probs)           # a_t ~ π(·|s_t)
    log_prob = torch.log(action_probs[action])     # log π(a_t|s_t)
    log_probs.append(log_prob)
    next_state, reward, done = env.step(action)
    rewards.append(reward)

# 2. Compute the discounted return G_t for every time step
#    (a sketch of this helper follows the snippet)
discounted_rewards = compute_discounted_rewards(rewards, gamma=0.99)

# 3. Build the policy-gradient loss
policy_loss = []
for log_prob, G_t in zip(log_probs, discounted_rewards):
    policy_loss.append(-log_prob * G_t)  # minus sign because PyTorch optimizers do gradient descent

# 4. Backpropagate and update
total_loss = torch.stack(policy_loss).sum()  # sum the per-step losses
optimizer.zero_grad()
total_loss.backward()   # computes the gradient of the loss (= -∇θ J(θ))
optimizer.step()        # descent step on the loss, equivalent to θ ← θ + α ∇θ J(θ)
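The helper compute_discounted_rewards is not defined in the snippet above; a minimal sketch, consistent with the return computation in the full REINFORCE code of section 2.2:

def compute_discounted_rewards(rewards, gamma=0.99):
    """Discounted return G_t for every time step of one episode."""
    discounted = []
    R = 0.0
    for r in reversed(rewards):   # accumulate from the last step backwards
        R = r + gamma * R         # G_t = r_t + gamma * G_{t+1}
        discounted.insert(0, R)
    return discounted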
2.2 The REINFORCE algorithm
REINFORCE, also known as Monte Carlo policy gradient, is a policy gradient algorithm that updates the policy parameters using estimated returns from complete episodes.
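For reference, the single-episode gradient estimate that the code below implements is the standard REINFORCE one (same notation as above; this formula is not in the original note):

    ∇θ J(θ) ≈ Σ_t G_t · ∇θ log π_θ(a_t|s_t),   with   G_t = r_t + γ·r_{t+1} + γ²·r_{t+2} + …

and the parameters are then updated by gradient ascent, θ ← θ + α · ∇θ J(θ).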
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class PolicyNetwork(nn.Module):
    """
    Policy network: takes a state as input and outputs action probabilities.
    """
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=-1)
        return x
def reinforce(env, policy_net, optimizer, num_episodes=1000, gamma=0.99):
    """
    REINFORCE implementation.

    Args:
        env: the environment (classic gym API: reset() returns the state,
             step() returns (next_state, reward, done, info))
        policy_net: the policy network
        optimizer: the optimizer
        num_episodes: number of training episodes
        gamma: discount factor

    Returns:
        A list with the total reward of every episode.
    """
    episode_rewards = []
    for episode in range(num_episodes):
        state = env.reset()
        log_probs = []
        rewards = []

        # Sample one complete episode
        done = False
        while not done:
            # Convert the state to a tensor
            state_tensor = torch.FloatTensor(state).unsqueeze(0)  # shape: (1, state_dim)
            # Get the action probabilities from the policy network
            action_probs = policy_net(state_tensor)               # shape: (1, action_dim)
            # Sample an action from the probability distribution
            action = torch.multinomial(action_probs, 1).item()
            # Alternatively:
            # dist = torch.distributions.Categorical(action_probs)
            # action = dist.sample().item()
            # Log-probability of the chosen action
            log_prob = torch.log(action_probs.squeeze(0)[action])  # scalar
            # Take the action in the environment
            next_state, reward, done, _ = env.step(action)
            # Store the log-probability and the reward
            log_probs.append(log_prob)
            rewards.append(reward)
            # Move to the next state
            state = next_state

        # Compute the discounted return G_t for every time step
        discounted_rewards = []
        R = 0
        for r in reversed(rewards):
            R = r + gamma * R
            discounted_rewards.insert(0, R)

        # Normalize the returns (reduces variance)
        discounted_rewards = torch.FloatTensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

        # Build the policy-gradient loss
        policy_loss = []
        for log_prob, R in zip(log_probs, discounted_rewards):
            policy_loss.append(-log_prob * R)  # minus sign because we want to maximize the return

        # Backpropagate and update
        optimizer.zero_grad()
        policy_loss = torch.stack(policy_loss).sum()  # scalar
        policy_loss.backward()
        optimizer.step()

        # Record the total (undiscounted) reward of this episode
        episode_rewards.append(sum(rewards))

    return episode_rewards
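A minimal usage sketch (assumptions not in the original note: the gym package with its classic API, the CartPole-v1 environment, and an Adam learning rate of 1e-3):

import gym

env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]   # 4 for CartPole
action_dim = env.action_space.n              # 2 for CartPole

policy_net = PolicyNetwork(state_dim, action_dim)
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)

rewards = reinforce(env, policy_net, optimizer, num_episodes=500)
print("mean reward over the last 50 episodes:", np.mean(rewards[-50:]))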
At first I thought policy_loss itself was the policy gradient, which seemed very odd. It is not: the gradient only appears once backward() differentiates it.
To summarize: the gradient of policy_loss has the opposite sign of the gradient of the objective, so maximizing the objective is equivalent to minimizing policy_loss.
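Written out (same notation as above), the loss built in the code is

    L(θ) = - Σ_t G_t · log π_θ(a_t|s_t),   so   ∇θ L(θ) = - Σ_t G_t · ∇θ log π_θ(a_t|s_t) ≈ -∇θ J(θ)

and one gradient-descent step on L(θ), which is what optimizer.step() performs, is exactly one gradient-ascent step on J(θ): θ ← θ - α·∇θ L(θ) = θ + α·∇θ J(θ).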