9.4.2 Hands-On Practice: Evaluating the Performance of AWAC, IQL, and ORL
The following example compares three algorithms in the MountainCarContinuous environment: AWAC (Advantage Weighted Actor-Critic), IQL (Implicit Q-Learning), and ORL (an offline-to-online transfer framework). The program first generates an offline dataset (used by ORL for pre-training) and then trains and evaluates the three algorithms. AWAC optimizes its policy through advantage-weighted behavior cloning, IQL relies on implicit policy updates with an expectile regression loss, and ORL combines offline pre-training with online fine-tuning. Finally, the training process and the evaluation results are visualized to show how the three algorithms differ in reward acquisition and overall performance.
Example 9-3: Evaluating the performance of the AWAC, IQL, and ORL methods (source path: codes\9\Mo.py)
The implementation of the example file Mo.py is described step by step below.
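Before walking through the steps, note that the listing relies on a set of imports and on a helper function gym_api_compatibility that are not shown in the excerpt. The following is a minimal sketch of what they could look like (an assumption for completeness; the actual definitions in Mo.py may differ). The helper returns a reset_fn and a step_fn that hide the differences between the old 4-tuple and the new 5-tuple Gym step APIs.
import random
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def gym_api_compatibility(env):
    # Returns (reset_fn, step_fn); both take the environment again so they
    # match the call sites reset_fn(env) and step_fn(env, action) used below.
    def reset_fn(env):
        result = env.reset()
        # New API returns (observation, info); old API returns the observation only
        return result[0] if isinstance(result, tuple) else result
    def step_fn(env, action):
        result = env.step(action)
        if len(result) == 5:
            # New API: obs, reward, terminated, truncated, info
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, terminated or truncated
        # Old API: obs, reward, done, info
        next_state, reward, done, _ = result
        return next_state, reward, done
    return reset_fn, step_fn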
(1) Define the class PrioritizedReplayBuffer, which implements a prioritized experience replay buffer for storing and sampling training data. Samples are drawn according to their priority (TD error): the higher the priority, the more likely a sample is to be selected. The buffer also adjusts the importance-sampling weights dynamically and updates the priorities of the sampled transitions after each sampling step.
# Prioritized experience replay buffer
class PrioritizedReplayBuffer:
def __init__(self, capacity=100000, alpha=0.6, beta=0.4, beta_increment=0.001):
self.capacity = capacity
self.buffer = []
self.position = 0
self.priorities = np.zeros((capacity,), dtype=np.float32)
self.alpha = alpha
self.beta = beta
self.beta_increment = beta_increment
self.max_priority = 1.0
def add(self, state, action, reward, next_state, done):
max_priority = self.max_priority if len(self.buffer) < self.capacity else np.max(self.priorities)
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.priorities[self.position] = max_priority
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
if len(self.buffer) == 0:
return None
priorities = self.priorities[:len(self.buffer)]
probs = priorities ** self.alpha
probs /= probs.sum()
indices = np.random.choice(len(self.buffer), batch_size, p=probs)
samples = [self.buffer[idx] for idx in indices]
total = len(self.buffer)
weights = (total * probs[indices]) ** (-self.beta)
weights /= weights.max()
weights = np.array(weights, dtype=np.float32)
state, action, reward, next_state, done = map(np.stack, zip(*samples))
return state, action, reward, next_state, done, indices, weights
def update_priorities(self, indices, priorities):
for idx, priority in zip(indices, priorities):
self.priorities[idx] = priority
self.max_priority = max(self.max_priority, priority)
self.beta = min(1.0, self.beta + self.beta_increment)
def __len__(self):
return len(self.buffer)
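The buffer can be exercised on its own with a few dummy transitions (an illustrative snippet, not part of Mo.py): transitions are added, a batch is sampled together with its importance-sampling weights, and the priorities of the sampled indices are then refreshed with surrogate TD errors.
buffer = PrioritizedReplayBuffer(capacity=1000)
for _ in range(300):
    s = np.random.randn(2).astype(np.float32)                # toy 2-dimensional state
    a = np.random.uniform(-1, 1, size=1).astype(np.float32)  # toy 1-dimensional action
    buffer.add(s, a, 0.0, s, False)                          # dummy reward / next_state / done
state, action, reward, next_state, done, indices, weights = buffer.sample(batch_size=64)
print(state.shape, weights.max())                            # (64, 2); weights are normalized to at most 1.0
buffer.update_priorities(indices, np.abs(np.random.randn(64)) + 1e-6)  # pretend TD errors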
(2) Define the class Actor, which implements the actor network that generates the policy. It is a neural network that maps a state to an action; a Tanh activation bounds the raw output to [-1, 1], which is then scaled by max_action to match the environment's action range.
# Optimized neural network models
class Actor(nn.Module):
def __init__(self, state_dim, action_dim, max_action):
super(Actor, self).__init__()
self.layers = nn.Sequential(
nn.Linear(state_dim, 256),
nn.ReLU(),
nn.Linear(256, 256),
nn.ReLU(),
nn.Linear(256, action_dim),
nn.Tanh()
)
self.max_action = max_action
def forward(self, state):
return self.layers(state) * self.max_action
(3) Define the class Critic, which implements the critic network that evaluates action values. It contains two sub-networks (layer1 and layer2) that take a state-action pair as input and output two Q-values, used for computing the target Q-value and the current Q-value.
class Critic(nn.Module):
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
self.layer1 = nn.Sequential(
nn.Linear(state_dim + action_dim, 256),
nn.ReLU(),
nn.Linear(256, 256),
nn.ReLU(),
nn.Linear(256, 1)
)
self.layer2 = nn.Sequential(
nn.Linear(state_dim + action_dim, 256),
nn.ReLU(),
nn.Linear(256, 256),
nn.ReLU(),
nn.Linear(256, 1)
)
def forward(self, state, action):
sa = torch.cat([state, action], 1)
q1 = self.layer1(sa)
q2 = self.layer2(sa)
return q1, q2
def Q1(self, state, action):
sa = torch.cat([state, action], 1)
return self.layer1(sa)
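A quick shape check (again an illustrative snippet, not part of Mo.py) confirms the two interfaces, using MountainCarContinuous's dimensions state_dim=2, action_dim=1 and max_action=1.0:
actor = Actor(state_dim=2, action_dim=1, max_action=1.0).to(device)
critic = Critic(state_dim=2, action_dim=1).to(device)
s = torch.randn(8, 2).to(device)      # batch of 8 states
a = actor(s)                          # actions bounded to [-1, 1]
q1, q2 = critic(s, a)
print(a.shape, q1.shape, q2.shape)    # torch.Size([8, 1]) for each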
(4) Define the class AWAC, which implements the AWAC (Advantage Weighted Actor-Critic) algorithm. It combines the actor and critic networks, updates the policy with an advantage-weighted behavior-cloning objective, applies soft updates to the target network, and uses the prioritized replay buffer to improve training.
class AWAC:
def __init__(self, state_dim, action_dim, max_action,
beta=0.5, tau=0.005, gamma=0.995, lr=1e-4):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
self.max_action = max_action
self.beta = beta
self.tau = tau
self.gamma = gamma
self.replay_buffer = PrioritizedReplayBuffer()
        self.exploration = 0.5  # initial exploration rate
        self.exploration_decay = 0.995  # exploration rate decay
def select_action(self, state, eval=False):
state = torch.FloatTensor(state.reshape(1, -1)).to(device)
with torch.no_grad():
action = self.actor(state)
if not eval and random.random() < self.exploration:
action = action + torch.FloatTensor(np.random.normal(0, self.exploration, action.shape)).to(device)
action = torch.clamp(action, -self.max_action, self.max_action)
return action.cpu().numpy().flatten()
def update_exploration(self):
self.exploration = max(0.01, self.exploration * self.exploration_decay)
def train(self, batch_size=256):
if len(self.replay_buffer) < batch_size:
return
data = self.replay_buffer.sample(batch_size)
if data is None:
return
state, action, reward, next_state, done, indices, weights = data
state = torch.FloatTensor(state).to(device)
action = torch.FloatTensor(action).to(device)
reward = torch.FloatTensor(reward.reshape(-1, 1)).to(device)
next_state = torch.FloatTensor(next_state).to(device)
done = torch.FloatTensor(done.reshape(-1, 1)).to(device)
weights = torch.FloatTensor(weights.reshape(-1, 1)).to(device)
        # Update the critic
with torch.no_grad():
next_action = self.actor(next_state)
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
target_Q = torch.min(target_Q1, target_Q2)
target_Q = reward + (1 - done) * self.gamma * target_Q
current_Q1, current_Q2 = self.critic(state, action)
critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
        # Soft-update the target network
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        # Compute the advantage
with torch.no_grad():
V = torch.min(self.critic(state, self.actor(state))[0], self.critic(state, self.actor(state))[1])
Q = torch.min(self.critic(state, action)[0], self.critic(state, action)[1])
advantage = Q - V
        # Update the actor
actor_loss = -torch.mean(advantage.detach() * torch.exp(advantage / self.beta) *
torch.sum(self.actor(state) * action, dim=1, keepdim=True) * weights)
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
        # Update sample priorities
with torch.no_grad():
current_Q1, current_Q2 = self.critic(state, action)
td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
new_priorities = td_errors + 1e-6
self.replay_buffer.update_priorities(indices, new_priorities)
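For reference, the canonical AWAC policy update is an advantage-weighted behavior-cloning step: the policy is pulled toward the actions stored in the buffer, with each sample weighted by exp(A(s, a)/λ). For the deterministic actor used here, a common surrogate replaces the log-likelihood term with a squared error to the buffer action. The sketch below reuses the variable names of the train() method above, but it illustrates the textbook objective rather than the loss actually implemented in Mo.py.
# Illustrative advantage-weighted behavior cloning for a deterministic actor
awr_weights = torch.exp(advantage / self.beta).clamp(max=20.0).detach()  # clipped exponential advantage weights
bc_error = ((self.actor(state) - action) ** 2).sum(dim=1, keepdim=True)  # distance to the dataset action
actor_loss = torch.mean(awr_weights * bc_error * weights)                # 'weights' are the PER importance weights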
(5) Define the class IQL, which implements the IQL (Implicit Q-Learning) algorithm. It optimizes the policy through implicit policy updates and an expectile regression loss, applies soft updates to the target network, and uses the prioritized replay buffer to improve training.
class IQL:
def __init__(self, state_dim, action_dim, max_action,
tau=0.005, gamma=0.995, alpha=0.5, expectile=0.8, lr=1e-4):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
self.max_action = max_action
self.tau = tau
self.gamma = gamma
self.alpha = alpha
self.expectile = expectile
self.replay_buffer = PrioritizedReplayBuffer()
self.exploration = 0.5
self.exploration_decay = 0.995
def select_action(self, state, eval=False):
state = torch.FloatTensor(state.reshape(1, -1)).to(device)
with torch.no_grad():
action = self.actor(state)
if not eval and random.random() < self.exploration:
action = action + torch.FloatTensor(np.random.normal(0, self.exploration, action.shape)).to(device)
action = torch.clamp(action, -self.max_action, self.max_action)
return action.cpu().numpy().flatten()
def update_exploration(self):
self.exploration = max(0.01, self.exploration * self.exploration_decay)
def train(self, batch_size=256):
if len(self.replay_buffer) < batch_size:
return
data = self.replay_buffer.sample(batch_size)
if data is None:
return
state, action, reward, next_state, done, indices, weights = data
state = torch.FloatTensor(state).to(device)
action = torch.FloatTensor(action).to(device)
reward = torch.FloatTensor(reward.reshape(-1, 1)).to(device)
next_state = torch.FloatTensor(next_state).to(device)
done = torch.FloatTensor(done.reshape(-1, 1)).to(device)
weights = torch.FloatTensor(weights.reshape(-1, 1)).to(device)
        # Update the critic
with torch.no_grad():
next_action = self.actor(next_state)
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
target_Q = torch.min(target_Q1, target_Q2)
target_Q = reward + (1 - done) * self.gamma * target_Q
current_Q1, current_Q2 = self.critic(state, action)
critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
        # Soft-update the target network
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        # Implicit policy evaluation in IQL
policy_action = self.actor(state)
policy_Q1, policy_Q2 = self.critic(state, policy_action)
policy_Q = torch.min(policy_Q1, policy_Q2)
with torch.no_grad():
behavior_Q1, behavior_Q2 = self.critic(state, action)
behavior_Q = torch.min(behavior_Q1, behavior_Q2)
        # Expectile regression loss
delta = policy_Q - behavior_Q
expectile_loss = torch.mean(torch.where(
delta > 0,
self.expectile * delta ** 2,
(self.expectile - 1) * delta ** 2
) * weights)
        # Update the actor
actor_loss = -torch.mean(policy_Q * weights) + self.alpha * expectile_loss
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
        # Update sample priorities
with torch.no_grad():
current_Q1, current_Q2 = self.critic(state, action)
td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
new_priorities = td_errors + 1e-6
self.replay_buffer.update_priorities(indices, new_priorities)
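Note that this implementation applies the expectile loss directly as a policy regularizer. In the original IQL formulation, expectile regression is instead used to fit a separate state-value network V(s) against Q(s, a), and the policy is then extracted by advantage-weighted regression. A compressed sketch of that value-fitting step is shown below; it assumes an additional value network self.value_net (state in, scalar out) that Mo.py does not define.
# Sketch of the original IQL value update: fit V(s) to the tau-expectile of Q(s, a)
with torch.no_grad():
    target_q = torch.min(*self.critic_target(state, action))  # conservative Q estimate for the dataset action
v = self.value_net(state)                                     # hypothetical ValueNet: maps a state to a scalar
diff = target_q - v
value_loss = torch.mean(torch.where(diff > 0,
                                    self.expectile * diff ** 2,
                                    (1 - self.expectile) * diff ** 2))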
(6) Define the class ORL, which implements an offline-to-online (ORL) transfer framework. The class combines offline pre-training with online fine-tuning: it first pre-trains on an offline dataset and then fine-tunes in the online environment, again using prioritized replay buffers to improve training.
class ORL:
def __init__(self, state_dim, action_dim, max_action,
tau=0.005, gamma=0.995, lr=1e-4, offline_epochs=200):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
self.max_action = max_action
self.tau = tau
self.gamma = gamma
self.offline_epochs = offline_epochs
self.offline_buffer = PrioritizedReplayBuffer()
self.online_buffer = PrioritizedReplayBuffer()
self.device = device
self.exploration = 0.5
self.exploration_decay = 0.997
def select_action(self, state, eval=False):
state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
with torch.no_grad():
action = self.actor(state)
if not eval and random.random() < self.exploration:
action = action + torch.FloatTensor(np.random.normal(0, self.exploration, action.shape)).to(self.device)
action = torch.clamp(action, -self.max_action, self.max_action)
return action.cpu().numpy().flatten()
def update_exploration(self):
self.exploration = max(0.01, self.exploration * self.exploration_decay)
def offline_train(self, batch_size=256):
if len(self.offline_buffer) < batch_size:
return
print(f"开始离线预训练,共{self.offline_epochs}个epochs...")
for epoch in tqdm(range(self.offline_epochs), leave=False):
data = self.offline_buffer.sample(batch_size)
if data is None:
continue
state, action, reward, next_state, done, indices, weights = data
state = torch.FloatTensor(state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward.reshape(-1, 1)).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
done = torch.FloatTensor(done.reshape(-1, 1)).to(self.device)
weights = torch.FloatTensor(weights.reshape(-1, 1)).to(self.device)
            # Update the critic
with torch.no_grad():
next_action = self.actor(next_state)
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
target_Q = torch.min(target_Q1, target_Q2)
target_Q = reward + (1 - done) * self.gamma * target_Q
current_Q1, current_Q2 = self.critic(state, action)
critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
            # Soft-update the target network
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            # Update the actor
actor_Q = self.critic.Q1(state, self.actor(state))
actor_loss = -torch.mean(actor_Q * weights)
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
            # Update sample priorities
with torch.no_grad():
current_Q1, current_Q2 = self.critic(state, action)
td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
new_priorities = td_errors + 1e-6
self.offline_buffer.update_priorities(indices, new_priorities)
def online_finetune(self, env, num_episodes=300):
print("开始在线微调...")
rewards_history = []
reset_fn, step_fn = gym_api_compatibility(env)
for episode in tqdm(range(num_episodes), leave=False):
state = reset_fn(env)
episode_reward = 0
done = False
steps = 0
while not done and steps < 500:
action = self.select_action(state)
next_state, reward, done = step_fn(env, action)
self.online_buffer.add(state, action, reward, next_state, done)
                # After enough samples are collected, train several times per step (with probability 0.5)
if len(self.online_buffer) >= 500 and random.random() < 0.5:
for _ in range(5):
self.train()
state = next_state
episode_reward += reward
steps += 1
rewards_history.append(episode_reward)
self.update_exploration()
if (episode + 1) % 10 == 0:
avg_reward = np.mean(rewards_history[-10:])
print(f"在线微调 - 回合 {episode + 1}, 平均奖励: {avg_reward:.2f}, 探索率: {self.exploration:.3f}")
return rewards_history
def train(self, batch_size=256):
if len(self.online_buffer) < batch_size:
return
data = self.online_buffer.sample(batch_size)
if data is None:
return
state, action, reward, next_state, done, indices, weights = data
state = torch.FloatTensor(state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward.reshape(-1, 1)).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
done = torch.FloatTensor(done.reshape(-1, 1)).to(self.device)
weights = torch.FloatTensor(weights.reshape(-1, 1)).to(self.device)
        # Update the critic
with torch.no_grad():
next_action = self.actor(next_state)
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
target_Q = torch.min(target_Q1, target_Q2)
target_Q = reward + (1 - done) * self.gamma * target_Q
current_Q1, current_Q2 = self.critic(state, action)
critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
        # Soft-update the target network
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        # Update the actor
actor_Q = self.critic.Q1(state, self.actor(state))
actor_loss = -torch.mean(actor_Q * weights)
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
        # Update sample priorities
with torch.no_grad():
current_Q1, current_Q2 = self.critic(state, action)
td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
new_priorities = td_errors + 1e-6
self.online_buffer.update_priorities(indices, new_priorities)
(7) Define the function generate_offline_data, which generates the offline dataset. It collects transitions in the environment with a random policy and stores them in a prioritized replay buffer for later offline training.
# Generate the offline dataset
def generate_offline_data(env, num_episodes=500):
buffer = PrioritizedReplayBuffer()
reset_fn, step_fn = gym_api_compatibility(env)
for _ in range(num_episodes):
state = reset_fn(env)
done = False
while not done:
action = env.action_space.sample()
next_state, reward, done = step_fn(env, action)
buffer.add(state, action, reward, next_state, done)
state = next_state
return buffer
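Because the data is collected by a purely random policy, the resulting MountainCarContinuous transitions are dominated by small negative rewards (the per-step action penalty), while transitions that actually reach the goal (which pays +100) are rare. A quick inspection of a freshly generated buffer (an optional snippet, not part of Mo.py) makes this easy to verify:
probe_buffer = generate_offline_data(gym.make('MountainCarContinuous-v0'), num_episodes=50)
rewards = np.array([t[2] for t in probe_buffer.buffer if t is not None])
print(f"transitions: {len(rewards)}, mean reward: {rewards.mean():.4f}, "
      f"goal-reaching transitions (reward > 50): {(rewards > 50).sum()}")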
(8) Define the function train_agent, which trains a reinforcement learning agent. Given an agent, an environment, and training parameters, it runs the training loop, records the reward of each episode, and returns the reward history. It also supports an early-stopping mechanism that halts training when the average reward stops improving.
# Optimized training function
def train_agent(agent, env, num_episodes=300, early_stopping=30, min_improvement=0.1):
rewards_history = []
consecutive_no_improvement = 0
best_avg_reward = -float('inf')
reset_fn, step_fn = gym_api_compatibility(env)
    for episode in tqdm(range(num_episodes), desc="Training", leave=False):
state = reset_fn(env)
episode_reward = 0
done = False
while not done:
action = agent.select_action(state)
next_state, reward, done = step_fn(env, action)
agent.replay_buffer.add(state, action, reward, next_state, done)
agent.train()
state = next_state
episode_reward += reward
rewards_history.append(episode_reward)
agent.update_exploration()
        # Average reward over the last 10 episodes
if len(rewards_history) >= 10:
current_avg = np.mean(rewards_history[-10:])
            # Check whether the reward has improved significantly
if current_avg > best_avg_reward + min_improvement:
best_avg_reward = current_avg
consecutive_no_improvement = 0
else:
consecutive_no_improvement += 1
            # Early-stopping check (relaxed condition)
if consecutive_no_improvement >= early_stopping and len(rewards_history) >= 50:
print(f"早停触发,连续{early_stopping}个episode没有显著提升")
break
if episode % 10 == 0 and len(rewards_history) >= 10:
avg_reward = np.mean(rewards_history[-10:])
print(f"训练 - 回合 {episode + 1}, 平均奖励: {avg_reward:.2f}")
return rewards_history
(9) Define the function evaluate_agent, which evaluates an agent's performance. It runs the agent in the environment for a number of episodes, computes the average reward, and returns it. It can optionally render the environment to visualize the agent's behavior.
# Optimized evaluation function
def evaluate_agent(agent, env, num_episodes=20, render=False):
total_rewards = []
reset_fn, step_fn = gym_api_compatibility(env)
for episode in range(num_episodes):
state = reset_fn(env)
episode_reward = 0
done = False
while not done:
if render and episode < 5:
env.render()
time.sleep(0.01)
action = agent.select_action(state, eval=True)
next_state, reward, done = step_fn(env, action)
state = next_state
episode_reward += reward
total_rewards.append(episode_reward)
avg_reward = np.mean(total_rewards)
print(f"平均评估奖励: {avg_reward:.2f}")
return avg_reward
(10) Define the function main, the entry point of this example, which carries out the full training and evaluation workflow. It first generates the offline dataset, initializes the AWAC, IQL, and ORL agents, then trains and evaluates them, and finally visualizes the training process and the evaluation results.
# Main function
def main():
    # Create the environment
env = gym.make('MountainCarContinuous-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]
reset_env, step_env = gym_api_compatibility(env)
    # Generate the offline dataset
    print("Generating the offline dataset...")
offline_buffer = generate_offline_data(env, num_episodes=500)
print(f"离线数据集大小: {len(offline_buffer)}")
    # Initialize the three methods
awac = AWAC(state_dim, action_dim, action_bound)
iql = IQL(state_dim, action_dim, action_bound)
orl = ORL(state_dim, action_dim, action_bound)
    # Copy the offline data into ORL's offline buffer
for data in offline_buffer.buffer:
if data is not None:
state, action, reward, next_state, done = data
orl.offline_buffer.add(state, action, reward, next_state, done)
    # Train AWAC
    print("\nTraining AWAC...")
awac_rewards = train_agent(awac, env, num_episodes=300, early_stopping=30)
    # Train IQL
    print("\nTraining IQL...")
iql_rewards = train_agent(iql, env, num_episodes=300, early_stopping=30)
    # Train ORL: offline pre-training first, then online fine-tuning
    print("\nTraining ORL...")
orl.offline_train()
orl_rewards = orl.online_finetune(env)
    # Evaluate the three methods
    print("\nEvaluating AWAC...")
awac_avg = evaluate_agent(awac, env)
print("\n评估IQL...")
iql_avg = evaluate_agent(iql, env)
print("\n评估ORL...")
orl_avg = evaluate_agent(orl, env)
    # Visualize the training process
plt.figure(figsize=(14, 8))
plt.plot(awac_rewards, label='AWAC', alpha=0.7)
plt.plot(iql_rewards, label='IQL', alpha=0.7)
plt.plot(orl_rewards, label='ORL', alpha=0.7)
    # Compute moving averages
awac_smooth = np.convolve(awac_rewards, np.ones(10) / 10, mode='valid')
iql_smooth = np.convolve(iql_rewards, np.ones(10) / 10, mode='valid')
orl_smooth = np.convolve(orl_rewards, np.ones(10) / 10, mode='valid')
plt.plot(range(9, len(awac_rewards)), awac_smooth, label='AWAC Moving Average', linewidth=2)
plt.plot(range(9, len(iql_rewards)), iql_smooth, label='IQL Moving Average', linewidth=2)
plt.plot(range(9, len(orl_rewards)), orl_smooth, label='ORL Moving Average', linewidth=2)
plt.title('Comparison of Imitation Regularization Methods (MountainCarContinuous)', fontsize=16)
plt.xlabel('Training Episodes', fontsize=14)
plt.ylabel('Rewards', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('training_rewards.png', dpi=300)
plt.show()
    # Visualize the evaluation results
plt.figure(figsize=(10, 6))
methods = ['AWAC', 'IQL', 'ORL']
rewards = [awac_avg, iql_avg, orl_avg]
plt.bar(methods, rewards, color=['#1f77b4', '#ff7f0e', '#2ca02c'])
plt.title('Comparison of Evaluation Rewards', fontsize=16)
plt.ylabel('Average Evaluation Rewards', fontsize=14)
plt.grid(True, alpha=0.3)
for i, v in enumerate(rewards):
plt.text(i, v + 5, f'{v:.2f}', ha='center', fontsize=12)
plt.tight_layout()
plt.savefig('evaluation_rewards.png', dpi=300)
plt.show()
env.close()
if __name__ == "__main__":
main()
After running this example, the following two visualizations are produced.
- Training reward curves: show how the rewards of AWAC, IQL, and ORL evolve during training, as shown in Figure 9-1. The plot contains each algorithm's raw reward curve (awac_rewards, iql_rewards, orl_rewards) together with its moving average (awac_smooth, iql_smooth, orl_smooth).
- Evaluation bar chart: compares the average evaluation rewards of AWAC, IQL, and ORL, as shown in Figure 9-2. Each bar represents one algorithm's average evaluation reward (awac_avg, iql_avg, orl_avg), with the exact value annotated above the bar.
Figure 9-1 Reward curves during training
Figure 9-2 Bar chart of the evaluation results