9.4.2  Hands-On Practice: Evaluating the Performance of the AWAC, IQL, and ORL Methods

The following example compares three algorithms in the MountainCarContinuous environment: AWAC (Advantage Weighted Actor-Critic), IQL (Implicit Q-Learning), and ORL (an offline-to-online transfer framework). The program first generates an offline dataset and then trains and evaluates the three algorithms. AWAC optimizes the policy through advantage-weighted behavior cloning, IQL uses implicit policy updates with an expectile regression loss, and ORL combines offline pre-training with online fine-tuning. Finally, the training process and evaluation results are visualized to show how the three algorithms differ in the rewards they obtain and in overall performance.

Example 9-3: Evaluating the performance of the AWAC, IQL, and ORL methods (source path: codes\9\Mo.py)

The implementation of the example file Mo.py proceeds as follows.
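
The excerpts below cover the classes and functions of Mo.py; the file's preamble — the imports, the device selection, and the gym_api_compatibility helper that the training and evaluation loops rely on — is not reproduced here. The following is a minimal sketch of what such a preamble could look like (the names device and gym_api_compatibility match how they are used later, but the contents of the actual file may differ):

import random
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap reset()/step() so the rest of the code works with both the old and the new Gym API
def gym_api_compatibility(env):
    def reset_fn(e):
        result = e.reset()
        return result[0] if isinstance(result, tuple) else result  # new API returns (obs, info)

    def step_fn(e, action):
        result = e.step(action)
        if len(result) == 5:  # new API: obs, reward, terminated, truncated, info
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, terminated or truncated
        next_state, reward, done, _ = result  # old API: obs, reward, done, info
        return next_state, reward, done

    return reset_fn, step_fn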

(1) Define the class PrioritizedReplayBuffer, which implements a prioritized experience replay buffer for storing and sampling training data. Samples are drawn according to their priorities (TD errors): the higher the priority, the more likely a sample is to be selected. The buffer also anneals the importance-sampling exponent over time, and the priorities of sampled transitions are updated after each training step.

# Prioritized experience replay buffer
class PrioritizedReplayBuffer:
    def __init__(self, capacity=100000, alpha=0.6, beta=0.4, beta_increment=0.001):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        self.priorities = np.zeros((capacity,), dtype=np.float32)
        self.alpha = alpha
        self.beta = beta
        self.beta_increment = beta_increment
        self.max_priority = 1.0

    def add(self, state, action, reward, next_state, done):
        max_priority = self.max_priority if len(self.buffer) < self.capacity else np.max(self.priorities)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.priorities[self.position] = max_priority
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        if len(self.buffer) == 0:
            return None

        priorities = self.priorities[:len(self.buffer)]
        probs = priorities ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        total = len(self.buffer)
        weights = (total * probs[indices]) ** (-self.beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        state, action, reward, next_state, done = map(np.stack, zip(*samples))
        return state, action, reward, next_state, done, indices, weights

    def update_priorities(self, indices, priorities):
        for idx, priority in zip(indices, priorities):
            self.priorities[idx] = priority
            self.max_priority = max(self.max_priority, priority)

        self.beta = min(1.0, self.beta + self.beta_increment)

    def __len__(self):
        return len(self.buffer)
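
As a quick, hypothetical usage sketch (the transition values below are placeholders, not data from the example): transitions are added one at a time, a batch is sampled together with its indices and importance weights, and the absolute TD errors of that batch are written back as the new priorities.

buffer = PrioritizedReplayBuffer(capacity=1000)
for _ in range(300):
    s = np.random.randn(2).astype(np.float32)       # MountainCarContinuous has a 2-dimensional state
    a = np.random.uniform(-1, 1, size=1).astype(np.float32)
    buffer.add(s, a, 0.0, s, False)

state, action, reward, next_state, done, indices, weights = buffer.sample(batch_size=64)
td_errors = np.abs(np.random.randn(64))              # stand-in for real TD errors
buffer.update_priorities(indices, td_errors + 1e-6)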

(2) Define the class Actor, which implements the actor network used to produce the policy. It is a neural network that takes a state as input and outputs an action; a Tanh activation constrains the output to [-1, 1], and the result is multiplied by max_action to match the environment's action range.

# Optimized neural network models
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim),
            nn.Tanh()
        )
        self.max_action = max_action

    def forward(self, state):
        return self.layers(state) * self.max_action

(3) Define the class Critic, which implements the critic network used to estimate the value of actions. It contains two sub-networks (layer1 and layer2) that take a state and an action as input and output two Q-values, which are used to compute the target Q-value and the current Q-value.

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(state_dim + action_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(state_dim + action_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        q1 = self.layer1(sa)
        q2 = self.layer2(sa)
        return q1, q2

    def Q1(self, state, action):
        sa = torch.cat([state, action], 1)
        return self.layer1(sa)

(4) Define the class AWAC, which implements the AWAC (Advantage Weighted Actor-Critic) algorithm. It combines the actor and critic networks, updates the policy via advantage-weighted behavior cloning, maintains a target network through soft updates, and uses the prioritized experience replay buffer to improve training.

class AWAC:
    def __init__(self, state_dim, action_dim, max_action,
                 beta=0.5, tau=0.005, gamma=0.995, lr=1e-4):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

        self.max_action = max_action
        self.beta = beta
        self.tau = tau
        self.gamma = gamma
        self.replay_buffer = PrioritizedReplayBuffer()
        self.exploration = 0.5  # initial exploration rate
        self.exploration_decay = 0.995  # exploration rate decay

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        with torch.no_grad():
            action = self.actor(state)
        if not eval and random.random() < self.exploration:
            action = action + torch.FloatTensor(np.random.normal(0, self.exploration, action.shape)).to(device)
            action = torch.clamp(action, -self.max_action, self.max_action)
        return action.cpu().numpy().flatten()

    def update_exploration(self):
        self.exploration = max(0.01, self.exploration * self.exploration_decay)

    def train(self, batch_size=256):
        if len(self.replay_buffer) < batch_size:
            return

        data = self.replay_buffer.sample(batch_size)
        if data is None:
            return

        state, action, reward, next_state, done, indices, weights = data

        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward.reshape(-1, 1)).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        done = torch.FloatTensor(done.reshape(-1, 1)).to(device)
        weights = torch.FloatTensor(weights.reshape(-1, 1)).to(device)

        # Update the critic
        with torch.no_grad():
            next_action = self.actor(next_state)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target network
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        # Compute the advantage A(s, a) = Q(s, a) - V(s): estimate V(s) with the
        # twin critics evaluated at the current policy's action
        with torch.no_grad():
            pi_Q1, pi_Q2 = self.critic(state, self.actor(state))
            V = torch.min(pi_Q1, pi_Q2)
            data_Q1, data_Q2 = self.critic(state, action)
            Q = torch.min(data_Q1, data_Q2)
            advantage = Q - V

        # Update the actor: weight the alignment between the actor's output and the
        # dataset action by advantage * exp(advantage / beta) (a deterministic-actor
        # variant of advantage-weighted behavior cloning)
        actor_loss = -torch.mean(advantage.detach() * torch.exp(advantage / self.beta) *
                                 torch.sum(self.actor(state) * action, dim=1, keepdim=True) * weights)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update replay priorities using the absolute TD errors
        with torch.no_grad():
            current_Q1, current_Q2 = self.critic(state, action)
            td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
            new_priorities = td_errors + 1e-6
            self.replay_buffer.update_priorities(indices, new_priorities)
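
For reference, the policy objective AWAC is built around can be written as follows; the code above uses a deterministic-actor variant of this weighting, with beta acting as the temperature:

$$\max_{\theta}\;\mathbb{E}_{(s,a)\sim\mathcal{D}}\!\left[\log\pi_{\theta}(a\mid s)\,\exp\!\left(\frac{A(s,a)}{\beta}\right)\right],\qquad A(s,a)=Q(s,a)-V(s)$$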

(5) Define the class IQL, which implements the IQL (Implicit Q-Learning) algorithm. It optimizes the policy through implicit policy updates with an expectile regression loss, maintains a target network through soft updates, and uses the prioritized experience replay buffer to improve training.

class IQL:
    def __init__(self, state_dim, action_dim, max_action,
                 tau=0.005, gamma=0.995, alpha=0.5, expectile=0.8, lr=1e-4):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

        self.max_action = max_action
        self.tau = tau
        self.gamma = gamma
        self.alpha = alpha
        self.expectile = expectile
        self.replay_buffer = PrioritizedReplayBuffer()
        self.exploration = 0.5
        self.exploration_decay = 0.995

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        with torch.no_grad():
            action = self.actor(state)
        if not eval and random.random() < self.exploration:
            action = action + torch.FloatTensor(np.random.normal(0, self.exploration, action.shape)).to(device)
            action = torch.clamp(action, -self.max_action, self.max_action)
        return action.cpu().numpy().flatten()

    def update_exploration(self):
        self.exploration = max(0.01, self.exploration * self.exploration_decay)

    def train(self, batch_size=256):
        if len(self.replay_buffer) < batch_size:
            return

        data = self.replay_buffer.sample(batch_size)
        if data is None:
            return

        state, action, reward, next_state, done, indices, weights = data

        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward.reshape(-1, 1)).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        done = torch.FloatTensor(done.reshape(-1, 1)).to(device)
        weights = torch.FloatTensor(weights.reshape(-1, 1)).to(device)

        # Update the critic
        with torch.no_grad():
            next_action = self.actor(next_state)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target network
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        # IQL-style implicit policy evaluation
        policy_action = self.actor(state)
        policy_Q1, policy_Q2 = self.critic(state, policy_action)
        policy_Q = torch.min(policy_Q1, policy_Q2)

        with torch.no_grad():
            behavior_Q1, behavior_Q2 = self.critic(state, action)
            behavior_Q = torch.min(behavior_Q1, behavior_Q2)

        # Expectile regression loss: tau * delta^2 when delta > 0, (1 - tau) * delta^2 otherwise
        delta = policy_Q - behavior_Q
        expectile_loss = torch.mean(torch.where(
            delta > 0,
            self.expectile * delta ** 2,
            (1 - self.expectile) * delta ** 2
        ) * weights)

        # Update the actor
        actor_loss = -torch.mean(policy_Q * weights) + self.alpha * expectile_loss

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update replay priorities
        with torch.no_grad():
            current_Q1, current_Q2 = self.critic(state, action)
            td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
            new_priorities = td_errors + 1e-6
            self.replay_buffer.update_priorities(indices, new_priorities)
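
The asymmetric (expectile) loss used above can be written as follows, where tau is the expectile parameter and u corresponds to delta = policy_Q - behavior_Q; both branches are non-negative, which is why the second branch uses (1 - tau):

$$L_2^{\tau}(u)=\bigl|\tau-\mathbb{1}(u<0)\bigr|\,u^{2}=\begin{cases}\tau\,u^{2}, & u>0\\ (1-\tau)\,u^{2}, & u\le 0\end{cases}$$

In the original IQL formulation this loss fits a state-value function against Q-value targets; the code here applies the same asymmetric weighting to the gap between the policy action's Q-value and the dataset action's Q-value.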

(6) Define the class ORL, which implements an ORL (offline-to-online) transfer framework. The class combines offline pre-training with online fine-tuning: it first pre-trains on an offline dataset and then fine-tunes in the online environment, using prioritized experience replay buffers (one offline, one online) throughout.

class ORL:
    def __init__(self, state_dim, action_dim, max_action,
                 tau=0.005, gamma=0.995, lr=1e-4, offline_epochs=200):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

        self.max_action = max_action
        self.tau = tau
        self.gamma = gamma
        self.offline_epochs = offline_epochs
        self.offline_buffer = PrioritizedReplayBuffer()
        self.online_buffer = PrioritizedReplayBuffer()
        self.device = device
        self.exploration = 0.5
        self.exploration_decay = 0.997

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        with torch.no_grad():
            action = self.actor(state)
        if not eval and random.random() < self.exploration:
            action = action + torch.FloatTensor(np.random.normal(0, self.exploration, action.shape)).to(self.device)
            action = torch.clamp(action, -self.max_action, self.max_action)
        return action.cpu().numpy().flatten()

    def update_exploration(self):
        self.exploration = max(0.01, self.exploration * self.exploration_decay)

    def offline_train(self, batch_size=256):
        if len(self.offline_buffer) < batch_size:
            return

        print(f"Starting offline pre-training for {self.offline_epochs} epochs...")
        for epoch in tqdm(range(self.offline_epochs), leave=False):
            data = self.offline_buffer.sample(batch_size)
            if data is None:
                continue

            state, action, reward, next_state, done, indices, weights = data

            state = torch.FloatTensor(state).to(self.device)
            action = torch.FloatTensor(action).to(self.device)
            reward = torch.FloatTensor(reward.reshape(-1, 1)).to(self.device)
            next_state = torch.FloatTensor(next_state).to(self.device)
            done = torch.FloatTensor(done.reshape(-1, 1)).to(self.device)
            weights = torch.FloatTensor(weights.reshape(-1, 1)).to(self.device)

            # Update the critic
            with torch.no_grad():
                next_action = self.actor(next_state)
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2)
                target_Q = reward + (1 - done) * self.gamma * target_Q

            current_Q1, current_Q2 = self.critic(state, action)
            critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Soft-update the target network
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            # Update the actor
            actor_Q = self.critic.Q1(state, self.actor(state))
            actor_loss = -torch.mean(actor_Q * weights)

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update replay priorities
            with torch.no_grad():
                current_Q1, current_Q2 = self.critic(state, action)
                td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
                new_priorities = td_errors + 1e-6
                self.offline_buffer.update_priorities(indices, new_priorities)

    def online_finetune(self, env, num_episodes=300):
        print("Starting online fine-tuning...")
        rewards_history = []
        reset_fn, step_fn = gym_api_compatibility(env)

        for episode in tqdm(range(num_episodes), leave=False):
            state = reset_fn(env)
            episode_reward = 0
            done = False
            steps = 0

            while not done and steps < 500:
                action = self.select_action(state)
                next_state, reward, done = step_fn(env, action)

                self.online_buffer.add(state, action, reward, next_state, done)

                # Once enough samples have been collected, run several training updates
                if len(self.online_buffer) >= 500 and random.random() < 0.5:
                    for _ in range(5):
                        self.train()

                state = next_state
                episode_reward += reward
                steps += 1

            rewards_history.append(episode_reward)
            self.update_exploration()

            if (episode + 1) % 10 == 0:
                avg_reward = np.mean(rewards_history[-10:])
                print(f"Online fine-tuning - episode {episode + 1}, average reward: {avg_reward:.2f}, exploration rate: {self.exploration:.3f}")

        return rewards_history

    def train(self, batch_size=256):
        if len(self.online_buffer) < batch_size:
            return

        data = self.online_buffer.sample(batch_size)
        if data is None:
            return

        state, action, reward, next_state, done, indices, weights = data

        state = torch.FloatTensor(state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward.reshape(-1, 1)).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        done = torch.FloatTensor(done.reshape(-1, 1)).to(self.device)
        weights = torch.FloatTensor(weights.reshape(-1, 1)).to(self.device)

        # Update the critic
        with torch.no_grad():
            next_action = self.actor(next_state)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = (nn.MSELoss()(current_Q1, target_Q) + nn.MSELoss()(current_Q2, target_Q)) * weights.mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target network
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        # Update the actor
        actor_Q = self.critic.Q1(state, self.actor(state))
        actor_loss = -torch.mean(actor_Q * weights)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update replay priorities
        with torch.no_grad():
            current_Q1, current_Q2 = self.critic(state, action)
            td_errors = torch.abs(current_Q1 - target_Q).detach().cpu().numpy()
            new_priorities = td_errors + 1e-6
            self.online_buffer.update_priorities(indices, new_priorities)

(7) Define the function generate_offline_data, which generates the offline dataset. It samples transitions in the environment with a random policy and stores them in a prioritized experience replay buffer for later offline training.

# Generate the offline dataset
def generate_offline_data(env, num_episodes=500):
    buffer = PrioritizedReplayBuffer()
    reset_fn, step_fn = gym_api_compatibility(env)
    for _ in range(num_episodes):
        state = reset_fn(env)
        done = False
        while not done:
            action = env.action_space.sample()
            next_state, reward, done = step_fn(env, action)
            buffer.add(state, action, reward, next_state, done)
            state = next_state
    return buffer

(8) Define the function train_agent, which trains a reinforcement learning agent. Given an agent, an environment, and training parameters, it runs the training loop, records the reward of each episode, and returns the reward history. It also implements an early-stopping mechanism that halts training when the average reward stops improving.

# Optimized training function
def train_agent(agent, env, num_episodes=300, early_stopping=30, min_improvement=0.1):
    rewards_history = []
    consecutive_no_improvement = 0
    best_avg_reward = -float('inf')
    reset_fn, step_fn = gym_api_compatibility(env)

    for episode in tqdm(range(num_episodes), desc="Training", leave=False):
        state = reset_fn(env)
        episode_reward = 0
        done = False

        while not done:
            action = agent.select_action(state)
            next_state, reward, done = step_fn(env, action)

            agent.replay_buffer.add(state, action, reward, next_state, done)
            agent.train()

            state = next_state
            episode_reward += reward

        rewards_history.append(episode_reward)
        agent.update_exploration()

        # Average reward over the last 10 episodes
        if len(rewards_history) >= 10:
            current_avg = np.mean(rewards_history[-10:])

            # Check whether the reward has improved significantly
            if current_avg > best_avg_reward + min_improvement:
                best_avg_reward = current_avg
                consecutive_no_improvement = 0
            else:
                consecutive_no_improvement += 1

            # Early-stopping check (with relaxed conditions)
            if consecutive_no_improvement >= early_stopping and len(rewards_history) >= 50:
                print(f"Early stopping triggered: no significant improvement for {early_stopping} consecutive episodes")
                break

        if episode % 10 == 0 and len(rewards_history) >= 10:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Training - episode {episode + 1}, average reward: {avg_reward:.2f}")

    return rewards_history

(9) Define the function evaluate_agent, which evaluates the performance of an agent. Given an agent and an environment, it runs evaluation episodes, computes the average reward, and returns it. It can also render the environment to visualize the agent's behavior.

# Optimized evaluation function
def evaluate_agent(agent, env, num_episodes=20, render=False):
    total_rewards = []
    reset_fn, step_fn = gym_api_compatibility(env)
    for episode in range(num_episodes):
        state = reset_fn(env)
        episode_reward = 0
        done = False
        while not done:
            if render and episode < 5:
                env.render()
                time.sleep(0.01)
            action = agent.select_action(state, eval=True)
            next_state, reward, done = step_fn(env, action)
            state = next_state
            episode_reward += reward
        total_rewards.append(episode_reward)
    avg_reward = np.mean(total_rewards)
    print(f"Average evaluation reward: {avg_reward:.2f}")
    return avg_reward
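
Note that with recent Gym/Gymnasium releases the render mode must be chosen when the environment is created, so watching the agent requires a separate evaluation environment. A hedged usage sketch (assuming gym >= 0.26 semantics; awac stands for any trained agent instance):

eval_env = gym.make('MountainCarContinuous-v0', render_mode='human')
evaluate_agent(awac, eval_env, num_episodes=5, render=True)  # 'awac' is any trained agent
eval_env.close()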

(10) Define the function main, the entry point of this example, which runs the main training and evaluation pipeline. It first generates the offline dataset, initializes the AWAC, IQL, and ORL agents, then trains and evaluates them, and finally visualizes the training process and the evaluation results.

# Main function
def main():
    # Create the environment
    env = gym.make('MountainCarContinuous-v0')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high[0]
    reset_env, step_env = gym_api_compatibility(env)

    # Generate the offline dataset
    print("Generating the offline dataset...")
    offline_buffer = generate_offline_data(env, num_episodes=500)
    print(f"Offline dataset size: {len(offline_buffer)}")

    # Initialize the three methods
    awac = AWAC(state_dim, action_dim, action_bound)
    iql = IQL(state_dim, action_dim, action_bound)
    orl = ORL(state_dim, action_dim, action_bound)

    # Add the offline data to ORL's offline buffer
    for data in offline_buffer.buffer:
        if data is not None:
            state, action, reward, next_state, done = data
            orl.offline_buffer.add(state, action, reward, next_state, done)

    # Train AWAC
    print("\nTraining AWAC...")
    awac_rewards = train_agent(awac, env, num_episodes=300, early_stopping=30)

    # Train IQL
    print("\nTraining IQL...")
    iql_rewards = train_agent(iql, env, num_episodes=300, early_stopping=30)

    # Train ORL: offline pre-training first, then online fine-tuning
    print("\nTraining ORL...")
    orl.offline_train()
    orl_rewards = orl.online_finetune(env)

    # Evaluate the three methods
    print("\nEvaluating AWAC...")
    awac_avg = evaluate_agent(awac, env)

    print("\nEvaluating IQL...")
    iql_avg = evaluate_agent(iql, env)

    print("\nEvaluating ORL...")
    orl_avg = evaluate_agent(orl, env)

    # Visualize the training process
    plt.figure(figsize=(14, 8))
    plt.plot(awac_rewards, label='AWAC', alpha=0.7)
    plt.plot(iql_rewards, label='IQL', alpha=0.7)
    plt.plot(orl_rewards, label='ORL', alpha=0.7)

    # Compute moving averages
    awac_smooth = np.convolve(awac_rewards, np.ones(10) / 10, mode='valid')
    iql_smooth = np.convolve(iql_rewards, np.ones(10) / 10, mode='valid')
    orl_smooth = np.convolve(orl_rewards, np.ones(10) / 10, mode='valid')

    plt.plot(range(9, len(awac_rewards)), awac_smooth, label='AWAC Moving Average', linewidth=2)
    plt.plot(range(9, len(iql_rewards)), iql_smooth, label='IQL Moving Average', linewidth=2)
    plt.plot(range(9, len(orl_rewards)), orl_smooth, label='ORL Moving Average', linewidth=2)

    plt.title('Comparison of AWAC, IQL and ORL (MountainCarContinuous)', fontsize=16)
    plt.xlabel('Training Episodes', fontsize=14)
    plt.ylabel('Rewards', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('training_rewards.png', dpi=300)
    plt.show()

    # Visualize the evaluation results
    plt.figure(figsize=(10, 6))
    methods = ['AWAC', 'IQL', 'ORL']
    rewards = [awac_avg, iql_avg, orl_avg]

    plt.bar(methods, rewards, color=['#1f77b4', '#ff7f0e', '#2ca02c'])
    plt.title('Comparison of Evaluation Rewards', fontsize=16)
    plt.ylabel('Average Evaluation Rewards', fontsize=14)
    plt.grid(True, alpha=0.3)

    for i, v in enumerate(rewards):
        plt.text(i, v + 5, f'{v:.2f}', ha='center', fontsize=12)

    plt.tight_layout()
    plt.savefig('evaluation_rewards.png', dpi=300)
    plt.show()

    env.close()


if __name__ == "__main__":
    main()

Running this example produces the following two visualizations.

  1. Training reward curves: show how the rewards of AWAC, IQL, and ORL change during training, as shown in Figure 9-1. The plot contains the raw reward curve of each algorithm (awac_rewards, iql_rewards, orl_rewards) together with its moving average (awac_smooth, iql_smooth, orl_smooth).
  2. Evaluation bar chart: compares the average evaluation rewards of AWAC, IQL, and ORL, as shown in Figure 9-2. Each bar represents one algorithm's average evaluation reward (awac_avg, iql_avg, orl_avg), with the exact value annotated above it.

Figure 9-1  Training reward curves

Figure 9-2  Bar chart of evaluation results
