import numpy as np
# Hyperparameters and running statistics for the epsilon-greedy simulation.
num_steps = 1000  # number of lever pulls to simulate
epsilon = 0.1  # probability of picking a random arm on a given step
total_rewards = 0  # running sum of the rewards collected so far
counts = [0] * 2  # how many times each arm has been selected
q_values = [1355] * 2  # starting value estimate for each arm
# Models the true reward distribution of a single arm.
class SlotMachine:
    """A slot machine whose payout is drawn from a normal distribution.

    Attributes:
        mean: Mean of the payout distribution.
        std_dev: Standard deviation of the payout distribution.
        rng: Optional NumPy random generator used for sampling; ``None``
            means the global ``np.random`` state is used.
    """

    def __init__(self, mean: float, std_dev: float, *,
                 rng: "np.random.Generator | None" = None):
        """Store the distribution parameters.

        Args:
            mean: Mean payout of one pull.
            std_dev: Standard deviation of the payout.
            rng: Optional ``np.random.Generator`` for reproducible draws.
                When omitted, samples come from the global ``np.random``
                state, so ``np.random.seed`` keeps working as before.
        """
        self.mean = mean
        self.std_dev = std_dev
        self.rng = rng

    def pull_lever(self):
        """Return one payout sampled from N(mean, std_dev**2)."""
        # Fall back to the module-level sampler when no generator was given.
        sampler = np.random if self.rng is None else self.rng
        return sampler.normal(loc=self.mean, scale=self.std_dev)
# Build the two slot machines (the arms of the bandit).
# NOTE(review): std_dev is sqrt(50) and sqrt(100), so 50 and 100 are
# presumably meant as variances -- confirm.
slot_machine_1 = SlotMachine(mean=500, std_dev=np.sqrt(50))
slot_machine_2 = SlotMachine(mean=550, std_dev=np.sqrt(100))
true_rewards = [slot_machine_1, slot_machine_2]

# Fixed price paid for every pull; rewards below are net of this cost.
PULL_COST = 1355

# Epsilon-greedy loop: explore with probability epsilon, otherwise exploit.
for t in range(num_steps):
    if np.random.rand() < epsilon:
        # Explore: pick an arm uniformly at random.
        arm = np.random.randint(len(true_rewards))
    else:
        # Exploit: pick the arm with the highest current estimate
        # (np.argmax returns the first index on ties).
        arm = int(np.argmax(q_values))

    # Net reward of this pull: payout minus the cost of playing.
    reward = true_rewards[arm].pull_lever() - PULL_COST

    # Record the selection before updating, so counts[arm] >= 1 below.
    counts[arm] += 1

    # Sample-average update: Q <- Q + (r - Q) / n. On an arm's first pull
    # (n == 1) this overwrites the initial estimate with the observed reward.
    q_values[arm] += (reward - q_values[arm]) / counts[arm]

    # Accumulate the total net reward.
    total_rewards += reward

print("最终价值函数:", q_values)
print("累计奖励:", total_rewards)