CS6700: Tutorial 1 - Multi-Arm Bandits



Goal: Analyze three types of sampling strategies in a multi-arm bandit (MAB).

Import dependencies


# !pip install seaborn

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import NamedTuple, List

Gaussian Bandit Environment


class GaussianArm(NamedTuple):
    mean: float
    std: float


class Env:
    def __init__(self, num_arms: int, mean_reward_range: tuple, std: float):
        """
        num_arms: number of bandit arms
        mean_reward_range: the mean reward of an arm should lie within the given range
        std: standard deviation of the reward for each arm
        """
        self.num_arms = num_arms
        self.arms = self.create_arms(num_arms, mean_reward_range, std)

    def create_arms(self, n: int, mean_reward_range: tuple, std: float) -> dict:
        low_rwd, high_rwd = mean_reward_range
        # sample "n" mean rewards, one per arm
        means = np.random.uniform(low=low_rwd, high=high_rwd, size=(n,))
        arms = {id: GaussianArm(mu, std) for id, mu in enumerate(means)}
        return arms

    @property
    def arm_ids(self):
        return list(self.arms.keys())

    def step(self, arm_id: int) -> float:
        arm = self.arms[arm_id]
        return np.random.normal(arm.mean, arm.std)  # Reward

    def get_best_arm_and_expected_reward(self):
        best_arm_id = max(self.arms, key=lambda x: self.arms[x].mean)
        return best_arm_id, self.arms[best_arm_id].mean

    def get_avg_arm_reward(self):
        arm_mean_rewards = [v.mean for v in self.arms.values()]
        return np.mean(arm_mean_rewards)

    def plot_arms_reward_distribution(self, num_samples=1000):
        """
        This function is only used to visualize the arms' reward distributions.
        """
        fig, ax = plt.subplots(1, 1, sharex=False, sharey=False, figsize=(9, 5))
        colors = sns.color_palette("hls", self.num_arms)
        for i, arm_id in enumerate(self.arm_ids):
            reward_samples = [self.step(arm_id) for _ in range(num_samples)]
            sns.histplot(reward_samples, ax=ax, stat="density", kde=True, bins=100, color=colors[i], label=f'arm_{arm_id}')
        ax.legend()
        plt.show()


Policy

class BasePolicy:
    @property
    def name(self):
        return 'base_policy'

    def reset(self):
        """
        This function resets the internal variables.
        """
        pass

    def update_arm(self, *args):
        """
        This function keeps track of the estimates
        that we may want to update during training.
        """
        pass

    def select_arm(self) -> int:
        """
        It returns an arm_id.
        """
        raise Exception("Not Implemented")

Random Policy

class RandomPolicy(BasePolicy):
    def __init__(self, arm_ids: List[int]):
        self.arm_ids = arm_ids

    @property
    def name(self):
        return 'random'

    def reset(self) -> None:
        """Not used by this policy."""
        pass

    def update_arm(self, *args) -> None:
        """Not used by this policy."""
        pass

    def select_arm(self) -> int:
        return np.random.choice(self.arm_ids)

class EpGreedyPolicy(BasePolicy):
    def __init__(self, epsilon: float, arm_ids: List[int]):
        self.epsilon = epsilon
        self.arm_ids = arm_ids
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    @property
    def name(self):
        return f'ep-greedy ep:{self.epsilon}'

    def reset(self) -> None:
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # your code for updating the Q values of each arm
        pass

    def select_arm(self) -> int:
        # your code for selecting arm based on epsilon greedy policy
        pass
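The two methods above are left blank for the exercise. For reference, one possible completion is sketched below as a subclass; the class name EpGreedySketch and the incremental sample-average Q update are assumptions, not part of the original notebook.

# Sketch: one possible completion of EpGreedyPolicy (not the official solution).
class EpGreedySketch(EpGreedyPolicy):
    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # incremental sample-average update: Q <- Q + (r - Q) / n
        self.num_pulls_per_arm[arm_id] += 1
        n = self.num_pulls_per_arm[arm_id]
        self.Q[arm_id] += (arm_reward - self.Q[arm_id]) / n

    def select_arm(self) -> int:
        # explore uniformly with probability epsilon, otherwise exploit the best estimate
        if np.random.random() < self.epsilon:
            return np.random.choice(self.arm_ids)
        return max(self.Q, key=self.Q.get)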

class SoftmaxPolicy(BasePolicy):
    def __init__(self, tau, arm_ids):
        self.tau = tau
        self.arm_ids = arm_ids
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    @property
    def name(self):
        return f'softmax tau:{self.tau}'

    def reset(self):
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # your code for updating the Q values of each arm
        pass

    def select_arm(self) -> int:
        # your code for selecting arm based on softmax policy
        pass
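A similar reference sketch for the softmax blanks, assuming Boltzmann action probabilities over sample-average Q estimates with temperature tau (the max is subtracted before exponentiating for numerical stability); SoftmaxSketch is a hypothetical name, not part of the original notebook.

# Sketch: one possible completion of SoftmaxPolicy (not the official solution).
class SoftmaxSketch(SoftmaxPolicy):
    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # incremental sample-average update: Q <- Q + (r - Q) / n
        self.num_pulls_per_arm[arm_id] += 1
        n = self.num_pulls_per_arm[arm_id]
        self.Q[arm_id] += (arm_reward - self.Q[arm_id]) / n

    def select_arm(self) -> int:
        # Boltzmann probabilities: p(a) ∝ exp(Q(a) / tau)
        q_values = np.array([self.Q[id] for id in self.arm_ids])
        prefs = (q_values - q_values.max()) / self.tau  # max-subtraction avoids overflow
        probs = np.exp(prefs) / np.sum(np.exp(prefs))
        return np.random.choice(self.arm_ids, p=probs)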

class UCB(BasePolicy):
    # your code here
    pass
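The UCB stub is also left for the exercise. Below is a minimal UCB1-style sketch; the class name UCBSketch, the constructor signature (arm_ids plus an exploration constant c), and the value c=2.0 are assumptions. With this signature, the experiment cell further down would pass UCBSketch(env.arm_ids) rather than UCB().

# Sketch: a possible UCB1-style policy (not the official solution).
class UCBSketch(BasePolicy):
    def __init__(self, arm_ids: List[int], c: float = 2.0):
        self.arm_ids = arm_ids
        self.c = c
        self.reset()

    @property
    def name(self):
        return f'ucb c:{self.c}'

    def reset(self) -> None:
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}
        self.t = 0

    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # incremental sample-average update
        self.num_pulls_per_arm[arm_id] += 1
        n = self.num_pulls_per_arm[arm_id]
        self.Q[arm_id] += (arm_reward - self.Q[arm_id]) / n

    def select_arm(self) -> int:
        self.t += 1
        # pull each arm once before applying the UCB bonus
        for id in self.arm_ids:
            if self.num_pulls_per_arm[id] == 0:
                return id
        # pick the arm maximizing Q + c * sqrt(log(t) / n)
        ucb = {id: self.Q[id] + self.c * np.sqrt(np.log(self.t) / self.num_pulls_per_arm[id])
               for id in self.arm_ids}
        return max(ucb, key=ucb.get)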

Trainer

def train(env, policy: BasePolicy, timesteps):
    policy_reward = np.zeros((timesteps,))
    for t in range(timesteps):
        arm_id = policy.select_arm()
        reward = env.step(arm_id)
        policy.update_arm(arm_id, reward)
        policy_reward[t] = reward
    return policy_reward

def avg_over_runs(env, policy: BasePolicy, timesteps, num_runs):
    _, expected_max_reward = env.get_best_arm_and_expected_reward()
    policy_reward_each_run = np.zeros((num_runs, timesteps))
    for run in range(num_runs):
        policy.reset()
        policy_reward = train(env, policy, timesteps)
        policy_reward_each_run[run, :] = policy_reward

    # calculate avg policy reward from policy_reward_each_run
    avg_policy_rewards = None  # your code here (type: np.ndarray, shape: (timesteps,))
    total_policy_regret = None  # your code here (type: float)

    return avg_policy_rewards, total_policy_regret
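The two placeholders inside avg_over_runs could be filled as below — a sketch assuming total regret is measured against the best arm's expected reward at every timestep, which is roughly consistent with the random-policy regret reported later (about 200 × (22.54 − 3.12) ≈ 3884).

# Sketch: possible completion of the two placeholders inside avg_over_runs.
# avg_policy_rewards: per-timestep reward averaged over runs.
# total_policy_regret: cumulative gap to the best arm's expected reward.
avg_policy_rewards = policy_reward_each_run.mean(axis=0)  # shape: (timesteps,)
total_policy_regret = float(np.sum(expected_max_reward - avg_policy_rewards))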

def plot_reward_curve_and_print_regret(env, policies, timesteps=200, num_runs=500):
    fig, ax = plt.subplots(1, 1, sharex=False, sharey=False, figsize=(10, 6))
    for policy in policies:
        avg_policy_rewards, total_policy_regret = avg_over_runs(env, policy, timesteps, num_runs)
        print('regret for {}: {:.3f}'.format(policy.name, total_policy_regret))
        ax.plot(np.arange(timesteps), avg_policy_rewards, '-', label=policy.name)

    _, expected_max_reward = env.get_best_arm_and_expected_reward()
    ax.plot(np.arange(timesteps), [expected_max_reward]*timesteps, 'g-')

    avg_arm_reward = env.get_avg_arm_reward()
    ax.plot(np.arange(timesteps), [avg_arm_reward]*timesteps, 'r-')

    plt.legend(loc='lower right')
    plt.show()

Experiments
seed = 42
np.random.seed(seed)

num_arms = 5
mean_reward_range = (-25, 25)
std = 2.0

env = Env(num_arms, mean_reward_range, std)

env.plot_arms_reward_distribution()

best_arm, max_mean_reward = env.get_best_arm_and_expected_reward()
print(best_arm, max_mean_reward)

1 22.53571532049581

print(env.get_avg_arm_reward())

3.119254917081568

Please explore the following values:

Epsilon greedy: [0.001, 0.01, 0.5, 0.9]
Softmax: [0.001, 1.0, 5.0, 50.0]

random_policy = RandomPolicy(env.arm_ids)
plot_reward_curve_and_print_regret(env, [random_policy], timesteps=200, num_runs=500)

regret for random: 3883.660

explore_epgreedy_epsilons = [0.001, 0.01, 0.5, 0.9]


epgreedy_policies = [EpGreedyPolicy(ep, env.arm_ids) for ep in explore_epgreedy_epsilons]
plot_reward_curve_and_print_regret(env, epgreedy_policies, timesteps=200, num_runs=500)


regret for ep-greedy ep:0.001: 31.418
regret for ep-greedy ep:0.01: 85.106
regret for ep-greedy ep:0.5: 1979.134
regret for ep-greedy ep:0.9: 3515.911

explore_softmax_taus = [0.001, 1.0, 5.0, 50.0]

softmax_policies = [SoftmaxPolicy(tau, env.arm_ids) for tau in explore_softmax_taus]
plot_reward_curve_and_print_regret(env, softmax_policies, timesteps=200, num_runs=500)

regret for softmax tau:0.001: 1919.966
regret for softmax tau:1.0: 1307.562
regret for softmax tau:5.0: 414.835
regret for softmax tau:50.0: 3169.759

plot_reward_curve_and_print_regret(env, [UCB()], timesteps=200, num_runs=500)

Optional: Please explore different values of epsilon and tau, and verify how the behaviour changes.

