CS6700: Tutorial 1 - Multi-Arm Bandits



Goal: Analyze three types of sampling strategies in a multi-arm bandit (MAB).

Import dependencies


# !pip install seaborn

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import NamedTuple, List

Gaussian Bandit Environment


class GaussianArm(NamedTuple):
    mean: float
    std: float


class Env:
    def __init__(self, num_arms: int, mean_reward_range: tuple, std: float):
        """
        num_arms: number of bandit arms
        mean_reward_range: the mean reward of an arm should lie within the given range
        std: standard deviation of the reward for each arm
        """
        self.num_arms = num_arms
        self.arms = self.create_arms(num_arms, mean_reward_range, std)

    def create_arms(self, n: int, mean_reward_range: tuple, std: float) -> dict:
        low_rwd, high_rwd = mean_reward_range
        # sample "n" mean rewards, one per arm
        means = np.random.uniform(low=low_rwd, high=high_rwd, size=(n,))
        arms = {id: GaussianArm(mu, std) for id, mu in enumerate(means)}
        return arms

    @property
    def arm_ids(self):
        return list(self.arms.keys())

    def step(self, arm_id: int) -> float:
        arm = self.arms[arm_id]
        return np.random.normal(arm.mean, arm.std)  # Reward

    def get_best_arm_and_expected_reward(self):
        best_arm_id = max(self.arms, key=lambda x: self.arms[x].mean)
        return best_arm_id, self.arms[best_arm_id].mean

    def get_avg_arm_reward(self):
        arm_mean_rewards = [v.mean for v in self.arms.values()]
        return np.mean(arm_mean_rewards)

    def plot_arms_reward_distribution(self, num_samples=1000):
        """
        This function is only used to visualize the arms' reward distributions.
        """
        fig, ax = plt.subplots(1, 1, sharex=False, sharey=False, figsize=(9, 5))
        colors = sns.color_palette("hls", self.num_arms)
        for i, arm_id in enumerate(self.arm_ids):
            reward_samples = [self.step(arm_id) for _ in range(num_samples)]
            sns.histplot(reward_samples, ax=ax, stat="density", kde=True, bins=100, color=colors[i], label=f'arm_{arm_id}')
        ax.legend()
        plt.show()


Policy

class BasePolicy:
    @property
    def name(self):
        return 'base_policy'

    def reset(self):
        """
        This function resets the internal variables.
        """
        pass

    def update_arm(self, *args):
        """
        This function keeps track of the estimates
        that we may want to update during training.
        """
        pass

    def select_arm(self) -> int:
        """
        It returns an arm_id.
        """
        raise Exception("Not Implemented")

Random Policy

class RandomPolicy(BasePolicy):
    def __init__(self, arm_ids: List[int]):
        self.arm_ids = arm_ids

    @property
    def name(self):
        return 'random'

    def reset(self) -> None:
        """Not used by this policy."""
        pass

    def update_arm(self, *args) -> None:
        """Not used by this policy."""
        pass

    def select_arm(self) -> int:
        return np.random.choice(self.arm_ids)

class EpGreedyPolicy(BasePolicy):
    def __init__(self, epsilon: float, arm_ids: List[int]):
        self.epsilon = epsilon
        self.arm_ids = arm_ids
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    @property
    def name(self):
        return f'ep-greedy ep:{self.epsilon}'

    def reset(self) -> None:
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # your code for updating the Q values of each arm
        pass

    def select_arm(self) -> int:
        # your code for selecting arm based on epsilon greedy policy
        pass
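The two methods above are left blank for the exercise. For reference, one possible completion is sketched below as a subclass; the class name EpGreedySketch and the incremental sample-average Q update are assumptions, not part of the original notebook.

# Sketch: one possible completion of EpGreedyPolicy (not the official solution).
class EpGreedySketch(EpGreedyPolicy):
    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # incremental sample-average update: Q <- Q + (r - Q) / n
        self.num_pulls_per_arm[arm_id] += 1
        n = self.num_pulls_per_arm[arm_id]
        self.Q[arm_id] += (arm_reward - self.Q[arm_id]) / n

    def select_arm(self) -> int:
        # explore uniformly with probability epsilon, otherwise exploit the best estimate
        if np.random.random() < self.epsilon:
            return np.random.choice(self.arm_ids)
        return max(self.Q, key=self.Q.get)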

class SoftmaxPolicy(BasePolicy):
    def __init__(self, tau, arm_ids):
        self.tau = tau
        self.arm_ids = arm_ids
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    @property
    def name(self):
        return f'softmax tau:{self.tau}'

    def reset(self):
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # your code for updating the Q values of each arm
        pass

    def select_arm(self) -> int:
        # your code for selecting arm based on softmax policy
        pass
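A similar reference sketch for the softmax blanks, assuming Boltzmann action probabilities over sample-average Q estimates with temperature tau (the max is subtracted before exponentiating for numerical stability); SoftmaxSketch is a hypothetical name, not part of the original notebook.

# Sketch: one possible completion of SoftmaxPolicy (not the official solution).
class SoftmaxSketch(SoftmaxPolicy):
    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # incremental sample-average update: Q <- Q + (r - Q) / n
        self.num_pulls_per_arm[arm_id] += 1
        n = self.num_pulls_per_arm[arm_id]
        self.Q[arm_id] += (arm_reward - self.Q[arm_id]) / n

    def select_arm(self) -> int:
        # Boltzmann probabilities: p(a) ∝ exp(Q(a) / tau)
        q_values = np.array([self.Q[id] for id in self.arm_ids])
        prefs = (q_values - q_values.max()) / self.tau  # max-subtraction avoids overflow
        probs = np.exp(prefs) / np.sum(np.exp(prefs))
        return np.random.choice(self.arm_ids, p=probs)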

class UCB(BasePolicy):
    # your code here
    pass
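The UCB stub is also left for the exercise. Below is a minimal UCB1-style sketch; the class name UCBSketch, the constructor signature (arm_ids plus an exploration constant c), and the value c=2.0 are assumptions. With this signature, the experiment cell further down would pass UCBSketch(env.arm_ids) rather than UCB().

# Sketch: a possible UCB1-style policy (not the official solution).
class UCBSketch(BasePolicy):
    def __init__(self, arm_ids: List[int], c: float = 2.0):
        self.arm_ids = arm_ids
        self.c = c
        self.reset()

    @property
    def name(self):
        return f'ucb c:{self.c}'

    def reset(self) -> None:
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}
        self.t = 0

    def update_arm(self, arm_id: int, arm_reward: float) -> None:
        # incremental sample-average update
        self.num_pulls_per_arm[arm_id] += 1
        n = self.num_pulls_per_arm[arm_id]
        self.Q[arm_id] += (arm_reward - self.Q[arm_id]) / n

    def select_arm(self) -> int:
        self.t += 1
        # pull each arm once before applying the UCB bonus
        for id in self.arm_ids:
            if self.num_pulls_per_arm[id] == 0:
                return id
        # pick the arm maximizing Q + c * sqrt(log(t) / n)
        ucb = {id: self.Q[id] + self.c * np.sqrt(np.log(self.t) / self.num_pulls_per_arm[id])
               for id in self.arm_ids}
        return max(ucb, key=ucb.get)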

Trainer

def train(env, policy: BasePolicy, timesteps):
    policy_reward = np.zeros((timesteps,))
    for t in range(timesteps):
        arm_id = policy.select_arm()
        reward = env.step(arm_id)
        policy.update_arm(arm_id, reward)
        policy_reward[t] = reward
    return policy_reward

def avg_over_runs(env, policy: BasePolicy, timesteps, num_runs):
    _, expected_max_reward = env.get_best_arm_and_expected_reward()
    policy_reward_each_run = np.zeros((num_runs, timesteps))
    for run in range(num_runs):
        policy.reset()
        policy_reward = train(env, policy, timesteps)
        policy_reward_each_run[run, :] = policy_reward

    # calculate avg policy reward from policy_reward_each_run
    avg_policy_rewards = None  # your code here (type: np.ndarray, shape: (timesteps,))
    total_policy_regret = None  # your code here (type: float)

    return avg_policy_rewards, total_policy_regret
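The two placeholders inside avg_over_runs could be filled as below — a sketch assuming total regret is measured against the best arm's expected reward at every timestep, which is roughly consistent with the random-policy regret reported later (about 200 × (22.54 − 3.12) ≈ 3884).

# Sketch: possible completion of the two placeholders inside avg_over_runs.
# avg_policy_rewards: per-timestep reward averaged over runs.
# total_policy_regret: cumulative gap to the best arm's expected reward.
avg_policy_rewards = policy_reward_each_run.mean(axis=0)  # shape: (timesteps,)
total_policy_regret = float(np.sum(expected_max_reward - avg_policy_rewards))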

def plot_reward_curve_and_print_regret(env, policies, timesteps=200, num_runs=500):
    fig, ax = plt.subplots(1, 1, sharex=False, sharey=False, figsize=(10, 6))
    for policy in policies:
        avg_policy_rewards, total_policy_regret = avg_over_runs(env, policy, timesteps, num_runs)
        print('regret for {}: {:.3f}'.format(policy.name, total_policy_regret))
        ax.plot(np.arange(timesteps), avg_policy_rewards, '-', label=policy.name)

    _, expected_max_reward = env.get_best_arm_and_expected_reward()
    ax.plot(np.arange(timesteps), [expected_max_reward]*timesteps, 'g-')

    avg_arm_reward = env.get_avg_arm_reward()
    ax.plot(np.arange(timesteps), [avg_arm_reward]*timesteps, 'r-')

    plt.legend(loc='lower right')
    plt.show()

Experiments
seed = 42
np.random.seed(seed)

num_arms = 5
mean_reward_range = (-25, 25)
std = 2.0

env = Env(num_arms, mean_reward_range, std)

env.plot_arms_reward_distribution()

best_arm, max_mean_reward = env.get_best_arm_and_expected_reward()
print(best_arm, max_mean_reward)

1 22.53571532049581

print(env.get_avg_arm_reward())

3.119254917081568

Please explore the following values:

Epsilon greedy: [0.001, 0.01, 0.5, 0.9]
Softmax: [0.001, 1.0, 5.0, 50.0]

random_policy = RandomPolicy(env.arm_ids)
plot_reward_curve_and_print_regret(env, [random_policy], timesteps=200, num_runs=500)

regret for random: 3883.660

explore_epgreedy_epsilons = [0.001, 0.01, 0.5, 0.9]


epgreedy_policies = [EpGreedyPolicy(ep, env.arm_ids) for ep in explore_epgreedy_epsilons]
plot_reward_curve_and_print_regret(env, epgreedy_policies, timesteps=200, num_runs=500)


regret for ep-greedy ep:0.001: 31.418
regret for ep-greedy ep:0.01: 85.106
regret for ep-greedy ep:0.5: 1979.134
regret for ep-greedy ep:0.9: 3515.911

explore_softmax_taus = [0.001, 1.0, 5.0, 50.0]

softmax_policies = [SoftmaxPolicy(tau, env.arm_ids) for tau in explore_softmax_taus]
plot_reward_curve_and_print_regret(env, softmax_policies, timesteps=200, num_runs=500)

regret for softmax tau:0.001: 1919.966
regret for softmax tau:1.0: 1307.562
regret for softmax tau:5.0: 414.835
regret for softmax tau:50.0: 3169.759

plot_reward_curve_and_print_regret(env, [UCB()], timesteps=200, num_runs=500)

Optional: Please explore different values of epsilon and tau, and verify how the behaviour changes.

