"""Example of a custom Ray Tune experiment wrapping an RLlib Algorithm.
You should only use such a customized workflow if the following conditions apply:
- You know exactly what you are doing :)
- Configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig
  is not sufficient and doesn't allow you to shape the Algorithm into behaving the way
  you'd like. Note that for complex, custom evaluation procedures there are many
  AlgorithmConfig options one can use (see the sketch after this list and, for more
  details,
  https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py).  # noqa
- Subclassing an RLlib Algorithm class and overriding the new class' `training_step`
  method is not sufficient and doesn't allow you to define the algorithm's execution
  logic the way you'd like (see the sketch after this list). For an example of how to
  customize the algorithm's `training_step()` method, see:
  https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py  # noqa
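The following is only a rough, illustrative sketch of those two standard alternatives.
It is not used by this script; the `MyPPO` class name and the specific evaluation
settings are placeholders you would adapt to your own setup:

    from ray.rllib.algorithms.ppo import PPO, PPOConfig

    # 1) Built-in evaluation, configured purely through AlgorithmConfig: RLlib then
    #    runs evaluation for you (here: 3 episodes per training iteration, acting
    #    greedily via `explore=False`).
    config = (
        PPOConfig()
        .environment("CartPole-v1")
        .evaluation(
            evaluation_interval=1,
            evaluation_duration=3,
            evaluation_duration_unit="episodes",
            evaluation_config=PPOConfig.overrides(explore=False),
        )
    )

    # 2) Custom execution logic via subclassing and overriding `training_step()`.
    class MyPPO(PPO):
        def training_step(self):
            # ... your custom logic wrapped around the default update ...
            return super().training_step()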

How to run this script
----------------------
`python [script file name].py`

Results to expect
-----------------
You should see the following output (at the end of the experiment) in your console:
╭───────────────────────────────────────────────────────────────────────────────────────
│ Trial name status iter total time (s) ts
├───────────────────────────────────────────────────────────────────────────────────────
│ my_experiment_CartPole-v1_77083_00000 TERMINATED 10 36.7799 60000
╰───────────────────────────────────────────────────────────────────────────────────────
╭───────────────────────────────────────────────────────╮
│ reward episode_len_mean episodes_this_iter │
├───────────────────────────────────────────────────────┤
│ 254.821 254.821 12 │
╰───────────────────────────────────────────────────────╯
evaluation episode returns=[500.0, 500.0, 500.0]
Note that evaluation results (on the CartPole-v1 env) should be close to perfect
(episode return of ~500.0) as we are acting greedily inside the evaluation procedure.
"""
from typing import Dict

import numpy as np

from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME

torch, _ = try_import_torch()


def my_experiment(config: Dict):
    # Extract the number of iterations to run from the config.
    train_iterations = config.pop("train-iterations", 2)
    eval_episodes_to_do = config.pop("eval-episodes", 1)

    config = (
        PPOConfig()
        .update_from_dict(config)
        .api_stack(enable_rl_module_and_learner=True)
        .environment("CartPole-v1")
    )

    # Train for n iterations with a high learning rate.
    config.training(lr=0.001)
    algo_high_lr = config.build()
    for _ in range(train_iterations):
        train_results = algo_high_lr.train()
        # Add the phase to the result dict.
        train_results["phase"] = 1
        tune.report(train_results)
        # Remember the lifetime env step count reached in phase 1 (used below to
        # keep the step count moving forward across both phases).
        phase_high_lr_time = train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME]
    checkpoint_training_high_lr = algo_high_lr.save()
    algo_high_lr.stop()

    # Train for n iterations with a low learning rate.
    config.training(lr=0.00001)
    algo_low_lr = config.build()
    # Load state from the high-lr algo into this one.
    algo_low_lr.restore(checkpoint_training_high_lr)
    for _ in range(train_iterations):
        train_results = algo_low_lr.train()
        # Add the phase to the result dict.
        train_results["phase"] = 2
        # Keep time moving forward: offset the lifetime env step count by the
        # steps already sampled during phase 1.
        train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME] += phase_high_lr_time
        tune.report(train_results)
    checkpoint_training_low_lr = algo_low_lr.save()
    algo_low_lr.stop()

    # After training, run a manual evaluation procedure.
    # Set the number of EnvRunners for collecting training data to 0 (local
    # worker only).
    config.env_runners(num_env_runners=0)
    eval_algo = config.build()
    # Load state from the low-lr algo into this one.
    eval_algo.restore(checkpoint_training_low_lr)
    # The algo's local worker (SingleAgentEnvRunner) that holds a
    # gym.vector.Env object and an RLModule for computing actions.
    local_env_runner = eval_algo.env_runner
    # Extract the gymnasium env object from the created algo (its local
    # SingleAgentEnvRunner worker). Note that the env in this single-agent
    # case is a gymnasium vector env and that we get its first sub-env here.
    env = local_env_runner.env.unwrapped.envs[0]
    # The RLModule held by the local worker (SingleAgentEnvRunner); used below
    # to compute actions.
    rl_module = local_env_runner.module

    # Run a very simple env loop, adding up rewards per completed episode.
    obs, infos = env.reset()
    episode_returns = []
    episode_lengths = []
    sum_rewards = length = 0
    num_episodes = 0
    while num_episodes < eval_episodes_to_do:
        # Call the RLModule's `forward_inference()` method to compute an
        # action.
        rl_module_out = rl_module.forward_inference(
            {
                "obs": torch.from_numpy(np.expand_dims(obs, 0)),  # <- add B=1
            }
        )
        action_logits = rl_module_out["action_dist_inputs"][0]  # <- remove B=1
        action = np.argmax(action_logits.detach().cpu().numpy())  # act greedily
        # Step the env.
        obs, reward, terminated, truncated, info = env.step(action)
        # Accumulate stats and reset the env, if necessary.
        sum_rewards += reward
        length += 1
        if terminated or truncated:
            num_episodes += 1
            episode_returns.append(sum_rewards)
            episode_lengths.append(length)
            sum_rewards = length = 0
            obs, infos = env.reset()

    # Compile evaluation results.
    eval_results = {
        "eval_returns": episode_returns,
        "eval_episode_lengths": episode_lengths,
    }
    # Combine the most recent training results with the just collected
    # evaluation results.
    results = {**train_results, **eval_results}
    # Report everything.
    tune.report(results)
if __name__ == "__main__":
base_config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0)
# Convert to a plain dict for Tune. Note that this is usually not needed, you can
# pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object.
# However, for demonstration purposes, we show here how you can add other, arbitrary
# keys to the plain config dict and then pass these keys to your custom experiment
# function.
config_dict = base_config.to_dict()
# Set a Special flag signalling `my_experiment` how many training steps to
# perform on each: the high learning rate and low learning rate.
config_dict["train-iterations"] = 5
# Set a Special flag signalling `my_experiment` how many episodes to evaluate for.
config_dict["eval-episodes"] = 3
training_function = tune.with_resources(
my_experiment,
resources=base_config.algo_class.default_resource_request(base_config),
)
tuner = tune.Tuner(
training_function,
# Pass in your config dict.
param_space=config_dict,
)
results = tuner.fit()
best_results = results.get_best_result()
print(f"evaluation episode returns={best_results.metrics['eval_returns']}")