@@ -163,38 +163,38 @@ If your callback returns False, training is aborted early.
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import DDPG
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
+ from stable_baselines import results_plotter

best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
-     """
-     Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
-     :param _locals: (dict)
-     :param _globals: (dict)
-     """
-     global n_steps, best_mean_reward
-     # Print stats every 1000 calls
-     if (n_steps + 1) % 1000 == 0:
-         # Evaluate policy training performance
-         x, y = ts2xy(load_results(log_dir), 'timesteps')
-         if len(x) > 0:
-             mean_reward = np.mean(y[-100:])
-             print(x[-1], 'timesteps')
-             print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
-
-             # New best model, you could save the agent here
-             if mean_reward > best_mean_reward:
-                 best_mean_reward = mean_reward
-                 # Example for saving best model
-                 print("Saving new best model")
-                 _locals['self'].save(log_dir + 'best_model.pkl')
-     n_steps += 1
-     return True
-
+     """
+     Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
+     :param _locals: (dict)
+     :param _globals: (dict)
+     """
+     global n_steps, best_mean_reward
+     # Print stats every 1000 calls
+     if (n_steps + 1) % 1000 == 0:
+         # Evaluate policy training performance
+         x, y = ts2xy(load_results(log_dir), 'timesteps')
+         if len(x) > 0:
+             mean_reward = np.mean(y[-100:])
+             print(x[-1], 'timesteps')
+             print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
+
+             # New best model, you could save the agent here
+             if mean_reward > best_mean_reward:
+                 best_mean_reward = mean_reward
+                 # Example for saving best model
+                 print("Saving new best model")
+                 _locals['self'].save(log_dir + 'best_model.pkl')
+     n_steps += 1
+     return True

# Create log dir
- log_dir = "/tmp/gym/"
+ log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
@@ -206,7 +206,11 @@ If your callback returns False, training is aborted early.
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)
# Train the agent
- model.learn(total_timesteps=int(1e5), callback=callback)
+ time_steps = 1e5
+ model.learn(total_timesteps=int(time_steps), callback=callback)
+
+ results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "DDPG LunarLander")
+ plt.show()

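The note repeated in both hunk headers ("If your callback returns False, training is aborted early") is easy to miss, since the callback in the example above always returns True. The sketch below is not part of this commit; it shows how that behaviour can be used for early stopping with the same Stable Baselines 2.x callback signature. The MountainCarContinuous-v0 environment, the -200 reward threshold, and the early_stop_callback name are arbitrary choices for illustration only.

import os

import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)


def early_stop_callback(_locals, _globals):
    """
    Stop training once the mean reward over the last 100 episodes exceeds
    an arbitrary threshold. Returning False aborts training early.
    :param _locals: (dict)
    :param _globals: (dict)
    """
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if len(x) > 0 and np.mean(y[-100:]) > -200:  # illustrative threshold, not from the docs
        print("Reward threshold reached, stopping training early")
        return False  # returning False aborts training
    return True  # returning True lets training continue


env = Monitor(gym.make('MountainCarContinuous-v0'), log_dir, allow_early_resets=True)
model = DDPG(LnMlpPolicy, env, verbose=0)
model.learn(total_timesteps=int(1e5), callback=early_stop_callback)

In practice you would throttle the check, as the callback in the commit does (every 1000 calls), rather than reading the Monitor logs on every step.
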
Atari Games