#!/usr/bin/env python
# coding: utf-8
# # Tennis - Multi-Agent DDPG (MADDPG) implementation
from unityagents import UnityEnvironment
from maddpg_agent import Agent
import sys
import random
import torch
import numpy as np
from collections import deque
from parameters import *
import os
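
# 'parameters' is expected to provide the hyper-parameter search ranges used
# in the grid search at the bottom of this script (r_TAU, r_fc_units,
# r_LR_ACTOR, r_LR_CRITIC). A minimal sketch of what parameters.py might
# contain (hypothetical values, not taken from this repository):
#
#   r_TAU       = [1e-3, 1e-2]             # soft-update factors
#   r_LR_ACTOR  = [1e-4, 1e-3]             # actor learning rates
#   r_LR_CRITIC = [1e-3, 3e-3]             # critic learning rates
#   r_fc_units  = [(256, 128), (128, 64)]  # (fc1_units, fc2_units) pairs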
# Instantiate the Environment and Agent
env = UnityEnvironment(file_name='Tennis_Linux_NoVis/Tennis.x86_64')
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of actions
action_size = brain.vector_action_space_size
print('action size = ', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
agents_no = len(env_info.agents)
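# Note: in the Udacity Tennis environment this is typically 2 agents, each
# observing a 24-dimensional state and acting with 2 continuous actions.
print('state size = ', state_size, ', number of agents = ', agents_no)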
try:
    os.mkdir("./models")
except OSError:
    print("'models' dir already exists...")
def maddpg(model_number, TAU, LR_ACTOR, LR_CRITIC, fc_units, n_episodes=700, max_t=1000):
    """Multi-Agent Deep Deterministic Policy Gradients (MADDPG).

    Params
    ======
        model_number (int): index of the current hyper-parameter combination
        TAU (float): soft-update factor for the target networks
        LR_ACTOR (float): learning rate of the actor networks
        LR_CRITIC (float): learning rate of the critic networks
        fc_units (tuple): sizes of the two fully connected hidden layers
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    fc1_units = fc_units[0]
    fc2_units = fc_units[1]
    agent_1 = Agent(state_size, action_size, TAU, LR_ACTOR, LR_CRITIC, fc1_units, fc2_units, random_seed=123)
    agent_2 = Agent(state_size, action_size, TAU, LR_ACTOR, LR_CRITIC, fc1_units, fc2_units, random_seed=345)
    agents = [agent_1, agent_2]
    scores_window = deque(maxlen=100)
    scores = np.zeros(agents_no)
    scores_episode = []
    # exploration noise: the action noise scale starts at 2 and decays
    # multiplicatively at every timestep
    noise = 2
    noise_reduction = 0.9999
    score_avg = 0
    scores_avg_max = 0
    solved_counter = 0
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        for agent in agents:
            agent.reset()
        scores = np.zeros(agents_no)
        for t in range(max_t):
            actions = np.array([agents[i].act(states[i], noise) for i in range(agents_no)])
            noise *= noise_reduction
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for i in range(agents_no):
                agents[i].step(t, states[i], actions[i], rewards[i], next_states[i], dones[i])
            states = next_states
            scores += rewards
            print('\rEpisode {}\tMax. score: {:.3f}'
                  .format(i_episode, np.max(scores)), end="")
            if np.any(dones):
                break
        score = np.max(scores)
        scores_window.append(score)
        scores_episode.append(score)
        score_avg = np.mean(scores_window)
        if score_avg > scores_avg_max:
            scores_avg_max = score_avg
            if scores_avg_max >= 0.5:
                # record the episode in which the model solved the environment
                solved_counter += 1
                if solved_counter == 1:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(len(scores_episode), score_avg))
                    with open('./models/models_solved.txt', 'a') as solved:
                        solved.write('{}, {}, {:.3f} \n'.format(model_number, i_episode, score_avg))
                        solved.flush()
                # store the best model which solved the environment
                torch.save(agents[0].actor_local.state_dict(), './models/checkpoint_actor1_' + str(model_number) + '.pth')
                torch.save(agents[1].actor_local.state_dict(), './models/checkpoint_actor2_' + str(model_number) + '.pth')
                # break  # uncomment this line to stop training once the environment is solved
        if i_episode % 100 == 0:
            print('\rEpisode {}\tScore: {:.3f}\tAverage Score: {:.3f}\tMax avg. score till now: {:.3f}'
                  .format(i_episode, score, score_avg, scores_avg_max))
            with open('results.txt', 'a') as output:
                output.write(
                    '{}, {}, ,{:.3f}, {:.3f}, {:.3f}, , {}, {}, {}, {} \n'.format(
                        model_number, i_episode, score, score_avg, scores_avg_max,
                        TAU, LR_ACTOR, LR_CRITIC, fc_units))
                output.flush()
    return scores_episode
model_number = 1
total_no_models = len(r_fc_units)*len(r_LR_ACTOR)*len(r_LR_CRITIC)*len(r_TAU)
print('Total number of models to test: ', total_no_models)
for TAU in r_TAU:
    for fc_units in r_fc_units:
        for LR_ACTOR in r_LR_ACTOR:
            for LR_CRITIC in r_LR_CRITIC:
                maddpg(model_number, TAU, LR_ACTOR, LR_CRITIC, fc_units, n_episodes=700, max_t=1000)
                model_number += 1
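
# tidy up once the whole hyper-parameter sweep has finished
env.close()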