From 85c641e6be4ddf16a9d9f5a7b90369debe87996b Mon Sep 17 00:00:00 2001
From: TimZF
Date: Fri, 17 Jan 2020 19:32:25 +0100
Subject: [PATCH] Update ddqn.py

It did not converge, so I made some changes. They also improve performance;
it now solves CartPole after 30 episodes.

Changes to the hyperparameters.
Actually train in batches in the replay method (far better performance on big batches).
Stop the episode when done.
---
 ddqn.py | 58 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/ddqn.py b/ddqn.py
index dfb4b6d5..f70d7856 100644
--- a/ddqn.py
+++ b/ddqn.py
@@ -17,17 +17,16 @@ def __init__(self, state_size, action_size):
         self.state_size = state_size
         self.action_size = action_size
         self.memory = deque(maxlen=2000)
-        self.gamma = 0.95    # discount rate
+        self.gamma = 0.98    # discount rate
         self.epsilon = 1.0  # exploration rate
         self.epsilon_min = 0.01
-        self.epsilon_decay = 0.99
+        self.epsilon_decay = 0.999
         self.learning_rate = 0.001
         self.model = self._build_model()
         self.target_model = self._build_model()
         self.update_target_model()
 
     """Huber loss for Q Learning
-
     References: https://en.wikipedia.org/wiki/Huber_loss
                 https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss
     """
@@ -47,8 +46,7 @@ def _build_model(self):
         model.add(Dense(24, input_dim=self.state_size, activation='relu'))
         model.add(Dense(24, activation='relu'))
         model.add(Dense(self.action_size, activation='linear'))
-        model.compile(loss=self._huber_loss,
-                      optimizer=Adam(lr=self.learning_rate))
+        model.compile(loss=self._huber_loss, optimizer=Adam(lr=self.learning_rate))
         return model
 
     def update_target_model(self):
@@ -66,16 +64,24 @@ def act(self, state):
 
     def replay(self, batch_size):
         minibatch = random.sample(self.memory, batch_size)
+        stateBatch = np.zeros((batch_size,self.state_size))
+        targetBatch = np.zeros((batch_size,self.action_size))
+        x = 0
+
         for state, action, reward, next_state, done in minibatch:
-            target = self.model.predict(state)
+            stateBatch[x] = state[0]
+
+            targetBatch[x] = self.model.predict(state)[0]
             if done:
-                target[0][action] = reward
+                targetBatch[x][action] = reward
             else:
-                # a = self.model.predict(next_state)[0]
                 t = self.target_model.predict(next_state)[0]
-                target[0][action] = reward + self.gamma * np.amax(t)
-                # target[0][action] = reward + self.gamma * t[np.argmax(a)]
-            self.model.fit(state, target, epochs=1, verbose=0)
+                targetBatch[x][action] = reward + self.gamma * np.amax(t)
+
+            x+=1
+
+
+        self.model.fit(stateBatch, targetBatch, epochs=1, verbose=0)
         if self.epsilon > self.epsilon_min:
             self.epsilon *= self.epsilon_decay
 
@@ -92,31 +98,35 @@ def save(self, name):
     action_size = env.action_space.n
     agent = DQNAgent(state_size, action_size)
     # agent.load("./save/cartpole-ddqn.h5")
-    done = False
-    batch_size = 32
-
+
+    batch_size = 128
+
     for e in range(EPISODES):
         state = env.reset()
         state = np.reshape(state, [1, state_size])
-        for time in range(500):
+        done = False
+        cumReward = 0
+        while not done:
             # env.render()
             action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
-            #reward = reward if not done else -10
+            env.render()
+
+            cumReward+=reward
+            reward = reward if not done else -10
             x,x_dot,theta,theta_dot = next_state
             r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
             r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
-            reward = r1 + r2
+            reward = reward + r1 + r2
+
+            next_state = np.reshape(next_state, [1, state_size])
             agent.memorize(state, action, reward, next_state, done)
             state = next_state
-            if done:
-                agent.update_target_model()
-                print("episode: {}/{}, score: {}, e: {:.2}"
-                      .format(e, EPISODES, time, agent.epsilon))
-                break
-            if len(agent.memory) > batch_size:
-                agent.replay(batch_size)
+            agent.replay(min(batch_size,len(agent.memory)))
+
+        agent.update_target_model()
+        print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, cumReward, agent.epsilon))
 
         # if e % 10 == 0:
         #     agent.save("./save/cartpole-ddqn.h5")
 
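Note on the "train in batches" point above: the patched replay() still calls model.predict() and target_model.predict() once per transition inside the loop; only the fit() call sees the whole batch. Below is a minimal sketch of a fully vectorized variant, not part of the patch. The helper name replay_batched is hypothetical; it assumes the same DQNAgent attributes as ddqn.py (memory, model, target_model, gamma), states stored with shape (1, state_size), and Keras-style predict/fit.

    import random

    import numpy as np


    def replay_batched(agent, batch_size):
        """Sketch of a fully batched replay step (hypothetical, not part of the patch)."""
        minibatch = random.sample(agent.memory, batch_size)

        # Stack the stored (1, state_size) states into (batch_size, state_size) arrays
        # so each network is queried once for the whole minibatch.
        states = np.vstack([s for s, _, _, _, _ in minibatch])
        next_states = np.vstack([ns for _, _, _, ns, _ in minibatch])
        actions = np.array([a for _, a, _, _, _ in minibatch])
        rewards = np.array([r for _, _, r, _, _ in minibatch], dtype=float)
        dones = np.array([d for _, _, _, _, d in minibatch], dtype=bool)

        # One prediction per network instead of one (or two) per sample.
        targets = agent.model.predict(states)
        next_q = agent.target_model.predict(next_states)

        # Same update rule as the patched replay(): terminal transitions keep the raw
        # reward, non-terminal ones get reward + gamma * max_a' Q_target(s', a').
        targets[np.arange(batch_size), actions] = (
            rewards + agent.gamma * np.amax(next_q, axis=1) * (~dones)
        )

        # Single training call on the whole batch, as in the patch.
        agent.model.fit(states, targets, epochs=1, verbose=0)

With batch_size = 128 this replaces up to two predict calls per sample with two predict calls per batch plus one fit, which is where most of the "far better performance on big batches" would come from.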