
Commit

louisnino committed Mar 9, 2020
1 parent bc145a1 commit bd3c937
Showing 3 changed files with 9 additions and 18 deletions.
12 changes: 0 additions & 12 deletions queue1.py

This file was deleted.

6 changes: 3 additions & 3 deletions tutorial_A3C.py
@@ -145,7 +145,7 @@ def update_global(
self.mu, self.sigma = self.actor(buffer_s)  # the actor outputs mu and sigma
self.test = self.sigma[0]  # kept only for testing
self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5  # mu must be scaled into the action-space range
- normal_dist = tfd.Normal(self.mu, self.sigma)  # no tf.contrib in TF 2.0; create a normal distribution from mu and sigma
+ normal_dist = tfd.Normal(self.mu, self.sigma)  # create a normal distribution from mu and sigma

self.a_his = buffer_a  # probability of the action under the distribution, float32
log_prob = normal_dist.log_prob(self.a_his)
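The changed line above builds the policy's Gaussian with tensorflow_probability instead of the removed tf.contrib distributions. A minimal, self-contained sketch of that computation follows; the batch values and the A_BOUND setting are illustrative assumptions, not taken from the tutorial.

# Sketch: Gaussian policy log-probability with tensorflow_probability (assumed toy values).
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

A_BOUND = [-2.0, 2.0]                      # assumed action bound, e.g. a Pendulum-like task
mu_raw = tf.constant([[0.3], [-0.1]])      # actor's raw mean output (batch of 2)
sigma_raw = tf.constant([[0.5], [0.2]])    # actor's raw std output

mu = mu_raw * A_BOUND[1]                   # scale the mean into the action range
sigma = sigma_raw + 1e-5                   # keep the std strictly positive
normal_dist = tfd.Normal(mu, sigma)        # TF2 replacement for tf.contrib distributions

actions = tf.constant([[0.7], [-0.4]])     # actions the worker actually took
log_prob = normal_dist.log_prob(actions)   # per-action log-likelihood, float32
print(log_prob.numpy())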
@@ -202,8 +202,8 @@ def work(self, globalAC):
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:  # MAX_GLOBAL_EP is the maximum number of training episodes
-     s = self.env.reset()  # reset the environment
-     ep_r = 0  # running total reward of the episode
+     s = self.env.reset()  # reset the environment
+     ep_r = 0  # running total reward of the episode
while True:
# visualize Worker_0 during training
if self.name == 'Worker_0' and total_step % 30 == 0:  # Worker_0 renders once every 30 steps
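For context, the re-indented lines belong to the worker's episode loop: reset the environment, accumulate the episode reward, and render Worker_0 every 30 steps. A stand-alone sketch under assumed settings (classic gym API with a 4-tuple step return, Pendulum-v0, and a random action standing in for the actor's choice):

# Sketch: worker episode loop (assumptions: classic gym, Pendulum-v0, random actions).
import gym

env = gym.make('Pendulum-v0')
MAX_GLOBAL_EP, GLOBAL_EP, total_step = 3, 0, 1
name = 'Worker_0'

while GLOBAL_EP < MAX_GLOBAL_EP:
    s = env.reset()                        # reset the environment
    ep_r = 0                               # running total reward of this episode
    while True:
        if name == 'Worker_0' and total_step % 30 == 0:
            env.render()                   # visualize Worker_0 every 30 steps
        a = env.action_space.sample()      # stand-in for the actor's action
        s, r, done, _ = env.step(a)
        ep_r += r
        total_step += 1
        if done:
            break
    GLOBAL_EP += 1
env.close()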
9 changes: 6 additions & 3 deletions tutorial_DPPO.py
@@ -63,6 +63,8 @@
C_UPDATE_STEPS = 10 # critic update steps
S_DIM, A_DIM = 3, 1 # state dimension, action dimension
EPS = 1e-8 # epsilon

# parameters for PPO1 and PPO2
METHOD = [
dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
@@ -106,7 +108,7 @@ def a_train(self, tfs, tfa, tfadv):
'''
tfs = np.array(tfs, np.float32)
tfa = np.array(tfa, np.float32)
- tfadv = np.array(tfadv, np.float32)
+ tfadv = np.array(tfadv, np.float32)  # td-error
with tf.GradientTape() as tape:
mu, sigma = self.actor(tfs)
pi = tfp.distributions.Normal(mu, sigma)
@@ -117,6 +119,7 @@ def a_train(self, tfs, tfa, tfadv):
# ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS)
surr = ratio * tfadv

## PPO1
if METHOD['name'] == 'kl_pen':
tflam = METHOD['lam']
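The a_train hunk computes the importance-sampling ratio against the old policy and then either the PPO1 KL-penalty loss or the PPO2 clipped surrogate, depending on which METHOD entry (added further up) is selected. A toy, self-contained sketch of both objectives, with made-up distributions and advantages, might look like:

# Sketch: PPO1 (KL penalty) vs PPO2 (clipped surrogate); all numbers are illustrative.
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
EPS = 1e-8

METHOD = dict(name='clip', epsilon=0.2)          # or dict(name='kl_pen', kl_target=0.01, lam=0.5)

oldpi = tfd.Normal(loc=[[0.0]], scale=[[1.0]])   # policy that collected the batch
pi    = tfd.Normal(loc=[[0.1]], scale=[[0.9]])   # current policy being optimised
tfa   = tf.constant([[0.5]])                     # action taken
tfadv = tf.constant([[1.2]])                     # advantage (td-error) estimate

ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS)   # importance-sampling ratio
surr = ratio * tfadv                             # surrogate objective

if METHOD['name'] == 'kl_pen':                   # PPO1: penalise divergence from the old policy
    kl = tfd.kl_divergence(oldpi, pi)
    aloss = -tf.reduce_mean(surr - METHOD['lam'] * kl)
else:                                            # PPO2: clip the ratio instead
    clipped = tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv
    aloss = -tf.reduce_mean(tf.minimum(surr, clipped))

print(aloss.numpy())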
@@ -180,7 +183,7 @@ def update(self):
global GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():  # while the coordinator has not been stopped
if GLOBAL_EP < EP_MAX:  # EP_MAX is the maximum number of updates
- UPDATE_EVENT.wait()  # the PPO thread waits here; wait until a batch of data is ready
+ UPDATE_EVENT.wait()  # the PPO thread waits here
self.update_old_pi() # copy pi to old pi
data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers
data = np.vstack(data)
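The update() hunk is one half of a producer/consumer hand-off: the PPO thread blocks on UPDATE_EVENT, then drains QUEUE and stacks everything into a single training batch. A minimal sketch with dummy worker data (shapes and values are assumptions):

# Sketch: updater side of the event/queue hand-off, with two fake worker batches.
import queue
import threading
import numpy as np

UPDATE_EVENT = threading.Event()
QUEUE = queue.Queue()

QUEUE.put(np.ones((4, 5), np.float32))    # pretend worker 1 pushed 4 transitions
QUEUE.put(np.zeros((3, 5), np.float32))   # pretend worker 2 pushed 3 transitions
UPDATE_EVENT.set()                        # workers signal that enough data is ready

UPDATE_EVENT.wait()                                    # the PPO thread waits here
data = [QUEUE.get() for _ in range(QUEUE.qsize())]     # collect data from all workers
data = np.vstack(data)                                 # one (7, 5) training batch
UPDATE_EVENT.clear()                                   # go back to waiting for the next batch
print(data.shape)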
@@ -347,7 +350,7 @@ def work(self):
# if enough data has been collected, start an update
if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
ROLLING_EVENT.clear() # stop collecting data
- UPDATE_EVENT.set()  # globalPPO update
+ UPDATE_EVENT.set()  # global PPO update

if GLOBAL_EP >= EP_MAX: # stop training
COORD.request_stop()  # stop updating
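The work() hunk is the other half of the hand-off: once GLOBAL_UPDATE_COUNTER reaches MIN_BATCH_SIZE the workers pause collection and wake the updater, and once EP_MAX episodes are done the coordinator stops every thread. A small sketch with illustrative counter values:

# Sketch: worker-side signalling; counter values are assumptions, not the tutorial's.
import threading
import tensorflow as tf

ROLLING_EVENT = threading.Event()     # set -> workers may keep collecting data
UPDATE_EVENT = threading.Event()      # set -> the global PPO update may run
COORD = tf.train.Coordinator()

MIN_BATCH_SIZE, EP_MAX = 64, 1000
GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 64, 1000

if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
    ROLLING_EVENT.clear()             # stop collecting data
    UPDATE_EVENT.set()                # let the global PPO update run

if GLOBAL_EP >= EP_MAX:               # training budget exhausted
    COORD.request_stop()              # ask every worker thread to stop
print(COORD.should_stop())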
