adding multiprocessing
Jason Peng committed Sep 17, 2024
1 parent fff25c7 commit d3e608e
Showing 18 changed files with 513 additions and 120 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -19,19 +19,20 @@ and it should be good to go.

To train a policy, run the following command:

``python run.py --env_config data/envs/dm_cheetah.yaml --agent_config a2/dm_cheetah_cem_agent.yaml --mode train --log_file output/log.txt --out_model_file output/model.pt --visualize``
``python run.py --env_config data/envs/dm_cheetah.yaml --agent_config a2/dm_cheetah_cem_agent.yaml --mode train --log_file output/log.txt --out_model_file output/model.pt --visualize --num_workers 1``

- `--env_config` specifies the configuration file for the environment.
- `--agent_config` specifies the configuration file for the agent.
- `--visualize` enables visualization. Rendering should be disabled for faster training.
- `--log_file` specifies the output log file, which will keep track of statistics during training.
- `--out_model_file` specifies the output model file, which contains the model parameters.
- `--num_workers` specifies the number of worker processes used to parallelize training.
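The diff does not show how `run.py` consumes `--num_workers`, but a minimal sketch of the usual pattern with `torch.multiprocessing` is given below; `worker_main`, the rendezvous address, and the `gloo` backend are illustrative assumptions, not the repository's actual code.

```python
# Hypothetical sketch of launching --num_workers training processes.
import torch.distributed as dist
import torch.multiprocessing as mp

def worker_main(rank, num_workers):
    # Each worker joins a process group so results and gradients can be
    # gathered or averaged across processes during training.
    dist.init_process_group(backend="gloo",
                            init_method="tcp://127.0.0.1:23456",
                            rank=rank, world_size=num_workers)
    # ... build the env and agent, then run training for this worker ...
    dist.destroy_process_group()

if __name__ == "__main__":
    num_workers = 2  # value that --num_workers would supply
    mp.spawn(worker_main, args=(num_workers,), nprocs=num_workers)
```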

## Test Models

To load a trained model, run the following command:

``python run.py --env_config data/envs/dm_cheetah_env.yaml --agent_config a2/dm_cheetah_cem_agent.yaml --mode test --model_file data/models/dm_cheetah_ppo_model.pt --visualize``
``python run.py --env_config data/envs/dm_cheetah_env.yaml --agent_config a2/dm_cheetah_cem_agent.yaml --mode test --model_file data/models/dm_cheetah_ppo_model.pt --visualize --num_workers 1``

- `--model_file` specifies the `.pt` file that contains parameters for the trained model. Pretrained models are available in `data/models/`.

16 changes: 10 additions & 6 deletions a1/bc_agent.py
@@ -4,6 +4,7 @@
import learning.agent_builder as agent_builder
import learning.base_agent as base_agent
import learning.bc_model as bc_model
import util.mp_util as mp_util
import util.torch_util as torch_util

class BCAgent(base_agent.BaseAgent):
@@ -15,11 +16,16 @@ def __init__(self, config, env, device):

def _load_params(self, config):
super()._load_params(config)

buffer_size = config["exp_buffer_size"]
self._exp_buffer_length = max(buffer_size, self._steps_per_iter)

num_procs = mp_util.get_num_procs()

self._batch_size = config["batch_size"]
self._batch_size = int(np.ceil(self._batch_size / num_procs))

buffer_size = config["exp_buffer_size"]
self._exp_buffer_length = int(np.ceil(buffer_size / num_procs))
self._exp_buffer_length = max(self._exp_buffer_length, self._steps_per_iter)

self._update_epochs = config["update_epochs"]
return
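The ceiling division above splits the configured totals across workers, so the combined batch and buffer sizes stay roughly the same regardless of `--num_workers`. A quick worked example with illustrative values:

```python
import numpy as np

batch_size = 256          # from the agent config (illustrative value)
exp_buffer_size = 1000000
num_procs = 4             # number of worker processes

per_proc_batch = int(np.ceil(batch_size / num_procs))        # 64 samples per worker
per_proc_buffer = int(np.ceil(exp_buffer_size / num_procs))  # 250000 transitions per worker
# 4 workers * 64 = 256 samples per update, matching the single-process setting.
```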

@@ -81,10 +87,8 @@ def _update_model(self):
batch = self._exp_buffer.sample(batch_size)
loss_info = self._compute_loss(batch)

self._optimizer.zero_grad()
loss = loss_info["loss"]
loss.backward()
self._optimizer.step()
self._optimizer.step(loss)

torch_util.add_torch_dict(loss_info, train_info)

5 changes: 4 additions & 1 deletion a1/dm_cheetah_bc_agent.yaml
@@ -6,6 +6,10 @@ model:
actor_std_type: "FIXED"
action_std: 0.2

optimizer:
type: "SGD"
learning_rate: 5e-4

expert_config: "data/agents/bc_expert_agent.yaml"
expert_model_file: "data/models/dm_cheetah_expert_model.pt"

@@ -14,7 +18,6 @@ steps_per_iter: 100
iters_per_output: 10
test_episodes: 10

learning_rate: 5e-4
update_epochs: 20

exp_buffer_size: 1000000
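The learning rate now lives under an `optimizer:` block instead of a top-level `learning_rate` key. A minimal sketch of reading that block and constructing a matching torch optimizer (the stand-in model and the `momentum` value are assumptions):

```python
import torch
import yaml

model = torch.nn.Linear(4, 2)  # stand-in model for illustration

with open("a1/dm_cheetah_bc_agent.yaml") as f:
    config = yaml.safe_load(f)

opt_cfg = config["optimizer"]  # {"type": "SGD", "learning_rate": 5e-4}
assert opt_cfg["type"] == "SGD"
# YAML may parse "5e-4" as a string, so cast explicitly before building the optimizer.
optimizer = torch.optim.SGD(model.parameters(),
                            lr=float(opt_cfg["learning_rate"]),
                            momentum=0.9)
```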
7 changes: 5 additions & 2 deletions a1/dm_walker_bc_agent.yaml
@@ -5,7 +5,11 @@ model:
actor_init_output_scale: 0.01
actor_std_type: "FIXED"
action_std: 0.2


optimizer:
type: "SGD"
learning_rate: 5e-4

expert_config: "data/agents/bc_expert_agent.yaml"
expert_model_file: "data/models/dm_walker_run_expert_model.pt"

@@ -14,7 +18,6 @@ steps_per_iter: 100
iters_per_output: 10
test_episodes: 10

learning_rate: 5e-4
update_epochs: 20

exp_buffer_size: 1000000
40 changes: 36 additions & 4 deletions a2/cem_agent.py
@@ -3,6 +3,7 @@

import learning.base_agent as base_agent
import learning.cem_model as cem_model
import util.mp_util as mp_util

class CEMAgent(base_agent.BaseAgent):
NAME = "CEM"
@@ -57,10 +58,16 @@ def _update_sample_count(self):
return self._sample_count

def _train_iter(self):
candidates = self._sample_candidates(self._population_size)
num_procs = mp_util.get_num_procs()
num_candidates = int(np.ceil(self._population_size / num_procs))
candidates = self._sample_candidates(num_candidates)

rets, ep_lens = self._eval_candidates(candidates)

candidates, rets, ep_lens = self._gather_candidates(candidates, rets, ep_lens)

rets = rets.cpu().numpy()
ep_lens = ep_lens.cpu().numpy()

curr_best_idx = np.argmax(rets)
curr_best_ret = rets[curr_best_idx]

@@ -82,6 +89,8 @@ def _train_iter(self):
num_eps = self._population_size * self._eps_per_candidate
mean_param_std = torch.mean(new_std)

assert(self._check_synced()), "Network parameters desynchronized"

train_info = {
"mean_return": train_return,
"mean_ep_len": train_ep_len,
@@ -90,6 +99,29 @@
}
return train_info

def _gather_candidates(self, candidates, rets, ep_lens):
candidates = mp_util.all_gather(candidates)
rets = mp_util.all_gather(rets)
ep_lens = mp_util.all_gather(ep_lens)

candidates = torch.cat(candidates, dim=0)
rets = torch.cat(rets, dim=0)
ep_lens = torch.cat(ep_lens, dim=0)

return candidates, rets, ep_lens

def _check_synced(self):
global_param_mean = mp_util.broadcast(self._param_mean)
global_param_std = mp_util.broadcast(self._param_std)
global_best_params = mp_util.broadcast(self._best_params)

synced = torch.equal(global_param_mean, self._param_mean)
synced &= torch.equal(global_param_std, self._param_std)
synced &= torch.equal(global_best_params, self._best_params)

return synced



def _sample_candidates(self, n):
'''
@@ -118,8 +150,8 @@ def _eval_candidates(self, candidates):
n = candidates.shape[0]

# placeholder
rets = np.zeros(n)
ep_lens = np.zeros(n)
rets = torch.zeros(n, device=self._device)
ep_lens = torch.zeros(n, device=self._device)

return rets, ep_lens
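`mp_util.get_num_procs`, `all_gather`, and `broadcast` are used above to split candidate generation across workers, pool their evaluations, and verify that every process ends the iteration with identical parameters. A plausible implementation on top of `torch.distributed` (an assumption about `util/mp_util.py`, not its actual contents):

```python
import torch
import torch.distributed as dist

def get_num_procs():
    return dist.get_world_size() if dist.is_initialized() else 1

def all_gather(tensor):
    # Returns a list with one tensor per process (including this one).
    if not dist.is_initialized():
        return [tensor]
    out = [torch.zeros_like(tensor) for _ in range(dist.get_world_size())]
    dist.all_gather(out, tensor)
    return out

def broadcast(tensor, src=0):
    # Every process receives rank `src`'s copy; returning a clone lets callers
    # such as _check_synced compare it against their own local tensor.
    buf = tensor.clone()
    if dist.is_initialized():
        dist.broadcast(buf, src=src)
    return buf
```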

13 changes: 8 additions & 5 deletions a2/dm_cheetah_pg_agent.yaml
@@ -8,17 +8,20 @@ model:

critic_net: "fc_2layers_128units"


actor_optimizer:
type: "SGD"
learning_rate: 2e-3

critic_optimizer:
type: "SGD"
learning_rate: 2e-3

discount: 0.99
steps_per_iter: 4096
iters_per_output: 50
test_episodes: 32
critic_update_epoch: 5

# optimizer parameters
actor_learning_rate: 2e-3
critic_learning_rate: 2e-3

batch_size: 512
norm_adv_clip: 4.0
action_bound_weight: 10.0
29 changes: 16 additions & 13 deletions a2/pg_agent.py
@@ -3,7 +3,9 @@

import envs.base_env as base_env
import learning.base_agent as base_agent
import learning.mp_optimizer as mp_optimizer
import learning.pg_model as pg_model
import util.mp_util as mp_util
import util.torch_util as torch_util

class PGAgent(base_agent.BaseAgent):
@@ -16,7 +18,11 @@ def __init__(self, config, env, device):
def _load_params(self, config):
super()._load_params(config)

num_procs = mp_util.get_num_procs()

self._batch_size = config["batch_size"]
self._batch_size = int(np.ceil(self._batch_size / num_procs))

self._critic_update_epoch = config["critic_update_epoch"]
self._norm_adv_clip = config["norm_adv_clip"]
self._action_bound_weight = config["action_bound_weight"]
@@ -56,16 +62,15 @@ def _init_iter(self):
return

def _build_optimizer(self, config):
actor_lr = float(config["actor_learning_rate"])
critic_lr = float(config["critic_learning_rate"])

actor_opt_config = config["actor_optimizer"]
actor_params = list(self._model._actor_layers.parameters())+list(self._model._action_dist.parameters())
actor_params_grad = [p for p in actor_params if p.requires_grad]
self._actor_optimizer = torch.optim.SGD(actor_params_grad, actor_lr, momentum=0.9)

self._actor_optimizer = mp_optimizer.MPOptimizer(actor_opt_config, actor_params_grad)

critic_opt_config = config["critic_optimizer"]
critic_params = list(self._model._critic_layers.parameters())+list(self._model._critic_out.parameters())
critic_params_grad = [p for p in critic_params if p.requires_grad]
self._critic_optimizer = torch.optim.SGD(critic_params_grad, critic_lr, momentum=0.9)
self._critic_optimizer = mp_optimizer.MPOptimizer(critic_opt_config, critic_params_grad)

return

@@ -152,9 +157,7 @@ def _update_critic(self, batch):

loss = self._calc_critic_loss(norm_obs, tar_val)

self._critic_optimizer.zero_grad()
loss.backward()
self._critic_optimizer.step()
self._critic_optimizer.step(loss)

info = {
"critic_loss": loss
@@ -179,13 +182,13 @@ def _update_actor(self, batch):
action_bound_loss = torch.mean(action_bound_loss)
loss += self._action_bound_weight * action_bound_loss
info["action_bound_loss"] = action_bound_loss.detach()

self._actor_optimizer.zero_grad()
loss.backward()
self._actor_optimizer.step()

self._actor_optimizer.step(loss)

return info



def _calc_return(self, r, done):
'''
TODO 2.1: Given a tensor of per-timestep rewards (r), and a tensor (done)
5 changes: 3 additions & 2 deletions a3/atari_breakout_dqn_agent.yaml
@@ -10,8 +10,9 @@ iters_per_output: 50000
test_episodes: 10
normalizer_samples: 0

# optimizer parameters
learning_rate: 5e-4
optimizer:
type: "SGD"
learning_rate: 5e-4

exp_buffer_size: 200000
updates_per_iter: 1
5 changes: 3 additions & 2 deletions a3/atari_pong_dqn_agent.yaml
@@ -10,8 +10,9 @@ iters_per_output: 50000
test_episodes: 10
normalizer_samples: 0

# optimizer parameters
learning_rate: 5e-4
optimizer:
type: "SGD"
learning_rate: 5e-4

exp_buffer_size: 200000
updates_per_iter: 1
18 changes: 12 additions & 6 deletions a3/dqn_agent.py
@@ -5,25 +5,34 @@
import envs.base_env as base_env
import learning.base_agent as base_agent
import learning.dqn_model as dqn_model
import util.mp_util as mp_util
import util.torch_util as torch_util

class DQNAgent(base_agent.BaseAgent):
NAME = "DQN"

def __init__(self, config, env, device):
super().__init__(config, env, device)

self._sync_tar_model()
return

def _load_params(self, config):
super()._load_params(config)

num_procs = mp_util.get_num_procs()

buffer_size = config["exp_buffer_size"]
self._exp_buffer_length = int(buffer_size)
self._exp_buffer_length = int(np.ceil(buffer_size / (num_procs)))
self._exp_buffer_length = max(self._exp_buffer_length, self._steps_per_iter)

self._updates_per_iter = config["updates_per_iter"]
self._batch_size = config["batch_size"]
self._batch_size = int(np.ceil(self._batch_size / num_procs))

self._init_samples = config["init_samples"]
self._init_samples = int(np.ceil(self._init_samples / num_procs))

self._updates_per_iter = config["updates_per_iter"]
self._tar_net_update_iters = config["tar_net_update_iters"]

self._exp_anneal_samples = config.get("exp_anneal_samples", np.inf)
@@ -40,7 +49,6 @@ def _build_model(self, config):
for param in self._tar_model.parameters():
param.requires_grad = False

self._sync_tar_model()
return
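Moving `self._sync_tar_model()` out of `_build_model` and into `__init__` means the target network is copied only after the base constructor finishes; with multiple workers, this presumably ensures the copy happens after the online network's parameters have been synchronized across processes. A hedged sketch of that ordering (the broadcast helper and stand-in networks are assumptions):

```python
import torch
import torch.distributed as dist

def sync_model_params(model, src=0):
    # Hypothetical helper: broadcast rank `src`'s parameters so every worker
    # starts from identical weights.
    if dist.is_initialized():
        for p in model.parameters():
            dist.broadcast(p.data, src=src)

def sync_tar_model(model, tar_model):
    # Snapshot the (already synchronized) online weights into the target network.
    tar_model.load_state_dict(model.state_dict())

# Ordering that matches the diff: synchronize the online network first,
# then copy it into the target network.
q_net = torch.nn.Linear(8, 4)    # stand-in networks for illustration
tar_net = torch.nn.Linear(8, 4)
sync_model_params(q_net)
sync_tar_model(q_net, tar_net)
```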

def _get_exp_buffer_length(self):
@@ -80,10 +88,8 @@ def _update_model(self):
batch = self._exp_buffer.sample(self._batch_size)
loss_info = self._compute_loss(batch)

self._optimizer.zero_grad()
loss = loss_info["loss"]
loss.backward()
self._optimizer.step()
self._optimizer.step(loss)

torch_util.add_torch_dict(loss_info, train_info)

10 changes: 5 additions & 5 deletions data/agents/bc_expert_agent.yaml
@@ -8,11 +8,11 @@ model:

critic_net: "fc_2layers_128units"


optimizer:
type: "SGD"
learning_rate: 1e-3

discount: 0.99
steps_per_iter: 4096
iters_per_output: 10
test_episodes: 32

# optimizer parameters
learning_rate: 1e-3
test_episodes: 32