Commit 911e9ae
Jason Peng committed on Jun 12, 2023
1 parent: 45fc20a
Showing 33 changed files with 865 additions and 70 deletions.

@@ -0,0 +1,24 @@
agent_name: "DQN"

model:
  q_net: "cnn_3conv_1fc_0"
  q_init_output_scale: 1.0

discount: 0.99
steps_per_iter: 1
iters_per_output: 20000
test_episodes: 10
normalizer_samples: 0

# optimizer parameters
learning_rate: 5e-4

exp_buffer_size: 200000
updates_per_iter: 1
batch_size: 128
init_samples: 50000
tar_net_update_iters: 10000

exp_anneal_samples: 1000000
exp_prob_beg: 1.0
exp_prob_end: 0.1
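
As a rough reference for how a config like this might be consumed, the snippet below loads the file with PyYAML and reads a few of the keys. The file path and the use of PyYAML are assumptions for illustration, not something this commit specifies.

# Illustrative loading sketch; the path below is hypothetical.
import yaml

with open("dqn_atari_agent_config.yaml", "r") as f:
    config = yaml.safe_load(f)

print(config["agent_name"])                            # "DQN"
print(config["model"]["q_net"])                        # "cnn_3conv_1fc_0"
print(config["exp_prob_beg"], config["exp_prob_end"])  # 1.0 0.1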

@@ -0,0 +1,181 @@
import gym
import numpy as np
import torch

import envs.base_env as base_env
import learning.base_agent as base_agent
import learning.dqn_model as dqn_model
import util.torch_util as torch_util

class DQNAgent(base_agent.BaseAgent):
    NAME = "DQN"

    def __init__(self, config, env, device):
        super().__init__(config, env, device)
        return

    def _load_params(self, config):
        super()._load_params(config)

        buffer_size = config["exp_buffer_size"]
        self._exp_buffer_length = int(buffer_size)
        self._exp_buffer_length = max(self._exp_buffer_length, self._steps_per_iter)

        self._updates_per_iter = config["updates_per_iter"]
        self._batch_size = config["batch_size"]
        self._init_samples = config["init_samples"]
        self._tar_net_update_iters = config["tar_net_update_iters"]

        self._exp_anneal_samples = config.get("exp_anneal_samples", np.inf)
        self._exp_prob_beg = config.get("exp_prob_beg", 1.0)
        self._exp_prob_end = config.get("exp_prob_end", 1.0)

        return

    def _build_model(self, config):
        model_config = config["model"]
        self._model = dqn_model.DQNModel(model_config, self._env)

        self._tar_model = dqn_model.DQNModel(model_config, self._env)
        for param in self._tar_model.parameters():
            param.requires_grad = False

        self._sync_tar_model()
        return

    def _get_exp_buffer_length(self):
        return self._exp_buffer_length

    def _decide_action(self, obs, info):
        norm_obs = self._obs_norm.normalize(obs)
        qs = self._model.eval_q(norm_obs)

        if (self._mode == base_agent.AgentMode.TRAIN):
            a = self._sample_action(qs)
        elif (self._mode == base_agent.AgentMode.TEST):
            a = torch.argmax(qs, dim=-1)
        else:
            assert(False), "Unsupported agent mode: {}".format(self._mode)

        a_info = {}
        return a, a_info

    def _init_train(self):
        super()._init_train()
        self._collect_init_samples(self._init_samples)
        return

    def _collect_init_samples(self, samples):
        self.eval()
        self.set_mode(base_agent.AgentMode.TRAIN)
        self._rollout_train(samples)
        return

    def _update_model(self):
        self.train()

        train_info = dict()

        for i in range(self._updates_per_iter):
            batch = self._exp_buffer.sample(self._batch_size)
            loss_info = self._compute_loss(batch)

            self._optimizer.zero_grad()
            loss = loss_info["loss"]
            loss.backward()
            self._optimizer.step()

            torch_util.add_torch_dict(loss_info, train_info)

        torch_util.scale_torch_dict(1.0 / self._updates_per_iter, train_info)

        if (self._iter % self._tar_net_update_iters == 0):
            self._sync_tar_model()

        return train_info

    def _log_train_info(self, train_info, test_info, start_time):
        super()._log_train_info(train_info, test_info, start_time)
        self._logger.log("Exp_Prob", self._get_exp_prob())
        return

    def _compute_loss(self, batch):
        r = batch["reward"]
        done = batch["done"]
        a = batch["action"]
        norm_obs = self._obs_norm.normalize(batch["obs"])
        norm_next_obs = self._obs_norm.normalize(batch["next_obs"])

        tar_vals = self._compute_tar_vals(r, norm_next_obs, done)
        q_loss = self._compute_q_loss(norm_obs, a, tar_vals)

        info = {"loss": q_loss}
        return info

    def _get_exp_prob(self):
        '''
        TODO 1.1: Calculate the epsilon-greedy exploration probability given the current sample
        count. The exploration probability should start with a value of self._exp_prob_beg, and
        then be linearly annealed to self._exp_prob_end over the course of self._exp_anneal_samples
        timesteps.
        '''

        # placeholder
        prob = 1.0

        return prob

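    # Rough sketch of one way TODO 1.1 could be implemented. It assumes the base
    # agent tracks the total number of collected samples in self._sample_count
    # (a hypothetical attribute name; the actual counter may differ).
    def _get_exp_prob_sketch(self):
        frac = min(float(self._sample_count) / float(self._exp_anneal_samples), 1.0)
        prob = (1.0 - frac) * self._exp_prob_beg + frac * self._exp_prob_end
        return prob
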
    def _sample_action(self, qs):
        '''
        TODO 1.2: Sample actions according to the Q-values of each action (qs). Implement
        epsilon-greedy exploration, where the probability of sampling a random action (epsilon)
        is specified by self._get_exp_prob(). With probability 1 - epsilon, greedily select the
        action with the highest Q-value. With probability epsilon, select a random action
        uniformly from the set of possible actions. The output (a) should be a tensor containing
        the index of the selected action.
        '''
        exp_prob = self._get_exp_prob()

        # placeholder
        a = torch.zeros(qs.shape[0], device=self._device, dtype=torch.int64)
        return a

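    # Rough sketch for TODO 1.2: epsilon-greedy sampling over the last dimension
    # of qs, assuming qs has shape [batch_size, num_actions].
    def _sample_action_sketch(self, qs):
        exp_prob = self._get_exp_prob()
        greedy_a = torch.argmax(qs, dim=-1)
        rand_a = torch.randint(0, qs.shape[-1], greedy_a.shape, device=qs.device)
        explore = torch.rand(greedy_a.shape, device=qs.device) < exp_prob
        a = torch.where(explore, rand_a, greedy_a)
        return a
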
    def _compute_tar_vals(self, r, norm_next_obs, done):
        '''
        TODO 1.3: Compute target values for updating the Q-function. The inputs consist of a tensor
        of rewards (r), normalized observations at the next timestep (norm_next_obs), and done flags
        (done) indicating if a timestep is the last timestep of an episode. The output (tar_vals)
        should be a tensor containing the target values for each sample. The target values should
        be calculated using the target model (self._tar_model), not the main model (self._model).
        The Q-function can be queried by using the method eval_q(norm_obs).
        '''

        # placeholder
        tar_vals = torch.zeros_like(r)

        return tar_vals

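    # Rough sketch for TODO 1.3: one-step TD targets bootstrapped from the target
    # network. It assumes self._discount holds the discount factor and that
    # base_env.DoneFlags.FAIL marks true terminations where bootstrapping should stop.
    def _compute_tar_vals_sketch(self, r, norm_next_obs, done):
        with torch.no_grad():
            next_qs = self._tar_model.eval_q(norm_next_obs)
            next_v = torch.max(next_qs, dim=-1)[0]
        not_fail = (done != base_env.DoneFlags.FAIL.value).float()
        tar_vals = r + self._discount * not_fail * next_v
        return tar_vals
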
    def _compute_q_loss(self, norm_obs, a, tar_vals):
        '''
        TODO 1.4: Compute a loss for updating the Q-function. The inputs consist of a tensor of
        normalized observations (norm_obs), a tensor containing the indices of actions selected
        at each timestep (a), and target values for each timestep (tar_vals). The output (loss)
        should be a scalar tensor containing the loss for updating the Q-function.
        '''

        # placeholder
        loss = torch.zeros(1)

        return loss

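    # Rough sketch for TODO 1.4: regress the Q-value of the selected action toward
    # the target value, assuming qs has shape [batch_size, num_actions] and a holds
    # integer action indices.
    def _compute_q_loss_sketch(self, norm_obs, a, tar_vals):
        qs = self._model.eval_q(norm_obs)
        a_idx = a.long().reshape(qs.shape[0], 1)
        sel_q = torch.gather(qs, dim=-1, index=a_idx).squeeze(-1)
        loss = torch.nn.functional.mse_loss(sel_q, tar_vals.reshape(sel_q.shape))
        return loss
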
    def _sync_tar_model(self):
        '''
        TODO 1.5: Update the target model by copying the parameters from the main model. The
        main model is given by self._model, and the target model is given by self._tar_model.
        HINT: self._model.parameters() can be used to retrieve a list of tensors containing
        the parameters of a model.
        '''

        return

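    # Rough sketch for TODO 1.5: copy each parameter tensor from the main model into
    # the corresponding target-model tensor (following the hint above).
    def _sync_tar_model_sketch(self):
        with torch.no_grad():
            for tar_param, param in zip(self._tar_model.parameters(), self._model.parameters()):
                tar_param.copy_(param)
        return
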
@@ -0,0 +1,3 @@
env_name: "atari_Breakout"

max_timesteps: 5000

@@ -0,0 +1 @@
env_name: "atari_MsPacman"

@@ -0,0 +1 @@
env_name: "atari_Pong"

@@ -0,0 +1 @@
env_name: "atari_SpaceInvaders"

@@ -0,0 +1,99 @@
import envs.base_env as base_env
import envs.atari_wrappers as atari_wrappers

import gym
import numpy as np
import torch

class AtariEnv(base_env.BaseEnv):
    def __init__(self, config, device, visualize):
        super().__init__(visualize)

        self._device = device
        self._curr_obs = None
        self._curr_info = None

        self._env = self._build_atari_env(config, visualize)

        self.reset()

        return

    def reset(self):
        obs, info = self._env.reset()
        self._curr_obs = self._convert_obs(obs)
        self._curr_info = info
        return self._curr_obs, self._curr_info

    def step(self, action):
        a = action.cpu().numpy()[0]
        obs, reward, terminated, truncated, info = self._env.step(a)
        self._curr_obs = self._convert_obs(obs)
        self._curr_info = info

        # train with clipped reward and then test with unclipped rewards
        if (self._mode is base_env.EnvMode.TRAIN):
            reward = reward[1] # clipped reward
        else:
            reward = reward[0] # unclipped reward

        reward = torch.tensor([reward], device=self._device)
        done = self._compute_done(terminated, truncated)

        return self._curr_obs, reward, done, self._curr_info

    def _build_atari_env(self, config, visualize):
        env_name = config["env_name"]
        max_timesteps = config.get("max_timesteps", None)
        assert(env_name.startswith("atari_"))

        env_name = env_name[6:]
        env_name = env_name + "NoFrameskip-v4"

        atari_env = atari_wrappers.make_atari(env_name,
                                              max_episode_steps=max_timesteps,
                                              visualize=visualize)
        atari_env = atari_wrappers.wrap_deepmind(atari_env,
                                                 warp_frame=True,
                                                 frame_stack=True)

        return atari_env

    def _convert_obs(self, obs):
        num_frames = obs.count()
        frames = [np.expand_dims(obs.frame(i), axis=-3) for i in range(num_frames)]
        frames = np.concatenate(frames, axis=-3)

        obs = torch.tensor(frames, device=self._device)
        obs = obs.unsqueeze(0)

        return obs

    def _compute_done(self, terminated, truncated):
        if (terminated):
            done = torch.tensor([base_env.DoneFlags.FAIL.value], device=self._device)
        elif (truncated):
            done = torch.tensor([base_env.DoneFlags.TIME.value], device=self._device)
        else:
            done = torch.tensor([base_env.DoneFlags.NULL.value], device=self._device)
        return done

    def get_obs_space(self):
        obs_space = self._env.observation_space
        obs_shape = list(obs_space.shape)
        obs_shape = obs_shape[-1:] + obs_shape[:-1]
        obs_space = gym.spaces.Box(
            low=obs_space.low.min(),
            high=obs_space.high.max(),
            shape=obs_shape,
            dtype=obs_space.dtype,
        )
        return obs_space

    def get_action_space(self):
        a_space = self._env.action_space
        a_space._shape = (1,)
        return a_space

    def get_reward_bounds(self):
        return (-1.0, 1.0)
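
A rough usage sketch of the environment above; the module path envs.atari_env, the config contents, and the environment's default mode are assumptions based on the files in this commit rather than something it states.

import torch
import envs.atari_env as atari_env

config = {"env_name": "atari_Pong", "max_timesteps": 5000}
device = torch.device("cpu")

env = atari_env.AtariEnv(config, device, visualize=False)
obs, info = env.reset()

# actions are passed as a length-1 integer tensor, matching get_action_space()
a = torch.zeros(1, dtype=torch.int64)
obs, reward, done, info = env.step(a)
print(obs.shape, reward.item(), done.item())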