text setup
Jason Peng committed Jun 12, 2023
1 parent 45fc20a commit 911e9ae
Showing 33 changed files with 865 additions and 70 deletions.
Binary file modified a1/a1.pdf
Binary file not shown.
6 changes: 4 additions & 2 deletions a1/bc_agent.py
@@ -118,11 +118,13 @@ def _decide_action(self, obs, info):

## a) sample an action from the policy
# placeholder
a = torch.zeros([self._env.compute_act_shape()], device=self._device)
a_space = self._env.get_action_space()
a = torch.zeros(a_space.shape, device=self._device)

## b) query the expert for an action
# placeholder
expert_a = torch.zeros([self._env.compute_act_shape()], device=self._device)
a_space = self._env.get_action_space()
expert_a = torch.zeros(a_space.shape, device=self._device)

a_info = {
"expert_a": expert_a
Binary file modified a2/a2.pdf
Binary file not shown.
5 changes: 1 addition & 4 deletions a2/pg_agent.py
@@ -79,11 +79,8 @@ def _decide_action(self, obs, info):
norm_a = norm_action_dist.mode
else:
assert(False), "Unsupported agent mode: {}".format(self._mode)

norm_a_logp = norm_action_dist.log_prob(norm_a)


norm_a = norm_a.detach()
norm_a_logp = norm_a_logp.detach()
a = self._a_norm.unnormalize(norm_a)

info = dict()
Binary file added a3/a3.pdf
Binary file not shown.
24 changes: 24 additions & 0 deletions a3/atari_dqn_agent.yaml
@@ -0,0 +1,24 @@
agent_name: "DQN"

model:
q_net: "cnn_3conv_1fc_0"
q_init_output_scale: 1.0

discount: 0.99
steps_per_iter: 1
iters_per_output: 20000
test_episodes: 10
normalizer_samples: 0

# optimizer parameters
learning_rate: 5e-4

exp_buffer_size: 200000
updates_per_iter: 1
batch_size: 128
init_samples: 50000
tar_net_update_iters: 10000

exp_anneal_samples: 1000000
exp_prob_beg: 1.0
exp_prob_end: 0.1
181 changes: 181 additions & 0 deletions a3/dqn_agent.py
@@ -0,0 +1,181 @@
import gym
import numpy as np
import torch

import envs.base_env as base_env
import learning.base_agent as base_agent
import learning.dqn_model as dqn_model
import util.torch_util as torch_util

class DQNAgent(base_agent.BaseAgent):
NAME = "DQN"

def __init__(self, config, env, device):
super().__init__(config, env, device)
return

def _load_params(self, config):
super()._load_params(config)

buffer_size = config["exp_buffer_size"]
self._exp_buffer_length = int(buffer_size)
self._exp_buffer_length = max(self._exp_buffer_length, self._steps_per_iter)

self._updates_per_iter = config["updates_per_iter"]
self._batch_size = config["batch_size"]
self._init_samples = config["init_samples"]
self._tar_net_update_iters = config["tar_net_update_iters"]

self._exp_anneal_samples = config.get("exp_anneal_samples", np.inf)
self._exp_prob_beg = config.get("exp_prob_beg", 1.0)
self._exp_prob_end = config.get("exp_prob_end", 1.0)

return

def _build_model(self, config):
model_config = config["model"]
self._model = dqn_model.DQNModel(model_config, self._env)

self._tar_model = dqn_model.DQNModel(model_config, self._env)
for param in self._tar_model.parameters():
param.requires_grad = False

self._sync_tar_model()
return

def _get_exp_buffer_length(self):
return self._exp_buffer_length

def _decide_action(self, obs, info):
norm_obs = self._obs_norm.normalize(obs)
qs = self._model.eval_q(norm_obs)

if (self._mode == base_agent.AgentMode.TRAIN):
a = self._sample_action(qs)
elif (self._mode == base_agent.AgentMode.TEST):
a = torch.argmax(qs, dim=-1)
else:
assert(False), "Unsupported agent mode: {}".format(self._mode)

a_info = {}
return a, a_info

def _init_train(self):
super()._init_train()
self._collect_init_samples(self._init_samples)
return

def _collect_init_samples(self, samples):
self.eval()
self.set_mode(base_agent.AgentMode.TRAIN)
self._rollout_train(samples)
return

def _update_model(self):
self.train()

train_info = dict()

for i in range(self._updates_per_iter):
batch = self._exp_buffer.sample(self._batch_size)
loss_info = self._compute_loss(batch)

self._optimizer.zero_grad()
loss = loss_info["loss"]
loss.backward()
self._optimizer.step()

torch_util.add_torch_dict(loss_info, train_info)

torch_util.scale_torch_dict(1.0 / self._updates_per_iter, train_info)

if (self._iter % self._tar_net_update_iters == 0):
self._sync_tar_model()

return train_info

def _log_train_info(self, train_info, test_info, start_time):
super()._log_train_info(train_info, test_info, start_time)
self._logger.log("Exp_Prob", self._get_exp_prob())
return

def _compute_loss(self, batch):
r = batch["reward"]
done = batch["done"]
a = batch["action"]
norm_obs = self._obs_norm.normalize(batch["obs"])
norm_next_obs = self._obs_norm.normalize(batch["next_obs"])

tar_vals = self._compute_tar_vals(r, norm_next_obs, done)
q_loss = self._compute_q_loss(norm_obs, a, tar_vals)

info = {"loss": q_loss}
return info



def _get_exp_prob(self):
'''
TODO 1.1: Calculate the epsilon-greedy exploration probability given the current sample
count. The exploration probability should start with a value of self._exp_prob_beg, and
then be linearly annealed to self._exp_prob_end over the course of self._exp_anneal_samples
timesteps.
'''

# placeholder
prob = 1.0
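        # Sketch of the linear schedule described above (illustrative, not necessarily
        # the intended solution). `self._sample_count` is a hypothetical name for however
        # the base agent tracks the number of samples collected so far; that counter is
        # not shown in this diff, so it is looked up defensively here.
        sample_count = float(getattr(self, "_sample_count", 0))
        t = min(sample_count / self._exp_anneal_samples, 1.0)
        prob = (1.0 - t) * self._exp_prob_beg + t * self._exp_prob_end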

return prob

def _sample_action(self, qs):
'''
TODO 1.2: Sample actions according to the Q-values of each action (qs). Implement epsilon
greedy exploration, where the probability of sampling a random action (epsilon) is specified
by self._get_exp_prob(). With probability 1 - epsilon, greedily select the action with
the highest Q-value. With probability epsilon, select a random action uniformly from the
set of possible actions. The output (a) should be a tensor containing the index of the selected
action.
'''
exp_prob = self._get_exp_prob()

# placeholder
a = torch.zeros(qs.shape[0], device=self._device, dtype=torch.int64)
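        # Minimal epsilon-greedy sketch (illustrative, not necessarily the intended
        # solution): take the greedy action, then replace it with a uniformly sampled
        # random action with probability exp_prob.
        greedy_a = torch.argmax(qs, dim=-1)
        rand_a = torch.randint(qs.shape[-1], greedy_a.shape, device=greedy_a.device)
        explore = torch.rand(greedy_a.shape, device=greedy_a.device) < exp_prob
        a = torch.where(explore, rand_a, greedy_a)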
return a

def _compute_tar_vals(self, r, norm_next_obs, done):
'''
TODO 1.3: Compute target values for updating the Q-function. The inputs consist of a tensor
of rewards (r), normalized observations at the next timestep (norm_next_obs), and done flags
(done) indicating if a timestep is the last timestep of an episode. The output (tar_vals)
should be a tensor containing the target values for each sample. The target values should
be calculated using the target model (self._tar_model), not the main model (self._model).
The Q-function can be queried by using the method eval_q(norm_obs).
'''

# placeholder
tar_vals = torch.zeros_like(r)
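        # Illustrative one-step TD target sketch (not necessarily the intended solution).
        # Assumptions not confirmed by this diff: the discount factor is stored by the
        # base agent as `self._discount` (the name is a guess, with the config value 0.99
        # used as a fallback), and only DoneFlags.FAIL marks a terminal state whose value
        # should not be bootstrapped.
        discount = getattr(self, "_discount", 0.99)
        next_qs = self._tar_model.eval_q(norm_next_obs)
        next_v = torch.max(next_qs, dim=-1)[0].reshape(r.shape)
        end_mask = (done == base_env.DoneFlags.FAIL.value).type_as(r).reshape(r.shape)
        tar_vals = r + discount * (1.0 - end_mask) * next_v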

return tar_vals

def _compute_q_loss(self, norm_obs, a, tar_vals):
'''
TODO 1.4: Compute a loss for updating the Q-function. The inputs consist of a tensor of
normalized observations (norm_obs), a tensor containing the indices of actions selected
at each timestep (a), and target values for each timestep (tar_vals). The output (loss)
should be a scalar tensor containing the loss for updating the Q-function.
'''

# placeholder
loss = torch.zeros(1)
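        # Illustrative sketch (not necessarily the intended solution): select each
        # timestep's Q-value for the chosen action and regress it toward the target
        # value with a simple mean-squared-error loss.
        qs = self._model.eval_q(norm_obs)
        q_sel = torch.gather(qs, dim=-1, index=a.view(qs.shape[0], -1).long())
        loss = torch.mean(torch.square(q_sel.view(-1) - tar_vals.view(-1)))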

return loss

def _sync_tar_model(self):
'''
TODO 1.5: Update the target model by copying the parameters from the main model. The
main model is given by self._model, and the target model is given by self._tar_model.
HINT: self._model.parameters() can be used to retrieve a list of tensors containing
the parameters of a model.
'''
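        # Straightforward parameter copy following the hint in the docstring above
        # (an illustrative sketch, not necessarily the intended solution): copy each
        # main-model parameter into the corresponding target-model parameter.
        with torch.no_grad():
            for tar_param, param in zip(self._tar_model.parameters(),
                                        self._model.parameters()):
                tar_param.copy_(param)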

return
3 changes: 3 additions & 0 deletions data/envs/atari_breakout.yaml
@@ -0,0 +1,3 @@
env_name: "atari_Breakout"

max_timesteps: 5000
1 change: 1 addition & 0 deletions data/envs/atari_ms_pacman.yaml
@@ -0,0 +1 @@
env_name: "atari_MsPacman"
1 change: 1 addition & 0 deletions data/envs/atari_pong.yaml
@@ -0,0 +1 @@
env_name: "atari_Pong"
1 change: 1 addition & 0 deletions data/envs/atari_space_invaders.yaml
@@ -0,0 +1 @@
env_name: "atari_SpaceInvaders"
99 changes: 99 additions & 0 deletions envs/atari_env.py
@@ -0,0 +1,99 @@
import envs.base_env as base_env
import envs.atari_wrappers as atari_wrappers

import gym
import numpy as np
import torch

class AtariEnv(base_env.BaseEnv):
def __init__(self, config, device, visualize):
super().__init__(visualize)

self._device = device
self._curr_obs = None
self._curr_info = None

self._env = self._build_atari_env(config, visualize)

self.reset()

return

def reset(self):
obs, info = self._env.reset()
self._curr_obs = self._convert_obs(obs)
self._curr_info = info
return self._curr_obs, self._curr_info

def step(self, action):
a = action.cpu().numpy()[0]
obs, reward, terminated, truncated, info = self._env.step(a)
self._curr_obs = self._convert_obs(obs)
self._curr_info = info

# train with clipped reward and then test with unclipped rewards
if (self._mode is base_env.EnvMode.TRAIN):
reward = reward[1] # clipped reward
else:
reward = reward[0] # unclipped reward

reward = torch.tensor([reward], device=self._device)
done = self._compute_done(terminated, truncated)

return self._curr_obs, reward, done, self._curr_info

def _build_atari_env(self, config, visualize):
env_name = config["env_name"]
max_timesteps = config.get("max_timesteps", None)
assert(env_name.startswith("atari_"))

env_name = env_name[6:]
env_name = env_name + "NoFrameskip-v4"

atari_env = atari_wrappers.make_atari(env_name,
max_episode_steps=max_timesteps,
visualize=visualize)
atari_env = atari_wrappers.wrap_deepmind(atari_env,
warp_frame=True,
frame_stack=True)

return atari_env

def _convert_obs(self, obs):
num_frames = obs.count()
frames = [np.expand_dims(obs.frame(i), axis=-3) for i in range(num_frames)]
frames = np.concatenate(frames, axis=-3)

obs = torch.tensor(frames, device=self._device)
obs = obs.unsqueeze(0)

return obs

def _compute_done(self, terminated, truncated):
if (terminated):
done = torch.tensor([base_env.DoneFlags.FAIL.value], device=self._device)
elif (truncated):
done = torch.tensor([base_env.DoneFlags.TIME.value], device=self._device)
else:
done = torch.tensor([base_env.DoneFlags.NULL.value], device=self._device)
return done

def get_obs_space(self):
obs_space = self._env.observation_space
obs_shape = list(obs_space.shape)
obs_shape = obs_shape[-1:] + obs_shape[:-1]
obs_space = gym.spaces.Box(
low=obs_space.low.min(),
high=obs_space.high.max(),
shape=obs_shape,
dtype=obs_space.dtype,
)
return obs_space

def get_action_space(self):
a_space = self._env.action_space
a_space._shape = (1,)
return a_space

def get_reward_bounds(self):
return (-1.0, 1.0)