From 1c05c811dde9c4add82aef81883eb5c196001bc9 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 7 Nov 2021 00:50:29 -0400 Subject: [PATCH 1/4] Fixed imports --- maddpg.py | 20 ++++++++++++-------- maddpg_tf_policy.py | 5 +++-- maddpg_torch_policy.py | 15 +++++---------- pettingzoo_maddpg.py | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/maddpg.py b/maddpg.py index a60f0d9..02572d4 100644 --- a/maddpg.py +++ b/maddpg.py @@ -20,6 +20,7 @@ from ray.rllib.utils.typing import TrainerConfigDict from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils import merge_dicts +from ray.rllib.utils.deprecation import DEPRECATED_VALUE logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -66,13 +67,15 @@ "good_policy": "maddpg", # Algorithm for adversary policies. "adv_policy": "maddpg", - # list of other agent_ids and policies to approximate (See MADDPG Section 4.2) - "learn_other_policies": None, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then # each worker will have a replay buffer of this size. - "buffer_size": int(1e6), + "buffer_size": DEPRECATED_VALUE, + "replay_buffer_config": { + "type": "LocalReplayBuffer", + "capacity": int(1e6), + }, # Observation compression. Note that compression makes simulation slow in # MPE. "compress_observations": False, @@ -157,10 +160,11 @@ def sampler(policy, obs): return policy.compute_actions(obs)[0] new_act_n = [sampler(policy, obs) for policy, obs in zip(policies.values(), new_obs_n)] else: - target_act_sampler_n = [p.target_act_sampler for p in policies.values()] new_obs_ph_n = [p.new_obs_ph for p in policies.values()] - feed_dict = dict(zip(new_obs_ph_n, new_obs_n)) - new_act_n = p.sess.run(target_act_sampler_n, feed_dict) + for i, p in enumerate(policies.values()): + feed_dict = {new_obs_ph_n[i]: new_obs_n[i]} + new_act = p.get_session().run(p.target_act_sampler, feed_dict) + samples.update({"new_actions_%d" % i: new_act}) samples.update( {"new_actions_%d" % i: new_act @@ -174,8 +178,8 @@ def sampler(policy, obs): def add_maddpg_postprocessing(config): """Add the before learn on batch hook. - This hook is called explicitly prior to TrainOneStep() in the execution - setups for DQN and APEX. 
+ This hook shares the joint batches across all agents + for MADDPG's centralized critic update """ def f(batch, workers, config): diff --git a/maddpg_tf_policy.py b/maddpg_tf_policy.py index f98bb83..f64cd82 100644 --- a/maddpg_tf_policy.py +++ b/maddpg_tf_policy.py @@ -1,5 +1,6 @@ import ray -from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip, _adjust_nstep +from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip +from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models import ModelCatalog from ray.rllib.policy.sample_batch import SampleBatch @@ -34,7 +35,7 @@ def postprocess_trajectory(self, # N-step Q adjustments if self.config["n_step"] > 1: - _adjust_nstep(self.config["n_step"], self.config["gamma"], + adjust_nstep(self.config["n_step"], self.config["gamma"], sample_batch[SampleBatch.CUR_OBS], sample_batch[SampleBatch.ACTIONS], sample_batch[SampleBatch.REWARDS], diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index df793bc..e12e2a2 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -7,7 +7,7 @@ from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss from ray.rllib.utils.typing import TrainerConfigDict, TensorType, LocalOptimizer -from ray.rllib.agents.dqn.dqn_tf_policy import _adjust_nstep +from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.policy import Policy @@ -27,8 +27,8 @@ def validate_spaces(policy: Policy, obs_space, action_space, config) -> None: if isinstance(obs_space, Discrete) or isinstance(action_space, Discrete): - logging.warning("Discrete spaces may not work with Torch MADDPG; \ - consider using framework=tf instead") + logging.warning("Discrete spaces may not work correctly with \ + pytorch MADDPG; consider using framework=tf instead") policy.observation_space = _make_continuous_space(obs_space) policy.action_space = _make_continuous_space(action_space) @@ -39,11 +39,6 @@ def build_maddpg_models_and_action_dist(policy: Policy, obs_space, action_space, model = build_ddpg_models(policy, obs_space, action_space, config) else: model = build_maddpg_models(policy, obs_space, action_space, config) - device = (torch.device("cuda") - if torch.cuda.is_available() else torch.device("cpu")) - policy.model = policy.model.to(device) - policy.target_model = policy.target_model.to(device) - return model, TorchDeterministic @@ -218,7 +213,7 @@ def postprocess_nstep(policy: Policy, batch: SampleBatch, other_agent_batches=None, episode=None): # N-step Q adjustments if policy.config["n_step"] > 1: - _adjust_nstep(policy.config["n_step"], policy.config["gamma"], + adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS], batch[SampleBatch.REWARDS], @@ -300,5 +295,5 @@ def get_default_config(): before_loss_init=setup_late_mixins, make_model_and_action_dist=build_maddpg_models_and_action_dist, apply_gradients_fn=apply_gradients_fn, - mixins=[TargetNetworkMixin, ComputeTDErrorMixin] + mixins=[TargetNetworkMixin, ComputeTDErrorMixin, SetJointSpacesMixin] ) diff --git a/pettingzoo_maddpg.py b/pettingzoo_maddpg.py index b4c3229..33f615f 100644 --- a/pettingzoo_maddpg.py +++ b/pettingzoo_maddpg.py @@ -167,7 +167,7 @@ def gen_policy(i): # === Multi-agent setting === "multiagent": { "policies": policies, - "policy_mapping_fn": lambda name, _: 
policy_ids[agents.index(name)], + "policy_mapping_fn": lambda name: policy_ids[agents.index(name)], # Workaround because MADDPG requires agent_id: int but actual ids are strings like 'speaker_0' }, From a3c9d928eb85af772676b390a7531c6a3e5eed2e Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 24 Dec 2021 12:48:58 +0530 Subject: [PATCH 2/4] Initial support for ray-v2.0.0 --- maddpg.py | 340 ++++++++++++++++++++++------------------- maddpg_tf_policy.py | 254 ++++++++++++++++-------------- maddpg_torch_model.py | 170 ++++++++++++--------- maddpg_torch_policy.py | 203 ++++++++++++++---------- pettingzoo_maddpg.py | 254 +++++++++++++++++------------- 5 files changed, 688 insertions(+), 533 deletions(-) diff --git a/maddpg.py b/maddpg.py index 02572d4..6d84d09 100644 --- a/maddpg.py +++ b/maddpg.py @@ -12,141 +12,37 @@ import logging from typing import Optional, Type -from ray.rllib.agents.trainer import COMMON_CONFIG, with_common_config -from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer -from maddpg_tf_policy import MADDPGTFPolicy -from maddpg_torch_policy import MADDPGTorchPolicy +from ray.rllib.agents.dqn.dqn import DQNTrainer +from ray.rllib.agents.dqn.simple_q import \ + DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG +from ray.rllib.agents.trainer import COMMON_CONFIG, Trainer from ray.rllib.policy.policy import Policy -from ray.rllib.utils.typing import TrainerConfigDict -from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils import merge_dicts +from ray.rllib.utils.annotations import override from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray.rllib.utils.typing import TrainerConfigDict + +from maddpg_tf_policy import MADDPGTFPolicy +from maddpg_torch_policy import MADDPGTorchPolicy logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# yapf: disable -# __sphinx_doc_begin__ -DEFAULT_CONFIG = with_common_config({ - # === Framework to run the algorithm === - "framework": "tf", - - # === Settings for each individual policy === - # ID of the agent controlled by this policy - "agent_id": None, - # Use a local critic for this policy. - "use_local_critic": False, - - # === Evaluation === - # Evaluation interval - "evaluation_interval": None, - # Number of episodes to run per evaluation period. - "evaluation_num_episodes": 10, - - # === Model === - # Apply a state preprocessor with spec given by the "model" config option - # (like other RL algorithms). This is mostly useful if you have a weird - # observation shape, like an image. Disabled by default. - "use_state_preprocessor": False, - # Postprocess the policy network model output with these hidden layers. If - # use_state_preprocessor is False, then these will be the *only* hidden - # layers in the network. - "actor_hiddens": [64, 64], - # Hidden layers activation of the postprocessing stage of the policy - # network - "actor_hidden_activation": "relu", - # Postprocess the critic network model output with these hidden layers; - # again, if use_state_preprocessor is True, then the state will be - # preprocessed by the model specified with the "model" config option first. - "critic_hiddens": [64, 64], - # Hidden layers activation of the postprocessing state of the critic. - "critic_hidden_activation": "relu", - # N-step Q learning - "n_step": 1, - # Algorithm for good policies. - "good_policy": "maddpg", - # Algorithm for adversary policies. 
- "adv_policy": "maddpg", - - # === Replay buffer === - # Size of the replay buffer. Note that if async_updates is set, then - # each worker will have a replay buffer of this size. - "buffer_size": DEPRECATED_VALUE, - "replay_buffer_config": { - "type": "LocalReplayBuffer", - "capacity": int(1e6), - }, - # Observation compression. Note that compression makes simulation slow in - # MPE. - "compress_observations": False, - # If set, this will fix the ratio of replayed from a buffer and learned on - # timesteps to sampled from an environment and stored in the replay buffer - # timesteps. Otherwise, the replay will proceed at the native ratio - # determined by (train_batch_size / rollout_fragment_length). - "training_intensity": None, - # Force lockstep replay mode for MADDPG. - "multiagent": merge_dicts(COMMON_CONFIG["multiagent"], { - "replay_mode": "lockstep", - }), - - # === Optimization === - # Learning rate for the critic (Q-function) optimizer. - "critic_lr": 1e-2, - # Learning rate for the actor (policy) optimizer. - "actor_lr": 1e-2, - # Update the target network every `target_network_update_freq` steps. - "target_network_update_freq": 0, - # Update the target by \tau * policy + (1-\tau) * target_policy - "tau": 0.01, - # Weights for feature regularization for the actor - "actor_feature_reg": 0.001, - # If not None, clip gradients during optimization at this value - "grad_clip": 100, - # How many steps of the model to sample before learning starts. - "learning_starts": 1024 * 25, - # Update the replay buffer with this many samples at once. Note that this - # setting applies per-worker if num_workers > 1. - "rollout_fragment_length": 100, - # Size of a batched sampled from replay buffer for training. Note that - # if async_updates is set, then each worker returns gradients for a - # batch of this size. - "train_batch_size": 1024, - # Number of env steps to optimize for before returning - "timesteps_per_iteration": 0, - - # torch-specific model configs - "twin_q": False, - # delayed policy update - "policy_delay": 1, - # target policy smoothing - # (this also replaces OU exploration noise with IID Gaussian exploration noise, for now) - "smooth_target_policy": False, - "use_huber": False, - "huber_threshold": 1.0, - "l2_reg": None, - - # === Parallelism === - # Number of workers for collecting samples with. This only makes sense - # to increase if your environment is particularly slow to sample, or if - # you're using the Async or Ape-X optimizers. - "num_workers": 1, - # Prevent iterations from going lower than this time span - "min_iter_time_s": 0, -}) -# __sphinx_doc_end__ -# yapf: enable - - -def before_learn_on_batch(multi_agent_batch, policies, train_batch_size, framework="tf"): +def maddpg_learn_on_batch(multi_agent_batch, workers, config): + policies = dict( + workers.local_worker().foreach_trainable_policy(lambda p, i: (i, p)) + ) samples = {} + train_batch_size = config["train_batch_size"] + framework= config["framework"] # Modify keys. for pid, p in policies.items(): i = p.config["agent_id"] keys = multi_agent_batch.policy_batches[pid].keys() keys = ["_".join([k, str(i)]) for k in keys] - samples.update( - dict(zip(keys, multi_agent_batch.policy_batches[pid].values()))) + samples.update(dict(zip(keys, multi_agent_batch.policy_batches[pid].values()))) # Make ops and feed_dict to get "new_obs" from target action sampler. 
new_obs_n = list() @@ -154,11 +50,15 @@ def before_learn_on_batch(multi_agent_batch, policies, train_batch_size, framewo for k, v in samples.items(): if "new_obs" in k: new_obs_n.append(v) - + if framework == "torch": + def sampler(policy, obs): return policy.compute_actions(obs)[0] - new_act_n = [sampler(policy, obs) for policy, obs in zip(policies.values(), new_obs_n)] + + new_act_n = [ + sampler(policy, obs) for policy, obs in zip(policies.values(), new_obs_n) + ] else: new_obs_ph_n = [p.new_obs_ph for p in policies.values()] for i, p in enumerate(policies.values()): @@ -167,46 +67,164 @@ def sampler(policy, obs): samples.update({"new_actions_%d" % i: new_act}) samples.update( - {"new_actions_%d" % i: new_act - for i, new_act in enumerate(new_act_n)}) - + {"new_actions_%d" % i: new_act for i, new_act in enumerate(new_act_n)} + ) + # Share samples among agents. policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()} return MultiAgentBatch(policy_batches, train_batch_size) +# yapf: disable +# __sphinx_doc_begin__ +DEFAULT_CONFIG = Trainer.merge_trainer_configs( + SIMPLEQ_DEFAULT_CONFIG, + { + # === Framework to run the algorithm === + "framework": "tf", + + # === Settings for each individual policy === + # ID of the agent controlled by this policy + "agent_id": None, + # Use a local critic for this policy. + "use_local_critic": False, + + # === Evaluation === + # Evaluation interval + "evaluation_interval": None, + # Number of episodes to run per evaluation period. + "evaluation_num_episodes": 10, + + # === Model === + # Apply a state preprocessor with spec given by the "model" config option + # (like other RL algorithms). This is mostly useful if you have a weird + # observation shape, like an image. Disabled by default. + "use_state_preprocessor": False, + # Postprocess the policy network model output with these hidden layers. If + # use_state_preprocessor is False, then these will be the *only* hidden + # layers in the network. + "actor_hiddens": [64, 64], + # Hidden layers activation of the postprocessing stage of the policy + # network + "actor_hidden_activation": "relu", + # Postprocess the critic network model output with these hidden layers; + # again, if use_state_preprocessor is True, then the state will be + # preprocessed by the model specified with the "model" config option first. + "critic_hiddens": [64, 64], + # Hidden layers activation of the postprocessing state of the critic. + "critic_hidden_activation": "relu", + # N-step Q learning + "n_step": 1, + # Algorithm for good policies. + "good_policy": "maddpg", + # Algorithm for adversary policies. + "adv_policy": "maddpg", + + # === Replay buffer === + # Size of the replay buffer. Note that if async_updates is set, then + # each worker will have a replay buffer of this size. + "buffer_size": DEPRECATED_VALUE, + "replay_buffer_config": { + "type": "MultiAgentReplayBuffer", + "capacity": 1000000, + }, + # Observation compression. Note that compression makes simulation slow in + # MPE. + "compress_observations": False, + # If set, this will fix the ratio of replayed from a buffer and learned on + # timesteps to sampled from an environment and stored in the replay buffer + # timesteps. Otherwise, the replay will proceed at the native ratio + # determined by (train_batch_size / rollout_fragment_length). + "training_intensity": None, + # Force lockstep replay mode for MADDPG. 
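        # With "lockstep" replay, the multi-agent replay buffer stores and samples
        # whole MultiAgentBatches, so every policy is replayed on the same
        # timesteps and the per-agent columns built in maddpg_learn_on_batch()
        # stay row-aligned for the centralized critic update.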
+ "multiagent": merge_dicts(COMMON_CONFIG["multiagent"], { + "replay_mode": "lockstep", + }), + # Callback to share multi-agent batch for maddpg + "before_learn_on_batch": maddpg_learn_on_batch, + + # === Optimization === + # Learning rate for the critic (Q-function) optimizer. + "critic_lr": 1e-2, + # Learning rate for the actor (policy) optimizer. + "actor_lr": 1e-2, + # Update the target network every `target_network_update_freq` steps. + "target_network_update_freq": 0, + # Update the target by \tau * policy + (1-\tau) * target_policy + "tau": 0.01, + # Weights for feature regularization for the actor + "actor_feature_reg": 0.001, + # If not None, clip gradients during optimization at this value + "grad_clip": 100, + # How many steps of the model to sample before learning starts. + "learning_starts": 1024 * 25, + # Update the replay buffer with this many samples at once. Note that this + # setting applies per-worker if num_workers > 1. + "rollout_fragment_length": 100, + # Size of a batched sampled from replay buffer for training. Note that + # if async_updates is set, then each worker returns gradients for a + # batch of this size. + "train_batch_size": 1024, + # Number of env steps to optimize for before returning + "timesteps_per_iteration": 0, + + # === Exploration === + "exploration_config": { + # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output + # actions (after a possible pure random phase of n timesteps). + "type": "OrnsteinUhlenbeckNoise", + # For how many timesteps should we return completely random actions, + # before we start adding (scaled) noise? + "random_timesteps": 1000, + # The OU-base scaling factor to always apply to action-added noise. + "ou_base_scale": 0.1, + # The OU theta param. + "ou_theta": 0.15, + # The OU sigma param. + "ou_sigma": 0.2, + # The initial noise scaling factor. + "initial_scale": 1.0, + # The final noise scaling factor. + "final_scale": 0.02, + # Timesteps over which to anneal scale (from initial to final values). + "scale_timesteps": 10000, + }, + + # torch-specific model configs + "twin_q": False, + # delayed policy update + "policy_delay": 1, + # target policy smoothing + # (this also replaces OU exploration noise with IID Gaussian exploration noise, for now) + "smooth_target_policy": False, + "use_huber": False, + "huber_threshold": 1.0, + "l2_reg": None, + + # === Parallelism === + # Number of workers for collecting samples with. This only makes sense + # to increase if your environment is particularly slow to sample, or if + # you're using the Async or Ape-X optimizers. + "num_workers": 1, + # Prevent iterations from going lower than this time span + "min_iter_time_s": 0, + }, + _allow_unknown_configs=True, +) +# __sphinx_doc_end__ +# yapf: enable + -def add_maddpg_postprocessing(config): - """Add the before learn on batch hook. - - This hook shares the joint batches across all agents - for MADDPG's centralized critic update - """ - - def f(batch, workers, config): - policies = dict(workers.local_worker() - .foreach_trainable_policy(lambda p, i: (i, p))) - return before_learn_on_batch(batch, policies, - config["train_batch_size"], config["framework"]) - - config["before_learn_on_batch"] = f - return config - -def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: - """Policy class picker function. Class is chosen based on DL-framework. - Args: - config (TrainerConfigDict): The trainer's configuration dict. - Returns: - Optional[Type[Policy]]: The Policy class to use with PGTrainer. 
- If None, use `default_policy` provided in build_trainer(). - """ - if config["framework"] == "torch": - return MADDPGTorchPolicy - else: - return MADDPGTFPolicy - -MADDPGTrainer = GenericOffPolicyTrainer.with_updates( - name="MADDPG", - default_config=DEFAULT_CONFIG, - default_policy=MADDPGTFPolicy, - get_policy_class=get_policy_class, - validate_config=add_maddpg_postprocessing) +class MADDPGTrainer(DQNTrainer): + @classmethod + @override(DQNTrainer) + def get_default_config(cls) -> TrainerConfigDict: + return DEFAULT_CONFIG + + @override(DQNTrainer) + def get_default_policy_class( + self, config: TrainerConfigDict + ) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + return MADDPGTorchPolicy + else: + return MADDPGTFPolicy diff --git a/maddpg_tf_policy.py b/maddpg_tf_policy.py index f64cd82..9edc08a 100644 --- a/maddpg_tf_policy.py +++ b/maddpg_tf_policy.py @@ -1,19 +1,19 @@ +import logging + +import numpy as np import ray +from gym.spaces import Box, Discrete from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip -from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY +from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog +from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils.annotations import override from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils.framework import try_import_tf, try_import_tfp -import logging -from gym.spaces import Box, Discrete -import numpy as np - logger = logging.getLogger(__name__) tf1, tf, tfv = try_import_tf() @@ -24,23 +24,26 @@ class MADDPGPostprocessing: """Implements agentwise termination signal and n-step learning.""" @override(Policy) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): # FIXME: Get done from info is required since agentwise done is not # supported now. sample_batch[SampleBatch.DONES] = self.get_done_from_info( - sample_batch[SampleBatch.INFOS]) + sample_batch[SampleBatch.INFOS] + ) # N-step Q adjustments if self.config["n_step"] > 1: - adjust_nstep(self.config["n_step"], self.config["gamma"], - sample_batch[SampleBatch.CUR_OBS], - sample_batch[SampleBatch.ACTIONS], - sample_batch[SampleBatch.REWARDS], - sample_batch[SampleBatch.NEXT_OBS], - sample_batch[SampleBatch.DONES]) + adjust_nstep( + self.config["n_step"], + self.config["gamma"], + sample_batch[SampleBatch.CUR_OBS], + sample_batch[SampleBatch.ACTIONS], + sample_batch[SampleBatch.REWARDS], + sample_batch[SampleBatch.NEXT_OBS], + sample_batch[SampleBatch.DONES], + ) return sample_batch @@ -54,8 +57,7 @@ def __init__(self, obs_space, act_space, config): # FIXME: Get done from info is required since agentwise done is not # supported now. 
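        # Convention assumed by this workaround: each agent's info dict carries its
        # own terminal flag under the "done" key (e.g. info = {"done": True}); the
        # vectorized lookup below maps that per-step INFOS column onto
        # SampleBatch.DONES in postprocess_trajectory().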
- self.get_done_from_info = np.vectorize( - lambda info: info.get("done", False)) + self.get_done_from_info = np.vectorize(lambda info: info.get("done", False)) agent_id = config["agent_id"] if agent_id is None: @@ -68,21 +70,19 @@ def _make_continuous_space(space): if isinstance(space, Box): return space elif isinstance(space, Discrete): - return Box( - low=np.zeros((space.n, )), high=np.ones((space.n, ))) + return Box(low=np.zeros((space.n,)), high=np.ones((space.n,))) else: raise UnsupportedSpaceException( - "Space {} is not supported.".format(space)) + "Space {} is not supported.".format(space) + ) obs_space_n = [ _make_continuous_space(space) - for _, (_, space, _, - _) in config["multiagent"]["policies"].items() + for _, (_, space, _, _) in config["multiagent"]["policies"].items() ] act_space_n = [ _make_continuous_space(space) - for _, (_, _, space, - _) in config["multiagent"]["policies"].items() + for _, (_, _, space, _) in config["multiagent"]["policies"].items() ] # _____ Placeholders @@ -90,9 +90,9 @@ def _make_continuous_space(space): def _make_ph_n(space_n, name=""): return [ tf1.placeholder( - tf.float32, - shape=(None, ) + space.shape, - name=name + "_%d" % i) for i, space in enumerate(space_n) + tf.float32, shape=(None,) + space.shape, name=name + "_%d" % i + ) + for i, space in enumerate(space_n) ] obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.OBS) @@ -100,14 +100,14 @@ def _make_ph_n(space_n, name=""): new_obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.NEXT_OBS) new_act_ph_n = _make_ph_n(act_space_n, "new_actions") rew_ph = tf1.placeholder( - tf.float32, shape=None, name="rewards_{}".format(agent_id)) + tf.float32, shape=None, name="rewards_{}".format(agent_id) + ) done_ph = tf1.placeholder( - tf.float32, shape=None, name="dones_{}".format(agent_id)) + tf.float32, shape=None, name="dones_{}".format(agent_id) + ) if config["use_local_critic"]: - obs_space_n, act_space_n = [obs_space_n[agent_id]], [ - act_space_n[agent_id] - ] + obs_space_n, act_space_n = [obs_space_n[agent_id]], [act_space_n[agent_id]] obs_ph_n, act_ph_n = [obs_ph_n[agent_id]], [act_ph_n[agent_id]] new_obs_ph_n, new_act_ph_n = [new_obs_ph_n[agent_id]], [ new_act_ph_n[agent_id] @@ -124,7 +124,8 @@ def _make_ph_n(space_n, name=""): config["use_state_preprocessor"], config["critic_hiddens"], getattr(tf.nn, config["critic_hidden_activation"]), - scope="critic") + scope="critic", + ) # Build critic network for t + 1. target_critic, _, _, target_critic_vars = self._build_critic_network( @@ -135,39 +136,44 @@ def _make_ph_n(space_n, name=""): config["use_state_preprocessor"], config["critic_hiddens"], getattr(tf.nn, config["critic_hidden_activation"]), - scope="target_critic") + scope="target_critic", + ) # Build critic loss. td_error = tf.subtract( tf.stop_gradient( - rew_ph + (1.0 - done_ph) * - (config["gamma"]**config["n_step"]) * target_critic[:, 0]), - critic[:, 0]) - critic_loss = tf.reduce_mean(td_error**2) + rew_ph + + (1.0 - done_ph) + * (config["gamma"] ** config["n_step"]) + * target_critic[:, 0] + ), + critic[:, 0], + ) + critic_loss = tf.reduce_mean(td_error ** 2) # _____ Policy Network # Build actor network for t. 
- act_sampler, actor_feature, actor_model, actor_vars = ( - self._build_actor_network( - obs_ph_n[agent_id], - obs_space_n[agent_id], - act_space_n[agent_id], - config["use_state_preprocessor"], - config["actor_hiddens"], - getattr(tf.nn, config["actor_hidden_activation"]), - scope="actor")) + act_sampler, actor_feature, actor_model, actor_vars = self._build_actor_network( + obs_ph_n[agent_id], + obs_space_n[agent_id], + act_space_n[agent_id], + config["use_state_preprocessor"], + config["actor_hiddens"], + getattr(tf.nn, config["actor_hidden_activation"]), + scope="actor", + ) # Build actor network for t + 1. self.new_obs_ph = new_obs_ph_n[agent_id] - self.target_act_sampler, _, _, target_actor_vars = ( - self._build_actor_network( - self.new_obs_ph, - obs_space_n[agent_id], - act_space_n[agent_id], - config["use_state_preprocessor"], - config["actor_hiddens"], - getattr(tf.nn, config["actor_hidden_activation"]), - scope="target_actor")) + self.target_act_sampler, _, _, target_actor_vars = self._build_actor_network( + self.new_obs_ph, + obs_space_n[agent_id], + act_space_n[agent_id], + config["use_state_preprocessor"], + config["actor_hiddens"], + getattr(tf.nn, config["actor_hidden_activation"]), + scope="target_actor", + ) # Build actor loss. act_n = act_ph_n.copy() @@ -180,11 +186,13 @@ def _make_ph_n(space_n, name=""): config["use_state_preprocessor"], config["critic_hiddens"], getattr(tf.nn, config["critic_hidden_activation"]), - scope="critic") + scope="critic", + ) actor_loss = -tf.reduce_mean(critic) if config["actor_feature_reg"] is not None: actor_loss += config["actor_feature_reg"] * tf.reduce_mean( - actor_feature**2) + actor_feature ** 2 + ) # _____ Losses self.losses = {"critic": critic_loss, "actor": actor_loss} @@ -192,12 +200,11 @@ def _make_ph_n(space_n, name=""): # _____ Optimizers self.optimizers = { "critic": tf1.train.AdamOptimizer(config["critic_lr"]), - "actor": tf1.train.AdamOptimizer(config["actor_lr"]) + "actor": tf1.train.AdamOptimizer(config["actor_lr"]), } # _____ Build variable update ops. 
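        # Soft (Polyak) target update assembled below:
        #     target_var <- tau * var + (1 - tau) * target_var
        # with tau exposed as a placeholder_with_default so the update strength can
        # be overridden at call time (e.g. tau = 1.0 for a hard sync).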
- self.tau = tf1.placeholder_with_default( - config["tau"], shape=(), name="tau") + self.tau = tf1.placeholder_with_default(config["tau"], shape=(), name="tau") def _make_target_update_op(vs, target_vs, tau): return [ @@ -206,8 +213,8 @@ def _make_target_update_op(vs, target_vs, tau): ] self.update_target_vars = _make_target_update_op( - critic_vars + actor_vars, target_critic_vars + target_actor_vars, - self.tau) + critic_vars + actor_vars, target_critic_vars + target_actor_vars, self.tau + ) def _make_set_weight_op(variables): vs = list() @@ -215,9 +222,9 @@ def _make_set_weight_op(variables): vs += v phs = [ tf1.placeholder( - tf.float32, - shape=v.get_shape(), - name=v.name.split(":")[0] + "_ph") for v in vs + tf.float32, shape=v.get_shape(), name=v.name.split(":")[0] + "_ph" + ) + for v in vs ] return tf.group(*[v.assign(ph) for v, ph in zip(vs, phs)]), phs @@ -225,7 +232,7 @@ def _make_set_weight_op(variables): "critic": critic_vars, "actor": actor_vars, "target_critic": target_critic_vars, - "target_actor": target_actor_vars + "target_actor": target_actor_vars, } self.update_vars, self.vars_ph = _make_set_weight_op(self.vars) @@ -234,11 +241,11 @@ def _make_set_weight_op(variables): self.sess = tf1.get_default_session() def _make_loss_inputs(placeholders): - return [(ph.name.split("/")[-1].split(":")[0], ph) - for ph in placeholders] + return [(ph.name.split("/")[-1].split(":")[0], ph) for ph in placeholders] - loss_inputs = _make_loss_inputs(obs_ph_n + act_ph_n + new_obs_ph_n + - new_act_ph_n + [rew_ph, done_ph]) + loss_inputs = _make_loss_inputs( + obs_ph_n + act_ph_n + new_obs_ph_n + new_act_ph_n + [rew_ph, done_ph] + ) TFPolicy.__init__( self, @@ -250,7 +257,8 @@ def _make_loss_inputs(placeholders): sampled_action=act_sampler, loss=actor_loss + critic_loss, loss_inputs=loss_inputs, - dist_inputs=actor_feature) + dist_inputs=actor_feature, + ) del self.view_requirements["prev_actions"] del self.view_requirements["prev_rewards"] @@ -267,21 +275,22 @@ def optimizer(self): @override(TFPolicy) def gradients(self, optimizer, loss): self.gvs = { - k: minimize_and_clip(optimizer, self.losses[k], self.vars[k], - self.config["grad_clip"]) + k: minimize_and_clip( + optimizer, self.losses[k], self.vars[k], self.config["grad_clip"] + ) for k, optimizer in self.optimizers.items() } return self.gvs["critic"] + self.gvs["actor"] @override(TFPolicy) def build_apply_op(self, optimizer, grads_and_vars): - critic_apply_op = self.optimizers["critic"].apply_gradients( - self.gvs["critic"]) + critic_apply_op = self.optimizers["critic"].apply_gradients(self.gvs["critic"]) with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]): with tf1.control_dependencies([critic_apply_op]): actor_apply_op = self.optimizers["actor"].apply_gradients( - self.gvs["actor"]) + self.gvs["actor"] + ) return actor_apply_op @@ -303,8 +312,8 @@ def get_weights(self): @override(TFPolicy) def set_weights(self, weights): self.sess.run( - self.update_vars, - feed_dict=dict(zip(self.vars_ph, weights["_state"]))) + self.update_vars, feed_dict=dict(zip(self.vars_ph, weights["_state"])) + ) @override(Policy) def get_state(self): @@ -314,24 +323,33 @@ def get_state(self): def set_state(self, state): TFPolicy.set_state(self, state) - def _build_critic_network(self, - obs_n, - act_n, - obs_space_n, - act_space_n, - use_state_preprocessor, - hiddens, - activation=None, - scope=None): + def _build_critic_network( + self, + obs_n, + act_n, + obs_space_n, + act_space_n, + use_state_preprocessor, + hiddens, + activation=None, + 
scope=None, + ): with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope: if use_state_preprocessor: model_n = [ - ModelCatalog.get_model_v2({ - SampleBatch.OBS: obs, - "is_training": self._get_is_training_placeholder(), - }, obs_space, act_space, 1, self.config["model"]) + ModelCatalog.get_model_v2( + { + SampleBatch.OBS: obs, + "is_training": self._get_is_training_placeholder(), + }, + obs_space, + act_space, + 1, + self.config["model"], + ) for obs, obs_space, act_space in zip( - obs_n, obs_space_n, act_space_n) + obs_n, obs_space_n, act_space_n + ) ] out_n = [model.last_layer for model in model_n] out = tf.concat(out_n + act_n, axis=1) @@ -340,39 +358,45 @@ def _build_critic_network(self, out = tf.concat(obs_n + act_n, axis=1) for hidden in hiddens: - out = tf1.layers.dense( - out, units=hidden, activation=activation) + out = tf1.layers.dense(out, units=hidden, activation=activation) feature = out out = tf1.layers.dense(feature, units=1, activation=None) return out, feature, model_n, tf1.global_variables(scope.name) - def _build_actor_network(self, - obs, - obs_space, - act_space, - use_state_preprocessor, - hiddens, - activation=None, - scope=None): + def _build_actor_network( + self, + obs, + obs_space, + act_space, + use_state_preprocessor, + hiddens, + activation=None, + scope=None, + ): with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope: if use_state_preprocessor: - model = ModelCatalog.get_model_v2({ - SampleBatch.OBS: obs, - "is_training": self._get_is_training_placeholder(), - }, obs_space, act_space, 1, self.config["model"]) + model = ModelCatalog.get_model_v2( + { + SampleBatch.OBS: obs, + "is_training": self._get_is_training_placeholder(), + }, + obs_space, + act_space, + 1, + self.config["model"], + ) out = model.last_layer else: model = None out = obs for hidden in hiddens: - out = tf1.layers.dense( - out, units=hidden, activation=activation) - feature = tf1.layers.dense( - out, units=act_space.shape[0], activation=None) + out = tf1.layers.dense(out, units=hidden, activation=activation) + feature = tf1.layers.dense(out, units=act_space.shape[0], activation=None) sampler = tfp.distributions.RelaxedOneHotCategorical( - temperature=1.0, logits=feature).sample() + temperature=1.0, logits=feature + ).sample() return sampler, feature, model, tf1.global_variables(scope.name) diff --git a/maddpg_torch_model.py b/maddpg_torch_model.py index 6dfe133..9cb5753 100644 --- a/maddpg_torch_model.py +++ b/maddpg_torch_model.py @@ -1,43 +1,45 @@ -from numpy.core.fromnumeric import shape -import ray +from typing import Dict, List, Union + import gym -from gym.spaces import Discrete, Box import numpy as np -from typing import List, Dict, Union +import ray +from gym.spaces import Box, Discrete +from numpy.core.fromnumeric import shape from ray.rllib.agents.ddpg.ddpg_torch_model import DDPGTorchModel - +from ray.rllib.agents.ddpg.noop_model import TorchNoopModel +from ray.rllib.models import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.models.utils import get_activation_fn +from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.policy.policy import 
Policy -from ray.rllib.utils.typing import TrainerConfigDict, TensorType, LocalOptimizer, GradInfoDict -from ray.rllib.agents.ddpg.noop_model import TorchNoopModel -from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import (GradInfoDict, LocalOptimizer, + ModelConfigDict, TensorType, + TrainerConfigDict) torch, nn = try_import_torch() + def _make_continuous_space(space): if isinstance(space, Box): return space elif isinstance(space, Discrete): - return Box( - low=np.zeros((space.n, )), high=np.ones((space.n, ))) + return Box(low=np.zeros((space.n,)), high=np.ones((space.n,))) else: - raise UnsupportedSpaceException( - "Space {} is not supported.".format(space)) + raise UnsupportedSpaceException("Space {} is not supported.".format(space)) -def build_maddpg_models(policy: Policy, - obs_space: Box, action_space: Box, - config: TrainerConfigDict) -> ModelV2: +def build_maddpg_models( + policy: Policy, obs_space: Box, action_space: Box, config: TrainerConfigDict +) -> ModelV2: - config["model"]["multiagent"] = config["multiagent"] # Needed for critic obs_space and act_space + config["model"]["multiagent"] = config[ + "multiagent" + ] # Needed for critic obs_space and act_space if policy.config["use_state_preprocessor"]: default_model = None # catalog decides num_outputs = 256 # arbitrary @@ -60,8 +62,9 @@ def build_maddpg_models(policy: Policy, critic_hidden_activation=config["critic_hidden_activation"], critic_hiddens=config["critic_hiddens"], twin_q=config["twin_q"], - add_layer_norm=(policy.config["exploration_config"].get("type") == - "ParameterNoise"), + add_layer_norm=( + policy.config["exploration_config"].get("type") == "ParameterNoise" + ), ) policy.target_model = ModelCatalog.get_model_v2( @@ -78,15 +81,16 @@ def build_maddpg_models(policy: Policy, critic_hidden_activation=config["critic_hidden_activation"], critic_hiddens=config["critic_hiddens"], twin_q=config["twin_q"], - add_layer_norm=(policy.config["exploration_config"].get("type") == - "ParameterNoise"), + add_layer_norm=( + policy.config["exploration_config"].get("type") == "ParameterNoise" + ), ) return policy.model class MADDPGTorchModel(TorchModelV2, nn.Module): - ''' + """ Extension of TorchModelV2 for MADDPG Note that the critic takes in the joint state and action over all agents Data flow: @@ -96,25 +100,28 @@ class MADDPGTorchModel(TorchModelV2, nn.Module): model_out, actions -> get_twin_q_values() -> Q_twin(s, a) Note that this class by itself is not a valid model unless you implement forward() in a subclass. 
- ''' + """ def __init__( - self, - observation_space: Box, - action_space: Box, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - # Extra MADDPGActionModel args: - actor_hiddens: List[int] = [256, 256], - actor_hidden_activation: str = "relu", - critic_hiddens: List[int] = [256, 256], - critic_hidden_activation: str = "relu", - twin_q: bool = False, - add_layer_norm: bool = False): - + self, + observation_space: Box, + action_space: Box, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + # Extra MADDPGActionModel args: + actor_hiddens: List[int] = [256, 256], + actor_hidden_activation: str = "relu", + critic_hiddens: List[int] = [256, 256], + critic_hidden_activation: str = "relu", + twin_q: bool = False, + add_layer_norm: bool = False, + ): + nn.Module.__init__(self) - TorchModelV2.__init__(self, observation_space, action_space, num_outputs, model_config, name) + TorchModelV2.__init__( + self, observation_space, action_space, num_outputs, model_config, name + ) self.action_dim = np.prod(action_space.shape) @@ -122,8 +129,7 @@ def __init__( self.policy_model = nn.Sequential() ins = np.prod(observation_space.shape) self.obs_ins = ins - activation = get_activation_fn( - actor_hidden_activation, framework="torch") + activation = get_activation_fn(actor_hidden_activation, framework="torch") for i, n in enumerate(actor_hiddens): self.policy_model.add_module( "action_{}".format(i), @@ -131,11 +137,14 @@ def __init__( ins, n, initializer=torch.nn.init.xavier_uniform_, - activation_fn=activation)) + activation_fn=activation, + ), + ) # Add LayerNorm after each Dense. if add_layer_norm: - self.policy_model.add_module("LayerNorm_A_{}".format(i), - nn.LayerNorm(n)) + self.policy_model.add_module( + "LayerNorm_A_{}".format(i), nn.LayerNorm(n) + ) ins = n self.policy_model.add_module( @@ -144,27 +153,26 @@ def __init__( ins, self.action_dim, initializer=torch.nn.init.xavier_uniform_, - activation_fn=None)) + activation_fn=None, + ), + ) # Build MADDPG Critic and Target Critic obs_space_n = [ _make_continuous_space(space) - for _, (_, space, _, - _) in model_config["multiagent"]["policies"].items() + for _, (_, space, _, _) in model_config["multiagent"]["policies"].items() ] act_space_n = [ _make_continuous_space(space) - for _, (_, _, space, - _) in model_config["multiagent"]["policies"].items() + for _, (_, _, space, _) in model_config["multiagent"]["policies"].items() ] self.critic_obs = np.sum([obs_space.shape[0] for obs_space in obs_space_n]) self.critic_act = np.sum([act_space.shape[0] for act_space in act_space_n]) # Build the Q-net(s), including target Q-net(s). def build_q_net(name_): - activation = get_activation_fn( - critic_hidden_activation, framework="torch") + activation = get_activation_fn(critic_hidden_activation, framework="torch") # For continuous actions: Feed obs and actions (concatenated) # through the NN. For discrete actions, only obs. 
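            # For the joint (non-local) critic this input spans all agents: its width
            # is the summed observation dims plus summed action dims computed above
            # (self.critic_obs + self.critic_act). Purely as an illustration, three
            # agents with 18-dim observations and 5-dim one-hot actions would give a
            # 3 * (18 + 5) = 69-dim critic input.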
q_net = nn.Sequential() @@ -176,7 +184,9 @@ def build_q_net(name_): ins, n, initializer=nn.init.xavier_uniform_, - activation_fn=activation)) + activation_fn=activation, + ), + ) ins = n q_net.add_module( @@ -185,25 +195,37 @@ def build_q_net(name_): ins, 1, initializer=torch.nn.init.xavier_uniform_, - activation_fn=None)) + activation_fn=None, + ), + ) return q_net - + self.q_model = build_q_net("q") if twin_q: self.twin_q_model = build_q_net("twin_q") else: self.twin_q_model = None - - self.view_requirements[SampleBatch.ACTIONS] = ViewRequirement(SampleBatch.ACTIONS) + + self.view_requirements[SampleBatch.ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS + ) self.view_requirements["new_actions"] = ViewRequirement("new_actions") self.view_requirements["t"] = ViewRequirement("t") self.view_requirements[SampleBatch.NEXT_OBS] = ViewRequirement( - data_col=SampleBatch.OBS, - shift=1, - space=self.obs_space) + data_col=SampleBatch.OBS, shift=1, space=self.obs_space + ) + + # TODO: This shouldn't be necessary? Looks like the target_model isn't + # being moved to the gpu automatically, need to fix this somewhere + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.policy_model = self.policy_model.to(device) + self.q_model = self.q_model.to(device) + if twin_q: + self.twin_q_model = self.twin_q_model.to(device) - def get_q_values(self, model_out_n: List[TensorType], - act_n: List[TensorType]) -> TensorType: + def get_q_values( + self, model_out_n: List[TensorType], act_n: List[TensorType] + ) -> TensorType: """Return the Q estimates for the most recent forward pass. This implements Q(s, a). Args: @@ -218,8 +240,9 @@ def get_q_values(self, model_out_n: List[TensorType], act_n = torch.cat(act_n, dim=-1) return self.q_model(torch.cat([model_out_n, act_n], -1)) - def get_twin_q_values(self, model_out_n: TensorType, - act_n: TensorType) -> TensorType: + def get_twin_q_values( + self, model_out_n: TensorType, act_n: TensorType + ) -> TensorType: """Same as get_q_values but using the twin Q net. This implements the twin Q(s, a). 
Args: @@ -249,20 +272,23 @@ def get_policy_output(self, model_out: TensorType) -> TensorType: entropy = dist.base_dist._categorical.entropy() return action - def policy_variables(self, as_dict: bool = False - ) -> Union[List[TensorType], Dict[str, TensorType]]: + def policy_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: """Return the list of variables for the policy net.""" if as_dict: return self.policy_model.state_dict() return list(self.policy_model.parameters()) - def q_variables(self, as_dict=False - ) -> Union[List[TensorType], Dict[str, TensorType]]: + def q_variables( + self, as_dict=False + ) -> Union[List[TensorType], Dict[str, TensorType]]: """Return the list of variables for Q / twin Q nets.""" if as_dict: return { **self.q_model.state_dict(), - **(self.twin_q_model.state_dict() if self.twin_q_model else {}) + **(self.twin_q_model.state_dict() if self.twin_q_model else {}), } - return list(self.q_model.parameters()) + \ - (list(self.twin_q_model.parameters()) if self.twin_q_model else []) \ No newline at end of file + return list(self.q_model.parameters()) + ( + list(self.twin_q_model.parameters()) if self.twin_q_model else [] + ) diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index e12e2a2..221adca 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -1,40 +1,49 @@ import logging -from gym.spaces import Box, Discrete -import numpy as np from typing import Dict, Tuple -from maddpg_torch_model import build_maddpg_models, _make_continuous_space - -from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss -from ray.rllib.utils.typing import TrainerConfigDict, TensorType, LocalOptimizer +import numpy as np +from gym.spaces import Box, Discrete +from ray.rllib.agents.ddpg.ddpg_tf_policy import ( + build_ddpg_models, get_distribution_inputs_and_class) +from ray.rllib.agents.ddpg.ddpg_torch_policy import (TargetNetworkMixin, + apply_gradients_fn, + make_ddpg_optimizers) +from ray.rllib.agents.ddpg.noop_model import TorchNoopModel from ray.rllib.evaluation.postprocessing import adjust_nstep +from ray.rllib.models import ModelCatalog +from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.agents.ddpg.ddpg_torch_policy import apply_gradients_fn, \ - make_ddpg_optimizers, TargetNetworkMixin -from ray.rllib.agents.ddpg.ddpg_tf_policy import build_ddpg_models, get_distribution_inputs_and_class -from ray.rllib.models.action_dist import ActionDistribution -from ray.rllib.models.torch.torch_action_dist import TorchDeterministic -from ray.rllib.agents.ddpg.noop_model import TorchNoopModel -from ray.rllib.models import ModelCatalog +from ray.rllib.utils.torch_utils import (apply_grad_clipping, huber_loss, + l2_loss) +from ray.rllib.utils.typing import (LocalOptimizer, TensorType, + TrainerConfigDict) + +from maddpg_torch_model import _make_continuous_space, build_maddpg_models logger = logging.getLogger(__name__) torch, nn = try_import_torch() + def validate_spaces(policy: Policy, obs_space, action_space, config) -> None: if isinstance(obs_space, Discrete) or isinstance(action_space, Discrete): 
- logging.warning("Discrete spaces may not work correctly with \ - pytorch MADDPG; consider using framework=tf instead") + logging.warning( + "Discrete spaces may not work correctly with \ + pytorch MADDPG; consider using framework=tf instead" + ) policy.observation_space = _make_continuous_space(obs_space) policy.action_space = _make_continuous_space(action_space) -def build_maddpg_models_and_action_dist(policy: Policy, obs_space, action_space, - config: TrainerConfigDict) -> Tuple[ModelV2, ActionDistribution]: - + +def build_maddpg_models_and_action_dist( + policy: Policy, obs_space, action_space, config: TrainerConfigDict +) -> Tuple[ModelV2, ActionDistribution]: + if policy.config["use_local_critic"]: model = build_ddpg_models(policy, obs_space, action_space, config) else: @@ -42,14 +51,16 @@ def build_maddpg_models_and_action_dist(policy: Policy, obs_space, action_space, return model, TorchDeterministic -def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: SampleBatch) -> TensorType: +def maddpg_actor_critic_loss( + policy: Policy, model: ModelV2, _, train_batch: SampleBatch +) -> TensorType: if not hasattr(policy, "td_error") or policy.td_error is None: policy.actor_loss = torch.zeros(len(train_batch)) policy.critic_loss = torch.zeros(len(train_batch)) policy.td_error = torch.zeros(len(train_batch)) policy.q_t = torch.zeros(len(train_batch)) return policy.actor_loss, policy.critic_loss - + twin_q = policy.config["twin_q"] gamma = policy.config["gamma"] n_step = policy.config["n_step"] @@ -58,12 +69,12 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam l2_reg = policy.config["l2_reg"] agent_id = policy.config["agent_id"] n_agents = len(policy.config["multiagent"]["policies"]) - + input_dict = { "obs": train_batch["_".join([SampleBatch.CUR_OBS, str(agent_id)])], "is_training": True, - } - + } + input_dict_next = { "obs": train_batch["_".join([SampleBatch.NEXT_OBS, str(agent_id)])], "is_training": True, @@ -82,9 +93,11 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam target_noise_clip = policy.config["target_noise_clip"] clipped_normal_sample = torch.clamp( torch.normal( - mean=torch.zeros(policy_tp1.size()), - std=policy.config["target_noise"]).to(policy_tp1.device), - -target_noise_clip, target_noise_clip) + mean=torch.zeros(policy_tp1.size()), std=policy.config["target_noise"] + ).to(policy_tp1.device), + -target_noise_clip, + target_noise_clip, + ) policy_tp1_smoothed = torch.min( torch.max( @@ -92,35 +105,52 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam torch.tensor( policy.action_space.low, dtype=torch.float32, - device=policy_tp1.device)), + device=policy_tp1.device, + ), + ), torch.tensor( - policy.action_space.high, - dtype=torch.float32, - device=policy_tp1.device)) + policy.action_space.high, dtype=torch.float32, device=policy_tp1.device + ), + ) else: # No smoothing, just use deterministic actions. 
policy_tp1_smoothed = policy_tp1 - - obs_n = [train_batch["_".join([SampleBatch.CUR_OBS, str(id)])] for id in range(n_agents)] - act_n = [train_batch["_".join([SampleBatch.ACTIONS, str(id)])] for id in range(n_agents)] - next_obs_n = [train_batch["_".join([SampleBatch.NEXT_OBS, str(id)])] for id in range(n_agents)] + + obs_n = [ + train_batch["_".join([SampleBatch.CUR_OBS, str(id)])] for id in range(n_agents) + ] + act_n = [ + train_batch["_".join([SampleBatch.ACTIONS, str(id)])] for id in range(n_agents) + ] + next_obs_n = [ + train_batch["_".join([SampleBatch.NEXT_OBS, str(id)])] for id in range(n_agents) + ] next_policy_n = [train_batch["new_actions_{}".format(id)] for id in range(n_agents)] next_policy_n[agent_id] = policy_tp1_smoothed rewards = train_batch["rewards_{}".format(agent_id)] dones = train_batch["dones_{}".format(agent_id)] - + if policy.config["use_state_preprocessor"]: # Create all state preprocessors model_n = [ - ModelCatalog.get_model_v2(obs_space, act_space, 1, - policy.config["model"], default_model=TorchNoopModel) + ModelCatalog.get_model_v2( + obs_space, + act_space, + 1, + policy.config["model"], + default_model=TorchNoopModel, + ) for obs_space, act_space in zip(policy.obs_space_n, policy.act_space_n) - ] + ] # Get states from preprocessors - model_out_n = [model.forward({SampleBatch.OBS: obs, "is_training": True}, [], None)[0] for \ - model, obs in zip(model_n, obs_n)] - model_out_next_n = [model.forward({SampleBatch.OBS: next_obs, "is_training": True}, [], None)[0] for \ - model, next_obs in zip(model_n, next_obs_n)] + model_out_n = [ + model.forward({SampleBatch.OBS: obs, "is_training": True}, [], None)[0] + for model, obs in zip(model_n, obs_n) + ] + model_out_next_n = [ + model.forward({SampleBatch.OBS: next_obs, "is_training": True}, [], None)[0] + for model, next_obs in zip(model_n, next_obs_n) + ] else: model_out_n = obs_n model_out_next_n = next_obs_n @@ -131,7 +161,7 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam # Compute this here so policy_n can be modified without deepcopying act_n if twin_q: twin_q_t = model.get_twin_q_values(model_out_n, act_n) - + # Q-values for current policy (no noise) in given current state policy_n = act_n policy_n[agent_id] = policy_t @@ -145,7 +175,8 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam if twin_q: twin_q_tp1 = policy.target_model.get_twin_q_values( - model_out_next_n, next_policy_n) + model_out_next_n, next_policy_n + ) q_t_selected = torch.squeeze(q_t, axis=len(q_t.shape) - 1) @@ -156,36 +187,36 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam q_tp1_best = torch.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) q_tp1_best_masked = (~dones).float() * q_tp1_best - q_t_selected_target = (rewards + gamma**n_step * q_tp1_best_masked).detach() + q_t_selected_target = (rewards + gamma ** n_step * q_tp1_best_masked).detach() # Compute the error (potentially clipped). 
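    # The n-step TD target computed above is
    #     y = r + gamma**n_step * (1 - done) * Q_target(s', a'_target)
    # and the per-sample error below is either a Huber loss (use_huber) or
    # 0.5 * (Q(s, a) - y)**2, averaged into critic_loss.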
if twin_q: td_error = q_t_selected - q_t_selected_target twin_td_error = twin_q_t_selected - q_t_selected_target if use_huber: - errors = huber_loss(td_error, huber_threshold) \ - + huber_loss(twin_td_error, huber_threshold) + errors = huber_loss(td_error, huber_threshold) + huber_loss( + twin_td_error, huber_threshold + ) else: - errors = 0.5 * \ - (torch.pow(td_error, 2.0) + torch.pow(twin_td_error, 2.0)) + errors = 0.5 * (torch.pow(td_error, 2.0) + torch.pow(twin_td_error, 2.0)) else: td_error = q_t_selected - q_t_selected_target if use_huber: errors = huber_loss(td_error, huber_threshold) else: errors = 0.5 * torch.pow(td_error, 2.0) - + critic_loss = torch.mean(errors) # Add l2-regularization if required. if l2_reg is not None: for name, var in model.policy_variables(as_dict=True).items(): if "bias" not in name: - actor_loss += (l2_reg * l2_loss(var)) + actor_loss += l2_reg * l2_loss(var) for name, var in model.q_variables(as_dict=True).items(): if "bias" not in name: - critic_loss += (l2_reg * l2_loss(var)) - + critic_loss += l2_reg * l2_loss(var) + # Store values for stats function. policy.actor_loss = actor_loss policy.critic_loss = critic_loss @@ -204,30 +235,38 @@ def build_maddpg_stats(policy: Policy, batch: SampleBatch) -> Dict[str, TensorTy "max_q": torch.max(policy.q_t), "min_q": torch.min(policy.q_t), "mean_td_error": torch.mean(policy.td_error), - "td_error": policy.td_error + "td_error": policy.td_error, } return stats -def postprocess_nstep(policy: Policy, batch: SampleBatch, - other_agent_batches=None, episode=None): +def postprocess_nstep( + policy: Policy, batch: SampleBatch, other_agent_batches=None, episode=None +): # N-step Q adjustments if policy.config["n_step"] > 1: - adjust_nstep(policy.config["n_step"], policy.config["gamma"], - batch[SampleBatch.CUR_OBS], - batch[SampleBatch.ACTIONS], - batch[SampleBatch.REWARDS], - batch[SampleBatch.NEXT_OBS], - batch[SampleBatch.DONES]) + adjust_nstep( + policy.config["n_step"], + policy.config["gamma"], + batch[SampleBatch.CUR_OBS], + batch[SampleBatch.ACTIONS], + batch[SampleBatch.REWARDS], + batch[SampleBatch.NEXT_OBS], + batch[SampleBatch.DONES], + ) return batch -def make_maddpg_optimizers(policy: Policy, config: TrainerConfigDict) -> Tuple[LocalOptimizer]: +def make_maddpg_optimizers( + policy: Policy, config: TrainerConfigDict +) -> Tuple[LocalOptimizer]: return make_ddpg_optimizers(policy, config) -def before_init_fn(policy: Policy, obs_space, action_space, config: TrainerConfigDict) -> None: +def before_init_fn( + policy: Policy, obs_space, action_space, config: TrainerConfigDict +) -> None: policy.global_step = 0 # Check agent_id agent_id = config["agent_id"] @@ -241,13 +280,16 @@ class ComputeTDErrorMixin: def __init__(self, loss_fn): def compute_td_error(obs_t, act_t, rew_t, obs_tp1, done_mask): input_dict = self._lazy_tensor_dict( - SampleBatch({ - SampleBatch.CUR_OBS: obs_t, - SampleBatch.ACTIONS: act_t, - SampleBatch.REWARDS: rew_t, - SampleBatch.NEXT_OBS: obs_tp1, - SampleBatch.DONES: done_mask, - })) + SampleBatch( + { + SampleBatch.CUR_OBS: obs_t, + SampleBatch.ACTIONS: act_t, + SampleBatch.REWARDS: rew_t, + SampleBatch.NEXT_OBS: obs_tp1, + SampleBatch.DONES: done_mask, + } + ) + ) # Do forward pass on loss to update td errors attribute loss_fn(self, self.model, None, input_dict) @@ -261,28 +303,31 @@ class SetJointSpacesMixin: def __init__(self, config: TrainerConfigDict): self.obs_space_n = [ _make_continuous_space(space) - for _, (_, space, _, - _) in config["multiagent"]["policies"].items() + for _, (_, 
space, _, _) in config["multiagent"]["policies"].items() ] self.act_space_n = [ _make_continuous_space(space) - for _, (_, _, space, - _) in config["multiagent"]["policies"].items() + for _, (_, _, space, _) in config["multiagent"]["policies"].items() ] -def setup_late_mixins(policy: Policy, obs_space, action_space, config: TrainerConfigDict) -> None: + +def setup_late_mixins( + policy: Policy, obs_space, action_space, config: TrainerConfigDict +) -> None: ComputeTDErrorMixin.__init__(policy, maddpg_actor_critic_loss) TargetNetworkMixin.__init__(policy) SetJointSpacesMixin.__init__(policy, config) + def get_default_config(): import maddpg + return maddpg.DEFAULT_CONFIG MADDPGTorchPolicy = build_policy_class( name="MADDPGTorchPolicy", - framework='torch', + framework="torch", loss_fn=maddpg_actor_critic_loss, get_default_config=get_default_config, stats_fn=build_maddpg_stats, @@ -295,5 +340,5 @@ def get_default_config(): before_loss_init=setup_late_mixins, make_model_and_action_dist=build_maddpg_models_and_action_dist, apply_gradients_fn=apply_gradients_fn, - mixins=[TargetNetworkMixin, ComputeTDErrorMixin, SetJointSpacesMixin] + mixins=[TargetNetworkMixin, ComputeTDErrorMixin, SetJointSpacesMixin], ) diff --git a/pettingzoo_maddpg.py b/pettingzoo_maddpg.py index 33f615f..8f18756 100644 --- a/pettingzoo_maddpg.py +++ b/pettingzoo_maddpg.py @@ -1,14 +1,17 @@ +import argparse +import os +from importlib import import_module + import numpy as np import ray +import supersuit as ss from ray import tune -from ray.tune.registry import register_trainable, register_env from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv -import maddpg -import supersuit as ss -import argparse -from importlib import import_module from ray.tune import CLIReporter -import os +from ray.tune.registry import register_env, register_trainable + +import maddpg + def parse_args(): # Environment @@ -18,58 +21,91 @@ def parse_args(): "--env-type", choices=["mpe", "sisl", "atari", "butterfly", "classic", "magent"], default="mpe", - help="The PettingZoo environment type" + help="The PettingZoo environment type", ) parser.add_argument( "--env-name", type=str, default="simple_spread_v2", - help="The PettingZoo environment to use" + help="The PettingZoo environment to use", ) parser.add_argument( "--framework", choices=["tf", "tf2", "tfe", "torch"], default="tf", - help="The DL framework specifier.") + help="The DL framework specifier.", + ) parser.add_argument( "--log-level", choices=["DEBUG", "INFO", "WARN", "ERROR"], default="ERROR", - help="The log level for tune.run()") - parser.add_argument("--max-episode-len", type=int, default=25, - help="maximum episode length") - parser.add_argument("--num-episodes", type=int, default=60000, - help="number of episodes") - parser.add_argument("--num-adversaries", type=int, default=0, - help="number of adversarial agents") - parser.add_argument("--good-policy", type=str, default="maddpg", - help="policy for good agents") - parser.add_argument("--adv-policy", type=str, default="maddpg", - help="policy of adversaries") + help="The log level for tune.run()", + ) + parser.add_argument( + "--max-episode-len", type=int, default=25, help="maximum episode length" + ) + parser.add_argument( + "--num-episodes", type=int, default=60000, help="number of episodes" + ) + parser.add_argument( + "--num-adversaries", type=int, default=0, help="number of adversarial agents" + ) + parser.add_argument( + "--good-policy", type=str, default="maddpg", help="policy for good agents" + ) + 
parser.add_argument( + "--adv-policy", type=str, default="maddpg", help="policy of adversaries" + ) # Core training parameters - parser.add_argument("--lr", type=float, default=1e-2, - help="learning rate for Adam optimizer") - parser.add_argument("--gamma", type=float, default=0.95, - help="discount factor") - parser.add_argument("--rollout-fragment-length", type=int, default=25, - help="number of data points sampled /update /worker") - parser.add_argument("--train-batch-size", type=int, default=1024, - help="number of data points /update") - parser.add_argument("--n-step", type=int, default=1, - help="length of multistep value backup") - parser.add_argument("--num-units", type=int, default=64, - help="number of units in the mlp") - parser.add_argument("--replay-buffer", type=int, default=1000000, - help="size of replay buffer in training") + parser.add_argument( + "--lr", type=float, default=1e-2, help="learning rate for Adam optimizer" + ) + parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") + parser.add_argument( + "--rollout-fragment-length", + type=int, + default=25, + help="number of data points sampled /update /worker", + ) + parser.add_argument( + "--train-batch-size", + type=int, + default=1024, + help="number of data points /update", + ) + parser.add_argument( + "--n-step", type=int, default=1, help="length of multistep value backup" + ) + parser.add_argument( + "--num-units", type=int, default=64, help="number of units in the mlp" + ) + parser.add_argument( + "--replay-buffer", + type=int, + default=1000000, + help="size of replay buffer in training", + ) # Checkpoint - parser.add_argument("--checkpoint-freq", type=int, default=10000, - help="save model once every time this many iterations are completed") - parser.add_argument("--local-dir", type=str, default="~/ray_results", - help="path to save checkpoints") - parser.add_argument("--restore", type=str, default=None, - help="directory in which training state and model are loaded") + parser.add_argument( + "--checkpoint-freq", + type=int, + default=10000, + help="save model once every time this many iterations are completed", + ) + parser.add_argument( + "--local-dir", + type=str, + default="~/ray_results", + help="path to save checkpoints", + ) + parser.add_argument( + "--restore", + type=str, + default=None, + help="directory in which training state and model are loaded", + ) # Parallelism parser.add_argument("--num-workers", type=int, default=1) @@ -77,14 +113,24 @@ def parse_args(): parser.add_argument("--num-gpus", type=int, default=0) # Evaluation - parser.add_argument("--eval-freq", type=int, default=0, - help="evaluate model every time this many iterations are completed") - parser.add_argument("--eval-num-episodes", type=int, default=5, - help="Number of episodes to run for evaluation") - parser.add_argument("--render", type=bool, default=False, - help="render environment for evaluation") - parser.add_argument("--record", type=str, default=None, - help="path to store evaluation videos") + parser.add_argument( + "--eval-freq", + type=int, + default=0, + help="evaluate model every time this many iterations are completed", + ) + parser.add_argument( + "--eval-num-episodes", + type=int, + default=5, + help="Number of episodes to run for evaluation", + ) + parser.add_argument( + "--render", type=bool, default=False, help="render environment for evaluation" + ) + parser.add_argument( + "--record", type=str, default=None, help="path to store evaluation videos" + ) return parser.parse_args() @@ 
-112,8 +158,10 @@ def env_creator(config): def gen_policy(i): use_local_critic = [ - args.adv_policy == "ddpg" if i < args.num_adversaries else - args.good_policy == "ddpg" for i in range(len(env.agents)) + args.adv_policy == "ddpg" + if i < args.num_adversaries + else args.good_policy == "ddpg" + for i in range(len(env.agents)) ] return ( None, @@ -122,64 +170,58 @@ def gen_policy(i): { "agent_id": i, "use_local_critic": use_local_critic[i], - } + }, ) - policies = {"policy_%d" %i: gen_policy(i) for i in range(len(env.agents))} + policies = {"policy_%d" % i: gen_policy(i) for i in range(len(env.agents))} policy_ids = list(policies.keys()) - config={ - # === Setup === - "framework": args.framework, - "log_level": args.log_level, - "env": env_name, - "num_workers": args.num_workers, - "num_gpus": args.num_gpus, - "num_gpus_per_worker": 0, - "num_envs_per_worker": args.num_envs_per_worker, - "horizon": args.max_episode_len, - - # === Policy Config === - # --- Model --- - "good_policy": args.good_policy, - "adv_policy": args.adv_policy, - "actor_hiddens": [args.num_units] * 2, - "actor_hidden_activation": "relu", - "critic_hiddens": [args.num_units] * 2, - "critic_hidden_activation": "relu", - "n_step": args.n_step, - "gamma": args.gamma, - - # --- Exploration --- - "tau": 0.01, - - # --- Replay buffer --- - "buffer_size": args.replay_buffer, - - # --- Optimization --- - "actor_lr": args.lr, - "critic_lr": args.lr, - "learning_starts": args.train_batch_size * args.max_episode_len, - "rollout_fragment_length": args.rollout_fragment_length, - "train_batch_size": args.train_batch_size, - "batch_mode": "truncate_episodes", - - # === Multi-agent setting === - "multiagent": { - "policies": policies, - "policy_mapping_fn": lambda name: policy_ids[agents.index(name)], - # Workaround because MADDPG requires agent_id: int but actual ids are strings like 'speaker_0' - }, - - # === Evaluation and rendering === - "evaluation_interval": args.eval_freq, - "evaluation_num_episodes": args.eval_num_episodes, - "evaluation_config": { - "record_env": args.record, - "render_env": args.render, - }, - } - + config = { + # === Setup === + "framework": args.framework, + "log_level": args.log_level, + "env": env_name, + "num_workers": args.num_workers, + "num_gpus": args.num_gpus, + "num_gpus_per_worker": 0, + "num_envs_per_worker": args.num_envs_per_worker, + "horizon": args.max_episode_len, + # === Policy Config === + # --- Model --- + "good_policy": args.good_policy, + "adv_policy": args.adv_policy, + "actor_hiddens": [args.num_units] * 2, + "actor_hidden_activation": "relu", + "critic_hiddens": [args.num_units] * 2, + "critic_hidden_activation": "relu", + "n_step": args.n_step, + "gamma": args.gamma, + # --- Exploration --- + "tau": 0.01, + # --- Replay buffer --- + "buffer_size": args.replay_buffer, + # --- Optimization --- + "actor_lr": args.lr, + "critic_lr": args.lr, + "learning_starts": args.train_batch_size * args.max_episode_len, + "rollout_fragment_length": args.rollout_fragment_length, + "train_batch_size": args.train_batch_size, + "batch_mode": "truncate_episodes", + # === Multi-agent setting === + "multiagent": { + "policies": policies, + "policy_mapping_fn": lambda name: policy_ids[agents.index(name)], + # Workaround because MADDPG requires agent_id: int but actual ids are strings like 'speaker_0' + }, + # === Evaluation and rendering === + "evaluation_interval": args.eval_freq, + "evaluation_num_episodes": args.eval_num_episodes, + "evaluation_config": { + "record_env": args.record, + "render_env": 
args.render, + }, + } + tune.run( MADDPGAgent, name="Torch_MADDPG", @@ -191,10 +233,10 @@ def gen_policy(i): checkpoint_freq=args.checkpoint_freq, local_dir=os.path.join(args.local_dir, env_name), restore=args.restore, - verbose = 1 + verbose=1, ) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) From f78705fd677d1d8b6def9cde42d84a7503c3c6f2 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 24 Dec 2021 13:07:51 +0530 Subject: [PATCH 3/4] Minor fixes --- maddpg.py | 25 ++++++++++++------------- maddpg_torch_model.py | 10 +++++++--- maddpg_torch_policy.py | 18 ++++++++++-------- pettingzoo_maddpg.py | 2 +- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/maddpg.py b/maddpg.py index 6d84d09..704d4e1 100644 --- a/maddpg.py +++ b/maddpg.py @@ -13,8 +13,7 @@ from typing import Optional, Type from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.dqn.simple_q import \ - DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG +from ray.rllib.agents.dqn.simple_q import DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG from ray.rllib.agents.trainer import COMMON_CONFIG, Trainer from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch @@ -29,13 +28,14 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + def maddpg_learn_on_batch(multi_agent_batch, workers, config): policies = dict( workers.local_worker().foreach_trainable_policy(lambda p, i: (i, p)) ) samples = {} train_batch_size = config["train_batch_size"] - framework= config["framework"] + framework = config["framework"] # Modify keys. for pid, p in policies.items(): @@ -74,6 +74,7 @@ def sampler(policy, obs): policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()} return MultiAgentBatch(policy_batches, train_batch_size) + # yapf: disable # __sphinx_doc_begin__ DEFAULT_CONFIG = Trainer.merge_trainer_configs( @@ -165,22 +166,16 @@ def sampler(policy, obs): # batch of this size. "train_batch_size": 1024, # Number of env steps to optimize for before returning - "timesteps_per_iteration": 0, + "timesteps_per_iteration": 1000, # === Exploration === "exploration_config": { - # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output - # actions (after a possible pure random phase of n timesteps). - "type": "OrnsteinUhlenbeckNoise", + "type": "GaussianNoise", # For how many timesteps should we return completely random actions, # before we start adding (scaled) noise? "random_timesteps": 1000, - # The OU-base scaling factor to always apply to action-added noise. - "ou_base_scale": 0.1, - # The OU theta param. - "ou_theta": 0.15, - # The OU sigma param. - "ou_sigma": 0.2, + # The stddev (sigma) to be used for the actions + "stddev": 0.5, # The initial noise scaling factor. "initial_scale": 1.0, # The final noise scaling factor. @@ -188,6 +183,10 @@ def sampler(policy, obs): # Timesteps over which to anneal scale (from initial to final values). "scale_timesteps": 10000, }, + # Extra configuration that disables exploration. 
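+    # With "explore": False below, evaluation rollouts use the deterministic
+    # policy output directly, without the GaussianNoise defined above.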
+ "evaluation_config": { + "explore": False + }, # torch-specific model configs "twin_q": False, diff --git a/maddpg_torch_model.py b/maddpg_torch_model.py index 9cb5753..67c1688 100644 --- a/maddpg_torch_model.py +++ b/maddpg_torch_model.py @@ -17,9 +17,13 @@ from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import (GradInfoDict, LocalOptimizer, - ModelConfigDict, TensorType, - TrainerConfigDict) +from ray.rllib.utils.typing import ( + GradInfoDict, + LocalOptimizer, + ModelConfigDict, + TensorType, + TrainerConfigDict, +) torch, nn = try_import_torch() diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index 221adca..c801c31 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -4,10 +4,14 @@ import numpy as np from gym.spaces import Box, Discrete from ray.rllib.agents.ddpg.ddpg_tf_policy import ( - build_ddpg_models, get_distribution_inputs_and_class) -from ray.rllib.agents.ddpg.ddpg_torch_policy import (TargetNetworkMixin, - apply_gradients_fn, - make_ddpg_optimizers) + build_ddpg_models, + get_distribution_inputs_and_class, +) +from ray.rllib.agents.ddpg.ddpg_torch_policy import ( + TargetNetworkMixin, + apply_gradients_fn, + make_ddpg_optimizers, +) from ray.rllib.agents.ddpg.noop_model import TorchNoopModel from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog @@ -18,10 +22,8 @@ from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import (apply_grad_clipping, huber_loss, - l2_loss) -from ray.rllib.utils.typing import (LocalOptimizer, TensorType, - TrainerConfigDict) +from ray.rllib.utils.torch_utils import apply_grad_clipping, huber_loss, l2_loss +from ray.rllib.utils.typing import LocalOptimizer, TensorType, TrainerConfigDict from maddpg_torch_model import _make_continuous_space, build_maddpg_models diff --git a/pettingzoo_maddpg.py b/pettingzoo_maddpg.py index 8f18756..2b2f825 100644 --- a/pettingzoo_maddpg.py +++ b/pettingzoo_maddpg.py @@ -224,7 +224,7 @@ def gen_policy(i): tune.run( MADDPGAgent, - name="Torch_MADDPG", + name=f"MADDPG/{args.framework}/{args.env_name}", config=config, progress_reporter=CLIReporter(), stop={ From 43dce5069c64a44f858a13ce28c6078dfb977ed0 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 13 May 2022 12:32:39 -0700 Subject: [PATCH 4/4] import fixes --- maddpg_tf_policy.py | 5 ++--- maddpg_torch_policy.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/maddpg_tf_policy.py b/maddpg_tf_policy.py index 9edc08a..5cc9f3e 100644 --- a/maddpg_tf_policy.py +++ b/maddpg_tf_policy.py @@ -3,9 +3,8 @@ import numpy as np import ray from gym.spaces import Box, Discrete -from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip +from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip, _adjust_nstep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch @@ -35,7 +34,7 @@ def postprocess_trajectory( # N-step Q adjustments if self.config["n_step"] > 1: - adjust_nstep( + _adjust_nstep( self.config["n_step"], self.config["gamma"], 
sample_batch[SampleBatch.CUR_OBS], diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index 4946152..3b7e80f 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -13,16 +13,16 @@ make_ddpg_optimizers, ) from ray.rllib.agents.ddpg.noop_model import TorchNoopModel -from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class +from ray.rllib.agents.dqn.dqn_tf_policy import _adjust_nstep from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import apply_grad_clipping, huber_loss, l2_loss +from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss from ray.rllib.utils.typing import LocalOptimizer, TensorType, TrainerConfigDict from maddpg_torch_model import _make_continuous_space, build_maddpg_models @@ -248,7 +248,7 @@ def postprocess_nstep( ): # N-step Q adjustments if policy.config["n_step"] > 1: - adjust_nstep( + _adjust_nstep( policy.config["n_step"], policy.config["gamma"], batch[SampleBatch.CUR_OBS],