From 1c05c811dde9c4add82aef81883eb5c196001bc9 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 7 Nov 2021 00:50:29 -0400 Subject: [PATCH 1/4] Fixed imports --- maddpg.py | 20 ++++++++++++-------- maddpg_tf_policy.py | 5 +++-- maddpg_torch_policy.py | 15 +++++---------- pettingzoo_maddpg.py | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/maddpg.py b/maddpg.py index a60f0d9..02572d4 100644 --- a/maddpg.py +++ b/maddpg.py @@ -20,6 +20,7 @@ from ray.rllib.utils.typing import TrainerConfigDict from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils import merge_dicts +from ray.rllib.utils.deprecation import DEPRECATED_VALUE logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -66,13 +67,15 @@ "good_policy": "maddpg", # Algorithm for adversary policies. "adv_policy": "maddpg", - # list of other agent_ids and policies to approximate (See MADDPG Section 4.2) - "learn_other_policies": None, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then # each worker will have a replay buffer of this size. - "buffer_size": int(1e6), + "buffer_size": DEPRECATED_VALUE, + "replay_buffer_config": { + "type": "LocalReplayBuffer", + "capacity": int(1e6), + }, # Observation compression. Note that compression makes simulation slow in # MPE. "compress_observations": False, @@ -157,10 +160,11 @@ def sampler(policy, obs): return policy.compute_actions(obs)[0] new_act_n = [sampler(policy, obs) for policy, obs in zip(policies.values(), new_obs_n)] else: - target_act_sampler_n = [p.target_act_sampler for p in policies.values()] new_obs_ph_n = [p.new_obs_ph for p in policies.values()] - feed_dict = dict(zip(new_obs_ph_n, new_obs_n)) - new_act_n = p.sess.run(target_act_sampler_n, feed_dict) + for i, p in enumerate(policies.values()): + feed_dict = {new_obs_ph_n[i]: new_obs_n[i]} + new_act = p.get_session().run(p.target_act_sampler, feed_dict) + samples.update({"new_actions_%d" % i: new_act}) samples.update( {"new_actions_%d" % i: new_act @@ -174,8 +178,8 @@ def sampler(policy, obs): def add_maddpg_postprocessing(config): """Add the before learn on batch hook. - This hook is called explicitly prior to TrainOneStep() in the execution - setups for DQN and APEX. 
+ This hook shares the joint batches across all agents + for MADDPG's centralized critic update """ def f(batch, workers, config): diff --git a/maddpg_tf_policy.py b/maddpg_tf_policy.py index f98bb83..f64cd82 100644 --- a/maddpg_tf_policy.py +++ b/maddpg_tf_policy.py @@ -1,5 +1,6 @@ import ray -from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip, _adjust_nstep +from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip +from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models import ModelCatalog from ray.rllib.policy.sample_batch import SampleBatch @@ -34,7 +35,7 @@ def postprocess_trajectory(self, # N-step Q adjustments if self.config["n_step"] > 1: - _adjust_nstep(self.config["n_step"], self.config["gamma"], + adjust_nstep(self.config["n_step"], self.config["gamma"], sample_batch[SampleBatch.CUR_OBS], sample_batch[SampleBatch.ACTIONS], sample_batch[SampleBatch.REWARDS], diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index df793bc..e12e2a2 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -7,7 +7,7 @@ from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss from ray.rllib.utils.typing import TrainerConfigDict, TensorType, LocalOptimizer -from ray.rllib.agents.dqn.dqn_tf_policy import _adjust_nstep +from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.policy import Policy @@ -27,8 +27,8 @@ def validate_spaces(policy: Policy, obs_space, action_space, config) -> None: if isinstance(obs_space, Discrete) or isinstance(action_space, Discrete): - logging.warning("Discrete spaces may not work with Torch MADDPG; \ - consider using framework=tf instead") + logging.warning("Discrete spaces may not work correctly with \ + pytorch MADDPG; consider using framework=tf instead") policy.observation_space = _make_continuous_space(obs_space) policy.action_space = _make_continuous_space(action_space) @@ -39,11 +39,6 @@ def build_maddpg_models_and_action_dist(policy: Policy, obs_space, action_space, model = build_ddpg_models(policy, obs_space, action_space, config) else: model = build_maddpg_models(policy, obs_space, action_space, config) - device = (torch.device("cuda") - if torch.cuda.is_available() else torch.device("cpu")) - policy.model = policy.model.to(device) - policy.target_model = policy.target_model.to(device) - return model, TorchDeterministic @@ -218,7 +213,7 @@ def postprocess_nstep(policy: Policy, batch: SampleBatch, other_agent_batches=None, episode=None): # N-step Q adjustments if policy.config["n_step"] > 1: - _adjust_nstep(policy.config["n_step"], policy.config["gamma"], + adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS], batch[SampleBatch.REWARDS], @@ -300,5 +295,5 @@ def get_default_config(): before_loss_init=setup_late_mixins, make_model_and_action_dist=build_maddpg_models_and_action_dist, apply_gradients_fn=apply_gradients_fn, - mixins=[TargetNetworkMixin, ComputeTDErrorMixin] + mixins=[TargetNetworkMixin, ComputeTDErrorMixin, SetJointSpacesMixin] ) diff --git a/pettingzoo_maddpg.py b/pettingzoo_maddpg.py index b4c3229..33f615f 100644 --- a/pettingzoo_maddpg.py +++ b/pettingzoo_maddpg.py @@ -167,7 +167,7 @@ def gen_policy(i): # === Multi-agent setting === "multiagent": { "policies": policies, - "policy_mapping_fn": lambda name, _: 
policy_ids[agents.index(name)], + "policy_mapping_fn": lambda name: policy_ids[agents.index(name)], # Workaround because MADDPG requires agent_id: int but actual ids are strings like 'speaker_0' }, From a3c9d928eb85af772676b390a7531c6a3e5eed2e Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 24 Dec 2021 12:48:58 +0530 Subject: [PATCH 2/4] Initial support for ray-v2.0.0 --- maddpg.py | 340 ++++++++++++++++++++++------------------- maddpg_tf_policy.py | 254 ++++++++++++++++-------------- maddpg_torch_model.py | 170 ++++++++++++--------- maddpg_torch_policy.py | 203 ++++++++++++++---------- pettingzoo_maddpg.py | 254 +++++++++++++++++------------- 5 files changed, 688 insertions(+), 533 deletions(-) diff --git a/maddpg.py b/maddpg.py index 02572d4..6d84d09 100644 --- a/maddpg.py +++ b/maddpg.py @@ -12,141 +12,37 @@ import logging from typing import Optional, Type -from ray.rllib.agents.trainer import COMMON_CONFIG, with_common_config -from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer -from maddpg_tf_policy import MADDPGTFPolicy -from maddpg_torch_policy import MADDPGTorchPolicy +from ray.rllib.agents.dqn.dqn import DQNTrainer +from ray.rllib.agents.dqn.simple_q import \ + DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG +from ray.rllib.agents.trainer import COMMON_CONFIG, Trainer from ray.rllib.policy.policy import Policy -from ray.rllib.utils.typing import TrainerConfigDict -from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils import merge_dicts +from ray.rllib.utils.annotations import override from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray.rllib.utils.typing import TrainerConfigDict + +from maddpg_tf_policy import MADDPGTFPolicy +from maddpg_torch_policy import MADDPGTorchPolicy logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# yapf: disable -# __sphinx_doc_begin__ -DEFAULT_CONFIG = with_common_config({ - # === Framework to run the algorithm === - "framework": "tf", - - # === Settings for each individual policy === - # ID of the agent controlled by this policy - "agent_id": None, - # Use a local critic for this policy. - "use_local_critic": False, - - # === Evaluation === - # Evaluation interval - "evaluation_interval": None, - # Number of episodes to run per evaluation period. - "evaluation_num_episodes": 10, - - # === Model === - # Apply a state preprocessor with spec given by the "model" config option - # (like other RL algorithms). This is mostly useful if you have a weird - # observation shape, like an image. Disabled by default. - "use_state_preprocessor": False, - # Postprocess the policy network model output with these hidden layers. If - # use_state_preprocessor is False, then these will be the *only* hidden - # layers in the network. - "actor_hiddens": [64, 64], - # Hidden layers activation of the postprocessing stage of the policy - # network - "actor_hidden_activation": "relu", - # Postprocess the critic network model output with these hidden layers; - # again, if use_state_preprocessor is True, then the state will be - # preprocessed by the model specified with the "model" config option first. - "critic_hiddens": [64, 64], - # Hidden layers activation of the postprocessing state of the critic. - "critic_hidden_activation": "relu", - # N-step Q learning - "n_step": 1, - # Algorithm for good policies. - "good_policy": "maddpg", - # Algorithm for adversary policies. 
- "adv_policy": "maddpg", - - # === Replay buffer === - # Size of the replay buffer. Note that if async_updates is set, then - # each worker will have a replay buffer of this size. - "buffer_size": DEPRECATED_VALUE, - "replay_buffer_config": { - "type": "LocalReplayBuffer", - "capacity": int(1e6), - }, - # Observation compression. Note that compression makes simulation slow in - # MPE. - "compress_observations": False, - # If set, this will fix the ratio of replayed from a buffer and learned on - # timesteps to sampled from an environment and stored in the replay buffer - # timesteps. Otherwise, the replay will proceed at the native ratio - # determined by (train_batch_size / rollout_fragment_length). - "training_intensity": None, - # Force lockstep replay mode for MADDPG. - "multiagent": merge_dicts(COMMON_CONFIG["multiagent"], { - "replay_mode": "lockstep", - }), - - # === Optimization === - # Learning rate for the critic (Q-function) optimizer. - "critic_lr": 1e-2, - # Learning rate for the actor (policy) optimizer. - "actor_lr": 1e-2, - # Update the target network every `target_network_update_freq` steps. - "target_network_update_freq": 0, - # Update the target by \tau * policy + (1-\tau) * target_policy - "tau": 0.01, - # Weights for feature regularization for the actor - "actor_feature_reg": 0.001, - # If not None, clip gradients during optimization at this value - "grad_clip": 100, - # How many steps of the model to sample before learning starts. - "learning_starts": 1024 * 25, - # Update the replay buffer with this many samples at once. Note that this - # setting applies per-worker if num_workers > 1. - "rollout_fragment_length": 100, - # Size of a batched sampled from replay buffer for training. Note that - # if async_updates is set, then each worker returns gradients for a - # batch of this size. - "train_batch_size": 1024, - # Number of env steps to optimize for before returning - "timesteps_per_iteration": 0, - - # torch-specific model configs - "twin_q": False, - # delayed policy update - "policy_delay": 1, - # target policy smoothing - # (this also replaces OU exploration noise with IID Gaussian exploration noise, for now) - "smooth_target_policy": False, - "use_huber": False, - "huber_threshold": 1.0, - "l2_reg": None, - - # === Parallelism === - # Number of workers for collecting samples with. This only makes sense - # to increase if your environment is particularly slow to sample, or if - # you're using the Async or Ape-X optimizers. - "num_workers": 1, - # Prevent iterations from going lower than this time span - "min_iter_time_s": 0, -}) -# __sphinx_doc_end__ -# yapf: enable - - -def before_learn_on_batch(multi_agent_batch, policies, train_batch_size, framework="tf"): +def maddpg_learn_on_batch(multi_agent_batch, workers, config): + policies = dict( + workers.local_worker().foreach_trainable_policy(lambda p, i: (i, p)) + ) samples = {} + train_batch_size = config["train_batch_size"] + framework= config["framework"] # Modify keys. for pid, p in policies.items(): i = p.config["agent_id"] keys = multi_agent_batch.policy_batches[pid].keys() keys = ["_".join([k, str(i)]) for k in keys] - samples.update( - dict(zip(keys, multi_agent_batch.policy_batches[pid].values()))) + samples.update(dict(zip(keys, multi_agent_batch.policy_batches[pid].values()))) # Make ops and feed_dict to get "new_obs" from target action sampler. 
new_obs_n = list() @@ -154,11 +50,15 @@ def before_learn_on_batch(multi_agent_batch, policies, train_batch_size, framewo for k, v in samples.items(): if "new_obs" in k: new_obs_n.append(v) - + if framework == "torch": + def sampler(policy, obs): return policy.compute_actions(obs)[0] - new_act_n = [sampler(policy, obs) for policy, obs in zip(policies.values(), new_obs_n)] + + new_act_n = [ + sampler(policy, obs) for policy, obs in zip(policies.values(), new_obs_n) + ] else: new_obs_ph_n = [p.new_obs_ph for p in policies.values()] for i, p in enumerate(policies.values()): @@ -167,46 +67,164 @@ def sampler(policy, obs): samples.update({"new_actions_%d" % i: new_act}) samples.update( - {"new_actions_%d" % i: new_act - for i, new_act in enumerate(new_act_n)}) - + {"new_actions_%d" % i: new_act for i, new_act in enumerate(new_act_n)} + ) + # Share samples among agents. policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()} return MultiAgentBatch(policy_batches, train_batch_size) +# yapf: disable +# __sphinx_doc_begin__ +DEFAULT_CONFIG = Trainer.merge_trainer_configs( + SIMPLEQ_DEFAULT_CONFIG, + { + # === Framework to run the algorithm === + "framework": "tf", + + # === Settings for each individual policy === + # ID of the agent controlled by this policy + "agent_id": None, + # Use a local critic for this policy. + "use_local_critic": False, + + # === Evaluation === + # Evaluation interval + "evaluation_interval": None, + # Number of episodes to run per evaluation period. + "evaluation_num_episodes": 10, + + # === Model === + # Apply a state preprocessor with spec given by the "model" config option + # (like other RL algorithms). This is mostly useful if you have a weird + # observation shape, like an image. Disabled by default. + "use_state_preprocessor": False, + # Postprocess the policy network model output with these hidden layers. If + # use_state_preprocessor is False, then these will be the *only* hidden + # layers in the network. + "actor_hiddens": [64, 64], + # Hidden layers activation of the postprocessing stage of the policy + # network + "actor_hidden_activation": "relu", + # Postprocess the critic network model output with these hidden layers; + # again, if use_state_preprocessor is True, then the state will be + # preprocessed by the model specified with the "model" config option first. + "critic_hiddens": [64, 64], + # Hidden layers activation of the postprocessing state of the critic. + "critic_hidden_activation": "relu", + # N-step Q learning + "n_step": 1, + # Algorithm for good policies. + "good_policy": "maddpg", + # Algorithm for adversary policies. + "adv_policy": "maddpg", + + # === Replay buffer === + # Size of the replay buffer. Note that if async_updates is set, then + # each worker will have a replay buffer of this size. + "buffer_size": DEPRECATED_VALUE, + "replay_buffer_config": { + "type": "MultiAgentReplayBuffer", + "capacity": 1000000, + }, + # Observation compression. Note that compression makes simulation slow in + # MPE. + "compress_observations": False, + # If set, this will fix the ratio of replayed from a buffer and learned on + # timesteps to sampled from an environment and stored in the replay buffer + # timesteps. Otherwise, the replay will proceed at the native ratio + # determined by (train_batch_size / rollout_fragment_length). + "training_intensity": None, + # Force lockstep replay mode for MADDPG. 
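        # With "lockstep" replay, the multi-agent replay buffer stores and samples
        # whole MultiAgentBatches, so every policy is replayed on the same
        # timesteps and the per-agent columns built in maddpg_learn_on_batch()
        # stay row-aligned for the centralized critic update.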
+ "multiagent": merge_dicts(COMMON_CONFIG["multiagent"], { + "replay_mode": "lockstep", + }), + # Callback to share multi-agent batch for maddpg + "before_learn_on_batch": maddpg_learn_on_batch, + + # === Optimization === + # Learning rate for the critic (Q-function) optimizer. + "critic_lr": 1e-2, + # Learning rate for the actor (policy) optimizer. + "actor_lr": 1e-2, + # Update the target network every `target_network_update_freq` steps. + "target_network_update_freq": 0, + # Update the target by \tau * policy + (1-\tau) * target_policy + "tau": 0.01, + # Weights for feature regularization for the actor + "actor_feature_reg": 0.001, + # If not None, clip gradients during optimization at this value + "grad_clip": 100, + # How many steps of the model to sample before learning starts. + "learning_starts": 1024 * 25, + # Update the replay buffer with this many samples at once. Note that this + # setting applies per-worker if num_workers > 1. + "rollout_fragment_length": 100, + # Size of a batched sampled from replay buffer for training. Note that + # if async_updates is set, then each worker returns gradients for a + # batch of this size. + "train_batch_size": 1024, + # Number of env steps to optimize for before returning + "timesteps_per_iteration": 0, + + # === Exploration === + "exploration_config": { + # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output + # actions (after a possible pure random phase of n timesteps). + "type": "OrnsteinUhlenbeckNoise", + # For how many timesteps should we return completely random actions, + # before we start adding (scaled) noise? + "random_timesteps": 1000, + # The OU-base scaling factor to always apply to action-added noise. + "ou_base_scale": 0.1, + # The OU theta param. + "ou_theta": 0.15, + # The OU sigma param. + "ou_sigma": 0.2, + # The initial noise scaling factor. + "initial_scale": 1.0, + # The final noise scaling factor. + "final_scale": 0.02, + # Timesteps over which to anneal scale (from initial to final values). + "scale_timesteps": 10000, + }, + + # torch-specific model configs + "twin_q": False, + # delayed policy update + "policy_delay": 1, + # target policy smoothing + # (this also replaces OU exploration noise with IID Gaussian exploration noise, for now) + "smooth_target_policy": False, + "use_huber": False, + "huber_threshold": 1.0, + "l2_reg": None, + + # === Parallelism === + # Number of workers for collecting samples with. This only makes sense + # to increase if your environment is particularly slow to sample, or if + # you're using the Async or Ape-X optimizers. + "num_workers": 1, + # Prevent iterations from going lower than this time span + "min_iter_time_s": 0, + }, + _allow_unknown_configs=True, +) +# __sphinx_doc_end__ +# yapf: enable + -def add_maddpg_postprocessing(config): - """Add the before learn on batch hook. - - This hook shares the joint batches across all agents - for MADDPG's centralized critic update - """ - - def f(batch, workers, config): - policies = dict(workers.local_worker() - .foreach_trainable_policy(lambda p, i: (i, p))) - return before_learn_on_batch(batch, policies, - config["train_batch_size"], config["framework"]) - - config["before_learn_on_batch"] = f - return config - -def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: - """Policy class picker function. Class is chosen based on DL-framework. - Args: - config (TrainerConfigDict): The trainer's configuration dict. - Returns: - Optional[Type[Policy]]: The Policy class to use with PGTrainer. 
- If None, use `default_policy` provided in build_trainer(). - """ - if config["framework"] == "torch": - return MADDPGTorchPolicy - else: - return MADDPGTFPolicy - -MADDPGTrainer = GenericOffPolicyTrainer.with_updates( - name="MADDPG", - default_config=DEFAULT_CONFIG, - default_policy=MADDPGTFPolicy, - get_policy_class=get_policy_class, - validate_config=add_maddpg_postprocessing) +class MADDPGTrainer(DQNTrainer): + @classmethod + @override(DQNTrainer) + def get_default_config(cls) -> TrainerConfigDict: + return DEFAULT_CONFIG + + @override(DQNTrainer) + def get_default_policy_class( + self, config: TrainerConfigDict + ) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + return MADDPGTorchPolicy + else: + return MADDPGTFPolicy diff --git a/maddpg_tf_policy.py b/maddpg_tf_policy.py index f64cd82..9edc08a 100644 --- a/maddpg_tf_policy.py +++ b/maddpg_tf_policy.py @@ -1,19 +1,19 @@ +import logging + +import numpy as np import ray +from gym.spaces import Box, Discrete from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip -from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY +from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog +from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils.annotations import override from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils.framework import try_import_tf, try_import_tfp -import logging -from gym.spaces import Box, Discrete -import numpy as np - logger = logging.getLogger(__name__) tf1, tf, tfv = try_import_tf() @@ -24,23 +24,26 @@ class MADDPGPostprocessing: """Implements agentwise termination signal and n-step learning.""" @override(Policy) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): # FIXME: Get done from info is required since agentwise done is not # supported now. sample_batch[SampleBatch.DONES] = self.get_done_from_info( - sample_batch[SampleBatch.INFOS]) + sample_batch[SampleBatch.INFOS] + ) # N-step Q adjustments if self.config["n_step"] > 1: - adjust_nstep(self.config["n_step"], self.config["gamma"], - sample_batch[SampleBatch.CUR_OBS], - sample_batch[SampleBatch.ACTIONS], - sample_batch[SampleBatch.REWARDS], - sample_batch[SampleBatch.NEXT_OBS], - sample_batch[SampleBatch.DONES]) + adjust_nstep( + self.config["n_step"], + self.config["gamma"], + sample_batch[SampleBatch.CUR_OBS], + sample_batch[SampleBatch.ACTIONS], + sample_batch[SampleBatch.REWARDS], + sample_batch[SampleBatch.NEXT_OBS], + sample_batch[SampleBatch.DONES], + ) return sample_batch @@ -54,8 +57,7 @@ def __init__(self, obs_space, act_space, config): # FIXME: Get done from info is required since agentwise done is not # supported now. 
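        # Convention assumed by this workaround: each agent's info dict carries its
        # own terminal flag under the "done" key (e.g. info = {"done": True}); the
        # vectorized lookup below maps that per-step INFOS column onto
        # SampleBatch.DONES in postprocess_trajectory().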
- self.get_done_from_info = np.vectorize( - lambda info: info.get("done", False)) + self.get_done_from_info = np.vectorize(lambda info: info.get("done", False)) agent_id = config["agent_id"] if agent_id is None: @@ -68,21 +70,19 @@ def _make_continuous_space(space): if isinstance(space, Box): return space elif isinstance(space, Discrete): - return Box( - low=np.zeros((space.n, )), high=np.ones((space.n, ))) + return Box(low=np.zeros((space.n,)), high=np.ones((space.n,))) else: raise UnsupportedSpaceException( - "Space {} is not supported.".format(space)) + "Space {} is not supported.".format(space) + ) obs_space_n = [ _make_continuous_space(space) - for _, (_, space, _, - _) in config["multiagent"]["policies"].items() + for _, (_, space, _, _) in config["multiagent"]["policies"].items() ] act_space_n = [ _make_continuous_space(space) - for _, (_, _, space, - _) in config["multiagent"]["policies"].items() + for _, (_, _, space, _) in config["multiagent"]["policies"].items() ] # _____ Placeholders @@ -90,9 +90,9 @@ def _make_continuous_space(space): def _make_ph_n(space_n, name=""): return [ tf1.placeholder( - tf.float32, - shape=(None, ) + space.shape, - name=name + "_%d" % i) for i, space in enumerate(space_n) + tf.float32, shape=(None,) + space.shape, name=name + "_%d" % i + ) + for i, space in enumerate(space_n) ] obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.OBS) @@ -100,14 +100,14 @@ def _make_ph_n(space_n, name=""): new_obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.NEXT_OBS) new_act_ph_n = _make_ph_n(act_space_n, "new_actions") rew_ph = tf1.placeholder( - tf.float32, shape=None, name="rewards_{}".format(agent_id)) + tf.float32, shape=None, name="rewards_{}".format(agent_id) + ) done_ph = tf1.placeholder( - tf.float32, shape=None, name="dones_{}".format(agent_id)) + tf.float32, shape=None, name="dones_{}".format(agent_id) + ) if config["use_local_critic"]: - obs_space_n, act_space_n = [obs_space_n[agent_id]], [ - act_space_n[agent_id] - ] + obs_space_n, act_space_n = [obs_space_n[agent_id]], [act_space_n[agent_id]] obs_ph_n, act_ph_n = [obs_ph_n[agent_id]], [act_ph_n[agent_id]] new_obs_ph_n, new_act_ph_n = [new_obs_ph_n[agent_id]], [ new_act_ph_n[agent_id] @@ -124,7 +124,8 @@ def _make_ph_n(space_n, name=""): config["use_state_preprocessor"], config["critic_hiddens"], getattr(tf.nn, config["critic_hidden_activation"]), - scope="critic") + scope="critic", + ) # Build critic network for t + 1. target_critic, _, _, target_critic_vars = self._build_critic_network( @@ -135,39 +136,44 @@ def _make_ph_n(space_n, name=""): config["use_state_preprocessor"], config["critic_hiddens"], getattr(tf.nn, config["critic_hidden_activation"]), - scope="target_critic") + scope="target_critic", + ) # Build critic loss. td_error = tf.subtract( tf.stop_gradient( - rew_ph + (1.0 - done_ph) * - (config["gamma"]**config["n_step"]) * target_critic[:, 0]), - critic[:, 0]) - critic_loss = tf.reduce_mean(td_error**2) + rew_ph + + (1.0 - done_ph) + * (config["gamma"] ** config["n_step"]) + * target_critic[:, 0] + ), + critic[:, 0], + ) + critic_loss = tf.reduce_mean(td_error ** 2) # _____ Policy Network # Build actor network for t. 
- act_sampler, actor_feature, actor_model, actor_vars = ( - self._build_actor_network( - obs_ph_n[agent_id], - obs_space_n[agent_id], - act_space_n[agent_id], - config["use_state_preprocessor"], - config["actor_hiddens"], - getattr(tf.nn, config["actor_hidden_activation"]), - scope="actor")) + act_sampler, actor_feature, actor_model, actor_vars = self._build_actor_network( + obs_ph_n[agent_id], + obs_space_n[agent_id], + act_space_n[agent_id], + config["use_state_preprocessor"], + config["actor_hiddens"], + getattr(tf.nn, config["actor_hidden_activation"]), + scope="actor", + ) # Build actor network for t + 1. self.new_obs_ph = new_obs_ph_n[agent_id] - self.target_act_sampler, _, _, target_actor_vars = ( - self._build_actor_network( - self.new_obs_ph, - obs_space_n[agent_id], - act_space_n[agent_id], - config["use_state_preprocessor"], - config["actor_hiddens"], - getattr(tf.nn, config["actor_hidden_activation"]), - scope="target_actor")) + self.target_act_sampler, _, _, target_actor_vars = self._build_actor_network( + self.new_obs_ph, + obs_space_n[agent_id], + act_space_n[agent_id], + config["use_state_preprocessor"], + config["actor_hiddens"], + getattr(tf.nn, config["actor_hidden_activation"]), + scope="target_actor", + ) # Build actor loss. act_n = act_ph_n.copy() @@ -180,11 +186,13 @@ def _make_ph_n(space_n, name=""): config["use_state_preprocessor"], config["critic_hiddens"], getattr(tf.nn, config["critic_hidden_activation"]), - scope="critic") + scope="critic", + ) actor_loss = -tf.reduce_mean(critic) if config["actor_feature_reg"] is not None: actor_loss += config["actor_feature_reg"] * tf.reduce_mean( - actor_feature**2) + actor_feature ** 2 + ) # _____ Losses self.losses = {"critic": critic_loss, "actor": actor_loss} @@ -192,12 +200,11 @@ def _make_ph_n(space_n, name=""): # _____ Optimizers self.optimizers = { "critic": tf1.train.AdamOptimizer(config["critic_lr"]), - "actor": tf1.train.AdamOptimizer(config["actor_lr"]) + "actor": tf1.train.AdamOptimizer(config["actor_lr"]), } # _____ Build variable update ops. 
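        # Soft (Polyak) target update assembled below:
        #     target_var <- tau * var + (1 - tau) * target_var
        # with tau exposed as a placeholder_with_default so the update strength can
        # be overridden at call time (e.g. tau = 1.0 for a hard sync).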
- self.tau = tf1.placeholder_with_default( - config["tau"], shape=(), name="tau") + self.tau = tf1.placeholder_with_default(config["tau"], shape=(), name="tau") def _make_target_update_op(vs, target_vs, tau): return [ @@ -206,8 +213,8 @@ def _make_target_update_op(vs, target_vs, tau): ] self.update_target_vars = _make_target_update_op( - critic_vars + actor_vars, target_critic_vars + target_actor_vars, - self.tau) + critic_vars + actor_vars, target_critic_vars + target_actor_vars, self.tau + ) def _make_set_weight_op(variables): vs = list() @@ -215,9 +222,9 @@ def _make_set_weight_op(variables): vs += v phs = [ tf1.placeholder( - tf.float32, - shape=v.get_shape(), - name=v.name.split(":")[0] + "_ph") for v in vs + tf.float32, shape=v.get_shape(), name=v.name.split(":")[0] + "_ph" + ) + for v in vs ] return tf.group(*[v.assign(ph) for v, ph in zip(vs, phs)]), phs @@ -225,7 +232,7 @@ def _make_set_weight_op(variables): "critic": critic_vars, "actor": actor_vars, "target_critic": target_critic_vars, - "target_actor": target_actor_vars + "target_actor": target_actor_vars, } self.update_vars, self.vars_ph = _make_set_weight_op(self.vars) @@ -234,11 +241,11 @@ def _make_set_weight_op(variables): self.sess = tf1.get_default_session() def _make_loss_inputs(placeholders): - return [(ph.name.split("/")[-1].split(":")[0], ph) - for ph in placeholders] + return [(ph.name.split("/")[-1].split(":")[0], ph) for ph in placeholders] - loss_inputs = _make_loss_inputs(obs_ph_n + act_ph_n + new_obs_ph_n + - new_act_ph_n + [rew_ph, done_ph]) + loss_inputs = _make_loss_inputs( + obs_ph_n + act_ph_n + new_obs_ph_n + new_act_ph_n + [rew_ph, done_ph] + ) TFPolicy.__init__( self, @@ -250,7 +257,8 @@ def _make_loss_inputs(placeholders): sampled_action=act_sampler, loss=actor_loss + critic_loss, loss_inputs=loss_inputs, - dist_inputs=actor_feature) + dist_inputs=actor_feature, + ) del self.view_requirements["prev_actions"] del self.view_requirements["prev_rewards"] @@ -267,21 +275,22 @@ def optimizer(self): @override(TFPolicy) def gradients(self, optimizer, loss): self.gvs = { - k: minimize_and_clip(optimizer, self.losses[k], self.vars[k], - self.config["grad_clip"]) + k: minimize_and_clip( + optimizer, self.losses[k], self.vars[k], self.config["grad_clip"] + ) for k, optimizer in self.optimizers.items() } return self.gvs["critic"] + self.gvs["actor"] @override(TFPolicy) def build_apply_op(self, optimizer, grads_and_vars): - critic_apply_op = self.optimizers["critic"].apply_gradients( - self.gvs["critic"]) + critic_apply_op = self.optimizers["critic"].apply_gradients(self.gvs["critic"]) with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]): with tf1.control_dependencies([critic_apply_op]): actor_apply_op = self.optimizers["actor"].apply_gradients( - self.gvs["actor"]) + self.gvs["actor"] + ) return actor_apply_op @@ -303,8 +312,8 @@ def get_weights(self): @override(TFPolicy) def set_weights(self, weights): self.sess.run( - self.update_vars, - feed_dict=dict(zip(self.vars_ph, weights["_state"]))) + self.update_vars, feed_dict=dict(zip(self.vars_ph, weights["_state"])) + ) @override(Policy) def get_state(self): @@ -314,24 +323,33 @@ def get_state(self): def set_state(self, state): TFPolicy.set_state(self, state) - def _build_critic_network(self, - obs_n, - act_n, - obs_space_n, - act_space_n, - use_state_preprocessor, - hiddens, - activation=None, - scope=None): + def _build_critic_network( + self, + obs_n, + act_n, + obs_space_n, + act_space_n, + use_state_preprocessor, + hiddens, + activation=None, + 
scope=None, + ): with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope: if use_state_preprocessor: model_n = [ - ModelCatalog.get_model_v2({ - SampleBatch.OBS: obs, - "is_training": self._get_is_training_placeholder(), - }, obs_space, act_space, 1, self.config["model"]) + ModelCatalog.get_model_v2( + { + SampleBatch.OBS: obs, + "is_training": self._get_is_training_placeholder(), + }, + obs_space, + act_space, + 1, + self.config["model"], + ) for obs, obs_space, act_space in zip( - obs_n, obs_space_n, act_space_n) + obs_n, obs_space_n, act_space_n + ) ] out_n = [model.last_layer for model in model_n] out = tf.concat(out_n + act_n, axis=1) @@ -340,39 +358,45 @@ def _build_critic_network(self, out = tf.concat(obs_n + act_n, axis=1) for hidden in hiddens: - out = tf1.layers.dense( - out, units=hidden, activation=activation) + out = tf1.layers.dense(out, units=hidden, activation=activation) feature = out out = tf1.layers.dense(feature, units=1, activation=None) return out, feature, model_n, tf1.global_variables(scope.name) - def _build_actor_network(self, - obs, - obs_space, - act_space, - use_state_preprocessor, - hiddens, - activation=None, - scope=None): + def _build_actor_network( + self, + obs, + obs_space, + act_space, + use_state_preprocessor, + hiddens, + activation=None, + scope=None, + ): with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope: if use_state_preprocessor: - model = ModelCatalog.get_model_v2({ - SampleBatch.OBS: obs, - "is_training": self._get_is_training_placeholder(), - }, obs_space, act_space, 1, self.config["model"]) + model = ModelCatalog.get_model_v2( + { + SampleBatch.OBS: obs, + "is_training": self._get_is_training_placeholder(), + }, + obs_space, + act_space, + 1, + self.config["model"], + ) out = model.last_layer else: model = None out = obs for hidden in hiddens: - out = tf1.layers.dense( - out, units=hidden, activation=activation) - feature = tf1.layers.dense( - out, units=act_space.shape[0], activation=None) + out = tf1.layers.dense(out, units=hidden, activation=activation) + feature = tf1.layers.dense(out, units=act_space.shape[0], activation=None) sampler = tfp.distributions.RelaxedOneHotCategorical( - temperature=1.0, logits=feature).sample() + temperature=1.0, logits=feature + ).sample() return sampler, feature, model, tf1.global_variables(scope.name) diff --git a/maddpg_torch_model.py b/maddpg_torch_model.py index 6dfe133..9cb5753 100644 --- a/maddpg_torch_model.py +++ b/maddpg_torch_model.py @@ -1,43 +1,45 @@ -from numpy.core.fromnumeric import shape -import ray +from typing import Dict, List, Union + import gym -from gym.spaces import Discrete, Box import numpy as np -from typing import List, Dict, Union +import ray +from gym.spaces import Box, Discrete +from numpy.core.fromnumeric import shape from ray.rllib.agents.ddpg.ddpg_torch_model import DDPGTorchModel - +from ray.rllib.agents.ddpg.noop_model import TorchNoopModel +from ray.rllib.models import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.models.utils import get_activation_fn +from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.policy.policy import 
Policy -from ray.rllib.utils.typing import TrainerConfigDict, TensorType, LocalOptimizer, GradInfoDict -from ray.rllib.agents.ddpg.noop_model import TorchNoopModel -from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import (GradInfoDict, LocalOptimizer, + ModelConfigDict, TensorType, + TrainerConfigDict) torch, nn = try_import_torch() + def _make_continuous_space(space): if isinstance(space, Box): return space elif isinstance(space, Discrete): - return Box( - low=np.zeros((space.n, )), high=np.ones((space.n, ))) + return Box(low=np.zeros((space.n,)), high=np.ones((space.n,))) else: - raise UnsupportedSpaceException( - "Space {} is not supported.".format(space)) + raise UnsupportedSpaceException("Space {} is not supported.".format(space)) -def build_maddpg_models(policy: Policy, - obs_space: Box, action_space: Box, - config: TrainerConfigDict) -> ModelV2: +def build_maddpg_models( + policy: Policy, obs_space: Box, action_space: Box, config: TrainerConfigDict +) -> ModelV2: - config["model"]["multiagent"] = config["multiagent"] # Needed for critic obs_space and act_space + config["model"]["multiagent"] = config[ + "multiagent" + ] # Needed for critic obs_space and act_space if policy.config["use_state_preprocessor"]: default_model = None # catalog decides num_outputs = 256 # arbitrary @@ -60,8 +62,9 @@ def build_maddpg_models(policy: Policy, critic_hidden_activation=config["critic_hidden_activation"], critic_hiddens=config["critic_hiddens"], twin_q=config["twin_q"], - add_layer_norm=(policy.config["exploration_config"].get("type") == - "ParameterNoise"), + add_layer_norm=( + policy.config["exploration_config"].get("type") == "ParameterNoise" + ), ) policy.target_model = ModelCatalog.get_model_v2( @@ -78,15 +81,16 @@ def build_maddpg_models(policy: Policy, critic_hidden_activation=config["critic_hidden_activation"], critic_hiddens=config["critic_hiddens"], twin_q=config["twin_q"], - add_layer_norm=(policy.config["exploration_config"].get("type") == - "ParameterNoise"), + add_layer_norm=( + policy.config["exploration_config"].get("type") == "ParameterNoise" + ), ) return policy.model class MADDPGTorchModel(TorchModelV2, nn.Module): - ''' + """ Extension of TorchModelV2 for MADDPG Note that the critic takes in the joint state and action over all agents Data flow: @@ -96,25 +100,28 @@ class MADDPGTorchModel(TorchModelV2, nn.Module): model_out, actions -> get_twin_q_values() -> Q_twin(s, a) Note that this class by itself is not a valid model unless you implement forward() in a subclass. 
- ''' + """ def __init__( - self, - observation_space: Box, - action_space: Box, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - # Extra MADDPGActionModel args: - actor_hiddens: List[int] = [256, 256], - actor_hidden_activation: str = "relu", - critic_hiddens: List[int] = [256, 256], - critic_hidden_activation: str = "relu", - twin_q: bool = False, - add_layer_norm: bool = False): - + self, + observation_space: Box, + action_space: Box, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + # Extra MADDPGActionModel args: + actor_hiddens: List[int] = [256, 256], + actor_hidden_activation: str = "relu", + critic_hiddens: List[int] = [256, 256], + critic_hidden_activation: str = "relu", + twin_q: bool = False, + add_layer_norm: bool = False, + ): + nn.Module.__init__(self) - TorchModelV2.__init__(self, observation_space, action_space, num_outputs, model_config, name) + TorchModelV2.__init__( + self, observation_space, action_space, num_outputs, model_config, name + ) self.action_dim = np.prod(action_space.shape) @@ -122,8 +129,7 @@ def __init__( self.policy_model = nn.Sequential() ins = np.prod(observation_space.shape) self.obs_ins = ins - activation = get_activation_fn( - actor_hidden_activation, framework="torch") + activation = get_activation_fn(actor_hidden_activation, framework="torch") for i, n in enumerate(actor_hiddens): self.policy_model.add_module( "action_{}".format(i), @@ -131,11 +137,14 @@ def __init__( ins, n, initializer=torch.nn.init.xavier_uniform_, - activation_fn=activation)) + activation_fn=activation, + ), + ) # Add LayerNorm after each Dense. if add_layer_norm: - self.policy_model.add_module("LayerNorm_A_{}".format(i), - nn.LayerNorm(n)) + self.policy_model.add_module( + "LayerNorm_A_{}".format(i), nn.LayerNorm(n) + ) ins = n self.policy_model.add_module( @@ -144,27 +153,26 @@ def __init__( ins, self.action_dim, initializer=torch.nn.init.xavier_uniform_, - activation_fn=None)) + activation_fn=None, + ), + ) # Build MADDPG Critic and Target Critic obs_space_n = [ _make_continuous_space(space) - for _, (_, space, _, - _) in model_config["multiagent"]["policies"].items() + for _, (_, space, _, _) in model_config["multiagent"]["policies"].items() ] act_space_n = [ _make_continuous_space(space) - for _, (_, _, space, - _) in model_config["multiagent"]["policies"].items() + for _, (_, _, space, _) in model_config["multiagent"]["policies"].items() ] self.critic_obs = np.sum([obs_space.shape[0] for obs_space in obs_space_n]) self.critic_act = np.sum([act_space.shape[0] for act_space in act_space_n]) # Build the Q-net(s), including target Q-net(s). def build_q_net(name_): - activation = get_activation_fn( - critic_hidden_activation, framework="torch") + activation = get_activation_fn(critic_hidden_activation, framework="torch") # For continuous actions: Feed obs and actions (concatenated) # through the NN. For discrete actions, only obs. 
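            # For the joint (non-local) critic this input spans all agents: its width
            # is the summed observation dims plus summed action dims computed above
            # (self.critic_obs + self.critic_act). Purely as an illustration, three
            # agents with 18-dim observations and 5-dim one-hot actions would give a
            # 3 * (18 + 5) = 69-dim critic input.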
q_net = nn.Sequential() @@ -176,7 +184,9 @@ def build_q_net(name_): ins, n, initializer=nn.init.xavier_uniform_, - activation_fn=activation)) + activation_fn=activation, + ), + ) ins = n q_net.add_module( @@ -185,25 +195,37 @@ def build_q_net(name_): ins, 1, initializer=torch.nn.init.xavier_uniform_, - activation_fn=None)) + activation_fn=None, + ), + ) return q_net - + self.q_model = build_q_net("q") if twin_q: self.twin_q_model = build_q_net("twin_q") else: self.twin_q_model = None - - self.view_requirements[SampleBatch.ACTIONS] = ViewRequirement(SampleBatch.ACTIONS) + + self.view_requirements[SampleBatch.ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS + ) self.view_requirements["new_actions"] = ViewRequirement("new_actions") self.view_requirements["t"] = ViewRequirement("t") self.view_requirements[SampleBatch.NEXT_OBS] = ViewRequirement( - data_col=SampleBatch.OBS, - shift=1, - space=self.obs_space) + data_col=SampleBatch.OBS, shift=1, space=self.obs_space + ) + + # TODO: This shouldn't be necessary? Looks like the target_model isn't + # being moved to the gpu automatically, need to fix this somewhere + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.policy_model = self.policy_model.to(device) + self.q_model = self.q_model.to(device) + if twin_q: + self.twin_q_model = self.twin_q_model.to(device) - def get_q_values(self, model_out_n: List[TensorType], - act_n: List[TensorType]) -> TensorType: + def get_q_values( + self, model_out_n: List[TensorType], act_n: List[TensorType] + ) -> TensorType: """Return the Q estimates for the most recent forward pass. This implements Q(s, a). Args: @@ -218,8 +240,9 @@ def get_q_values(self, model_out_n: List[TensorType], act_n = torch.cat(act_n, dim=-1) return self.q_model(torch.cat([model_out_n, act_n], -1)) - def get_twin_q_values(self, model_out_n: TensorType, - act_n: TensorType) -> TensorType: + def get_twin_q_values( + self, model_out_n: TensorType, act_n: TensorType + ) -> TensorType: """Same as get_q_values but using the twin Q net. This implements the twin Q(s, a). 
Args: @@ -249,20 +272,23 @@ def get_policy_output(self, model_out: TensorType) -> TensorType: entropy = dist.base_dist._categorical.entropy() return action - def policy_variables(self, as_dict: bool = False - ) -> Union[List[TensorType], Dict[str, TensorType]]: + def policy_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: """Return the list of variables for the policy net.""" if as_dict: return self.policy_model.state_dict() return list(self.policy_model.parameters()) - def q_variables(self, as_dict=False - ) -> Union[List[TensorType], Dict[str, TensorType]]: + def q_variables( + self, as_dict=False + ) -> Union[List[TensorType], Dict[str, TensorType]]: """Return the list of variables for Q / twin Q nets.""" if as_dict: return { **self.q_model.state_dict(), - **(self.twin_q_model.state_dict() if self.twin_q_model else {}) + **(self.twin_q_model.state_dict() if self.twin_q_model else {}), } - return list(self.q_model.parameters()) + \ - (list(self.twin_q_model.parameters()) if self.twin_q_model else []) \ No newline at end of file + return list(self.q_model.parameters()) + ( + list(self.twin_q_model.parameters()) if self.twin_q_model else [] + ) diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index e12e2a2..221adca 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -1,40 +1,49 @@ import logging -from gym.spaces import Box, Discrete -import numpy as np from typing import Dict, Tuple -from maddpg_torch_model import build_maddpg_models, _make_continuous_space - -from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss -from ray.rllib.utils.typing import TrainerConfigDict, TensorType, LocalOptimizer +import numpy as np +from gym.spaces import Box, Discrete +from ray.rllib.agents.ddpg.ddpg_tf_policy import ( + build_ddpg_models, get_distribution_inputs_and_class) +from ray.rllib.agents.ddpg.ddpg_torch_policy import (TargetNetworkMixin, + apply_gradients_fn, + make_ddpg_optimizers) +from ray.rllib.agents.ddpg.noop_model import TorchNoopModel from ray.rllib.evaluation.postprocessing import adjust_nstep +from ray.rllib.models import ModelCatalog +from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.agents.ddpg.ddpg_torch_policy import apply_gradients_fn, \ - make_ddpg_optimizers, TargetNetworkMixin -from ray.rllib.agents.ddpg.ddpg_tf_policy import build_ddpg_models, get_distribution_inputs_and_class -from ray.rllib.models.action_dist import ActionDistribution -from ray.rllib.models.torch.torch_action_dist import TorchDeterministic -from ray.rllib.agents.ddpg.noop_model import TorchNoopModel -from ray.rllib.models import ModelCatalog +from ray.rllib.utils.torch_utils import (apply_grad_clipping, huber_loss, + l2_loss) +from ray.rllib.utils.typing import (LocalOptimizer, TensorType, + TrainerConfigDict) + +from maddpg_torch_model import _make_continuous_space, build_maddpg_models logger = logging.getLogger(__name__) torch, nn = try_import_torch() + def validate_spaces(policy: Policy, obs_space, action_space, config) -> None: if isinstance(obs_space, Discrete) or isinstance(action_space, Discrete): 
- logging.warning("Discrete spaces may not work correctly with \ - pytorch MADDPG; consider using framework=tf instead") + logging.warning( + "Discrete spaces may not work correctly with \ + pytorch MADDPG; consider using framework=tf instead" + ) policy.observation_space = _make_continuous_space(obs_space) policy.action_space = _make_continuous_space(action_space) -def build_maddpg_models_and_action_dist(policy: Policy, obs_space, action_space, - config: TrainerConfigDict) -> Tuple[ModelV2, ActionDistribution]: - + +def build_maddpg_models_and_action_dist( + policy: Policy, obs_space, action_space, config: TrainerConfigDict +) -> Tuple[ModelV2, ActionDistribution]: + if policy.config["use_local_critic"]: model = build_ddpg_models(policy, obs_space, action_space, config) else: @@ -42,14 +51,16 @@ def build_maddpg_models_and_action_dist(policy: Policy, obs_space, action_space, return model, TorchDeterministic -def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: SampleBatch) -> TensorType: +def maddpg_actor_critic_loss( + policy: Policy, model: ModelV2, _, train_batch: SampleBatch +) -> TensorType: if not hasattr(policy, "td_error") or policy.td_error is None: policy.actor_loss = torch.zeros(len(train_batch)) policy.critic_loss = torch.zeros(len(train_batch)) policy.td_error = torch.zeros(len(train_batch)) policy.q_t = torch.zeros(len(train_batch)) return policy.actor_loss, policy.critic_loss - + twin_q = policy.config["twin_q"] gamma = policy.config["gamma"] n_step = policy.config["n_step"] @@ -58,12 +69,12 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam l2_reg = policy.config["l2_reg"] agent_id = policy.config["agent_id"] n_agents = len(policy.config["multiagent"]["policies"]) - + input_dict = { "obs": train_batch["_".join([SampleBatch.CUR_OBS, str(agent_id)])], "is_training": True, - } - + } + input_dict_next = { "obs": train_batch["_".join([SampleBatch.NEXT_OBS, str(agent_id)])], "is_training": True, @@ -82,9 +93,11 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam target_noise_clip = policy.config["target_noise_clip"] clipped_normal_sample = torch.clamp( torch.normal( - mean=torch.zeros(policy_tp1.size()), - std=policy.config["target_noise"]).to(policy_tp1.device), - -target_noise_clip, target_noise_clip) + mean=torch.zeros(policy_tp1.size()), std=policy.config["target_noise"] + ).to(policy_tp1.device), + -target_noise_clip, + target_noise_clip, + ) policy_tp1_smoothed = torch.min( torch.max( @@ -92,35 +105,52 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam torch.tensor( policy.action_space.low, dtype=torch.float32, - device=policy_tp1.device)), + device=policy_tp1.device, + ), + ), torch.tensor( - policy.action_space.high, - dtype=torch.float32, - device=policy_tp1.device)) + policy.action_space.high, dtype=torch.float32, device=policy_tp1.device + ), + ) else: # No smoothing, just use deterministic actions. 
policy_tp1_smoothed = policy_tp1 - - obs_n = [train_batch["_".join([SampleBatch.CUR_OBS, str(id)])] for id in range(n_agents)] - act_n = [train_batch["_".join([SampleBatch.ACTIONS, str(id)])] for id in range(n_agents)] - next_obs_n = [train_batch["_".join([SampleBatch.NEXT_OBS, str(id)])] for id in range(n_agents)] + + obs_n = [ + train_batch["_".join([SampleBatch.CUR_OBS, str(id)])] for id in range(n_agents) + ] + act_n = [ + train_batch["_".join([SampleBatch.ACTIONS, str(id)])] for id in range(n_agents) + ] + next_obs_n = [ + train_batch["_".join([SampleBatch.NEXT_OBS, str(id)])] for id in range(n_agents) + ] next_policy_n = [train_batch["new_actions_{}".format(id)] for id in range(n_agents)] next_policy_n[agent_id] = policy_tp1_smoothed rewards = train_batch["rewards_{}".format(agent_id)] dones = train_batch["dones_{}".format(agent_id)] - + if policy.config["use_state_preprocessor"]: # Create all state preprocessors model_n = [ - ModelCatalog.get_model_v2(obs_space, act_space, 1, - policy.config["model"], default_model=TorchNoopModel) + ModelCatalog.get_model_v2( + obs_space, + act_space, + 1, + policy.config["model"], + default_model=TorchNoopModel, + ) for obs_space, act_space in zip(policy.obs_space_n, policy.act_space_n) - ] + ] # Get states from preprocessors - model_out_n = [model.forward({SampleBatch.OBS: obs, "is_training": True}, [], None)[0] for \ - model, obs in zip(model_n, obs_n)] - model_out_next_n = [model.forward({SampleBatch.OBS: next_obs, "is_training": True}, [], None)[0] for \ - model, next_obs in zip(model_n, next_obs_n)] + model_out_n = [ + model.forward({SampleBatch.OBS: obs, "is_training": True}, [], None)[0] + for model, obs in zip(model_n, obs_n) + ] + model_out_next_n = [ + model.forward({SampleBatch.OBS: next_obs, "is_training": True}, [], None)[0] + for model, next_obs in zip(model_n, next_obs_n) + ] else: model_out_n = obs_n model_out_next_n = next_obs_n @@ -131,7 +161,7 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam # Compute this here so policy_n can be modified without deepcopying act_n if twin_q: twin_q_t = model.get_twin_q_values(model_out_n, act_n) - + # Q-values for current policy (no noise) in given current state policy_n = act_n policy_n[agent_id] = policy_t @@ -145,7 +175,8 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam if twin_q: twin_q_tp1 = policy.target_model.get_twin_q_values( - model_out_next_n, next_policy_n) + model_out_next_n, next_policy_n + ) q_t_selected = torch.squeeze(q_t, axis=len(q_t.shape) - 1) @@ -156,36 +187,36 @@ def maddpg_actor_critic_loss(policy: Policy, model: ModelV2, _, train_batch: Sam q_tp1_best = torch.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) q_tp1_best_masked = (~dones).float() * q_tp1_best - q_t_selected_target = (rewards + gamma**n_step * q_tp1_best_masked).detach() + q_t_selected_target = (rewards + gamma ** n_step * q_tp1_best_masked).detach() # Compute the error (potentially clipped). 
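    # The n-step TD target computed above is
    #     y = r + gamma**n_step * (1 - done) * Q_target(s', a'_target)
    # and the per-sample error below is either a Huber loss (use_huber) or
    # 0.5 * (Q(s, a) - y)**2, averaged into critic_loss.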
if twin_q: td_error = q_t_selected - q_t_selected_target twin_td_error = twin_q_t_selected - q_t_selected_target if use_huber: - errors = huber_loss(td_error, huber_threshold) \ - + huber_loss(twin_td_error, huber_threshold) + errors = huber_loss(td_error, huber_threshold) + huber_loss( + twin_td_error, huber_threshold + ) else: - errors = 0.5 * \ - (torch.pow(td_error, 2.0) + torch.pow(twin_td_error, 2.0)) + errors = 0.5 * (torch.pow(td_error, 2.0) + torch.pow(twin_td_error, 2.0)) else: td_error = q_t_selected - q_t_selected_target if use_huber: errors = huber_loss(td_error, huber_threshold) else: errors = 0.5 * torch.pow(td_error, 2.0) - + critic_loss = torch.mean(errors) # Add l2-regularization if required. if l2_reg is not None: for name, var in model.policy_variables(as_dict=True).items(): if "bias" not in name: - actor_loss += (l2_reg * l2_loss(var)) + actor_loss += l2_reg * l2_loss(var) for name, var in model.q_variables(as_dict=True).items(): if "bias" not in name: - critic_loss += (l2_reg * l2_loss(var)) - + critic_loss += l2_reg * l2_loss(var) + # Store values for stats function. policy.actor_loss = actor_loss policy.critic_loss = critic_loss @@ -204,30 +235,38 @@ def build_maddpg_stats(policy: Policy, batch: SampleBatch) -> Dict[str, TensorTy "max_q": torch.max(policy.q_t), "min_q": torch.min(policy.q_t), "mean_td_error": torch.mean(policy.td_error), - "td_error": policy.td_error + "td_error": policy.td_error, } return stats -def postprocess_nstep(policy: Policy, batch: SampleBatch, - other_agent_batches=None, episode=None): +def postprocess_nstep( + policy: Policy, batch: SampleBatch, other_agent_batches=None, episode=None +): # N-step Q adjustments if policy.config["n_step"] > 1: - adjust_nstep(policy.config["n_step"], policy.config["gamma"], - batch[SampleBatch.CUR_OBS], - batch[SampleBatch.ACTIONS], - batch[SampleBatch.REWARDS], - batch[SampleBatch.NEXT_OBS], - batch[SampleBatch.DONES]) + adjust_nstep( + policy.config["n_step"], + policy.config["gamma"], + batch[SampleBatch.CUR_OBS], + batch[SampleBatch.ACTIONS], + batch[SampleBatch.REWARDS], + batch[SampleBatch.NEXT_OBS], + batch[SampleBatch.DONES], + ) return batch -def make_maddpg_optimizers(policy: Policy, config: TrainerConfigDict) -> Tuple[LocalOptimizer]: +def make_maddpg_optimizers( + policy: Policy, config: TrainerConfigDict +) -> Tuple[LocalOptimizer]: return make_ddpg_optimizers(policy, config) -def before_init_fn(policy: Policy, obs_space, action_space, config: TrainerConfigDict) -> None: +def before_init_fn( + policy: Policy, obs_space, action_space, config: TrainerConfigDict +) -> None: policy.global_step = 0 # Check agent_id agent_id = config["agent_id"] @@ -241,13 +280,16 @@ class ComputeTDErrorMixin: def __init__(self, loss_fn): def compute_td_error(obs_t, act_t, rew_t, obs_tp1, done_mask): input_dict = self._lazy_tensor_dict( - SampleBatch({ - SampleBatch.CUR_OBS: obs_t, - SampleBatch.ACTIONS: act_t, - SampleBatch.REWARDS: rew_t, - SampleBatch.NEXT_OBS: obs_tp1, - SampleBatch.DONES: done_mask, - })) + SampleBatch( + { + SampleBatch.CUR_OBS: obs_t, + SampleBatch.ACTIONS: act_t, + SampleBatch.REWARDS: rew_t, + SampleBatch.NEXT_OBS: obs_tp1, + SampleBatch.DONES: done_mask, + } + ) + ) # Do forward pass on loss to update td errors attribute loss_fn(self, self.model, None, input_dict) @@ -261,28 +303,31 @@ class SetJointSpacesMixin: def __init__(self, config: TrainerConfigDict): self.obs_space_n = [ _make_continuous_space(space) - for _, (_, space, _, - _) in config["multiagent"]["policies"].items() + for _, (_, 
space, _, _) in config["multiagent"]["policies"].items() ] self.act_space_n = [ _make_continuous_space(space) - for _, (_, _, space, - _) in config["multiagent"]["policies"].items() + for _, (_, _, space, _) in config["multiagent"]["policies"].items() ] -def setup_late_mixins(policy: Policy, obs_space, action_space, config: TrainerConfigDict) -> None: + +def setup_late_mixins( + policy: Policy, obs_space, action_space, config: TrainerConfigDict +) -> None: ComputeTDErrorMixin.__init__(policy, maddpg_actor_critic_loss) TargetNetworkMixin.__init__(policy) SetJointSpacesMixin.__init__(policy, config) + def get_default_config(): import maddpg + return maddpg.DEFAULT_CONFIG MADDPGTorchPolicy = build_policy_class( name="MADDPGTorchPolicy", - framework='torch', + framework="torch", loss_fn=maddpg_actor_critic_loss, get_default_config=get_default_config, stats_fn=build_maddpg_stats, @@ -295,5 +340,5 @@ def get_default_config(): before_loss_init=setup_late_mixins, make_model_and_action_dist=build_maddpg_models_and_action_dist, apply_gradients_fn=apply_gradients_fn, - mixins=[TargetNetworkMixin, ComputeTDErrorMixin, SetJointSpacesMixin] + mixins=[TargetNetworkMixin, ComputeTDErrorMixin, SetJointSpacesMixin], ) diff --git a/pettingzoo_maddpg.py b/pettingzoo_maddpg.py index 33f615f..8f18756 100644 --- a/pettingzoo_maddpg.py +++ b/pettingzoo_maddpg.py @@ -1,14 +1,17 @@ +import argparse +import os +from importlib import import_module + import numpy as np import ray +import supersuit as ss from ray import tune -from ray.tune.registry import register_trainable, register_env from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv -import maddpg -import supersuit as ss -import argparse -from importlib import import_module from ray.tune import CLIReporter -import os +from ray.tune.registry import register_env, register_trainable + +import maddpg + def parse_args(): # Environment @@ -18,58 +21,91 @@ def parse_args(): "--env-type", choices=["mpe", "sisl", "atari", "butterfly", "classic", "magent"], default="mpe", - help="The PettingZoo environment type" + help="The PettingZoo environment type", ) parser.add_argument( "--env-name", type=str, default="simple_spread_v2", - help="The PettingZoo environment to use" + help="The PettingZoo environment to use", ) parser.add_argument( "--framework", choices=["tf", "tf2", "tfe", "torch"], default="tf", - help="The DL framework specifier.") + help="The DL framework specifier.", + ) parser.add_argument( "--log-level", choices=["DEBUG", "INFO", "WARN", "ERROR"], default="ERROR", - help="The log level for tune.run()") - parser.add_argument("--max-episode-len", type=int, default=25, - help="maximum episode length") - parser.add_argument("--num-episodes", type=int, default=60000, - help="number of episodes") - parser.add_argument("--num-adversaries", type=int, default=0, - help="number of adversarial agents") - parser.add_argument("--good-policy", type=str, default="maddpg", - help="policy for good agents") - parser.add_argument("--adv-policy", type=str, default="maddpg", - help="policy of adversaries") + help="The log level for tune.run()", + ) + parser.add_argument( + "--max-episode-len", type=int, default=25, help="maximum episode length" + ) + parser.add_argument( + "--num-episodes", type=int, default=60000, help="number of episodes" + ) + parser.add_argument( + "--num-adversaries", type=int, default=0, help="number of adversarial agents" + ) + parser.add_argument( + "--good-policy", type=str, default="maddpg", help="policy for good agents" + ) + 
parser.add_argument( + "--adv-policy", type=str, default="maddpg", help="policy of adversaries" + ) # Core training parameters - parser.add_argument("--lr", type=float, default=1e-2, - help="learning rate for Adam optimizer") - parser.add_argument("--gamma", type=float, default=0.95, - help="discount factor") - parser.add_argument("--rollout-fragment-length", type=int, default=25, - help="number of data points sampled /update /worker") - parser.add_argument("--train-batch-size", type=int, default=1024, - help="number of data points /update") - parser.add_argument("--n-step", type=int, default=1, - help="length of multistep value backup") - parser.add_argument("--num-units", type=int, default=64, - help="number of units in the mlp") - parser.add_argument("--replay-buffer", type=int, default=1000000, - help="size of replay buffer in training") + parser.add_argument( + "--lr", type=float, default=1e-2, help="learning rate for Adam optimizer" + ) + parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") + parser.add_argument( + "--rollout-fragment-length", + type=int, + default=25, + help="number of data points sampled /update /worker", + ) + parser.add_argument( + "--train-batch-size", + type=int, + default=1024, + help="number of data points /update", + ) + parser.add_argument( + "--n-step", type=int, default=1, help="length of multistep value backup" + ) + parser.add_argument( + "--num-units", type=int, default=64, help="number of units in the mlp" + ) + parser.add_argument( + "--replay-buffer", + type=int, + default=1000000, + help="size of replay buffer in training", + ) # Checkpoint - parser.add_argument("--checkpoint-freq", type=int, default=10000, - help="save model once every time this many iterations are completed") - parser.add_argument("--local-dir", type=str, default="~/ray_results", - help="path to save checkpoints") - parser.add_argument("--restore", type=str, default=None, - help="directory in which training state and model are loaded") + parser.add_argument( + "--checkpoint-freq", + type=int, + default=10000, + help="save model once every time this many iterations are completed", + ) + parser.add_argument( + "--local-dir", + type=str, + default="~/ray_results", + help="path to save checkpoints", + ) + parser.add_argument( + "--restore", + type=str, + default=None, + help="directory in which training state and model are loaded", + ) # Parallelism parser.add_argument("--num-workers", type=int, default=1) @@ -77,14 +113,24 @@ def parse_args(): parser.add_argument("--num-gpus", type=int, default=0) # Evaluation - parser.add_argument("--eval-freq", type=int, default=0, - help="evaluate model every time this many iterations are completed") - parser.add_argument("--eval-num-episodes", type=int, default=5, - help="Number of episodes to run for evaluation") - parser.add_argument("--render", type=bool, default=False, - help="render environment for evaluation") - parser.add_argument("--record", type=str, default=None, - help="path to store evaluation videos") + parser.add_argument( + "--eval-freq", + type=int, + default=0, + help="evaluate model every time this many iterations are completed", + ) + parser.add_argument( + "--eval-num-episodes", + type=int, + default=5, + help="Number of episodes to run for evaluation", + ) + parser.add_argument( + "--render", type=bool, default=False, help="render environment for evaluation" + ) + parser.add_argument( + "--record", type=str, default=None, help="path to store evaluation videos" + ) return parser.parse_args() @@ 
-112,8 +158,10 @@ def env_creator(config): def gen_policy(i): use_local_critic = [ - args.adv_policy == "ddpg" if i < args.num_adversaries else - args.good_policy == "ddpg" for i in range(len(env.agents)) + args.adv_policy == "ddpg" + if i < args.num_adversaries + else args.good_policy == "ddpg" + for i in range(len(env.agents)) ] return ( None, @@ -122,64 +170,58 @@ def gen_policy(i): { "agent_id": i, "use_local_critic": use_local_critic[i], - } + }, ) - policies = {"policy_%d" %i: gen_policy(i) for i in range(len(env.agents))} + policies = {"policy_%d" % i: gen_policy(i) for i in range(len(env.agents))} policy_ids = list(policies.keys()) - config={ - # === Setup === - "framework": args.framework, - "log_level": args.log_level, - "env": env_name, - "num_workers": args.num_workers, - "num_gpus": args.num_gpus, - "num_gpus_per_worker": 0, - "num_envs_per_worker": args.num_envs_per_worker, - "horizon": args.max_episode_len, - - # === Policy Config === - # --- Model --- - "good_policy": args.good_policy, - "adv_policy": args.adv_policy, - "actor_hiddens": [args.num_units] * 2, - "actor_hidden_activation": "relu", - "critic_hiddens": [args.num_units] * 2, - "critic_hidden_activation": "relu", - "n_step": args.n_step, - "gamma": args.gamma, - - # --- Exploration --- - "tau": 0.01, - - # --- Replay buffer --- - "buffer_size": args.replay_buffer, - - # --- Optimization --- - "actor_lr": args.lr, - "critic_lr": args.lr, - "learning_starts": args.train_batch_size * args.max_episode_len, - "rollout_fragment_length": args.rollout_fragment_length, - "train_batch_size": args.train_batch_size, - "batch_mode": "truncate_episodes", - - # === Multi-agent setting === - "multiagent": { - "policies": policies, - "policy_mapping_fn": lambda name: policy_ids[agents.index(name)], - # Workaround because MADDPG requires agent_id: int but actual ids are strings like 'speaker_0' - }, - - # === Evaluation and rendering === - "evaluation_interval": args.eval_freq, - "evaluation_num_episodes": args.eval_num_episodes, - "evaluation_config": { - "record_env": args.record, - "render_env": args.render, - }, - } - + config = { + # === Setup === + "framework": args.framework, + "log_level": args.log_level, + "env": env_name, + "num_workers": args.num_workers, + "num_gpus": args.num_gpus, + "num_gpus_per_worker": 0, + "num_envs_per_worker": args.num_envs_per_worker, + "horizon": args.max_episode_len, + # === Policy Config === + # --- Model --- + "good_policy": args.good_policy, + "adv_policy": args.adv_policy, + "actor_hiddens": [args.num_units] * 2, + "actor_hidden_activation": "relu", + "critic_hiddens": [args.num_units] * 2, + "critic_hidden_activation": "relu", + "n_step": args.n_step, + "gamma": args.gamma, + # --- Exploration --- + "tau": 0.01, + # --- Replay buffer --- + "buffer_size": args.replay_buffer, + # --- Optimization --- + "actor_lr": args.lr, + "critic_lr": args.lr, + "learning_starts": args.train_batch_size * args.max_episode_len, + "rollout_fragment_length": args.rollout_fragment_length, + "train_batch_size": args.train_batch_size, + "batch_mode": "truncate_episodes", + # === Multi-agent setting === + "multiagent": { + "policies": policies, + "policy_mapping_fn": lambda name: policy_ids[agents.index(name)], + # Workaround because MADDPG requires agent_id: int but actual ids are strings like 'speaker_0' + }, + # === Evaluation and rendering === + "evaluation_interval": args.eval_freq, + "evaluation_num_episodes": args.eval_num_episodes, + "evaluation_config": { + "record_env": args.record, + "render_env": 
args.render, + }, + } + tune.run( MADDPGAgent, name="Torch_MADDPG", @@ -191,10 +233,10 @@ def gen_policy(i): checkpoint_freq=args.checkpoint_freq, local_dir=os.path.join(args.local_dir, env_name), restore=args.restore, - verbose = 1 + verbose=1, ) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) From f78705fd677d1d8b6def9cde42d84a7503c3c6f2 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 24 Dec 2021 13:07:51 +0530 Subject: [PATCH 3/4] Minor fixes --- maddpg.py | 25 ++++++++++++------------- maddpg_torch_model.py | 10 +++++++--- maddpg_torch_policy.py | 18 ++++++++++-------- pettingzoo_maddpg.py | 2 +- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/maddpg.py b/maddpg.py index 6d84d09..704d4e1 100644 --- a/maddpg.py +++ b/maddpg.py @@ -13,8 +13,7 @@ from typing import Optional, Type from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.dqn.simple_q import \ - DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG +from ray.rllib.agents.dqn.simple_q import DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG from ray.rllib.agents.trainer import COMMON_CONFIG, Trainer from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch @@ -29,13 +28,14 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + def maddpg_learn_on_batch(multi_agent_batch, workers, config): policies = dict( workers.local_worker().foreach_trainable_policy(lambda p, i: (i, p)) ) samples = {} train_batch_size = config["train_batch_size"] - framework= config["framework"] + framework = config["framework"] # Modify keys. for pid, p in policies.items(): @@ -74,6 +74,7 @@ def sampler(policy, obs): policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()} return MultiAgentBatch(policy_batches, train_batch_size) + # yapf: disable # __sphinx_doc_begin__ DEFAULT_CONFIG = Trainer.merge_trainer_configs( @@ -165,22 +166,16 @@ def sampler(policy, obs): # batch of this size. "train_batch_size": 1024, # Number of env steps to optimize for before returning - "timesteps_per_iteration": 0, + "timesteps_per_iteration": 1000, # === Exploration === "exploration_config": { - # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output - # actions (after a possible pure random phase of n timesteps). - "type": "OrnsteinUhlenbeckNoise", + "type": "GaussianNoise", # For how many timesteps should we return completely random actions, # before we start adding (scaled) noise? "random_timesteps": 1000, - # The OU-base scaling factor to always apply to action-added noise. - "ou_base_scale": 0.1, - # The OU theta param. - "ou_theta": 0.15, - # The OU sigma param. - "ou_sigma": 0.2, + # The stddev (sigma) to be used for the actions + "stddev": 0.5, # The initial noise scaling factor. "initial_scale": 1.0, # The final noise scaling factor. @@ -188,6 +183,10 @@ def sampler(policy, obs): # Timesteps over which to anneal scale (from initial to final values). "scale_timesteps": 10000, }, + # Extra configuration that disables exploration. 
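+    # With "explore": False below, evaluation rollouts use the deterministic
+    # policy output directly, without the GaussianNoise defined above.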
+ "evaluation_config": { + "explore": False + }, # torch-specific model configs "twin_q": False, diff --git a/maddpg_torch_model.py b/maddpg_torch_model.py index 9cb5753..67c1688 100644 --- a/maddpg_torch_model.py +++ b/maddpg_torch_model.py @@ -17,9 +17,13 @@ from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import (GradInfoDict, LocalOptimizer, - ModelConfigDict, TensorType, - TrainerConfigDict) +from ray.rllib.utils.typing import ( + GradInfoDict, + LocalOptimizer, + ModelConfigDict, + TensorType, + TrainerConfigDict, +) torch, nn = try_import_torch() diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index 221adca..c801c31 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -4,10 +4,14 @@ import numpy as np from gym.spaces import Box, Discrete from ray.rllib.agents.ddpg.ddpg_tf_policy import ( - build_ddpg_models, get_distribution_inputs_and_class) -from ray.rllib.agents.ddpg.ddpg_torch_policy import (TargetNetworkMixin, - apply_gradients_fn, - make_ddpg_optimizers) + build_ddpg_models, + get_distribution_inputs_and_class, +) +from ray.rllib.agents.ddpg.ddpg_torch_policy import ( + TargetNetworkMixin, + apply_gradients_fn, + make_ddpg_optimizers, +) from ray.rllib.agents.ddpg.noop_model import TorchNoopModel from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog @@ -18,10 +22,8 @@ from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import (apply_grad_clipping, huber_loss, - l2_loss) -from ray.rllib.utils.typing import (LocalOptimizer, TensorType, - TrainerConfigDict) +from ray.rllib.utils.torch_utils import apply_grad_clipping, huber_loss, l2_loss +from ray.rllib.utils.typing import LocalOptimizer, TensorType, TrainerConfigDict from maddpg_torch_model import _make_continuous_space, build_maddpg_models diff --git a/pettingzoo_maddpg.py b/pettingzoo_maddpg.py index 8f18756..2b2f825 100644 --- a/pettingzoo_maddpg.py +++ b/pettingzoo_maddpg.py @@ -224,7 +224,7 @@ def gen_policy(i): tune.run( MADDPGAgent, - name="Torch_MADDPG", + name=f"MADDPG/{args.framework}/{args.env_name}", config=config, progress_reporter=CLIReporter(), stop={ From 43dce5069c64a44f858a13ce28c6078dfb977ed0 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 13 May 2022 12:32:39 -0700 Subject: [PATCH 4/4] import fixes --- maddpg_tf_policy.py | 5 ++--- maddpg_torch_policy.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/maddpg_tf_policy.py b/maddpg_tf_policy.py index 9edc08a..5cc9f3e 100644 --- a/maddpg_tf_policy.py +++ b/maddpg_tf_policy.py @@ -3,9 +3,8 @@ import numpy as np import ray from gym.spaces import Box, Discrete -from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip +from ray.rllib.agents.dqn.dqn_tf_policy import minimize_and_clip, _adjust_nstep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch @@ -35,7 +34,7 @@ def postprocess_trajectory( # N-step Q adjustments if self.config["n_step"] > 1: - adjust_nstep( + _adjust_nstep( self.config["n_step"], self.config["gamma"], 
sample_batch[SampleBatch.CUR_OBS], diff --git a/maddpg_torch_policy.py b/maddpg_torch_policy.py index 4946152..3b7e80f 100644 --- a/maddpg_torch_policy.py +++ b/maddpg_torch_policy.py @@ -13,16 +13,16 @@ make_ddpg_optimizers, ) from ray.rllib.agents.ddpg.noop_model import TorchNoopModel -from ray.rllib.evaluation.postprocessing import adjust_nstep from ray.rllib.models import ModelCatalog from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDeterministic from ray.rllib.policy.policy import Policy from ray.rllib.policy.policy_template import build_policy_class +from ray.rllib.agents.dqn.dqn_tf_policy import _adjust_nstep from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import apply_grad_clipping, huber_loss, l2_loss +from ray.rllib.utils.torch_ops import apply_grad_clipping, huber_loss, l2_loss from ray.rllib.utils.typing import LocalOptimizer, TensorType, TrainerConfigDict from maddpg_torch_model import _make_continuous_space, build_maddpg_models @@ -248,7 +248,7 @@ def postprocess_nstep( ): # N-step Q adjustments if policy.config["n_step"] > 1: - adjust_nstep( + _adjust_nstep( policy.config["n_step"], policy.config["gamma"], batch[SampleBatch.CUR_OBS],