diff --git a/python/ray/tune/registry.py b/python/ray/tune/registry.py index 250e8b8b227d..0baca1899ea2 100644 --- a/python/ray/tune/registry.py +++ b/python/ray/tune/registry.py @@ -6,16 +6,18 @@ from ray.experimental.internal_kv import _internal_kv_initialized, \ _internal_kv_get, _internal_kv_put from ray.tune.error import TuneError +from typing import Callable TRAINABLE_CLASS = "trainable_class" ENV_CREATOR = "env_creator" RLLIB_MODEL = "rllib_model" RLLIB_PREPROCESSOR = "rllib_preprocessor" RLLIB_ACTION_DIST = "rllib_action_dist" +RLLIB_INPUT = "rllib_input" TEST = "__test__" KNOWN_CATEGORIES = [ TRAINABLE_CLASS, ENV_CREATOR, RLLIB_MODEL, RLLIB_PREPROCESSOR, - RLLIB_ACTION_DIST, TEST + RLLIB_ACTION_DIST, RLLIB_INPUT, TEST ] logger = logging.getLogger(__name__) @@ -87,6 +89,27 @@ def register_env(name, env_creator): _global_registry.register(ENV_CREATOR, name, env_creator) +def register_input(name: str, input_creator: Callable): + """Register a custom input api for RLLib. + + Args: + name (str): Name to register. + input_creator (IOContext -> InputReader): Callable that creates an + input reader. + """ + if not callable(input_creator): + raise TypeError("Second argument must be callable.", input_creator) + _global_registry.register(RLLIB_INPUT, name, input_creator) + + +def registry_contains_input(name: str) -> bool: + return _global_registry.contains(RLLIB_INPUT, name) + + +def registry_get_input(name: str) -> Callable: + return _global_registry.get(RLLIB_INPUT, name) + + def check_serializability(key, value): _global_registry.register(TEST, key, value) @@ -168,7 +191,10 @@ def get(self, k): def flush(self): for k, v in self.to_flush.items(): - self.references[k] = ray.put(v) + if isinstance(v, ray.ObjectRef): + self.references[k] = v + else: + self.references[k] = ray.put(v) self.to_flush.clear() diff --git a/rllib/BUILD b/rllib/BUILD index ef7b579a07d8..a4acc1380c4a 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -512,11 +512,11 @@ py_test( # CQLTrainer py_test( - name = "test_cql_sac", + name = "test_cql", tags = ["agents_dir"], size = "medium", - data = glob(["tests/data/moab/*.json"]), - srcs = ["agents/cql/tests/test_cql_sac.py"] + srcs = ["agents/cql/tests/test_cql.py"], + data = ["tests/data/pendulum/small.json"], ) # DDPGTrainer diff --git a/rllib/agents/cql/__init__.py b/rllib/agents/cql/__init__.py index 0945f4e37ddc..154804137c51 100644 --- a/rllib/agents/cql/__init__.py +++ b/rllib/agents/cql/__init__.py @@ -1,20 +1,8 @@ -from ray.rllib.agents.cql.cql_apex_sac import CQLApexSACTrainer, CQLAPEXSAC_DEFAULT_CONFIG -from ray.rllib.agents.cql.cql_dqn import CQLDQNTrainer, CQLDQN_DEFAULT_CONFIG -from ray.rllib.agents.cql.cql_sac import CQLSACTrainer, CQLSAC_DEFAULT_CONFIG -from ray.rllib.agents.cql.cql_sac_torch_policy import CQLSACTorchPolicy -from ray.rllib.agents.cql.cql_sac_tf_policy import CQLSACTFPolicy -from ray.rllib.agents.cql.cql_dqn_tf_policy import CQLDQNTFPolicy -from ray.rllib.agents.cql.cql_sac_tf_model import CQLSACTFModel +from ray.rllib.agents.cql.cql import CQLTrainer, CQL_DEFAULT_CONFIG +from ray.rllib.agents.cql.cql_torch_policy import CQLTorchPolicy __all__ = [ - "CQLAPEXSAC_DEFAULT_CONFIG", - "CQLDQN_DEFAULT_CONFIG", - "CQLSAC_DEFAULT_CONFIG", - "CQLDQNTFPolicy", - "CQLSACTFPolicy", - "CQLSACTFModel", - "CQLSACTorchPolicy", - "CQLApexSACTrainer", - "CQLDQNTrainer", - "CQLSACTrainer", + "CQL_DEFAULT_CONFIG", + "CQLTorchPolicy", + "CQLTrainer", ] diff --git a/rllib/agents/cql/cql.py b/rllib/agents/cql/cql.py new file mode 100644 index 
000000000000..c09bf6fed6b5 --- /dev/null +++ b/rllib/agents/cql/cql.py @@ -0,0 +1,221 @@ +"""CQL (derived from SAC). +""" +import logging +import numpy as np +from typing import Optional, Type + +from ray.rllib.agents.cql.cql_tf_policy import CQLTFPolicy +from ray.rllib.agents.cql.cql_torch_policy import CQLTorchPolicy +from ray.rllib.agents.sac.sac import SACTrainer, \ + DEFAULT_CONFIG as SAC_CONFIG +from ray.rllib.execution.metric_ops import StandardMetricsReporting +from ray.rllib.execution.replay_buffer import LocalReplayBuffer +from ray.rllib.execution.replay_ops import Replay +from ray.rllib.execution.train_ops import TrainTFMultiGPU, TrainOneStep, \ + UpdateTargetNetwork +from ray.rllib.offline import InputReader +from ray.rllib.offline.shuffled_input import ShuffledInput +from ray.rllib.policy.policy import LEARNER_STATS_KEY, Policy +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils import merge_dicts +from ray.rllib.utils.framework import try_import_tf, try_import_tfp +from ray.rllib.utils.typing import TrainerConfigDict + +tf1, tf, tfv = try_import_tf() +tfp = try_import_tfp() +logger = logging.getLogger(__name__) +replay_buffer = None + +# yapf: disable +# __sphinx_doc_begin__ +CQL_DEFAULT_CONFIG = merge_dicts( + SAC_CONFIG, { + # You should override this to point to an offline dataset. + "input": "sampler", + # Custom input config + "input_config": {}, + # Switch off off-policy evaluation. + "input_evaluation": [], + # Number of iterations with Behavior Cloning Pretraining. + "bc_iters": 20000, + # CQL loss temperature. + "temperature": 1.0, + # Number of actions to sample for CQL loss. + "num_actions": 10, + # Whether to use the Lagrangian for Alpha Prime (in CQL loss). + "lagrangian": False, + # Lagrangian threshold. + "lagrangian_thresh": 5.0, + # Min Q weight multiplier. + "min_q_weight": 5.0, + # Replay buffer should be larger or equal the size of the offline + # dataset. + "buffer_size": int(1e6), + }) +# __sphinx_doc_end__ +# yapf: enable + + +def validate_config(config: TrainerConfigDict): + if config["num_gpus"] > 1: + raise ValueError("`num_gpus` > 1 not yet supported for CQL!") + + # CQL-torch performs the optimizer steps inside the loss function. + # Using the multi-GPU optimizer will therefore not work (see multi-GPU + # check above) and we must use the simple optimizer for now. + if config["simple_optimizer"] is not True and \ + config["framework"] == "torch": + config["simple_optimizer"] = True + + if config["framework"] in ["tf", "tf2", "tfe"] and tfp is None: + logger.warning( + "You need `tensorflow_probability` in order to run CQL! " + "Install it via `pip install tensorflow_probability`. Your " + f"tf.__version__={tf.__version__ if tf else None}." 
+ "Trying to import tfp results in the following error:") + try_import_tfp(error=True) + + +def execution_plan(workers, config): + if config.get("prioritized_replay"): + prio_args = { + "prioritized_replay_alpha": config["prioritized_replay_alpha"], + "prioritized_replay_beta": config["prioritized_replay_beta"], + "prioritized_replay_eps": config["prioritized_replay_eps"], + } + else: + prio_args = {} + + local_replay_buffer = LocalReplayBuffer( + num_shards=1, + learning_starts=config["learning_starts"], + buffer_size=config["buffer_size"], + replay_batch_size=config["train_batch_size"], + replay_mode=config["multiagent"]["replay_mode"], + replay_sequence_length=config.get("replay_sequence_length", 1), + replay_burn_in=config.get("burn_in", 0), + replay_zero_init_states=config.get("zero_init_states", True), + **prio_args) + + global replay_buffer + replay_buffer = local_replay_buffer + + def update_prio(item): + samples, info_dict = item + if config.get("prioritized_replay"): + prio_dict = {} + for policy_id, info in info_dict.items(): + # TODO(sven): This is currently structured differently for + # torch/tf. Clean up these results/info dicts across + # policies (note: fixing this in torch_policy.py will + # break e.g. DDPPO!). + td_error = info.get("td_error", + info[LEARNER_STATS_KEY].get("td_error")) + samples.policy_batches[policy_id].set_get_interceptor(None) + prio_dict[policy_id] = (samples.policy_batches[policy_id] + .get("batch_indexes"), td_error) + local_replay_buffer.update_priorities(prio_dict) + return info_dict + + # (2) Read and train on experiences from the replay buffer. Every batch + # returned from the LocalReplay() iterator is passed to TrainOneStep to + # take a SGD step, and then we decide whether to update the target network. + post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b) + + if config["simple_optimizer"]: + train_step_op = TrainOneStep(workers) + else: + train_step_op = TrainTFMultiGPU( + workers=workers, + sgd_minibatch_size=config["train_batch_size"], + num_sgd_iter=1, + num_gpus=config["num_gpus"], + shuffle_sequences=True, + _fake_gpus=config["_fake_gpus"], + framework=config.get("framework")) + + replay_op = Replay(local_buffer=local_replay_buffer) \ + .for_each(lambda x: post_fn(x, workers, config)) \ + .for_each(train_step_op) \ + .for_each(update_prio) \ + .for_each(UpdateTargetNetwork( + workers, config["target_network_update_freq"])) + + return StandardMetricsReporting( + replay_op, workers, config, + by_steps_trained=True + ) + + +def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + return CQLTorchPolicy + + +def after_init(trainer): + # Add the entire dataset to Replay Buffer (global variable) + global replay_buffer + reader = trainer.workers.local_worker().input_reader + + # For d4rl, add the D4RLReaders' dataset to the buffer. + if isinstance(trainer.config["input"], str) and \ + "d4rl" in trainer.config["input"]: + dataset = reader.dataset + replay_buffer.add_batch(dataset) + # For a list of files, add each file's entire content to the buffer. + elif isinstance(reader, ShuffledInput): + num_batches = 0 + total_timesteps = 0 + for batch in reader.child.read_all_files(): + num_batches += 1 + total_timesteps += len(batch) + # Add NEXT_OBS if not available. This is slightly hacked + # as for the very last time step, we will use next-obs=zeros + # and therefore force-set DONE=True to avoid this missing + # next-obs to cause learning problems. 
+ if SampleBatch.NEXT_OBS not in batch: + obs = batch[SampleBatch.OBS] + batch[SampleBatch.NEXT_OBS] = \ + np.concatenate([obs[1:], np.zeros_like(obs[0:1])]) + batch[SampleBatch.DONES][-1] = True + replay_buffer.add_batch(batch) + print(f"Loaded {num_batches} batches ({total_timesteps} ts) into the " + f"replay buffer, which has capacity {replay_buffer.buffer_size}.") + elif isinstance(reader, InputReader): + num_batches = 0 + total_timesteps = 0 + try: + while total_timesteps < replay_buffer.buffer_size: + batch = reader.next() + num_batches += 1 + total_timesteps += len(batch) + # Add NEXT_OBS if not available. This is slightly hacked + # as for the very last time step, we will use next-obs=zeros + # and therefore force-set DONE=True to avoid this missing + # next-obs to cause learning problems. + if SampleBatch.NEXT_OBS not in batch: + obs = batch[SampleBatch.OBS] + batch[SampleBatch.NEXT_OBS] = \ + np.concatenate([obs[1:], np.zeros_like(obs[0:1])]) + batch[SampleBatch.DONES][-1] = True + replay_buffer.add_batch(batch) + except StopIteration: + pass + print(f"Loaded {num_batches} batches ({total_timesteps} ts) into the " + f"replay buffer, which has capacity {replay_buffer.buffer_size}.") + else: + raise ValueError( + "Unknown offline input! config['input'] must either be list of " + "offline files (json) or a D4RL-specific InputReader specifier " + "(e.g. 'd4rl.hopper-medium-v0').") + + +CQLTrainer = SACTrainer.with_updates( + name="CQL", + default_config=CQL_DEFAULT_CONFIG, + validate_config=validate_config, + default_policy=CQLTFPolicy, + get_policy_class=get_policy_class, + after_init=after_init, + execution_plan=execution_plan, +) diff --git a/rllib/agents/cql/cql_apex_sac.py b/rllib/agents/cql/cql_apex_sac.py deleted file mode 100644 index 21831d4d3130..000000000000 --- a/rllib/agents/cql/cql_apex_sac.py +++ /dev/null @@ -1,52 +0,0 @@ -from ray.rllib.agents.dqn.apex import apex_execution_plan -from ray.rllib.agents.cql.cql_sac import CQLSAC_DEFAULT_CONFIG, CQLSACTrainer - -# yapf: disable -# __sphinx_doc_begin__ - -CQLAPEXSAC_DEFAULT_CONFIG = CQLSACTrainer.merge_trainer_configs( - CQLSAC_DEFAULT_CONFIG, # see also the options in sac.py, which are also supported - { - "optimizer": { - "max_weight_sync_delay": 400, - "num_replay_buffer_shards": 4, - "debug": False, - }, - "n_step": 1, - "num_gpus": 0, - "num_workers": 32, - "buffer_size": 200000, - "learning_starts": 5000, - "train_batch_size": 512, - "rollout_fragment_length": 50, - "target_network_update_freq": 0, - "timesteps_per_iteration": 1000, - "exploration_config": {"type": "StochasticSampling"}, - "worker_side_prioritization": True, - "min_iter_time_s": 10, - # We need to implement a version of Prioritized Replay for SAC - # that takes into account the policy entropy term of the loss. - # And for CQL_SAC, we need to also consider the CQL regularizer - "prioritized_replay": False, - # If set, this will fix the ratio of sampled to replayed timesteps. - # Otherwise, replay will proceed as fast as possible. - "training_intensity": None, - # Which mode to use in the ParallelRollouts operator used to collect - # samples. For more details check the operator in rollout_ops module. - "parallel_rollouts_mode": "async", - # This only applies if async mode is used (above config setting). 
- # Controls the max number of async requests in flight per actor - "parallel_rollouts_num_async": 2, - }, -) - - -# __sphinx_doc_end__ -# yapf: enable - - -CQLApexSACTrainer = CQLSACTrainer.with_updates( - name="CQL_APEX_SAC", - default_config=CQLAPEXSAC_DEFAULT_CONFIG, - execution_plan=apex_execution_plan, -) diff --git a/rllib/agents/cql/cql_sac.py b/rllib/agents/cql/cql_sac.py deleted file mode 100644 index ed1d7021e57c..000000000000 --- a/rllib/agents/cql/cql_sac.py +++ /dev/null @@ -1,63 +0,0 @@ -"""CQL (derived from SAC). -""" -from typing import Optional, Type - -from ray.rllib.agents.cql.cql_sac_tf_policy import CQLSACTFPolicy -from ray.rllib.agents.sac.sac import SACTrainer, \ - DEFAULT_CONFIG as SAC_CONFIG -from ray.rllib.agents.cql.cql_sac_torch_policy import CQLSACTorchPolicy -from ray.rllib.utils.typing import TrainerConfigDict -from ray.rllib.policy.policy import Policy -from ray.rllib.utils import merge_dicts - -# yapf: disable -# __sphinx_doc_begin__ -CQLSAC_DEFAULT_CONFIG = merge_dicts( - SAC_CONFIG, { - # You should override this to point to an offline dataset. - "input": "sampler", - # Offline RL does not need IS estimators - "input_evaluation": [], - # Number of iterations with Behavior Cloning Pretraining - "bc_iters": 20000, - # CQL Loss Temperature - "temperature": 1.0, - # Num Actions to sample for CQL Loss - "num_actions": 10, - # Whether to use the Langrangian for Alpha Prime (in CQL Loss) - "lagrangian": False, - # Lagrangian Threshold - "lagrangian_thresh": 5.0, - # Min Q Weight multiplier - "min_q_weight": 5.0, - # Initial value to use for the Alpha Prime (in CQL Loss). - "initial_alpha_prime": 1.0, - # The default value is set as the same of SAC which is good for - # online training. For offline training we could start to optimize - # the models right away. - "learning_starts": 1500, - # Replay Buffer should be size of offline dataset for fastest - # training - "buffer_size": 1000000, - # Upper bound for alpha value during the lagrangian constraint - "alpha_upper_bound": 1.0, - # Lower bound for alpha value during the lagrangian constraint - "alpha_lower_bound": 0.0, - }) -# __sphinx_doc_end__ -# yapf: enable - - -def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - return CQLSACTorchPolicy - else: - return CQLSACTFPolicy - - -CQLSACTrainer = SACTrainer.with_updates( - name="CQL_SAC", - default_config=CQLSAC_DEFAULT_CONFIG, - default_policy=CQLSACTFPolicy, - get_policy_class=get_policy_class, -) diff --git a/rllib/agents/cql/cql_sac_tf_model.py b/rllib/agents/cql/cql_sac_tf_model.py deleted file mode 100644 index 8bdef0d3c7b8..000000000000 --- a/rllib/agents/cql/cql_sac_tf_model.py +++ /dev/null @@ -1,78 +0,0 @@ -import gym -import numpy as np -from typing import Optional - -from ray.rllib.agents.sac.sac_tf_model import SACTFModel -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.typing import ModelConfigDict - -tf1, tf, tfv = try_import_tf() - - -class CQLSACTFModel(SACTFModel): - """Extension of SACTFModel for CQL. - - To customize, do one of the following: - - sub-class CQLTFModel and override one or more of its methods. - - Use CQL's `Q_model` and `policy_model` keys to tweak the default model - behaviors (e.g. fcnet_hiddens, conv_filters, etc..). 
- - Use CQL's `Q_model->custom_model` and `policy_model->custom_model` keys - to specify your own custom Q-model(s) and policy-models, which will be - created within this CQLTFModel (see `build_policy_model` and - `build_q_model`. - - Note: It is not recommended to override the `forward` method for CQL. This - would lead to shared weights (between policy and Q-nets), which will then - not be optimized by either of the critic- or actor-optimizers! - - Data flow: - `obs` -> forward() (should stay a noop method!) -> `model_out` - `model_out` -> get_policy_output() -> pi(actions|obs) - `model_out`, `actions` -> get_q_values() -> Q(s, a) - `model_out`, `actions` -> get_twin_q_values() -> Q_twin(s, a) - """ - - def __init__(self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: Optional[int], - model_config: ModelConfigDict, - name: str, - policy_model_config: ModelConfigDict = None, - q_model_config: ModelConfigDict = None, - twin_q: bool = False, - initial_alpha: float = 1.0, - target_entropy: Optional[float] = None, - lagrangian: bool = False, - initial_alpha_prime: float = 1.0): - """Initialize a CQLSACTFModel instance. - - Args: - policy_model_config (ModelConfigDict): The config dict for the - policy network. - q_model_config (ModelConfigDict): The config dict for the - Q-network(s) (2 if twin_q=True). - twin_q (bool): Build twin Q networks (Q-net and target) for more - stable Q-learning. - initial_alpha (float): The initial value for the to-be-optimized - alpha parameter (default: 1.0). - target_entropy (Optional[float]): A target entropy value for - the to-be-optimized alpha parameter. If None, will use the - defaults described in the papers for SAC (and discrete SAC). - lagrangian (bool): Whether to automatically adjust value via - Lagrangian dual gradient descent. - initial_alpha_prime (float): The initial value for the to-be-optimized - alpha_prime parameter (default: 1.0). - - Note that the core layers for forward() are not defined here, this - only defines the layers for the output heads. Those layers for - forward() should be defined in subclasses of CQLModel. - """ - super(CQLSACTFModel, self).__init__(obs_space, action_space, num_outputs, - model_config, name, policy_model_config, - q_model_config, twin_q, initial_alpha, - target_entropy) - if lagrangian: - self.log_alpha_prime = tf.Variable( - np.log(initial_alpha_prime), dtype=tf.float32, name="log_alpha_prime") - self.alpha_prime = tf.exp(self.log_alpha_prime) diff --git a/rllib/agents/cql/cql_sac_tf_policy.py b/rllib/agents/cql/cql_sac_tf_policy.py deleted file mode 100644 index 53d21fea1d3b..000000000000 --- a/rllib/agents/cql/cql_sac_tf_policy.py +++ /dev/null @@ -1,387 +0,0 @@ -""" -TF policy class used for CQL. 
-""" -from functools import partial - -import numpy as np -import gym -import logging -from typing import Dict, Union, Type, List - -import ray -import ray.experimental.tf_utils -from ray.rllib.agents.cql.cql_sac_tf_model import CQLSACTFModel -from ray.rllib.agents.sac.sac_tf_policy import ActorCriticOptimizerMixin, \ - ComputeTDErrorMixin, TargetNetworkMixin, stats, \ - compute_and_clip_gradients, apply_gradients, SACTFPolicy, sac_actor_critic_loss -from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import TFActionDistribution -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_tf, try_import_tfp -from ray.rllib.utils.typing import TensorType, TrainerConfigDict, LocalOptimizer, \ - ModelGradients - -tf1, tf, tfv = try_import_tf() -tfp = try_import_tfp() - -logger = logging.getLogger(__name__) - - -def build_cql_sac_model(policy: Policy, obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict) -> ModelV2: - """Constructs the necessary ModelV2 for the Policy and returns it. - - Args: - policy (Policy): The TFPolicy that will use the models. - obs_space (gym.spaces.Space): The observation space. - action_space (gym.spaces.Space): The action space. - config (TrainerConfigDict): The CQL trainer's config dict. - - Returns: - ModelV2: The ModelV2 to be used by the Policy. Note: An additional - target model will be created in this function and assigned to - `policy.target_model`. - """ - # With separate state-preprocessor (before obs+action concat). - num_outputs = int(np.product(obs_space.shape)) - - # Force-ignore any additionally provided hidden layer sizes. - # Everything should be configured using CQL_SAC's "Q_model" and "policy_model" - # settings. - policy_model_config = MODEL_DEFAULTS.copy() - policy_model_config.update(config["policy_model"]) - q_model_config = MODEL_DEFAULTS.copy() - q_model_config.update(config["Q_model"]) - - assert config["framework"] != "torch" - default_model_cls = CQLSACTFModel - - model = ModelCatalog.get_model_v2( - obs_space=obs_space, - action_space=action_space, - num_outputs=num_outputs, - model_config=config["model"], - framework=config["framework"], - default_model=default_model_cls, - name="cql_sac_model", - policy_model_config=policy_model_config, - q_model_config=q_model_config, - twin_q=config["twin_q"], - initial_alpha=config["initial_alpha"], - target_entropy=config["target_entropy"], - lagrangian=config["lagrangian"], - initial_alpha_prime=config["initial_alpha_prime"]) - - assert isinstance(model, default_model_cls) - - # Create an exact copy of the model and store it in `policy.target_model`. - # This will be used for tau-synched Q-target models that run behind the - # actual Q-networks and are used for target q-value calculations in the - # loss terms. 
- policy.target_model = ModelCatalog.get_model_v2( - obs_space=obs_space, - action_space=action_space, - num_outputs=num_outputs, - model_config=config["model"], - framework=config["framework"], - default_model=default_model_cls, - name="target_cql_sac_model", - policy_model_config=policy_model_config, - q_model_config=q_model_config, - twin_q=config["twin_q"], - initial_alpha=config["initial_alpha"], - target_entropy=config["target_entropy"], - lagrangian=config["lagrangian"], - initial_alpha_prime=config["initial_alpha_prime"]) - - assert isinstance(policy.target_model, default_model_cls) - - return model - - -# Returns policy tiled actions and log probabilities for CQL Loss -def policy_actions_repeat(model, action_dist, obs, num_repeat=1): - obs_temp = tf.tile(obs, [num_repeat, 1]) - policy_dist = action_dist(model.get_policy_output(obs_temp), model) - actions = policy_dist.sample() - log_p = tf.expand_dims(policy_dist.logp(actions), -1) - return actions, tf.squeeze(log_p, axis=len(log_p.shape) - 1) - - -def q_values_repeat(model, obs, actions, twin=False): - action_shape = tf.shape(actions)[0] - obs_shape = tf.shape(obs)[0] - num_repeat = action_shape // obs_shape - obs_temp = tf.tile(obs, [num_repeat, 1]) - if twin: - preds = model.get_q_values(obs_temp, actions) - else: - preds = model.get_twin_q_values(obs_temp, actions) - preds = tf.reshape(preds, [obs_shape, num_repeat, 1]) - return preds - - -def cql_loss(policy: Policy, model: ModelV2, dist_class: Type[TFActionDistribution], - train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]: - """Constructs the loss for the Soft Actor Critic. - - Args: - policy (Policy): The Policy to calculate the loss for. - model (ModelV2): The Model to calculate the loss for. - dist_class (Type[ActionDistribution]: The action distr. class. - train_batch (SampleBatch): The training data. - - Returns: - Union[TensorType, List[TensorType]]: A single loss tensor or a list - of loss tensors. 
- """ - # For best performance, turn deterministic off - deterministic = policy.config["_deterministic_loss"] - twin_q = policy.config["twin_q"] - discount = policy.config["gamma"] - - # CQL Parameters - bc_iters = policy.config["bc_iters"] - cql_temp = policy.config["temperature"] - num_actions = policy.config["num_actions"] - min_q_weight = policy.config["min_q_weight"] - use_lagrange = policy.config["lagrangian"] - target_action_gap = policy.config["lagrangian_thresh"] - - obs = train_batch[SampleBatch.CUR_OBS] - actions = train_batch[SampleBatch.ACTIONS] - rewards = train_batch[SampleBatch.REWARDS] - next_obs = train_batch[SampleBatch.NEXT_OBS] - terminals = train_batch[SampleBatch.DONES] - - # Execute SAC Policy as it is - sac_loss_res = sac_actor_critic_loss(policy, model, dist_class, train_batch) - - # CQL Loss (We are using Entropy version of CQL (the best version)) - rand_actions = policy._unif_dist.sample([tf.shape(actions)[0] * num_actions, - actions.shape[-1]]) - curr_actions, curr_logp = policy_actions_repeat(model, policy.action_dist_class, - obs, num_actions) - next_actions, next_logp = policy_actions_repeat(model, policy.action_dist_class, - next_obs, num_actions) - curr_logp = tf.reshape(curr_logp, [tf.shape(actions)[0], num_actions, 1]) - next_logp = tf.reshape(next_logp, [tf.shape(actions)[0], num_actions, 1]) - - q1_rand = q_values_repeat(model, policy.model_out_t, rand_actions) - q1_curr_actions = q_values_repeat(model, policy.model_out_t, curr_actions) - q1_next_actions = q_values_repeat(model, policy.model_out_t, next_actions) - - if twin_q: - q2_rand = q_values_repeat(model, policy.model_out_t, rand_actions, twin=True) - q2_curr_actions = q_values_repeat( - model, policy.model_out_t, curr_actions, twin=True) - q2_next_actions = q_values_repeat( - model, policy.model_out_t, next_actions, twin=True) - - random_density = np.log(0.5**curr_actions.shape[-1].value) - cat_q1 = tf.concat([ - q1_rand - random_density, q1_next_actions - tf.stop_gradient(next_logp), - q1_curr_actions - tf.stop_gradient(curr_logp) - ], 1) - if twin_q: - cat_q2 = tf.concat([ - q2_rand - random_density, q2_next_actions - tf.stop_gradient(next_logp), - q2_curr_actions - tf.stop_gradient(curr_logp) - ], 1) - - min_qf1_loss = tf.reduce_mean(tf.reduce_logsumexp( - cat_q1 / cql_temp, axis=1)) * min_q_weight * cql_temp - min_qf1_loss = min_qf1_loss - tf.reduce_mean(policy.q_t_selected) * min_q_weight - if twin_q: - min_qf2_loss = tf.reduce_mean(tf.reduce_logsumexp( - cat_q2 / cql_temp, axis=1)) * min_q_weight * cql_temp - min_qf2_loss = min_qf2_loss - tf.reduce_mean(policy.twin_q_t_selected) * min_q_weight - - if use_lagrange: - alpha_upper_bound = policy.config["alpha_upper_bound"] - alpha_lower_bound = policy.config["alpha_lower_bound"] - alpha_prime = tf.clip_by_value( - tf.exp(model.log_alpha_prime), clip_value_min=alpha_lower_bound, clip_value_max=alpha_upper_bound) - min_qf1_loss = alpha_prime * (min_qf1_loss - target_action_gap) - if twin_q: - min_qf2_loss = alpha_prime * (min_qf2_loss - target_action_gap) - alpha_prime_loss = 0.5 * (-min_qf1_loss - min_qf2_loss) - else: - alpha_prime_loss = -min_qf1_loss - - cql_loss = [min_qf1_loss] - if twin_q: - cql_loss.append(min_qf2_loss) - - policy.critic_loss[0] += min_qf1_loss - if twin_q: - policy.critic_loss[1] += min_qf2_loss - - # Save for stats function. 
- # CQL Stats - policy.cql_loss = cql_loss - if use_lagrange: - policy.log_alpha_prime_value = model.log_alpha_prime - policy.alpha_prime_value = model.alpha_prime - policy.alpha_prime_loss = alpha_prime_loss - # In a custom apply op we handle the losses separately, but return them - # combined in one loss here. - return sac_loss_res + alpha_prime_loss - else: - return sac_loss_res - - -def cql_compute_and_clip_gradients(policy: Policy, optimizer: LocalOptimizer, - loss: TensorType) -> ModelGradients: - """Gradients computing function (from loss tensor, using local optimizer). - - Note: For CQL, optimizer and loss are ignored b/c we have 1 extra - loss and 1 local optimizer (all stored in policy). - `optimizer` will be used, though, in the tf-eager case b/c it is then a - fake optimizer (OptimizerWrapper) object with a `tape` property to - generate a GradientTape object for gradient recording. - - Args: - policy (Policy): The Policy object that generated the loss tensor and - that holds the given local optimizer. - optimizer (LocalOptimizer): The tf (local) optimizer object to - calculate the gradients with. - loss (TensorType): The loss tensor for which gradients should be - calculated. - - Returns: - ModelGradients: List of the possibly clipped gradients- and variable - tuples. - """ - # Eager: Use GradientTape (which is a property of the `optimizer` object - # (an OptimizerWrapper): see rllib/policy/eager_tf_policy.py). - grads_and_vars = compute_and_clip_gradients(policy, optimizer, loss) - if policy.config["lagrangian"]: - if policy.config["framework"] in ["tf2", "tfe"]: - tape = optimizer.tape - alpha_prime_vars = [policy.model.log_alpha_prime] - alpha_prime_grads_and_vars = list( - zip(tape.gradient(policy.alpha_prime_loss, alpha_prime_vars), alpha_prime_vars)) - # Tf1.x: Use optimizer.compute_gradients() - else: - alpha_prime_grads_and_vars = policy._alpha_prime_optimizer.compute_gradients( - policy.alpha_prime_loss, var_list=[policy.model.log_alpha_prime]) - - # Clip if necessary. - if policy.config["grad_clip"]: - clip_func = partial( - tf.clip_by_norm, clip_norm=policy.config["grad_clip"]) - else: - clip_func = tf.identity - - # Save grads and vars for later use in `build_apply_op`. - policy._alpha_prime_grads_and_vars = [(clip_func(g), v) - for (g, v) in alpha_prime_grads_and_vars - if g is not None] - - grads_and_vars = tuple(list(grads_and_vars) + policy._alpha_prime_grads_and_vars) - - return grads_and_vars - - -def cql_apply_gradients( - policy: Policy, optimizer: LocalOptimizer, - grads_and_vars: ModelGradients) -> Union["tf.Operation", None]: - """Gradients applying function (from list of "grad_and_var" tuples). - - Args: - policy (Policy): The Policy object whose Model(s) the given gradients - should be applied to. - optimizer (LocalOptimizer): The tf (local) optimizer object through - which to apply the gradients. - grads_and_vars (ModelGradients): The list of grad_and_var tuples to - apply via the given optimizer. - - Returns: - Union[tf.Operation, None]: The tf op to be used to run the apply - operation. None for eager mode. - """ - grads_group_ops = apply_gradients(policy, optimizer, grads_and_vars) - if policy.config["lagrangian"]: - # Eager mode -> Just apply and return None. - if policy.config["framework"] in ["tf2", "tfe"]: - policy._alpha_prime_optimizer.apply_gradients( - policy._alpha_prime_grads_and_vars) - # Tf static graph -> Return op. 
- else: - alpha_prime_apply_ops = policy._alpha_prime_optimizer.apply_gradients( - policy._alpha_prime_grads_and_vars) - grads_group_ops = tf.group([grads_group_ops, alpha_prime_apply_ops]) - - return grads_group_ops - - -def cql_stats(policy: Policy, - train_batch: SampleBatch) -> Dict[str, TensorType]: - cql_dict = stats(policy, train_batch) - cql_dict["cql_loss"] = tf.reduce_mean(tf.stack(policy.cql_loss)) - if policy.config["lagrangian"]: - cql_dict["log_alpha_prime_value"] = policy.log_alpha_prime_value - cql_dict["alpha_prime_value"] = policy.alpha_prime_value - cql_dict["alpha_prime_loss"] = policy.alpha_prime_loss - return cql_dict - - -class CQLActorCriticOptimizerMixin(ActorCriticOptimizerMixin): - def __init__(self, config): - super().__init__(config) - if config["framework"] in ["tf2", "tfe"]: - if config["lagrangian"]: - self._alpha_prime_optimizer = tf.keras.optimizers.Adam( - learning_rate=config["optimization"]["critic_learning_rate"]) - else: - if config["lagrangian"]: - self._alpha_prime_optimizer = tf1.train.AdamOptimizer( - learning_rate=config["optimization"]["critic_learning_rate"]) - - -def cql_setup_early_mixins(policy: Policy, obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict): - """Call mixin classes' constructors before Policy's initialization. - - Adds the necessary optimizers to the given Policy. - - Args: - policy (Policy): The Policy object. - obs_space (gym.spaces.Space): The Policy's observation space. - action_space (gym.spaces.Space): The Policy's action space. - config (TrainerConfigDict): The Policy's config. - """ - CQLActorCriticOptimizerMixin.__init__(policy, config) - - -def cql_setup_mid_mixins(policy: Policy, obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict) -> None: - action_low = policy.model.action_space.low[0] - action_high = policy.model.action_space.high[0] - policy._unif_dist = tfp.distributions.Uniform(action_low, action_high, - name = "uniform_rand_actions") - ComputeTDErrorMixin.__init__(policy, cql_loss) - - -# Build a child class of `TFPolicy`, given the custom functions defined -# above. -CQLSACTFPolicy = SACTFPolicy.with_updates( - name="CQLSACTFPolicy", - get_default_config=lambda: ray.rllib.agents.cql.CQLSAC_DEFAULT_CONFIG, - make_model=build_cql_sac_model, - loss_fn=cql_loss, - stats_fn=cql_stats, - gradients_fn=cql_compute_and_clip_gradients, - apply_gradients_fn=cql_apply_gradients, - mixins=[ - TargetNetworkMixin, CQLActorCriticOptimizerMixin, ComputeTDErrorMixin - ], - before_init=cql_setup_early_mixins, - before_loss_init=cql_setup_mid_mixins, -) diff --git a/rllib/agents/cql/cql_tf_policy.py b/rllib/agents/cql/cql_tf_policy.py new file mode 100644 index 000000000000..f1e8ace02aad --- /dev/null +++ b/rllib/agents/cql/cql_tf_policy.py @@ -0,0 +1,398 @@ +""" +TensorFlow policy class used for CQL. 
+""" +from functools import partial +import numpy as np +import gym +import logging +from typing import Dict, List, Type, Union + +import ray +import ray.experimental.tf_utils +from ray.rllib.agents.sac.sac_tf_policy import \ + apply_gradients as sac_apply_gradients, \ + compute_and_clip_gradients as sac_compute_and_clip_gradients,\ + get_distribution_inputs_and_class, _get_dist_class, build_sac_model, \ + postprocess_trajectory, setup_late_mixins, stats, validate_spaces, \ + ActorCriticOptimizerMixin as SACActorCriticOptimizerMixin, \ + ComputeTDErrorMixin, TargetNetworkMixin +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.policy.tf_policy_template import build_tf_policy +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.exploration.random import Random +from ray.rllib.utils.framework import get_variable, \ + try_import_tf, try_import_tfp +from ray.rllib.utils.typing import LocalOptimizer, ModelGradients, \ + TensorType, TrainerConfigDict + +tf1, tf, tfv = try_import_tf() +tfp = try_import_tfp() + +logger = logging.getLogger(__name__) + +MEAN_MIN = -9.0 +MEAN_MAX = 9.0 + + +# Returns policy tiled actions and log probabilities for CQL Loss +def policy_actions_repeat(model, action_dist, obs, num_repeat=1): + obs_temp = tf.reshape( + tf.tile(tf.expand_dims(obs, 1), [1, num_repeat, 1]), + [-1, obs.shape[1]]) + logits = model.get_policy_output(obs_temp) + policy_dist = action_dist(logits, model) + actions, logp_ = policy_dist.sample_logp() + logp = tf.expand_dims(logp_, -1) + return actions, tf.reshape(logp, [tf.shape(obs)[0], num_repeat, 1]) + + +def q_values_repeat(model, obs, actions, twin=False): + action_shape = tf.shape(actions)[0] + obs_shape = tf.shape(obs)[0] + num_repeat = action_shape // obs_shape + obs_temp = tf.reshape( + tf.tile(tf.expand_dims(obs, 1), [1, num_repeat, 1]), + [-1, tf.shape(obs)[1]]) + if not twin: + preds_ = model.get_q_values(obs_temp, actions) + else: + preds_ = model.get_twin_q_values(obs_temp, actions) + preds = tf.reshape(preds_, [tf.shape(obs)[0], num_repeat, 1]) + return preds + + +def cql_loss(policy: Policy, model: ModelV2, + dist_class: Type[TFActionDistribution], + train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]: + logger.info(f"Current iteration = {policy.cur_iter}") + policy.cur_iter += 1 + + # For best performance, turn deterministic off + deterministic = policy.config["_deterministic_loss"] + assert not deterministic + twin_q = policy.config["twin_q"] + discount = policy.config["gamma"] + + # CQL Parameters + bc_iters = policy.config["bc_iters"] + cql_temp = policy.config["temperature"] + num_actions = policy.config["num_actions"] + min_q_weight = policy.config["min_q_weight"] + use_lagrange = policy.config["lagrangian"] + target_action_gap = policy.config["lagrangian_thresh"] + + obs = train_batch[SampleBatch.CUR_OBS] + actions = tf.cast(train_batch[SampleBatch.ACTIONS], tf.float32) + rewards = tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) + next_obs = train_batch[SampleBatch.NEXT_OBS] + terminals = train_batch[SampleBatch.DONES] + + model_out_t, _ = model({ + "obs": obs, + "is_training": True, + }, [], None) + + model_out_tp1, _ = model({ + "obs": next_obs, + "is_training": True, + }, [], None) + + target_model_out_tp1, _ = policy.target_model({ + "obs": next_obs, + "is_training": True, + }, [], None) + + action_dist_class = _get_dist_class( + # policy, + policy.config, + 
policy.action_space, + ) + action_dist_t = action_dist_class( + model.get_policy_output(model_out_t), model) + policy_t, log_pis_t = action_dist_t.sample_logp() + log_pis_t = tf.expand_dims(log_pis_t, -1) + + # Unlike original SAC, Alpha and Actor Loss are computed first. + # Alpha Loss + alpha_loss = -tf.reduce_mean( + model.log_alpha * tf.stop_gradient(log_pis_t + model.target_entropy)) + + # Policy Loss (Either Behavior Clone Loss or SAC Loss) + alpha = tf.math.exp(model.log_alpha) + if policy.cur_iter >= bc_iters: + min_q = model.get_q_values(model_out_t, policy_t) + if twin_q: + twin_q_ = model.get_twin_q_values(model_out_t, policy_t) + min_q = tf.math.minimum(min_q, twin_q_) + actor_loss = tf.reduce_mean( + tf.stop_gradient(alpha) * log_pis_t - min_q) + else: + bc_logp = action_dist_t.logp(actions) + actor_loss = tf.reduce_mean( + tf.stop_gradient(alpha) * log_pis_t - bc_logp) + # actor_loss = -tf.reduce_mean(bc_logp) + + # Critic Loss (Standard SAC Critic L2 Loss + CQL Entropy Loss) + # SAC Loss: + # Q-values for the batched actions. + action_dist_tp1 = action_dist_class( + model.get_policy_output(model_out_tp1), model) + policy_tp1, _ = action_dist_tp1.sample_logp() + + q_t = model.get_q_values(model_out_t, actions) + q_t_selected = tf.squeeze(q_t, axis=-1) + if twin_q: + twin_q_t = model.get_twin_q_values(model_out_t, actions) + twin_q_t_selected = tf.squeeze(twin_q_t, axis=-1) + + # Target q network evaluation. + q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, policy_tp1) + if twin_q: + twin_q_tp1 = policy.target_model.get_twin_q_values( + target_model_out_tp1, policy_tp1) + # Take min over both twin-NNs. + q_tp1 = tf.math.minimum(q_tp1, twin_q_tp1) + + q_tp1_best = tf.squeeze(input=q_tp1, axis=-1) + q_tp1_best_masked = (1.0 - tf.cast(terminals, tf.float32)) * q_tp1_best + + # compute RHS of bellman equation + q_t_target = tf.stop_gradient( + rewards + (discount**policy.config["n_step"]) * q_tp1_best_masked) + + # Compute the TD-error (potentially clipped), for priority replay buffer + base_td_error = tf.math.abs(q_t_selected - q_t_target) + if twin_q: + twin_td_error = tf.math.abs(twin_q_t_selected - q_t_target) + td_error = 0.5 * (base_td_error + twin_td_error) + else: + td_error = base_td_error + + critic_loss_1 = tf.keras.losses.MSE(q_t_selected, q_t_target) + if twin_q: + critic_loss_2 = tf.keras.losses.MSE(twin_q_t_selected, q_t_target) + + # CQL Loss (We are using Entropy version of CQL (the best version)) + rand_actions, _ = policy._random_action_generator.get_exploration_action( + action_distribution=action_dist_class( + tf.tile(action_dist_tp1.inputs, (num_actions, 1)), model), + timestep=0, + explore=True) + curr_actions, curr_logp = policy_actions_repeat(model, action_dist_class, + model_out_t, num_actions) + next_actions, next_logp = policy_actions_repeat(model, action_dist_class, + model_out_tp1, num_actions) + + q1_rand = q_values_repeat(model, model_out_t, rand_actions) + q1_curr_actions = q_values_repeat(model, model_out_t, curr_actions) + q1_next_actions = q_values_repeat(model, model_out_t, next_actions) + + if twin_q: + q2_rand = q_values_repeat(model, model_out_t, rand_actions, twin=True) + q2_curr_actions = q_values_repeat( + model, model_out_t, curr_actions, twin=True) + q2_next_actions = q_values_repeat( + model, model_out_t, next_actions, twin=True) + + random_density = np.log(0.5**int(curr_actions.shape[-1])) + cat_q1 = tf.concat([ + q1_rand - random_density, + q1_next_actions - tf.stop_gradient(next_logp), + q1_curr_actions - 
tf.stop_gradient(curr_logp) + ], 1) + if twin_q: + cat_q2 = tf.concat([ + q2_rand - random_density, + q2_next_actions - tf.stop_gradient(next_logp), + q2_curr_actions - tf.stop_gradient(curr_logp) + ], 1) + + min_qf1_loss_ = tf.reduce_mean( + tf.reduce_logsumexp(cat_q1 / cql_temp, + axis=1)) * min_q_weight * cql_temp + min_qf1_loss = min_qf1_loss_ - (tf.reduce_mean(q_t) * min_q_weight) + if twin_q: + min_qf2_loss_ = tf.reduce_mean( + tf.reduce_logsumexp(cat_q2 / cql_temp, + axis=1)) * min_q_weight * cql_temp + min_qf2_loss = min_qf2_loss_ - ( + tf.reduce_mean(twin_q_t) * min_q_weight) + + if use_lagrange: + alpha_prime = tf.clip_by_value(model.log_alpha_prime.exp(), 0.0, + 1000000.0)[0] + min_qf1_loss = alpha_prime * (min_qf1_loss - target_action_gap) + if twin_q: + min_qf2_loss = alpha_prime * (min_qf2_loss - target_action_gap) + alpha_prime_loss = 0.5 * (-min_qf1_loss - min_qf2_loss) + else: + alpha_prime_loss = -min_qf1_loss + + cql_loss = [min_qf1_loss] + if twin_q: + cql_loss.append(min_qf2_loss) + + critic_loss = [critic_loss_1 + min_qf1_loss] + if twin_q: + critic_loss.append(critic_loss_2 + min_qf2_loss) + + # Save for stats function. + policy.q_t = q_t_selected + policy.policy_t = policy_t + policy.log_pis_t = log_pis_t + policy.td_error = td_error + policy.actor_loss = actor_loss + policy.critic_loss = critic_loss + policy.alpha_loss = alpha_loss + policy.log_alpha_value = model.log_alpha + policy.alpha_value = alpha + policy.target_entropy = model.target_entropy + # CQL Stats + policy.cql_loss = cql_loss + if use_lagrange: + policy.log_alpha_prime_value = model.log_alpha_prime[0] + policy.alpha_prime_value = alpha_prime + policy.alpha_prime_loss = alpha_prime_loss + + # Return all loss terms corresponding to our optimizers. + if use_lagrange: + return actor_loss + tf.math.add_n(critic_loss) + alpha_loss + \ + alpha_prime_loss + return actor_loss + tf.math.add_n(critic_loss) + alpha_loss + + +def cql_stats(policy: Policy, + train_batch: SampleBatch) -> Dict[str, TensorType]: + sac_dict = stats(policy, train_batch) + sac_dict["cql_loss"] = tf.reduce_mean(tf.stack(policy.cql_loss)) + if policy.config["lagrangian"]: + sac_dict["log_alpha_prime_value"] = policy.log_alpha_prime_value + sac_dict["alpha_prime_value"] = policy.alpha_prime_value + sac_dict["alpha_prime_loss"] = policy.alpha_prime_loss + return sac_dict + + +class ActorCriticOptimizerMixin(SACActorCriticOptimizerMixin): + def __init__(self, config): + super().__init__(config) + if config["lagrangian"]: + # Eager mode. + if config["framework"] in ["tf2", "tfe"]: + self._alpha_prime_optimizer = tf.keras.optimizers.Adam( + learning_rate=config["optimization"][ + "critic_learning_rate"]) + # Static graph mode. + else: + self._alpha_prime_optimizer = tf1.train.AdamOptimizer( + learning_rate=config["optimization"][ + "critic_learning_rate"]) + + +def setup_early_mixins(policy: Policy, obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + config: TrainerConfigDict) -> None: + """Call mixin classes' constructors before Policy's initialization. + + Adds the necessary optimizers to the given Policy. + + Args: + policy (Policy): The Policy object. + obs_space (gym.spaces.Space): The Policy's observation space. + action_space (gym.spaces.Space): The Policy's action space. + config (TrainerConfigDict): The Policy's config. 
+ """ + policy.cur_iter = 0 + ActorCriticOptimizerMixin.__init__(policy, config) + if config["lagrangian"]: + policy.model.log_alpha_prime = get_variable( + 0.0, framework="tf", trainable=True, tf_name="log_alpha_prime") + policy.alpha_prime_optim = tf.keras.optimizers.Adam( + learning_rate=config["optimization"]["critic_learning_rate"], ) + # Generic random action generator for calculating CQL-loss. + policy._random_action_generator = Random( + action_space, + model=None, + framework="tf2", + policy_config=config, + num_workers=0, + worker_index=0) + + +def compute_gradients_fn(policy: Policy, optimizer: LocalOptimizer, + loss: TensorType) -> ModelGradients: + grads_and_vars = sac_compute_and_clip_gradients(policy, optimizer, loss) + + if policy.config["lagrangian"]: + # Eager: Use GradientTape (which is a property of the `optimizer` + # object (an OptimizerWrapper): see rllib/policy/eager_tf_policy.py). + if policy.config["framework"] in ["tf2", "tfe"]: + tape = optimizer.tape + log_alpha_prime = [policy.model.log_alpha_prime] + alpha_prime_grads_and_vars = list( + zip( + tape.gradient(policy.alpha_prime_loss, log_alpha_prime), + log_alpha_prime)) + # Tf1.x: Use optimizer.compute_gradients() + else: + alpha_prime_grads_and_vars = \ + policy._alpha_prime_optimizer.compute_gradients( + policy.alpha_prime_loss, + var_list=[policy.model.log_alpha_prime]) + + # Clip if necessary. + if policy.config["grad_clip"]: + clip_func = partial( + tf.clip_by_norm, clip_norm=policy.config["grad_clip"]) + else: + clip_func = tf.identity + + # Save grads and vars for later use in `build_apply_op`. + policy._alpha_prime_grads_and_vars = [ + (clip_func(g), v) for (g, v) in alpha_prime_grads_and_vars + if g is not None + ] + + grads_and_vars += policy._alpha_prime_grads_and_vars + return grads_and_vars + + +def apply_gradients_fn(policy, optimizer, grads_and_vars): + sac_results = sac_apply_gradients(policy, optimizer, grads_and_vars) + + if policy.config["lagrangian"]: + # Eager mode -> Just apply and return None. + if policy.config["framework"] in ["tf2", "tfe"]: + policy._alpha_prime_optimizer.apply_gradients( + policy._alpha_prime_grads_and_vars) + return + # Tf static graph -> Return grouped op. + else: + alpha_prime_apply_op = \ + policy._alpha_prime_optimizer.apply_gradients( + policy._alpha_prime_grads_and_vars, + global_step=tf1.train.get_or_create_global_step()) + return tf.group([sac_results, alpha_prime_apply_op]) + return sac_results + + +# Build a child class of `TFPolicy`, given the custom functions defined +# above. 
+CQLTFPolicy = build_tf_policy( + name="CQLTFPolicy", + loss_fn=cql_loss, + get_default_config=lambda: ray.rllib.agents.cql.cql.CQL_DEFAULT_CONFIG, + validate_spaces=validate_spaces, + stats_fn=cql_stats, + postprocess_fn=postprocess_trajectory, + before_init=setup_early_mixins, + after_init=setup_late_mixins, + make_model=build_sac_model, + mixins=[ + ActorCriticOptimizerMixin, TargetNetworkMixin, ComputeTDErrorMixin + ], + action_distribution_fn=get_distribution_inputs_and_class, + gradients_fn=compute_gradients_fn, + apply_gradients_fn=apply_gradients_fn, +) diff --git a/rllib/agents/cql/cql_sac_torch_policy.py b/rllib/agents/cql/cql_torch_policy.py similarity index 68% rename from rllib/agents/cql/cql_sac_torch_policy.py rename to rllib/agents/cql/cql_torch_policy.py index c9fe4c7dad1b..d54bcc5d3a54 100644 --- a/rllib/agents/cql/cql_sac_torch_policy.py +++ b/rllib/agents/cql/cql_torch_policy.py @@ -14,6 +14,7 @@ build_sac_model_and_action_dist, optimizer_fn, ComputeTDErrorMixin, \ TargetNetworkMixin, setup_late_mixins, action_distribution_fn from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper +from ray.rllib.policy.policy import LEARNER_STATS_KEY from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.policy.policy import Policy @@ -22,22 +23,26 @@ from ray.rllib.utils.typing import LocalOptimizer, TensorType, \ TrainerConfigDict from ray.rllib.utils.torch_ops import apply_grad_clipping, \ - convert_to_torch_tensor + convert_to_torch_tensor, concat_multi_gpu_td_errors torch, nn = try_import_torch() F = nn.functional logger = logging.getLogger(__name__) +MEAN_MIN = -9.0 +MEAN_MAX = 9.0 + # Returns policy tiled actions and log probabilities for CQL Loss def policy_actions_repeat(model, action_dist, obs, num_repeat=1): obs_temp = obs.unsqueeze(1).repeat(1, num_repeat, 1).view( obs.shape[0] * num_repeat, obs.shape[1]) - policy_dist = action_dist(model.get_policy_output(obs_temp), model) - actions = policy_dist.sample() - log_p = torch.unsqueeze(policy_dist.logp(actions), -1) - return actions, log_p.squeeze() + logits = model.get_policy_output(obs_temp) + policy_dist = action_dist(logits, model) + actions, logp_ = policy_dist.sample_logp() + logp = logp_.unsqueeze(-1) + return actions, logp.view(obs.shape[0], num_repeat, 1) def q_values_repeat(model, obs, actions, twin=False): @@ -47,20 +52,25 @@ def q_values_repeat(model, obs, actions, twin=False): obs_temp = obs.unsqueeze(1).repeat(1, num_repeat, 1).view( obs.shape[0] * num_repeat, obs.shape[1]) if not twin: - preds = model.get_q_values(obs_temp, actions) + preds_ = model.get_q_values(obs_temp, actions) else: - preds = model.get_twin_q_values(obs_temp, actions) - preds = preds.view(obs.shape[0], num_repeat, 1) + preds_ = model.get_twin_q_values(obs_temp, actions) + preds = preds_.view(obs.shape[0], num_repeat, 1) return preds def cql_loss(policy: Policy, model: ModelV2, dist_class: Type[TorchDistributionWrapper], train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]: - print(policy.cur_iter) + logger.info(f"Current iteration = {policy.cur_iter}") policy.cur_iter += 1 + + # Look up the target model (tower) using the model tower. 
+ target_model = policy.target_models[model] + # For best performance, turn deterministic off deterministic = policy.config["_deterministic_loss"] + assert not deterministic twin_q = policy.config["twin_q"] discount = policy.config["gamma"] action_low = model.action_space.low[0] @@ -76,7 +86,7 @@ def cql_loss(policy: Policy, model: ModelV2, obs = train_batch[SampleBatch.CUR_OBS] actions = train_batch[SampleBatch.ACTIONS] - rewards = train_batch[SampleBatch.REWARDS] + rewards = train_batch[SampleBatch.REWARDS].float() next_obs = train_batch[SampleBatch.NEXT_OBS] terminals = train_batch[SampleBatch.DONES] @@ -90,23 +100,28 @@ def cql_loss(policy: Policy, model: ModelV2, "is_training": True, }, [], None) - target_model_out_tp1, _ = policy.target_model({ + target_model_out_tp1, _ = target_model({ "obs": next_obs, "is_training": True, }, [], None) - action_dist_class = _get_dist_class(policy.config, policy.action_space) + action_dist_class = _get_dist_class(policy, policy.config, + policy.action_space) action_dist_t = action_dist_class( model.get_policy_output(model_out_t), policy.model) - policy_t = action_dist_t.sample() if not deterministic else \ - action_dist_t.deterministic_sample() - log_pis_t = torch.unsqueeze(action_dist_t.logp(policy_t), -1) + policy_t, log_pis_t = action_dist_t.sample_logp() + log_pis_t = torch.unsqueeze(log_pis_t, -1) # Unlike original SAC, Alpha and Actor Loss are computed first. # Alpha Loss alpha_loss = -(model.log_alpha * (log_pis_t + model.target_entropy).detach()).mean() + if obs.shape[0] == policy.config["train_batch_size"]: + policy.alpha_optim.zero_grad() + alpha_loss.backward() + policy.alpha_optim.step() + # Policy Loss (Either Behavior Clone Loss or SAC Loss) alpha = torch.exp(model.log_alpha) if policy.cur_iter >= bc_iters: @@ -117,51 +132,55 @@ def cql_loss(policy: Policy, model: ModelV2, actor_loss = (alpha.detach() * log_pis_t - min_q).mean() else: bc_logp = action_dist_t.logp(actions) - actor_loss = (alpha * log_pis_t - bc_logp).mean() + actor_loss = (alpha.detach() * log_pis_t - bc_logp).mean() + # actor_loss = -bc_logp.mean() + + if obs.shape[0] == policy.config["train_batch_size"]: + policy.actor_optim.zero_grad() + actor_loss.backward(retain_graph=True) + policy.actor_optim.step() # Critic Loss (Standard SAC Critic L2 Loss + CQL Entropy Loss) - # SAC Loss + # SAC Loss: + # Q-values for the batched actions. action_dist_tp1 = action_dist_class( model.get_policy_output(model_out_tp1), policy.model) - policy_tp1 = action_dist_tp1.sample() if not deterministic else \ - action_dist_tp1.deterministic_sample() + policy_tp1, _ = action_dist_tp1.sample_logp() - # Q-values for the batched actions. q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) + q_t_selected = torch.squeeze(q_t, dim=-1) if twin_q: twin_q_t = model.get_twin_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) + twin_q_t_selected = torch.squeeze(twin_q_t, dim=-1) # Target q network evaluation. - q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, - policy_tp1) + q_tp1 = target_model.get_q_values(target_model_out_tp1, policy_tp1) if twin_q: - twin_q_tp1 = policy.target_model.get_twin_q_values( - target_model_out_tp1, policy_tp1) + twin_q_tp1 = target_model.get_twin_q_values(target_model_out_tp1, + policy_tp1) # Take min over both twin-NNs. 
q_tp1 = torch.min(q_tp1, twin_q_tp1) - q_t = torch.squeeze(q_t, dim=-1) - if twin_q: - twin_q_t = torch.squeeze(twin_q_t, dim=-1) - - q_tp1 = torch.squeeze(input=q_tp1, dim=-1) - q_tp1 = (1.0 - terminals.float()) * q_tp1 + q_tp1_best = torch.squeeze(input=q_tp1, dim=-1) + q_tp1_best_masked = (1.0 - terminals.float()) * q_tp1_best # compute RHS of bellman equation q_t_target = ( - rewards + (discount**policy.config["n_step"]) * q_tp1).detach() + rewards + + (discount**policy.config["n_step"]) * q_tp1_best_masked).detach() # Compute the TD-error (potentially clipped), for priority replay buffer - base_td_error = torch.abs(q_t - q_t_target) + base_td_error = torch.abs(q_t_selected - q_t_target) if twin_q: - twin_td_error = torch.abs(twin_q_t - q_t_target) + twin_td_error = torch.abs(twin_q_t_selected - q_t_target) td_error = 0.5 * (base_td_error + twin_td_error) else: td_error = base_td_error - critic_loss = [nn.MSELoss()(q_t, q_t_target)] + + critic_loss_1 = nn.functional.mse_loss(q_t_selected, q_t_target) if twin_q: - critic_loss.append(nn.MSELoss()(twin_q_t, q_t_target)) + critic_loss_2 = nn.functional.mse_loss(twin_q_t_selected, q_t_target) # CQL Loss (We are using Entropy version of CQL (the best version)) rand_actions = convert_to_torch_tensor( @@ -169,12 +188,9 @@ def cql_loss(policy: Policy, model: ModelV2, actions.shape[-1]).uniform_(action_low, action_high), policy.device) curr_actions, curr_logp = policy_actions_repeat(model, action_dist_class, - obs, num_actions) + model_out_t, num_actions) next_actions, next_logp = policy_actions_repeat(model, action_dist_class, - next_obs, num_actions) - - curr_logp = curr_logp.view(actions.shape[0], num_actions, 1) - next_logp = next_logp.view(actions.shape[0], num_actions, 1) + model_out_tp1, num_actions) q1_rand = q_values_repeat(model, model_out_t, rand_actions) q1_curr_actions = q_values_repeat(model, model_out_t, curr_actions) @@ -198,13 +214,13 @@ def cql_loss(policy: Policy, model: ModelV2, q2_curr_actions - curr_logp.detach() ], 1) - min_qf1_loss = torch.logsumexp( + min_qf1_loss_ = torch.logsumexp( cat_q1 / cql_temp, dim=1).mean() * min_q_weight * cql_temp - min_qf1_loss = min_qf1_loss - q_t.mean() * min_q_weight + min_qf1_loss = min_qf1_loss_ - (q_t.mean() * min_q_weight) if twin_q: - min_qf2_loss = torch.logsumexp( + min_qf2_loss_ = torch.logsumexp( cat_q2 / cql_temp, dim=1).mean() * min_q_weight * cql_temp - min_qf2_loss = min_qf2_loss - twin_q_t.mean() * min_q_weight + min_qf2_loss = min_qf2_loss_ - (twin_q_t.mean() * min_q_weight) if use_lagrange: alpha_prime = torch.clamp( @@ -216,32 +232,47 @@ def cql_loss(policy: Policy, model: ModelV2, else: alpha_prime_loss = -min_qf1_loss - cql_loss = [min_qf2_loss] + cql_loss = [min_qf1_loss] if twin_q: cql_loss.append(min_qf2_loss) - critic_loss[0] += min_qf1_loss + critic_loss = [critic_loss_1 + min_qf1_loss] if twin_q: - critic_loss[1] += min_qf2_loss + critic_loss.append(critic_loss_2 + min_qf2_loss) + + if obs.shape[0] == policy.config["train_batch_size"]: + policy.critic_optims[0].zero_grad() + critic_loss[0].backward(retain_graph=True) + policy.critic_optims[0].step() + + if twin_q: + policy.critic_optims[1].zero_grad() + critic_loss[1].backward(retain_graph=False) + policy.critic_optims[1].step() # Save for stats function. 
- policy.q_t = q_t + policy.q_t = q_t_selected policy.policy_t = policy_t policy.log_pis_t = log_pis_t - policy.td_error = td_error + model.td_error = td_error policy.actor_loss = actor_loss policy.critic_loss = critic_loss policy.alpha_loss = alpha_loss policy.log_alpha_value = model.log_alpha policy.alpha_value = alpha policy.target_entropy = model.target_entropy - # CQL Stats + # CQL Stats. policy.cql_loss = cql_loss if use_lagrange: policy.log_alpha_prime_value = model.log_alpha_prime[0] policy.alpha_prime_value = alpha_prime policy.alpha_prime_loss = alpha_prime_loss + if obs.shape[0] == policy.config["train_batch_size"]: + policy.alpha_prime_optim.zero_grad() + alpha_prime_loss.backward() + policy.alpha_prime_optim.step() + # Return all loss terms corresponding to our optimizers. if use_lagrange: return tuple([policy.actor_loss] + policy.critic_loss + @@ -288,13 +319,29 @@ def cql_setup_late_mixins(policy: Policy, obs_space: gym.spaces.Space, policy.device) +def compute_gradients_fn(policy, postprocessed_batch): + batches = [policy._lazy_tensor_dict(postprocessed_batch)] + model = policy.model + policy._loss(policy, model, policy.dist_class, batches[0]) + stats = { + LEARNER_STATS_KEY: policy._convert_to_non_torch_type( + cql_stats(policy, batches[0])) + } + return [None, stats] + + +def apply_gradients_fn(policy, gradients): + return + + # Build a child class of `TorchPolicy`, given the custom functions defined # above. -CQLSACTorchPolicy = build_policy_class( - name="CQLSACTorchPolicy", +# Bonsai update: Torch is defunct due to lacking template api in 1.3. +CQLTorchPolicy = build_policy_class( + name="CQLTorchPolicy", framework="torch", loss_fn=cql_loss, - get_default_config=lambda: ray.rllib.agents.cql.cql.CQLSAC_DEFAULT_CONFIG, + get_default_config=lambda: ray.rllib.agents.cql.cql.CQL_DEFAULT_CONFIG, stats_fn=cql_stats, postprocess_fn=postprocess_trajectory, extra_grad_process_fn=apply_grad_clipping, @@ -302,6 +349,9 @@ def cql_setup_late_mixins(policy: Policy, obs_space: gym.spaces.Space, validate_spaces=validate_spaces, before_loss_init=cql_setup_late_mixins, make_model_and_action_dist=build_sac_model_and_action_dist, + extra_learn_fetches_fn=concat_multi_gpu_td_errors, mixins=[TargetNetworkMixin, ComputeTDErrorMixin], action_distribution_fn=action_distribution_fn, + # compute_gradients_fn=compute_gradients_fn, + apply_gradients_fn=apply_gradients_fn, ) diff --git a/rllib/agents/cql/tests/test_cql.py b/rllib/agents/cql/tests/test_cql.py new file mode 100644 index 000000000000..ec249084813b --- /dev/null +++ b/rllib/agents/cql/tests/test_cql.py @@ -0,0 +1,136 @@ +import numpy as np +from pathlib import Path +import os +import unittest + +import ray +import ray.rllib.agents.cql as cql +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.test_utils import check_compute_single_action, \ + framework_iterator + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + + +class TestCQL(unittest.TestCase): + @classmethod + def setUpClass(cls): + ray.init() + + @classmethod + def tearDownClass(cls): + ray.shutdown() + + def test_cql_compilation(self): + """Test whether a CQLTrainer can be built with all frameworks.""" + + # Learns from a historic-data file. 
+ # To generate this data, first run: + # $ ./train.py --run=SAC --env=Pendulum-v0 \ + # --stop='{"timesteps_total": 50000}' \ + # --config='{"output": "/tmp/out"}' + rllib_dir = Path(__file__).parent.parent.parent.parent + print("rllib dir={}".format(rllib_dir)) + data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json") + print("data_file={} exists={}".format(data_file, + os.path.isfile(data_file))) + + config = cql.CQL_DEFAULT_CONFIG.copy() + config["env"] = "Pendulum-v0" + config["input"] = [data_file] + + # In the files, we use here for testing, actions have already + # been normalized. + # This is usually the case when the file was generated by another + # RLlib algorithm (e.g. PPO or SAC). + # config["actions_in_input_normalized"] = False + config["clip_actions"] = True + config["train_batch_size"] = 2000 + + config["num_workers"] = 0 # Run locally. + config["twin_q"] = True + config["learning_starts"] = 0 + config["bc_iters"] = 2 # 2 BC iters, 2 CQL iters. + config["rollout_fragment_length"] = 1 + + # Switch on off-policy evaluation. + config["input_evaluation"] = ["is"] + + config["evaluation_interval"] = 2 + config["evaluation_num_episodes"] = 10 + config["evaluation_config"]["input"] = "sampler" + # config["evaluation_parallel_to_training"] = False + config["evaluation_num_workers"] = 2 + + num_iterations = 4 + + # Test for tf/torch frameworks. + for fw in framework_iterator(config, frameworks={"tf"}): + trainer = cql.CQLTrainer(config=config) + print("CQLTrainer is created.") + for i in range(num_iterations): + train_result = trainer.train() + eval_result = train_result.get("evaluation") + print(f"Iteration {i+1} completed.") + if eval_result: + print(f"iter={trainer.iteration} " + f"R={eval_result['episode_reward_mean']}") + + check_compute_single_action(trainer) + + # Get policy and model. + pol = trainer.get_policy() + cql_model = pol.model + if fw == "tf": + pol.get_session().__enter__() + + # Example on how to do evaluation on the trained Trainer + # using the data from CQL's global replay buffer. + # Get a sample (MultiAgentBatch -> SampleBatch). + from ray.rllib.agents.cql.cql import replay_buffer + batch = replay_buffer.replay().policy_batches["default_policy"] + + if fw == "torch": + obs = torch.from_numpy(batch["obs"]) + else: + obs = batch["obs"] + batch["actions"] = batch["actions"].astype(np.float32) + + # Pass the observations through our model to get the + # features, which then to pass through the Q-head. + model_out, _ = cql_model({"obs": obs}) + # The estimated Q-values from the (historic) actions in the batch. + if fw == "torch": + q_values_old = cql_model.get_q_values( + model_out, torch.from_numpy(batch["actions"])) + else: + q_values_old = cql_model.get_q_values( + tf.convert_to_tensor(model_out), batch["actions"]) + + # The estimated Q-values for the new actions computed + # by our trainer policy. 
+ actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0] + if fw == "torch": + q_values_new = cql_model.get_q_values( + model_out, torch.from_numpy(actions_new)) + else: + q_values_new = cql_model.get_q_values(model_out, actions_new) + + if fw == "tf": + q_values_old, q_values_new = pol.get_session().run( + [q_values_old, q_values_new]) + + print(f"Q-val batch={q_values_old}") + print(f"Q-val policy={q_values_new}") + + if fw == "tf": + pol.get_session().__exit__(None, None, None) + + trainer.stop() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/agents/cql/tests/test_cql_sac.py b/rllib/agents/cql/tests/test_cql_sac.py deleted file mode 100644 index ca74b6c86945..000000000000 --- a/rllib/agents/cql/tests/test_cql_sac.py +++ /dev/null @@ -1,622 +0,0 @@ -from gym import Env -from gym.spaces import Box, Discrete, Tuple -import numpy as np -import re -import unittest - -import ray -import ray.rllib.agents.sac as sac -from ray.rllib.agents.cql import CQLSACTrainer, CQLSAC_DEFAULT_CONFIG -from ray.rllib.agents.sac.sac_tf_policy import sac_actor_critic_loss as tf_loss -from ray.rllib.agents.sac.sac_torch_policy import actor_critic_loss as \ - loss_torch -from ray.rllib.env.wrappers.moab_wrapper import MOAB_MOVE_TO_CENTER_ENV_NAME -from ray.rllib.examples.env.random_env import RandomEnv -from ray.rllib.examples.models.batch_norm_model import KerasBatchNormModel, \ - TorchBatchNormModel -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.tf.tf_action_dist import Dirichlet -from ray.rllib.models.torch.torch_action_dist import TorchDirichlet -from ray.rllib.execution.replay_buffer import LocalReplayBuffer -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.numpy import fc, huber_loss, relu -from ray.rllib.utils.spaces.simplex import Simplex -from ray.rllib.utils.test_utils import check, check_compute_single_action, \ - framework_iterator -from ray.rllib.utils.torch_ops import convert_to_torch_tensor - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - - -class SimpleEnv(Env): - def __init__(self, config): - if config.get("simplex_actions", False): - self.action_space = Simplex((2, )) - else: - self.action_space = Box(0.0, 1.0, (1, )) - self.observation_space = Box(0.0, 1.0, (1, )) - self.max_steps = config.get("max_steps", 100) - self.state = None - self.steps = None - - def reset(self): - self.state = self.observation_space.sample() - self.steps = 0 - return self.state - - def step(self, action): - self.steps += 1 - # Reward is 1.0 - (max(actions) - state). - [r] = 1.0 - np.abs(np.max(action) - self.state) - d = self.steps >= self.max_steps - self.state = self.observation_space.sample() - return self.state, r, d, {} - - -class TestCQLSAC(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init(local_mode=True) - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_cqlsac_compilation(self): - """Tests whether an SACTrainer can be built with all frameworks.""" - config = CQLSAC_DEFAULT_CONFIG.copy() - config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy() - config["num_workers"] = 0 # Run locally. 
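# Illustrative sketch (not part of this patch): instead of pointing
# config["input"] at JSON files as the test above does, the input registry
# added by this patch lets a custom InputReader be registered under a name and
# selected by that name; the lookup happens in worker_set.py further down.
# The registry name, path, and config values below are made up for the example.
import ray.rllib.agents.cql as cql
from ray.rllib.offline.json_reader import JsonReader
from ray.tune.registry import register_input

def my_input_creator(ioctx):
    # ioctx.input_config mirrors config["input_config"] (see io_context.py).
    path = ioctx.input_config.get("path", "/tmp/out")
    return JsonReader(path, ioctx)

register_input("my_json_input", my_input_creator)

config = cql.CQL_DEFAULT_CONFIG.copy()
config["env"] = "Pendulum-v0"
config["input"] = "my_json_input"
config["input_config"] = {"path": "/tmp/out"}
trainer = cql.CQLTrainer(config=config)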
- config["twin_q"] = True - config["clip_actions"] = False - config["normalize_actions"] = True - config["learning_starts"] = 0 - config["prioritized_replay"] = False - config["train_batch_size"] = 256 #10 - config["input"] = "rllib/tests/data/moab/*.json" - config["input_evaluation"] = [] - config["bc_iters"] = 5 - config["temperature"] = 1.0 - config["num_actions"] = 10 - config["lagrangian"] = True # False - # Lagrangian Threshold - config["lagrangian_thresh"] = 5.0 - config["min_q_weight"] = 5.0 - # Initial value to use for the Alpha Prime (in CQL Loss). - config["initial_alpha_prime"] = 1.0 - config["evaluation_config"] = { - "input": "sampler", - "explore": False, - } - config["evaluation_interval"] = 1 - config["evaluation_num_episodes"] = 10 - config["evaluation_num_workers"] = 1 - - num_iterations = 1 - - ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel) - ModelCatalog.register_custom_model("batch_norm_torch", - TorchBatchNormModel) - - image_space = Box(-1.0, 1.0, shape=(84, 84, 3)) - simple_space = Box(-1.0, 1.0, shape=(3, )) - - # frameworks=("tf2", "tf", "tfe", "torch") - for fw in framework_iterator(config=config, frameworks=("tf2", "tf", "tfe")): - # Test for different env types (discrete w/ and w/o image, + cont). - for env in [ - # RandomEnv, - # "MsPacmanNoFrameskip-v4", - # "CartPole-v0", - MOAB_MOVE_TO_CENTER_ENV_NAME, - ]: - print("Env={}".format(env)) - if env == RandomEnv: - config["env_config"] = { - "observation_space": Tuple( - [simple_space, - Discrete(2), image_space]), - "action_space": Box(-1.0, 1.0, shape=(1, )), - } - else: - config["env_config"] = {} - # Test making the Q-model a custom one for CartPole, otherwise, - # use the default model. - config["Q_model"]["custom_model"] = "batch_norm{}".format( - "_torch" - if fw == "torch" else "") if env == "CartPole-v0" else None - trainer = CQLSACTrainer(config=config, env=env) - for i in range(num_iterations): - results = trainer.train() - print(results) - check_compute_single_action(trainer) - trainer.stop() - - @unittest.skip("TODO(Edi): Adapt...") - def test_cqlsac_loss_function(self): - self.skipTest("TODO(Edi): Adapt...") - """Tests SAC loss function results across all frameworks.""" - config = sac.DEFAULT_CONFIG.copy() - # Run locally. - config["num_workers"] = 0 - config["learning_starts"] = 0 - config["twin_q"] = False - config["gamma"] = 0.99 - # Switch on deterministic loss so we can compare the loss values. - config["_deterministic_loss"] = True - # Use very simple nets. - config["Q_model"]["fcnet_hiddens"] = [10] - config["policy_model"]["fcnet_hiddens"] = [10] - # Make sure, timing differences do not affect trainer.train(). - config["min_iter_time_s"] = 0 - # Test SAC with Simplex action space. - config["env_config"] = {"simplex_actions": True} - - map_ = { - # Action net. - "default_policy/fc_1/kernel": "action_model._hidden_layers.0." - "_model.0.weight", - "default_policy/fc_1/bias": "action_model._hidden_layers.0." - "_model.0.bias", - "default_policy/fc_out/kernel": "action_model." - "_logits._model.0.weight", - "default_policy/fc_out/bias": "action_model._logits._model.0.bias", - "default_policy/value_out/kernel": "action_model." - "_value_branch._model.0.weight", - "default_policy/value_out/bias": "action_model." - "_value_branch._model.0.bias", - # Q-net. - "default_policy/fc_1_1/kernel": "q_net." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_1/bias": "q_net." 
- "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_1/kernel": "q_net._logits._model.0.weight", - "default_policy/fc_out_1/bias": "q_net._logits._model.0.bias", - "default_policy/value_out_1/kernel": "q_net." - "_value_branch._model.0.weight", - "default_policy/value_out_1/bias": "q_net." - "_value_branch._model.0.bias", - "default_policy/log_alpha": "log_alpha", - # Target action-net. - "default_policy/fc_1_2/kernel": "action_model." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_2/bias": "action_model." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_2/kernel": "action_model." - "_logits._model.0.weight", - "default_policy/fc_out_2/bias": "action_model." - "_logits._model.0.bias", - "default_policy/value_out_2/kernel": "action_model." - "_value_branch._model.0.weight", - "default_policy/value_out_2/bias": "action_model." - "_value_branch._model.0.bias", - # Target Q-net - "default_policy/fc_1_3/kernel": "q_net." - "_hidden_layers.0._model.0.weight", - "default_policy/fc_1_3/bias": "q_net." - "_hidden_layers.0._model.0.bias", - "default_policy/fc_out_3/kernel": "q_net." - "_logits._model.0.weight", - "default_policy/fc_out_3/bias": "q_net." - "_logits._model.0.bias", - "default_policy/value_out_3/kernel": "q_net." - "_value_branch._model.0.weight", - "default_policy/value_out_3/bias": "q_net." - "_value_branch._model.0.bias", - "default_policy/log_alpha_1": "log_alpha", - } - - env = SimpleEnv - batch_size = 100 - if env is SimpleEnv: - obs_size = (batch_size, 1) - actions = np.random.random(size=(batch_size, 2)) - elif env == "CartPole-v0": - obs_size = (batch_size, 4) - actions = np.random.randint(0, 2, size=(batch_size, )) - else: - obs_size = (batch_size, 3) - actions = np.random.random(size=(batch_size, 1)) - - # Batch of size=n. - input_ = self._get_batch_helper(obs_size, actions, batch_size) - - # Simply compare loss values AND grads of all frameworks with each - # other. - prev_fw_loss = weights_dict = None - expect_c, expect_a, expect_e, expect_t = None, None, None, None - # History of tf-updated NN-weights over n training steps. - tf_updated_weights = [] - # History of input batches used. - tf_inputs = [] - for fw, sess in framework_iterator( - config, frameworks=("tf", "torch"), session=True): - # Generate Trainer and get its default Policy object. - trainer = sac.SACTrainer(config=config, env=env) - policy = trainer.get_policy() - p_sess = None - if sess: - p_sess = policy.get_session() - - # Set all weights (of all nets) to fixed values. - if weights_dict is None: - # Start with the tf vars-dict. - assert fw in ["tf2", "tf", "tfe"] - weights_dict = policy.get_weights() - if fw == "tfe": - log_alpha = weights_dict[10] - weights_dict = self._translate_tfe_weights( - weights_dict, map_) - else: - assert fw == "torch" # Then transfer that to torch Model. - model_dict = self._translate_weights_to_torch( - weights_dict, map_) - policy.model.load_state_dict(model_dict) - policy.target_model.load_state_dict(model_dict) - - if fw == "tf": - log_alpha = weights_dict["default_policy/log_alpha"] - elif fw == "torch": - # Actually convert to torch tensors (by accessing everything). - input_ = policy._lazy_tensor_dict(input_) - input_ = {k: input_[k] for k in input_.keys()} - log_alpha = policy.model.log_alpha.detach().cpu().numpy()[0] - - # Only run the expectation once, should be the same anyways - # for all frameworks. 
- if expect_c is None: - expect_c, expect_a, expect_e, expect_t = \ - self._sac_loss_helper(input_, weights_dict, - sorted(weights_dict.keys()), - log_alpha, fw, - gamma=config["gamma"], sess=sess) - - # Get actual outs and compare to expectation AND previous - # framework. c=critic, a=actor, e=entropy, t=td-error. - if fw == "tf": - c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = \ - p_sess.run([ - policy.critic_loss, - policy.actor_loss, - policy.alpha_loss, - policy.td_error, - policy.optimizer().compute_gradients( - policy.critic_loss[0], - [v for v in policy.model.q_variables() if - "value_" not in v.name]), - policy.optimizer().compute_gradients( - policy.actor_loss, - [v for v in policy.model.policy_variables() if - "value_" not in v.name]), - policy.optimizer().compute_gradients( - policy.alpha_loss, policy.model.log_alpha)], - feed_dict=policy._get_loss_inputs_dict( - input_, shuffle=False)) - tf_c_grads = [g for g, v in tf_c_grads] - tf_a_grads = [g for g, v in tf_a_grads] - tf_e_grads = [g for g, v in tf_e_grads] - - elif fw == "tfe": - with tf.GradientTape() as tape: - tf_loss(policy, policy.model, None, input_) - c, a, e, t = policy.critic_loss, policy.actor_loss, \ - policy.alpha_loss, policy.td_error - vars = tape.watched_variables() - tf_c_grads = tape.gradient(c[0], vars[6:10]) - tf_a_grads = tape.gradient(a, vars[2:6]) - tf_e_grads = tape.gradient(e, vars[10]) - - elif fw == "torch": - loss_torch(policy, policy.model, None, input_) - c, a, e, t = policy.critic_loss, policy.actor_loss, \ - policy.alpha_loss, policy.td_error - - # Test actor gradients. - policy.actor_optim.zero_grad() - assert all(v.grad is None for v in policy.model.q_variables()) - assert all( - v.grad is None for v in policy.model.policy_variables()) - assert policy.model.log_alpha.grad is None - a.backward() - # `actor_loss` depends on Q-net vars (but these grads must - # be ignored and overridden in critic_loss.backward!). - assert not all( - torch.mean(v.grad) == 0 - for v in policy.model.policy_variables()) - assert not all( - torch.min(v.grad) == 0 - for v in policy.model.policy_variables()) - assert policy.model.log_alpha.grad is None - # Compare with tf ones. - torch_a_grads = [ - v.grad for v in policy.model.policy_variables() - if v.grad is not None - ] - check(tf_a_grads[2], - np.transpose(torch_a_grads[0].detach().cpu())) - - # Test critic gradients. - policy.critic_optims[0].zero_grad() - assert all( - torch.mean(v.grad) == 0.0 - for v in policy.model.q_variables() if v.grad is not None) - assert all( - torch.min(v.grad) == 0.0 - for v in policy.model.q_variables() if v.grad is not None) - assert policy.model.log_alpha.grad is None - c[0].backward() - assert not all( - torch.mean(v.grad) == 0 - for v in policy.model.q_variables() if v.grad is not None) - assert not all( - torch.min(v.grad) == 0 for v in policy.model.q_variables() - if v.grad is not None) - assert policy.model.log_alpha.grad is None - # Compare with tf ones. - torch_c_grads = [v.grad for v in policy.model.q_variables()] - check(tf_c_grads[0], - np.transpose(torch_c_grads[2].detach().cpu())) - # Compare (unchanged(!) actor grads) with tf ones. - torch_a_grads = [ - v.grad for v in policy.model.policy_variables() - ] - check(tf_a_grads[2], - np.transpose(torch_a_grads[0].detach().cpu())) - - # Test alpha gradient. 
- policy.alpha_optim.zero_grad() - assert policy.model.log_alpha.grad is None - e.backward() - assert policy.model.log_alpha.grad is not None - check(policy.model.log_alpha.grad, tf_e_grads) - - check(c, expect_c) - check(a, expect_a) - check(e, expect_e) - check(t, expect_t) - - # Store this framework's losses in prev_fw_loss to compare with - # next framework's outputs. - if prev_fw_loss is not None: - check(c, prev_fw_loss[0]) - check(a, prev_fw_loss[1]) - check(e, prev_fw_loss[2]) - check(t, prev_fw_loss[3]) - - prev_fw_loss = (c, a, e, t) - - # Update weights from our batch (n times). - for update_iteration in range(5): - print("train iteration {}".format(update_iteration)) - if fw == "tf": - in_ = self._get_batch_helper(obs_size, actions, batch_size) - tf_inputs.append(in_) - # Set a fake-batch to use - # (instead of sampling from replay buffer). - buf = LocalReplayBuffer.get_instance_for_testing() - buf._fake_batch = in_ - trainer.train() - updated_weights = policy.get_weights() - # Net must have changed. - if tf_updated_weights: - check( - updated_weights["default_policy/fc_1/kernel"], - tf_updated_weights[-1][ - "default_policy/fc_1/kernel"], - false=True) - tf_updated_weights.append(updated_weights) - - # Compare with updated tf-weights. Must all be the same. - else: - tf_weights = tf_updated_weights[update_iteration] - in_ = tf_inputs[update_iteration] - # Set a fake-batch to use - # (instead of sampling from replay buffer). - buf = LocalReplayBuffer.get_instance_for_testing() - buf._fake_batch = in_ - trainer.train() - # Compare updated model. - for tf_key in sorted(tf_weights.keys()): - if re.search("_[23]|alpha", tf_key): - continue - tf_var = tf_weights[tf_key] - torch_var = policy.model.state_dict()[map_[tf_key]] - if tf_var.shape != torch_var.shape: - check( - tf_var, - np.transpose(torch_var.detach().cpu()), - rtol=0.05) - else: - check(tf_var, torch_var, rtol=0.05) - # And alpha. - check(policy.model.log_alpha, - tf_weights["default_policy/log_alpha"]) - # Compare target nets. 
- for tf_key in sorted(tf_weights.keys()): - if not re.search("_[23]", tf_key): - continue - tf_var = tf_weights[tf_key] - torch_var = policy.target_model.state_dict()[map_[ - tf_key]] - if tf_var.shape != torch_var.shape: - check( - tf_var, - np.transpose(torch_var.detach().cpu()), - rtol=0.05) - else: - check(tf_var, torch_var, rtol=0.05) - - def _get_batch_helper(self, obs_size, actions, batch_size): - return { - SampleBatch.CUR_OBS: np.random.random(size=obs_size), - SampleBatch.ACTIONS: actions, - SampleBatch.REWARDS: np.random.random(size=(batch_size, )), - SampleBatch.DONES: np.random.choice( - [True, False], size=(batch_size, )), - SampleBatch.NEXT_OBS: np.random.random(size=obs_size), - "weights": np.random.random(size=(batch_size, )), - } - - def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma, - sess): - """Emulates SAC loss functions for tf and torch.""" - # ks: - # 0=log_alpha - # 1=target log-alpha (not used) - - # 2=action hidden bias - # 3=action hidden kernel - # 4=action out bias - # 5=action out kernel - - # 6=Q hidden bias - # 7=Q hidden kernel - # 8=Q out bias - # 9=Q out kernel - - # 14=target Q hidden bias - # 15=target Q hidden kernel - # 16=target Q out bias - # 17=target Q out kernel - alpha = np.exp(log_alpha) - # cls = TorchSquashedGaussian if fw == "torch" else SquashedGaussian - cls = TorchDirichlet if fw == "torch" else Dirichlet - model_out_t = train_batch[SampleBatch.CUR_OBS] - model_out_tp1 = train_batch[SampleBatch.NEXT_OBS] - target_model_out_tp1 = train_batch[SampleBatch.NEXT_OBS] - - # get_policy_output - action_dist_t = cls( - fc( - relu( - fc(model_out_t, - weights[ks[1]], - weights[ks[0]], - framework=fw)), weights[ks[9]], weights[ks[8]]), None) - policy_t = action_dist_t.deterministic_sample() - log_pis_t = action_dist_t.logp(policy_t) - if sess: - log_pis_t = sess.run(log_pis_t) - policy_t = sess.run(policy_t) - log_pis_t = np.expand_dims(log_pis_t, -1) - - # Get policy output for t+1. - action_dist_tp1 = cls( - fc( - relu( - fc(model_out_tp1, - weights[ks[1]], - weights[ks[0]], - framework=fw)), weights[ks[9]], weights[ks[8]]), None) - policy_tp1 = action_dist_tp1.deterministic_sample() - log_pis_tp1 = action_dist_tp1.logp(policy_tp1) - if sess: - log_pis_tp1 = sess.run(log_pis_tp1) - policy_tp1 = sess.run(policy_tp1) - log_pis_tp1 = np.expand_dims(log_pis_tp1, -1) - - # Q-values for the actually selected actions. - # get_q_values - q_t = fc( - relu( - fc(np.concatenate( - [model_out_t, train_batch[SampleBatch.ACTIONS]], -1), - weights[ks[3]], - weights[ks[2]], - framework=fw)), - weights[ks[11]], - weights[ks[10]], - framework=fw) - - # Q-values for current policy in given current state. - # get_q_values - q_t_det_policy = fc( - relu( - fc(np.concatenate([model_out_t, policy_t], -1), - weights[ks[3]], - weights[ks[2]], - framework=fw)), - weights[ks[11]], - weights[ks[10]], - framework=fw) - - # Target q network evaluation. 
- # target_model.get_q_values - if fw == "tf": - q_tp1 = fc( - relu( - fc(np.concatenate([target_model_out_tp1, policy_tp1], -1), - weights[ks[7]], - weights[ks[6]], - framework=fw)), - weights[ks[15]], - weights[ks[14]], - framework=fw) - else: - assert fw == "tfe" - q_tp1 = fc( - relu( - fc(np.concatenate([target_model_out_tp1, policy_tp1], -1), - weights[ks[7]], - weights[ks[6]], - framework=fw)), - weights[ks[9]], - weights[ks[8]], - framework=fw) - - q_t_selected = np.squeeze(q_t, axis=-1) - q_tp1 -= alpha * log_pis_tp1 - q_tp1_best = np.squeeze(q_tp1, axis=-1) - dones = train_batch[SampleBatch.DONES] - rewards = train_batch[SampleBatch.REWARDS] - if fw == "torch": - dones = dones.float().numpy() - rewards = rewards.numpy() - q_tp1_best_masked = (1.0 - dones) * q_tp1_best - q_t_selected_target = rewards + gamma * q_tp1_best_masked - base_td_error = np.abs(q_t_selected - q_t_selected_target) - td_error = base_td_error - critic_loss = [ - np.mean(train_batch["weights"] * - huber_loss(q_t_selected_target - q_t_selected)) - ] - target_entropy = -np.prod((1, )) - alpha_loss = -np.mean(log_alpha * (log_pis_t + target_entropy)) - actor_loss = np.mean(alpha * log_pis_t - q_t_det_policy) - - return critic_loss, actor_loss, alpha_loss, td_error - - def _translate_weights_to_torch(self, weights_dict, map_): - model_dict = { - map_[k]: convert_to_torch_tensor( - np.transpose(v) if re.search("kernel", k) else np.array([v]) - if re.search("log_alpha", k) else v) - for i, (k, v) in enumerate(weights_dict.items()) if i < 13 - } - - return model_dict - - def _translate_tfe_weights(self, weights_dict, map_): - model_dict = { - "default_policy/log_alpha": None, - "default_policy/log_alpha_target": None, - "default_policy/sequential/action_1/kernel": weights_dict[2], - "default_policy/sequential/action_1/bias": weights_dict[3], - "default_policy/sequential/action_out/kernel": weights_dict[4], - "default_policy/sequential/action_out/bias": weights_dict[5], - "default_policy/sequential_1/q_hidden_0/kernel": weights_dict[6], - "default_policy/sequential_1/q_hidden_0/bias": weights_dict[7], - "default_policy/sequential_1/q_out/kernel": weights_dict[8], - "default_policy/sequential_1/q_out/bias": weights_dict[9], - "default_policy/value_out/kernel": weights_dict[0], - "default_policy/value_out/bias": weights_dict[1], - } - return model_dict - - -if __name__ == "__main__": - import pytest - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/agents/registry.py b/rllib/agents/registry.py index a141783c5b2a..8354c5ce6fd2 100644 --- a/rllib/agents/registry.py +++ b/rllib/agents/registry.py @@ -41,9 +41,9 @@ def _import_bc(): return marwil.BCTrainer, marwil.DEFAULT_CONFIG -def _import_cql_sac(): +def _import_cql(): from ray.rllib.agents import cql - return cql.CQLSACTrainer, cql.CQLSAC_DEFAULT_CONFIG + return cql.CQLTrainer, cql.CQL_DEFAULT_CONFIG def _import_ddpg(): @@ -144,7 +144,7 @@ def _import_td3(): "APPO": _import_appo, "ARS": _import_ars, "BC": _import_bc, - "CQL_SAC": _import_cql_sac, + "CQL": _import_cql, "ES": _import_es, "DDPG": _import_ddpg, "DDPPO": _import_ddppo, diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 1ec3a5ad7978..bb269b45d1ce 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -502,7 +502,8 @@ class Trainer(Trainable): "tf_session_args", "local_tf_session_args", "env_config", "model", "optimizer", "multiagent", "custom_resources_per_worker", "evaluation_config", "exploration_config", - "extra_python_environs_for_driver", 
"extra_python_environs_for_worker" + "extra_python_environs_for_driver", "extra_python_environs_for_worker", + "input_config" ] # List of top level keys with value=dict, for which we always override the diff --git a/rllib/agents/trainer_factory.py b/rllib/agents/trainer_factory.py index 95051ea15b22..35872d1549ef 100644 --- a/rllib/agents/trainer_factory.py +++ b/rllib/agents/trainer_factory.py @@ -40,9 +40,7 @@ class DiscreteActionSpaceAlgorithm(Algorithm): DQN = (dqn.DQNTrainer, dqn.DEFAULT_CONFIG) PPO = (ppo.PPOTrainer, ppo.DEFAULT_CONFIG) SAC = (sac.SACTrainer, sac.DEFAULT_CONFIG) - CQL_SAC = (cql.CQLSACTrainer, cql.CQLSAC_DEFAULT_CONFIG) - CQL_APEX_SAC = (cql.CQLApexSACTrainer, cql.CQLAPEXSAC_DEFAULT_CONFIG) - CQL_DQN = (cql.CQLDQNTrainer, cql.CQLDQN_DEFAULT_CONFIG) + CQL = (cql.CQLTrainer, cql.CQL_DEFAULT_CONFIG) class ContinuousActionSpaceAlgorithm(Algorithm): @@ -57,8 +55,7 @@ class ContinuousActionSpaceAlgorithm(Algorithm): TD3 = (ddpg.TD3Trainer, ddpg.td3.TD3_DEFAULT_CONFIG) PPO = (ppo.PPOTrainer, ppo.DEFAULT_CONFIG) SAC = (sac.SACTrainer, sac.DEFAULT_CONFIG) - CQL_SAC = (cql.CQLSACTrainer, cql.CQLSAC_DEFAULT_CONFIG) - CQL_APEX_SAC = (cql.CQLApexSACTrainer, cql.CQLAPEXSAC_DEFAULT_CONFIG) + CQL = (cql.CQLTrainer, cql.CQL_DEFAULT_CONFIG) def trainer_factory( diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index ac2b23477127..0f04491e1d9e 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -1,5 +1,6 @@ import gym import logging +import importlib.util from types import FunctionType from typing import Callable, Dict, Generic, List, Optional, Tuple, Type, TypeVar, Union @@ -14,6 +15,8 @@ from ray.rllib.utils import merge_dicts from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import PolicyID, TrainerConfigDict, EnvType +from ray.rllib.utils.from_config import from_config +from ray.tune.registry import registry_contains_input, registry_get_input tf1, tf, tfv = try_import_tf() @@ -259,17 +262,36 @@ def session_creator(): return tf1.Session( config=tf1.ConfigProto(**config["tf_session_args"])) + def valid_module(class_path): + if isinstance(class_path, str) and "." 
in class_path: + module_path, class_name = class_path.rsplit(".", 1) + try: + spec = importlib.util.find_spec(module_path) + if spec is not None: + return True + except (ModuleNotFoundError, ValueError): + print( + f"module {module_path} not found while trying to get " + + f"input {class_path}") + return False + if isinstance(config["input"], FunctionType): input_creator = config["input"] elif config["input"] == "sampler": input_creator = (lambda ioctx: ioctx.default_sampler_input()) + elif isinstance(config["input"], str) and \ + registry_contains_input(config["input"]): + input_creator = registry_get_input(config["input"]) elif isinstance(config["input"], dict): input_creator = ( lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx), config["shuffle_buffer_size"])) elif "d4rl" in config["input"]: - env_name = config["input"].split(".")[1] + env_name = config["input"].split(".")[-1] input_creator = (lambda ioctx: D4RLReader(env_name, ioctx)) + elif valid_module(config["input"]): + input_creator = (lambda ioctx: ShuffledInput(from_config( + config["input"], ioctx=ioctx))) else: input_creator = ( lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx), diff --git a/rllib/execution/metric_ops.py b/rllib/execution/metric_ops.py index 42a561ac169a..dce0f9775c74 100644 --- a/rllib/execution/metric_ops.py +++ b/rllib/execution/metric_ops.py @@ -1,10 +1,11 @@ -from typing import Any, List, Dict, Callable, Optional, Sequence +from typing import Any, List, Dict import time +from ray.actor import ActorHandle from ray.util.iter import LocalIterator from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes from ray.rllib.execution.common import AGENT_STEPS_SAMPLED_COUNTER, \ - STEPS_SAMPLED_COUNTER, TIMESTEPS_TOTAL, _get_shared_metrics + STEPS_SAMPLED_COUNTER, STEPS_TRAINED_COUNTER, _get_shared_metrics from ray.rllib.evaluation.worker_set import WorkerSet @@ -12,7 +13,9 @@ def StandardMetricsReporting( train_op: LocalIterator[Any], workers: WorkerSet, config: dict, - selected_workers: List["ActorHandle"] = None) -> LocalIterator[dict]: + selected_workers: List[ActorHandle] = None, + by_steps_trained: bool = False, +) -> LocalIterator[dict]: """Operator to periodically collect and report metrics. Args: @@ -23,6 +26,8 @@ def StandardMetricsReporting( of stats reporting. selected_workers (list): Override the list of remote workers to collect metrics from. + by_steps_trained (bool): If True, uses the `STEPS_TRAINED_COUNTER` + instead of the `STEPS_SAMPLED_COUNTER` in metrics. Returns: LocalIterator[dict]: A local iterator over training results. 
@@ -33,15 +38,17 @@ def StandardMetricsReporting( >>> next(metrics_op) {"episode_reward_max": ..., "episode_reward_mean": ..., ...} """ - custom_summarize_episodes = config.get("custom_summarize_episodes_callback") + output_op = train_op \ - .filter(OncePerTimestepsElapsed(config["timesteps_per_iteration"])) \ + .filter(OncePerTimestepsElapsed(config["timesteps_per_iteration"], + by_steps_trained=by_steps_trained)) \ .filter(OncePerTimeInterval(config["min_iter_time_s"])) \ .for_each(CollectMetrics( - workers, min_history=config["metrics_smoothing_episodes"], + workers, + min_history=config["metrics_smoothing_episodes"], timeout_seconds=config["collect_metrics_timeout"], selected_workers=selected_workers, - custom_summarize_episodes=custom_summarize_episodes)) + by_steps_trained=by_steps_trained,)) return output_op @@ -63,11 +70,8 @@ def __init__(self, workers: WorkerSet, min_history: int = 100, timeout_seconds: int = 180, - selected_workers: List["ActorHandle"] = None, - custom_summarize_episodes: Optional[ - Callable[[Sequence[Any], Sequence[Any], Dict[str, Any]], - Dict[str, Any]] - ] = None, + selected_workers: List[ActorHandle] = None, + by_steps_trained: bool = False, ): self.workers = workers self.episode_history = [] @@ -75,7 +79,7 @@ def __init__(self, self.min_history = min_history self.timeout_seconds = timeout_seconds self.selected_workers = selected_workers - self._custom_summarize_episodes = custom_summarize_episodes + self.by_steps_trained = by_steps_trained def __call__(self, _: Any) -> Dict: # Collect worker metrics. @@ -87,13 +91,11 @@ def __call__(self, _: Any) -> Dict: orig_episodes = list(episodes) missing = self.min_history - len(episodes) if missing > 0: - episodes.extend(self.episode_history[-missing:]) + episodes = self.episode_history[-missing:] + episodes assert len(episodes) <= self.min_history self.episode_history.extend(orig_episodes) self.episode_history = self.episode_history[-self.min_history:] res = summarize_episodes(episodes, orig_episodes) - if self._custom_summarize_episodes: - res = self._custom_summarize_episodes(episodes, orig_episodes, res) # Add in iterator metrics. metrics = _get_shared_metrics() @@ -111,7 +113,11 @@ def __call__(self, _: Any) -> Dict: timer.mean_throughput, 3) res.update({ "num_healthy_workers": len(self.workers.remote_workers()), - TIMESTEPS_TOTAL: metrics.counters[STEPS_SAMPLED_COUNTER], + "timesteps_total": ( + metrics.counters[STEPS_TRAINED_COUNTER] + if self.by_steps_trained + else metrics.counters[STEPS_SAMPLED_COUNTER] + ), "agent_timesteps_total": metrics.counters.get( AGENT_STEPS_SAMPLED_COUNTER, 0), }) @@ -166,15 +172,26 @@ class OncePerTimestepsElapsed: # will only return after 1000 steps have elapsed """ - def __init__(self, delay_steps: int): + def __init__(self, delay_steps: int, by_steps_trained: bool = False): + """ + Args: + delay_steps (int): The number of steps (sampled or trained) every + which this op returns True. + by_steps_trained (bool): If True, uses the `STEPS_TRAINED_COUNTER` + instead of the `STEPS_SAMPLED_COUNTER` in metrics. 
+ """ self.delay_steps = delay_steps + self.by_steps_trained = by_steps_trained self.last_called = 0 def __call__(self, item: Any) -> bool: if self.delay_steps <= 0: return True metrics = _get_shared_metrics() - now = metrics.counters[STEPS_SAMPLED_COUNTER] + if self.by_steps_trained: + now = metrics.counters[STEPS_TRAINED_COUNTER] + else: + now = metrics.counters[STEPS_SAMPLED_COUNTER] if now - self.last_called >= self.delay_steps: self.last_called = now return True diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index 06c182372da8..0503174e326b 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -1,8 +1,8 @@ from math import log import numpy as np import functools -import tree import gym +import tree # pip install dm_tree from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.modelv2 import ModelV2 @@ -103,26 +103,38 @@ def required_model_output_shape(action_space, model_config): class MultiCategorical(TFActionDistribution): """MultiCategorical distribution for MultiDiscrete action spaces.""" - def __init__(self, inputs: List[TensorType], model: ModelV2, - input_lens: Union[List[int], np.ndarray, Tuple[int, ...]]): + def __init__(self, + inputs: List[TensorType], + model: ModelV2, + input_lens: Union[List[int], np.ndarray, Tuple[int, ...]], + action_space=None): # skip TFActionDistribution init ActionDistribution.__init__(self, inputs, model) self.cats = [ Categorical(input_, model) for input_ in tf.split(inputs, input_lens, axis=1) ] + self.action_space = action_space self.sample_op = self._build_sample_op() self.sampled_action_logp_op = self.logp(self.sample_op) @override(ActionDistribution) def deterministic_sample(self) -> TensorType: - return tf.stack( + sample_ = tf.stack( [cat.deterministic_sample() for cat in self.cats], axis=1) + if isinstance(self.action_space, gym.spaces.Box): + return tf.cast( + tf.reshape(sample_, [-1] + list(self.action_space.shape)), + self.action_space.dtype) + return sample_ @override(ActionDistribution) def logp(self, actions: TensorType) -> TensorType: # If tensor is provided, unstack it into list. if isinstance(actions, tf.Tensor): + if isinstance(self.action_space, gym.spaces.Box): + actions = tf.reshape( + actions, [-1, int(np.product(self.action_space.shape))]) actions = tf.unstack(tf.cast(actions, tf.int32), axis=1) logps = tf.stack( [cat.logp(act) for cat, act in zip(self.cats, actions)]) @@ -148,14 +160,29 @@ def kl(self, other: ActionDistribution) -> TensorType: @override(TFActionDistribution) def _build_sample_op(self) -> TensorType: - return tf.stack([cat.sample() for cat in self.cats], axis=1) + sample_op = tf.stack([cat.sample() for cat in self.cats], axis=1) + if isinstance(self.action_space, gym.spaces.Box): + return tf.cast( + tf.reshape(sample_op, [-1] + list(self.action_space.shape)), + dtype=self.action_space.dtype) + return sample_op @staticmethod @override(ActionDistribution) def required_model_output_shape( action_space: gym.Space, model_config: ModelConfigDict) -> Union[int, np.ndarray]: - return np.sum(action_space.nvec) + # Int Box. + if isinstance(action_space, gym.spaces.Box): + assert action_space.dtype.name.startswith("int") + low_ = np.min(action_space.low) + high_ = np.max(action_space.high) + assert np.all(action_space.low == low_) + assert np.all(action_space.high == high_) + np.product(action_space.shape) * (high_ - low_ + 1) + # MultiDiscrete space. 
+ else: + return np.sum(action_space.nvec) class GumbelSoftmax(TFActionDistribution): @@ -322,7 +349,7 @@ def _build_sample_op(self) -> TensorType: @override(ActionDistribution) def logp(self, x: TensorType) -> TensorType: # Unsquash values (from [low,high] to ]-inf,inf[) - unsquashed_values = self._unsquash(x) + unsquashed_values = tf.cast(self._unsquash(x), self.inputs.dtype) # Get log prob of unsquashed values from our Normal. log_prob_gaussian = self.distr.log_prob(unsquashed_values) # For safety reasons, clamp somehow, only then sum up. @@ -335,6 +362,14 @@ def logp(self, x: TensorType) -> TensorType: axis=-1) return log_prob + def sample_logp(self): + z = self.distr.sample() + actions = self._squash(z) + return actions, tf.reduce_sum( + self.distr.log_prob(z) - + tf.math.log(1 - actions * actions + SMALL_NUMBER), + axis=-1) + @override(ActionDistribution) def entropy(self) -> TensorType: raise ValueError("Entropy not defined for SquashedGaussian!") diff --git a/rllib/offline/d4rl_reader.py b/rllib/offline/d4rl_reader.py index 2c02af08868c..d191d65c61f3 100644 --- a/rllib/offline/d4rl_reader.py +++ b/rllib/offline/d4rl_reader.py @@ -27,14 +27,12 @@ def __init__(self, inputs: str, ioctx: IOContext = None): self.env = gym.make(inputs) self.dataset = convert_to_batch(d4rl.qlearning_dataset(self.env)) assert self.dataset.count >= 1 - self.dataset.shuffle() self.counter = 0 @override(InputReader) def next(self) -> SampleBatchType: if self.counter >= self.dataset.count: self.counter = 0 - self.dataset.shuffle() self.counter += 1 return self.dataset.slice(start=self.counter, end=self.counter + 1) diff --git a/rllib/offline/io_context.py b/rllib/offline/io_context.py index b0323065d87b..f13103b7f295 100644 --- a/rllib/offline/io_context.py +++ b/rllib/offline/io_context.py @@ -16,6 +16,7 @@ class IOContext: worker_index (int): When there are multiple workers created, this uniquely identifies the current worker. worker (RolloutWorker): RolloutWorker object reference. + input_config (dict): The input configuration for custom input. """ @PublicAPI @@ -32,3 +33,8 @@ def __init__(self, @PublicAPI def default_sampler_input(self) -> Any: return self.worker.sampler + + @PublicAPI + @property + def input_config(self): + return self.config.get("input_config", {}) diff --git a/rllib/offline/is_estimator.py b/rllib/offline/is_estimator.py index 619cc0deee2d..119eb2e1c97f 100644 --- a/rllib/offline/is_estimator.py +++ b/rllib/offline/is_estimator.py @@ -1,40 +1,40 @@ -from ray.rllib.offline.off_policy_estimator import OffPolicyEstimator, \ - OffPolicyEstimate -from ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import SampleBatchType - - -class ImportanceSamplingEstimator(OffPolicyEstimator): - """The step-wise IS estimator. 
- - Step-wise IS estimator described in https://arxiv.org/pdf/1511.03722.pdf""" - - @override(OffPolicyEstimator) - def estimate(self, batch: SampleBatchType) -> OffPolicyEstimate: - self.check_can_estimate_for(batch) - - rewards, old_prob = batch["rewards"], batch["action_prob"] - new_prob = self.action_prob(batch) - - # calculate importance ratios - p = [] - for t in range(batch.count): - if t == 0: - pt_prev = 1.0 - else: - pt_prev = p[t - 1] - p.append(pt_prev * new_prob[t] / old_prob[t]) - - # calculate stepwise IS estimate - V_prev, V_step_IS = 0.0, 0.0 - for t in range(batch.count): - V_prev += rewards[t] * self.gamma**t - V_step_IS += p[t] * rewards[t] * self.gamma**t - - estimation = OffPolicyEstimate( - "is", { - "V_prev": V_prev, - "V_step_IS": V_step_IS, - "V_gain_est": V_step_IS / max(1e-8, V_prev), - }) - return estimation +from ray.rllib.offline.off_policy_estimator import OffPolicyEstimator, \ + OffPolicyEstimate +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import SampleBatchType + + +class ImportanceSamplingEstimator(OffPolicyEstimator): + """The step-wise IS estimator. + + Step-wise IS estimator described in https://arxiv.org/pdf/1511.03722.pdf""" + + @override(OffPolicyEstimator) + def estimate(self, batch: SampleBatchType) -> OffPolicyEstimate: + self.check_can_estimate_for(batch) + + rewards, old_prob = batch["rewards"], batch["action_prob"] + new_prob = self.action_prob(batch) + + # calculate importance ratios + p = [] + for t in range(batch.count): + if t == 0: + pt_prev = 1.0 + else: + pt_prev = p[t - 1] + p.append(pt_prev * new_prob[t] / old_prob[t]) + + # calculate stepwise IS estimate + V_prev, V_step_IS = 0.0, 0.0 + for t in range(batch.count): + V_prev += rewards[t] * self.gamma**t + V_step_IS += p[t] * rewards[t] * self.gamma**t + + estimation = OffPolicyEstimate( + "is", { + "V_prev": V_prev, + "V_step_IS": V_step_IS, + "V_gain_est": V_step_IS / max(1e-8, V_prev), + }) + return estimation diff --git a/rllib/offline/json_reader.py b/rllib/offline/json_reader.py index c84676d03655..2177ea27ef98 100644 --- a/rllib/offline/json_reader.py +++ b/rllib/offline/json_reader.py @@ -2,8 +2,12 @@ import json import logging import os +from pathlib import Path import random +import re +from typing import List, Optional from urllib.parse import urlparse +import zipfile try: from smart_open import smart_open @@ -16,8 +20,8 @@ SampleBatch from ray.rllib.utils.annotations import override, PublicAPI from ray.rllib.utils.compression import unpack_if_needed +from ray.rllib.utils.spaces.space_utils import clip_action, normalize_action from ray.rllib.utils.typing import FileType, SampleBatchType -from typing import List logger = logging.getLogger(__name__) @@ -35,10 +39,10 @@ def __init__(self, inputs: List[str], ioctx: IOContext = None): """Initialize a JsonReader. Args: - inputs (str|list): either a glob expression for files, e.g., + inputs (str|list): Either a glob expression for files, e.g., "/tmp/**/*.json", or a list of single file paths or URIs, e.g., ["s3://bucket/file.json", "s3://bucket/file2.json"]. - ioctx (IOContext): current IO context object. + ioctx (IOContext): Current IO context object. 
""" self.ioctx = ioctx or IOContext() @@ -49,16 +53,25 @@ def __init__(self, inputs: List[str], ioctx: IOContext = None): if isinstance(inputs, str): inputs = os.path.abspath(os.path.expanduser(inputs)) if os.path.isdir(inputs): - inputs = os.path.join(inputs, "*.json") + inputs = [ + os.path.join(inputs, "*.json"), + os.path.join(inputs, "*.zip") + ] logger.warning( - "Treating input directory as glob pattern: {}".format( - inputs)) - if urlparse(inputs).scheme not in [""] + WINDOWS_DRIVES: + f"Treating input directory as glob patterns: {inputs}") + else: + inputs = [inputs] + + if any( + urlparse(i).scheme not in [""] + WINDOWS_DRIVES + for i in inputs): raise ValueError( "Don't know how to glob over `{}`, ".format(inputs) + "please specify a list of files to read instead.") else: - self.files = glob.glob(inputs) + self.files = [] + for i in inputs: + self.files.extend(glob.glob(i)) elif type(inputs) is list: self.files = inputs else: @@ -82,6 +95,7 @@ def next(self) -> SampleBatchType: raise ValueError( "Failed to read valid experience batch from file: {}".format( self.cur_file)) + return self._postprocess_if_needed(batch) def _postprocess_if_needed(self, @@ -101,17 +115,85 @@ def _postprocess_if_needed(self, raise NotImplementedError( "Postprocessing of multi-agent data not implemented yet.") - def _try_parse(self, line: str) -> SampleBatchType: + def _try_open_file(self, path): + if urlparse(path).scheme not in [""] + WINDOWS_DRIVES: + if smart_open is None: + raise ValueError( + "You must install the `smart_open` module to read " + "from URIs like {}".format(path)) + ctx = smart_open + else: + # Allow shortcut for home directory ("~/" -> env[HOME]). + if path.startswith("~/"): + path = os.path.join(os.environ.get("HOME", ""), path[2:]) + + # If path doesn't exist, try to interpret is as relative to the + # rllib directory (located ../../ from this very module). + path_orig = path + if not os.path.exists(path): + path = os.path.join(Path(__file__).parent.parent, path) + if not os.path.exists(path): + raise FileNotFoundError(f"Offline file {path_orig} not found!") + + # Unzip files, if necessary and re-point to extracted json file. + if re.search("\\.zip$", path): + with zipfile.ZipFile(path, "r") as zip_ref: + zip_ref.extractall(Path(path).parent) + path = re.sub("\\.zip$", ".json", path) + assert os.path.exists(path) + ctx = open + file = ctx(path, "r") + return file + + def _try_parse(self, line: str) -> Optional[SampleBatchType]: line = line.strip() if not line: return None try: - return _from_json(line) + batch = _from_json(line) except Exception: logger.exception("Ignoring corrupt json record in {}: {}".format( self.cur_file, line)) return None + # Clip actions (from any values into env's bounds), if necessary. + cfg = self.ioctx.config + if cfg.get("clip_actions"): + if isinstance(batch, SampleBatch): + batch[SampleBatch.ACTIONS] = clip_action( + batch[SampleBatch.ACTIONS], self.ioctx.worker.policy_map[ + "default_policy"].action_space_struct) + else: + for pid, b in batch.policy_batches.items(): + b[SampleBatch.ACTIONS] = clip_action( + b[SampleBatch.ACTIONS], + self.ioctx.worker.policy_map[pid].action_space_struct) + # Re-normalize actions (from env's bounds to 0.0 centered), if + # necessary. 
+ if cfg.get("actions_in_input_normalized") is False: + if isinstance(batch, SampleBatch): + batch[SampleBatch.ACTIONS] = normalize_action( + batch[SampleBatch.ACTIONS], self.ioctx.worker.policy_map[ + "default_policy"].action_space_struct) + else: + for pid, b in batch.policy_batches.items(): + b[SampleBatch.ACTIONS] = normalize_action( + b[SampleBatch.ACTIONS], + self.ioctx.worker.policy_map[pid].action_space_struct) + return batch + + def read_all_files(self): + for path in self.files: + file = self._try_open_file(path) + while True: + line = file.readline() + if not line: + break + batch = self._try_parse(line) + if batch is None: + break + yield batch + def _next_line(self) -> str: if not self.cur_file: self.cur_file = self._next_file() @@ -131,15 +213,16 @@ def _next_line(self) -> str: return line def _next_file(self) -> FileType: - path = random.choice(self.files) - if urlparse(path).scheme not in [""] + WINDOWS_DRIVES: - if smart_open is None: - raise ValueError( - "You must install the `smart_open` module to read " - "from URIs like {}".format(path)) - return smart_open(path, "r") + # If this is the first time, we open a file, make sure all workers + # start with a different one if possible. + if self.cur_file is None and self.ioctx.worker is not None: + idx = self.ioctx.worker.worker_index + total = self.ioctx.worker.num_workers or 1 + path = self.files[round((len(self.files) - 1) * (idx / total))] + # After the first file, pick all others randomly. else: - return open(path, "r") + path = random.choice(self.files) + return self._try_open_file(path) def _from_json(batch: str) -> SampleBatchType: diff --git a/rllib/offline/json_writer.py b/rllib/offline/json_writer.py index 73f883b917d8..d3c849684e49 100644 --- a/rllib/offline/json_writer.py +++ b/rllib/offline/json_writer.py @@ -114,7 +114,7 @@ def _to_json(batch: SampleBatchType, compress_columns: List[str]) -> str: policy_batches = {} for policy_id, sub_batch in batch.policy_batches.items(): policy_batches[policy_id] = {} - for k, v in sub_batch.data.items(): + for k, v in sub_batch.items(): policy_batches[policy_id][k] = _to_jsonable( v, compress=k in compress_columns) out["policy_batches"] = policy_batches diff --git a/rllib/offline/mixed_input.py b/rllib/offline/mixed_input.py index 69e92304da38..f0f40b1f6532 100644 --- a/rllib/offline/mixed_input.py +++ b/rllib/offline/mixed_input.py @@ -1,11 +1,13 @@ -import numpy as np +from types import FunctionType +from typing import Dict +import numpy as np from ray.rllib.offline.input_reader import InputReader -from ray.rllib.offline.json_reader import JsonReader from ray.rllib.offline.io_context import IOContext +from ray.rllib.offline.json_reader import JsonReader from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.typing import SampleBatchType -from typing import Dict +from ray.tune.registry import registry_get_input, registry_contains_input @DeveloperAPI @@ -36,6 +38,11 @@ def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext): for k, v in dist.items(): if k == "sampler": self.choices.append(ioctx.default_sampler_input()) + elif isinstance(k, FunctionType): + self.choices.append(k(ioctx)) + elif isinstance(k, str) and registry_contains_input(k): + input_creator = registry_get_input(k) + self.choices.append(input_creator(ioctx)) else: self.choices.append(JsonReader(k, ioctx)) self.p.append(v) diff --git a/rllib/tests/agents/parameters.py b/rllib/tests/agents/parameters.py index 058a9d9c56c3..45f6d516f8a2 100644 --- 
a/rllib/tests/agents/parameters.py +++ b/rllib/tests/agents/parameters.py @@ -220,7 +220,7 @@ def astuple(self): config_updates={"num_workers": 0}, ), TestAgentParams.for_moab_move_to_center( - algorithm=ContinuousActionSpaceAlgorithm.CQL_SAC, + algorithm=ContinuousActionSpaceAlgorithm.CQL, config_updates={ # Common Configs "num_workers": 0, @@ -300,7 +300,7 @@ def astuple(self): # frameworks=[Framework.TensorFlow], # ), TestAgentParams.for_moab_move_to_center( - algorithm=ContinuousActionSpaceAlgorithm.CQL_SAC, + algorithm=ContinuousActionSpaceAlgorithm.CQL, config_updates={ # Common Configs "num_workers": 0, @@ -335,7 +335,7 @@ def astuple(self): frameworks=[Framework.TensorFlow], ), TestAgentParams.for_moab_move_to_center( - algorithm=ContinuousActionSpaceAlgorithm.CQL_SAC, + algorithm=ContinuousActionSpaceAlgorithm.CQL, config_updates={ # Common Configs "num_workers": 0, @@ -371,7 +371,7 @@ def astuple(self): frameworks=[Framework.TensorFlow], ), TestAgentParams.for_moab_move_to_center( - algorithm=ContinuousActionSpaceAlgorithm.CQL_SAC, + algorithm=ContinuousActionSpaceAlgorithm.CQL, config_updates={ # Common Configs "num_workers": 0, @@ -399,7 +399,7 @@ def astuple(self): "lagrangian": True, # False "lagrangian_thresh": 5.0, "min_q_weight": 5.0, - "initial_alpha_prime": 1.0, + # "initial_alpha_prime": 1.0, "Q_model": { "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", @@ -421,7 +421,7 @@ def astuple(self): frameworks=[Framework.TensorFlow], ), TestAgentParams.for_moab_move_to_center( - algorithm=ContinuousActionSpaceAlgorithm.CQL_SAC, + algorithm=ContinuousActionSpaceAlgorithm.CQL, config_updates={ # Common Configs "num_workers": 0, @@ -470,17 +470,6 @@ def astuple(self): threshold=150.0, frameworks=[Framework.TensorFlow], ), - TestAgentParams.for_moab_move_to_center( - algorithm=ContinuousActionSpaceAlgorithm.CQL_APEX_SAC, - config_updates={ - "num_workers": 8, - # Number of iterations with Behavior Cloning Pretraining - "bc_iters": 0, - }, - n_iter=100, - threshold=140.0, - frameworks=[Framework.TensorFlow], - ), # This is here for reference only of how to run vanilla DQN # in offline mode in a test. 
Keep in mind that DQN could hit # the threshold just by luck by being overconfident (extreme @@ -510,29 +499,6 @@ def astuple(self): # frameworks=[Framework.TensorFlow], # version=0, # ), - TestAgentParams.for_cart_pole( - algorithm=DiscreteActionSpaceAlgorithm.CQL_DQN, - config_updates={ - # Common Configs - "num_workers": 0, - "input": "tests/data/cartpole/output-*.json", - "input_evaluation": [], - "evaluation_config": { - "input": "sampler", - "explore": False, - }, - "evaluation_interval": 1, - "evaluation_num_episodes": 10, - "evaluation_num_workers": 1, - "log_level": logging.WARNING, - # CQL Configs - "min_q_weight": 1.0, - }, - n_iter=500, #25, # 3000, #500, #100, #25, # 250, - threshold=150.0, - frameworks=[Framework.TensorFlow], - version=0, - ), ) ] diff --git a/rllib/tests/data/pendulum/small.json b/rllib/tests/data/pendulum/small.json new file mode 100644 index 000000000000..6342e4c2995b --- /dev/null +++ b/rllib/tests/data/pendulum/small.json @@ -0,0 +1,40 @@ +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDBaDi/lZAxP2QkX76UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQA6NTq/dq0vP22RUD6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.3287242650985718]], "rewards": [-5.646285057067871], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [0], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQA6NTq/dq0vP22RUD6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCankC/fJ8oPzirPj+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.08813309669494629]], "rewards": [-5.693763256072998], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [1], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCankC/fJ8oPzirPj+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAkpEy/wc8ZPzjOvj+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.8395185470581055]], "rewards": [-5.9269185066223145], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [2], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAkpEy/wc8ZPzjOvj+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCvD1q/9hsGP/J47j+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], 
"actions": [[-0.2607402801513672]], "rewards": [-6.457780838012695], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [3], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCvD1q/9hsGP/J47j+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAT92i/xUHUPi2dHkCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.7412639856338501]], "rewards": [-7.058495044708252], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [4], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAT92i/xUHUPi2dHkCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDZJna/WKWMPrT1PkCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.6482666730880737]], "rewards": [-7.982394218444824], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [5], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDZJna/WKWMPrT1PkCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBnuX2/aDwIPodkOUCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.9767004251480103]], "rewards": [-9.092668533325195], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [6], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBnuX2/aDwIPodkOUCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCQ/n+/HgvZu5tHM0CUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.6509912014007568]], "rewards": [-9.889808654785156], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [7], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCQ/n+/HgvZu5tHM0CUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAR63y/EWkevkdVPkCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.5922571420669556]], "rewards": [-10.614130973815918], "dones": [false], "infos": [{}], "agent_index": [0], 
"eps_id": [895625640], "unroll_id": [8], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAR63y/EWkevkdVPkCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBDYnW/K+qRvmAaK0CUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.6148225665092468]], "rewards": [-9.803768157958984], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [9], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBDYnW/K+qRvmAaK0CUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDczmu/8E3HvuDfDUCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.8098440170288086]], "rewards": [-8.854684829711914], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [10], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDczmu/8E3HvuDfDUCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBvLmG/N43zvjqH9T+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.022167325019836426]], "rewards": [-8.008651733398438], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [11], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBvLmG/N43zvjqH9T+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB/HFi/uDoJvyU5sz+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.5374762415885925]], "rewards": [-7.369612216949463], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [12], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB/HFi/uDoJvyU5sz+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB+RFK/oQYSv606Uz+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.576760470867157]], "rewards": [-6.832327842712402], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [13], "weights": [1.0]} +{"type": "SampleBatch", "obs": 
"BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB+RFK/oQYSv606Uz+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB+OlG/xYITv0L9ED6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.8523757457733154]], "rewards": [-6.495100021362305], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [14], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB+OlG/xYITv0L9ED6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCTNlG/VIgTv60DCDuUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.9754830598831177]], "rewards": [-6.3940863609313965], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [15], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCTNlG/VIgTv60DCDuUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDNWFK/XekRvzI/Hr6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.9187028408050537]], "rewards": [-6.391127109527588], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [16], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDNWFK/XekRvzI/Hr6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAAU1W/54UNv3oh1L6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.5589845180511475]], "rewards": [-6.430506229400635], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [17], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAAU1W/54UNv3oh1L6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAYClu/bYEEv3iJVb+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.01730865240097046]], "rewards": [-6.549499034881592], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [18], "weights": [1.0]} +{"type": "SampleBatch", "obs": 
"BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAYClu/bYEEv3iJVb+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAFcWS/sRbnvpX5wb+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.9770012497901917]], "rewards": [-6.820656776428223], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [19], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAFcWS/sRbnvpX5wb+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDHO2+/xT62vnmPBcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.776479959487915]], "rewards": [-7.378707408905029], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [20], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDHO2+/xT62vnmPBcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDFcHe/9UmDvtXlBcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.8722989559173584]], "rewards": [-8.153972625732422], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [21], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDFcHe/9UmDvtXlBcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB+0Xy/GvEgvkPqAcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.8484913110733032]], "rewards": [-8.74808406829834], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [22], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB+0Xy/GvEgvkPqAcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCPzH+/H0Iivb9LF8CUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.7206584811210632]], "rewards": [-9.317010879516602], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [23], "weights": [1.0]} +{"type": "SampleBatch", "obs": 
"BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCPzH+/H0Iivb9LF8CUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDmSH+/2fuYPUlsEsCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.3528306484222412]], "rewards": [-10.181554794311523], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [24], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDmSH+/2fuYPUlsEsCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCYEHu/LRlIPuoJHMCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.6875670552253723]], "rewards": [-9.930729866027832], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [25], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQCYEHu/LRlIPuoJHMCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQC8gHS//LaXPoVTBcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.6944198608398438]], "rewards": [-9.26891040802002], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [26], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQC8gHS//LaXPoVTBcCUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAjG2y/bePFPh7C9b+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.19667410850524902]], "rewards": [-8.504039764404297], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [27], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAjG2y/bePFPh7C9b+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB8xGS/f8vlPuOir7+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.8598417043685913]], "rewards": [-7.905289649963379], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [28], "weights": [1.0]} +{"type": "SampleBatch", "obs": 
"BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB8xGS/f8vlPuOir7+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQD3vFy/7qcBPyELqL+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.9243161082267761]], "rewards": [-7.353479385375977], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [29], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQD3vFy/7qcBPyELqL+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAZdli/N60IPw1sJL+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.969048023223877]], "rewards": [-6.9908647537231445], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [30], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAZdli/N60IPw1sJL+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDqrlW/+foMPxjnzL6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.5278222560882568]], "rewards": [-6.690484523773193], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [31], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDqrlW/+foMPxjnzL6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDMlVS/f6EOP/uXHr6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.5590147972106934]], "rewards": [-6.562597751617432], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [32], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDMlVS/f6EOP/uXHr6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQD8hFW/cjoNP0vSBj6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.43775200843811035]], "rewards": [-6.5089335441589355], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [33], "weights": [1.0]} +{"type": "SampleBatch", "obs": 
"BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQD8hFW/cjoNP0vSBj6UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDru1m/y6MGPwVuHD+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.21879124641418457]], "rewards": [-6.541318893432617], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [34], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDru1m/y6MGPwVuHD+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBAnV+/nUL5PqUzaD+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.3282276391983032]], "rewards": [-6.734356880187988], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [35], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQBAnV+/nUL5PqUzaD+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAoXGi/0uTWPo3awD+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.7816812992095947]], "rewards": [-7.018081188201904], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [36], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQAoXGi/0uTWPo3awD+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDscHC//8WvPsGy0z+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.5585429072380066]], "rewards": [-7.564019203186035], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [37], "weights": [1.0]} +{"type": "SampleBatch", "obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQDscHC//8WvPsGy0z+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB1dXe/mCaDPpj46T+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[-0.2782477140426636]], "rewards": [-8.064400672912598], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [38], "weights": [1.0]} +{"type": "SampleBatch", "obs": 
"BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQB1dXe/mCaDPpj46T+UjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "new_obs": "BCJNGGhAjgAAAAAAAABxiQAAAFKABZWDAAEA8hmMEm51bXB5LmNvcmUubnVtZXJpY5SMC19mcm9tYnVmZmVylJOUKJYMLgDxAQC+Qn2/S2YVPn4nEECUjAU8APEYlIwFZHR5cGWUk5SMAmY0lEsASwGHlFKUKEsDjAE8lE5OTkr/////BQDwBUsAdJRiSwFLA4aUjAFDlHSUUpQuAAAAAA==", "action_prob": [0.1], "actions": [[0.774653434753418]], "rewards": [-8.645625114440918], "dones": [false], "infos": [{}], "agent_index": [0], "eps_id": [895625640], "unroll_id": [39], "weights": [1.0]} diff --git a/rllib/utils/exploration/random.py b/rllib/utils/exploration/random.py index 5fcc344d54e6..183a3422c585 100644 --- a/rllib/utils/exploration/random.py +++ b/rllib/utils/exploration/random.py @@ -1,6 +1,6 @@ from gym.spaces import Discrete, Box, MultiDiscrete, Space import numpy as np -import tree +import tree # pip install dm_tree from typing import Union, Optional from ray.rllib.models.action_dist import ActionDistribution @@ -63,7 +63,8 @@ def true_fn(): batch_size = 1 req = force_tuple( action_dist.required_model_output_shape( - self.action_space, self.model.model_config)) + self.action_space, getattr(self.model, "model_config", + None))) # Add a batch dimension? if len(action_dist.inputs.shape) == len(req) + 1: batch_size = tf.shape(action_dist.inputs)[0] diff --git a/rllib/utils/spaces/space_utils.py b/rllib/utils/spaces/space_utils.py index ff0463983f8c..b7f8ff41d26f 100644 --- a/rllib/utils/spaces/space_utils.py +++ b/rllib/utils/spaces/space_utils.py @@ -1,15 +1,17 @@ +import gym from gym.spaces import Tuple, Dict import numpy as np -import tree +import tree # pip install dm_tree +from typing import List, Optional, Union -def flatten_space(space): +def flatten_space(space: gym.Space) -> List[gym.Space]: """Flattens a gym.Space into its primitive components. Primitive components are any non Tuple/Dict spaces. Args: - space(gym.Space): The gym.Space to flatten. This may be any + space (gym.Space): The gym.Space to flatten. This may be any supported type (including nested Tuples and Dicts). Returns: @@ -17,16 +19,16 @@ def flatten_space(space): does not contain Tuples or Dicts anymore. """ - def _helper_flatten(space_, l): + def _helper_flatten(space_, return_list): from ray.rllib.utils.spaces.flexdict import FlexDict if isinstance(space_, Tuple): for s in space_: - _helper_flatten(s, l) + _helper_flatten(s, return_list) elif isinstance(space_, (Dict, FlexDict)): for k in space_.spaces: - _helper_flatten(space_[k], l) + _helper_flatten(space_[k], return_list) else: - l.append(space_) + return_list.append(space_) ret = [] _helper_flatten(space, ret) @@ -63,6 +65,76 @@ def _helper_struct(space_): return _helper_struct(space) +def get_dummy_batch_for_space( + space: gym.Space, + batch_size: int = 32, + fill_value: Union[float, int, str] = 0.0, + time_size: Optional[int] = None, + time_major: bool = False, +) -> np.ndarray: + """Returns batched dummy data (using `batch_size`) for the given `space`. + + Note: The returned batch will not pass a `space.contains(batch)` test + as an additional batch dimension has to be added as dim=0. + + Args: + space (gym.Space): The space to get a dummy batch for. + batch_size(int): The required batch size (B). Note that this can also + be 0 (only if `time_size` is None!), which will result in a + non-batched sample for the given space (no batch dim). 
+ fill_value (Union[float, int, str]): The value to fill the batch with + or "random" for random values. + time_size (Optional[int]): If not None, add an optional time axis + of `time_size` size to the returned batch. + time_major (bool): If True AND `time_size` is not None, return batch + as shape [T x B x ...], otherwise as [B x T x ...]. If `time_size` + is None, ignore this setting and return [B x ...]. + + Returns: + The dummy batch of size `batch_size` matching the given space. + """ + # Complex spaces. Perform recursive calls of this function. + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + return tree.map_structure( + lambda s: get_dummy_batch_for_space(s, batch_size, fill_value), + get_base_struct_from_space(space), + ) + # Primitive spaces: Box, Discrete, MultiDiscrete. + # Random values: Use gym's sample() method. + elif fill_value == "random": + if time_size is not None: + assert batch_size > 0 and time_size > 0 + if time_major: + return np.array( + [[space.sample() for _ in range(batch_size)] + for t in range(time_size)], + dtype=space.dtype) + else: + return np.array( + [[space.sample() for t in range(time_size)] + for _ in range(batch_size)], + dtype=space.dtype) + else: + return np.array( + [space.sample() for _ in range(batch_size)] + if batch_size > 0 else space.sample(), + dtype=space.dtype) + # Fill value given: Use np.full. + else: + if time_size is not None: + assert batch_size > 0 and time_size > 0 + if time_major: + shape = [time_size, batch_size] + else: + shape = [batch_size, time_size] + else: + shape = [batch_size] if batch_size > 0 else [] + return np.full( + shape + list(space.shape), + fill_value=fill_value, + dtype=space.dtype) + + + def flatten_to_single_ndarray(input_): """Returns a single np.ndarray given a list/tuple of np.ndarrays. @@ -126,3 +198,91 @@ def unbatch(batches_struct): [flat_batches[i][batch_pos] for i in range(len(flat_batches))])) return out + + +def clip_action(action, action_space): + """Clips all components in `action` according to the given Space. + + Only applies to Box components within the action space. + + Args: + action (Any): The action to be clipped. This could be any complex + action, e.g. a dict or tuple. + action_space (Any): The action space struct, + e.g. `{"a": Discrete(2)}` for a space: Dict({"a": Discrete(2)}). + + Returns: + Any: The input action, but clipped by value according to the space's + bounds. + """ + + def map_(a, s): + if isinstance(s, gym.spaces.Box): + a = np.clip(a, s.low, s.high) + return a + + return tree.map_structure(map_, action, action_space) + + +def unsquash_action(action, action_space_struct): + """Unsquashes all components in `action` according to the given Space. + + Inverse of `normalize_action()`. Useful for mapping policy action + outputs (normalized between -1.0 and 1.0) to an env's action space. + Unsquashing results in continuous action component values between the + given Space's bounds (`low` and `high`). This only applies to Box + components within the action space, whose dtype is float32 or float64. + + Args: + action (Any): The action to be unsquashed. This could be any complex + action, e.g. a dict or tuple. + action_space_struct (Any): The action space struct, + e.g. `{"a": Box()}` for a space: Dict({"a": Box()}). + + Returns: + Any: The input action, but unsquashed, according to the space's + bounds. An unsquashed action is ready to be sent to the + environment (`BaseEnv.send_actions([unsquashed actions])`).
+ """ + + def map_(a, s): + if isinstance(s, gym.spaces.Box) and \ + (s.dtype == np.float32 or s.dtype == np.float64): + # Assuming values are roughly between -1.0 and 1.0 -> + # unsquash them to the given bounds. + a = s.low + (a + 1.0) * (s.high - s.low) / 2.0 + # Clip to given bounds, just in case the squashed values were + # outside [-1.0, 1.0]. + a = np.clip(a, s.low, s.high) + return a + + return tree.map_structure(map_, action, action_space_struct) + + +def normalize_action(action, action_space_struct): + """Normalizes all (Box) components in `action` to be in [-1.0, 1.0]. + + Inverse of `unsquash_action()`. Useful for mapping an env's action + (arbitrary bounded values) to a [-1.0, 1.0] interval. + This only applies to Box components within the action space, whose + dtype is float32 or float64. + + Args: + action (Any): The action to be normalized. This could be any complex + action, e.g. a dict or tuple. + action_space_struct (Any): The action space struct, + e.g. `{"a": Box()}` for a space: Dict({"a": Box()}). + + Returns: + Any: The input action, but normalized, according to the space's + bounds. + """ + + def map_(a, s): + if isinstance(s, gym.spaces.Box) and \ + (s.dtype == np.float32 or s.dtype == np.float64): + # Normalize values to be exactly between -1.0 and 1.0. + a = ((a - s.low) * 2.0) / (s.high - s.low) - 1.0 + return a + + return tree.map_structure(map_, action, action_space_struct) diff --git a/rllib/utils/torch_ops.py b/rllib/utils/torch_ops.py index 487b8c246adc..a27be53cc269 100644 --- a/rllib/utils/torch_ops.py +++ b/rllib/utils/torch_ops.py @@ -1,10 +1,12 @@ from gym.spaces import Discrete, MultiDiscrete import numpy as np -import tree +import os +import tree # pip install dm_tree import warnings from ray.rllib.models.repeated_values import RepeatedValues from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import SMALL_NUMBER torch, nn = try_import_torch() @@ -39,7 +41,22 @@ def apply_grad_clipping(policy, optimizer, loss): def atanh(x): - return 0.5 * torch.log((1 + x) / (1 - x)) + return 0.5 * torch.log( + (1 + x).clamp(min=SMALL_NUMBER) / (1 - x).clamp(min=SMALL_NUMBER)) + + +def concat_multi_gpu_td_errors(policy): + td_error = torch.cat( + [ + getattr(t, "td_error", torch.tensor([0.0])).to(policy.device) + for t in policy.model_gpu_towers + ], + dim=0) + policy.td_error = td_error + return { + "td_error": td_error, + "mean_td_error": torch.mean(td_error), + } def convert_to_non_torch_type(stats): @@ -203,6 +220,20 @@ def sequence_mask(lengths, maxlen=None, dtype=None, time_major=False): return mask +def set_torch_seed(seed): + if seed is not None and torch: + torch.manual_seed(seed) + # See https://github.com/pytorch/pytorch/issues/47672. + cuda_version = torch.version.cuda + if cuda_version is not None and float(torch.version.cuda) >= 10.2: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = "4096:8" + else: + # Not all Operations support this. + torch.use_deterministic_algorithms(True) + # This is only for Convolution no problem. + torch.backends.cudnn.deterministic = True + + def softmax_cross_entropy_with_logits(logits, labels): """Same behavior as tf.nn.softmax_cross_entropy_with_logits.