-
Notifications
You must be signed in to change notification settings - Fork 58
[WIP] Adding MultiAgent Utilities #323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 16 commits
1d49049
ef4a179
2ecd086
53450a8
274aff9
38f95f0
0927001
daa8b2a
44db72e
4ef8f48
d8cf1a9
8d2cf06
1365585
5067e42
6f0563e
5061abe
e6a378c
915d19d
b0b5025
8cc732b
b8f7f6a
e50e230
835819e
793c045
2635fd5
cd87506
65b6520
3d01b85
a62c100
841ff66
2be8df5
ac9b5a8
10282f0
79b531b
a3885a0
e3dc677
4c2ad51
43554e4
6828e93
eac920c
194065f
c0198bc
fe40835
a50204a
4a3cd74
602a7b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,13 +2,14 @@ | |
|
|
||
| import torch # noqa | ||
| import torch.nn as nn # noqa | ||
| import torch.nn.functional as F | ||
| from gym import spaces | ||
| from torch.distributions import Categorical, Normal | ||
|
|
||
| from genrl.core.base import BaseActorCritic | ||
| from genrl.core.policies import MlpPolicy | ||
| from genrl.core.values import MlpValue | ||
| from genrl.utils.utils import cnn | ||
| from genrl.utils.utils import cnn, shared_mlp | ||
|
|
||
|
|
||
| class MlpActorCritic(BaseActorCritic): | ||
|
|
@@ -216,10 +217,119 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: | |
| return value | ||
|
|
||
|
|
||
class SharedActorCritic(BaseActorCritic):
    """Actor-Critic whose actor and critic heads share a common MLP trunk.

    The two networks are constructed by ``shared_mlp``: each has its own
    "pre" layers, then a shared middle trunk, then its own "post" layers.

    :param critic_prev: Sizes of the critic-only layers before the shared trunk
    :param actor_prev: Sizes of the actor-only layers before the shared trunk
    :param shared: Sizes of the trunk layers shared by actor and critic
    :param critic_post: Sizes of the critic-only layers after the shared trunk
    :param actor_post: Sizes of the actor-only layers after the shared trunk
    :param weight_init: Weight initialisation scheme forwarded to ``shared_mlp``
    :param activation_func: Activation function forwarded to ``shared_mlp``
    """

    def __init__(
        self,
        critic_prev,
        actor_prev,
        shared,
        critic_post,
        actor_post,
        weight_init,
        activation_func,
    ):
        super(SharedActorCritic, self).__init__()

        self.critic, self.actor = shared_mlp(
            critic_prev,
            actor_prev,
            shared,
            critic_post,
            actor_post,
            weight_init,
            activation_func,
            False,
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state_critic, state_action):
        """Dispatch to the critic or actor head.

        Exactly one of the two arguments is expected to be non-None:
        ``state_critic`` routes through the critic, ``state_action`` through
        the actor. Returns None when both arguments are None (made explicit
        here; the original fell off the end of the function).
        """
        if state_critic is not None:
            return self.critic(state_critic)
        if state_action is not None:
            return self.actor(state_action)
        return None

    def get_action(self, state, deterministic=False):
        """Select an action index from the actor head for ``state``.

        :param state: Observation tensor fed to the actor network
        :param deterministic: If True, pick the highest-probability action
            instead of sampling
        :return: Integer action index
        """
        logits = self.forward(None, state)

        # NOTE(review): softmax over dim=0 assumes an unbatched logits
        # vector — confirm against callers before batching states.
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        if deterministic:
            # BUG FIX: the original called torch.argmax(probs) on the
            # Categorical distribution object, which raises a TypeError;
            # argmax must be taken over the probability tensor itself.
            index = torch.argmax(dist).item()
        else:
            index = probs.sample().cpu().detach().item()
        return index

    def get_value(self, state):
        """Return the critic's value estimate for ``state``."""
        value = self.forward(state, None)
        return value
|
|
||
|
|
||
class Actor(MlpPolicy):
    """MLP policy network used as the standalone actor in multi-agent setups.

    :param state_dim: State space of the environment
    :param action_dim: Action space of the environment
    :param hidden: Hidden-layer sizes of the MLP
    :param discrete: Whether the action space is discrete
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        hidden: Tuple = (32, 32),
        discrete: bool = True,
        **kwargs,
    ):
        # BUG FIX: the original passed ``discrete ** kwargs`` (exponentiating
        # a bool by a dict -> TypeError, and kwargs were never forwarded);
        # ``discrete`` and ``**kwargs`` must be separate arguments.
        super(Actor, self).__init__(state_dim, action_dim, hidden, discrete, **kwargs)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state):
        """Return raw action logits for ``state``."""
        state = self.model(state)
        return state

    def get_action(self, state, deterministic=False):
        """Select an action index for ``state``.

        :param state: Observation tensor fed to the policy network
        :param deterministic: If True, pick the highest-probability action
            instead of sampling
        :return: Integer action index
        """
        logits = self.forward(state)

        # NOTE(review): softmax over dim=0 assumes an unbatched logits
        # vector — confirm against callers before batching states.
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        if deterministic:
            # BUG FIX: the original called torch.argmax(probs) on the
            # Categorical distribution object, which raises a TypeError;
            # argmax must be taken over the probability tensor itself.
            index = torch.argmax(dist).item()
        else:
            index = probs.sample().cpu().detach().item()
        return index
|
|
||
|
|
||
class Critic(MlpValue):
    """MLP value network used as the standalone critic in multi-agent setups.

    :param state_dim: State space of the environment
    :param action_dim: Action space of the environment
    :param fc_layers: Hidden-layer sizes of the MLP
    :param val_type: Type of value estimated (e.g. "V" for state values)
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        fc_layers: Tuple = (32, 32),
        val_type: str = "V",
        **kwargs,
    ):
        super(Critic, self).__init__(
            state_dim, action_dim, fc_layers, val_type, **kwargs
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state):
        """Pass ``state`` through the underlying value model."""
        return self.model(state)

    def get_value(self, state):
        """Return the value estimate for ``state``."""
        return self.forward(state)
|
||
|
|
||
|
|
||
# Maps configuration-string identifiers to their actor-critic classes.
actor_critic_registry = dict(
    mlp=MlpActorCritic,
    cnn=CNNActorCritic,
    mlp12=MlpSingleActorMultiCritic,
    mlpshared=SharedActorCritic,
)
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Its very risky to change the versions here because of compatibility. Can you revert these?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When I change that I get some errors. Any other way to rectify it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have you changed the isort.cfg file?