update minigrid_envs.py

zenglingqi647 · Dec 7, 2023 · 05479a7 · 05479a7
1 parent c33bd05
commit 05479a7
Show file tree

Hide file tree

Showing 9 changed files with 87 additions and 14 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@
 .vscode
 setup.sh
 rl-starter-files/storage
+rl-starter-files/storag
 ctrl.sh
 rl-starter-files/evaluate/
 rl-starter-files/log/

diff --git a/README.md b/README.md
@@ -22,6 +22,17 @@ python -m scripts.train --algo ppo --env BabyAI-GoToImpUnlock-v0 --model GoToImp
 ```
 The problem is that even an ask probability of 0.0005 is still very bad: it takes a really long time to train.
 
+# TODO
+### Baselines
+Basic: 
+> PPO, A2C only
+
+Exploration(?): 
+> RND: https://opendilab.github.io/DI-engine/12_policies/rnd.html
+> BeBold, NovelD: https://github.com/tianjunz/NovelD
+> DEIR
+
+
 ### **Update**
 - Bash script of experiments of different babyai and minigrid environments can be found as `babyai.sh` and `minigrid.sh`.
 

diff --git a/experimental-code/vocab.py b/experimental-code/vocab.py
@@ -0,0 +1,63 @@
+def get_minigrid_words():  # Build and return a dict mapping each vocabulary word to a unique integer index.
+    colors = ["red", "green", "blue", "yellow", "purple", "grey"]  # color words; first in the concatenation, so indices 0-5
+    objects = [  # object-type words
+        "unseen",
+        "empty",
+        "wall",
+        "floor",
+        "box",
+        "key",
+        "ball",
+        "door",
+        "goal",
+        "agent",
+        "lava",
+    ]
+
+    verbs = [  # verb words
+        "pick",
+        "avoid",
+        "get",
+        "find",
+        "put",
+        "use",
+        "open",
+        "go",
+        "fetch",
+        "reach",
+        "unlock",
+        "traverse",
+    ]
+
+    extra_words = [  # remaining words, plus the "," punctuation token
+        "up",
+        "the",
+        "a",
+        "at",
+        ",",
+        "square",
+        "and",
+        "then",
+        "to",
+        "of",
+        "rooms",
+        "near",
+        "opening",
+        "must",
+        "you",
+        "matching",
+        "end",
+        "hallway",
+        "object",
+        "from",
+        "room",
+        "maze",
+    ]
+
+    all_words = colors + objects + verbs + extra_words  # concatenation order fixes every word's index; reordering changes the mapping
+    assert len(all_words) == len(set(all_words))  # no word may appear twice across the four lists (NOTE: asserts are skipped under python -O)
+    return {word: i for i, word in enumerate(all_words)}  # word -> position in all_words, starting at 0
+
+if __name__ == "__main__":
+    # When run as a script, print the word -> index vocabulary for manual inspection
+    print(get_minigrid_words())
diff --git a/rl-starter-files/envs/minigrid_envs.py b/rl-starter-files/envs/minigrid_envs.py
@@ -7,10 +7,10 @@
 """
 
 from minigrid.envs.doorkey import DoorKeyEnv
-from minigrid.minigrid_env import MiniGridEnv, Grid, Door, Key, Wall, COLOR_NAMES, DIR_TO_VEC, Ball, Box
-from minigrid.core.world_object import Goal
-from gym_minigrid.register import register
-from gym_minigrid.roomgrid import RoomGrid
+from minigrid.minigrid_env import MiniGridEnv, Grid, COLOR_NAMES, DIR_TO_VEC
+from minigrid.core.world_object import Goal, Door, Key, Wall, Ball, Box
+from gymnasium.envs.registration  import register
+from minigrid.core.roomgrid import RoomGrid
 
 
 class CustomDoorKeyEnv(MiniGridEnv):

diff --git a/rl-starter-files/results.xlsx b/rl-starter-files/results.xlsx
diff --git a/rl-starter-files/results_.xlsx b/rl-starter-files/results_.xlsx
diff --git a/rl-starter-files/utils/gpt_interface.py b/rl-starter-files/utils/gpt_interface.py
@@ -15,6 +15,3 @@ def interact_with_gpt(prompt):
       request_timeout=10
     )
     return output.choices[0].message['content']
-
-
-
diff --git a/rl-starter-files/utils/planner_policy.py b/rl-starter-files/utils/planner_policy.py
@@ -48,6 +48,7 @@ def __init__(self, obs_space, action_space, vocab, llm_variant, ask_cooldown, us
         self.current_skill : int = 0
         self.vocab : Vocabulary = vocab
         self.llm_variant = llm_variant
+        # load the skill models
         for i in range(num_skills):
             self.ac_models.append(self.load_model(i))
 
@@ -71,7 +72,7 @@ def load_model(self, index):
             p.requires_grad = True
         return mdl
 
-    def get_skill_distr(self, obs, memory):
+    def get_skill(self, obs, memory):
         with self.lock:
             if self.timer == 0:
                 invert_vocab = {v: k for k, v in self.vocab.vocab.items()}
@@ -99,14 +100,11 @@ def get_skill_distr(self, obs, memory):
     def forward(self, obs, memory):  # Call only the selected skill network on (obs, memory) and return its output.
         # for network in self.ac_models:
         #     network.zero_grad()
-        skill_network_idx = self.get_skill_distr(obs, memory)
+        skill_network_idx = self.get_skill(obs, memory)  # used below as the index into self.ac_models
         result = self.ac_models[skill_network_idx](obs, memory)
         for j in range(len(self.ac_models)):
             if j != skill_network_idx:
                 model = self.ac_models[j]
                 for p in model.parameters():
                     p.grad = torch.zeros_like(p)  # replace the grad of every parameter of each non-selected network with zeros
         return result
-
-
-
diff --git a/scripts/run.sh b/scripts/run.sh
@@ -26,5 +26,8 @@ python3 -m scripts.evaluate --env MiniGrid-DoorKey-5x5-v0 --model DoorKey
 
 
 
-MiniGrid-BlockedUnlockPickup all performs bad. The training return are almost all zero. LavaCrossing
-MiniGrid-DistShift1 and Minigrid-SimpleCrossing, a2c outperforms a2c with reshaped reward
+
+
+Train with llama
+cd ../rl-starter-files
+python -m scripts.train --algo ppo --env BabyAI-GoToImpUnlock-v0 --text --frames 1000000 --recurrence 20 --obs-size 11 --frames-per-proc 40 --procs 64 --batch-size 200 --ask-every 500
Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,6 +15,3 @@ def interact_with_gpt(prompt):
		request_timeout=10
		)
		return output.choices[0].message['content']