
Commit f20b331

First implementation of replay buffer with termination (#43)
* First implementation of replay buffer with termination
* Update data collection
* Debug run works
* Fix tests
1 parent 81695d7 commit f20b331

9 files changed: +196 -187 lines changed

actsafe/actsafe/replay_buffer.py (+137 -59)
@@ -1,9 +1,9 @@
-from typing import Iterator
+from typing import Iterator, Dict
 import jax
 import numpy as np
 
 from actsafe.common.double_buffer import double_buffer
-from actsafe.rl.trajectory import TrajectoryData
+from actsafe.rl.trajectory import Transition, TrajectoryData
 
 
 class ReplayBuffer:
@@ -21,67 +21,136 @@ def __init__(
         self.episode_id = 0
         self.dtype = np.float32
         self.obs_dtype = np.uint8
+        self.max_length = max_length
+        self.observation_shape = observation_shape
+        self.action_shape = action_shape
+        self.num_rewards = num_rewards
+
+        # Main storage arrays
         self.observation = np.zeros(
-            (
-                capacity,
-                max_length + 1,
-            )
-            + observation_shape,
+            (capacity, max_length + 1) + observation_shape,
             dtype=self.obs_dtype,
         )
         self.action = np.zeros(
-            (
-                capacity,
-                max_length,
-            )
-            + action_shape,
+            (capacity, max_length) + action_shape,
             dtype=self.dtype,
         )
         self.reward = np.zeros(
             (capacity, max_length, num_rewards),
             dtype=self.dtype,
         )
         self.cost = np.zeros(
-            (
-                capacity,
-                max_length,
-            ),
+            (capacity, max_length),
             dtype=self.dtype,
         )
+        self.done = np.ones(
+            (capacity, max_length),
+            dtype=bool,
+        )
+        self.episode_lengths = np.zeros(capacity, dtype=np.int32)
+
+        # Tracking ongoing episodes
+        self.ongoing_episodes: Dict[int, Dict] = {}
+
         self._valid_episodes = 0
         self.rs = np.random.RandomState(seed)
         self.batch_size = batch_size
         self.sequence_length = sequence_length
+        self.capacity = capacity
 
-    def add(self, trajectory: TrajectoryData):
-        capacity, *_ = self.reward.shape
-        batch_size = min(trajectory.observation.shape[0], capacity)
-        # Discard data if batch size overflows capacity.
-        end = min(self.episode_id + batch_size, capacity)
-        episode_slice = slice(self.episode_id, end)
-        if trajectory.reward.ndim == 2:
-            trajectory = TrajectoryData(
-                trajectory.observation,
-                trajectory.next_observation,
-                trajectory.action,
-                trajectory.reward[..., None],
-                trajectory.cost,
-            )
-        for data, val in zip(
-            (self.action, self.reward, self.cost),
-            (trajectory.action, trajectory.reward, trajectory.cost),
-        ):
-            data[episode_slice] = val[:batch_size].astype(self.dtype)
-        observation = np.concatenate(
-            [
-                trajectory.observation[:batch_size],
-                trajectory.next_observation[:batch_size, -1:],
-            ],
-            axis=1,
-        )
-        self.observation[episode_slice] = observation.astype(self.obs_dtype)
-        self.episode_id = (self.episode_id + batch_size) % capacity
-        self._valid_episodes = min(self._valid_episodes + batch_size, capacity)
+    def _initialize_ongoing_episode(self, worker_id: int):
+        """Initialize storage for a new ongoing episode."""
+        return {
+            "observation": np.zeros(
+                (self.max_length + 1,) + self.observation_shape, dtype=self.obs_dtype
+            ),
+            "action": np.zeros(
+                (self.max_length,) + self.action_shape, dtype=self.dtype
+            ),
+            "reward": np.zeros((self.max_length, self.num_rewards), dtype=self.dtype),
+            "cost": np.zeros(self.max_length, dtype=self.dtype),
+            "done": np.zeros(self.max_length, dtype=bool),
+            "current_step": 0,
+        }
+
+    def _commit_episode(self, worker_id: int):
+        """Commit a completed episode to the main buffer."""
+        episode_data = self.ongoing_episodes[worker_id]
+        current_step = episode_data["current_step"]
+
+        if current_step == 0:  # Skip empty episodes
+            return
+
+        # Check if we've reached capacity
+        if self.episode_id >= self.capacity:
+            self.episode_id = 0
+
+        # Copy data to main arrays
+        self.observation[self.episode_id, : current_step + 1] = episode_data[
+            "observation"
+        ][: current_step + 1]
+        self.action[self.episode_id, :current_step] = episode_data["action"][
+            :current_step
+        ]
+        self.reward[self.episode_id, :current_step] = episode_data["reward"][
+            :current_step
+        ]
+        self.cost[self.episode_id, :current_step] = episode_data["cost"][:current_step]
+        self.done[self.episode_id, :current_step] = episode_data["done"][:current_step]
+
+        # Set episode length
+        self.episode_lengths[self.episode_id] = current_step
+
+        # Mark remaining timesteps as done
+        self.done[self.episode_id, current_step:] = True
+
+        # Increment counters
+        self.episode_id += 1
+        self._valid_episodes = min(self._valid_episodes + 1, self.capacity)
+
+        # Clear the ongoing episode
+        self.ongoing_episodes[worker_id] = self._initialize_ongoing_episode(worker_id)
+
+    def add(self, step_data: Transition):
+        """Add a single environment step to the buffer."""
+        # step_data fields are batched over parallel workers; index i selects one worker
+        for i in range(step_data.reward.shape[0]):
+            # Get worker ID for this step
+            worker_id = i
+            # Initialize ongoing episode if needed
+            if worker_id not in self.ongoing_episodes:
+                self.ongoing_episodes[worker_id] = self._initialize_ongoing_episode(
+                    worker_id
+                )
+
+            episode_data = self.ongoing_episodes[worker_id]
+            current_step = episode_data["current_step"]
+
+            # Store current observation
+            episode_data["observation"][current_step] = step_data.observation[i]
+
+            # If not the first step, store previous step's action, reward, cost, done
+            if current_step > 0:
+                episode_data["action"][current_step - 1] = step_data.action[i]
+                episode_data["reward"][current_step - 1] = step_data.reward[i]
+                episode_data["cost"][current_step - 1] = step_data.cost[i]
+                episode_data["done"][current_step - 1] = step_data.done[i]
+
+            # If episode terminated
+            if step_data.done[i]:
+                # Store final observation
+                episode_data["observation"][
+                    current_step + 1
+                ] = step_data.next_observation[i]
+                self._commit_episode(worker_id)
+            else:
+                # Continue episode
+                episode_data["current_step"] = current_step + 1
+
+                # Check if we've reached max length
+                if current_step + 1 >= self.max_length:
+                    episode_data["done"][current_step] = True
+                    self._commit_episode(worker_id)
 
     def _sample_batch(
         self,
@@ -93,37 +162,46 @@ def _sample_batch(
             valid_episodes = valid_episodes
         else:
             valid_episodes = self._valid_episodes
-        time_limit = self.observation.shape[1]
-        assert time_limit > sequence_length
+
         while True:
-            low = self.rs.choice(time_limit - sequence_length - 1, batch_size)
+            episode_ids = self.rs.choice(valid_episodes, size=batch_size)
+            low = np.array(
+                [
+                    self.rs.randint(
+                        0, max(1, self.episode_lengths[episode_id] - sequence_length)
+                    )
+                    for episode_id in episode_ids
+                ]
+            )
             timestep_ids = low[:, None] + np.tile(
                 np.arange(sequence_length + 1),
                 (batch_size, 1),
             )
-            episode_ids = self.rs.choice(valid_episodes, size=batch_size)
-            # Sample a sequence of length H for the actions, rewards and costs,
-            # and a length of H + 1 for the observations (which is needed for
-            # bootstrapping)
+            for i, (episode_id, time_steps) in enumerate(
+                zip(episode_ids, timestep_ids)
+            ):
+                episode_length = self.episode_lengths[episode_id]
+                if time_steps[-1] >= episode_length:
+                    # Adjust timesteps to end at episode termination
+                    offset = time_steps[-1] - episode_length + 1
+                    timestep_ids[i] -= offset
+
             a, r, c = [
                 x[episode_ids[:, None], timestep_ids[:, :-1]]
-                for x in (
-                    self.action,
-                    self.reward,
-                    self.cost,
-                )
+                for x in (self.action, self.reward, self.cost)
             ]
             o = self.observation[episode_ids[:, None], timestep_ids]
             o, next_o = o[:, :-1], o[:, 1:]
-            yield o, next_o, a, r, c
+            done = self.done[episode_ids[:, None], timestep_ids[:, :-1]]
+            yield o, next_o, a, r, c, done
 
     def sample(self, n_batches: int) -> Iterator[TrajectoryData]:
         if self.empty:
             return
         iterator = (
             TrajectoryData(
                 *next(self._sample_batch(self.batch_size, self.sequence_length))
-            )  # type: ignore
+            )
             for _ in range(n_batches)
         )
         if jax.default_backend() == "gpu":
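The new add path consumes one batched environment step at a time: each parallel worker accumulates its own in-progress episode, and _commit_episode copies it into the main arrays when done is raised or max_length is hit, recording the true episode length so that _sample_batch never crosses a termination boundary. A minimal sketch of driving this API follows; the ReplayBuffer keyword arguments mirror the attributes assigned in __init__ and the Transition fields mirror the attribute accesses in add, but neither signature is shown in this diff, so treat both as assumptions.

import numpy as np
from actsafe.actsafe.replay_buffer import ReplayBuffer  # module path inferred from the file path above
from actsafe.rl.trajectory import Transition

num_envs, obs_shape, act_shape = 4, (64, 64, 3), (2,)
# Keyword names follow the attributes set in __init__; the real signature may differ.
buffer = ReplayBuffer(
    capacity=100,
    max_length=1000,
    observation_shape=obs_shape,
    action_shape=act_shape,
    num_rewards=1,
    seed=0,
    batch_size=8,
    sequence_length=16,
)

# One batched step from num_envs parallel workers; worker i owns ongoing episode i.
# Keyword construction assumes Transition is a NamedTuple with these fields.
step = Transition(
    observation=np.zeros((num_envs,) + obs_shape, dtype=np.uint8),
    next_observation=np.zeros((num_envs,) + obs_shape, dtype=np.uint8),
    action=np.zeros((num_envs,) + act_shape, dtype=np.float32),
    reward=np.zeros(num_envs, dtype=np.float32),
    cost=np.zeros(num_envs, dtype=np.float32),
    done=np.array([False, False, True, False]),  # worker 2 terminates, so its episode is committed
)
buffer.add(step)  # in practice many such steps are added before sampling

# sample() now also carries the per-step done flags yielded by _sample_batch.
for batch in buffer.sample(n_batches=1):
    o, next_o, a, r, c, done = batch  # assumes TrajectoryData gained a done field in this commit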

actsafe/configs/config.yaml (+1 -1)
@@ -39,7 +39,7 @@ training:
   safety_budget: 25
   seed: 0
   time_limit: 1000
-  episodes_per_epoch: 5
+  steps_per_epoch: 5000
   epochs: 200
   action_repeat: 1
   render_episodes: 0
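With time_limit: 1000 unchanged in the same training block, steps_per_epoch: 5000 keeps the per-epoch data budget of the previous episodes_per_epoch: 5 (5 episodes x 1000 steps = 5000 steps); this equivalence is inferred from the surrounding values rather than stated in the commit.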

actsafe/configs/experiment/debug.yaml (+1 -1)
@@ -8,7 +8,7 @@ writers:
 
 training:
   epochs: 2
-  episodes_per_epoch: 1
+  steps_per_epoch: 500
   time_limit: 100
   action_repeat: 2
   parallel_envs: 5

actsafe/rl/acting.py (+36 -50)
@@ -3,74 +3,60 @@
 
 from actsafe.rl.episodic_async_env import EpisodicAsync
 from actsafe.rl.epoch_summary import EpochSummary
-from actsafe.rl.trajectory import Trajectory, TrajectoryData, Transition
+from actsafe.rl.trajectory import Trajectory, Transition
 from actsafe.rl.types import Agent
 
 
-def _summarize_episodes(
-    trajectory: TrajectoryData,
-) -> tuple[float, float]:
-    reward = float(trajectory.reward.sum(1).mean())
-    cost = float(trajectory.cost.sum(1).mean())
-    return reward, cost
-
-
 def interact(
     agent: Agent,
     environment: EpisodicAsync,
-    num_episodes: int,
+    num_steps: int,
     train: bool,
     step: int,
     render_episodes: int = 0,
 ) -> tuple[list[Trajectory], int]:
     observations = environment.reset()
-    episode_count = 0
     episodes: list[Trajectory] = []
-    trajectory = Trajectory()
-    with tqdm(
-        total=num_episodes,
-        unit=f"Episode (✕ {environment.num_envs} parallel)",
-    ) as pbar:
-        while episode_count < num_episodes:
-            render = render_episodes > 0
-            if render:
-                trajectory.frames.append(environment.render())
-            actions = agent(observations, train)
-            next_observations, rewards, done, infos = environment.step(actions)
-            costs = np.array([info.get("cost", 0) for info in infos])
-            transition = Transition(
-                observations, next_observations, actions, rewards, costs
-            )
-            trajectory.transitions.append(transition)
-            agent.observe_transition(transition)
-            observations = next_observations
-            if done.any():
-                assert (
-                    done.all()
-                ), "No support for environments with different ending conditions"
-                np_trajectory = trajectory.as_numpy()
-                step += (
-                    int(np.prod(np_trajectory.cost.shape))
-                    * environment.action_repeat
-                )
-                if train:
-                    agent.observe(np_trajectory)
-                reward, cost = _summarize_episodes(np_trajectory)
-                pbar.set_postfix({"reward": reward, "cost": cost})
-                if render:
-                    render_episodes = max(render_episodes - 1, 0)
+    trajectories = [Trajectory() for _ in range(environment.num_envs)]
+    track_rewards = np.zeros(environment.num_envs)
+    track_costs = np.zeros(environment.num_envs)
+    pbar = tqdm(
+        range(0, num_steps, environment.action_repeat * environment.num_envs),
+        unit=f"Steps (✕ {environment.num_envs} parallel)",
+    )
+    for _ in pbar:
+        render = render_episodes > 0
+        if render:
+            images = environment.render()
+            for i, trajectory in enumerate(trajectories):
+                trajectory.frames.append(images[i])
+        actions = agent(observations, train)
+        next_observations, rewards, done, infos = environment.step(actions)
+        costs = np.array([info.get("cost", 0) for info in infos])
+        transition = Transition(
+            observations, next_observations, actions, rewards, costs, done
+        )
+        for i, trajectory in enumerate(trajectories):
+            trajectory.transitions.append(Transition(*map(lambda x: x[i], transition)))
+        agent.observe_transition(transition)
+        observations = next_observations
+        step += environment.action_repeat
+        track_rewards += rewards * (~done)
+        track_costs += costs * (~done)
+        pbar.set_postfix({"reward": track_rewards.mean(), "cost": track_costs.mean()})
+        if render:
+            render_episodes = max(render_episodes - done.any(), 0)
+        for i, (ep_done, trajectory) in enumerate(zip(done, trajectories)):
+            if ep_done:
                 episodes.append(trajectory)
-                trajectory = Trajectory()
-                pbar.update(1)
-                episode_count += 1
-                observations = environment.reset()
+                trajectories[i] = Trajectory()
     return episodes, step
 
 
 def epoch(
     agent: Agent,
     env: EpisodicAsync,
-    num_episodes: int,
+    num_steps: int,
     train: bool,
     step: int,
     render_episodes: int = 0,
@@ -79,7 +65,7 @@ def epoch(
     samples, step = interact(
         agent,
         env,
-        num_episodes,
+        num_steps,
         train,
         step,
         render_episodes,
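interact (and epoch, which forwards to it) is now budgeted by environment steps rather than by episode count, and only episodes that terminate within that budget are returned. A rough sketch of the new call pattern follows; make_agent, make_env, epochs, and steps_per_epoch are hypothetical placeholders for however the training script actually builds these objects and reads the config.

from actsafe.rl.acting import interact

agent = make_agent()  # hypothetical: any object satisfying actsafe.rl.types.Agent
env = make_env()      # hypothetical: an EpisodicAsync vectorized environment
global_step = 0
for _ in range(epochs):  # e.g. training.epochs from the config
    episodes, global_step = interact(
        agent,
        env,
        num_steps=steps_per_epoch,  # replaces the old num_episodes argument
        train=True,
        step=global_step,
    )
    # `episodes` holds only the trajectories that reached done within this
    # step budget; unfinished per-worker trajectories are not returned.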
