diff --git a/environments/community/bash_env/README.md b/environments/community/bash_env/README.md
new file mode 100644
index 000000000..300ea3a63
--- /dev/null
+++ b/environments/community/bash_env/README.md
@@ -0,0 +1,128 @@
+# NL2Bash Generation Environment
+
+Train LLMs to translate natural language instructions into Bash commands.
+
+## Overview
+
+This environment uses the [NL2SH-ALFA](https://huggingface.co/datasets/westenfelder/NL2SH-ALFA) dataset to train language models on natural language to Bash translation. Commands are verified by **string matching** against gold standard commands.
+
+## Dataset
+
+- **Source**: [westenfelder/NL2SH-ALFA](https://huggingface.co/datasets/westenfelder/NL2SH-ALFA)
+- **Paper**: [LLM-Supported Natural Language to Bash Translation](https://arxiv.org/abs/2502.06858) (NAACL 2025)
+- **Training Set**: 40,939 instruction-command pairs
+- **Test Set**: 300 manually verified pairs with alternative commands and difficulty levels
+
+### Sample Data
+
+```json
+{
+  "nl": "find all files in the current directory with the extension .txt and delete them",
+  "bash": "find . -name \"*.txt\" -delete",
+  "bash2": "find . -type f -name \"*.txt\" -exec rm {} +",
+  "difficulty": 1
+}
+```
+
+## Usage
+
+### Training Mode (with API Server)
+
+```bash
+# Terminal 1: Start the Atropos API
+run-api
+
+# Terminal 2: Run the environment
+python bash_env.py serve --slurm False
+```
+
+### Local Testing (without API)
+
+```bash
+python bash_env.py process --env.data_path_to_save_groups bash_output.jsonl
+```
+
+This generates `bash_output.jsonl` and `bash_output.html` for inspection.
+
+### With Local vLLM Server
+
+```bash
+python bash_env.py process \
+    --env.data_path_to_save_groups bash_output.jsonl \
+    --openai.base_url http://localhost:9001/v1 \
+    --openai.model_name YOUR_MODEL_NAME
+```
+
+## Reward Function
+
+| Score | Condition |
+|-------|-----------|
+| **1.0** | Generated command matches gold or alternative (exact or normalized) |
+| **-1.0** | Command does not match or could not be extracted |
+
+String matching is used instead of execution-based verification because:
+1. Bash execution without sandboxing is unsafe
+2. Many commands have side effects (file creation/deletion, network calls)
+3. The dataset was designed for string-based evaluation
+
+## Prompt Format
+
+The model receives a natural language instruction:
+
+```
+Instruction: find all files in the current directory with the extension .txt and delete them
+```
+
+Output should be in boxed format:
+```
+<think>
+[Chain of thought reasoning]
+</think>
+
+\boxed{find . -name "*.txt" -delete}
+```
+
+## Unit Tests
+
+```bash
+# Run unit tests
+python -m pytest test_bash_utils.py -v
+```
+
+Tests cover:
+- Bash command normalization
+- `\boxed{}` extraction patterns
+- String matching with alternatives
+- Basic syntax validation
+
+## Integration Test
+
+```bash
+# Run with a local vLLM server
+python test_integration.py --base_url http://localhost:8000/v1 --model Qwen/Qwen3-8B
+
+# Test on training set instead
+python test_integration.py --base_url http://localhost:8000/v1 --model Qwen/Qwen3-8B --use_train
+```
+
+The test reports overall accuracy and difficulty-stratified accuracy (easy/medium/hard).
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `bash_env.py` | Main environment implementation |
+| `bash_utils.py` | Bash command processing utilities |
+| `nl2bash_loader.py` | NL2SH-ALFA dataset loader |
+| `test_bash_utils.py` | Unit tests for utilities |
+| `test_integration.py` | LLM integration test |
+
+## Evaluation Metrics
+
+The environment logs the following metrics to WandB:
+
+- `train/percent_correct` - Training accuracy
+- `eval/percent_correct` - Overall test accuracy
+- `eval/accuracy_easy` - Accuracy on easy problems (difficulty=0)
+- `eval/accuracy_medium` - Accuracy on medium problems (difficulty=1)
+- `eval/accuracy_hard` - Accuracy on hard problems (difficulty=2)
diff --git a/environments/community/bash_env/bash_env.py b/environments/community/bash_env/bash_env.py
new file mode 100644
index 000000000..a4817dc93
--- /dev/null
+++ b/environments/community/bash_env/bash_env.py
@@ -0,0 +1,406 @@
+"""
+NL2Bash Generation Environment for Atropos
+
+Trains LLMs to translate natural language instructions into Bash commands.
+Uses the NL2SH-ALFA dataset (NAACL 2025) with string-based verification.
+"""
+
+import random
+from typing import Dict, List, Optional, Tuple, TypedDict, Union
+
+from bash_utils import commands_match, extract_boxed_bash
+from nl2bash_loader import load_nl2bash_split
+from tqdm.asyncio import tqdm_asyncio
+
+from atroposlib.envs.base import (
+    APIServerConfig,
+    BaseEnv,
+    BaseEnvConfig,
+    ScoredDataGroup,
+)
+from atroposlib.type_definitions import Item
+
+# System prompt following the established Atropos pattern
+system_prompt = (
+    "You are a deep thinking AI, you may use extremely long chains of thought "
+    "to deeply consider the problem and deliberate with yourself via systematic "
+    "reasoning processes to help come to a correct solution prior to answering. "
+    "You should enclose your thoughts and internal monologue inside <think> </think> "
+    "tags, and then provide your solution or response to the problem.\n\n"
+)
+
+system_prompt += """You are a Bash command expert. Given a natural language instruction,
+generate the appropriate Bash command.
+
+You are allocated a maximum of 1024 tokens, please strive to use less.
+
+Provide your Bash command inside \\boxed{} like this: \\boxed{find . -name "*.txt"}
+
+Important:
+- Generate a single, complete Bash command
+- Do not include explanatory text outside of <think> tags
+- Ensure your command is valid Bash syntax
+
+So please end your answer with \\boxed{your bash command here}"""
+
+
+class NL2BashItem(TypedDict):
+    """Type definition for a NL2Bash dataset item."""
+
+    nl: str
+    bash: str
+    bash2: Optional[str]
+    difficulty: Optional[int]
+
+
+def format_instruction(nl: str) -> str:
+    """Format the natural language instruction for the prompt."""
+    return f"Instruction: {nl}"
+
+
+class BashEnv(BaseEnv):
+    """
+    Environment for training LLMs to generate Bash commands.
+
+    Uses the NL2SH-ALFA dataset and verifies correctness
+    by string matching against gold commands.
+    """
+
+    name = "nl2bash"
+
+    def __init__(
+        self,
+        config: BaseEnvConfig,
+        server_configs: List[APIServerConfig],
+        slurm=True,
+        testing=False,
+    ):
+        super().__init__(config, server_configs, slurm, testing)
+        self.percent_correct_buffer = list()
+        self.eval_metrics = list()
+        # Track accuracy by difficulty level (0=easy, 1=medium, 2=hard)
+        self.difficulty_correct = {0: [], 1: [], 2: []}
+
+    @classmethod
+    def config_init(cls) -> Tuple[BaseEnvConfig, List[APIServerConfig]]:
+        """Initialize default configuration for the environment."""
+        env_config = BaseEnvConfig(
+            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+            group_size=8,
+            use_wandb=True,
+            rollout_server_url="http://localhost:8000",
+            total_steps=1000,
+            batch_size=12,
+            steps_per_eval=100,
+            max_token_length=1024,
+            wandb_name="nl2bash",
+        )
+        server_configs = [
+            APIServerConfig(
+                model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+                base_url="http://localhost:9001/v1",
+                api_key="x",
+                num_requests_for_eval=256,
+            ),
+        ]
+        return env_config, server_configs
+
+    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+        """Log custom metrics to WandB."""
+        if wandb_metrics is None:
+            wandb_metrics = {}
+
+        # Log percent correct
+        try:
+            wandb_metrics["train/percent_correct"] = sum(
+                self.percent_correct_buffer
+            ) / len(self.percent_correct_buffer)
+        except ZeroDivisionError:
+            pass
+
+        self.percent_correct_buffer = list()
+
+        for item in self.eval_metrics:
+            wandb_metrics[item[0]] = item[1]
+        self.eval_metrics = list()
+
+        await super().wandb_log(wandb_metrics)
+
+    async def setup(self):
+        """Load the NL2SH-ALFA dataset and prepare train/test splits."""
+        # Load training data
+        print("Loading NL2SH-ALFA training data...")
+        self.train = load_nl2bash_split("train")
+        print(f"Loaded {len(self.train)} training examples")
+
+        # Load test data
+        print("Loading NL2SH-ALFA test data...")
+        self.test = load_nl2bash_split("test")
+        print(f"Loaded {len(self.test)} test examples")
+
+        random.shuffle(self.train)
+        self.iter = 0
+
+    def save_checkpoint(self, step, data=None):
+        """Save checkpoint with iteration state."""
+        if data is None:
+            data = {}
+        data["iter"] = self.iter
+        super().save_checkpoint(step, data)
+
+    def _score_bash(
+        self,
+        generated_bash: str,
+        gold_bash: str,
+        alt_bash: Optional[str] = None,
+    ) -> float:
+        """
+        Score generated Bash command by string matching.
+
+        Returns:
+            1.0 if command matches gold or alternative
+            -1.0 if incorrect or malformed
+        """
+        if not generated_bash:
+            return -1.0
+
+        if commands_match(generated_bash, gold_bash, alt_bash):
+            return 1.0
+        else:
+            return -1.0
+
+    async def rollout_and_score_eval(
+        self,
+        nl: str,
+        gold_bash: str,
+        alt_bash: Optional[str],
+        difficulty: Optional[int],
+    ) -> dict:
+        """Rollout and score a single evaluation item."""
+        user_content = format_instruction(nl)
+
+        async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+            completion = await managed.chat_completion(
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_content},
+                ],
+                n=1,
+                max_tokens=self.config.max_token_length,
+                temperature=0.6,
+            )
+            response_content = completion.choices[0].message.content
+
+        # Extract and score generated Bash
+        generated_bash = extract_boxed_bash(response_content)
+        score = self._score_bash(generated_bash, gold_bash, alt_bash)
+        correct = score == 1.0
+
+        sample = {
+            "messages": [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_content},
+                {"role": "assistant", "content": response_content},
+            ],
+            "instruction": nl,
+            "gold_bash": gold_bash,
+            "alt_bash": alt_bash,
+            "generated_bash": generated_bash,
+            "score": 1 if correct else 0,
+            "correct": correct,
+            "difficulty": difficulty,
+            "finish_reason": completion.choices[0].finish_reason,
+        }
+
+        return {
+            "score": 1 if correct else 0,
+            "sample": sample,
+            "difficulty": difficulty,
+        }
+
+    async def evaluate(self, *args, **kwargs):
+        """Run evaluation on test set."""
+        import time
+
+        start_time = time.time()
+
+        eval_tasks = []
+        # Evaluate on all 300 test items (small enough to do full eval)
+        for item in self.test:
+            eval_tasks.append(
+                self.rollout_and_score_eval(
+                    item["nl"],
+                    item["bash"],
+                    item.get("bash2"),
+                    item.get("difficulty"),
+                )
+            )
+        results = await tqdm_asyncio.gather(*eval_tasks)
+
+        scores = [result["score"] for result in results]
+        samples = [result["sample"] for result in results]
+
+        percent_correct = sum(scores) / len(scores) if scores else 0
+
+        # Calculate difficulty-stratified accuracy
+        difficulty_scores = {0: [], 1: [], 2: []}
+        for result in results:
+            diff = result.get("difficulty")
+            if diff is not None and diff in difficulty_scores:
+                difficulty_scores[diff].append(result["score"])
+
+        end_time = time.time()
+
+        self.eval_metrics.append(("eval/percent_correct", percent_correct))
+
+        eval_metrics = {
+            "eval/percent_correct": percent_correct,
+        }
+
+        # Add difficulty-stratified metrics
+        difficulty_names = {0: "easy", 1: "medium", 2: "hard"}
+        for diff, name in difficulty_names.items():
+            if difficulty_scores[diff]:
+                accuracy = sum(difficulty_scores[diff]) / len(difficulty_scores[diff])
+                eval_metrics[f"eval/accuracy_{name}"] = accuracy
+                self.eval_metrics.append((f"eval/accuracy_{name}", accuracy))
+
+        await self.evaluate_log(
+            metrics=eval_metrics,
+            samples=samples,
+            start_time=start_time,
+            end_time=end_time,
+            generation_parameters={
+                "temperature": 0.6,
+                "max_tokens": self.config.max_token_length,
+            },
+        )
+
+    async def collect_trajectories(
+        self, item: NL2BashItem
+    ) -> Tuple[ScoredDataGroup, list[Item]]:
+        """Generate Bash commands for a given instruction."""
+        user_content = format_instruction(item["nl"])
+        user_message = {"role": "user", "content": user_content}
+
+        async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+            chat_completions = await managed.chat_completion(
+                messages=[{"role": "system", "content": system_prompt}, user_message],
+                n=self.config.group_size,
+                max_tokens=self.config.max_token_length,
+                temperature=1.0,
+            )
+
+            try:
+                state = managed.get_state()
+                nodes = state["nodes"]
+            except AttributeError:
+                # Fallback for OpenAIServer which doesn't track state
+                nodes = []
+                for choice in chat_completions.choices:
+                    content = choice.message.content
+                    if self.tokenizer:
+                        tokens = self.tokenizer.encode(content)
+
+                        # Create dummy node-like object
+                        class Node:
+                            def __init__(self, t):
+                                self.tokens = t
+                                self.masked_tokens = t
+                                self.logprobs = [0.0] * len(t)
+
+                        nodes.append(Node(tokens))
+                    else:
+                        nodes.append(None)
+
+        to_score = list()
+        to_backlog = list()
+
+        for i, chat_completion in enumerate(chat_completions.choices):
+            messages = [
+                {"role": "system", "content": system_prompt},
+                user_message,
+                {"role": "assistant", "content": chat_completion.message.content},
+            ]
+            to_score.append(
+                {
+                    "messages": messages,
+                    "gold_bash": item["bash"],
+                    "alt_bash": item.get("bash2"),
+                    "finish_reason": chat_completion.finish_reason,
+                    "tokens": nodes[i].tokens,
+                    "masks": nodes[i].masked_tokens,
+                    "logprobs": nodes[i].logprobs,
+                }
+            )
+
+        to_postprocess = await self.score(to_score)
+        return to_postprocess, to_backlog
+
+    async def score(
+        self, rollout_group_data
+    ) -> Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]]:
+        """Score generated Bash commands by string matching."""
+        scores = ScoredDataGroup()
+
+        # If all scores are the same, return None (no training signal)
+        # if len(set(scores["scores"])) == 1:
+        #     return None
+
+        # Add messages to scores to avoid reconstruction from tokens
+        scores["messages"] = [
+            item["messages"]
+            for item in rollout_group_data
+            if len([1 for i in item["masks"] if i != -100]) >= 10
+        ]
+        # Align messages with the filtered tokens/scores
+        # Note: The loop above filtered items < 10 masks.
+        # We need to ensure messages list matches tokens list length and order
+
+        # Redo the loop to be safe and cleaner
+        scores["tokens"] = list()
+        scores["masks"] = list()
+        scores["scores"] = list()
+        scores["inference_logprobs"] = list()
+        scores["messages"] = list()
+
+        # Get gold info from first item (all items in group have same gold)
+        gold_bash = rollout_group_data[0]["gold_bash"]
+        alt_bash = rollout_group_data[0].get("alt_bash")
+
+        for item in rollout_group_data:
+            response_content = item["messages"][-1]["content"]
+            generated_bash = extract_boxed_bash(response_content)
+            reward = self._score_bash(generated_bash, gold_bash, alt_bash)
+
+            tokens = item["tokens"]
+            masks = item["masks"]
+            logprobs = item["logprobs"]
+
+            # Remove obviously bad examples (very short)
+            # if len([1 for i in masks if i != -100]) < 10:
+            #     continue
+
+            scores["tokens"].append(tokens)
+            scores["masks"].append(masks)
+            scores["inference_logprobs"].append(logprobs)
+            scores["scores"].append(reward)
+            scores["messages"].append(item["messages"])
+
+            if len(scores["tokens"]) >= self.config.group_size:
+                break
+
+        for score in scores["scores"]:
+            self.percent_correct_buffer.append(max(score, 0))
+
+        return scores
+
+    async def get_next_item(self) -> NL2BashItem:
+        """Get the next training item."""
+        next_item = self.train[self.iter % len(self.train)]
+        self.iter += 1
+        return next_item
+
+
+if __name__ == "__main__":
+    BashEnv.cli()
diff --git a/environments/community/bash_env/bash_utils.py b/environments/community/bash_env/bash_utils.py
new file mode 100644
index 000000000..3afe1676d
--- /dev/null
+++ b/environments/community/bash_env/bash_utils.py
@@ -0,0 +1,143 @@
+"""
+Bash Command Utilities
+
+Provides utilities for processing and comparing Bash commands.
+Used by the NL2Bash Environment for reward verification.
+"""
+
+import re
+import shlex
+from typing import Optional
+
+
+def normalize_bash(cmd: str) -> str:
+    """
+    Normalize a bash command for comparison.
+
+    Normalizations applied:
+    - Strip leading/trailing whitespace
+    - Normalize internal whitespace (collapse multiple spaces)
+    - Handle common quoting variations
+
+    Args:
+        cmd: Raw bash command string
+
+    Returns:
+        Normalized command string
+    """
+    if not cmd:
+        return ""
+
+    # Strip whitespace
+    cmd = cmd.strip()
+
+    # Normalize internal whitespace
+    cmd = re.sub(r"\s+", " ", cmd)
+
+    return cmd
+
+
+def extract_boxed_bash(text: str) -> Optional[str]:
+    """
+    Extract Bash command from \\boxed{} format in LLM response.
+
+    Args:
+        text: LLM response text
+
+    Returns:
+        Extracted Bash command string, or None if not found
+    """
+    if not text:
+        return None
+
+    # Try to find \boxed{...} pattern
+    # Handle both \\boxed{} and \boxed{} formats
+    patterns = [
+        r"\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}",  # Handles nested braces
+        r"\\boxed\{(.+?)\}",  # Simple pattern
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, text, re.DOTALL)
+        if match:
+            bash_cmd = match.group(1).strip()
+            if bash_cmd:
+                return bash_cmd
+
+    return None
+
+
+def commands_match(
+    generated: str,
+    gold: str,
+    alt_gold: Optional[str] = None,
+) -> bool:
+    """
+    Check if generated command matches gold or alternative.
+
+    Comparison strategy:
+    1. Exact match against gold or alt_gold
+    2. Normalized match (whitespace, etc.)
+
+    Args:
+        generated: Generated bash command
+        gold: Primary gold bash command
+        alt_gold: Optional alternative gold command (from bash2 field)
+
+    Returns:
+        True if commands match, False otherwise
+    """
+    if not generated:
+        return False
+
+    # Normalize all commands
+    gen_norm = normalize_bash(generated)
+    gold_norm = normalize_bash(gold)
+
+    # Check primary gold
+    if gen_norm == gold_norm:
+        return True
+
+    # Check exact match (in case normalization removes something)
+    if generated.strip() == gold.strip():
+        return True
+
+    # Check alternative gold if provided
+    if alt_gold:
+        alt_norm = normalize_bash(alt_gold)
+        if gen_norm == alt_norm:
+            return True
+        if generated.strip() == alt_gold.strip():
+            return True
+
+    return False
+
+
+def is_valid_bash_syntax(cmd: str) -> bool:
+    """
+    Perform a basic syntax check on a bash command.
+
+    This is a lightweight check that catches obvious issues
+    without actually executing the command.
+
+    Args:
+        cmd: Bash command to check
+
+    Returns:
+        True if command appears syntactically valid
+    """
+    if not cmd or not cmd.strip():
+        return False
+
+    # Check for unclosed quotes
+    try:
+        shlex.split(cmd)
+    except ValueError:
+        return False
+
+    # Check for obviously incomplete commands
+    cmd_stripped = cmd.strip()
+    if cmd_stripped.endswith(("&&", "||", "|", ";")):
+        return False
+
+    return True
diff --git a/environments/community/bash_env/nl2bash_loader.py b/environments/community/bash_env/nl2bash_loader.py
new file mode 100644
index 000000000..987a3ae1f
--- /dev/null
+++ b/environments/community/bash_env/nl2bash_loader.py
@@ -0,0 +1,97 @@
+"""
+NL2Bash Data Loader
+
+Loads the NL2SH-ALFA dataset from HuggingFace for training LLMs
+to translate natural language to Bash commands.
+
+Dataset: westenfelder/NL2SH-ALFA (NAACL 2025)
+Paper: "LLM-Supported Natural Language to Bash Translation"
+"""
+
+from typing import Any, Dict, List, Optional
+
+from datasets import load_dataset
+
+
+def load_nl2bash_split(
+    split: str = "train",
+    limit: Optional[int] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load a split of the NL2SH-ALFA dataset.
+
+    Args:
+        split: One of 'train' or 'test'
+        limit: Optional limit on number of examples to load
+
+    Returns:
+        List of dictionaries with:
+        - nl: Natural language instruction
+        - bash: Gold bash command
+        - bash2: Alternative bash command (test only)
+        - difficulty: Difficulty level 0-2 (test only)
+
+    Note: NL2SH-ALFA uses the config parameter (not split parameter) to select
+    train vs test data. Both configs use split="train" internally.
+    """
+    if split not in ("train", "test"):
+        raise ValueError(f"Split must be 'train' or 'test', got: {split}")
+
+    # Load dataset - config parameter selects train/test, split is always "train"
+    print(f"Loading NL2SH-ALFA {split} data from HuggingFace...")
+    dataset = load_dataset("westenfelder/NL2SH-ALFA", split, split="train")
+
+    # Convert to list of dicts
+    data = []
+    for i, item in enumerate(dataset):
+        if limit and i >= limit:
+            break
+
+        entry = {
+            "nl": item["nl"],
+            "bash": item["bash"],
+        }
+
+        # Test set has additional fields
+        if split == "test":
+            entry["bash2"] = item.get("bash2")
+            entry["difficulty"] = item.get("difficulty", 1)
+
+        data.append(entry)
+
+    print(f"Loaded {len(data)} {split} examples")
+    return data
+
+
+def load_nl2bash() -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Load the full NL2SH-ALFA dataset.
+
+    Returns:
+        Dictionary with 'train' and 'test' splits
+    """
+    return {
+        "train": load_nl2bash_split("train"),
+        "test": load_nl2bash_split("test"),
+    }
+
+
+if __name__ == "__main__":
+    # Test the loader
+    print("Testing NL2Bash loader...")
+
+    print("\n--- Training Set ---")
+    train = load_nl2bash_split("train", limit=3)
+    for i, item in enumerate(train):
+        print(f"\nExample {i+1}:")
+        print(f"  NL: {item['nl']}")
+        print(f"  Bash: {item['bash']}")
+
+    print("\n--- Test Set ---")
+    test = load_nl2bash_split("test", limit=3)
+    for i, item in enumerate(test):
+        print(f"\nExample {i+1}:")
+        print(f"  NL: {item['nl']}")
+        print(f"  Bash: {item['bash']}")
+        print(f"  Bash2: {item.get('bash2', 'N/A')}")
+        print(f"  Difficulty: {item.get('difficulty', 'N/A')}")
diff --git a/environments/community/bash_env/test_bash_utils.py b/environments/community/bash_env/test_bash_utils.py
new file mode 100644
index 000000000..0d5875f9c
--- /dev/null
+++ b/environments/community/bash_env/test_bash_utils.py
@@ -0,0 +1,163 @@
+"""
+Unit tests for Bash command utilities.
+"""
+
+import pytest
+from bash_utils import (
+    commands_match,
+    extract_boxed_bash,
+    is_valid_bash_syntax,
+    normalize_bash,
+)
+
+
+class TestNormalizeBash:
+    """Tests for normalize_bash function."""
+
+    def test_strip_whitespace(self):
+        """Test that leading/trailing whitespace is stripped."""
+        assert normalize_bash("  ls -la  ") == "ls -la"
+        assert normalize_bash("\tcd /tmp\n") == "cd /tmp"
+
+    def test_normalize_internal_whitespace(self):
+        """Test that internal whitespace is collapsed."""
+        assert normalize_bash("ls   -la") == "ls -la"
+        assert normalize_bash("find  .  -name  '*.txt'") == "find . -name '*.txt'"
+
+    def test_empty_string(self):
+        """Test empty string handling."""
+        assert normalize_bash("") == ""
+        assert normalize_bash("   ") == ""
+
+    def test_preserves_quoted_content(self):
+        """Test that quoted content is preserved."""
+        cmd = 'echo "hello   world"'
+        # Note: internal whitespace in quotes is NOT normalized by the simple regex
+        # This is expected behavior - we're normalizing command structure, not content
+        result = normalize_bash(cmd)
+        assert "echo" in result
+
+
+class TestExtractBoxedBash:
+    """Tests for extract_boxed_bash function."""
+
+    def test_simple_boxed(self):
+        """Test extraction from simple boxed format."""
+        text = "Here is the command: \\boxed{ls -la}"
+        assert extract_boxed_bash(text) == "ls -la"
+
+    def test_boxed_with_braces(self):
+        """Test extraction when command contains braces."""
+        text = "\\boxed{find . -name '*.txt' -exec rm {} \\;}"
+        result = extract_boxed_bash(text)
+        assert result is not None
+        assert "find" in result
+
+    def test_boxed_at_end(self):
+        """Test extraction when boxed is at the end."""
+        text = "<think>I need to list files</think>\n\\boxed{ls -la /tmp}"
+        assert extract_boxed_bash(text) == "ls -la /tmp"
+
+    def test_multiline_content(self):
+        """Test extraction with multiline thinking."""
+        text = """<think>
+        Let me think about this...
+        I need to find all text files.
+        </think>
+
+        \\boxed{find . -name "*.txt"}"""
+        result = extract_boxed_bash(text)
+        assert result == 'find . -name "*.txt"'
+
+    def test_no_boxed(self):
+        """Test when no boxed format is present."""
+        text = "Just run: ls -la"
+        assert extract_boxed_bash(text) is None
+
+    def test_empty_boxed(self):
+        """Test empty boxed content."""
+        text = "\\boxed{}"
+        assert extract_boxed_bash(text) is None
+
+    def test_none_input(self):
+        """Test None input."""
+        assert extract_boxed_bash(None) is None
+
+    def test_empty_input(self):
+        """Test empty string input."""
+        assert extract_boxed_bash("") is None
+
+
+class TestCommandsMatch:
+    """Tests for commands_match function."""
+
+    def test_exact_match(self):
+        """Test exact string match."""
+        assert commands_match("ls -la", "ls -la") is True
+
+    def test_match_with_whitespace_diff(self):
+        """Test match ignoring whitespace differences."""
+        assert commands_match("ls  -la", "ls -la") is True
+        assert commands_match("  ls -la  ", "ls -la") is True
+
+    def test_no_match(self):
+        """Test non-matching commands."""
+        assert commands_match("ls -la", "ls -l") is False
+        assert commands_match("cat file.txt", "cat other.txt") is False
+
+    def test_alt_gold_match(self):
+        """Test matching against alternative gold command."""
+        generated = "find . -type f -name '*.txt' -delete"
+        gold = "find . -name '*.txt' -delete"
+        alt_gold = "find . -type f -name '*.txt' -delete"
+        assert commands_match(generated, gold, alt_gold) is True
+
+    def test_empty_generated(self):
+        """Test empty generated command."""
+        assert commands_match("", "ls -la") is False
+        assert commands_match(None, "ls -la") is False
+
+    def test_with_quotes(self):
+        """Test commands with different quoting styles."""
+        # Exact match should work
+        assert commands_match('echo "hello"', 'echo "hello"') is True
+        # Different quote styles are NOT equivalent
+        assert commands_match("echo 'hello'", 'echo "hello"') is False
+
+
+class TestIsValidBashSyntax:
+    """Tests for is_valid_bash_syntax function."""
+
+    def test_valid_simple_command(self):
+        """Test valid simple commands."""
+        assert is_valid_bash_syntax("ls -la") is True
+        assert is_valid_bash_syntax("cd /tmp") is True
+        assert is_valid_bash_syntax("echo hello") is True
+
+    def test_valid_complex_command(self):
+        """Test valid complex commands."""
+        # Note: shlex.split has trouble with find -exec {} \; patterns
+        # Test with simpler complex commands that still validate properly
+        assert is_valid_bash_syntax('grep -r "pattern" /path') is True
+        assert is_valid_bash_syntax("ls -la | grep test") is True
+
+    def test_invalid_unclosed_quote(self):
+        """Test detection of unclosed quotes."""
+        assert is_valid_bash_syntax('echo "hello') is False
+        assert is_valid_bash_syntax("echo 'world") is False
+
+    def test_invalid_trailing_operator(self):
+        """Test detection of trailing operators."""
+        assert is_valid_bash_syntax("ls -la &&") is False
+        assert is_valid_bash_syntax("cat file |") is False
+        assert is_valid_bash_syntax("echo test;") is False
+
+    def test_empty_command(self):
+        """Test empty commands."""
+        assert is_valid_bash_syntax("") is False
+        assert is_valid_bash_syntax("   ") is False
+        assert is_valid_bash_syntax(None) is False
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/environments/community/bash_env/test_integration.py b/environments/community/bash_env/test_integration.py
new file mode 100644
index 000000000..e0efec94e
--- /dev/null
+++ b/environments/community/bash_env/test_integration.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""
+Integration test for NL2Bash Environment that works with OpenAI-compatible APIs.
+
+This test verifies:
+1. NL2SH-ALFA dataset loading
+2. Bash command generation from LLM
+3. Bash extraction from \\boxed{}
+4. String matching verification
+5. Scoring logic
+"""
+
+import asyncio
+import json
+import random
+from typing import Optional
+
+import openai
+
+# Import local modules
+from bash_utils import commands_match, extract_boxed_bash
+from nl2bash_loader import load_nl2bash_split
+
+# System prompt from the environment
+SYSTEM_PROMPT = (
+    "You are a deep thinking AI, you may use extremely long chains of thought "
+    "to deeply consider the problem and deliberate with yourself via systematic "
+    "reasoning processes to help come to a correct solution prior to answering. "
+    "You should enclose your thoughts and internal monologue inside <think> </think> "
+    "tags, and then provide your solution or response to the problem.\n\n"
+    "You are a Bash command expert. Given a natural language instruction, "
+    "generate the appropriate Bash command.\n\n"
+    "You are allocated a maximum of 1024 tokens, please strive to use less.\n\n"
+    "Provide your Bash command inside \\boxed{} like this: "
+    '\\boxed{find . -name "*.txt"}\n\n'
+    "Important:\n"
+    "- Generate a single, complete Bash command\n"
+    "- Do not include explanatory text outside of <think> tags\n"
+    "- Ensure your command is valid Bash syntax\n\n"
+    "So please end your answer with \\boxed{your bash command here}"
+)
+
+
+def format_instruction(nl: str) -> str:
+    """Format the natural language instruction for the prompt."""
+    return f"Instruction: {nl}"
+
+
+def score_bash(
+    generated_bash: str,
+    gold_bash: str,
+    alt_bash: Optional[str] = None,
+) -> dict:
+    """Score bash by string matching."""
+    result = {
+        "generated_bash": generated_bash,
+        "gold_bash": gold_bash,
+        "alt_bash": alt_bash,
+        "score": -1.0,
+        "match": False,
+        "error": None,
+    }
+
+    if not generated_bash:
+        result["error"] = "No Bash command extracted from response"
+        return result
+
+    if commands_match(generated_bash, gold_bash, alt_bash):
+        result["score"] = 1.0
+        result["match"] = True
+    else:
+        result["error"] = "Command does not match gold or alternative"
+
+    return result
+
+
+async def test_single_item(client, model_name: str, item: dict, item_idx: int) -> dict:
+    """Test a single NL2Bash item."""
+    user_content = format_instruction(item["nl"])
+
+    try:
+        response = await client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_content},
+            ],
+            max_tokens=1024,
+            temperature=0.6,
+        )
+
+        response_content = response.choices[0].message.content
+
+        # Extract Bash
+        generated_bash = extract_boxed_bash(response_content)
+
+        # Score
+        score_result = score_bash(generated_bash, item["bash"], item.get("bash2"))
+
+        return {
+            "item_idx": item_idx,
+            "instruction": item["nl"],
+            "difficulty": item.get("difficulty"),
+            "response": (
+                response_content[:500] + "..."
+                if len(response_content) > 500
+                else response_content
+            ),
+            **score_result,
+        }
+
+    except Exception as e:
+        return {
+            "item_idx": item_idx,
+            "instruction": item["nl"],
+            "error": str(e),
+            "score": -1.0,
+        }
+
+
+async def run_integration_test(
+    base_url: str,
+    model_name: str,
+    api_key: str = "x",
+    num_samples: int = 10,
+    use_test_set: bool = True,
+):
+    """Run the integration test."""
+    print(f"\n{'='*60}")
+    print("NL2Bash Environment Integration Test")
+    print(f"{'='*60}")
+    print(f"Server: {base_url}")
+    print(f"Model: {model_name}")
+    print(f"Samples: {num_samples}")
+    print(f"Split: {'test' if use_test_set else 'train'}")
+    print()
+
+    # Load dataset
+    split = "test" if use_test_set else "train"
+    print(f"Loading NL2SH-ALFA {split} data...")
+    data = load_nl2bash_split(split)
+    print(f"Loaded {len(data)} examples")
+
+    # Initialize OpenAI client
+    client = openai.AsyncClient(
+        base_url=base_url,
+        api_key=api_key,
+        timeout=120.0,
+    )
+
+    # Sample random items
+    if num_samples < len(data):
+        test_items = random.sample(data, num_samples)
+    else:
+        test_items = data
+
+    # Run tests
+    print(f"\nTesting {len(test_items)} samples...\n")
+    results = []
+
+    for i, item in enumerate(test_items):
+        print(f"[{i+1}/{len(test_items)}] Testing: {item['nl'][:60]}...")
+        result = await test_single_item(client, model_name, item, i)
+        results.append(result)
+
+        # Print result
+        if result["score"] == 1.0:
+            print(f"  ✓ CORRECT - {result.get('generated_bash', 'N/A')[:60]}")
+        else:
+            print(f"  ✗ INCORRECT - {result.get('error', 'Unknown error')}")
+            if result.get("generated_bash"):
+                print(f"    Generated: {result['generated_bash'][:60]}")
+            print(f"    Gold: {result.get('gold_bash', 'N/A')[:60]}")
+            if result.get("alt_bash"):
+                print(f"    Alt:  {result.get('alt_bash', '')[:60]}")
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print(f"{'='*60}")
+
+    correct = sum(1 for r in results if r["score"] == 1.0)
+    total = len(results)
+
+    print(f"Overall Accuracy: {correct}/{total} ({100*correct/total:.1f}%)")
+
+    # Difficulty breakdown (for test set)
+    if use_test_set:
+        difficulty_names = {0: "Easy", 1: "Medium", 2: "Hard"}
+        for diff, name in difficulty_names.items():
+            diff_results = [r for r in results if r.get("difficulty") == diff]
+            if diff_results:
+                diff_correct = sum(1 for r in diff_results if r["score"] == 1.0)
+                print(
+                    f"  {name}: {diff_correct}/{len(diff_results)} "
+                    f"({100*diff_correct/len(diff_results):.1f}%)"
+                )
+
+    # Save results
+    output_file = "integration_test_results.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nDetailed results saved to: {output_file}")
+
+    return results
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="NL2Bash Environment Integration Test")
+    parser.add_argument(
+        "--base_url",
+        type=str,
+        default="http://localhost:8000/v1",
+        help="Base URL for OpenAI-compatible API",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Qwen/Qwen3-8B",
+        help="Model name",
+    )
+    parser.add_argument(
+        "--api_key",
+        type=str,
+        default="x",
+        help="API key",
+    )
+    parser.add_argument(
+        "--num_samples",
+        type=int,
+        default=10,
+        help="Number of samples to test",
+    )
+    parser.add_argument(
+        "--use_train",
+        action="store_true",
+        help="Use training set instead of test set",
+    )
+
+    args = parser.parse_args()
+
+    asyncio.run(
+        run_integration_test(
+            base_url=args.base_url,
+            model_name=args.model,
+            api_key=args.api_key,
+            num_samples=args.num_samples,
+            use_test_set=not args.use_train,
+        )
+    )