dbt-labs · joellabes · Feb 24, 2026 · Dec 15, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -77,7 +77,7 @@ jobs:
         run: docker build -t ade-bench-base -f docker/base/Dockerfile.duckdb-dbt .
 
       - name: Run benchmark
-        run: uv run ade run all --agent ${{ matrix.agent }} --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 6 --no-rebuild
+        run: uv run ade run all --agent ${{ matrix.agent }} --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 6 --no-rebuild --plugin-set none
         env:
           USE_DYNAMIC_LOGGING: "FALSE"
           DEFAULT_TEST_TIMEOUT_SEC: "120"

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -50,12 +50,13 @@ uv run scripts_python/run_harness.py --agent sage --task-ids task1 task2
 ```
 
 ### Key Parameters:
-- `--agent`: Agent type (sage, terminus, etc.)
+- `--agent`: Agent type (sage, claude, codex, gemini, etc.)
 - `--model`: LLM model for AI agents
 - `--dataset-config`: YAML file defining task collection
 - `--n-concurrent-trials`: Parallel execution (default: 4)
 - `--no-rebuild`: Skip Docker rebuilds
 - `--cleanup`: Remove Docker resources after run
+- `--plugin-set`: Plugin set names from `experiment_sets/plugin-sets.yaml` (space-separated). Controls skills, MCP servers, and allowed tools. Defaults to `none` (no plugins).
 
 ## Development Workflow
 

diff --git a/README.md b/README.md
@@ -112,7 +112,7 @@ ade run all --db duckdb --project-type dbt --agent claude
 ### 8. Go beyond
 
 - Use the dbt Fusion engine instead of dbt Core with `--project-type dbt-fusion` ([set up Snowflake](#snowflake-setup) first)
-- Enable the dbt MCP server with the `--use-mcp` flag (requires Snowflake, see [MCP](#enabling-the-mcp-server) section)
+- Enable skills, MCP servers, or both with the `--plugin-set` flag (see the [Plugin sets](#plugin-sets) section)
 - [Contribute additional tasks or datasets](/docs/CONTRIBUTING.md)
 
 ---
@@ -159,7 +159,7 @@ ade run \
   --seed \ # Optional; flag for creating solution seed CSVs !! DESTRUCTIVE !! RUN WITH CAUTION !! SEE BELOW !!
   --no-diffs \ # Optional; disables taking snapshots of diffs for faster performance.
   --persist \ # Optional; keeps the container alive after the trial is over or is aborted.
-  --use-mcp \ # Optional; creates an dbt MCP server for the agent. Note: Not all agents and databases are supported.
+  --plugin-set none \ # Optional; plugin set name(s) from experiment_sets/plugin-sets.yaml. Defaults to 'none'.
   --tasks-dir /absolute/path/to/tasks \ # Optional; path to an external tasks directory. Defaults to 'tasks' in the current directory.
 ```
 
@@ -397,14 +397,26 @@ gemini --output-format json --yolo --prompt {task_prompt} --model {model-id}
 
 Configuration files for each agent are found in the `/shared/config` directory. You can use `CLAUDE.md` to configure Claude Code, `AGENTS.md` to configure Codex, and `GEMINI.md` to configure Gemini.
 
-### Enabling the MCP server
+### Plugin sets
 
-If run with the flag `--use-mcp`, ADE-bench will create a dbt MCP server that the agent is allowed to use. The following databases and agents are supported:
+Plugin sets are declarative configurations of skills, MCP servers, and allowed tools that can be applied to benchmark runs. They are defined in `experiment_sets/plugin-sets.yaml`.
 
-- Databases: `snowflake` (duckdb doesn't support multiple simultaneous connections)
-- Agents: `claude`, `codex`, `gemini`
+Use the `--plugin-set` flag to select one or more plugin sets:
 
-Because the server runs locally, it only has access to the [CLI tools](https://github.com/dbt-labs/dbt-mcp#tools). The others are disabled, because they require access to the dbt platform.
+```bash
+ade run all --db duckdb --project-type dbt --agent claude --plugin-set all-dbt-skills
+ade run all --db snowflake --project-type dbt --agent claude --plugin-set dbt-mcp
+ade run all --db duckdb --project-type dbt --agent claude --plugin-set none  # baseline (default)
+```
+
+Available plugin sets include:
+- `none` (default): No skills or MCP servers. Baseline configuration.
+- `all-dbt-skills`: Installs all skills from `dbt-labs/dbt-agent-skills`.
+- `dbt-for-ae`: Installs only the `using-dbt-for-analytics-engineering` skill.
+- `dbt-mcp`: Configures the dbt MCP server (Snowflake only — DuckDB doesn't support multiple simultaneous connections). Only the [CLI tools](https://github.com/dbt-labs/dbt-mcp#tools) are enabled: the server runs locally, and the other tools require access to the dbt platform.
+- `dbt-skills-mcp`: Both dbt skills and the dbt MCP server.
+
+See `experiment_sets/plugin-sets.yaml` for the full configuration of each set.
 
 ### The Sage agent
 

diff --git a/ade_bench/agents/base_agent.py b/ade_bench/agents/base_agent.py
@@ -75,6 +75,21 @@ def format_agent_log(self, log_path: Path) -> str | None:
         """
         return None
 
+    def extract_tools_used(self, log_path: Path) -> list[str] | None:
+        """
+        Extract a deduplicated list of tool names from the agent's log file.
+
+        Subclasses may override this to provide agent-specific tool
+        extraction; this default implementation always returns None.
+
+        Args:
+            log_path: Path to the raw agent log file (unused by the default)
+
+        Returns:
+            Sorted list of unique tool names, or None if not available
+        """
+        return None
+
     @abstractmethod
     def perform_task(
         self,

diff --git a/ade_bench/agents/installed_agents/abstract_installed_agent.py b/ade_bench/agents/installed_agents/abstract_installed_agent.py
@@ -18,19 +18,28 @@
 from ade_bench.agents.agent_name import AgentName
 from ade_bench.agents.base_agent import AgentResult, BaseAgent
 from ade_bench.harness_models import TerminalCommand, FailureMode
+from ade_bench.harness_models import McpServerConfig
 from ade_bench.terminal.tmux_session import TmuxSession
+from ade_bench.terminal.docker_compose_manager import DockerComposeManager
 from ade_bench.utils.logger import log_harness_info, logger
 from ade_bench.config import config
 
 
 class AbstractInstalledAgent(BaseAgent, ABC):
     NAME = AgentName.ABSTRACT_INSTALLED
 
-    def __init__(self, use_mcp: bool = False, model_name: str | None = None, **kwargs):
+    def __init__(
+        self,
+        model_name: str | None = None,
+        allowed_tools: list[str] | None = None,
+        mcp_servers: dict[str, McpServerConfig] | None = None,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         self._variant_config = {}
-        self._use_mcp = use_mcp
         self._model_name = model_name
+        self._allowed_tools = allowed_tools or []  # None is normalised to an empty list
+        self._mcp_servers = mcp_servers or {}  # server name -> McpServerConfig; None -> {}
 
     @property
     @abstractmethod
@@ -61,6 +70,95 @@ def _run_agent_commands(self, task_prompt: str) -> list[TerminalCommand]:
     def _create_env_setup_file(self) -> str:
         return "\n".join([f"export {key}='{value}'" for key, value in self._env.items()])
 
+    def _get_dbt_dynamic_env(self, session: TmuxSession, task_name: str | None) -> dict[str, str]:
+        """Collect the dbt MCP server env vars that are resolved inside the container.
+
+        Always sets DBT_PROJECT_DIR (the container app dir) and DISABLE_DBT_CLI="false";
+        DBT_PATH is added only when `which dbt` succeeds with non-empty output.
+        """
+        app_dir = str(DockerComposeManager.CONTAINER_APP_DIR)
+        discovered = {"DBT_PROJECT_DIR": app_dir}
+
+        # Ask the container where dbt lives; a non-zero exit is treated as "not found".
+        lookup = session.container.exec_run(["sh", "-c", "which dbt"], workdir=app_dir)
+        if lookup.exit_code != 0:
+            logger.warning("[MCP] dbt not found in PATH, MCP server may not work correctly")
+        else:
+            located = lookup.output.decode("utf-8").strip()
+            if located:
+                discovered["DBT_PATH"] = located
+                log_harness_info(logger, task_name, "agent", f"Found dbt at: {located}")
+
+        # Enable the dbt CLI in dbt-mcp (the variable is a negative: "false" = enabled)
+        discovered["DISABLE_DBT_CLI"] = "false"
+
+        return discovered
+
+    def _configure_mcp_servers(self, session: TmuxSession, task_name: str | None) -> None:
+        """Register each configured MCP server with the agent CLI inside the container.
+
+        For every entry in ``self._mcp_servers`` the server's env vars (if any) are
+        written to ``/tmp/<server_name>.env`` and the server is added with
+        ``<agent> mcp add <server_name> -- <command> [--env-file <file>] <args>``.
+        Failures are logged as warnings, never raised. A server whose env file could
+        not be written is skipped rather than registered against a missing file.
+
+        Args:
+            session: Session whose container the commands are executed in.
+            task_name: Task identifier passed to the harness log helper (may be None).
+        """
+        agent_cli = self.NAME.value  # e.g., "claude", "gemini"
+        app_dir = str(DockerComposeManager.CONTAINER_APP_DIR)
+
+        for server_name, mcp_config in self._mcp_servers.items():
+            log_harness_info(
+                logger, task_name, "agent", f"Configuring MCP server '{server_name}'..."
+            )
+
+            # Start with static env vars from config
+            env_vars = dict(mcp_config.env)
+
+            # The dbt MCP server (matched by name, or by "dbt-mcp" in its args) also
+            # needs env vars discovered in the container; static config wins on clashes.
+            if server_name == "dbt" or any("dbt-mcp" in arg for arg in mcp_config.args):
+                for key, value in self._get_dbt_dynamic_env(session, task_name).items():
+                    env_vars.setdefault(key, value)
+
+            # NOTE(review): server name, command and args are interpolated into `sh -c`
+            # unquoted, so the plugin-set configuration is assumed to be trusted.
+            launch_cmd = mcp_config.command
+            if env_vars:
+                env_file_path = f"/tmp/{server_name}.env"
+                env_content = "\n".join(f"{k}={v}" for k, v in env_vars.items())
+                write_cmd = f"cat > {env_file_path} << 'ENVEOF'\n{env_content}\nENVEOF"
+
+                result = session.container.exec_run(["sh", "-c", write_cmd], workdir=app_dir)
+                if result.exit_code != 0:
+                    logger.warning(
+                        f"[MCP] Failed to write env file: {result.output.decode('utf-8')}; "
+                        f"skipping registration of {server_name}"
+                    )
+                    continue
+                log_harness_info(
+                    logger, task_name, "agent", f"Wrote env file with vars: {list(env_vars)}"
+                )
+                launch_cmd = f"{launch_cmd} --env-file {env_file_path}"
+
+            # Build and run the mcp add command
+            args_str = " ".join(mcp_config.args)
+            mcp_cmd = f"{agent_cli} mcp add {server_name} -- {launch_cmd} {args_str}"
+            result = session.container.exec_run(["sh", "-c", mcp_cmd], workdir=app_dir)
+
+            if result.exit_code != 0:
+                logger.warning(
+                    f"[MCP] Server registration failed for {server_name}: "
+                    f"{result.output.decode('utf-8')}"
+                )
+            else:
+                log_harness_info(
+                    logger, task_name, "agent", f"MCP server '{server_name}' configured"
+                )
+
     def perform_task(
         self,
         task_prompt: str,
@@ -104,32 +202,9 @@ def perform_task(
                 max_timeout_sec=config.setup_timeout_sec,  # Use setup timeout for installation
             )
 
-            # Optionally setup dbt MCP server
-            if self._use_mcp:
-                dbt_mcp_script = (
-                    Path(__file__).parent.parent.parent.parent
-                    / "shared"
-                    / "scripts"
-                    / "setup-dbt-mcp.sh"
-                )
-                session.copy_to_container(
-                    dbt_mcp_script,
-                    container_dir="/scripts",
-                    container_filename="setup-dbt-mcp.sh",
-                )
-
-                # Pass db_type, project_type, and agent name
-                db_type = self._variant_config.get("db_type", "unknown")
-                project_type = self._variant_config.get("project_type", "unknown")
-                agent_name = self.NAME.value if hasattr(self.NAME, "value") else str(self.NAME)
-                session.send_keys(
-                    [
-                        f"bash /scripts/setup-dbt-mcp.sh {db_type} {project_type} {agent_name}",
-                        "Enter",
-                    ],
-                    block=True,
-                    max_timeout_sec=config.setup_timeout_sec,
-                )
+            # Configure MCP servers after agent is installed
+            if self._mcp_servers:
+                self._configure_mcp_servers(session, task_name)
         except TimeoutError:
             log_harness_info(
                 logger,

diff --git a/ade_bench/agents/installed_agents/claude_code/claude_code_agent.py b/ade_bench/agents/installed_agents/claude_code/claude_code_agent.py
@@ -15,7 +15,6 @@
 
 class ClaudeCodeAgent(AbstractInstalledAgent):
     NAME = AgentName.CLAUDE_CODE
-    ALLOWED_TOOLS = ["Bash", "Edit", "Write", "NotebookEdit", "WebFetch", "mcp__dbt"]
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -40,7 +39,8 @@ def _run_agent_commands(self, task_prompt: str) -> list[TerminalCommand]:
         if self._model_name:
             command += f" --model {self._model_name}"
 
-        command += f" --allowedTools {' '.join(self.ALLOWED_TOOLS)}"
+        if self._allowed_tools:
+            command += f" --allowedTools {' '.join(self._allowed_tools)}"
 
         return [
             TerminalCommand(
@@ -62,10 +62,62 @@ def format_agent_log(self, log_path: Path) -> str | None:
         """
         Format the Claude Code agent's log file into a human-readable string.
 
+        Also generates an HTML transcript at log_path.parent / "transcript.html"
+        using claude-code-transcripts if available.
+
+
         Args:
             log_path: Path to the raw agent.log file (JSON-lines format)
 
         Returns:
             Formatted log content as a string, or None if formatting failed
         """
+        # Generate HTML transcript as a single well-known file
+        transcript_path = log_path.parent / "transcript.html"
+        self._log_formatter.generate_html_transcript(log_path, transcript_path)
+
+        # Return text-formatted log
         return self._log_formatter.format_log(log_path)
+
+    # Generic tool names that extract_tools_used() leaves out of its tools_used report
+    _GENERIC_TOOLS = frozenset(
+        {
+            "Bash",
+            "Edit",
+            "Glob",
+            "Grep",
+            "Read",
+            "Write",
+            "WebFetch",
+            "WebSearch",
+            "Task",
+            "NotebookEdit",
+            "TodoRead",
+            "TodoWrite",
+        }
+    )
+
+    def extract_tools_used(self, log_path: Path) -> list[str] | None:
+        """
+        Extract deduplicated tool names from Claude Code agent logs.
+
+        Generic tools (Bash, Edit, Glob, etc.) are dropped and Skill calls are
+        reported as "skill:<name>". Tool entries without a usable name are
+        skipped. Returns None if no tools were found or the log can't be read.
+        """
+        try:
+            tool_names: set[str] = set()
+            for turn in self._log_formatter.parse_log_file(log_path):
+                for tool in turn.get("tools") or []:
+                    name = tool.get("name")
+                    if name == "Skill":
+                        # Expand Skill tool to actual skill name
+                        skill_name = (tool.get("input") or {}).get("skill")
+                        name = f"skill:{skill_name}" if skill_name else None
+                    # Skip nameless entries and filter out generic tools
+                    if name and name not in self._GENERIC_TOOLS:
+                        tool_names.add(name)
+            return sorted(tool_names) if tool_names else None
+        except Exception:
+            # Best-effort: any remaining parsing problem yields None, not an error
+            return None