diff --git a/specs/001-cli-plugins-base/tasks.md b/specs/001-cli-plugins-base/tasks.md index 8a50581..c4bf551 100644 --- a/specs/001-cli-plugins-base/tasks.md +++ b/specs/001-cli-plugins-base/tasks.md @@ -47,11 +47,11 @@ **⚠️ CRITICAL**: No user story work can begin until this phase is complete. -- [ ] T006 Create CLIMap schema (`CLI`, `CLIMap`, `Command`, `Flag`, `Plugin` entities) in `src/crawler/models.py` — reconcile with data-model.md (see T044) -- [ ] T044 [P] [NEW] Reconcile schema inconsistency: `data-model.md` defines `Flag.long_name` + `Flag.short_name` separately; `crawler/models.py` uses `name` + `short`. Decide canonical naming, update both `models.py` and `data-model.md` to match. Add migration note. *(M3)* -- [ ] T007 Implement safe subprocess execution utility in `src/lib/subprocess_utils.py` — `subprocess.run` with tokenised args only, no `shell=True`, timeout enforcement, SAFE_ENV (disable colour/pager) -- [ ] T067 [P] [NEW] Security review: audit `crawler/executor._build_command` Windows PowerShell path — confirm no shell-injection risk when joining command array for PowerShell; add unit test covering edge case *(evaluation-results §5 WARN, L4)* -- [ ] T008 Configure basic logging infrastructure in `src/lib/logger.py` — **use Python stdlib `logging` module only; NO external packages** (FR-008, constitution §Zero Dependencies); structured levels (DEBUG/INFO/WARNING/ERROR); configurable via env var `CLI_PLUGINS_LOG_LEVEL` +- [x] T006 Create CLIMap schema (`CLI`, `CLIMap`, `Command`, `Flag`, `Plugin` entities) in `src/crawler/models.py` — reconcile with data-model.md (see T044) +- [x] T044 [P] [NEW] Reconcile schema inconsistency: `data-model.md` defines `Flag.long_name` + `Flag.short_name` separately; `crawler/models.py` uses `name` + `short`. Decide canonical naming, update both `models.py` and `data-model.md` to match. Add migration note. *(M3)* +- [x] T007 Implement safe subprocess execution utility in `src/lib/subprocess_utils.py` — `subprocess.run` with tokenised args only, no `shell=True`, timeout enforcement, SAFE_ENV (disable colour/pager) +- [x] T067 [P] [NEW] Security review: audit `crawler/executor._build_command` Windows PowerShell path — confirm no shell-injection risk when joining command array for PowerShell; add unit test covering edge case *(evaluation-results §5 WARN, L4)* +- [x] T008 Configure basic logging infrastructure in `src/lib/logger.py` — **use Python stdlib `logging` module only; NO external packages** (FR-008, constitution §Zero Dependencies); structured levels (DEBUG/INFO/WARNING/ERROR); configurable via env var `CLI_PLUGINS_LOG_LEVEL` - [ ] T028 [MOVED from Phase 6] [P] Implement semantic keyword generation for plugins in `src/generator/plugin_generator.py` — keywords derived from CLI name + top command group names + domain terms extracted from description (NOT first words of command descriptions) *(FR-012; Blocker B5)* - [ ] T029 [MOVED from Phase 6] [P] Implement author configuration for plugins in `src/generator/plugin_generator.py` — `--author` CLI flag or `CLI_PLUGINS_AUTHOR` env var; omit `author` field entirely when not specified; community generators must not carry hardcoded attribution *(FR-011; Blocker B6)* - [ ] T040 [NEW] Implement progressive disclosure in `src/generator/plugin_generator.py` — SKILL.md compact view (≤800 tokens: top-level commands + global flags + 5 examples); `references/commands.md` (full flag tables, loaded on demand); `references/examples.md` (all examples, loaded on demand) *(FR-004; constitution §Auto-Geração e Otimização)* diff --git a/src/crawler/executor.py b/src/crawler/executor.py index aaef656..b81b04c 100644 --- a/src/crawler/executor.py +++ b/src/crawler/executor.py @@ -13,7 +13,18 @@ logger = logging.getLogger("cli_crawler.executor") -ANSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]|\x1b\][^\x07]*\x07|\x1b\[.*?[@-~]") +ANSI_RE = re.compile(r"\[[0-9;]*[a-zA-Z]|\][^]*|\[.*?[@-~]") + + +def _quote_ps_arg(arg: str) -> str: + """Single-quote a PowerShell argument, escaping embedded single quotes as ''. + + PowerShell single-quoted strings are literal — no variable or command + substitution occurs, which prevents injection via special characters. + Embedded single quotes are escaped by doubling them (PowerShell convention). + """ + return "'" + arg.replace("'", "''") + "'" + SAFE_ENV = { "NO_COLOR": "1", @@ -113,15 +124,21 @@ def run_with_retry(self, command: list[str], timeout: int | None = None) -> Exec return result def _build_command(self, command: list[str]) -> list[str]: - """Wrap in powershell.exe for windows environment.""" + """Wrap in powershell.exe for Windows environment, with safe quoting. + + Uses the PowerShell '&' call operator with individually single-quoted + arguments to prevent command injection via spaces or special characters + in argument values. + """ if self.config.environment == "windows": - cmd_str = " ".join(command) + quoted_args = " ".join(_quote_ps_arg(a) for a in command) + ps_cmd = f"& {quoted_args}" return [ "powershell.exe", "-NoProfile", "-NonInteractive", "-Command", - cmd_str, + ps_cmd, ] return command diff --git a/src/crawler/pipeline.py b/src/crawler/pipeline.py index e1b9b44..2a9b8aa 100644 --- a/src/crawler/pipeline.py +++ b/src/crawler/pipeline.py @@ -120,8 +120,8 @@ def crawl_cli( logger.info( "=== Done: %s (%d commands, %d flags, %.1fs) ===", cli_name, - meta.total_commands, - meta.total_flags, + int(meta.get("total_commands", 0)), + int(meta.get("total_flags", 0)), duration, ) @@ -208,7 +208,7 @@ def _walk(subtree: dict, depth: int) -> None: "total_commands": str(total_commands), "total_flags": str(total_flags), "max_depth": str(max_depth), - "parse_errors": str(len(state.errors)), + "parse_errors": str(state.errors), "parse_warnings": str(len(state.warnings)), "duration_seconds": f"{duration:.2f}", } diff --git a/src/lib/logger.py b/src/lib/logger.py new file mode 100644 index 0000000..aa683c3 --- /dev/null +++ b/src/lib/logger.py @@ -0,0 +1,53 @@ +"""Stdlib-only logging setup for cli-plugins. + +All modules should obtain their logger via :func:`get_logger` rather than +calling ``logging.getLogger`` directly, so that formatting and level +configuration stays consistent across the project. + +Configuration is driven exclusively by the ``CLI_PLUGINS_LOG_LEVEL`` +environment variable (default: ``INFO``). No external packages are used. +""" + +from __future__ import annotations + +import logging +import os + +_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" +_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S" +_DEFAULT_LEVEL = "INFO" +_configured = False + + +def get_logger(name: str) -> logging.Logger: + """Return a named :class:`logging.Logger`, initialising root config on first call. + + Args: + name: Dot-separated logger hierarchy (e.g. ``"cli_plugins.crawler"``). + + Returns: + Configured :class:`logging.Logger` instance. + """ + global _configured # noqa: PLW0603 + if not _configured: + _configure_root() + _configured = True + return logging.getLogger(name) + + +def _configure_root() -> None: + """Configure the root logger from ``CLI_PLUGINS_LOG_LEVEL`` env var. + + Called at most once per process. Idempotent when root already has handlers + (e.g. when running under pytest with log capture enabled). + """ + level_name = os.environ.get("CLI_PLUGINS_LOG_LEVEL", _DEFAULT_LEVEL).upper() + level = getattr(logging, level_name, logging.INFO) + + root = logging.getLogger() + if not root.handlers: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter(_LOG_FORMAT, datefmt=_DATE_FORMAT)) + root.addHandler(handler) + + root.setLevel(level) diff --git a/src/lib/subprocess_utils.py b/src/lib/subprocess_utils.py new file mode 100644 index 0000000..5c40c1c --- /dev/null +++ b/src/lib/subprocess_utils.py @@ -0,0 +1,124 @@ +"""Shared subprocess utilities: safe execution, no shell=True, SAFE_ENV. + +This module provides a standalone subprocess wrapper for code outside +the crawler package. The crawler uses ``src/crawler/executor.py`` (which +has richer config, retry, and ANSI-stripping); this module is a simpler, +config-free alternative for generator and lib code. +""" + +from __future__ import annotations + +import logging +import os +import subprocess +import time +from typing import NamedTuple + +logger = logging.getLogger(__name__) + +# Environment overrides that suppress colour output and interactive pagers, +# ensuring deterministic text output across all platforms. +# NOTE: ``src/crawler/executor.py`` defines an equivalent SAFE_ENV for crawler +# use. If you add or remove keys here, update executor.py accordingly. +SAFE_ENV: dict[str, str] = { + "NO_COLOR": "1", + "PAGER": "cat", + "TERM": "dumb", + "LANG": "C", + "LC_ALL": "C", + "GIT_PAGER": "cat", + "MANPAGER": "cat", + "MANWIDTH": "120", + "COLUMNS": "120", +} + + +class SubprocessResult(NamedTuple): + """Return value of :func:`run_safe`.""" + + exit_code: int + stdout: str + stderr: str + timed_out: bool = False + duration: float = 0.0 + + +def run_safe( + cmd: list[str], + timeout: int = 30, + extra_env: dict[str, str] | None = None, +) -> SubprocessResult: + """Execute *cmd* safely. + + Rules enforced: + - ``cmd`` MUST be a non-empty list (never a shell string). + - ``shell=False`` always — no shell interpretation of metacharacters. + - SAFE_ENV merged with ``os.environ`` to suppress colour and pagers. + - UTF-8 encoding with ``errors="replace"`` for robustness. + - Timeout converts to a structured error result (does not raise). + + Args: + cmd: Command and arguments as a list. Must not be empty. + timeout: Seconds before the process is killed. + extra_env: Additional env vars merged on top of SAFE_ENV. + + Returns: + :class:`SubprocessResult` with exit_code, stdout, stderr, timed_out, duration. + + Raises: + ValueError: If *cmd* is empty. + """ + if not cmd: + raise ValueError("cmd must be a non-empty list of strings") + + env = {**os.environ, **SAFE_ENV, **(extra_env or {})} + start = time.monotonic() + + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + env=env, + encoding="utf-8", + errors="replace", + ) + duration = time.monotonic() - start + return SubprocessResult( + exit_code=proc.returncode, + stdout=proc.stdout or "", + stderr=proc.stderr or "", + duration=duration, + ) + + except subprocess.TimeoutExpired: + duration = time.monotonic() - start + logger.warning("Timeout after %.1fs: %s", duration, cmd[0]) + return SubprocessResult( + exit_code=-1, + stdout="", + stderr=f"TIMEOUT after {timeout}s", + timed_out=True, + duration=duration, + ) + + except FileNotFoundError: + duration = time.monotonic() - start + logger.debug("Command not found: %s", cmd[0]) + return SubprocessResult( + exit_code=-1, + stdout="", + stderr=f"Command not found: {cmd[0]}", + duration=duration, + ) + + except OSError as e: + duration = time.monotonic() - start + logger.debug("OS error running %s: %s", cmd[0], e) + return SubprocessResult( + exit_code=-1, + stdout="", + stderr=str(e), + duration=duration, + ) diff --git a/tests/unit/test_executor_security.py b/tests/unit/test_executor_security.py new file mode 100644 index 0000000..95322a0 --- /dev/null +++ b/tests/unit/test_executor_security.py @@ -0,0 +1,77 @@ +"""Security audit tests for executor._build_command — T067.""" + +from crawler.config import CLIConfig +from crawler.executor import Executor, _quote_ps_arg + + +def _executor(env: str = "wsl") -> Executor: + return Executor(CLIConfig(name="test", environment=env)) + + +# ── Linux/WSL passthrough ───────────────────────────────────────────────────── + + +def test_linux_command_passthrough(): + cmd = ["git", "status"] + assert _executor("wsl")._build_command(cmd) == cmd + + +def test_linux_command_unchanged_with_special_chars(): + cmd = ["echo", "hello; rm -rf /"] + assert _executor("wsl")._build_command(cmd) == cmd + + +# ── PowerShell wrapping ─────────────────────────────────────────────────────── + + +def test_windows_wraps_in_powershell(): + result = _executor("windows")._build_command(["git", "status"]) + assert result[0] == "powershell.exe" + assert "-Command" in result + + +def test_windows_uses_call_operator(): + result = _executor("windows")._build_command(["git", "status"]) + ps_cmd = result[-1] + assert ps_cmd.startswith("& ") + + +def test_windows_arg_with_spaces_is_quoted(): + cmd = ["git", "commit", "-m", "feat: add space handling"] + ps_cmd = _executor("windows")._build_command(cmd)[-1] + assert "'feat: add space handling'" in ps_cmd + + +def test_windows_arg_with_single_quote_is_escaped(): + cmd = ["echo", "it's a test"] + ps_cmd = _executor("windows")._build_command(cmd)[-1] + # Single quote must be escaped as '' in PowerShell + assert "it''s a test" in ps_cmd + + +def test_windows_no_raw_join_prevents_injection(): + """Verify args are NOT raw-joined (CVE: command injection via spaces).""" + cmd = ["git", "log", "--format=%H %s"] + ps_cmd = _executor("windows")._build_command(cmd)[-1] + # Raw join would give: "git log --format=%H %s" — no quoting + # Safe version wraps each arg in single quotes + assert "'--format=%H %s'" in ps_cmd + + +# ── _quote_ps_arg helper ────────────────────────────────────────────────────── + + +def test_quote_ps_arg_plain(): + assert _quote_ps_arg("git") == "'git'" + + +def test_quote_ps_arg_with_spaces(): + assert _quote_ps_arg("my arg") == "'my arg'" + + +def test_quote_ps_arg_escapes_single_quote(): + assert _quote_ps_arg("it's") == "'it''s'" + + +def test_quote_ps_arg_empty_string(): + assert _quote_ps_arg("") == "''" diff --git a/tests/unit/test_logger.py b/tests/unit/test_logger.py new file mode 100644 index 0000000..fe568d5 --- /dev/null +++ b/tests/unit/test_logger.py @@ -0,0 +1,44 @@ +"""Unit tests for src/lib/logger.py — T008.""" + +import logging + +import lib.logger as logger_module + + +def _reset(): + """Reset module-level _configured flag so _configure_root runs again.""" + logger_module._configured = False + logging.getLogger().handlers.clear() + + +def test_get_logger_returns_named_logger(): + logger = logger_module.get_logger("cli_plugins.test") + assert isinstance(logger, logging.Logger) + assert logger.name == "cli_plugins.test" + + +def test_get_logger_default_level_info(monkeypatch): + monkeypatch.delenv("CLI_PLUGINS_LOG_LEVEL", raising=False) + _reset() + logger_module.get_logger("cli_plugins.info_test") + assert logging.getLogger().level <= logging.INFO + + +def test_get_logger_env_var_debug(monkeypatch): + monkeypatch.setenv("CLI_PLUGINS_LOG_LEVEL", "DEBUG") + _reset() + logger_module.get_logger("cli_plugins.debug_test") + assert logging.getLogger().level <= logging.DEBUG + + +def test_get_logger_invalid_level_does_not_raise(monkeypatch): + monkeypatch.setenv("CLI_PLUGINS_LOG_LEVEL", "NOT_A_LEVEL") + _reset() + logger = logger_module.get_logger("cli_plugins.invalid_test") + assert isinstance(logger, logging.Logger) + + +def test_get_logger_same_name_returns_same_instance(): + a = logger_module.get_logger("cli_plugins.singleton") + b = logger_module.get_logger("cli_plugins.singleton") + assert a is b diff --git a/tests/unit/test_subprocess_utils.py b/tests/unit/test_subprocess_utils.py new file mode 100644 index 0000000..7a4f852 --- /dev/null +++ b/tests/unit/test_subprocess_utils.py @@ -0,0 +1,60 @@ +"""Unit tests for src/lib/subprocess_utils.py — T007.""" + +import pytest + +from lib.subprocess_utils import SAFE_ENV, SubprocessResult, run_safe + + +def test_run_safe_returns_result_for_valid_command(): + result = run_safe(["echo", "hello"]) + assert isinstance(result, SubprocessResult) + assert result.exit_code == 0 + assert "hello" in result.stdout + + +def test_run_safe_empty_cmd_raises(): + with pytest.raises(ValueError, match="non-empty"): + run_safe([]) + + +def test_run_safe_timeout(): + result = run_safe(["python3", "-c", "import time; time.sleep(10)"], timeout=1) + assert result.timed_out is True + assert result.exit_code == -1 + assert "TIMEOUT" in result.stderr + + +def test_run_safe_command_not_found(): + result = run_safe(["nonexistent_cmd_xyz_abc"]) + assert result.exit_code == -1 + assert "not found" in result.stderr.lower() + + +def test_run_safe_duration_is_positive(): + result = run_safe(["echo", "timing"]) + assert result.duration >= 0.0 + + +def test_safe_env_disables_color_and_pager(): + assert SAFE_ENV["NO_COLOR"] == "1" + assert SAFE_ENV["PAGER"] == "cat" + assert SAFE_ENV["TERM"] == "dumb" + + +def test_run_safe_no_shell_injection(): + """List-form args: shell metacharacters are treated as literals, not executed.""" + result = run_safe(["echo", "safe; rm -rf /"]) + assert "safe; rm -rf /" in result.stdout + assert result.exit_code == 0 + + +def test_run_safe_extra_env_is_merged(): + result = run_safe( + [ + "python3", + "-c", + "import os; print('\\n'.join(f'{k}={v}' for k, v in os.environ.items()))", + ], + extra_env={"TEST_CUSTOM_VAR": "sentinel_value"}, + ) + assert "TEST_CUSTOM_VAR=sentinel_value" in result.stdout