Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions specs/001-cli-plugins-base/tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@

**⚠️ CRITICAL**: No user story work can begin until this phase is complete.

- [ ] T006 Create CLIMap schema (`CLI`, `CLIMap`, `Command`, `Flag`, `Plugin` entities) in `src/crawler/models.py` — reconcile with data-model.md (see T044)
- [ ] T044 [P] [NEW] Reconcile schema inconsistency: `data-model.md` defines `Flag.long_name` + `Flag.short_name` separately; `crawler/models.py` uses `name` + `short`. Decide canonical naming, update both `models.py` and `data-model.md` to match. Add migration note. *(M3)*
- [ ] T007 Implement safe subprocess execution utility in `src/lib/subprocess_utils.py` — `subprocess.run` with tokenised args only, no `shell=True`, timeout enforcement, SAFE_ENV (disable colour/pager)
- [ ] T067 [P] [NEW] Security review: audit `crawler/executor._build_command` Windows PowerShell path — confirm no shell-injection risk when joining command array for PowerShell; add unit test covering edge case *(evaluation-results §5 WARN, L4)*
- [ ] T008 Configure basic logging infrastructure in `src/lib/logger.py` — **use Python stdlib `logging` module only; NO external packages** (FR-008, constitution §Zero Dependencies); structured levels (DEBUG/INFO/WARNING/ERROR); configurable via env var `CLI_PLUGINS_LOG_LEVEL`
- [x] T006 Create CLIMap schema (`CLI`, `CLIMap`, `Command`, `Flag`, `Plugin` entities) in `src/crawler/models.py` — reconcile with data-model.md (see T044)
- [x] T044 [P] [NEW] Reconcile schema inconsistency: `data-model.md` defines `Flag.long_name` + `Flag.short_name` separately; `crawler/models.py` uses `name` + `short`. Decide canonical naming, update both `models.py` and `data-model.md` to match. Add migration note. *(M3)*
- [x] T007 Implement safe subprocess execution utility in `src/lib/subprocess_utils.py` — `subprocess.run` with tokenised args only, no `shell=True`, timeout enforcement, SAFE_ENV (disable colour/pager)
- [x] T067 [P] [NEW] Security review: audit `crawler/executor._build_command` Windows PowerShell path — confirm no shell-injection risk when joining command array for PowerShell; add unit test covering edge case *(evaluation-results §5 WARN, L4)*
- [x] T008 Configure basic logging infrastructure in `src/lib/logger.py` — **use Python stdlib `logging` module only; NO external packages** (FR-008, constitution §Zero Dependencies); structured levels (DEBUG/INFO/WARNING/ERROR); configurable via env var `CLI_PLUGINS_LOG_LEVEL`
- [ ] T028 [MOVED from Phase 6] [P] Implement semantic keyword generation for plugins in `src/generator/plugin_generator.py` — keywords derived from CLI name + top command group names + domain terms extracted from description (NOT first words of command descriptions) *(FR-012; Blocker B5)*
- [ ] T029 [MOVED from Phase 6] [P] Implement author configuration for plugins in `src/generator/plugin_generator.py` — `--author` CLI flag or `CLI_PLUGINS_AUTHOR` env var; omit `author` field entirely when not specified; community generators must not carry hardcoded attribution *(FR-011; Blocker B6)*
- [ ] T040 [NEW] Implement progressive disclosure in `src/generator/plugin_generator.py` — SKILL.md compact view (≤800 tokens: top-level commands + global flags + 5 examples); `references/commands.md` (full flag tables, loaded on demand); `references/examples.md` (all examples, loaded on demand) *(FR-004; constitution §Auto-Geração e Otimização)*
Expand Down
25 changes: 21 additions & 4 deletions src/crawler/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,18 @@

logger = logging.getLogger("cli_crawler.executor")

ANSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]|\x1b\][^\x07]*\x07|\x1b\[.*?[@-~]")
ANSI_RE = re.compile(r"\[[0-9;]*[a-zA-Z]|\][^]*|\[.*?[@-~]")


def _quote_ps_arg(arg: str) -> str:
"""Single-quote a PowerShell argument, escaping embedded single quotes as ''.

PowerShell single-quoted strings are literal — no variable or command
substitution occurs, which prevents injection via special characters.
Embedded single quotes are escaped by doubling them (PowerShell convention).
"""
return "'" + arg.replace("'", "''") + "'"


SAFE_ENV = {
"NO_COLOR": "1",
Expand Down Expand Up @@ -113,15 +124,21 @@ def run_with_retry(self, command: list[str], timeout: int | None = None) -> Exec
return result

def _build_command(self, command: list[str]) -> list[str]:
"""Wrap in powershell.exe for windows environment."""
"""Wrap in powershell.exe for Windows environment, with safe quoting.

Uses the PowerShell '&' call operator with individually single-quoted
arguments to prevent command injection via spaces or special characters
in argument values.
"""
if self.config.environment == "windows":
cmd_str = " ".join(command)
quoted_args = " ".join(_quote_ps_arg(a) for a in command)
ps_cmd = f"& {quoted_args}"
return [
"powershell.exe",
"-NoProfile",
"-NonInteractive",
"-Command",
cmd_str,
ps_cmd,
]
return command

Expand Down
6 changes: 3 additions & 3 deletions src/crawler/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ def crawl_cli(
logger.info(
"=== Done: %s (%d commands, %d flags, %.1fs) ===",
cli_name,
meta.total_commands,
meta.total_flags,
int(meta.get("total_commands", 0)),
int(meta.get("total_flags", 0)),
duration,
)

Expand Down Expand Up @@ -208,7 +208,7 @@ def _walk(subtree: dict, depth: int) -> None:
"total_commands": str(total_commands),
"total_flags": str(total_flags),
"max_depth": str(max_depth),
"parse_errors": str(len(state.errors)),
"parse_errors": str(state.errors),
"parse_warnings": str(len(state.warnings)),
"duration_seconds": f"{duration:.2f}",
}
Expand Down
53 changes: 53 additions & 0 deletions src/lib/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Stdlib-only logging setup for cli-plugins.

All modules should obtain their logger via :func:`get_logger` rather than
calling ``logging.getLogger`` directly, so that formatting and level
configuration stays consistent across the project.

Configuration is driven exclusively by the ``CLI_PLUGINS_LOG_LEVEL``
environment variable (default: ``INFO``). No external packages are used.
"""

from __future__ import annotations

import logging
import os

_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
_DEFAULT_LEVEL = "INFO"
_configured = False


def get_logger(name: str) -> logging.Logger:
"""Return a named :class:`logging.Logger`, initialising root config on first call.

Args:
name: Dot-separated logger hierarchy (e.g. ``"cli_plugins.crawler"``).

Returns:
Configured :class:`logging.Logger` instance.
"""
global _configured # noqa: PLW0603
if not _configured:
_configure_root()
_configured = True
return logging.getLogger(name)


def _configure_root() -> None:
"""Configure the root logger from ``CLI_PLUGINS_LOG_LEVEL`` env var.

Called at most once per process. Idempotent when root already has handlers
(e.g. when running under pytest with log capture enabled).
"""
level_name = os.environ.get("CLI_PLUGINS_LOG_LEVEL", _DEFAULT_LEVEL).upper()
level = getattr(logging, level_name, logging.INFO)

root = logging.getLogger()
if not root.handlers:
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_LOG_FORMAT, datefmt=_DATE_FORMAT))
root.addHandler(handler)

root.setLevel(level)
122 changes: 122 additions & 0 deletions src/lib/subprocess_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Shared subprocess utilities: safe execution, no shell=True, SAFE_ENV.

This module provides a standalone subprocess wrapper for code outside
the crawler package. The crawler uses ``src/crawler/executor.py`` (which
has richer config, retry, and ANSI-stripping); this module is a simpler,
config-free alternative for generator and lib code.
"""

from __future__ import annotations

import logging
import os
import subprocess
import time
from typing import NamedTuple

logger = logging.getLogger(__name__)

# Environment overrides that suppress colour output and interactive pagers,
# ensuring deterministic text output across all platforms.
SAFE_ENV: dict[str, str] = {
"NO_COLOR": "1",
"PAGER": "cat",
"TERM": "dumb",
"LANG": "C",
"LC_ALL": "C",
"GIT_PAGER": "cat",
"MANPAGER": "cat",
"MANWIDTH": "120",
"COLUMNS": "120",
}


class SubprocessResult(NamedTuple):
"""Return value of :func:`run_safe`."""

exit_code: int
stdout: str
stderr: str
timed_out: bool = False
duration: float = 0.0


def run_safe(
cmd: list[str],
timeout: int = 30,
extra_env: dict[str, str] | None = None,
) -> SubprocessResult:
"""Execute *cmd* safely.

Rules enforced:
- ``cmd`` MUST be a non-empty list (never a shell string).
- ``shell=False`` always — no shell interpretation of metacharacters.
- SAFE_ENV merged with ``os.environ`` to suppress colour and pagers.
- UTF-8 encoding with ``errors="replace"`` for robustness.
- Timeout converts to a structured error result (does not raise).

Args:
cmd: Command and arguments as a list. Must not be empty.
timeout: Seconds before the process is killed.
extra_env: Additional env vars merged on top of SAFE_ENV.

Returns:
:class:`SubprocessResult` with exit_code, stdout, stderr, timed_out, duration.

Raises:
ValueError: If *cmd* is empty.
"""
if not cmd:
raise ValueError("cmd must be a non-empty list of strings")

env = {**os.environ, **SAFE_ENV, **(extra_env or {})}
start = time.monotonic()

try:
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout,
env=env,
encoding="utf-8",
errors="replace",
)
duration = time.monotonic() - start
return SubprocessResult(
exit_code=proc.returncode,
stdout=proc.stdout or "",
stderr=proc.stderr or "",
duration=duration,
)

except subprocess.TimeoutExpired:
duration = time.monotonic() - start
logger.warning("Timeout after %.1fs: %s", duration, cmd[0])
return SubprocessResult(
exit_code=-1,
stdout="",
stderr=f"TIMEOUT after {timeout}s",
timed_out=True,
duration=duration,
)

except FileNotFoundError:
duration = time.monotonic() - start
logger.debug("Command not found: %s", cmd[0])
return SubprocessResult(
exit_code=-1,
stdout="",
stderr=f"Command not found: {cmd[0]}",
duration=duration,
)

except OSError as e:
duration = time.monotonic() - start
logger.debug("OS error running %s: %s", cmd[0], e)
return SubprocessResult(
exit_code=-1,
stdout="",
stderr=str(e),
duration=duration,
)
74 changes: 74 additions & 0 deletions tests/unit/test_executor_security.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Security audit tests for executor._build_command — T067."""

from crawler.config import CLIConfig
from crawler.executor import Executor, _quote_ps_arg


def _executor(env: str = "wsl") -> Executor:
return Executor(CLIConfig(name="test", environment=env))


# ── Linux/WSL passthrough ─────────────────────────────────────────────────────

def test_linux_command_passthrough():
cmd = ["git", "status"]
assert _executor("wsl")._build_command(cmd) == cmd


def test_linux_command_unchanged_with_special_chars():
cmd = ["echo", "hello; rm -rf /"]
assert _executor("wsl")._build_command(cmd) == cmd


# ── PowerShell wrapping ───────────────────────────────────────────────────────

def test_windows_wraps_in_powershell():
result = _executor("windows")._build_command(["git", "status"])
assert result[0] == "powershell.exe"
assert "-Command" in result


def test_windows_uses_call_operator():
result = _executor("windows")._build_command(["git", "status"])
ps_cmd = result[-1]
assert ps_cmd.startswith("& ")


def test_windows_arg_with_spaces_is_quoted():
cmd = ["git", "commit", "-m", "feat: add space handling"]
ps_cmd = _executor("windows")._build_command(cmd)[-1]
assert "'feat: add space handling'" in ps_cmd


def test_windows_arg_with_single_quote_is_escaped():
cmd = ["echo", "it's a test"]
ps_cmd = _executor("windows")._build_command(cmd)[-1]
# Single quote must be escaped as '' in PowerShell
assert "it''s a test" in ps_cmd


def test_windows_no_raw_join_prevents_injection():
"""Verify args are NOT raw-joined (CVE: command injection via spaces)."""
cmd = ["git", "log", "--format=%H %s"]
ps_cmd = _executor("windows")._build_command(cmd)[-1]
# Raw join would give: "git log --format=%H %s" — no quoting
# Safe version wraps each arg in single quotes
assert "'--format=%H %s'" in ps_cmd


# ── _quote_ps_arg helper ──────────────────────────────────────────────────────

def test_quote_ps_arg_plain():
assert _quote_ps_arg("git") == "'git'"


def test_quote_ps_arg_with_spaces():
assert _quote_ps_arg("my arg") == "'my arg'"


def test_quote_ps_arg_escapes_single_quote():
assert _quote_ps_arg("it's") == "'it''s'"


def test_quote_ps_arg_empty_string():
assert _quote_ps_arg("") == "''"
45 changes: 45 additions & 0 deletions tests/unit/test_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Unit tests for src/lib/logger.py — T008."""

import logging

import lib.logger as logger_module
from lib.logger import get_logger


def _reset():
"""Reset module-level _configured flag so _configure_root runs again."""
logger_module._configured = False
logging.getLogger().handlers.clear()


def test_get_logger_returns_named_logger():
logger = get_logger("cli_plugins.test")
assert isinstance(logger, logging.Logger)
assert logger.name == "cli_plugins.test"


def test_get_logger_default_level_info(monkeypatch):
monkeypatch.delenv("CLI_PLUGINS_LOG_LEVEL", raising=False)
_reset()
get_logger("cli_plugins.info_test")
assert logging.getLogger().level <= logging.INFO


def test_get_logger_env_var_debug(monkeypatch):
monkeypatch.setenv("CLI_PLUGINS_LOG_LEVEL", "DEBUG")
_reset()
get_logger("cli_plugins.debug_test")
assert logging.getLogger().level <= logging.DEBUG


def test_get_logger_invalid_level_does_not_raise(monkeypatch):
monkeypatch.setenv("CLI_PLUGINS_LOG_LEVEL", "NOT_A_LEVEL")
_reset()
logger = get_logger("cli_plugins.invalid_test")
assert isinstance(logger, logging.Logger)


def test_get_logger_same_name_returns_same_instance():
a = get_logger("cli_plugins.singleton")
b = get_logger("cli_plugins.singleton")
assert a is b
Loading
Loading