From 0d5609518b4815a9b8a960e38619c348c7b5b288 Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Fri, 10 Apr 2026 22:24:14 +0530 Subject: [PATCH 01/22] Debug: log procurement skill registration failure + custom CORS middleware Co-Authored-By: Claude Sonnet 4.6 --- api/routes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/api/routes.py b/api/routes.py index 8787600..c0b846b 100644 --- a/api/routes.py +++ b/api/routes.py @@ -44,8 +44,11 @@ def register_skills(): from skills.hackathon_novelty import skill_card as hackathon_card _skill_router.register(hackathon_card) - from skills.confidential_data_procurement import skill_card as procurement_card - _skill_router.register(procurement_card) + try: + from skills.confidential_data_procurement import skill_card as procurement_card + _skill_router.register(procurement_card) + except Exception as e: + logger.error("Failed to register confidential_data_procurement: %s", e, exc_info=True) # --- Helpers --- From e496d2ff554f0254c7e3b1ef2b3f6a40ff8b2b3e Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 16:53:45 +0530 Subject: [PATCH 02/22] Drop procurement skill, conversational /init, and dataset_audit stub - Delete skills/confidential_data_procurement/ (preserved in backup branch) - Delete skills/dataset_audit/ stub - Delete skills/hackathon_novelty/init.py (dead after /init removed) - Remove POST /init, /upload, /respond, /download/{token} endpoints - Remove init_handler / upload_handler / respond_handler from SkillCard - Remove InitRequest / InitResponse from core.models - Drop procurement and live e2e tests; rewrite test_e2e to seed instances directly until typed POST /instances lands in phase 4 Phase 1 of pivot/agent-skill. All 52 remaining tests pass. --- api/routes.py | 200 +--- core/models.py | 13 - core/skill_card.py | 3 - .../.env.example | 6 - .../confidential_data_procurement/__init__.py | 312 ------ skills/confidential_data_procurement/agent.py | 297 ------ .../confidential_data_procurement/config.py | 103 -- .../deterministic.py | 313 ------ .../guardrails.py | 110 -- .../confidential_data_procurement/ingest.py | 252 ----- skills/confidential_data_procurement/init.py | 266 ----- .../confidential_data_procurement/models.py | 121 --- skills/confidential_data_procurement/tools.py | 140 --- skills/dataset_audit/__init__.py | 0 skills/hackathon_novelty/__init__.py | 2 - skills/hackathon_novelty/init.py | 167 --- tests/test_data_procurement.py | 981 ------------------ tests/test_e2e.py | 272 +---- tests/test_live_e2e.py | 496 --------- tests/test_procurement_e2e.py | 450 -------- 20 files changed, 54 insertions(+), 4450 deletions(-) delete mode 100644 skills/confidential_data_procurement/.env.example delete mode 100644 skills/confidential_data_procurement/__init__.py delete mode 100644 skills/confidential_data_procurement/agent.py delete mode 100644 skills/confidential_data_procurement/config.py delete mode 100644 skills/confidential_data_procurement/deterministic.py delete mode 100644 skills/confidential_data_procurement/guardrails.py delete mode 100644 skills/confidential_data_procurement/ingest.py delete mode 100644 skills/confidential_data_procurement/init.py delete mode 100644 skills/confidential_data_procurement/models.py delete mode 100644 skills/confidential_data_procurement/tools.py delete mode 100644 skills/dataset_audit/__init__.py delete mode 100644 skills/hackathon_novelty/init.py delete mode 100644 tests/test_data_procurement.py delete mode 100644 tests/test_live_e2e.py delete mode 100644 tests/test_procurement_e2e.py diff --git a/api/routes.py b/api/routes.py index c0b846b..f8b9b94 100644 --- a/api/routes.py +++ b/api/routes.py @@ -2,20 +2,16 @@ import asyncio import logging import secrets -import traceback import uuid from datetime import datetime from functools import partial -from fastapi import APIRouter, File, HTTPException, Request, UploadFile -from fastapi.datastructures import FormData -from fastapi.responses import Response +from fastapi import APIRouter, HTTPException, Request logger = logging.getLogger(__name__) -from core.models import SkillResponse, InitRequest, InitResponse +from core.models import SkillResponse from skills.router import SkillRouter -from skills.confidential_data_procurement.ingest import store_authorized_download, get_download_bytes router = APIRouter() @@ -44,12 +40,6 @@ def register_skills(): from skills.hackathon_novelty import skill_card as hackathon_card _skill_router.register(hackathon_card) - try: - from skills.confidential_data_procurement import skill_card as procurement_card - _skill_router.register(procurement_card) - except Exception as e: - logger.error("Failed to register confidential_data_procurement: %s", e, exc_info=True) - # --- Helpers --- @@ -90,72 +80,6 @@ async def _run_pipeline(instance_id: str) -> int: # --- Endpoints --- -@router.post("/init") -async def init_instance(body: InitRequest): - """ - Conversational operator onboarding loop. - - First call: instance_id=None — creates instance, starts conversation. - Subsequent calls: include instance_id to continue the conversation. - The skill's init_handler owns all onboarding logic (prompts, LLM calls, config extraction). - Returns status='configuring' (skill needs more info) or status='ready' (tokens issued). - """ - if body.instance_id is None: - try: - card = _skill_router.get_card(body.skill_name) - except KeyError: - raise HTTPException(status_code=404, detail=f"Skill '{body.skill_name}' not found") - if card.init_handler is None: - raise HTTPException(status_code=400, detail=f"Skill '{body.skill_name}' does not support conversational setup") - instance_id = str(uuid.uuid4()) - _instances[instance_id] = { - "skill_name": body.skill_name, - "config": None, - "threshold": card.config.get("min_submissions", 5), - "conversation": [], - "triggered": False, - } - _submissions[instance_id] = {} - _results[instance_id] = {} - else: - instance_id = body.instance_id - if instance_id not in _instances: - raise HTTPException(status_code=404, detail="Instance not found") - card = _skill_router.get_card(_instances[instance_id]["skill_name"]) - - inst = _instances[instance_id] - - # Delegate entirely to the skill's init_handler — sync call wrapped in executor - loop = asyncio.get_event_loop() - result = await loop.run_in_executor( - None, card.init_handler, body.message, inst["conversation"] - ) - - # Store updated conversation returned by the handler - inst["conversation"] = result["conversation"] - - if result["status"] == "ready": - inst["config"] = result["config"] - inst["config"].instance_id = instance_id - inst["threshold"] = result.get("threshold", inst["threshold"]) - - admin_token = secrets.token_urlsafe(16) - _tokens[admin_token] = {"instance_id": instance_id, "role": "admin", "submission_ids": set()} - - return InitResponse( - instance_id=instance_id, - status="ready", - message=result["message"], - admin_token=admin_token, - ) - - return InitResponse( - instance_id=instance_id, - status="configuring", - message=result["message"], - ) - - @router.post("/register") def register_user(body: dict): """ @@ -389,126 +313,6 @@ def get_submissions(request: Request): return {"submissions": meta} -@router.post("/upload") -async def upload_file(request: Request): - """ - Generic file upload — delegates entirely to the skill's upload_handler. - Skills that need file upload declare upload_handler on their SkillCard. - The skill owns all parsing, storage, and validation logic. - - Returns whatever the skill's upload_handler returns (e.g. {"dataset_id": "..."}). - """ - token_info = _resolve_token(request) - instance_id = token_info["instance_id"] - card = _skill_router.get_card(_instances[instance_id]["skill_name"]) - - if card.upload_handler is None: - raise HTTPException( - status_code=400, - detail=f"Skill '{_instances[instance_id]['skill_name']}' does not support file upload", - ) - - try: - form: FormData = await request.form() - logger.info("upload: form fields received: %s", list(form.keys())) - for key in form.keys(): - val = form.get(key) - if hasattr(val, "filename"): - logger.info(" field=%s filename=%s content_type=%s", key, val.filename, val.content_type) - else: - logger.info(" field=%s value=%r", key, val) - loop = asyncio.get_running_loop() - result = await loop.run_in_executor(None, card.upload_handler, form, instance_id) - except ValueError as e: - logger.warning("upload: validation error: %s", e) - raise HTTPException(status_code=422, detail=str(e)) - except Exception as e: - logger.error("upload: unexpected error:\n%s", traceback.format_exc()) - raise HTTPException(status_code=500, detail=f"Upload failed: {e}") - - return result - - -@router.post("/respond") -async def respond_to_result(body: dict, request: Request): - """ - Deal response — delegates entirely to the skill's respond_handler. - Skills that support renegotiation declare respond_handler on their SkillCard. - - Body: { - "submission_id": str, - "action": "accept" | "reject" | "renegotiate", - "revised_value": float | null # only when action="renegotiate" - } - Returns: {"settlement_status": str, ...any extra fields the skill returns} - """ - token_info = _resolve_token(request) - instance_id = token_info["instance_id"] - role = token_info["role"] - card = _skill_router.get_card(_instances[instance_id]["skill_name"]) - - if card.respond_handler is None: - raise HTTPException( - status_code=400, - detail=f"Skill '{_instances[instance_id]['skill_name']}' does not support deal responses", - ) - - submission_id = body.get("submission_id") - if not submission_id: - raise HTTPException(status_code=422, detail="submission_id is required") - - instance_results = _results.get(instance_id, {}) - if submission_id not in instance_results: - raise HTTPException(status_code=404, detail="Result not found or not yet available") - - action = body.get("action") - if action not in ("accept", "reject", "renegotiate"): - raise HTTPException(status_code=422, detail="action must be 'accept', 'reject', or 'renegotiate'") - - try: - loop = asyncio.get_running_loop() - updated = await loop.run_in_executor( - None, - card.respond_handler, - instance_results[submission_id], - action, - body.get("revised_value"), - "buyer" if role == "admin" else "supplier", - _instances[instance_id]["config"], - ) - except ValueError as e: - raise HTTPException(status_code=422, detail=str(e)) - - # If deal just became authorized, index the CSV bytes under the release token. - # Note: _dataset_id is stripped by guardrails, so look it up from _submissions instead. - if updated.get("settlement_status") == "authorized" and updated.get("release_token"): - sub_record = _submissions.get(instance_id, {}).get(submission_id, {}) - dataset_id = sub_record.get("dataset_id") - if dataset_id: - store_authorized_download(updated["release_token"], dataset_id) - - _results[instance_id][submission_id] = updated - return {"settlement_status": updated.get("settlement_status")} - - -@router.get("/download/{token}") -async def download_dataset(token: str): - """ - Download the authorized dataset CSV using the release token. - The token itself is the bearer credential — no X-Instance-Token needed. - Only issued after both parties accept the deal (settlement_status='authorized'). - """ - try: - csv_bytes = get_download_bytes(token) - except KeyError: - raise HTTPException(status_code=404, detail="Invalid or expired download token") - return Response( - content=csv_bytes, - media_type="text/csv", - headers={"Content-Disposition": f"attachment; filename=\"dataset_{token[:8]}.csv\""}, - ) - - @router.post("/trigger") async def trigger(request: Request): """Manual pipeline trigger. Admin only. Uses stored instance config.""" diff --git a/core/models.py b/core/models.py index d2ecd5b..7372e52 100644 --- a/core/models.py +++ b/core/models.py @@ -32,16 +32,3 @@ class SkillResponse(BaseModel): trace: Optional[list[dict]] = None enclave_signature: Optional[str] = None # added by infra side attestation_quote: Optional[str] = None # added by infra side - - -class InitRequest(BaseModel): - skill_name: str - message: str # admin's configuration message - instance_id: Optional[str] = None # None on first call, set on subsequent calls - - -class InitResponse(BaseModel): - instance_id: str - status: str # "configuring" | "ready" - message: str # LLM response (question or confirmation) - admin_token: Optional[str] = None # only when status="ready" diff --git a/core/skill_card.py b/core/skill_card.py index 1e19ba8..8e47e12 100644 --- a/core/skill_card.py +++ b/core/skill_card.py @@ -34,9 +34,6 @@ class SkillCard: trigger_modes: list = field(default_factory=list) # supported trigger declarations roles: dict = field(default_factory=dict) # admin + user role declarations setup_prompt: str = "" # LLM onboarding text for admins (metadata/docs) - init_handler: Optional[Callable] = None # skill-owned onboarding conversation handler - upload_handler: Optional[Callable] = None # skill-owned file upload handler (POST /upload) - respond_handler: Optional[Callable] = None # skill-owned deal response handler (POST /respond) user_display: dict = field(default_factory=dict) # display hints per output key for the frontend renderer version: str = "0.1.0" diff --git a/skills/confidential_data_procurement/.env.example b/skills/confidential_data_procurement/.env.example deleted file mode 100644 index 493cb78..0000000 --- a/skills/confidential_data_procurement/.env.example +++ /dev/null @@ -1,6 +0,0 @@ -# Per-node model overrides for confidential_data_procurement skill. -# Copy to skills/confidential_data_procurement/.env and fill in values. -# Empty value = fallback to CONCLAVE_DEFAULT_MODEL in root .env - -CONCLAVE_CDP_INIT_MODEL=deepseek-ai/DeepSeek-V3.1 -CONCLAVE_CDP_EVALUATE_MODEL=deepseek-ai/DeepSeek-V3.1 diff --git a/skills/confidential_data_procurement/__init__.py b/skills/confidential_data_procurement/__init__.py deleted file mode 100644 index d4a0767..0000000 --- a/skills/confidential_data_procurement/__init__.py +++ /dev/null @@ -1,312 +0,0 @@ -""" -Entry point for the confidential_data_procurement skill. - -Pipeline (per submission — threshold=1, so always exactly one): - 0. ingest.py — CSV parse + metadata parse (no LLM) - 1. deterministic.py — quality metrics, component scores, price, deal check (no LLM) - 2. agent.py — schema matching, claim verification, explanation (LLM) - 3. guardrails.py — role-aware key filter, score clamping, leakage detection - 4. respond_handler — deal response + one-round renegotiation (3×3 resolution matrix) - -What to edit here: -- run_skill(): change how deterministic + agent results merge -- respond_handler / _resolve(): update renegotiation logic -- skill_card: update description, config, trigger_modes, roles, user_display - -The skill_card is consumed by the SkillRouter and the /skills API endpoint. -""" -from __future__ import annotations - -import secrets as _secrets - -from core.models import SkillResponse -from core.skill_card import SkillCard -from skills.confidential_data_procurement.agent import run_agent -from skills.confidential_data_procurement.config import ( - ALLOWED_OUTPUT_KEYS, - USER_OUTPUT_KEYS, -) -from skills.confidential_data_procurement.deterministic import ( - check_deal, - compute_price, - compute_quality_score, - run_deterministic, -) -from skills.confidential_data_procurement.guardrails import ProcurementFilter -from skills.confidential_data_procurement.init import procurement_init_handler -from skills.confidential_data_procurement.ingest import get_dataset, procurement_upload_handler -from skills.confidential_data_procurement.models import ( - BuyerPolicy, - ProcurementResult, - SupplierSubmission, -) -from skills.confidential_data_procurement.tools import set_context - - -def run_skill(inputs: list[SupplierSubmission], params: BuyerPolicy) -> SkillResponse: - """ - Full pipeline: deterministic → [agent — Commit 7] → guardrails → response. - - With threshold=1, inputs always has exactly one SupplierSubmission. - The dataset DataFrame lives in the ingest store — never serialized or passed to LLM. - """ - results = [] - - for sub in inputs: - det = run_deterministic(sub.dataset_id, params, sub.reserve_price) - metrics = det["metrics"] - - if metrics.critical_failure: - result = ProcurementResult( - submission_id=sub.submission_id, - deal=False, - quality_score=0.0, - component_scores={}, - proposed_payment=params.base_price, - hard_constraints_pass=False, - settlement_status="rejected", - notes=det["notes"], - ) - else: - # Agent layer — schema matching + claim verification + explanation - dataset = get_dataset(sub.dataset_id) - set_context(sub.dataset_id, { - "required_columns": params.required_columns, - "column_definitions": dataset.get("column_definitions") or {}, - "seller_claims": dataset.get("seller_claims") or {}, - }) - agent_result = run_agent(sub.dataset_id, params, metrics, det["component_scores"]) - - # Merge agent's refined scores into component_scores, recompute quality - component_scores = {**det["component_scores"]} - component_scores["schema"] = agent_result["schema_score"] - component_scores["claim_veracity"] = agent_result["claim_veracity_score"] - - quality_score = compute_quality_score(component_scores, params) - proposed_payment = compute_price(quality_score, params.base_price, params.max_budget) - deal = check_deal( - metrics.hard_constraints_pass, sub.reserve_price, - proposed_payment, params.max_budget, - ) - settlement_status = "pending_approval" if deal else "rejected" - - result = ProcurementResult( - submission_id=sub.submission_id, - deal=deal, - quality_score=quality_score, - component_scores=component_scores, - proposed_payment=proposed_payment, - hard_constraints_pass=metrics.hard_constraints_pass, - settlement_status=settlement_status, - notes=det["notes"], - explanation=agent_result.get("explanation"), - schema_matching=agent_result.get("schema_matching"), - claim_verification=agent_result.get("claim_verification"), - ) - - result_dict = result.model_dump() - result_dict["_dataset_id"] = sub.dataset_id # internal — for post-deal download - results.append(result_dict) - - # Guardrails — admin-level filter stores all allowed keys. - # Role-based filtering (buyer vs supplier) happens in routes.py GET /results. - output_filter = ProcurementFilter(role="admin") - filtered = output_filter.apply(results, raw_inputs=[]) - - return SkillResponse(skill="confidential_data_procurement", results=filtered) - - -def procurement_respond_handler( - result: dict, - action: str, - revised_value: float | None, - role: str, # "buyer" or "supplier" (mapped from "admin"/"user" in routes.py) - policy: BuyerPolicy, -) -> dict: - """ - Process one deal response and advance the settlement state machine. - - 3×3 resolution matrix (B = buyer, S = supplier): - - B \\ S | accept | reject | renegotiate - --------|--------|--------|------------ - accept | auth | reject | auth* - reject | reject | reject | reject - reneg | auth* | reject | check† - - * auth at proposed_payment — the acceptor already committed - † auth if revised_budget >= revised_reserve, else rejected - - One renegotiation round only — ValueError if renegotiation_used is True. - revised_value is required when action='renegotiate'. - """ - result = dict(result) # shallow copy — don't mutate caller's dict - - if action == "renegotiate": - if result.get("renegotiation_used"): - raise ValueError("Renegotiation already used. Only one round is allowed.") - if revised_value is None: - raise ValueError("revised_value is required when action='renegotiate'.") - if role == "buyer": - revised_value = float(revised_value) - if revised_value < (policy.base_price or 0.0): - raise ValueError( - f"Revised payment (${revised_value:,.2f}) cannot be below " - f"base price (${policy.base_price:,.2f})." - ) - if revised_value > policy.max_budget: - raise ValueError( - f"Revised payment (${revised_value:,.2f}) cannot exceed " - f"max budget (${policy.max_budget:,.2f})." - ) - else: # supplier - revised_value = float(revised_value) - if revised_value < 0: - raise ValueError("Revised reserve price cannot be negative.") - - # Store this party's response - if role == "buyer": - result["buyer_response"] = action - if action == "renegotiate": - result["revised_budget"] = revised_value - else: - result["supplier_response"] = action - if action == "renegotiate": - result["revised_reserve"] = revised_value - - # If both parties have now responded, resolve; otherwise await counterparty - buyer_resp = result.get("buyer_response") - supplier_resp = result.get("supplier_response") - - if buyer_resp is None or supplier_resp is None: - result["settlement_status"] = "awaiting_counterparty" - return result - - return _resolve(result) - - -def _resolve(result: dict) -> dict: - """Apply the 3×3 matrix once both buyer_response and supplier_response are set.""" - buyer_resp = result["buyer_response"] - supplier_resp = result["supplier_response"] - - # Any reject → deal off - if buyer_resp == "reject" or supplier_resp == "reject": - result["settlement_status"] = "rejected" - result["deal"] = False - return result - - # Both accept → authorized - if buyer_resp == "accept" and supplier_resp == "accept": - result["settlement_status"] = "authorized" - result["deal"] = True - result["release_token"] = _secrets.token_urlsafe(16) - return result - - # One accepts + other renegotiates → honor the acceptor's bound (proposed_payment) - if buyer_resp == "accept" or supplier_resp == "accept": - result["settlement_status"] = "authorized" - result["deal"] = True - result["renegotiation_used"] = True - result["release_token"] = _secrets.token_urlsafe(16) - return result - - # Both renegotiate → check if revised terms meet - result["renegotiation_used"] = True - revised_budget = float(result.get("revised_budget") or result.get("proposed_payment") or 0) - revised_reserve = float(result.get("revised_reserve") or 0) - - if revised_budget >= revised_reserve: - result["settlement_status"] = "authorized" - result["deal"] = True - result["proposed_payment"] = revised_budget - result["release_token"] = _secrets.token_urlsafe(16) - else: - result["settlement_status"] = "rejected" - result["deal"] = False - note = ( - f"Renegotiation failed: buyer's revised offer (${revised_budget:,.2f}) " - f"is below supplier's revised reserve (${revised_reserve:,.2f})." - ) - notes = list(result.get("notes") or []) - notes.append(note) - result["notes"] = notes - - return result - - -skill_card = SkillCard( - name="confidential_data_procurement", - description=( - "Bilateral confidential dataset trade protocol. A buyer defines acquisition " - "policy and budget; a supplier uploads a CSV dataset with a reserve price. " - "The TEE evaluates data quality (null rates, duplicates, schema match, claim " - "verification) and proposes a fair price — neither party sees the other's " - "private numbers. Only derived quality metrics and the deal verdict leave " - "the enclave." - ), - run=run_skill, - input_model=SupplierSubmission, - output_keys=ALLOWED_OUTPUT_KEYS, - user_output_keys=USER_OUTPUT_KEYS, - config={"min_submissions": 1}, - trigger_modes=[ - { - "mode": "instant", - "description": ( - "Pipeline fires immediately when the supplier submits. " - "Each submission is evaluated independently against the buyer's policy." - ), - "default_config": {"min_submissions": 1}, - "admin_configurable": False, - }, - ], - roles={ - "admin": { - "description": ( - "Data buyer. Initialises the instance with an acquisition policy " - "(required columns, quality thresholds, budget range). Sees full " - "quality scores and proposed payment. Can accept, reject, or " - "renegotiate the deal." - ), - "capabilities": ["configure", "view_all_results", "respond"], - }, - "user": { - "description": ( - "Data supplier. Uploads a CSV dataset and metadata, sets a reserve " - "price, and submits for evaluation. Sees the proposed payment and " - "deal verdict but NOT the quality score (to prevent budget " - "reverse-engineering). Can accept, reject, or renegotiate." - ), - "capabilities": ["upload", "submit", "respond"], - "result_view": "own", - }, - }, - setup_prompt=( - "This skill runs a confidential dataset trade inside a TEE. " - "No raw data or private budget numbers ever leave the enclave.\n\n" - "As the buyer (admin), you need to provide:\n" - "1. **Required columns** — the column names you expect in the dataset.\n" - "2. **Quality thresholds** — minimum rows, max null rate, max duplicate rate.\n" - "3. **Budget** — your maximum budget and optional base (floor) price.\n" - "4. **(Optional)** Label column + minimum label rate.\n" - "5. **(Optional)** Forbidden columns — PII fields to automatically block.\n\n" - "The supplier will upload a CSV + metadata and set a reserve price. " - "The TEE computes a quality score, proposes a fair price, and checks " - "if the deal is viable (reserve ≤ price ≤ budget). Both parties then " - "accept, reject, or renegotiate." - ), - init_handler=procurement_init_handler, - upload_handler=procurement_upload_handler, - respond_handler=procurement_respond_handler, - user_display={ - "deal": {"type": "badge", "label": "Deal Status"}, - "quality_score": {"type": "gauge", "label": "Quality Score", "min": 0, "max": 1}, - "proposed_payment": {"type": "currency", "label": "Proposed Payment"}, - "settlement_status": {"type": "badge", "label": "Settlement"}, - "notes": {"type": "list", "label": "Notes"}, - "explanation": {"type": "text", "label": "Analysis"}, - "schema_matching": {"type": "json", "label": "Schema Matching"}, - "claim_verification": {"type": "json", "label": "Claim Verification"}, - }, -) diff --git a/skills/confidential_data_procurement/agent.py b/skills/confidential_data_procurement/agent.py deleted file mode 100644 index a84453c..0000000 --- a/skills/confidential_data_procurement/agent.py +++ /dev/null @@ -1,297 +0,0 @@ -""" -Single evaluate_node agent for confidential_data_procurement. - -Graph: StateGraph with single evaluate_node → END. -Provides LangSmith trace visibility with proper node names, tool calls, and timing. - -The dataset never leaves the TEE — the LLM sees only aggregate statistics -returned by the tools. validate_tool_output() in tools.py blocks raw row dumps. - -Graph: - evaluate_node (LLM + tools) → END -""" -from __future__ import annotations - -import json -import re -from typing import Any, Annotated, TypedDict - -from langchain_core.messages import HumanMessage, SystemMessage, BaseMessage -from langgraph.graph import StateGraph, END -from langgraph.graph.message import add_messages -from langgraph.prebuilt import ToolNode - -from config import get_llm -from skills.confidential_data_procurement.config import EVALUATE_MODEL -from skills.confidential_data_procurement.models import BuyerPolicy, DatasetMetrics -from skills.confidential_data_procurement.tools import EVALUATE_TOOLS, set_context - - -EVALUATE_PROMPT_VERSION = "v1" - - -_SYSTEM_PROMPT = """\ -You are a data quality evaluator running inside a Trusted Execution Environment (TEE). -Your job is to assess a supplier's dataset against a buyer's acquisition policy. - -You have three tools: - - get_schema_summary() — column names, dtypes, null rates, row count - - get_column_stats(column_name) — per-column statistics - - get_value_distribution(column_name, top_n) — top-N value frequencies - -TASK 1 — SCHEMA MATCHING -The buyer requires these columns (semantic — names may differ from actual dataset): -{required_columns} - -Column definitions provided by the seller: -{column_definitions} - -For each required column, find the best matching actual column. -A match is valid if the column names are semantically equivalent -(e.g. "transaction_id" ≈ "txn_id", "is_fraud" ≈ "fraud_label"). -Score schema_score as: matched_count / required_count (0.0 if none match, 1.0 if all match). - -TASK 2 — CLAIM VERIFICATION -The seller claims: -{seller_claims} - -Call get_column_stats or get_value_distribution to check each claim against real data. -Mark each claim as "verified", "disputed", or "unverifiable" (if no relevant column exists). -Score claim_veracity_score as: verified_count / total_claims (1.0 if no claims). - -TASK 3 — EXPLANATION -Write a concise (3-5 sentence) neutral explanation covering: -- Which required columns were found/missing -- Whether seller's claims held up -- Any notable quality concerns from the deterministic metrics - -IMPORTANT: -- Only use aggregate stats from tools — never infer individual values -- Do not mention the buyer's budget, base price, or quality score -- Keep explanation under 400 words - -After calling the tools you need, output ONLY this JSON (no markdown fences, no prose): -{{ - "schema_score": 0.0-1.0, - "claim_veracity_score": 0.0-1.0, - "schema_matching": {{"required_col": "matched_col_or_null", ...}}, - "claim_verification": {{"claim_text": "verified|disputed|unverifiable", ...}}, - "explanation": "..." -}} -""" - - -class EvaluateState(TypedDict): - messages: Annotated[list[BaseMessage], add_messages] - dataset_id: str - policy: Any # BuyerPolicy — held in-memory, not serialized - metrics: Any # DatasetMetrics — held in-memory, not serialized - eval_result: dict - - -# --- Node --- - -def evaluate_node(state: EvaluateState) -> dict: - """LLM node: schema matching + claim verification + explanation with tool loop.""" - from skills.confidential_data_procurement.ingest import get_dataset - - dataset_id = state["dataset_id"] - policy: BuyerPolicy = state["policy"] - metrics: DatasetMetrics = state["metrics"] - - dataset = get_dataset(dataset_id) - column_definitions = dataset.get("column_definitions") or {} - seller_claims = dataset.get("seller_claims") or {} - - # Bind tools to the active dataset - set_context(dataset_id, { - "required_columns": policy.required_columns or [], - "column_definitions": column_definitions, - "seller_claims": seller_claims, - }) - - required_str = ", ".join(policy.required_columns) if policy.required_columns else "(none)" - definitions_str = ( - "\n".join(f" {col}: {defn}" for col, defn in column_definitions.items()) - if column_definitions else " (no definitions provided)" - ) - claims_str = ( - "\n".join(f" - {k}: {v}" for k, v in seller_claims.items()) - if seller_claims else " (no claims provided)" - ) - - system_content = _SYSTEM_PROMPT.format( - required_columns=required_str, - column_definitions=definitions_str, - seller_claims=claims_str, - ) - - det_note = ( - f"Deterministic metrics already computed:\n" - f" rows={metrics.row_count}, " - f" overall_null_rate={metrics.overall_null_rate:.1%}, " - f" duplicate_rate={metrics.duplicate_rate:.1%}, " - f" hard_constraints_pass={metrics.hard_constraints_pass}" - ) - - llm = get_llm(EVALUATE_MODEL).bind_tools(EVALUATE_TOOLS) - messages = [ - SystemMessage(content=system_content), - HumanMessage(content=( - f"Evaluate the dataset now.\n\n{det_note}\n\n" - "Call get_schema_summary first, then any other tools you need, " - "then output the final JSON." - )), - ] - - # Tool loop — LLM decides when to stop calling tools - max_iterations = 10 - response = None - for _ in range(max_iterations): - response = llm.invoke(messages) - messages.append(response) - if not (hasattr(response, "tool_calls") and response.tool_calls): - break - tool_node = ToolNode(EVALUATE_TOOLS) - tool_results = tool_node.invoke({"messages": messages}) - messages.extend(tool_results["messages"]) - - raw = response.content if response and isinstance(response.content, str) else "" - - # Nudge if LLM stopped without producing JSON - if raw.strip() and not _looks_like_json(raw): - messages.append(HumanMessage(content=( - "Now output ONLY the final JSON object with schema_score, " - "claim_veracity_score, schema_matching, claim_verification, and explanation." - ))) - response = llm.invoke(messages) - messages.append(response) - raw = response.content if isinstance(response.content, str) else "" - - parsed = _parse_agent_output(raw, policy, seller_claims) - return {"messages": messages, "eval_result": parsed} - - -# --- Graph builder --- - -def _build_evaluate_graph(): - """Build and compile the single-node StateGraph for dataset evaluation.""" - graph = StateGraph(EvaluateState) - graph.add_node("evaluate", evaluate_node) - graph.set_entry_point("evaluate") - graph.add_edge("evaluate", END) - return graph.compile() - - -# --- Entry point --- - -def run_agent( - dataset_id: str, - policy: BuyerPolicy, - metrics: DatasetMetrics, - component_scores: dict[str, float], -) -> dict[str, Any]: - """ - Run the evaluate node for one dataset. - - Returns a dict with: - schema_score, claim_veracity_score, schema_matching, claim_verification, explanation - Falls back to safe defaults if the LLM output cannot be parsed. - """ - graph = _build_evaluate_graph() - - initial_state: EvaluateState = { - "messages": [], - "dataset_id": dataset_id, - "policy": policy, - "metrics": metrics, - "eval_result": {}, - } - - final_state = graph.invoke(initial_state, config={ - "recursion_limit": 50, - "metadata": {"evaluate_prompt": EVALUATE_PROMPT_VERSION}, - }) - return final_state["eval_result"] - - -# --------------------------------------------------------------------------- -# Parsers -# --------------------------------------------------------------------------- - -def _looks_like_json(text: str) -> bool: - return bool(re.search(r'\{', text)) - - -def _parse_agent_output( - text: str, - policy: BuyerPolicy, - seller_claims: dict, -) -> dict[str, Any]: - """Extract agent JSON from LLM response. Falls back to safe defaults.""" - text = text.strip() - # Strip markdown fences - if text.startswith("```"): - lines = text.splitlines() - inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:] - text = "\n".join(inner).strip() - - obj = None - match = re.search(r'\{', text) - if match: - start = match.start() - depth = 0 - in_str = False - escape = False - end = -1 - for i in range(start, len(text)): - c = text[i] - if escape: - escape = False - continue - if c == "\\" and in_str: - escape = True - continue - if c == '"': - in_str = not in_str - if not in_str: - if c == "{": - depth += 1 - elif c == "}": - depth -= 1 - if depth == 0: - end = i + 1 - break - if end != -1: - try: - obj = json.loads(text[start:end]) - except (json.JSONDecodeError, ValueError): - obj = None - - if not obj: - return _safe_defaults(policy, seller_claims) - - schema_score = float(obj.get("schema_score") or 0.5) - schema_score = max(0.0, min(1.0, schema_score)) - - claim_veracity_score = float(obj.get("claim_veracity_score") or 1.0) - claim_veracity_score = max(0.0, min(1.0, claim_veracity_score)) - - return { - "schema_score": schema_score, - "claim_veracity_score": claim_veracity_score, - "schema_matching": obj.get("schema_matching") or {}, - "claim_verification": obj.get("claim_verification") or {}, - "explanation": str(obj.get("explanation") or ""), - } - - -def _safe_defaults(policy: BuyerPolicy, seller_claims: dict) -> dict[str, Any]: - """Return conservative defaults when agent output cannot be parsed.""" - return { - "schema_score": 0.5, - "claim_veracity_score": 1.0, - "schema_matching": {col: None for col in policy.required_columns}, - "claim_verification": {k: "unverifiable" for k in seller_claims}, - "explanation": "Automated evaluation completed. Schema and claim verification results unavailable.", - } diff --git a/skills/confidential_data_procurement/config.py b/skills/confidential_data_procurement/config.py deleted file mode 100644 index 2d81cdb..0000000 --- a/skills/confidential_data_procurement/config.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Skill-specific constants for confidential_data_procurement. - -What to edit here: -- ALLOWED_OUTPUT_KEYS: buyer (admin) view — keys that leave the pipeline to the buyer -- USER_OUTPUT_KEYS: supplier (participant) view — subset of ALLOWED_OUTPUT_KEYS. - quality_score and hard_constraints_pass are buyer-only to prevent - the supplier from reverse-engineering max_budget via P/S = max_budget. -- SCORE_BOUNDS: clamping ranges for numeric output fields -- DEFAULT_SCORE_WEIGHTS: used when buyer doesn't specify score_weights in BuyerPolicy -- CRITICAL_*: deterministic early-exit thresholds (no LLM runs on critical failure) -- *_MODEL: per-node model overrides (set in .env) - -Consumed by: -- deterministic.py (CRITICAL_*, DEFAULT_SCORE_WEIGHTS) -- guardrails.py (ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS, SCORE_BOUNDS) -- __init__.py (ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS via skill_card) -- agent.py (EVALUATE_MODEL) -- init.py (INIT_MODEL) -""" -from __future__ import annotations - -import os - -from dotenv import load_dotenv - -load_dotenv(os.path.join(os.path.dirname(__file__), ".env")) - -# --- Output key sets --- - -# Buyer (admin) sees quality details + budget-sensitive fields -ALLOWED_OUTPUT_KEYS: set[str] = { - "submission_id", - "deal", - "quality_score", # buyer-only (budget leak if supplier sees this + proposed_payment) - "component_scores", # buyer-only (same reasoning as quality_score) - "proposed_payment", - "hard_constraints_pass", # buyer-only - "settlement_status", - "release_token", - "notes", - "explanation", - "claim_verification", - "schema_matching", - "buyer_response", - "supplier_response", - "renegotiation_used", - "revised_budget", # buyer's counter-offer — disclosed to seller during renegotiation - "revised_reserve", # seller's counter-offer — disclosed to buyer during renegotiation -} - -# Supplier (participant) — same info, quality_score and hard_constraints_pass withheld -USER_OUTPUT_KEYS: set[str] = { - "submission_id", - "deal", - "proposed_payment", - "settlement_status", - "release_token", - "notes", - "explanation", - "claim_verification", - "schema_matching", - "buyer_response", - "supplier_response", - "renegotiation_used", - "revised_budget", # buyer's counter-offer — visible to seller so they can respond -} - -# --- Score bounds (used by guardrails for clamping) --- - -SCORE_BOUNDS: dict[str, tuple[float, float]] = { - "quality_score": (0.0, 1.0), -} - -# --- Default score weights --- -# Buyer can override via BuyerPolicy.score_weights. Must sum to 1.0. -DEFAULT_SCORE_WEIGHTS: dict[str, float] = { - "schema": 0.15, - "coverage": 0.15, - "null": 0.20, - "duplicate": 0.15, - "label": 0.10, - "risk": 0.15, - "claim_veracity": 0.10, -} - -# --- Critical failure thresholds (deterministic early exit, no LLM) --- - -# Duplicate rate above this → critical failure, deal rejected immediately -CRITICAL_DUPLICATE_THRESHOLD: float = 0.50 - -# Dataset size limits -MAX_DATASET_SIZE_MB: int = 50 -MAX_DATASET_ROWS: int = 500_000 - -# Minimum leakage substring length passed to LeakageDetector -MIN_LEAKAGE_SUBSTRING_LENGTH: int = 20 - -# --- Per-node model overrides --- - -_default = os.environ.get("CONCLAVE_DEFAULT_MODEL", "deepseek-ai/DeepSeek-V3.1") -INIT_MODEL = os.environ.get("CONCLAVE_CDP_INIT_MODEL") or _default -EVALUATE_MODEL = os.environ.get("CONCLAVE_CDP_EVALUATE_MODEL") or _default diff --git a/skills/confidential_data_procurement/deterministic.py b/skills/confidential_data_procurement/deterministic.py deleted file mode 100644 index 9a0b7a8..0000000 --- a/skills/confidential_data_procurement/deterministic.py +++ /dev/null @@ -1,313 +0,0 @@ -""" -Deterministic quality evaluation layer for confidential_data_procurement. - -No LLM calls. Pure pandas + math. - -Pipeline: - 1. compute_metrics() — null rates, duplicate rate, label rate, forbidden col check - 2. check_critical() — early exit if any hard constraint is fatally violated - 3. compute_component_scores() — each dimension scored [0, 1] - 4. compute_quality_score() — weighted sum clamped to [0, 1] - 5. compute_price() — P = base_price + (max_budget - base_price) * S - 6. check_deal() — R <= P <= B and hard_constraints_pass - 7. run_deterministic() — orchestrates all of the above - -Note: schema_score and claim_veracity are placeholders (0.5 and 1.0 respectively) -until the agent layer runs fuzzy column matching and claim verification. -run_skill() will merge the agent's verdicts into the final quality score. -""" -from __future__ import annotations - -import math -from typing import Any - -import pandas as pd - -from skills.confidential_data_procurement.config import ( - CRITICAL_DUPLICATE_THRESHOLD, - DEFAULT_SCORE_WEIGHTS, -) -from skills.confidential_data_procurement.ingest import get_dataset -from skills.confidential_data_procurement.models import BuyerPolicy, DatasetMetrics - - -# --------------------------------------------------------------------------- -# Step 1: Metrics -# --------------------------------------------------------------------------- - -def compute_metrics(df: pd.DataFrame, policy: BuyerPolicy) -> DatasetMetrics: - """Compute all quality metrics from the raw DataFrame.""" - row_count = len(df) - column_names = list(df.columns) - - # Null rates per column and overall - null_rate_by_column = {col: float(df[col].isna().mean()) for col in df.columns} - total_cells = df.size - overall_null_rate = float(df.isna().sum().sum() / total_cells) if total_cells > 0 else 0.0 - - # Duplicate rate - duplicate_rate = float(df.duplicated().mean()) if row_count > 0 else 0.0 - - # Label rate — fraction of positive/truthy values in label column - label_rate: float | None = None - if policy.label_column and policy.label_column in df.columns: - col = df[policy.label_column].dropna() - if len(col) > 0: - label_rate = float((col.astype(bool)).mean()) - - # Forbidden columns present - forbidden_columns_present = [ - col for col in policy.forbidden_columns if col in column_names - ] - - # Hard constraints: forbidden cols absent + not critical duplicate rate - hard_constraints_pass = ( - len(forbidden_columns_present) == 0 - and duplicate_rate < CRITICAL_DUPLICATE_THRESHOLD - and row_count > 0 - ) - - # Critical failure detection - critical_failure = False - critical_reason: str | None = None - - if row_count == 0: - critical_failure = True - critical_reason = "Dataset is empty — no rows to evaluate." - elif forbidden_columns_present: - critical_failure = True - critical_reason = ( - f"Forbidden column(s) detected: {', '.join(forbidden_columns_present)}. " - "Deal rejected to protect data privacy constraints." - ) - elif duplicate_rate >= CRITICAL_DUPLICATE_THRESHOLD: - critical_failure = True - critical_reason = ( - f"Duplicate rate ({duplicate_rate:.1%}) exceeds critical threshold " - f"({CRITICAL_DUPLICATE_THRESHOLD:.0%}). Dataset quality is insufficient." - ) - - return DatasetMetrics( - row_count=row_count, - column_names=column_names, - null_rate_by_column=null_rate_by_column, - overall_null_rate=overall_null_rate, - duplicate_rate=duplicate_rate, - label_rate=label_rate, - forbidden_columns_present=forbidden_columns_present, - hard_constraints_pass=hard_constraints_pass, - critical_failure=critical_failure, - critical_reason=critical_reason, - ) - - -# --------------------------------------------------------------------------- -# Step 2: Critical check -# --------------------------------------------------------------------------- - -def check_critical(metrics: DatasetMetrics) -> tuple[bool, str | None]: - """Return (is_critical, reason). Caller should early-exit if is_critical.""" - return metrics.critical_failure, metrics.critical_reason - - -# --------------------------------------------------------------------------- -# Step 3: Component scores -# --------------------------------------------------------------------------- - -def compute_component_scores( - metrics: DatasetMetrics, policy: BuyerPolicy -) -> dict[str, float]: - """ - Score each quality dimension in [0, 1]. - - schema_score: 0.5 placeholder — agent will compute fuzzy match verdict. - claim_veracity: 1.0 placeholder — agent will compute claim verification score. - """ - scores: dict[str, float] = {} - - # Schema — agent will refine this - scores["schema"] = 0.5 - - # Coverage: how close are we to the required row count? - scores["coverage"] = min(metrics.row_count / policy.min_rows, 1.0) - - # Null score: penalise for null rate exceeding the policy threshold - if policy.max_null_rate > 0: - scores["null"] = max(0.0, 1.0 - (metrics.overall_null_rate / policy.max_null_rate)) - else: - scores["null"] = 1.0 if metrics.overall_null_rate == 0 else 0.0 - - # Duplicate score - if policy.max_duplicate_rate > 0: - scores["duplicate"] = max( - 0.0, 1.0 - (metrics.duplicate_rate / policy.max_duplicate_rate) - ) - else: - scores["duplicate"] = 1.0 if metrics.duplicate_rate == 0 else 0.0 - - # Label score - if policy.min_label_rate is not None and policy.min_label_rate > 0: - label_rate = metrics.label_rate or 0.0 - scores["label"] = min(label_rate / policy.min_label_rate, 1.0) - else: - scores["label"] = 1.0 # not required - - # Risk score: hard 0 if forbidden columns present - scores["risk"] = 0.0 if metrics.forbidden_columns_present else 1.0 - - # Claim veracity — agent will refine this - scores["claim_veracity"] = 1.0 - - return scores - - -# --------------------------------------------------------------------------- -# Step 4: Weighted quality score -# --------------------------------------------------------------------------- - -def compute_quality_score( - component_scores: dict[str, float], policy: BuyerPolicy -) -> float: - """ - Weighted sum of component scores, clamped to [0, 1]. - Uses policy.score_weights if set, otherwise DEFAULT_SCORE_WEIGHTS. - """ - weights = policy.score_weights if policy.score_weights else DEFAULT_SCORE_WEIGHTS - total = sum( - weights.get(key, 0.0) * score for key, score in component_scores.items() - ) - return max(0.0, min(1.0, total)) - - -# --------------------------------------------------------------------------- -# Step 5: Price -# --------------------------------------------------------------------------- - -def compute_price(S: float, base_price: float, max_budget: float) -> float: - """ - P = base_price + (max_budget - base_price) * S - - S=0 → P = base_price (floor: minimum payment even for poor quality) - S=1 → P = max_budget (ceiling: full payment for perfect quality) - """ - return round(base_price + (max_budget - base_price) * S, 2) - - -# --------------------------------------------------------------------------- -# Step 6: Deal condition -# --------------------------------------------------------------------------- - -def check_deal( - hard_constraints_pass: bool, - reserve_price: float, - proposed_payment: float, - max_budget: float, -) -> bool: - """ - deal = hard_constraints_pass AND (reserve_price <= proposed_payment <= max_budget) - """ - return ( - hard_constraints_pass - and reserve_price <= proposed_payment <= max_budget - ) - - -# --------------------------------------------------------------------------- -# Step 7: Orchestrator -# --------------------------------------------------------------------------- - -def run_deterministic( - dataset_id: str, - policy: BuyerPolicy, - reserve_price: float, -) -> dict[str, Any]: - """ - Run the full deterministic evaluation for a single dataset. - - Returns a dict consumed by run_skill(): - { - "metrics": DatasetMetrics, - "component_scores": dict[str, float], - "quality_score": float, # preliminary S (schema + claim are placeholders) - "proposed_payment": float, - "deal": bool, - "notes": list[str], # human-readable partial-failure notes - } - """ - dataset = get_dataset(dataset_id) - df: pd.DataFrame = dataset["df"] - - # Step 1 - metrics = compute_metrics(df, policy) - - # Step 2 — critical failures propagate directly to run_skill for early exit - if metrics.critical_failure: - return { - "metrics": metrics, - "component_scores": {}, - "quality_score": 0.0, - "proposed_payment": policy.base_price, - "deal": False, - "notes": [metrics.critical_reason] if metrics.critical_reason else [], - } - - # Step 3 - component_scores = compute_component_scores(metrics, policy) - - # Step 4 - quality_score = compute_quality_score(component_scores, policy) - - # Step 5 - proposed_payment = compute_price(quality_score, policy.base_price, policy.max_budget) - - # Step 6 - deal = check_deal( - metrics.hard_constraints_pass, reserve_price, proposed_payment, policy.max_budget - ) - - # Build human-readable notes for partial failures (non-critical but notable) - notes: list[str] = [] - - if metrics.overall_null_rate > policy.max_null_rate: - notes.append( - f"Null rate ({metrics.overall_null_rate:.1%}) exceeds policy threshold " - f"({policy.max_null_rate:.1%}). Quality score penalised." - ) - - if metrics.duplicate_rate > policy.max_duplicate_rate: - notes.append( - f"Duplicate rate ({metrics.duplicate_rate:.1%}) exceeds policy threshold " - f"({policy.max_duplicate_rate:.1%}). Quality score penalised." - ) - - if metrics.row_count < policy.min_rows: - notes.append( - f"Row count ({metrics.row_count:,}) is below policy minimum " - f"({policy.min_rows:,}). Coverage score penalised." - ) - - if ( - policy.min_label_rate is not None - and metrics.label_rate is not None - and metrics.label_rate < policy.min_label_rate - ): - notes.append( - f"Label rate ({metrics.label_rate:.2%}) is below policy minimum " - f"({policy.min_label_rate:.2%})." - ) - - if not deal and not metrics.critical_failure: - if reserve_price > proposed_payment: - notes.append( - f"Proposed payment (${proposed_payment:,.2f}) is below supplier's " - "reserve price. Consider renegotiation." - ) - - return { - "metrics": metrics, - "component_scores": component_scores, - "quality_score": quality_score, - "proposed_payment": proposed_payment, - "deal": deal, - "notes": notes, - } diff --git a/skills/confidential_data_procurement/guardrails.py b/skills/confidential_data_procurement/guardrails.py deleted file mode 100644 index 602dff5..0000000 --- a/skills/confidential_data_procurement/guardrails.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Output filter and tool output validator for confidential_data_procurement. - -ProcurementFilter — role-aware output filter: - - Buyer (admin): sees quality_score, hard_constraints_pass - - Supplier (user): those two fields are withheld (budget leak prevention — - if supplier sees quality_score + proposed_payment they can - compute max_budget = P / S) - -validate_tool_output — programmatic guardrail wrapping every agent data tool: - - Blocks raw row dumps (too many CSV-like lines) - - Blocks high-cardinality value lists (> MAX_TOOL_OUTPUT_ITEMS list entries) - - Blocks oversized blobs (> MAX_TOOL_OUTPUT_CHARS) - -LeakageDetector is applied inside ProcurementFilter.apply() (inherited from -OutputFilterBase). Even if the LLM echoes a cell value in its explanation, the -detector flags it before the response leaves the pipeline. -""" -from __future__ import annotations - -from core.guardrails import LeakageDetector, OutputFilterBase -from skills.confidential_data_procurement.config import ( - ALLOWED_OUTPUT_KEYS, - MIN_LEAKAGE_SUBSTRING_LENGTH, - SCORE_BOUNDS, - USER_OUTPUT_KEYS, -) - -# --------------------------------------------------------------------------- -# Tool output guardrail constants -# --------------------------------------------------------------------------- - -MAX_TOOL_OUTPUT_CHARS: int = 4_000 -MAX_TOOL_OUTPUT_ITEMS: int = 50 # max enumerated items (bullet/colon lines) -MAX_RAW_ROW_LINES: int = 5 # more comma-separated lines than this → raw dump - - -# --------------------------------------------------------------------------- -# Role-aware output filter -# --------------------------------------------------------------------------- - -class ProcurementFilter(OutputFilterBase): - """ - Role-aware output filter for the dataset procurement pipeline. - - role="admin" → buyer view — full ALLOWED_OUTPUT_KEYS (includes quality_score) - role="user" → supplier view — USER_OUTPUT_KEYS (quality_score withheld) - """ - - def __init__(self, role: str = "admin"): - keys = ALLOWED_OUTPUT_KEYS if role == "admin" else USER_OUTPUT_KEYS - super().__init__( - allowed_keys=keys, - leakage_detector=LeakageDetector(min_length=MIN_LEAKAGE_SUBSTRING_LENGTH), - ) - - def check_bounds(self, result: dict) -> dict: - """Clamp quality_score to [0, 1]. All other fields pass through.""" - if "quality_score" in result: - lo, hi = SCORE_BOUNDS["quality_score"] - result["quality_score"] = max(lo, min(hi, float(result["quality_score"]))) - return result - - -# --------------------------------------------------------------------------- -# Tool output validator -# --------------------------------------------------------------------------- - -def validate_tool_output(output: str) -> str: - """ - Programmatic guardrail for every agent data tool. - - Raises ValueError if the output looks like: - - A raw row dump (> MAX_RAW_ROW_LINES CSV-like lines) - - A high-cardinality list (> MAX_TOOL_OUTPUT_ITEMS enumerated items) - - An oversized blob (> MAX_TOOL_OUTPUT_CHARS characters) - - Returns the output unchanged if all checks pass. - """ - if len(output) > MAX_TOOL_OUTPUT_CHARS: - raise ValueError( - f"Tool output too large ({len(output):,} chars). " - f"Maximum allowed: {MAX_TOOL_OUTPUT_CHARS:,}. " - "Return aggregate statistics, not raw data." - ) - - lines = [line for line in output.splitlines() if line.strip()] - - # Raw row detection — a real stats summary rarely has many comma-heavy lines - csv_like = sum(1 for line in lines if line.count(",") >= 2) - if csv_like > MAX_RAW_ROW_LINES: - raise ValueError( - f"Tool output contains {csv_like} CSV-like lines " - f"(threshold: {MAX_RAW_ROW_LINES}). " - "Tools must return aggregate statistics, not raw rows." - ) - - # High-cardinality detection — count bullet/label lines - list_items = [ - line for line in lines - if line.lstrip().startswith(("-", "*", "•")) or ": " in line - ] - if len(list_items) > MAX_TOOL_OUTPUT_ITEMS: - raise ValueError( - f"Tool output enumerates {len(list_items)} items " - f"(threshold: {MAX_TOOL_OUTPUT_ITEMS}). " - "Return top-N values or aggregates only." - ) - - return output diff --git a/skills/confidential_data_procurement/ingest.py b/skills/confidential_data_procurement/ingest.py deleted file mode 100644 index 44ca674..0000000 --- a/skills/confidential_data_procurement/ingest.py +++ /dev/null @@ -1,252 +0,0 @@ -""" -Ingestion layer for confidential_data_procurement. - -Responsibilities: -- Parse uploaded CSV into a pandas DataFrame -- Parse metadata file (JSON working; PDF/DOCX stubbed) -- Parse buyer policy document (JSON working; PDF/DOCX stubbed) -- Store DataFrames in memory keyed by dataset_id -- Expose upload_handler — the SkillCard callable for POST /upload - -The DataFrame NEVER leaves this module as raw data. -Tools in tools.py query it only via aggregate operations. -Cleanup is called by run_skill after the pipeline completes. - -Format support matrix: - CSV: ✓ working - JSON: ✓ working (metadata + buyer policy documents) - DOCX: ✗ stubbed - PDF: ✗ stubbed - Excel: ✗ stubbed -""" -from __future__ import annotations - -import io -import json -import uuid -from typing import Any - -import pandas as pd - -from skills.confidential_data_procurement.config import ( - MAX_DATASET_ROWS, - MAX_DATASET_SIZE_MB, -) - -# --------------------------------------------------------------------------- -# In-memory dataset store -# dataset_id -> { -# "df": pd.DataFrame, -# "csv_bytes": bytes, # raw upload bytes — kept for post-deal download -# "metadata": dict, # seller-provided metadata -# "column_definitions": dict, # col_name -> human description -# "seller_claims": dict, # claim_key -> claim_value -# "instance_id": str, -# } -# --------------------------------------------------------------------------- -_datasets: dict[str, dict[str, Any]] = {} - -# release_token -> csv_bytes -# Populated by store_authorized_download() when a deal is authorized. -# Persists after cleanup() so the buyer can download post-settlement. -_authorized_downloads: dict[str, bytes] = {} - - -# --------------------------------------------------------------------------- -# Public accessors -# --------------------------------------------------------------------------- - -def get_dataset(dataset_id: str) -> dict[str, Any]: - """Return the stored dataset dict. Raises KeyError if not found.""" - if dataset_id not in _datasets: - raise KeyError(f"Dataset '{dataset_id}' not found. Upload may have expired.") - return _datasets[dataset_id] - - -def cleanup(dataset_id: str) -> None: - """Discard the DataFrame after the pipeline completes.""" - _datasets.pop(dataset_id, None) - - -def store_authorized_download(release_token: str, dataset_id: str) -> None: - """ - Move CSV bytes from the dataset store into the authorized downloads map. - Called when a deal reaches settlement_status='authorized'. - The bytes persist here after the DataFrame is cleaned up. - """ - dataset = _datasets.get(dataset_id) - if dataset and "csv_bytes" in dataset: - _authorized_downloads[release_token] = dataset["csv_bytes"] - - -def get_download_bytes(release_token: str) -> bytes: - """ - Return the CSV bytes for an authorized download token. - Raises KeyError if the token is not found. - """ - if release_token not in _authorized_downloads: - raise KeyError(f"Download token not found or not yet authorized.") - return _authorized_downloads[release_token] - - -# --------------------------------------------------------------------------- -# CSV parsing -# --------------------------------------------------------------------------- - -def parse_csv(file_bytes: bytes) -> pd.DataFrame: - """ - Parse CSV bytes into a DataFrame. - Enforces size and row limits before returning. - Raises ValueError with a human-readable message on any failure. - """ - size_mb = len(file_bytes) / (1024 * 1024) - if size_mb > MAX_DATASET_SIZE_MB: - raise ValueError( - f"Dataset exceeds size limit ({size_mb:.1f}MB > {MAX_DATASET_SIZE_MB}MB). " - "Please upload a smaller file." - ) - - try: - df = pd.read_csv(io.BytesIO(file_bytes)) - except Exception as e: - raise ValueError(f"Could not parse CSV: {e}") from e - - if len(df) > MAX_DATASET_ROWS: - raise ValueError( - f"Dataset exceeds row limit ({len(df):,} rows > {MAX_DATASET_ROWS:,}). " - "Please upload a sample." - ) - - if df.empty: - raise ValueError("Uploaded CSV is empty.") - - return df - - -# --------------------------------------------------------------------------- -# Metadata parsing -# --------------------------------------------------------------------------- - -def parse_metadata(file_bytes: bytes, file_type: str) -> dict[str, Any]: - """ - Parse the supplier's metadata file. - - JSON (working): expects keys such as: - column_definitions: {col_name: description} - seller_claims: {claim_key: claim_value} - source, date_range, license, etc. - - PDF / DOCX / other: stubbed — returns empty metadata with a note. - """ - file_type = (file_type or "").lower().strip(".") - - if file_type == "json": - try: - return json.loads(file_bytes.decode("utf-8")) - except Exception as e: - raise ValueError(f"Could not parse metadata JSON: {e}") from e - - # --- Stubs --- - _STUB_TYPES = {"pdf", "docx", "doc", "txt", "md"} - if file_type in _STUB_TYPES: - return { - "_stub": True, - "_stub_reason": ( - f"Metadata format '{file_type}' is not yet supported. " - "Please upload a JSON metadata file. " - "Proceeding with empty metadata." - ), - } - - return { - "_stub": True, - "_stub_reason": ( - f"Unknown metadata format '{file_type}'. " - "Proceeding with empty metadata." - ), - } - - -def parse_buyer_document(file_bytes: bytes, file_type: str) -> dict[str, Any]: - """ - Parse a buyer-uploaded policy document. - - JSON (working): expects BuyerPolicy-compatible fields. - PDF / DOCX: stubbed — buyer should describe requirements in the init chat. - """ - file_type = (file_type or "").lower().strip(".") - - if file_type == "json": - try: - return json.loads(file_bytes.decode("utf-8")) - except Exception as e: - raise ValueError(f"Could not parse policy JSON: {e}") from e - - _STUB_TYPES = {"pdf", "docx", "doc", "txt", "md"} - if file_type in _STUB_TYPES: - return { - "_stub": True, - "_stub_reason": ( - f"Policy document format '{file_type}' is not yet supported. " - "Please describe your requirements in the setup chat, " - "or upload a JSON policy file." - ), - } - - return { - "_stub": True, - "_stub_reason": ( - f"Unknown policy format '{file_type}'. " - "Please describe your requirements in the setup chat." - ), - } - - -# --------------------------------------------------------------------------- -# Upload handler (SkillCard.upload_handler) -# --------------------------------------------------------------------------- - -def procurement_upload_handler(form: Any, instance_id: str) -> dict[str, Any]: - """ - Skill-owned handler for POST /upload. - Called by routes.py with the parsed multipart form and instance_id. - - Expected form fields: - csv_file — the dataset CSV (required) - metadata_file — JSON metadata file (optional) - - Returns: - {"dataset_id": str} - """ - # --- Extract CSV --- - csv_upload = form.get("csv_file") - if csv_upload is None: - raise ValueError("csv_file is required") - - csv_bytes: bytes = csv_upload.file.read() if hasattr(csv_upload, "file") else bytes(csv_upload) - df = parse_csv(csv_bytes) - - # --- Extract metadata (optional) --- - metadata: dict[str, Any] = {} - metadata_upload = form.get("metadata_file") - if metadata_upload is not None: - meta_bytes = ( - metadata_upload.file.read() - if hasattr(metadata_upload, "file") - else bytes(metadata_upload) - ) - filename = getattr(metadata_upload, "filename", "") or "" - ext = filename.rsplit(".", 1)[-1] if "." in filename else "json" - metadata = parse_metadata(meta_bytes, ext) - - dataset_id = str(uuid.uuid4()) - _datasets[dataset_id] = { - "df": df, - "csv_bytes": csv_bytes, - "metadata": metadata, - "column_definitions": metadata.get("column_definitions", {}), - "seller_claims": metadata.get("seller_claims", {}), - "instance_id": instance_id, - } - - return {"dataset_id": dataset_id} diff --git a/skills/confidential_data_procurement/init.py b/skills/confidential_data_procurement/init.py deleted file mode 100644 index 62cda40..0000000 --- a/skills/confidential_data_procurement/init.py +++ /dev/null @@ -1,266 +0,0 @@ -""" -Buyer onboarding handler for confidential_data_procurement. - -The API calls procurement_init_handler(message, conversation) on each POST /init. -This module owns all procurement-specific onboarding logic: - - Greeting and guided data-collection conversation - - LLM extraction of BuyerPolicy fields from free-form buyer input - - BuyerPolicy construction and validation - -Handler interface (same contract as hackathon_novelty.init): - procurement_init_handler(message: str, conversation: list[dict]) -> dict - - Returns one of: - {"status": "configuring", "message": str, "conversation": list[dict]} - {"status": "ready", "message": str, "conversation": list[dict], - "config": BuyerPolicy, "threshold": 1} - -threshold is always 1 — procurement triggers instantly when a supplier submits. -""" -from __future__ import annotations - -import json -from typing import Optional - -from langchain_core.messages import AIMessage, HumanMessage, SystemMessage - -from config import get_llm -from skills.confidential_data_procurement.config import INIT_MODEL -from skills.confidential_data_procurement.models import BuyerPolicy - - -INIT_PROMPT_VERSION = "v1" - - -_GREETING_TEMPLATE = """\ -Welcome to the Confidential Data Procurement setup. - -I'll help you configure your dataset acquisition policy inside the TEE. \ -Suppliers will upload datasets and submit a reserve price — neither party \ -sees the other's private numbers. - -Please provide the following: - -**Required** -1. **Dataset description** — what kind of data you need and why -2. **Required columns** — list the column names you expect - Example: transaction_id, amount, is_fraud -3. **Minimum rows** — fewest acceptable rows (e.g. 10000) -4. **Max null rate** — e.g. 5% means at most 5% cells can be missing -5. **Max duplicate rate** — e.g. 10% means at most 10% duplicate rows -6. **Maximum budget** — the most you will pay for a perfect dataset ($) - -**Optional** -- **Base price** — minimum payment even for poor-quality data (default $0) -- **Label column** + **minimum label rate** — e.g. is_fraud column must have ≥ 2% positives -- **Forbidden columns** — PII fields to block (e.g. ssn, dob, passport_number) - -You can provide everything in one message or answer step by step.\ -""" - - -_SYSTEM_PROMPT = """\ -You are configuring a confidential dataset procurement instance for a buyer. \ -Your job is to collect the required policy fields from the buyer's messages. - -REQUIRED fields (must be present and valid before responding with JSON): - - required_columns: list of expected column name strings (non-empty) - - min_rows: positive integer - - max_null_rate: float in [0, 1] (e.g. 0.05 for 5%) - - max_duplicate_rate: float in [0, 1] - - max_budget: positive float (the ceiling payment for a perfect dataset) - -OPTIONAL fields (use defaults if not provided): - - base_price: float >= 0, default 0.0 (floor payment when quality score = 0) - - min_label_rate: float in [0, 1] or null - - label_column: string or null - - forbidden_columns: list of strings, default [] - - description: free-text description of the dataset need - -CRITICAL RULE: base_price must be strictly less than max_budget. \ -If the buyer provides both and base_price >= max_budget, ask them to fix it. - -Once you have all required fields and they are valid, respond with ONLY this \ -JSON — no extra text, no markdown fences: -{ - "ready": true, - "required_columns": [...], - "min_rows": N, - "max_null_rate": 0.XX, - "max_duplicate_rate": 0.XX, - "max_budget": NNN.0, - "base_price": NN.0, - "min_label_rate": null_or_float, - "label_column": null_or_string, - "forbidden_columns": [...], - "description": "..." -} - -Only ask follow-up questions if required fields are missing or invalid. \ -Convert percentages to decimals (e.g. "5%" → 0.05).\ -""" - - -def _parse_llm_response(text: str) -> Optional[dict]: - """Strip markdown fences, parse JSON, return dict if ready=true else None.""" - text = text.strip() - if text.startswith("```"): - lines = text.splitlines() - inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:] - text = "\n".join(inner).strip() - try: - obj = json.loads(text) - if isinstance(obj, dict) and obj.get("ready") is True: - return obj - except (json.JSONDecodeError, ValueError): - pass - return None - - -def procurement_init_handler(message: str, conversation: list[dict]) -> dict: - """ - Handle one turn of the buyer onboarding conversation. - - Called by the API on each POST /init. The accumulated conversation is passed - in; this handler appends the new messages and returns the updated state. - """ - # First turn: return fixed greeting immediately (no LLM call). - if not conversation: - conversation = [ - {"role": "system", "content": _SYSTEM_PROMPT}, - {"role": "ai", "content": _GREETING_TEMPLATE}, - ] - return { - "status": "configuring", - "message": _GREETING_TEMPLATE, - "conversation": conversation, - } - - conversation = conversation + [{"role": "human", "content": message}] - - # Build LangChain messages - lc_messages = [] - for msg in conversation: - if msg["role"] == "system": - lc_messages.append(SystemMessage(content=msg["content"])) - elif msg["role"] == "human": - lc_messages.append(HumanMessage(content=msg["content"])) - else: - lc_messages.append(AIMessage(content=msg["content"])) - - llm = get_llm(INIT_MODEL) - response = llm.invoke(lc_messages) - ai_text = response.content - - conversation = conversation + [{"role": "ai", "content": ai_text}] - - extracted = _parse_llm_response(ai_text) - if not extracted: - return { - "status": "configuring", - "message": ai_text, - "conversation": conversation, - } - - # Validate required fields - required_columns = extracted.get("required_columns") - if not required_columns or not isinstance(required_columns, list): - return { - "status": "configuring", - "message": "Required columns cannot be empty. Please list the column names you expect.", - "conversation": conversation, - } - - min_rows = extracted.get("min_rows") - try: - min_rows = int(min_rows) - if min_rows < 1: - raise ValueError - except (TypeError, ValueError): - return { - "status": "configuring", - "message": "Minimum rows must be a positive integer. Please provide a valid number.", - "conversation": conversation, - } - - max_budget = extracted.get("max_budget") - try: - max_budget = float(max_budget) - if max_budget <= 0: - raise ValueError - except (TypeError, ValueError): - return { - "status": "configuring", - "message": "Maximum budget must be a positive number. Please provide a valid dollar amount.", - "conversation": conversation, - } - - for rate_key in ("max_null_rate", "max_duplicate_rate"): - val = extracted.get(rate_key) - try: - val = float(val) - if not (0.0 <= val <= 1.0): - raise ValueError - except (TypeError, ValueError): - return { - "status": "configuring", - "message": ( - f"{rate_key.replace('_', ' ').title()} must be a decimal between 0 and 1 " - "(e.g. 0.05 for 5%). Please provide a valid value." - ), - "conversation": conversation, - } - - base_price = float(extracted.get("base_price") or 0.0) - if base_price >= max_budget: - return { - "status": "configuring", - "message": ( - f"Base price (${base_price:,.2f}) must be less than max budget (${max_budget:,.2f}). " - "Please adjust." - ), - "conversation": conversation, - } - - # Build and validate BuyerPolicy (Pydantic catches anything we missed) - try: - policy = BuyerPolicy( - required_columns=[str(c) for c in required_columns], - min_rows=min_rows, - max_null_rate=float(extracted["max_null_rate"]), - max_duplicate_rate=float(extracted["max_duplicate_rate"]), - min_label_rate=extracted.get("min_label_rate"), - label_column=extracted.get("label_column"), - forbidden_columns=[str(c) for c in (extracted.get("forbidden_columns") or [])], - max_budget=max_budget, - base_price=base_price, - description=str(extracted.get("description") or ""), - ) - except Exception as exc: - return { - "status": "configuring", - "message": f"Could not build policy: {exc}. Please review your inputs.", - "conversation": conversation, - } - - ready_message = ( - f"Policy saved.\n" - f"Required columns: {', '.join(policy.required_columns)}\n" - f"Minimum rows: {policy.min_rows:,}\n" - f"Max null rate: {policy.max_null_rate:.0%} | " - f"Max duplicate rate: {policy.max_duplicate_rate:.0%}\n" - f"Budget: ${policy.base_price:,.2f} – ${policy.max_budget:,.2f}\n" - + (f"Label column: {policy.label_column} (≥ {policy.min_label_rate:.1%})\n" - if policy.label_column and policy.min_label_rate is not None else "") - + (f"Forbidden columns: {', '.join(policy.forbidden_columns)}\n" - if policy.forbidden_columns else "") - + "\nShare the instance link with your supplier to begin." - ) - - return { - "status": "ready", - "message": ready_message, - "conversation": conversation, - "config": policy, - "threshold": 1, # procurement triggers instantly on first supplier submission - } diff --git a/skills/confidential_data_procurement/models.py b/skills/confidential_data_procurement/models.py deleted file mode 100644 index ccbf634..0000000 --- a/skills/confidential_data_procurement/models.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -Input and output Pydantic models for the confidential_data_procurement skill. - -Roles: -- BuyerPolicy: operator config — replaces OperatorConfig for this skill. - NEVER expose max_budget or base_price to the supplier. -- SupplierSubmission: participant input — references an uploaded dataset by ID. - NEVER expose reserve_price to the buyer. -- DatasetMetrics: intermediate deterministic output — not returned to API callers. -- ProcurementResult: final result per submission — key-filtered by role in routes.py. - revised_budget / revised_reserve are internal-only fields, - excluded from both ALLOWED_OUTPUT_KEYS and USER_OUTPUT_KEYS. -""" -from __future__ import annotations - -from typing import Any, Literal, Optional - -from pydantic import BaseModel, Field, model_validator - -from core.models import Submission - - -class BuyerPolicy(BaseModel): - """ - Operator config for the confidential_data_procurement skill. - Extracted by the init_handler from the buyer's onboarding conversation. - routes.py sets instance_id after init completes. - """ - required_columns: list[str] # semantic — agent does fuzzy matching - min_rows: int = Field(gt=0) - max_null_rate: float = Field(ge=0.0, le=1.0) # e.g. 0.03 = 3% - max_duplicate_rate: float = Field(ge=0.0, le=1.0) - min_label_rate: Optional[float] = Field(default=None, ge=0.0, le=1.0) - label_column: Optional[str] = None - forbidden_columns: list[str] = [] - max_budget: float = Field(gt=0.0) # NEVER exposed to supplier - base_price: float = Field(default=0.0, ge=0.0) # floor: P when S=0 - score_weights: dict[str, float] = {} # buyer overrides DEFAULT_SCORE_WEIGHTS - description: str = "" # natural language description of dataset need - instance_id: str = "default" # set by routes.py after init - - @model_validator(mode="after") - def validate_weights(self) -> "BuyerPolicy": - if self.score_weights: - total = sum(self.score_weights.values()) - if abs(total - 1.0) > 0.01: - raise ValueError( - f"score_weights must sum to 1.0 (got {total:.3f}). " - "Adjust weights or omit to use defaults." - ) - if self.base_price >= self.max_budget: - raise ValueError("base_price must be less than max_budget") - return self - - -class SupplierSubmission(Submission): - """ - Participant input for the confidential_data_procurement skill. - Supplier uploads their dataset via POST /upload first, then submits here. - """ - dataset_id: str # references uploaded DataFrame in ingest store - dataset_name: str - reserve_price: float = Field(ge=0.0) # NEVER exposed to buyer - - -class DatasetMetrics(BaseModel): - """ - Deterministic quality metrics computed from the raw DataFrame. - Intermediate result — never returned directly to API callers. - """ - row_count: int - column_names: list[str] - null_rate_by_column: dict[str, float] - overall_null_rate: float - duplicate_rate: float - label_rate: Optional[float] = None # None if label_column not specified - forbidden_columns_present: list[str] = [] - hard_constraints_pass: bool # all binary must-pass checks - critical_failure: bool = False # triggers early exit before agent - critical_reason: Optional[str] = None # human-readable reason for critical failure - - -class ProcurementResult(BaseModel): - """ - Final result per submission after guardrails. - Role-filtered in routes.py: buyer sees ALLOWED_OUTPUT_KEYS, supplier sees USER_OUTPUT_KEYS. - - Field notes: - - deal: enclave's mathematical verdict (R ≤ P ≤ B and hard constraints pass) - - quality_score: buyer-only — supplier could reverse-engineer max_budget via P/S - - hard_constraints_pass: buyer-only — same reasoning - - settlement_status: lifecycle state — independent of deal bool - "rejected" | "pending_approval" | "awaiting_counterparty" | - "renegotiating" | "authorized" - - revised_budget/reserve: INTERNAL — never in any output key set - """ - submission_id: str - deal: bool = False - quality_score: float = Field(default=0.0, ge=0.0, le=1.0) # buyer-only - proposed_payment: float = 0.0 - hard_constraints_pass: bool = False # buyer-only - settlement_status: Literal[ - "rejected", - "pending_approval", - "awaiting_counterparty", - "renegotiating", - "authorized", - ] = "rejected" - component_scores: dict[str, float] = {} # buyer-only: per-dimension scores - release_token: Optional[str] = None - notes: list[str] = [] # failure/partial notes — same for both roles - explanation: Optional[str] = None # bounded LLM summary - claim_verification: Optional[dict[str, Any]] = None # from agent layer - schema_matching: Optional[dict[str, Any]] = None # from agent layer - buyer_response: Optional[Literal["accept", "reject", "renegotiate"]] = None - supplier_response: Optional[Literal["accept", "reject", "renegotiate"]] = None - renegotiation_used: bool = False - - # INTERNAL ONLY — excluded from ALLOWED_OUTPUT_KEYS and USER_OUTPUT_KEYS - revised_budget: Optional[float] = None - revised_reserve: Optional[float] = None diff --git a/skills/confidential_data_procurement/tools.py b/skills/confidential_data_procurement/tools.py deleted file mode 100644 index 0966a5e..0000000 --- a/skills/confidential_data_procurement/tools.py +++ /dev/null @@ -1,140 +0,0 @@ -""" -Aggregate-only data tools for the confidential_data_procurement evaluate node. - -Security: - - Tools NEVER return raw rows or individual cell values. - - All output passes through validate_tool_output() before leaving the tool. - - The LLM sees aggregate statistics only — it cannot reconstruct the dataset. - -Tools: - 1. get_schema_summary() — column names, dtypes, null rates, row count - 2. get_column_stats(column_name) — numeric: min/max/mean/std; categorical: top-5 counts - 3. get_value_distribution(col, n) — top-N value counts + distinct count - -What to edit here: - - Add a new tool: define @tool function, add to EVALUATE_TOOLS. - - Change cardinality / size limits: update constants in guardrails.py. -""" -from __future__ import annotations - -from langchain_core.tools import tool - -from skills.confidential_data_procurement.guardrails import validate_tool_output - -# --------------------------------------------------------------------------- -# Module-level context — set by set_context() in __init__.py before agent runs -# --------------------------------------------------------------------------- - -_dataset_id: str = "" -_policy_context: dict = {} # required_columns, column_definitions, seller_claims - - -def set_context(dataset_id: str, policy_context: dict) -> None: - """Bind the active dataset and policy context for this evaluation run. - Called by run_skill() before run_agent(). - """ - global _dataset_id, _policy_context - _dataset_id = dataset_id - _policy_context = policy_context - - -def _get_df(): - from skills.confidential_data_procurement.ingest import get_dataset - return get_dataset(_dataset_id)["df"] - - -# --------------------------------------------------------------------------- -# Tools -# --------------------------------------------------------------------------- - -@tool -def get_schema_summary() -> str: - """ - Get a summary of the dataset schema. - - Returns: column names, data types, null rate per column, row count, column count. - Call this first to understand what columns are present and their data quality. - """ - df = _get_df() - lines = [f"rows: {len(df)}", f"columns ({len(df.columns)}):"] - for col in df.columns: - dtype = str(df[col].dtype) - null_rate = float(df[col].isna().mean()) - lines.append(f" {col}: dtype={dtype}, null_rate={null_rate:.1%}") - return validate_tool_output("\n".join(lines)) - - -@tool -def get_column_stats(column_name: str) -> str: - """ - Get aggregate statistics for a single column. - - Numeric columns: min, max, mean, median, std, non-null count. - Categorical columns: total distinct values, top-5 most frequent values with counts. - Returns an error if the column does not exist. - """ - df = _get_df() - if column_name not in df.columns: - available = ", ".join(list(df.columns)[:10]) - return f"Column '{column_name}' not found. Available columns: {available}" - - col = df[column_name].dropna() - if len(col) == 0: - return validate_tool_output(f"column: {column_name}\nAll values are null.") - - if col.dtype.kind in ("i", "f", "u"): - output = ( - f"column: {column_name} (numeric)\n" - f"count: {len(col)}\n" - f"min: {col.min():.4g}\n" - f"max: {col.max():.4g}\n" - f"mean: {col.mean():.4g}\n" - f"median: {col.median():.4g}\n" - f"std: {col.std():.4g}" - ) - else: - top = col.value_counts().head(5) - top_lines = "\n".join(f" {v}: {c}" for v, c in top.items()) - output = ( - f"column: {column_name} (categorical)\n" - f"count: {len(col)}\n" - f"distinct: {col.nunique()}\n" - f"top-5:\n{top_lines}" - ) - - return validate_tool_output(output) - - -@tool -def get_value_distribution(column_name: str, top_n: int = 10) -> str: - """ - Get the top-N most frequent values for a column with their counts and percentages. - - top_n is capped at 20. Use this to assess label distribution (e.g. fraud rate), - category balance, or unusual value concentration. Returns total distinct count too. - """ - df = _get_df() - if column_name not in df.columns: - return f"Column '{column_name}' not found." - - top_n = min(max(top_n, 1), 20) - col = df[column_name].dropna() - total = len(col) - total_distinct = col.nunique() - top = col.value_counts().head(top_n) - - lines = [ - f"column: {column_name}", - f"total non-null: {total}", - f"total distinct: {total_distinct}", - f"top-{top_n}:", - ] - for val, count in top.items(): - pct = count / total * 100 if total > 0 else 0 - lines.append(f" {val}: {count} ({pct:.1f}%)") - - return validate_tool_output("\n".join(lines)) - - -# Tool group — bound to evaluate_node in agent.py -EVALUATE_TOOLS = [get_schema_summary, get_column_stats, get_value_distribution] diff --git a/skills/dataset_audit/__init__.py b/skills/dataset_audit/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/skills/hackathon_novelty/__init__.py b/skills/hackathon_novelty/__init__.py index afdf348..00e084d 100644 --- a/skills/hackathon_novelty/__init__.py +++ b/skills/hackathon_novelty/__init__.py @@ -25,7 +25,6 @@ from skills.hackathon_novelty.agent import run_agent from skills.hackathon_novelty.guardrails import HackathonNoveltyFilter from skills.hackathon_novelty.config import ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS, MIN_SUBMISSIONS, SIMILARITY_DUPLICATE_THRESHOLD -from skills.hackathon_novelty.init import hackathon_init_handler def run_skill(inputs: list[HackathonSubmission], params: OperatorConfig) -> SkillResponse: @@ -178,7 +177,6 @@ def run_skill(inputs: list[HackathonSubmission], params: OperatorConfig) -> Skil "and an alignment flag (whether your idea fits the hackathon theme). " "They never see other teams' submissions or scores." ), - init_handler=hackathon_init_handler, user_display={ "novelty_score": {"type": "gauge", "label": "Novelty", "min": 0, "max": 1}, "aligned": {"type": "badge", "label": "Aligned"}, diff --git a/skills/hackathon_novelty/init.py b/skills/hackathon_novelty/init.py deleted file mode 100644 index 631bed0..0000000 --- a/skills/hackathon_novelty/init.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Operator onboarding handler for hackathon_novelty. - -The API calls init_handler(message, conversation) on each /init request. -This module owns all hackathon-specific onboarding logic: -- System prompt construction (criteria weights, guidelines, threshold) -- LLM conversation management -- JSON extraction and OperatorConfig construction - -To adapt for a different skill: implement a new handler with the same -interface. The API doesn't care what happens inside. - -Handler interface: - init_handler(message: str, conversation: list[dict]) -> dict - Returns: - {"status": "configuring", "message": str, "conversation": list[dict]} - {"status": "ready", "message": str, "conversation": list[dict], - "config": OperatorConfig, "threshold": int} -""" -from __future__ import annotations -import json -from typing import Optional - -from langchain_core.messages import SystemMessage, HumanMessage, AIMessage - -from config import get_llm -from core.models import OperatorConfig -from skills.hackathon_novelty.config import MIN_SUBMISSIONS, INIT_MODEL - - -# Bump when changing _SYSTEM_PROMPT or _GREETING_TEMPLATE. -INIT_PROMPT_VERSION = "v3" - - -_GREETING_TEMPLATE = ( - "Welcome to hackathon evaluation setup.\n\n" - "Please provide the following:\n\n" - "1. **Evaluation criteria** with weights summing to 1.0\n" - ' Example: {"originality": 0.4, "feasibility": 0.3, "impact": 0.3}\n\n' - "2. **(Optional) Guidelines** — judging instructions\n" - ' Example: "Focus on AI/ML innovations"\n\n' - f"3. **(Optional) Threshold** — minimum submissions before auto-evaluation (default: {MIN_SUBMISSIONS})\n\n" - "You can provide everything in one message." -) - - -_SYSTEM_PROMPT = ( - "You are setting up a hackathon novelty evaluation instance. " - "Your job is to collect the required configuration from the admin.\n\n" - "REQUIRED:\n" - "- criteria: a dict of criterion names to weights that sum to exactly 1.0\n" - " Example: {\"originality\": 0.4, \"feasibility\": 0.3, \"impact\": 0.3}\n\n" - "OPTIONAL:\n" - "- guidelines: free-text judging instructions (e.g. 'Focus on AI/ML projects')\n" - f"- threshold: minimum submissions before auto-evaluation runs (default: {MIN_SUBMISSIONS})\n\n" - "IMPORTANT: As soon as you have the required criteria (with weights summing to 1.0), " - "respond with ONLY the JSON below — no confirmation, no commentary, no extra text:\n" - '{"ready": true, "criteria": {"name": weight, ...}, "guidelines": "...", "threshold": N}\n\n' - "Only ask follow-up questions if the criteria are missing or weights do not sum to 1.0." -) - - -def _parse_llm_response(text: str) -> Optional[dict]: - """Strip markdown fences, parse JSON, return dict if ready=true else None.""" - text = text.strip() - if text.startswith("```"): - lines = text.splitlines() - inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:] - text = "\n".join(inner).strip() - try: - obj = json.loads(text) - if isinstance(obj, dict) and obj.get("ready") is True: - return obj - except (json.JSONDecodeError, ValueError): - pass - return None - - -def hackathon_init_handler(message: str, conversation: list[dict]) -> dict: - """ - Handle one turn of the admin onboarding conversation. - - Called by the API on each POST /init. The API passes the accumulated - conversation; this handler appends the new messages and returns the result. - """ - # First turn: return fixed greeting immediately (no LLM call). - # Seed the conversation so DeepSeek sees the greeting as its own message on turn 2+. - if not conversation: - conversation = [ - {"role": "system", "content": _SYSTEM_PROMPT}, - {"role": "ai", "content": _GREETING_TEMPLATE}, - ] - return { - "status": "configuring", - "message": _GREETING_TEMPLATE, - "conversation": conversation, - } - - conversation = conversation + [{"role": "human", "content": message}] - - # Build LangChain messages - lc_messages = [] - for msg in conversation: - if msg["role"] == "system": - lc_messages.append(SystemMessage(content=msg["content"])) - elif msg["role"] == "human": - lc_messages.append(HumanMessage(content=msg["content"])) - else: - lc_messages.append(AIMessage(content=msg["content"])) - - llm = get_llm(INIT_MODEL) - response = llm.invoke(lc_messages) - ai_text = response.content - - conversation = conversation + [{"role": "ai", "content": ai_text}] - - extracted = _parse_llm_response(ai_text) - if extracted: - criteria = extracted.get("criteria", {}) - guidelines = extracted.get("guidelines", "") - - if not criteria: - return { - "status": "configuring", - "message": "Criteria cannot be empty. Please provide at least one criterion with a weight.", - "conversation": conversation, - } - - weight_sum = sum(criteria.values()) - if abs(weight_sum - 1.0) > 0.01: - return { - "status": "configuring", - "message": f"Criteria weights must sum to 1.0 (got {weight_sum:.2f}). Please adjust.", - "conversation": conversation, - } - - try: - threshold = int(extracted.get("threshold", MIN_SUBMISSIONS)) - if threshold < 1: - raise ValueError("non-positive") - except (ValueError, TypeError): - return { - "status": "configuring", - "message": "Threshold must be a positive integer. Please provide a valid number.", - "conversation": conversation, - } - - config = OperatorConfig(criteria=criteria, guidelines=guidelines, min_submissions=threshold) - ready_message = ( - f"Configuration saved.\n" - f"Criteria: {json.dumps(criteria)}\n" - f"Guidelines: {guidelines or '(none)'}\n" - f"Threshold: {threshold} submissions" - ) - return { - "status": "ready", - "message": ready_message, - "conversation": conversation, - "config": config, - "threshold": threshold, - } - - return { - "status": "configuring", - "message": ai_text, - "conversation": conversation, - } diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py deleted file mode 100644 index 8ec7762..0000000 --- a/tests/test_data_procurement.py +++ /dev/null @@ -1,981 +0,0 @@ -""" -Unit tests for confidential_data_procurement. -Tests cover: metrics computation, critical checks, component scores, -quality score formula, price formula, deal condition, run_deterministic, -guardrails (filter + validator), init handler, run_skill, and skill_card. -""" -from __future__ import annotations - -import io -import uuid - -import pandas as pd -import pytest - -from skills.confidential_data_procurement.config import ( - CRITICAL_DUPLICATE_THRESHOLD, - DEFAULT_SCORE_WEIGHTS, -) -from skills.confidential_data_procurement.deterministic import ( - check_critical, - check_deal, - compute_component_scores, - compute_metrics, - compute_price, - compute_quality_score, - run_deterministic, -) -from skills.confidential_data_procurement.ingest import _datasets, cleanup, procurement_upload_handler -from skills.confidential_data_procurement.models import BuyerPolicy, SupplierSubmission - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _make_policy(**overrides) -> BuyerPolicy: - defaults = dict( - required_columns=["transaction_id", "amount", "is_fraud"], - min_rows=100, - max_null_rate=0.05, - max_duplicate_rate=0.10, - min_label_rate=0.02, - label_column="is_fraud", - forbidden_columns=["ssn", "dob"], - max_budget=5000.0, - base_price=500.0, - ) - defaults.update(overrides) - return BuyerPolicy(**defaults) - - -def _make_df(rows=150, null_amount_rate=0.0, dup_rate=0.0, cols=None) -> pd.DataFrame: - """Generate a clean fraud-like DataFrame.""" - import numpy as np - np.random.seed(0) - base_n = int(rows * (1 - dup_rate)) - df = pd.DataFrame({ - "transaction_id": [f"txn_{i:04d}" for i in range(base_n)], - "amount": [round(float(i * 10.5), 2) for i in range(base_n)], - "is_fraud": [1 if i % 25 == 0 else 0 for i in range(base_n)], - }) - if cols: - df = df[[c for c in cols if c in df.columns]] - if null_amount_rate > 0: - n_nulls = int(base_n * null_amount_rate) - df.loc[:n_nulls, "amount"] = None - if dup_rate > 0: - extra = int(rows * dup_rate) - df = pd.concat([df, df.iloc[:extra]], ignore_index=True) - return df - - -def _register_df(df: pd.DataFrame, metadata: dict | None = None) -> str: - """Store a DataFrame in the ingest store and return its dataset_id.""" - dataset_id = str(uuid.uuid4()) - _datasets[dataset_id] = { - "df": df, - "metadata": metadata or {}, - "column_definitions": {}, - "seller_claims": {}, - "instance_id": "test_instance", - } - return dataset_id - - -# --------------------------------------------------------------------------- -# compute_metrics -# --------------------------------------------------------------------------- - -class TestComputeMetrics: - def test_basic_counts(self): - df = _make_df(rows=150) - policy = _make_policy() - m = compute_metrics(df, policy) - assert m.row_count == 150 - assert "transaction_id" in m.column_names - assert m.critical_failure is False - assert m.hard_constraints_pass is True - - def test_null_rates(self): - df = _make_df(rows=100, null_amount_rate=0.20) - policy = _make_policy() - m = compute_metrics(df, policy) - assert m.null_rate_by_column["amount"] > 0.15 - assert m.overall_null_rate > 0.0 - - def test_duplicate_rate(self): - df = _make_df(rows=100, dup_rate=0.30) - policy = _make_policy() - m = compute_metrics(df, policy) - assert m.duplicate_rate > 0.20 - - def test_label_rate(self): - df = _make_df(rows=100) - policy = _make_policy(label_column="is_fraud") - m = compute_metrics(df, policy) - assert m.label_rate is not None - assert 0 < m.label_rate < 0.1 # ~4% fraud - - def test_no_label_column(self): - df = _make_df(rows=100) - policy = _make_policy(label_column=None, min_label_rate=None) - m = compute_metrics(df, policy) - assert m.label_rate is None - - def test_forbidden_column_detected(self): - df = _make_df(rows=50) - df["ssn"] = "xxx-xx-0000" - policy = _make_policy() - m = compute_metrics(df, policy) - assert "ssn" in m.forbidden_columns_present - assert m.hard_constraints_pass is False - - def test_empty_dataframe(self): - df = pd.DataFrame(columns=["transaction_id", "amount", "is_fraud"]) - policy = _make_policy() - m = compute_metrics(df, policy) - assert m.row_count == 0 - assert m.critical_failure is True - assert m.hard_constraints_pass is False - - -# --------------------------------------------------------------------------- -# check_critical -# --------------------------------------------------------------------------- - -class TestCheckCritical: - def test_clean_df_not_critical(self): - df = _make_df(rows=100) - policy = _make_policy() - m = compute_metrics(df, policy) - is_crit, reason = check_critical(m) - assert is_crit is False - assert reason is None - - def test_forbidden_col_is_critical(self): - df = _make_df(rows=50) - df["ssn"] = "xxx" - policy = _make_policy() - m = compute_metrics(df, policy) - is_crit, reason = check_critical(m) - assert is_crit is True - assert "ssn" in reason.lower() - - def test_high_duplicate_rate_is_critical(self): - df = _make_df(rows=100, dup_rate=CRITICAL_DUPLICATE_THRESHOLD + 0.05) - policy = _make_policy() - m = compute_metrics(df, policy) - is_crit, reason = check_critical(m) - assert is_crit is True - assert "duplicate" in reason.lower() - - def test_empty_df_is_critical(self): - df = pd.DataFrame(columns=["a", "b"]) - policy = _make_policy() - m = compute_metrics(df, policy) - is_crit, reason = check_critical(m) - assert is_crit is True - - -# --------------------------------------------------------------------------- -# compute_component_scores -# --------------------------------------------------------------------------- - -class TestComponentScores: - def test_perfect_dataset_scores(self): - df = _make_df(rows=200) - policy = _make_policy(min_rows=100) - m = compute_metrics(df, policy) - scores = compute_component_scores(m, policy) - assert scores["coverage"] == 1.0 - assert scores["risk"] == 1.0 - assert scores["null"] == 1.0 # no nulls - assert scores["duplicate"] == 1.0 # no dups - - def test_coverage_below_min(self): - df = _make_df(rows=50) - policy = _make_policy(min_rows=200) - m = compute_metrics(df, policy) - scores = compute_component_scores(m, policy) - assert scores["coverage"] == pytest.approx(0.25, abs=0.01) - - def test_null_score_penalised(self): - df = _make_df(rows=100, null_amount_rate=0.20) - policy = _make_policy(max_null_rate=0.05) - m = compute_metrics(df, policy) - scores = compute_component_scores(m, policy) - assert scores["null"] < 0.5 - - def test_risk_score_zero_on_forbidden(self): - df = _make_df(rows=50) - df["ssn"] = "xxx" - policy = _make_policy() - m = compute_metrics(df, policy) - scores = compute_component_scores(m, policy) - assert scores["risk"] == 0.0 - - def test_schema_score_placeholder(self): - df = _make_df(rows=100) - policy = _make_policy() - m = compute_metrics(df, policy) - scores = compute_component_scores(m, policy) - assert scores["schema"] == 0.5 # placeholder until agent - - def test_claim_veracity_placeholder(self): - df = _make_df(rows=100) - policy = _make_policy() - m = compute_metrics(df, policy) - scores = compute_component_scores(m, policy) - assert scores["claim_veracity"] == 1.0 # placeholder until agent - - -# --------------------------------------------------------------------------- -# compute_quality_score -# --------------------------------------------------------------------------- - -class TestQualityScore: - def test_default_weights_sum_to_one(self): - assert abs(sum(DEFAULT_SCORE_WEIGHTS.values()) - 1.0) < 0.001 - - def test_perfect_scores_give_one(self): - perfect = {k: 1.0 for k in DEFAULT_SCORE_WEIGHTS} - assert compute_quality_score(perfect, _make_policy()) == pytest.approx(1.0) - - def test_zero_scores_give_zero(self): - zeros = {k: 0.0 for k in DEFAULT_SCORE_WEIGHTS} - assert compute_quality_score(zeros, _make_policy()) == pytest.approx(0.0) - - def test_clamped_to_zero_one(self): - over = {k: 2.0 for k in DEFAULT_SCORE_WEIGHTS} - assert compute_quality_score(over, _make_policy()) == 1.0 - - def test_custom_weights_respected(self): - policy = _make_policy(score_weights={k: 1/7 for k in DEFAULT_SCORE_WEIGHTS}) - scores = {k: 1.0 for k in DEFAULT_SCORE_WEIGHTS} - assert compute_quality_score(scores, policy) == pytest.approx(1.0, abs=0.01) - - -# --------------------------------------------------------------------------- -# compute_price -# --------------------------------------------------------------------------- - -class TestComputePrice: - def test_s_zero_gives_base_price(self): - assert compute_price(0.0, 500.0, 5000.0) == 500.0 - - def test_s_one_gives_max_budget(self): - assert compute_price(1.0, 500.0, 5000.0) == 5000.0 - - def test_midpoint(self): - assert compute_price(0.5, 0.0, 1000.0) == 500.0 - - def test_rounded_to_two_decimals(self): - result = compute_price(0.333, 0.0, 1000.0) - assert result == round(result, 2) - - def test_formula_correct(self): - S, base, budget = 0.87, 500.0, 5000.0 - expected = round(500.0 + (5000.0 - 500.0) * 0.87, 2) - assert compute_price(S, base, budget) == expected - - -# --------------------------------------------------------------------------- -# check_deal -# --------------------------------------------------------------------------- - -class TestCheckDeal: - def test_deal_passes(self): - assert check_deal(True, 3000.0, 4000.0, 5000.0) is True - - def test_reserve_above_payment(self): - assert check_deal(True, 4500.0, 4000.0, 5000.0) is False - - def test_payment_above_budget(self): - # P > B can't normally happen (P = B * S ≤ B), but guard anyway - assert check_deal(True, 100.0, 6000.0, 5000.0) is False - - def test_hard_constraints_fail(self): - assert check_deal(False, 3000.0, 4000.0, 5000.0) is False - - def test_exact_reserve_equals_payment(self): - assert check_deal(True, 4000.0, 4000.0, 5000.0) is True # R == P: ok - - -# --------------------------------------------------------------------------- -# run_deterministic (integration) -# --------------------------------------------------------------------------- - -class TestRunDeterministic: - def test_good_dataset_deal_passes(self): - df = _make_df(rows=200) - policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) - dataset_id = _register_df(df) - try: - result = run_deterministic(dataset_id, policy, reserve_price=1000.0) - assert result["deal"] is True - assert result["quality_score"] > 0.5 - assert result["proposed_payment"] >= 500.0 - assert result["proposed_payment"] <= 5000.0 - assert not result["metrics"].critical_failure - finally: - cleanup(dataset_id) - - def test_critical_failure_early_exit(self): - df = _make_df(rows=50) - df["ssn"] = "xxx" - policy = _make_policy() - dataset_id = _register_df(df) - try: - result = run_deterministic(dataset_id, policy, reserve_price=100.0) - assert result["metrics"].critical_failure is True - assert result["deal"] is False - assert result["quality_score"] == 0.0 - assert result["proposed_payment"] == policy.base_price - assert len(result["notes"]) > 0 - finally: - cleanup(dataset_id) - - def test_high_null_reduces_price(self): - df_clean = _make_df(rows=150) - df_nulls = _make_df(rows=150, null_amount_rate=0.30) - policy = _make_policy(max_null_rate=0.05, max_budget=5000.0, base_price=0.0) - - id_clean = _register_df(df_clean) - id_nulls = _register_df(df_nulls) - try: - clean_result = run_deterministic(id_clean, policy, reserve_price=0.0) - nulls_result = run_deterministic(id_nulls, policy, reserve_price=0.0) - assert nulls_result["proposed_payment"] < clean_result["proposed_payment"] - finally: - cleanup(id_clean) - cleanup(id_nulls) - - def test_reserve_above_payment_no_deal(self): - df = _make_df(rows=150) - policy = _make_policy(min_rows=100, max_budget=1000.0, base_price=0.0) - dataset_id = _register_df(df) - try: - result = run_deterministic(dataset_id, policy, reserve_price=9999.0) - assert result["deal"] is False - assert any("reserve" in n.lower() for n in result["notes"]) - finally: - cleanup(dataset_id) - - def test_notes_populated_on_partial_failure(self): - df = _make_df(rows=50) # below min_rows=100 - policy = _make_policy(min_rows=100) - dataset_id = _register_df(df) - try: - result = run_deterministic(dataset_id, policy, reserve_price=0.0) - assert any("row count" in n.lower() for n in result["notes"]) - finally: - cleanup(dataset_id) - - def test_dataset_not_found_raises(self): - policy = _make_policy() - with pytest.raises(KeyError): - run_deterministic("nonexistent_id", policy, reserve_price=100.0) - - -# --------------------------------------------------------------------------- -# ProcurementFilter -# --------------------------------------------------------------------------- - -from skills.confidential_data_procurement.guardrails import ( - ProcurementFilter, - validate_tool_output, -) - - -class TestProcurementFilter: - def _result(self) -> dict: - return { - "submission_id": "sub-1", - "deal": True, - "quality_score": 0.85, - "proposed_payment": 3000.0, - "hard_constraints_pass": True, - "settlement_status": "authorized", - "release_token": "tok-abc", - "notes": [], - "explanation": "Looks good.", - "claim_verification": None, - "schema_matching": None, - "buyer_response": "accept", - "supplier_response": "accept", - "renegotiation_used": False, - } - - def test_buyer_sees_quality_score(self): - f = ProcurementFilter(role="admin") - out = f.filter_keys(self._result()) - assert "quality_score" in out - - def test_supplier_hides_quality_score(self): - f = ProcurementFilter(role="user") - out = f.filter_keys(self._result()) - assert "quality_score" not in out - assert "hard_constraints_pass" not in out - - def test_supplier_still_sees_payment(self): - f = ProcurementFilter(role="user") - out = f.filter_keys(self._result()) - assert "proposed_payment" in out - assert "deal" in out - - def test_check_bounds_clamps_high(self): - f = ProcurementFilter(role="admin") - r = {"quality_score": 1.5} - assert f.check_bounds(r)["quality_score"] == 1.0 - - def test_check_bounds_clamps_low(self): - f = ProcurementFilter(role="admin") - r = {"quality_score": -0.3} - assert f.check_bounds(r)["quality_score"] == 0.0 - - def test_check_bounds_passes_valid(self): - f = ProcurementFilter(role="admin") - r = {"quality_score": 0.72} - assert f.check_bounds(r)["quality_score"] == pytest.approx(0.72) - - def test_unknown_keys_stripped(self): - f = ProcurementFilter(role="admin") - r = self._result() - r["_internal_secret"] = "max_budget=9000" - out = f.filter_keys(r) - assert "_internal_secret" not in out - - def test_leakage_flagged_in_apply(self): - f = ProcurementFilter(role="admin") - result = self._result() - # Inject a long substring into explanation that also appears in raw_inputs - leaked = "SENSITIVE_CELL_VALUE_XYZ_1234567890" - result["explanation"] = f"The data shows {leaked} is common." - filtered = f.apply([result], [leaked]) - assert "_leakage_warning" in filtered[0] - - -# --------------------------------------------------------------------------- -# validate_tool_output -# --------------------------------------------------------------------------- - -class TestValidateToolOutput: - def test_clean_stats_pass(self): - output = "count: 150\nmean: 4.2\nstd: 1.1\nmin: 0.0\nmax: 10.0" - assert validate_tool_output(output) == output - - def test_oversized_raises(self): - big = "x" * 5000 - with pytest.raises(ValueError, match="too large"): - validate_tool_output(big) - - def test_raw_rows_raises(self): - # 6 CSV-like lines — over the threshold of 5 - rows = "\n".join(f"txn_{i},100.{i},0" for i in range(6)) - with pytest.raises(ValueError, match="CSV-like"): - validate_tool_output(rows) - - def test_exactly_at_raw_row_limit_passes(self): - # exactly MAX_RAW_ROW_LINES (5) CSV-like lines — should pass - rows = "\n".join(f"txn_{i},100.{i},0" for i in range(5)) - assert validate_tool_output(rows) == rows - - def test_high_cardinality_raises(self): - # 51 bullet items — over the threshold of 50 - items = "\n".join(f"- value_{i}: {i}" for i in range(51)) - with pytest.raises(ValueError, match="enumerates"): - validate_tool_output(items) - - def test_exactly_at_cardinality_limit_passes(self): - items = "\n".join(f"- value_{i}: {i}" for i in range(50)) - assert validate_tool_output(items) == items - - def test_empty_string_passes(self): - assert validate_tool_output("") == "" - - -# --------------------------------------------------------------------------- -# procurement_init_handler -# --------------------------------------------------------------------------- - -from unittest.mock import patch - -from skills.confidential_data_procurement.init import ( - _parse_llm_response, - procurement_init_handler, -) - - -class _FakeLLM: - """Minimal LLM stub — returns a fixed content string.""" - def __init__(self, content: str): - self._content = content - - def invoke(self, _messages): - class _R: - pass - r = _R() - r.content = self._content - return r - - -_SEEDED_CONV = [ - {"role": "system", "content": "sys"}, - {"role": "ai", "content": "greeting"}, -] - -_VALID_JSON = ( - '{"ready": true, "required_columns": ["txn_id", "amount", "label"], ' - '"min_rows": 500, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, ' - '"max_budget": 4000.0, "base_price": 200.0, ' - '"min_label_rate": 0.02, "label_column": "label", ' - '"forbidden_columns": ["ssn"], "description": "fraud dataset"}' -) - - -class TestParseLlmResponse: - def test_valid_ready_json(self): - result = _parse_llm_response(_VALID_JSON) - assert result is not None - assert result["ready"] is True - assert result["required_columns"] == ["txn_id", "amount", "label"] - - def test_markdown_fences_stripped(self): - wrapped = f"```json\n{_VALID_JSON}\n```" - assert _parse_llm_response(wrapped) is not None - - def test_non_json_returns_none(self): - assert _parse_llm_response("Sure, what columns do you need?") is None - - def test_ready_false_returns_none(self): - assert _parse_llm_response('{"ready": false, "message": "tell me more"}') is None - - def test_missing_ready_returns_none(self): - assert _parse_llm_response('{"required_columns": ["a"]}') is None - - -class TestProcurementInitHandler: - def test_first_turn_returns_greeting(self): - result = procurement_init_handler("", []) - assert result["status"] == "configuring" - assert "required columns" in result["message"].lower() - assert result["conversation"][0]["role"] == "system" - - def test_first_turn_no_llm_call(self): - # No patch needed — should not call get_llm at all on turn 1 - result = procurement_init_handler("anything", []) - assert result["status"] == "configuring" - - def test_valid_json_returns_ready(self): - with patch("skills.confidential_data_procurement.init.get_llm", - return_value=_FakeLLM(_VALID_JSON)): - result = procurement_init_handler("here is my policy", _SEEDED_CONV) - assert result["status"] == "ready" - assert result["threshold"] == 1 - policy = result["config"] - assert policy.min_rows == 500 - assert policy.max_budget == 4000.0 - assert policy.base_price == 200.0 - assert "ssn" in policy.forbidden_columns - - def test_empty_columns_stays_configuring(self): - bad = _VALID_JSON.replace('"txn_id", "amount", "label"', "") - bad = bad.replace('"required_columns": [],', '"required_columns": [],') - payload = '{"ready": true, "required_columns": [], "min_rows": 500, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, "max_budget": 4000.0}' - with patch("skills.confidential_data_procurement.init.get_llm", - return_value=_FakeLLM(payload)): - result = procurement_init_handler("no columns", _SEEDED_CONV) - assert result["status"] == "configuring" - assert "column" in result["message"].lower() - - def test_zero_min_rows_stays_configuring(self): - payload = '{"ready": true, "required_columns": ["a"], "min_rows": 0, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, "max_budget": 1000.0}' - with patch("skills.confidential_data_procurement.init.get_llm", - return_value=_FakeLLM(payload)): - result = procurement_init_handler("zero rows", _SEEDED_CONV) - assert result["status"] == "configuring" - assert "rows" in result["message"].lower() - - def test_base_price_above_budget_stays_configuring(self): - payload = '{"ready": true, "required_columns": ["a"], "min_rows": 100, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, "max_budget": 500.0, "base_price": 600.0}' - with patch("skills.confidential_data_procurement.init.get_llm", - return_value=_FakeLLM(payload)): - result = procurement_init_handler("bad price", _SEEDED_CONV) - assert result["status"] == "configuring" - assert "base price" in result["message"].lower() - - def test_non_json_response_stays_configuring(self): - with patch("skills.confidential_data_procurement.init.get_llm", - return_value=_FakeLLM("What forbidden columns do you need?")): - result = procurement_init_handler("not ready yet", _SEEDED_CONV) - assert result["status"] == "configuring" - assert result["message"] == "What forbidden columns do you need?" - - def test_conversation_accumulates(self): - with patch("skills.confidential_data_procurement.init.get_llm", - return_value=_FakeLLM(_VALID_JSON)): - result = procurement_init_handler("my policy", _SEEDED_CONV) - # seeded (2) + human (1) + ai (1) = 4 - assert len(result["conversation"]) == 4 - - -# --------------------------------------------------------------------------- -# run_skill + skill_card -# --------------------------------------------------------------------------- - -from skills.confidential_data_procurement import run_skill, skill_card - - -class TestRunSkill: - @pytest.mark.live - def test_good_dataset_returns_deal(self): - df = _make_df(rows=200) - policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) - dataset_id = _register_df(df) - try: - sub = SupplierSubmission( - submission_id="sub-good", - dataset_id=dataset_id, - dataset_name="fraud_data.csv", - reserve_price=1000.0, - ) - resp = run_skill([sub], policy) - assert resp.skill == "confidential_data_procurement" - assert len(resp.results) == 1 - r = resp.results[0] - assert r["deal"] is True - assert r["settlement_status"] == "pending_approval" - assert r["proposed_payment"] >= 500.0 - assert r["quality_score"] > 0.5 - finally: - cleanup(dataset_id) - - def test_critical_failure_returns_rejected(self): - df = _make_df(rows=50) - df["ssn"] = "xxx" - policy = _make_policy() - dataset_id = _register_df(df) - try: - sub = SupplierSubmission( - submission_id="sub-bad", - dataset_id=dataset_id, - dataset_name="bad_data.csv", - reserve_price=100.0, - ) - resp = run_skill([sub], policy) - r = resp.results[0] - assert r["deal"] is False - assert r["settlement_status"] == "rejected" - assert r["quality_score"] == 0.0 - finally: - cleanup(dataset_id) - - @pytest.mark.live - def test_reserve_above_payment_no_deal(self): - df = _make_df(rows=150) - policy = _make_policy(min_rows=100, max_budget=1000.0, base_price=0.0) - dataset_id = _register_df(df) - try: - sub = SupplierSubmission( - submission_id="sub-expensive", - dataset_id=dataset_id, - dataset_name="data.csv", - reserve_price=9999.0, - ) - resp = run_skill([sub], policy) - r = resp.results[0] - assert r["deal"] is False - assert r["settlement_status"] == "rejected" - finally: - cleanup(dataset_id) - - @pytest.mark.live - def test_internal_fields_stripped_by_guardrails(self): - """revised_budget and revised_reserve should not appear in output.""" - df = _make_df(rows=200) - policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) - dataset_id = _register_df(df) - try: - sub = SupplierSubmission( - submission_id="sub-internal", - dataset_id=dataset_id, - dataset_name="data.csv", - reserve_price=100.0, - ) - resp = run_skill([sub], policy) - r = resp.results[0] - assert "revised_budget" not in r - assert "revised_reserve" not in r - finally: - cleanup(dataset_id) - - -class TestSkillCard: - def test_card_name(self): - assert skill_card.name == "confidential_data_procurement" - - def test_card_has_required_fields(self): - assert skill_card.run is run_skill - assert skill_card.input_model is SupplierSubmission - assert skill_card.init_handler is procurement_init_handler - assert skill_card.upload_handler is procurement_upload_handler - - def test_output_keys_superset_of_user_keys(self): - assert skill_card.user_output_keys.issubset(skill_card.output_keys) - - def test_quality_score_buyer_only(self): - assert "quality_score" in skill_card.output_keys - assert "quality_score" not in skill_card.user_output_keys - - def test_metadata_serializable(self): - meta = skill_card.metadata() - assert meta["name"] == "confidential_data_procurement" - assert "quality_score" in meta["output_keys"] - assert "quality_score" not in meta["user_output_keys"] - - def test_threshold_is_one(self): - assert skill_card.config["min_submissions"] == 1 - - def test_respond_handler_registered(self): - from skills.confidential_data_procurement import procurement_respond_handler - assert skill_card.respond_handler is procurement_respond_handler - - -# --------------------------------------------------------------------------- -# Agent layer -# --------------------------------------------------------------------------- - -from skills.confidential_data_procurement.agent import _parse_agent_output, _safe_defaults -from skills.confidential_data_procurement.tools import ( - get_column_stats, - get_schema_summary, - get_value_distribution, - set_context, -) - - -_AGENT_JSON = ( - '{"schema_score": 0.8, "claim_veracity_score": 0.9, ' - '"schema_matching": {"transaction_id": "txn_id", "amount": "amount"}, ' - '"claim_verification": {"no_nulls": "disputed"}, ' - '"explanation": "Dataset looks reasonable."}' -) - - -class TestParseAgentOutput: - def test_valid_json_extracted(self): - policy = _make_policy() - result = _parse_agent_output(_AGENT_JSON, policy, {"no_nulls": "true"}) - assert result["schema_score"] == pytest.approx(0.8) - assert result["claim_veracity_score"] == pytest.approx(0.9) - assert result["schema_matching"]["transaction_id"] == "txn_id" - assert result["explanation"] == "Dataset looks reasonable." - - def test_clamped_scores(self): - policy = _make_policy() - bad = '{"schema_score": 2.5, "claim_veracity_score": -0.1, "explanation": "x"}' - result = _parse_agent_output(bad, policy, {}) - assert result["schema_score"] == 1.0 - assert result["claim_veracity_score"] == 0.0 - - def test_markdown_fences_stripped(self): - policy = _make_policy() - wrapped = f"```json\n{_AGENT_JSON}\n```" - result = _parse_agent_output(wrapped, policy, {}) - assert result["schema_score"] == pytest.approx(0.8) - - def test_unparseable_returns_defaults(self): - policy = _make_policy() - result = _parse_agent_output("Sorry, I could not evaluate.", policy, {"claim": "x"}) - assert result["schema_score"] == 0.5 - assert result["claim_veracity_score"] == 1.0 - - def test_safe_defaults_structure(self): - policy = _make_policy() - result = _safe_defaults(policy, {"low_nulls": "true"}) - assert "schema_matching" in result - assert "claim_verification" in result - assert result["claim_verification"]["low_nulls"] == "unverifiable" - - -class TestTools: - def setup_method(self): - self.df = _make_df(rows=50) - self.dataset_id = _register_df(self.df) - set_context(self.dataset_id, { - "required_columns": ["transaction_id", "amount"], - "column_definitions": {}, - "seller_claims": {}, - }) - - def teardown_method(self): - from skills.confidential_data_procurement.ingest import cleanup - cleanup(self.dataset_id) - - def test_schema_summary_passes_validator(self): - result = get_schema_summary.invoke({}) - assert "transaction_id" in result - assert "rows:" in result - - def test_column_stats_numeric(self): - result = get_column_stats.invoke({"column_name": "amount"}) - assert "numeric" in result - assert "mean" in result - - def test_column_stats_missing_column(self): - result = get_column_stats.invoke({"column_name": "nonexistent"}) - assert "not found" in result.lower() - - def test_value_distribution(self): - result = get_value_distribution.invoke({"column_name": "is_fraud", "top_n": 5}) - assert "is_fraud" in result - assert "distinct" in result - - def test_value_distribution_capped_at_20(self): - result = get_value_distribution.invoke({"column_name": "amount", "top_n": 999}) - assert "top-20" in result - - -# --------------------------------------------------------------------------- -# respond_handler + renegotiation (3×3 matrix) -# --------------------------------------------------------------------------- - -from skills.confidential_data_procurement import procurement_respond_handler - - -def _base_result(deal=True) -> dict: - return { - "submission_id": "sub-1", - "deal": deal, - "quality_score": 0.75, - "proposed_payment": 3000.0, - "hard_constraints_pass": True, - "settlement_status": "pending_approval" if deal else "rejected", - "release_token": None, - "notes": [], - "explanation": None, - "claim_verification": None, - "schema_matching": None, - "buyer_response": None, - "supplier_response": None, - "renegotiation_used": False, - "revised_budget": None, - "revised_reserve": None, - } - - -class TestRespondHandler: - # --- First response only → awaiting_counterparty --- - - def test_first_buyer_response_awaits_counterparty(self): - r = procurement_respond_handler(_base_result(), "accept", None, "buyer", _make_policy()) - assert r["settlement_status"] == "awaiting_counterparty" - assert r["buyer_response"] == "accept" - assert r["supplier_response"] is None - - def test_first_supplier_response_awaits_counterparty(self): - r = procurement_respond_handler(_base_result(), "accept", None, "supplier", _make_policy()) - assert r["settlement_status"] == "awaiting_counterparty" - assert r["supplier_response"] == "accept" - - # --- Both accept → authorized --- - - def test_both_accept_authorized(self): - result = _base_result() - result["buyer_response"] = "accept" - r = procurement_respond_handler(result, "accept", None, "supplier", _make_policy()) - assert r["settlement_status"] == "authorized" - assert r["deal"] is True - assert r["release_token"] is not None - - # --- Any reject → rejected --- - - def test_buyer_reject_rejected(self): - result = _base_result() - result["supplier_response"] = "accept" - r = procurement_respond_handler(result, "reject", None, "buyer", _make_policy()) - assert r["settlement_status"] == "rejected" - assert r["deal"] is False - - def test_supplier_reject_rejected(self): - result = _base_result() - result["buyer_response"] = "accept" - r = procurement_respond_handler(result, "reject", None, "supplier", _make_policy()) - assert r["settlement_status"] == "rejected" - - def test_both_reject_rejected(self): - result = _base_result() - result["buyer_response"] = "reject" - r = procurement_respond_handler(result, "reject", None, "supplier", _make_policy()) - assert r["settlement_status"] == "rejected" - - def test_renegotiate_then_reject_rejected(self): - result = _base_result() - result["buyer_response"] = "renegotiate" - result["revised_budget"] = 2500.0 - result["renegotiation_used"] = False - r = procurement_respond_handler(result, "reject", None, "supplier", _make_policy()) - assert r["settlement_status"] == "rejected" - - # --- accept + renegotiate → authorized at proposed_payment --- - - def test_buyer_accept_supplier_renegotiate_authorized(self): - result = _base_result() - result["buyer_response"] = "accept" - r = procurement_respond_handler(result, "renegotiate", 3500.0, "supplier", _make_policy()) - assert r["settlement_status"] == "authorized" - assert r["renegotiation_used"] is True - assert r["release_token"] is not None - - def test_supplier_accept_buyer_renegotiate_authorized(self): - result = _base_result() - result["supplier_response"] = "accept" - r = procurement_respond_handler(result, "renegotiate", 2500.0, "buyer", _make_policy()) - assert r["settlement_status"] == "authorized" - assert r["renegotiation_used"] is True - - # --- Both renegotiate --- - - def test_both_renegotiate_deal_succeeds(self): - result = _base_result() - result["buyer_response"] = "renegotiate" - result["revised_budget"] = 3000.0 - r = procurement_respond_handler(result, "renegotiate", 2500.0, "supplier", _make_policy()) - assert r["settlement_status"] == "authorized" - assert r["proposed_payment"] == 3000.0 - assert r["renegotiation_used"] is True - - def test_both_renegotiate_deal_fails(self): - result = _base_result() - result["buyer_response"] = "renegotiate" - result["revised_budget"] = 1000.0 - r = procurement_respond_handler(result, "renegotiate", 2000.0, "supplier", _make_policy()) - assert r["settlement_status"] == "rejected" - assert r["deal"] is False - assert any("renegotiation failed" in n.lower() for n in r["notes"]) - - # --- Validation errors --- - - def test_second_renegotiation_raises(self): - result = _base_result() - result["renegotiation_used"] = True - result["buyer_response"] = "renegotiate" - with pytest.raises(ValueError, match="already used"): - procurement_respond_handler(result, "renegotiate", 2000.0, "supplier", _make_policy()) - - def test_renegotiate_without_value_raises(self): - with pytest.raises(ValueError, match="revised_value is required"): - procurement_respond_handler(_base_result(), "renegotiate", None, "buyer", _make_policy()) - - def test_buyer_revised_above_budget_raises(self): - with pytest.raises(ValueError, match="max budget"): - procurement_respond_handler( - _base_result(), "renegotiate", 99999.0, "buyer", _make_policy() - ) - - def test_supplier_negative_reserve_raises(self): - with pytest.raises(ValueError, match="negative"): - procurement_respond_handler( - _base_result(), "renegotiate", -100.0, "supplier", _make_policy() - ) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 09737f7..b255a53 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -6,14 +6,20 @@ This is the CI test suite. Workflow covered: - 1. Operator init (multi-turn loop) → configuring → ready, tokens issued + 1. Instance setup → tokens issued 2. Participant submits below threshold → received_pending - 3. 5th submission auto-triggers pipeline → received_analysis_complete + 3. Nth submission auto-triggers pipeline → received_analysis_complete 4. Operator manual trigger → runs pipeline 5. Role-based result views (admin sees all, user sees own) 6. Token enforcement (missing/wrong/wrong-role → 401/403) + +Note: /init was removed in the agent-skill pivot. Tests now seed instances +directly via _setup_instance() until typed POST /instances lands in phase 4. """ from __future__ import annotations +import secrets +import uuid + import pytest from unittest.mock import patch from fastapi.testclient import TestClient @@ -44,39 +50,34 @@ def _fake_run_skill(inputs, params): ) -def _make_init_handler(): - """Stateful handler: call 1 → configuring, call 2 → ready.""" - calls = [] - - def handler(message, conversation): - calls.append(message) - conv = list(conversation) + [{"role": "human", "content": message}] - if len(calls) == 1: - conv.append({"role": "ai", "content": "What evaluation criteria would you like?"}) - return { - "status": "configuring", - "message": "What evaluation criteria would you like?", - "conversation": conv, - } - conv.append({"role": "ai", "content": "All set! Instance is ready."}) - return { - "status": "ready", - "message": "All set! Instance is ready.", - "conversation": conv, - "config": OperatorConfig( - criteria={"originality": 0.5, "feasibility": 0.5}, - guidelines="Focus on AI/ML projects", - ), - "threshold": 5, - } - - return handler +def _setup_instance(threshold=5): + """Seed an instance directly in routes._instances. Returns (instance_id, admin_token). + Replaces the now-deleted /init flow. Phase 4 will introduce typed POST /instances + and these tests will be updated to call it instead.""" + instance_id = str(uuid.uuid4()) + routes._instances[instance_id] = { + "skill_name": "hackathon_novelty", + "config": OperatorConfig( + criteria={"originality": 0.5, "feasibility": 0.5}, + guidelines="", + instance_id=instance_id, + ), + "threshold": threshold, + "conversation": [], + "triggered": False, + } + routes._submissions[instance_id] = {} + routes._results[instance_id] = {} + + admin_token = secrets.token_urlsafe(16) + routes._tokens[admin_token] = { + "instance_id": instance_id, + "role": "admin", + "submission_ids": set(), + } + return instance_id, admin_token -# NOTE — Per-user token isolation: -# Each participant registers via POST /register to receive a unique user token. -# Ownership (token → submission_ids) is tracked inside the TEE. -# GET /results/{id} returns 403 if the token did not submit that submission_id. # --- Fixtures --- @@ -99,60 +100,16 @@ def client(): # --- Tests --- -def test_operator_init_loop(client): - """Two-turn init: first response asks for more info, second issues tokens.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler): - # Turn 1: LLM asks for criteria - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "I want to run a hackathon", - }) - assert r.status_code == 200 - body = r.json() - assert body["status"] == "configuring" - assert body["admin_token"] is None - instance_id = body["instance_id"] - - # Turn 2: operator provides criteria → ready - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "originality 0.5, feasibility 0.5", - "instance_id": instance_id, - }) - assert r.status_code == 200 - body = r.json() - assert body["status"] == "ready" - assert body["admin_token"] is not None - assert "user_token" not in body - assert body["instance_id"] == instance_id - - def test_full_e2e_workflow(client): - """Full happy path: init → submit below threshold → auto-trigger → view results → manual trigger.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler), \ - patch.object(skill_card, "run", _fake_run_skill): - - # Step 1-2: Operator init loop - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "setup hackathon", - }) - instance_id = r.json()["instance_id"] - - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "originality 0.5, feasibility 0.5", - "instance_id": instance_id, - }) - admin_token = r.json()["admin_token"] + """Full happy path: seed instance → submit below threshold → auto-trigger → view results → manual trigger.""" + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, admin_token = _setup_instance(threshold=5) r = client.post("/register", json={"instance_id": instance_id}) assert r.status_code == 200 user_token = r.json()["user_token"] - # Step 3: Submit 4 times — all below threshold + # Submit 4 times — all below threshold for i in range(1, 5): r = client.post( "/submit", @@ -164,7 +121,7 @@ def test_full_e2e_workflow(client): assert body["status"] == "received_pending" assert body["submissions_count"] == i - # Step 4: 5th submission auto-triggers pipeline + # 5th submission auto-triggers pipeline r = client.post( "/submit", json={"submission_id": "sub_005", "idea_text": "Fifth idea, triggers pipeline"}, @@ -173,7 +130,7 @@ def test_full_e2e_workflow(client): assert r.status_code == 200 assert r.json()["status"] == "received_analysis_complete" - # Step 5: Participant views their own result + # Participant views their own result r = client.get("/results/sub_001", headers={"X-Instance-Token": user_token}) assert r.status_code == 200 body = r.json() @@ -183,16 +140,15 @@ def test_full_e2e_workflow(client): # Users should NOT see internal fields assert "criteria_scores" not in body assert "status" not in body - assert "relevance_score" not in body - # Step 6: Operator views all results + # Operator views all results r = client.get("/results", headers={"X-Instance-Token": admin_token}) assert r.status_code == 200 results = r.json()["results"] assert len(results) == 5 assert all("submission_id" in res for res in results) - # Step 7: Operator manual trigger + # Operator manual trigger r = client.post("/trigger", headers={"X-Instance-Token": admin_token}) assert r.status_code == 200 assert r.json()["status"] == "complete" @@ -201,16 +157,7 @@ def test_full_e2e_workflow(client): def test_token_enforcement(client): """Token-based auth and role enforcement.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler): - r = client.post("/init", json={"skill_name": "hackathon_novelty", "message": "start"}) - instance_id = r.json()["instance_id"] - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "criteria ready", - "instance_id": instance_id, - }) - admin_token = r.json()["admin_token"] + instance_id, admin_token = _setup_instance() r = client.post("/register", json={"instance_id": instance_id}) user_token = r.json()["user_token"] @@ -246,16 +193,7 @@ def test_token_enforcement(client): def test_result_not_found_before_pipeline(client): """Requesting a result before the pipeline runs returns 404.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler): - r = client.post("/init", json={"skill_name": "hackathon_novelty", "message": "start"}) - instance_id = r.json()["instance_id"] - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "ready", - "instance_id": instance_id, - }) - instance_id = r.json()["instance_id"] + instance_id, _ = _setup_instance() r = client.post("/register", json={"instance_id": instance_id}) user_token = r.json()["user_token"] @@ -264,13 +202,9 @@ def test_result_not_found_before_pipeline(client): assert r.status_code == 404 -def test_init_unknown_instance_returns_404(client): - """Continuing an init conversation with a non-existent instance_id returns 404.""" - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "hello", - "instance_id": "does-not-exist", - }) +def test_register_unknown_instance_returns_404(client): + """Registering for a non-existent instance returns 404.""" + r = client.post("/register", json={"instance_id": "does-not-exist"}) assert r.status_code == 404 @@ -304,87 +238,11 @@ def test_skills_metadata_endpoints(client): assert r.status_code == 404 -def test_init_unknown_skill_returns_404(client): - """POST /init with a non-existent skill_name returns 404, not 500.""" - r = client.post("/init", json={ - "skill_name": "nonexistent_skill", - "message": "hello", - }) - assert r.status_code == 404 - assert "not found" in r.json()["detail"].lower() - - -def test_init_rejects_empty_criteria(): - """Init handler returns configuring when LLM extracts empty criteria.""" - from skills.hackathon_novelty.init import hackathon_init_handler - from unittest.mock import patch - - class _FakeLLM: - def invoke(self, messages): - class _Resp: - content = '{"ready": true, "criteria": {}, "guidelines": "", "threshold": 5}' - return _Resp() - - # Pass non-empty conversation so it skips the greeting template and hits the LLM - seeded_conversation = [ - {"role": "system", "content": "system prompt"}, - {"role": "ai", "content": "greeting"}, - ] - with patch("skills.hackathon_novelty.init.get_llm", return_value=_FakeLLM()): - result = hackathon_init_handler("use empty criteria", seeded_conversation) - assert result["status"] == "configuring" - assert "empty" in result["message"].lower() or "criterion" in result["message"].lower() - - -def test_init_rejects_bad_weight_sum(): - """Init handler returns configuring when criteria weights don't sum to ~1.0.""" - from skills.hackathon_novelty.init import hackathon_init_handler - from unittest.mock import patch - - class _FakeLLM: - def invoke(self, messages): - class _Resp: - content = '{"ready": true, "criteria": {"a": 0.3, "b": 0.3}, "guidelines": "", "threshold": 5}' - return _Resp() - - seeded_conversation = [ - {"role": "system", "content": "system prompt"}, - {"role": "ai", "content": "greeting"}, - ] - with patch("skills.hackathon_novelty.init.get_llm", return_value=_FakeLLM()): - result = hackathon_init_handler("bad weights", seeded_conversation) - assert result["status"] == "configuring" - assert "1.0" in result["message"] or "sum" in result["message"].lower() - - -def test_init_rejects_non_numeric_threshold(): - """Init handler returns configuring when threshold is non-numeric.""" - from skills.hackathon_novelty.init import hackathon_init_handler - from unittest.mock import patch - - class _FakeLLM: - def invoke(self, messages): - class _Resp: - content = '{"ready": true, "criteria": {"a": 0.5, "b": 0.5}, "guidelines": "", "threshold": "five"}' - return _Resp() - - seeded_conversation = [ - {"role": "system", "content": "system prompt"}, - {"role": "ai", "content": "greeting"}, - ] - with patch("skills.hackathon_novelty.init.get_llm", return_value=_FakeLLM()): - result = hackathon_init_handler("bad threshold", seeded_conversation) - assert result["status"] == "configuring" - assert "threshold" in result["message"].lower() - - def test_missing_agent_result_produces_error_status(): """When agent output is missing a submission_id, that result gets status='error'.""" import numpy as np - from unittest.mock import patch from skills.hackathon_novelty import run_skill from skills.hackathon_novelty.models import HackathonSubmission - from core.models import OperatorConfig inputs = [ HackathonSubmission(submission_id=f"sub_{i:03d}", idea_text=f"Unique idea number {i}") @@ -425,17 +283,8 @@ def test_missing_agent_result_produces_error_status(): def test_retrigger_on_6th_submission(client): """After threshold triggers on 5th submission, 6th submission re-triggers with all 6 scored.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler), \ - patch.object(skill_card, "run", _fake_run_skill): - r = client.post("/init", json={"skill_name": "hackathon_novelty", "message": "start"}) - instance_id = r.json()["instance_id"] - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "ready", - "instance_id": instance_id, - }) - admin_token = r.json()["admin_token"] + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, admin_token = _setup_instance(threshold=5) r = client.post("/register", json={"instance_id": instance_id}) user_token = r.json()["user_token"] @@ -462,16 +311,7 @@ def test_retrigger_on_6th_submission(client): def test_submit_missing_required_field_returns_422(client): """Submitting without the required idea_text field returns 422.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler): - r = client.post("/init", json={"skill_name": "hackathon_novelty", "message": "start"}) - instance_id = r.json()["instance_id"] - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "ready", - "instance_id": instance_id, - }) - instance_id = r.json()["instance_id"] + instance_id, _ = _setup_instance() r = client.post("/register", json={"instance_id": instance_id}) user_token = r.json()["user_token"] @@ -486,16 +326,8 @@ def test_submit_missing_required_field_returns_422(client): def test_cross_user_result_isolation(client): """User A cannot read User B's result even if they know the submission_id.""" - handler = _make_init_handler() - with patch.object(skill_card, "init_handler", handler), \ - patch.object(skill_card, "run", _fake_run_skill): - r = client.post("/init", json={"skill_name": "hackathon_novelty", "message": "start"}) - instance_id = r.json()["instance_id"] - r = client.post("/init", json={ - "skill_name": "hackathon_novelty", - "message": "ready", - "instance_id": instance_id, - }) + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, _ = _setup_instance(threshold=5) # Two distinct users register token_a = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] diff --git a/tests/test_live_e2e.py b/tests/test_live_e2e.py deleted file mode 100644 index 52db6ed..0000000 --- a/tests/test_live_e2e.py +++ /dev/null @@ -1,496 +0,0 @@ -""" -Live E2E tests for confidential_data_procurement. - -Tests the full API endpoint flow with real HuggingFace transaction data and a real LLM. -No hardcoded BuyerPolicy — buyer describes requirements in natural language via POST /init. - -Scenarios: - 1. Full satisfaction — clean seller data, all required columns → deal, both accept - 2. Partial satisfaction — seller drops 'category' → lower payment (schema gap flagged) - 3. Bad data — >50% duplicate rows → critical rejection, no LLM agent ran - 4. Renegotiation overlap — partial data, both negotiate, terms meet → authorized - 5. Renegotiation no overlap — partial data, buyer drops 40%, seller holds → rejected - -NOTE: All tests are @pytest.mark.live. They are skipped unless CONCLAVE_NEARAI_API_KEY -is set in the environment. Run individually with: - set -a && source .env && set +a - ./venv/bin/python -m pytest tests/test_live_e2e.py -v -s -""" -from __future__ import annotations - -import json -import uuid - -import pandas as pd -import pytest -from fastapi.testclient import TestClient - -import api.routes as routes -from skills.confidential_data_procurement.ingest import _datasets - - -# --------------------------------------------------------------------------- -# Buyer prompt — natural language, ~150 words, all required fields included -# --------------------------------------------------------------------------- - -BUYER_PROMPT = ( - "We are building a machine learning pipeline for real-time fraud detection in " - "payment processing. We need a labeled transaction dataset with four specific columns: " - "transaction_id (a unique identifier per transaction), amount (the transaction value " - "in USD), is_fraud (a binary label — 1 for fraudulent, 0 for legitimate), and category " - "(merchant category code). The dataset must contain at least 500 rows. We can tolerate " - "at most 5% missing values and at most 10% duplicate rows. No personally identifiable " - "information should appear — date of birth, credit card numbers, social security numbers, " - "or any customer names and addresses are strictly not acceptable. This data will train a " - "gradient boosting classifier so label accuracy and field completeness are critical. " - "Our maximum budget for a perfect dataset is $800. We have no floor price — if the " - "data is unusable we expect to pay nothing." -) - - -# --------------------------------------------------------------------------- -# Seller metadata -# --------------------------------------------------------------------------- - -_META_CLEAN = json.dumps({ - "column_definitions": { - "transaction_id": "Unique identifier per transaction", - "amount": "Transaction value in USD", - "is_fraud": "Binary fraud label — 1 if fraudulent, 0 otherwise", - "category": "Merchant category code", - }, - "seller_claims": { - "completeness": "All four columns are fully populated — zero missing values", - "label_rate": "Approximately 4% of transactions are labeled as fraudulent", - }, -}).encode() - -_META_PARTIAL = json.dumps({ - "column_definitions": { - "transaction_id": "Unique identifier per transaction", - "amount": "Transaction value in USD", - "is_fraud": "Binary fraud label", - }, - "seller_claims": { - "completeness": "Data is mostly complete", - }, -}).encode() - - -# --------------------------------------------------------------------------- -# Session fixtures -# --------------------------------------------------------------------------- - -@pytest.fixture(scope="session") -def app_client(): - """Session-scoped TestClient — state persists across all live E2E tests.""" - # Clear any leftover state from prior test files - routes._instances.clear() - routes._submissions.clear() - routes._results.clear() - routes._tokens.clear() - routes._registrations.clear() - _datasets.clear() - from main import app - with TestClient(app) as client: - yield client - _datasets.clear() - - -@pytest.fixture(scope="session") -def buyer_init(app_client): - """ - Single real LLM conversation: buyer describes requirements → BuyerPolicy extracted. - Session-scoped — runs once, all scenario tests share the same instance. - """ - r1 = app_client.post("/init", json={ - "skill_name": "confidential_data_procurement", - "message": "I want to set up a data procurement instance.", - }) - assert r1.status_code == 200, r1.text - instance_id = r1.json()["instance_id"] - - r2 = app_client.post("/init", json={ - "skill_name": "confidential_data_procurement", - "message": BUYER_PROMPT, - "instance_id": instance_id, - }) - assert r2.status_code == 200, r2.text - - # If LLM asks a follow-up, give one more nudge - if r2.json().get("status") != "ready": - r3 = app_client.post("/init", json={ - "skill_name": "confidential_data_procurement", - "message": "That covers everything. Please finalize the policy.", - "instance_id": instance_id, - }) - assert r3.status_code == 200, r3.text - assert r3.json().get("status") == "ready", ( - f"Init handler did not reach ready after 3 turns: {r3.json().get('message')}" - ) - admin_token = r3.json()["admin_token"] - else: - admin_token = r2.json()["admin_token"] - - print(f"\n[buyer_init] instance_id={instance_id}, admin_token={admin_token[:12]}...") - return instance_id, admin_token - - -# --------------------------------------------------------------------------- -# Seller data builders (from real HuggingFace base_df) -# --------------------------------------------------------------------------- - -def _clean_csv(base_df: pd.DataFrame) -> bytes: - """All four required columns, no corruption.""" - cols = [c for c in ["transaction_id", "amount", "is_fraud", "category"] - if c in base_df.columns] - return base_df[cols].to_csv(index=False).encode() - - -def _partial_csv(base_df: pd.DataFrame) -> bytes: - """Drop 'category' — buyer requires it. Everything else intact.""" - cols = [c for c in ["transaction_id", "amount", "is_fraud"] - if c in base_df.columns] - return base_df[cols].to_csv(index=False).encode() - - -def _bad_csv(base_df: pd.DataFrame) -> bytes: - """Duplicate every row — produces exactly 50% dup rate, triggers critical rejection.""" - cols = [c for c in ["transaction_id", "amount", "is_fraud", "category"] - if c in base_df.columns] - df = base_df[cols] - return pd.concat([df, df]).to_csv(index=False).encode() - - -# --------------------------------------------------------------------------- -# API helpers -# --------------------------------------------------------------------------- - -def _register(client, instance_id: str) -> str: - r = client.post("/register", json={"instance_id": instance_id}) - assert r.status_code == 200, r.text - return r.json()["user_token"] - - -def _upload(client, user_token: str, csv_bytes: bytes, metadata_bytes: bytes) -> str: - r = client.post( - "/upload", - files={ - "csv_file": ("dataset.csv", csv_bytes, "text/csv"), - "metadata_file": ("metadata.json", metadata_bytes, "application/json"), - }, - headers={"X-Instance-Token": user_token}, - ) - assert r.status_code == 200, r.text - return r.json()["dataset_id"] - - -def _submit(client, user_token: str, dataset_id: str, sub_id: str, reserve: float) -> dict: - r = client.post( - "/submit", - json={ - "submission_id": sub_id, - "dataset_id": dataset_id, - "dataset_name": "seller_data.csv", - "reserve_price": reserve, - }, - headers={"X-Instance-Token": user_token}, - ) - assert r.status_code == 200, r.text - return r.json() - - -def _get_result(client, sub_id: str, token: str) -> dict: - r = client.get(f"/results/{sub_id}", headers={"X-Instance-Token": token}) - assert r.status_code == 200, r.text - return r.json() - - -def _respond(client, sub_id: str, action: str, token: str, - revised_value: float | None = None) -> dict: - body: dict = {"submission_id": sub_id, "action": action} - if revised_value is not None: - body["revised_value"] = revised_value - r = client.post("/respond", json=body, headers={"X-Instance-Token": token}) - assert r.status_code == 200, r.text - return r.json() - - -def _run_pipeline(client, buyer_init, csv_bytes: bytes, metadata_bytes: bytes, - reserve: float): - """ - Full supplier flow: register → upload → submit → get_result. - Returns (result_dict, admin_token, user_token, sub_id). - """ - instance_id, admin_token = buyer_init - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token, csv_bytes, metadata_bytes) - sub_id = str(uuid.uuid4())[:12] - _submit(client, user_token, dataset_id, sub_id, reserve) - result = _get_result(client, sub_id, admin_token) - print(f"\n pipeline → sub={sub_id} deal={result.get('deal')} " - f"quality={result.get('quality_score')} payment=${result.get('proposed_payment')}") - return result, admin_token, user_token, sub_id - - -# --------------------------------------------------------------------------- -# Scenario 1: Full satisfaction -# --------------------------------------------------------------------------- - -@pytest.mark.live -def test_full_satisfaction(app_client, base_df, buyer_init, matrix_results): - """ - 100% satisfied: clean data, all required columns, honest claims. - Both parties accept the enclave's offer → authorized. - """ - result, admin_token, user_token, sub_id = _run_pipeline( - app_client, buyer_init, - _clean_csv(base_df), _META_CLEAN, reserve=150.0, - ) - - assert result.get("deal") is True, f"Expected deal=True, got: {result}" - assert result.get("settlement_status") == "pending_approval" - - _respond(app_client, sub_id, "accept", admin_token) - respond_result = _respond(app_client, sub_id, "accept", user_token) - assert respond_result["settlement_status"] == "authorized" - - # Fetch full result to verify release_token issued - final = _get_result(app_client, sub_id, admin_token) - assert final.get("release_token") is not None - - matrix_results.append({ - "type": "evaluation", - "scenario": "Full Satisfaction", - "narrative": ( - "Buyer described exact requirements in natural language — LLM extracted the policy. " - "Seller uploaded clean real transaction data (HuggingFace) with all four required columns. " - "Agent verified claims and found no schema gaps. Both parties accepted the enclave's offer." - ), - "buyer_prompt": BUYER_PROMPT, - "seller_input": { - "column_definitions": { - "transaction_id": "Unique identifier per transaction", - "amount": "Transaction value in USD", - "is_fraud": "Binary fraud label — 1 if fraudulent, 0 otherwise", - "category": "Merchant category code", - }, - "seller_claims": { - "completeness": "All four columns are fully populated — zero missing values", - "label_rate": "Approximately 4% of transactions are labeled as fraudulent", - }, - }, - "seller": "clean", - "buyer": "standard ($800)", - "reserve": 150.0, - "quality": result.get("quality_score"), - "payment": result.get("proposed_payment"), - "deal": True, - "notes": result.get("notes", []), - "explanation": result.get("explanation", ""), - "schema_matching": result.get("schema_matching"), - "claim_verification": result.get("claim_verification"), - }) - - -# --------------------------------------------------------------------------- -# Scenario 2: Partial satisfaction -# --------------------------------------------------------------------------- - -@pytest.mark.live -def test_partial_satisfaction(app_client, base_df, buyer_init, matrix_results): - """ - ~80% satisfied: seller drops 'category' (buyer required it). - Agent penalises schema score — payment is proportionally lower. - """ - result, _, _, _ = _run_pipeline( - app_client, buyer_init, - _partial_csv(base_df), _META_PARTIAL, reserve=50.0, - ) - - # Agent should at minimum note the missing column in its explanation - explanation = result.get("explanation", "") - assert "category" in explanation.lower(), ( - f"Expected agent to flag missing 'category' column. Explanation: {explanation}" - ) - - matrix_results.append({ - "type": "evaluation", - "scenario": "Partial Satisfaction", - "narrative": ( - "Seller omitted the 'category' column, which the buyer explicitly required. " - "Agent identified the schema gap and noted it in the explanation. " - "Quality score and payment reflect the LLM's assessment of the partial dataset." - ), - "buyer_prompt": BUYER_PROMPT, - "seller_input": { - "column_definitions": { - "transaction_id": "Unique identifier per transaction", - "amount": "Transaction value in USD", - "is_fraud": "Binary fraud label", - }, - "seller_claims": { - "completeness": "Data is mostly complete", - }, - }, - "seller": "partial (missing category)", - "buyer": "standard ($800)", - "reserve": 50.0, - "quality": result.get("quality_score"), - "payment": result.get("proposed_payment"), - "deal": result.get("deal"), - "notes": result.get("notes", []), - "explanation": result.get("explanation", ""), - "schema_matching": result.get("schema_matching"), - "claim_verification": result.get("claim_verification"), - }) - - -# --------------------------------------------------------------------------- -# Scenario 3: Bad data — critical rejection -# --------------------------------------------------------------------------- - -@pytest.mark.live -def test_bad_data_rejected(app_client, base_df, buyer_init, matrix_results): - """ - Critical: >50% duplicate rows → immediate rejection by deterministic layer. - No LLM agent ran — explanation is absent. - """ - result, _, _, _ = _run_pipeline( - app_client, buyer_init, - _bad_csv(base_df), _META_CLEAN, reserve=0.0, - ) - - assert result.get("deal") is False - assert not result.get("explanation"), "Agent should not have run for critical rejection" - - matrix_results.append({ - "type": "evaluation", - "scenario": "Critical: >50% Duplicates", - "narrative": ( - "Seller submitted a dataset where every row is duplicated — over 50% dup rate. " - "The deterministic layer flags this as a critical violation and rejects immediately. " - "No LLM call. Payment = $0. Seller's reserve price is irrelevant." - ), - "buyer_prompt": BUYER_PROMPT, - "seller_input": { - "column_definitions": { - "transaction_id": "Unique identifier per transaction", - "amount": "Transaction value in USD", - "is_fraud": "Binary fraud label — 1 if fraudulent, 0 otherwise", - "category": "Merchant category code", - }, - "seller_claims": {}, - "note": "Every row duplicated — submitted 2000 rows from a 1000-row base dataset", - }, - "seller": "duplicated (>50%)", - "buyer": "standard ($800)", - "reserve": 0.0, - "quality": 0.0, - "payment": result.get("proposed_payment", 0), - "deal": False, - "notes": result.get("notes", []), - "explanation": None, - "schema_matching": None, - "claim_verification": None, - }) - - -# --------------------------------------------------------------------------- -# Scenario 4: Renegotiation — terms overlap → deal -# --------------------------------------------------------------------------- - -@pytest.mark.live -def test_renegotiation_overlap(app_client, base_df, buyer_init, matrix_results): - """ - Partial data evaluated by full pipeline. Enclave proposes an offer. - Buyer revises down 12%, seller lowers floor 21% — they overlap → deal. - Amounts derived from the actual pipeline result (no hardcoding). - """ - result, admin_token, user_token, sub_id = _run_pipeline( - app_client, buyer_init, - _partial_csv(base_df), _META_PARTIAL, reserve=50.0, - ) - assert result.get("settlement_status") == "pending_approval", ( - f"Expected pending_approval for renegotiation test, got: {result.get('settlement_status')}" - ) - - p = result["proposed_payment"] - buyer_revised = round(p * 0.88) # buyer cuts 12% - supplier_revised = round(p * 0.79) # seller lowers floor 21% - - _respond(app_client, sub_id, "renegotiate", admin_token, buyer_revised) - respond_result = _respond(app_client, sub_id, "renegotiate", user_token, supplier_revised) - assert respond_result["settlement_status"] == "authorized", ( - f"Expected authorized, got: {respond_result['settlement_status']}. " - f"buyer_revised={buyer_revised}, supplier_revised={supplier_revised}" - ) - - # Fetch full result to verify final payment - final = _get_result(app_client, sub_id, admin_token) - assert final["proposed_payment"] == buyer_revised - - matrix_results.append({ - "type": "renegotiation", - "scenario": "Renegotiation — Terms Overlap", - "narrative": ( - f"Enclave offers ${p:.0f} for partial data (missing category). " - f"Buyer revises down to ${buyer_revised:.0f} (−12%). " - f"Seller lowers floor to ${supplier_revised:.0f} (−21%). " - f"${buyer_revised:.0f} ≥ ${supplier_revised:.0f} → deal at buyer's revised offer. " - "Neither party saw the other's private number." - ), - "initial_offer": p, - "buyer_action": f"renegotiate → ${buyer_revised:.0f}", - "supplier_action": f"renegotiate → ${supplier_revised:.0f}", - "final_payment": buyer_revised, - "deal": True, - }) - - -# --------------------------------------------------------------------------- -# Scenario 5: Renegotiation — no overlap → rejected -# --------------------------------------------------------------------------- - -@pytest.mark.live -def test_renegotiation_no_overlap(app_client, base_df, buyer_init, matrix_results): - """ - Partial data evaluated. Buyer drops 40%, seller barely moves (−5%). - No overlap → deal rejected. One renegotiation round used, deal falls through. - """ - result, admin_token, user_token, sub_id = _run_pipeline( - app_client, buyer_init, - _partial_csv(base_df), _META_PARTIAL, reserve=50.0, - ) - assert result.get("settlement_status") == "pending_approval", ( - f"Expected pending_approval for renegotiation test, got: {result.get('settlement_status')}" - ) - - p = result["proposed_payment"] - buyer_revised = round(p * 0.60) # buyer drops 40% - supplier_revised = round(p * 0.95) # seller barely moves - - _respond(app_client, sub_id, "renegotiate", admin_token, buyer_revised) - final = _respond(app_client, sub_id, "renegotiate", user_token, supplier_revised) - - assert final["settlement_status"] == "rejected", ( - f"Expected rejected, got: {final['settlement_status']}. " - f"buyer_revised={buyer_revised}, supplier_revised={supplier_revised}" - ) - - matrix_results.append({ - "type": "renegotiation", - "scenario": "Renegotiation — No Overlap", - "narrative": ( - f"Enclave offers ${p:.0f} for partial data. " - f"Buyer drops hard to ${buyer_revised:.0f} (−40%). " - f"Seller holds firm at ${supplier_revised:.0f} (−5%). " - f"${buyer_revised:.0f} < ${supplier_revised:.0f} → deal falls through. " - "One round used — both sides walked away." - ), - "initial_offer": p, - "buyer_action": f"renegotiate → ${buyer_revised:.0f}", - "supplier_action": f"renegotiate → ${supplier_revised:.0f}", - "final_payment": None, - "deal": False, - }) diff --git a/tests/test_procurement_e2e.py b/tests/test_procurement_e2e.py deleted file mode 100644 index dac73ba..0000000 --- a/tests/test_procurement_e2e.py +++ /dev/null @@ -1,450 +0,0 @@ -""" -E2E tests for the confidential_data_procurement skill. - -Validates API plumbing: token auth, upload, submit, role-filtered results, -deal responses, and renegotiation. LLM + deterministic pipeline are mocked -so no API keys or credits are needed. - -Scenarios: - 1. Happy path: init → register → upload → submit → accept → authorized - 2. Critical reject: forbidden column CSV → immediate rejection, no LLM - 3. Role filtering: buyer sees quality_score, supplier does not - 4. Renegotiate: double-renegotiate with overlapping terms → authorized - 5. Mixed respond: buyer accept + supplier renegotiate → authorized - 6. Token enforcement: missing/invalid token → 401/403 -""" -from __future__ import annotations - -import json - -import pytest -from fastapi.testclient import TestClient -from unittest.mock import patch - -import api.routes as routes -from core.models import SkillResponse -from skills.confidential_data_procurement import skill_card as proc_card -from skills.confidential_data_procurement.models import BuyerPolicy - - -# --------------------------------------------------------------------------- -# Test data -# --------------------------------------------------------------------------- - -_GOOD_CSV = ( - b"transaction_id,amount,is_fraud\n" - + b"".join( - f"txn_{i:04d},{i * 10.5:.2f},{1 if i % 25 == 0 else 0}\n".encode() - for i in range(200) - ) -) - -_BAD_CSV = ( - b"transaction_id,amount,is_fraud,ssn\n" - + b"".join( - f"txn_{i:04d},{i * 10.5:.2f},{1 if i % 25 == 0 else 0},xxx-xx-0000\n".encode() - for i in range(50) - ) -) - -_METADATA_JSON = json.dumps({ - "column_definitions": { - "transaction_id": "Unique ID for each transaction", - "amount": "Transaction amount in USD", - "is_fraud": "1 if fraudulent, 0 otherwise", - }, - "seller_claims": { - "balanced_labels": "Approximately 4% fraud rate", - "no_missing_values": "All fields fully populated", - }, -}).encode() - -_POLICY = BuyerPolicy( - required_columns=["transaction_id", "amount", "is_fraud"], - min_rows=100, - max_null_rate=0.05, - max_duplicate_rate=0.10, - min_label_rate=0.02, - label_column="is_fraud", - forbidden_columns=["ssn", "dob"], - max_budget=5000.0, - base_price=500.0, -) - - -# --------------------------------------------------------------------------- -# Fakes -# --------------------------------------------------------------------------- - -def _make_init_handler(policy: BuyerPolicy = _POLICY): - """Stateful mock: turn 1 → configuring, turn 2 → ready with BuyerPolicy.""" - calls = [] - - def handler(message, conversation): - calls.append(message) - conv = list(conversation) + [{"role": "human", "content": message}] - if len(calls) == 1: - conv.append({"role": "ai", "content": "Please describe your dataset requirements."}) - return { - "status": "configuring", - "message": "Please describe your dataset requirements.", - "conversation": conv, - } - conv.append({"role": "ai", "content": "Policy saved."}) - return { - "status": "ready", - "message": "Policy saved.", - "conversation": conv, - "config": policy, - "threshold": 1, - } - - return handler - - -def _fake_run_deal(inputs, params): - return SkillResponse( - skill="confidential_data_procurement", - results=[{ - "submission_id": inputs[0].submission_id, - "deal": True, - "quality_score": 0.82, - "proposed_payment": 3500.0, - "hard_constraints_pass": True, - "settlement_status": "pending_approval", - "release_token": None, - "notes": [], - "explanation": "Dataset meets all requirements.", - "claim_verification": {"balanced_labels": "verified"}, - "schema_matching": {"transaction_id": "transaction_id"}, - "buyer_response": None, - "supplier_response": None, - "renegotiation_used": False, - }], - ) - - -def _fake_run_rejected(inputs, params): - return SkillResponse( - skill="confidential_data_procurement", - results=[{ - "submission_id": inputs[0].submission_id, - "deal": False, - "quality_score": 0.0, - "proposed_payment": 500.0, - "hard_constraints_pass": False, - "settlement_status": "rejected", - "release_token": None, - "notes": ["Forbidden column 'ssn' detected. Deal rejected."], - "explanation": None, - "claim_verification": None, - "schema_matching": None, - "buyer_response": None, - "supplier_response": None, - "renegotiation_used": False, - }], - ) - - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - -@pytest.fixture(autouse=True) -def clear_stores(): - """Reset all in-memory state before each test.""" - routes._instances.clear() - routes._submissions.clear() - routes._results.clear() - routes._tokens.clear() - routes._registrations.clear() - from skills.confidential_data_procurement.ingest import _datasets - _datasets.clear() - yield - _datasets.clear() - - -@pytest.fixture -def client(): - from main import app - return TestClient(app) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _init_procurement(client, policy=_POLICY): - """Run the two-turn init flow and return (instance_id, admin_token).""" - handler = _make_init_handler(policy) - with patch.object(proc_card, "init_handler", handler): - r = client.post("/init", json={"skill_name": "confidential_data_procurement", "message": "setup"}) - assert r.status_code == 200 - instance_id = r.json()["instance_id"] - - r = client.post("/init", json={ - "skill_name": "confidential_data_procurement", - "message": "transaction_id, amount, is_fraud, budget 5000", - "instance_id": instance_id, - }) - assert r.status_code == 200 - assert r.json()["status"] == "ready" - return instance_id, r.json()["admin_token"] - - -def _register(client, instance_id): - r = client.post("/register", json={"instance_id": instance_id}) - assert r.status_code == 200 - return r.json()["user_token"] - - -def _upload(client, user_token, csv_bytes=_GOOD_CSV, metadata_bytes=_METADATA_JSON): - r = client.post( - "/upload", - files={ - "csv_file": ("dataset.csv", csv_bytes, "text/csv"), - "metadata_file": ("metadata.json", metadata_bytes, "application/json"), - }, - headers={"X-Instance-Token": user_token}, - ) - assert r.status_code == 200, r.text - return r.json()["dataset_id"] - - -def _submit(client, user_token, dataset_id, sub_id="sub-001", reserve=1000.0): - r = client.post( - "/submit", - json={ - "submission_id": sub_id, - "dataset_id": dataset_id, - "dataset_name": "fraud_dataset.csv", - "reserve_price": reserve, - }, - headers={"X-Instance-Token": user_token}, - ) - assert r.status_code == 200 - return r.json() - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - -def test_procurement_happy_path_both_accept(client): - """Full happy path: init → upload → submit → both accept → authorized.""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token) - - with patch.object(proc_card, "run", _fake_run_deal): - resp = _submit(client, user_token, dataset_id) - assert resp["status"] == "received_analysis_complete" - - # Buyer views result (should see quality_score) - r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) - assert r.status_code == 200 - result = r.json() - assert result["deal"] is True - assert "quality_score" in result - assert result["settlement_status"] == "pending_approval" - - # Buyer accepts - r = client.post("/respond", json={ - "submission_id": "sub-001", - "action": "accept", - }, headers={"X-Instance-Token": admin_token}) - assert r.status_code == 200 - assert r.json()["settlement_status"] == "awaiting_counterparty" - - # Supplier accepts - r = client.post("/respond", json={ - "submission_id": "sub-001", - "action": "accept", - }, headers={"X-Instance-Token": user_token}) - assert r.status_code == 200 - assert r.json()["settlement_status"] == "authorized" - - # Final result should have release_token - r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) - assert r.json()["release_token"] is not None - assert r.json()["settlement_status"] == "authorized" - - -def test_procurement_critical_reject(client): - """Forbidden column CSV → settlement_status='rejected' immediately.""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token, csv_bytes=_BAD_CSV) - - with patch.object(proc_card, "run", _fake_run_rejected): - _submit(client, user_token, dataset_id) - - r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) - assert r.status_code == 200 - result = r.json() - assert result["deal"] is False - assert result["settlement_status"] == "rejected" - assert len(result["notes"]) > 0 - - -def test_procurement_role_filtering(client): - """Buyer sees quality_score; supplier does not.""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token) - - with patch.object(proc_card, "run", _fake_run_deal): - _submit(client, user_token, dataset_id) - - buyer_result = client.get( - "/results/sub-001", headers={"X-Instance-Token": admin_token} - ).json() - supplier_result = client.get( - "/results/sub-001", headers={"X-Instance-Token": user_token} - ).json() - - assert "quality_score" in buyer_result - assert "hard_constraints_pass" in buyer_result - assert "quality_score" not in supplier_result - assert "hard_constraints_pass" not in supplier_result - - # Both should see proposed_payment and deal - assert "proposed_payment" in supplier_result - assert "deal" in supplier_result - - -def test_procurement_double_renegotiate_success(client): - """Both renegotiate with overlapping terms → authorized.""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token) - - with patch.object(proc_card, "run", _fake_run_deal): - _submit(client, user_token, dataset_id) - - # Buyer renegotiates down to 3000 - r = client.post("/respond", json={ - "submission_id": "sub-001", - "action": "renegotiate", - "revised_value": 3000.0, - }, headers={"X-Instance-Token": admin_token}) - assert r.status_code == 200 - assert r.json()["settlement_status"] == "awaiting_counterparty" - - # Supplier renegotiates reserve down to 2500 (< buyer's 3000 → deal) - r = client.post("/respond", json={ - "submission_id": "sub-001", - "action": "renegotiate", - "revised_value": 2500.0, - }, headers={"X-Instance-Token": user_token}) - assert r.status_code == 200 - assert r.json()["settlement_status"] == "authorized" - - r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) - assert r.json()["proposed_payment"] == 3000.0 - - -def test_procurement_double_renegotiate_failure(client): - """Both renegotiate but terms don't meet → rejected.""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token) - - with patch.object(proc_card, "run", _fake_run_deal): - _submit(client, user_token, dataset_id) - - client.post("/respond", json={ - "submission_id": "sub-001", "action": "renegotiate", "revised_value": 1000.0, - }, headers={"X-Instance-Token": admin_token}) - - r = client.post("/respond", json={ - "submission_id": "sub-001", "action": "renegotiate", "revised_value": 2000.0, - }, headers={"X-Instance-Token": user_token}) - assert r.json()["settlement_status"] == "rejected" - - -def test_procurement_buyer_accept_supplier_renegotiate(client): - """Buyer accepts, supplier renegotiates → authorized (acceptor's bound honored).""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token) - - with patch.object(proc_card, "run", _fake_run_deal): - _submit(client, user_token, dataset_id) - - client.post("/respond", json={ - "submission_id": "sub-001", "action": "accept", - }, headers={"X-Instance-Token": admin_token}) - - r = client.post("/respond", json={ - "submission_id": "sub-001", "action": "renegotiate", "revised_value": 4000.0, - }, headers={"X-Instance-Token": user_token}) - assert r.json()["settlement_status"] == "authorized" - - -def test_procurement_second_renegotiation_rejected(client): - """Second renegotiation attempt returns 422.""" - instance_id, admin_token = _init_procurement(client) - user_token = _register(client, instance_id) - dataset_id = _upload(client, user_token) - - with patch.object(proc_card, "run", _fake_run_deal): - _submit(client, user_token, dataset_id) - - # First renegotiation - client.post("/respond", json={ - "submission_id": "sub-001", "action": "renegotiate", "revised_value": 3000.0, - }, headers={"X-Instance-Token": admin_token}) - client.post("/respond", json={ - "submission_id": "sub-001", "action": "renegotiate", "revised_value": 2500.0, - }, headers={"X-Instance-Token": user_token}) - - # Attempt second renegotiation — should fail - r = client.post("/respond", json={ - "submission_id": "sub-001", "action": "renegotiate", "revised_value": 2000.0, - }, headers={"X-Instance-Token": admin_token}) - assert r.status_code == 422 - - -def test_procurement_missing_token_401(client): - """No token header → 401.""" - r = client.post("/submit", json={ - "submission_id": "sub-001", "dataset_id": "x", - "dataset_name": "x.csv", "reserve_price": 100.0, - }) - assert r.status_code == 401 - - -def test_procurement_user_cannot_see_other_submission(client): - """User token cannot view a result it didn't submit.""" - instance_id, admin_token = _init_procurement(client) - user_a = _register(client, instance_id) - user_b = _register(client, instance_id) - - dataset_id = _upload(client, user_a) - with patch.object(proc_card, "run", _fake_run_deal): - _submit(client, user_a, dataset_id, sub_id="sub-a") - - r = client.get("/results/sub-a", headers={"X-Instance-Token": user_b}) - assert r.status_code == 403 - - -def test_procurement_upload_without_csv_returns_422(client): - """Upload with no csv_file field → 422.""" - instance_id, _ = _init_procurement(client) - user_token = _register(client, instance_id) - - r = client.post( - "/upload", - files={"metadata_file": ("meta.json", b"{}", "application/json")}, - headers={"X-Instance-Token": user_token}, - ) - assert r.status_code == 422 - - -def test_procurement_skill_appears_in_skills_list(client): - """New skill is registered and visible via GET /skills.""" - r = client.get("/skills") - assert r.status_code == 200 - names = [s["name"] for s in r.json()["skills"]] - assert "confidential_data_procurement" in names From e9fe1c4e563f1c3952aa6afe1707848216631264 Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 17:01:26 +0530 Subject: [PATCH 03/22] Swap in-memory dicts for SQLite-backed storage - Add storage/ package with module-level functions for instances, submissions, results, tokens, registrations - Schema includes evaluation_runs and attestations tables stubbed for phases 5 and 8 - routes.py: drop _instances/_submissions/_results/_tokens/_registrations module-level dicts, route through storage.* instead - main.py: call storage.init_db() at startup - tests: fixture now calls storage.reset_all(); tests use :memory: DB - _resolve_token now stashes the raw token in the returned dict so /submit can call add_submission_to_token() DB path: env CONCLAVE_DB_PATH (default ./data/conclave.db). Tests use in-memory. Phase 2 of pivot/agent-skill. All 52 tests pass. --- .gitignore | 2 + api/routes.py | 117 +++++++------- data/.gitkeep | 0 main.py | 3 + storage/__init__.py | 60 +++++++ storage/sqlite.py | 370 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_e2e.py | 46 +++--- 7 files changed, 506 insertions(+), 92 deletions(-) create mode 100644 data/.gitkeep create mode 100644 storage/__init__.py create mode 100644 storage/sqlite.py diff --git a/.gitignore b/.gitignore index cd8d950..35f9c3d 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ scripts/ tests/fixtures/ .agents/ skills-lock.json +data/*.db +data/*.db-* diff --git a/api/routes.py b/api/routes.py index f8b9b94..644dc14 100644 --- a/api/routes.py +++ b/api/routes.py @@ -2,7 +2,6 @@ import asyncio import logging import secrets -import uuid from datetime import datetime from functools import partial @@ -10,28 +9,12 @@ logger = logging.getLogger(__name__) -from core.models import SkillResponse +import storage +from core.models import OperatorConfig, SkillResponse from skills.router import SkillRouter router = APIRouter() -# Instance-scoped in-memory stores -_instances: dict[str, dict] = {} -# instance_id -> {skill_name, config, threshold, conversation[], triggered} - -_submissions: dict[str, dict] = {} -# instance_id -> {submission_id -> raw_dict} - -_results: dict[str, dict] = {} -# instance_id -> {submission_id -> result_dict} - -_tokens: dict[str, dict] = {} -# token_string -> {instance_id, role, submission_ids: set[str]} - -_registrations: dict[str, dict] = {} -# instance_id -> {supabase_user_id -> token_string} -# prevents a single Supabase identity from registering twice for the same instance - _skill_router = SkillRouter() @@ -44,37 +27,42 @@ def register_skills(): # --- Helpers --- def _resolve_token(request: Request) -> dict: - """Read X-Instance-Token header and resolve to {instance_id, role}.""" + """Read X-Instance-Token header and resolve to {instance_id, role, submission_ids, ...}.""" token = request.headers.get("X-Instance-Token") if not token: raise HTTPException(status_code=401, detail="X-Instance-Token header required") - if token not in _tokens: + info = storage.get_token(token) + if info is None: raise HTTPException(status_code=403, detail="Invalid or expired token") - return _tokens[token] + info["_raw_token"] = token + return info async def _run_pipeline(instance_id: str) -> int: """Validate submissions, invoke skill pipeline, store results. Returns result count.""" - inst = _instances[instance_id] + inst = storage.get_instance(instance_id) + if inst is None: + raise HTTPException(status_code=404, detail="Instance not found") card = _skill_router.get_card(inst["skill_name"]) - subs = _submissions.get(instance_id, {}) + subs = storage.list_submissions(instance_id) try: inputs = [card.input_model(**s) for s in subs.values()] except Exception as e: raise HTTPException(status_code=422, detail=f"Submission validation failed: {e}") + config = OperatorConfig(**inst["config"]) if isinstance(inst["config"], dict) else inst["config"] + loop = asyncio.get_event_loop() response: SkillResponse = await loop.run_in_executor( None, - partial(_skill_router.invoke, inst["skill_name"], inputs=inputs, params=inst["config"]), + partial(_skill_router.invoke, inst["skill_name"], inputs=inputs, params=config), ) - _results.setdefault(instance_id, {}) for r in response.results: - _results[instance_id][r["submission_id"]] = r + storage.upsert_result(instance_id, r["submission_id"], r) - inst["triggered"] = True + storage.set_instance_triggered(instance_id, True) return len(response.results) @@ -88,10 +76,10 @@ def register_user(body: dict): Each call returns a fresh token — ownership of submitted results is tracked per token. """ instance_id = body.get("instance_id", "").strip() - if not instance_id or instance_id not in _instances: + if not instance_id or not storage.has_instance(instance_id): raise HTTPException(status_code=404, detail="Instance not found") token = secrets.token_urlsafe(16) - _tokens[token] = {"instance_id": instance_id, "role": "user", "submission_ids": set()} + storage.create_token(token, instance_id, role="user") return {"user_token": token} @@ -111,7 +99,7 @@ def auth_send_otp(body: dict): if not email: raise HTTPException(status_code=422, detail="email is required") - if not instance_id or instance_id not in _instances: + if not instance_id or not storage.has_instance(instance_id): raise HTTPException(status_code=404, detail="Instance not found") try: @@ -140,7 +128,7 @@ def auth_verify_token(body: dict): if not access_token: raise HTTPException(status_code=422, detail="access_token is required") - if not instance_id or instance_id not in _instances: + if not instance_id or not storage.has_instance(instance_id): raise HTTPException(status_code=404, detail="Instance not found") try: @@ -156,13 +144,13 @@ def auth_verify_token(body: dict): except Exception as e: raise HTTPException(status_code=401, detail=f"Token validation failed: {e}") - instance_reg = _registrations.setdefault(instance_id, {}) - if supabase_user_id in instance_reg: - return {"user_token": instance_reg[supabase_user_id]} + existing = storage.get_registration_token(instance_id, supabase_user_id) + if existing: + return {"user_token": existing} user_token = secrets.token_urlsafe(16) - _tokens[user_token] = {"instance_id": instance_id, "role": "user", "submission_ids": set(), "supabase_user_id": supabase_user_id} - instance_reg[supabase_user_id] = user_token + storage.create_token(user_token, instance_id, role="user", supabase_user_id=supabase_user_id) + storage.set_registration_token(instance_id, supabase_user_id, user_token) return {"user_token": user_token} @@ -183,7 +171,7 @@ def auth_verify_otp(body: dict): if not email or not token: raise HTTPException(status_code=422, detail="email and token are required") - if not instance_id or instance_id not in _instances: + if not instance_id or not storage.has_instance(instance_id): raise HTTPException(status_code=404, detail="Instance not found") try: @@ -191,15 +179,13 @@ def auth_verify_otp(body: dict): except Exception as e: raise HTTPException(status_code=401, detail=f"OTP verification failed: {e}") - # Idempotent: return existing token if this user already registered for this instance - instance_reg = _registrations.setdefault(instance_id, {}) - if supabase_user_id in instance_reg: - existing_token = instance_reg[supabase_user_id] - return {"user_token": existing_token} + existing = storage.get_registration_token(instance_id, supabase_user_id) + if existing: + return {"user_token": existing} user_token = secrets.token_urlsafe(16) - _tokens[user_token] = {"instance_id": instance_id, "role": "user", "submission_ids": set(), "supabase_user_id": supabase_user_id} - instance_reg[supabase_user_id] = user_token + storage.create_token(user_token, instance_id, role="user", supabase_user_id=supabase_user_id) + storage.set_registration_token(instance_id, supabase_user_id, user_token) return {"user_token": user_token} @@ -213,25 +199,24 @@ def get_me(request: Request): @router.get("/instances/{instance_id}") def get_instance(instance_id: str): """Check if an instance exists. Used by the frontend to validate a participant URL.""" - if instance_id not in _instances: + inst = storage.get_instance(instance_id) + if inst is None: raise HTTPException(status_code=404, detail="Instance not found or expired") - inst = _instances[instance_id] return { "instance_id": instance_id, "skill_name": inst["skill_name"], "triggered": inst["triggered"], - "submissions": len(_submissions.get(instance_id, {})), + "submissions": storage.count_submissions(instance_id), "threshold": inst["threshold"], } @router.get("/health") def health(): - total_subs = sum(len(s) for s in _submissions.values()) return { "status": "ok", - "instances": len(_instances), - "submissions": total_subs, + "instances": storage.count_instances(), + "submissions": storage.count_submissions(), "skills": _skill_router.list_skills(), } @@ -245,7 +230,10 @@ async def submit(submission: dict, request: Request): """ token_info = _resolve_token(request) instance_id = token_info["instance_id"] - skill_name = _instances[instance_id]["skill_name"] + inst = storage.get_instance(instance_id) + if inst is None: + raise HTTPException(status_code=404, detail="Instance not found") + skill_name = inst["skill_name"] card = _skill_router.get_card(skill_name) try: @@ -257,10 +245,10 @@ async def submit(submission: dict, request: Request): submission = validated.model_dump() # ensure stored dict is normalized submission["_submitted_at"] = datetime.utcnow().isoformat() + "Z" - _submissions[instance_id][sid] = submission - token_info["submission_ids"].add(sid) - count = len(_submissions[instance_id]) - threshold = _instances[instance_id]["threshold"] + storage.upsert_submission(instance_id, sid, submission) + storage.add_submission_to_token(token_info["_raw_token"], sid) + count = storage.count_submissions(instance_id) + threshold = inst["threshold"] # CONCURRENCY NOTE: This threshold check is not atomic. Concurrent submissions could # both see count >= threshold and trigger _run_pipeline twice. This is a non-issue in @@ -298,7 +286,7 @@ def get_submissions(request: Request): raise HTTPException(status_code=403, detail="Only admin can view submission metadata") instance_id = token_info["instance_id"] - subs = _submissions.get(instance_id, {}) + subs = storage.list_submissions(instance_id) meta = [] for sub in subs.values(): @@ -321,7 +309,7 @@ async def trigger(request: Request): raise HTTPException(status_code=403, detail="Only admin can trigger manually") instance_id = token_info["instance_id"] - if not _submissions.get(instance_id): + if storage.count_submissions(instance_id) == 0: raise HTTPException(status_code=400, detail="No submissions to analyze") count = await _run_pipeline(instance_id) @@ -336,7 +324,7 @@ def get_all_results(request: Request): raise HTTPException(status_code=403, detail="Only admin can view all results") instance_id = token_info["instance_id"] - return {"results": list(_results.get(instance_id, {}).values())} + return {"results": storage.list_results(instance_id)} @router.get("/results/{submission_id}") @@ -350,21 +338,20 @@ def get_results(submission_id: str, request: Request): instance_id = token_info["instance_id"] role = token_info["role"] - instance_results = _results.get(instance_id, {}) - - if submission_id not in instance_results: + result = storage.get_result(instance_id, submission_id) + if result is None: raise HTTPException(status_code=404, detail="Result not found or not yet available") if role == "user": if submission_id not in token_info["submission_ids"]: raise HTTPException(status_code=403, detail="Access denied: submission not owned by this token") # Participant view: filtered to skill-declared user_output_keys - card = _skill_router.get_card(_instances[instance_id]["skill_name"]) - result = instance_results[submission_id] + inst = storage.get_instance(instance_id) + card = _skill_router.get_card(inst["skill_name"]) return {k: result[k] for k in card.user_output_keys if k in result} # admin: unrestricted access within the instance - return instance_results[submission_id] + return result @router.get("/skills") diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py index 8018b23..ad22d3f 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,12 @@ from fastapi import FastAPI, Request from fastapi.responses import Response from api.routes import router, register_skills +import storage app = FastAPI(title="Conclave — NDAI Skills Service") +storage.init_db() + @app.middleware("http") async def cors_middleware(request: Request, call_next): diff --git a/storage/__init__.py b/storage/__init__.py new file mode 100644 index 0000000..a229947 --- /dev/null +++ b/storage/__init__.py @@ -0,0 +1,60 @@ +"""SQLite-backed persistent storage for Conclave. + +Replaces the in-memory dicts that previously lived in api/routes.py. +All state survives enclave restart. + +Path resolution: env var CONCLAVE_DB_PATH (default ./data/conclave.db). +Tests set CONCLAVE_DB_PATH=:memory: via the storage fixture. +""" +from storage.sqlite import ( + init_db, + reset_all, + # instances + create_instance, + get_instance, + has_instance, + set_instance_triggered, + list_instances, + count_instances, + # submissions + upsert_submission, + get_submission, + list_submissions, + count_submissions, + # results + upsert_result, + get_result, + list_results, + # tokens + create_token, + get_token, + has_token, + add_submission_to_token, + # registrations + get_registration_token, + set_registration_token, +) + +__all__ = [ + "init_db", + "reset_all", + "create_instance", + "get_instance", + "has_instance", + "set_instance_triggered", + "list_instances", + "count_instances", + "upsert_submission", + "get_submission", + "list_submissions", + "count_submissions", + "upsert_result", + "get_result", + "list_results", + "create_token", + "get_token", + "has_token", + "add_submission_to_token", + "get_registration_token", + "set_registration_token", +] diff --git a/storage/sqlite.py b/storage/sqlite.py new file mode 100644 index 0000000..8199b4e --- /dev/null +++ b/storage/sqlite.py @@ -0,0 +1,370 @@ +"""SQLite-backed persistent storage. + +Single connection, JSON `data` columns for variant payloads, typed columns +for the fields routes.py queries against (instance_id, submission_id, token, +role, etc.). + +Schema includes evaluation_runs and attestations tables that aren't yet used +by the API — they're stubbed here so phases 5 (scheduler) and 8 (Solana +attestation) don't need a migration step. +""" +from __future__ import annotations + +import json +import os +import sqlite3 +import threading +from typing import Any + +_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "conclave.db") +_DB_PATH = os.environ.get("CONCLAVE_DB_PATH", _DEFAULT_PATH) + +_conn: sqlite3.Connection | None = None +_lock = threading.Lock() + + +def _get_conn() -> sqlite3.Connection: + global _conn + if _conn is None: + with _lock: + if _conn is None: + if _DB_PATH != ":memory:": + os.makedirs(os.path.dirname(_DB_PATH), exist_ok=True) + _conn = sqlite3.connect( + _DB_PATH, + check_same_thread=False, + isolation_level=None, # autocommit + ) + _conn.row_factory = sqlite3.Row + if _DB_PATH != ":memory:": + _conn.execute("PRAGMA journal_mode=WAL") + _conn.execute("PRAGMA foreign_keys=ON") + _init_schema(_conn) + return _conn + + +def _init_schema(conn: sqlite3.Connection) -> None: + conn.executescript( + """ + CREATE TABLE IF NOT EXISTS instances ( + instance_id TEXT PRIMARY KEY, + skill_name TEXT NOT NULL, + data TEXT NOT NULL, -- JSON: {config, threshold, triggered, ...} + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS submissions ( + instance_id TEXT NOT NULL, + submission_id TEXT NOT NULL, + data TEXT NOT NULL, -- JSON: full submission dict + submitted_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + PRIMARY KEY (instance_id, submission_id) + ); + + CREATE TABLE IF NOT EXISTS results ( + instance_id TEXT NOT NULL, + submission_id TEXT NOT NULL, + data TEXT NOT NULL, -- JSON: full result dict + computed_at TEXT NOT NULL, + PRIMARY KEY (instance_id, submission_id) + ); + + CREATE TABLE IF NOT EXISTS tokens ( + token TEXT PRIMARY KEY, + instance_id TEXT NOT NULL, + role TEXT NOT NULL, -- 'admin' or 'user' + data TEXT NOT NULL, -- JSON: {submission_ids: [...], supabase_user_id?} + created_at TEXT NOT NULL, + expires_at TEXT + ); + + CREATE TABLE IF NOT EXISTS registrations ( + instance_id TEXT NOT NULL, + supabase_user_id TEXT NOT NULL, + token TEXT NOT NULL, + PRIMARY KEY (instance_id, supabase_user_id) + ); + + CREATE TABLE IF NOT EXISTS evaluation_runs ( + run_id TEXT PRIMARY KEY, + instance_id TEXT NOT NULL, + ran_at TEXT NOT NULL, + submission_count INTEGER NOT NULL, + data TEXT -- JSON: aggregate snapshot for this tick + ); + + CREATE TABLE IF NOT EXISTS attestations ( + instance_id TEXT NOT NULL, + report_hash TEXT NOT NULL, + tx_sig TEXT, + chain TEXT NOT NULL DEFAULT 'solana-devnet', + published_at TEXT NOT NULL, + PRIMARY KEY (instance_id, report_hash) + ); + """ + ) + + +def init_db() -> None: + """Initialize the schema. Called at app startup.""" + _get_conn() + + +def reset_all() -> None: + """Wipe every table. Used by test fixtures.""" + conn = _get_conn() + conn.executescript( + """ + DELETE FROM instances; + DELETE FROM submissions; + DELETE FROM results; + DELETE FROM tokens; + DELETE FROM registrations; + DELETE FROM evaluation_runs; + DELETE FROM attestations; + """ + ) + + +def _now() -> str: + from datetime import datetime + return datetime.utcnow().isoformat() + "Z" + + +def _to_jsonable(value: Any) -> Any: + """Convert pydantic models, sets, etc. into JSON-serializable structures.""" + if hasattr(value, "model_dump"): + return value.model_dump() + if isinstance(value, set): + return list(value) + if isinstance(value, dict): + return {k: _to_jsonable(v) for k, v in value.items()} + if isinstance(value, list): + return [_to_jsonable(v) for v in value] + return value + + +# --- Instances --- + +def create_instance(instance_id: str, skill_name: str, config: dict, threshold: int) -> None: + """Insert a new instance. config is a JSON-serializable dict (typically OperatorConfig.model_dump()).""" + payload = { + "config": _to_jsonable(config), + "threshold": int(threshold), + "triggered": False, + } + now = _now() + _get_conn().execute( + "INSERT INTO instances (instance_id, skill_name, data, created_at, updated_at) VALUES (?, ?, ?, ?, ?)", + (instance_id, skill_name, json.dumps(payload), now, now), + ) + + +def get_instance(instance_id: str) -> dict | None: + """Return {skill_name, config, threshold, triggered} or None if not found.""" + row = _get_conn().execute( + "SELECT skill_name, data FROM instances WHERE instance_id = ?", (instance_id,) + ).fetchone() + if row is None: + return None + payload = json.loads(row["data"]) + return { + "instance_id": instance_id, + "skill_name": row["skill_name"], + "config": payload.get("config"), + "threshold": payload.get("threshold"), + "triggered": payload.get("triggered", False), + } + + +def has_instance(instance_id: str) -> bool: + row = _get_conn().execute( + "SELECT 1 FROM instances WHERE instance_id = ?", (instance_id,) + ).fetchone() + return row is not None + + +def set_instance_triggered(instance_id: str, triggered: bool = True) -> None: + inst = get_instance(instance_id) + if inst is None: + raise KeyError(f"Instance {instance_id} not found") + payload = { + "config": inst["config"], + "threshold": inst["threshold"], + "triggered": triggered, + } + _get_conn().execute( + "UPDATE instances SET data = ?, updated_at = ? WHERE instance_id = ?", + (json.dumps(payload), _now(), instance_id), + ) + + +def list_instances() -> list[dict]: + rows = _get_conn().execute( + "SELECT instance_id, skill_name, data FROM instances" + ).fetchall() + out = [] + for row in rows: + payload = json.loads(row["data"]) + out.append({ + "instance_id": row["instance_id"], + "skill_name": row["skill_name"], + "config": payload.get("config"), + "threshold": payload.get("threshold"), + "triggered": payload.get("triggered", False), + }) + return out + + +def count_instances() -> int: + return _get_conn().execute("SELECT COUNT(*) FROM instances").fetchone()[0] + + +# --- Submissions --- + +def upsert_submission(instance_id: str, submission_id: str, data: dict) -> None: + """Insert or update a submission. _submitted_at is preserved on update; updated_at always advances.""" + serialized = json.dumps(_to_jsonable(data)) + submitted_at = data.get("_submitted_at") or _now() + now = _now() + _get_conn().execute( + """ + INSERT INTO submissions (instance_id, submission_id, data, submitted_at, updated_at) + VALUES (?, ?, ?, ?, ?) + ON CONFLICT(instance_id, submission_id) DO UPDATE SET + data = excluded.data, + updated_at = excluded.updated_at + """, + (instance_id, submission_id, serialized, submitted_at, now), + ) + + +def get_submission(instance_id: str, submission_id: str) -> dict | None: + row = _get_conn().execute( + "SELECT data FROM submissions WHERE instance_id = ? AND submission_id = ?", + (instance_id, submission_id), + ).fetchone() + if row is None: + return None + return json.loads(row["data"]) + + +def list_submissions(instance_id: str) -> dict[str, dict]: + rows = _get_conn().execute( + "SELECT submission_id, data FROM submissions WHERE instance_id = ?", + (instance_id,), + ).fetchall() + return {row["submission_id"]: json.loads(row["data"]) for row in rows} + + +def count_submissions(instance_id: str | None = None) -> int: + if instance_id is None: + return _get_conn().execute("SELECT COUNT(*) FROM submissions").fetchone()[0] + return _get_conn().execute( + "SELECT COUNT(*) FROM submissions WHERE instance_id = ?", (instance_id,) + ).fetchone()[0] + + +# --- Results --- + +def upsert_result(instance_id: str, submission_id: str, data: dict) -> None: + serialized = json.dumps(_to_jsonable(data)) + _get_conn().execute( + """ + INSERT INTO results (instance_id, submission_id, data, computed_at) + VALUES (?, ?, ?, ?) + ON CONFLICT(instance_id, submission_id) DO UPDATE SET + data = excluded.data, + computed_at = excluded.computed_at + """, + (instance_id, submission_id, serialized, _now()), + ) + + +def get_result(instance_id: str, submission_id: str) -> dict | None: + row = _get_conn().execute( + "SELECT data FROM results WHERE instance_id = ? AND submission_id = ?", + (instance_id, submission_id), + ).fetchone() + if row is None: + return None + return json.loads(row["data"]) + + +def list_results(instance_id: str) -> list[dict]: + rows = _get_conn().execute( + "SELECT data FROM results WHERE instance_id = ?", (instance_id,) + ).fetchall() + return [json.loads(row["data"]) for row in rows] + + +# --- Tokens --- + +def create_token(token: str, instance_id: str, role: str, supabase_user_id: str | None = None) -> None: + payload: dict = {"submission_ids": []} + if supabase_user_id: + payload["supabase_user_id"] = supabase_user_id + _get_conn().execute( + "INSERT INTO tokens (token, instance_id, role, data, created_at) VALUES (?, ?, ?, ?, ?)", + (token, instance_id, role, json.dumps(payload), _now()), + ) + + +def get_token(token: str) -> dict | None: + row = _get_conn().execute( + "SELECT instance_id, role, data FROM tokens WHERE token = ?", (token,) + ).fetchone() + if row is None: + return None + payload = json.loads(row["data"]) + return { + "instance_id": row["instance_id"], + "role": row["role"], + "submission_ids": set(payload.get("submission_ids", [])), + "supabase_user_id": payload.get("supabase_user_id"), + } + + +def has_token(token: str) -> bool: + row = _get_conn().execute( + "SELECT 1 FROM tokens WHERE token = ?", (token,) + ).fetchone() + return row is not None + + +def add_submission_to_token(token: str, submission_id: str) -> None: + info = get_token(token) + if info is None: + raise KeyError(f"Token {token} not found") + sids = info["submission_ids"] + sids.add(submission_id) + payload = { + "submission_ids": sorted(sids), + } + if info.get("supabase_user_id"): + payload["supabase_user_id"] = info["supabase_user_id"] + _get_conn().execute( + "UPDATE tokens SET data = ? WHERE token = ?", (json.dumps(payload), token) + ) + + +# --- Registrations --- + +def get_registration_token(instance_id: str, supabase_user_id: str) -> str | None: + row = _get_conn().execute( + "SELECT token FROM registrations WHERE instance_id = ? AND supabase_user_id = ?", + (instance_id, supabase_user_id), + ).fetchone() + return row["token"] if row else None + + +def set_registration_token(instance_id: str, supabase_user_id: str, token: str) -> None: + _get_conn().execute( + """ + INSERT INTO registrations (instance_id, supabase_user_id, token) VALUES (?, ?, ?) + ON CONFLICT(instance_id, supabase_user_id) DO UPDATE SET token = excluded.token + """, + (instance_id, supabase_user_id, token), + ) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index b255a53..c8cad6e 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -17,6 +17,9 @@ directly via _setup_instance() until typed POST /instances lands in phase 4. """ from __future__ import annotations +import os +os.environ.setdefault("CONCLAVE_DB_PATH", ":memory:") + import secrets import uuid @@ -24,7 +27,7 @@ from unittest.mock import patch from fastapi.testclient import TestClient -import api.routes as routes +import storage from core.models import OperatorConfig, SkillResponse from skills.hackathon_novelty import skill_card @@ -51,31 +54,24 @@ def _fake_run_skill(inputs, params): def _setup_instance(threshold=5): - """Seed an instance directly in routes._instances. Returns (instance_id, admin_token). + """Seed an instance directly in storage. Returns (instance_id, admin_token). Replaces the now-deleted /init flow. Phase 4 will introduce typed POST /instances and these tests will be updated to call it instead.""" instance_id = str(uuid.uuid4()) - routes._instances[instance_id] = { - "skill_name": "hackathon_novelty", - "config": OperatorConfig( - criteria={"originality": 0.5, "feasibility": 0.5}, - guidelines="", - instance_id=instance_id, - ), - "threshold": threshold, - "conversation": [], - "triggered": False, - } - routes._submissions[instance_id] = {} - routes._results[instance_id] = {} - + config = OperatorConfig( + criteria={"originality": 0.5, "feasibility": 0.5}, + guidelines="", + instance_id=instance_id, + ) + storage.create_instance( + instance_id=instance_id, + skill_name="hackathon_novelty", + config=config.model_dump(), + threshold=threshold, + ) admin_token = secrets.token_urlsafe(16) - routes._tokens[admin_token] = { - "instance_id": instance_id, - "role": "admin", - "submission_ids": set(), - } + storage.create_token(admin_token, instance_id, role="admin") return instance_id, admin_token @@ -83,12 +79,8 @@ def _setup_instance(threshold=5): @pytest.fixture(autouse=True) def clear_stores(): - """Reset all in-memory API state before each test.""" - routes._instances.clear() - routes._submissions.clear() - routes._results.clear() - routes._tokens.clear() - routes._registrations.clear() + """Reset all storage tables before each test.""" + storage.reset_all() yield From 400225ed31d93a9590f9fe245be180b1551d994b Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 17:03:48 +0530 Subject: [PATCH 04/22] Add Bearer auth and POST /generate-token - _resolve_token now accepts either Authorization: Bearer or the legacy X-Instance-Token header. Bearer is the canonical path used by the agent skill. - New endpoint POST /generate-token returns {token, expires_at} matching Colosseum Copilot's PAT shape. /register kept for the existing web UI. - expires_at is null in v1 (no token expiry yet). - URL-as-access-control: anyone with the enclave URL can mint a token. Sybil prevention is deferred per plan. Phase 3 of pivot/agent-skill. 58 tests pass. --- api/routes.py | 35 ++++++++++++++++++++----- tests/test_e2e.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/api/routes.py b/api/routes.py index 644dc14..cf0b7b8 100644 --- a/api/routes.py +++ b/api/routes.py @@ -27,10 +27,18 @@ def register_skills(): # --- Helpers --- def _resolve_token(request: Request) -> dict: - """Read X-Instance-Token header and resolve to {instance_id, role, submission_ids, ...}.""" - token = request.headers.get("X-Instance-Token") + """Resolve an instance token from either Authorization: Bearer or X-Instance-Token. + + Bearer is the canonical convention used by the agent skill. X-Instance-Token is preserved + for the web UI.""" + token: str | None = None + auth = request.headers.get("Authorization") or request.headers.get("authorization") + if auth and auth.startswith("Bearer "): + token = auth[len("Bearer "):].strip() if not token: - raise HTTPException(status_code=401, detail="X-Instance-Token header required") + token = request.headers.get("X-Instance-Token") + if not token: + raise HTTPException(status_code=401, detail="Authorization (Bearer) or X-Instance-Token header required") info = storage.get_token(token) if info is None: raise HTTPException(status_code=403, detail="Invalid or expired token") @@ -71,9 +79,8 @@ async def _run_pipeline(instance_id: str) -> int: @router.post("/register") def register_user(body: dict): """ - Issue a unique user token for a specific instance. - Participants call this with the instance_id provided by the operator. - Each call returns a fresh token — ownership of submitted results is tracked per token. + Issue a unique user token for a specific instance (legacy shape used by the web UI). + Returns {user_token}. New integrations should use POST /generate-token. """ instance_id = body.get("instance_id", "").strip() if not instance_id or not storage.has_instance(instance_id): @@ -83,6 +90,22 @@ def register_user(body: dict): return {"user_token": token} +@router.post("/generate-token") +def generate_token(body: dict): + """ + Issue a participant token for an instance. + Canonical endpoint for the agent skill — mirrors Colosseum Copilot's PAT issuance. + URL-as-access-control: anyone with the unique enclave URL can mint a token. + Sybil prevention is intentionally deferred (see plans/conclave_skill_plan.md). + """ + instance_id = body.get("instance_id", "").strip() + if not instance_id or not storage.has_instance(instance_id): + raise HTTPException(status_code=404, detail="Instance not found") + token = secrets.token_urlsafe(16) + storage.create_token(token, instance_id, role="user") + return {"token": token, "expires_at": None} + + @router.post("/auth/send-otp") def auth_send_otp(body: dict): """ diff --git a/tests/test_e2e.py b/tests/test_e2e.py index c8cad6e..3d87c60 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -200,6 +200,72 @@ def test_register_unknown_instance_returns_404(client): assert r.status_code == 404 +def test_generate_token_issues_bearer_compatible_token(client): + """POST /generate-token returns {token, expires_at} and the token works with Bearer auth.""" + instance_id, _ = _setup_instance() + + r = client.post("/generate-token", json={"instance_id": instance_id}) + assert r.status_code == 200 + body = r.json() + assert "token" in body + assert body["expires_at"] is None + token = body["token"] + + # Token works with Bearer auth on a protected endpoint + r = client.get("/me", headers={"Authorization": f"Bearer {token}"}) + assert r.status_code == 200 + assert r.json()["role"] == "user" + assert r.json()["instance_id"] == instance_id + + +def test_generate_token_unknown_instance_returns_404(client): + r = client.post("/generate-token", json={"instance_id": "does-not-exist"}) + assert r.status_code == 404 + + +def test_bearer_and_x_instance_token_both_accepted(client): + """Either Bearer or X-Instance-Token header resolves a token.""" + instance_id, admin_token = _setup_instance() + + r = client.get("/me", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + + r = client.get("/me", headers={"Authorization": f"Bearer {admin_token}"}) + assert r.status_code == 200 + + +def test_no_auth_headers_returns_401(client): + r = client.get("/me") + assert r.status_code == 401 + + +def test_invalid_bearer_returns_403(client): + r = client.get("/me", headers={"Authorization": "Bearer not-a-real-token"}) + assert r.status_code == 403 + + +def test_submit_via_bearer(client): + """Full submit flow works via Bearer auth (the agent-skill path).""" + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, _ = _setup_instance(threshold=2) + + r = client.post("/generate-token", json={"instance_id": instance_id}) + token = r.json()["token"] + + for i in (1, 2): + r = client.post( + "/submit", + json={"submission_id": f"sub_{i}", "idea_text": f"idea {i}"}, + headers={"Authorization": f"Bearer {token}"}, + ) + assert r.status_code == 200 + + # Result accessible via Bearer + r = client.get("/results/sub_1", headers={"Authorization": f"Bearer {token}"}) + assert r.status_code == 200 + assert "novelty_score" in r.json() + + def test_health_endpoint(client): r = client.get("/health") assert r.status_code == 200 From c101faa0ccfbd50d0768542abf72e7573b6fe010 Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 17:06:41 +0530 Subject: [PATCH 05/22] Add typed POST /instances for operator setup - New endpoint accepts {name, end_date, evaluation_frequency, tracks[]} and returns {instance_id, admin_token, enclave_url}. - Add TrackConfig, CreateInstanceRequest, CreateInstanceResponse models. - Duration parser supports w/d/h/m/s units (e.g. "1w", "30m"). - Validates end_date in future and at least one track. - enclave_url comes from CONCLAVE_PUBLIC_URL env var or request.base_url. - Threshold set to 999_999 on creation; phase 5 scheduler will drive evaluation instead of count-based auto-trigger. - Storage create_instance now takes **fields kwargs that flow into the JSON data column. New fields (name, end_date, evaluation_frequency_seconds, tracks) are stored alongside config/threshold. Phase 4 of pivot/agent-skill. 62 tests pass. --- api/routes.py | 75 ++++++++++++++++++++++++++++++++++++++++++++-- core/models.py | 22 ++++++++++++++ storage/sqlite.py | 40 ++++++++----------------- tests/test_e2e.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 183 insertions(+), 30 deletions(-) diff --git a/api/routes.py b/api/routes.py index cf0b7b8..8cf2ae3 100644 --- a/api/routes.py +++ b/api/routes.py @@ -1,8 +1,10 @@ from __future__ import annotations import asyncio import logging +import os import secrets -from datetime import datetime +import uuid +from datetime import datetime, timezone from functools import partial from fastapi import APIRouter, HTTPException, Request @@ -10,11 +12,35 @@ logger = logging.getLogger(__name__) import storage -from core.models import OperatorConfig, SkillResponse +from core.models import ( + CreateInstanceRequest, + CreateInstanceResponse, + OperatorConfig, + SkillResponse, +) from skills.router import SkillRouter router = APIRouter() + +_DURATION_UNITS = {"w": 604800, "d": 86400, "h": 3600, "m": 60, "s": 1} + + +def _parse_duration(spec: str) -> int: + """Parse a duration string like '1w', '3d', '12h', '30m' into seconds.""" + if not spec or len(spec) < 2: + raise ValueError(f"invalid duration: {spec!r}") + unit = spec[-1].lower() + if unit not in _DURATION_UNITS: + raise ValueError(f"unknown duration unit {unit!r}; use one of w/d/h/m/s") + try: + n = int(spec[:-1]) + except ValueError as e: + raise ValueError(f"invalid duration number: {spec!r}") from e + if n <= 0: + raise ValueError(f"duration must be positive: {spec!r}") + return n * _DURATION_UNITS[unit] + _skill_router = SkillRouter() @@ -76,6 +102,51 @@ async def _run_pipeline(instance_id: str) -> int: # --- Endpoints --- +@router.post("/instances") +def create_instance_endpoint(body: CreateInstanceRequest, request: Request) -> CreateInstanceResponse: + """ + Create a new hackathon novelty instance. + Returns the unique enclave URL the operator shares with participants and an + admin token for the operator dashboard. + """ + now = datetime.now(timezone.utc) + end = body.end_date if body.end_date.tzinfo else body.end_date.replace(tzinfo=timezone.utc) + if end <= now: + raise HTTPException(status_code=422, detail="end_date must be in the future") + + try: + freq_seconds = _parse_duration(body.evaluation_frequency) + except ValueError as e: + raise HTTPException(status_code=422, detail=str(e)) + + instance_id = str(uuid.uuid4()) + config = OperatorConfig( + criteria={"originality": 0.5, "feasibility": 0.5}, + guidelines="", + instance_id=instance_id, + ) + storage.create_instance( + instance_id=instance_id, + skill_name="hackathon_novelty", + config=config.model_dump(), + threshold=999_999, # threshold-trigger disabled; phase 5 scheduler drives evaluation + name=body.name, + end_date=end.isoformat(), + evaluation_frequency_seconds=freq_seconds, + tracks=[t.model_dump() for t in body.tracks], + ) + + admin_token = secrets.token_urlsafe(16) + storage.create_token(admin_token, instance_id, role="admin") + + base = os.environ.get("CONCLAVE_PUBLIC_URL", str(request.base_url).rstrip("/")) + return CreateInstanceResponse( + instance_id=instance_id, + admin_token=admin_token, + enclave_url=base, + ) + + @router.post("/register") def register_user(body: dict): """ diff --git a/core/models.py b/core/models.py index 7372e52..66f92d6 100644 --- a/core/models.py +++ b/core/models.py @@ -1,5 +1,6 @@ from __future__ import annotations import uuid +from datetime import datetime from pydantic import BaseModel, Field from typing import Optional @@ -32,3 +33,24 @@ class SkillResponse(BaseModel): trace: Optional[list[dict]] = None enclave_signature: Optional[str] = None # added by infra side attestation_quote: Optional[str] = None # added by infra side + + +class TrackConfig(BaseModel): + """One track in a hackathon — name + markdown description used by the + track-alignment scoring layer (added in phase 6).""" + name: str + description_markdown: str + + +class CreateInstanceRequest(BaseModel): + """Typed operator setup payload for POST /instances.""" + name: str # hackathon display name (e.g. "Frontier 2026") + end_date: datetime + evaluation_frequency: str # e.g. "1w", "3d", "12h", "30m" + tracks: list[TrackConfig] = Field(min_length=1) + + +class CreateInstanceResponse(BaseModel): + instance_id: str + admin_token: str + enclave_url: str diff --git a/storage/sqlite.py b/storage/sqlite.py index 8199b4e..b76a82a 100644 --- a/storage/sqlite.py +++ b/storage/sqlite.py @@ -148,13 +148,14 @@ def _to_jsonable(value: Any) -> Any: # --- Instances --- -def create_instance(instance_id: str, skill_name: str, config: dict, threshold: int) -> None: - """Insert a new instance. config is a JSON-serializable dict (typically OperatorConfig.model_dump()).""" - payload = { - "config": _to_jsonable(config), - "threshold": int(threshold), - "triggered": False, - } +def create_instance(instance_id: str, skill_name: str, **fields: Any) -> None: + """Insert a new instance. All `fields` are stored in the JSON `data` column. + + Conventional fields used by routes.py: config (dict), threshold (int), + triggered (bool), name, end_date, evaluation_frequency_seconds, tracks. + """ + payload = {k: _to_jsonable(v) for k, v in fields.items()} + payload.setdefault("triggered", False) now = _now() _get_conn().execute( "INSERT INTO instances (instance_id, skill_name, data, created_at, updated_at) VALUES (?, ?, ?, ?, ?)", @@ -163,20 +164,14 @@ def create_instance(instance_id: str, skill_name: str, config: dict, threshold: def get_instance(instance_id: str) -> dict | None: - """Return {skill_name, config, threshold, triggered} or None if not found.""" + """Return {instance_id, skill_name, **stored_fields} or None if not found.""" row = _get_conn().execute( "SELECT skill_name, data FROM instances WHERE instance_id = ?", (instance_id,) ).fetchone() if row is None: return None payload = json.loads(row["data"]) - return { - "instance_id": instance_id, - "skill_name": row["skill_name"], - "config": payload.get("config"), - "threshold": payload.get("threshold"), - "triggered": payload.get("triggered", False), - } + return {"instance_id": instance_id, "skill_name": row["skill_name"], **payload} def has_instance(instance_id: str) -> bool: @@ -190,11 +185,8 @@ def set_instance_triggered(instance_id: str, triggered: bool = True) -> None: inst = get_instance(instance_id) if inst is None: raise KeyError(f"Instance {instance_id} not found") - payload = { - "config": inst["config"], - "threshold": inst["threshold"], - "triggered": triggered, - } + payload = {k: v for k, v in inst.items() if k not in ("instance_id", "skill_name")} + payload["triggered"] = triggered _get_conn().execute( "UPDATE instances SET data = ?, updated_at = ? WHERE instance_id = ?", (json.dumps(payload), _now(), instance_id), @@ -208,13 +200,7 @@ def list_instances() -> list[dict]: out = [] for row in rows: payload = json.loads(row["data"]) - out.append({ - "instance_id": row["instance_id"], - "skill_name": row["skill_name"], - "config": payload.get("config"), - "threshold": payload.get("threshold"), - "triggered": payload.get("triggered", False), - }) + out.append({"instance_id": row["instance_id"], "skill_name": row["skill_name"], **payload}) return out diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 3d87c60..6f0b402 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -69,7 +69,7 @@ def _setup_instance(threshold=5): skill_name="hackathon_novelty", config=config.model_dump(), threshold=threshold, - ) + ) # kwargs flow through to the JSON `data` column admin_token = secrets.token_urlsafe(16) storage.create_token(admin_token, instance_id, role="admin") return instance_id, admin_token @@ -244,6 +244,80 @@ def test_invalid_bearer_returns_403(client): assert r.status_code == 403 +def test_create_instance_typed(client): + """POST /instances with a valid typed body returns instance_id, admin_token, enclave_url.""" + from datetime import datetime, timedelta, timezone + future = (datetime.now(timezone.utc) + timedelta(days=14)).isoformat() + r = client.post( + "/instances", + json={ + "name": "Frontier 2026", + "end_date": future, + "evaluation_frequency": "1d", + "tracks": [ + {"name": "DeFi", "description_markdown": "Decentralized finance projects"}, + {"name": "AI", "description_markdown": "AI/ML applications"}, + ], + }, + ) + assert r.status_code == 200 + body = r.json() + assert "instance_id" in body + assert "admin_token" in body + assert "enclave_url" in body + + # The admin token works + r = client.get("/me", headers={"Authorization": f"Bearer {body['admin_token']}"}) + assert r.status_code == 200 + assert r.json()["role"] == "admin" + + +def test_create_instance_rejects_past_end_date(client): + from datetime import datetime, timedelta, timezone + past = (datetime.now(timezone.utc) - timedelta(days=1)).isoformat() + r = client.post( + "/instances", + json={ + "name": "Past Hackathon", + "end_date": past, + "evaluation_frequency": "1d", + "tracks": [{"name": "X", "description_markdown": "x"}], + }, + ) + assert r.status_code == 422 + assert "end_date" in r.json()["detail"] + + +def test_create_instance_rejects_bad_frequency(client): + from datetime import datetime, timedelta, timezone + future = (datetime.now(timezone.utc) + timedelta(days=14)).isoformat() + r = client.post( + "/instances", + json={ + "name": "Test", + "end_date": future, + "evaluation_frequency": "1y", # 'y' not a valid unit + "tracks": [{"name": "X", "description_markdown": "x"}], + }, + ) + assert r.status_code == 422 + + +def test_create_instance_requires_at_least_one_track(client): + from datetime import datetime, timedelta, timezone + future = (datetime.now(timezone.utc) + timedelta(days=14)).isoformat() + r = client.post( + "/instances", + json={ + "name": "Test", + "end_date": future, + "evaluation_frequency": "1d", + "tracks": [], + }, + ) + assert r.status_code == 422 + + def test_submit_via_bearer(client): """Full submit flow works via Bearer auth (the agent-skill path).""" with patch.object(skill_card, "run", _fake_run_skill): From ac2947f8d71d1afc32e390145ff86461477ed3c7 Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 19:19:17 +0530 Subject: [PATCH 06/22] Add periodic evaluation scheduler and drop threshold-based auto-trigger - New module infra/scheduler.py with one asyncio task per instance. Loop sleeps for evaluation_frequency_seconds, runs the pipeline (skipped if cohort empty), repeats until end_date. Final tick fires on the way out. - main.py uses lifespan to call scheduler.start_all() on startup and stop_all() on shutdown. Synchronous setup (storage.init_db, register_skills) runs at import so tests don't need lifespan. - POST /instances spins up the scheduler loop for the new instance. - POST /submit no longer auto-triggers the pipeline. Status response is "received" (was "received_pending" / "received_analysis_complete"). Pipeline runs only via the scheduler or admin POST /trigger. - Tests: e2e tests call /trigger explicitly. New test_scheduler.py covers empty-cohort skip, normal tick, end_date stop, env-var disable. - CONCLAVE_DISABLE_SCHEDULER=1 disables scheduler globally; tests use it. Phase 5 of pivot/agent-skill. 66 tests pass. --- api/routes.py | 24 +++----- infra/scheduler.py | 130 ++++++++++++++++++++++++++++++++++++++++ main.py | 25 ++++++-- tests/test_e2e.py | 66 ++++++++++---------- tests/test_scheduler.py | 103 +++++++++++++++++++++++++++++++ 5 files changed, 297 insertions(+), 51 deletions(-) create mode 100644 infra/scheduler.py create mode 100644 tests/test_scheduler.py diff --git a/api/routes.py b/api/routes.py index 8cf2ae3..10d6001 100644 --- a/api/routes.py +++ b/api/routes.py @@ -139,6 +139,10 @@ def create_instance_endpoint(body: CreateInstanceRequest, request: Request) -> C admin_token = secrets.token_urlsafe(16) storage.create_token(admin_token, instance_id, role="admin") + # Spin up the scheduler loop for this instance immediately. + from infra import scheduler + scheduler.start_instance(instance_id) + base = os.environ.get("CONCLAVE_PUBLIC_URL", str(request.base_url).rstrip("/")) return CreateInstanceResponse( instance_id=instance_id, @@ -342,26 +346,14 @@ async def submit(submission: dict, request: Request): storage.upsert_submission(instance_id, sid, submission) storage.add_submission_to_token(token_info["_raw_token"], sid) count = storage.count_submissions(instance_id) - threshold = inst["threshold"] - - # CONCURRENCY NOTE: This threshold check is not atomic. Concurrent submissions could - # both see count >= threshold and trigger _run_pipeline twice. This is a non-issue in - # the current deployment model — the TEE container runs single-worker uvicorn which - # serializes all requests. If deployment changes to allow concurrent request handling, - # add a per-instance asyncio.Lock around this check. - if count >= threshold: - await _run_pipeline(instance_id) - return { - "submission_id": sid, - "status": "received_analysis_complete", - "submissions_count": count, - } + # Pipeline triggering moved to the scheduler (phase 5). /submit is now + # purely an ingest endpoint. Operators can still call POST /trigger to + # force an evaluation. return { "submission_id": sid, - "status": "received_pending", + "status": "received", "submissions_count": count, - "threshold": threshold, } diff --git a/infra/scheduler.py b/infra/scheduler.py new file mode 100644 index 0000000..7f1818e --- /dev/null +++ b/infra/scheduler.py @@ -0,0 +1,130 @@ +"""Periodic evaluation scheduler. + +One asyncio task per instance. Sleeps for `evaluation_frequency_seconds`, +then triggers the skill pipeline over the full accumulated cohort. + +Lifecycle: + - main.py lifespan: calls start_all() for every active instance on startup. + - POST /instances: calls start_instance() for the new instance. + - When end_date passes: a final pipeline run fires, then the task exits. + +State is persisted in storage (next_run_at, last_run_at) so tasks can be +re-created cleanly after a restart without losing track of where they were. + +Tests disable the scheduler entirely by setting CONCLAVE_DISABLE_SCHEDULER=1. +""" +from __future__ import annotations + +import asyncio +import logging +import os +from datetime import datetime, timezone + +import storage + +logger = logging.getLogger(__name__) + +_tasks: dict[str, asyncio.Task] = {} + + +def disabled() -> bool: + return os.environ.get("CONCLAVE_DISABLE_SCHEDULER") == "1" + + +def _parse_iso(s: str) -> datetime: + dt = datetime.fromisoformat(s) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + + +async def _run_pipeline_safely(instance_id: str) -> None: + """Trigger the pipeline for an instance, swallowing exceptions so the loop survives.""" + if storage.count_submissions(instance_id) == 0: + logger.info("scheduler: instance %s has no submissions, skipping tick", instance_id) + return + # Local import to avoid a circular dependency at module load. + from api.routes import _run_pipeline + try: + count = await _run_pipeline(instance_id) + logger.info("scheduler: instance %s tick complete, %d results", instance_id, count) + except Exception as e: + logger.error("scheduler: pipeline failed for instance %s: %s", instance_id, e, exc_info=True) + + +async def _loop_for(instance_id: str) -> None: + """Inner loop. Sleeps `evaluation_frequency_seconds`, ticks, repeats until end_date.""" + while True: + inst = storage.get_instance(instance_id) + if inst is None: + logger.info("scheduler: instance %s deleted, stopping loop", instance_id) + return + freq = inst.get("evaluation_frequency_seconds") + end_date_str = inst.get("end_date") + if freq is None or end_date_str is None: + logger.warning("scheduler: instance %s missing freq/end_date, stopping", instance_id) + return + + end_date = _parse_iso(end_date_str) + now = datetime.now(timezone.utc) + if now >= end_date: + # Final tick on the way out so the end-of-hackathon report is fresh. + await _run_pipeline_safely(instance_id) + logger.info("scheduler: instance %s reached end_date, exiting", instance_id) + return + + # Sleep until the next tick or end_date, whichever is sooner. + seconds_until_end = (end_date - now).total_seconds() + delay = min(float(freq), seconds_until_end) + try: + await asyncio.sleep(max(delay, 0.0)) + except asyncio.CancelledError: + logger.info("scheduler: instance %s loop cancelled", instance_id) + return + + await _run_pipeline_safely(instance_id) + + +def start_instance(instance_id: str) -> None: + """Spin up the loop for a single instance. No-op if already running or scheduler disabled.""" + if disabled(): + return + if instance_id in _tasks and not _tasks[instance_id].done(): + return + try: + loop = asyncio.get_running_loop() + except RuntimeError: + # Called outside an event loop (e.g., during sync startup before lifespan). + # The next start_all() during lifespan will pick it up. + logger.warning("scheduler: no running loop, deferring start for instance %s", instance_id) + return + _tasks[instance_id] = loop.create_task(_loop_for(instance_id)) + + +async def start_all() -> None: + """Start tasks for every active (not-yet-ended) instance.""" + if disabled(): + logger.info("scheduler: disabled via CONCLAVE_DISABLE_SCHEDULER") + return + now = datetime.now(timezone.utc) + for inst in storage.list_instances(): + end_date_str = inst.get("end_date") + if not end_date_str: + continue # legacy or test instance, skip + try: + end_date = _parse_iso(end_date_str) + except ValueError: + continue + if now >= end_date: + continue + start_instance(inst["instance_id"]) + logger.info("scheduler: started %d instance loops", len(_tasks)) + + +async def stop_all() -> None: + """Cancel all running tasks. Used on app shutdown.""" + for task in _tasks.values(): + task.cancel() + if _tasks: + await asyncio.gather(*_tasks.values(), return_exceptions=True) + _tasks.clear() diff --git a/main.py b/main.py index ad22d3f..55ccf60 100644 --- a/main.py +++ b/main.py @@ -1,11 +1,28 @@ +from contextlib import asynccontextmanager + from fastapi import FastAPI, Request from fastapi.responses import Response -from api.routes import router, register_skills -import storage -app = FastAPI(title="Conclave — NDAI Skills Service") +import storage +from api.routes import router, register_skills +from infra import scheduler +# Synchronous setup runs at import — tests rely on this without needing to +# enter the lifespan context. storage.init_db() +register_skills() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + await scheduler.start_all() + try: + yield + finally: + await scheduler.stop_all() + + +app = FastAPI(title="Conclave — NDAI Skills Service", lifespan=lifespan) @app.middleware("http") @@ -20,5 +37,5 @@ async def cors_middleware(request: Request, call_next): response.headers["Access-Control-Allow-Origin"] = "*" return response -register_skills() + app.include_router(router) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 6f0b402..8b34f75 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -19,6 +19,7 @@ from __future__ import annotations import os os.environ.setdefault("CONCLAVE_DB_PATH", ":memory:") +os.environ.setdefault("CONCLAVE_DISABLE_SCHEDULER", "1") import secrets import uuid @@ -93,16 +94,16 @@ def client(): # --- Tests --- def test_full_e2e_workflow(client): - """Full happy path: seed instance → submit below threshold → auto-trigger → view results → manual trigger.""" + """Full happy path: seed instance → submit → admin trigger → view results.""" with patch.object(skill_card, "run", _fake_run_skill): - instance_id, admin_token = _setup_instance(threshold=5) + instance_id, admin_token = _setup_instance() r = client.post("/register", json={"instance_id": instance_id}) assert r.status_code == 200 user_token = r.json()["user_token"] - # Submit 4 times — all below threshold - for i in range(1, 5): + # Submit 5 times — all just stored, no auto-trigger anymore (scheduler owns triggering) + for i in range(1, 6): r = client.post( "/submit", json={"submission_id": f"sub_00{i}", "idea_text": f"Idea number {i}"}, @@ -110,17 +111,18 @@ def test_full_e2e_workflow(client): ) assert r.status_code == 200 body = r.json() - assert body["status"] == "received_pending" + assert body["status"] == "received" assert body["submissions_count"] == i - # 5th submission auto-triggers pipeline - r = client.post( - "/submit", - json={"submission_id": "sub_005", "idea_text": "Fifth idea, triggers pipeline"}, - headers={"X-Instance-Token": user_token}, - ) + # No results yet — pipeline hasn't run + r = client.get("/results/sub_001", headers={"X-Instance-Token": user_token}) + assert r.status_code == 404 + + # Admin triggers evaluation manually + r = client.post("/trigger", headers={"X-Instance-Token": admin_token}) assert r.status_code == 200 - assert r.json()["status"] == "received_analysis_complete" + assert r.json()["status"] == "complete" + assert r.json()["results_count"] == 5 # Participant views their own result r = client.get("/results/sub_001", headers={"X-Instance-Token": user_token}) @@ -140,12 +142,6 @@ def test_full_e2e_workflow(client): assert len(results) == 5 assert all("submission_id" in res for res in results) - # Operator manual trigger - r = client.post("/trigger", headers={"X-Instance-Token": admin_token}) - assert r.status_code == 200 - assert r.json()["status"] == "complete" - assert r.json()["results_count"] == 5 - def test_token_enforcement(client): """Token-based auth and role enforcement.""" @@ -319,9 +315,9 @@ def test_create_instance_requires_at_least_one_track(client): def test_submit_via_bearer(client): - """Full submit flow works via Bearer auth (the agent-skill path).""" + """Full submit + manual-trigger flow works via Bearer auth (the agent-skill path).""" with patch.object(skill_card, "run", _fake_run_skill): - instance_id, _ = _setup_instance(threshold=2) + instance_id, admin_token = _setup_instance() r = client.post("/generate-token", json={"instance_id": instance_id}) token = r.json()["token"] @@ -334,6 +330,10 @@ def test_submit_via_bearer(client): ) assert r.status_code == 200 + # Admin triggers manually + r = client.post("/trigger", headers={"Authorization": f"Bearer {admin_token}"}) + assert r.status_code == 200 + # Result accessible via Bearer r = client.get("/results/sub_1", headers={"Authorization": f"Bearer {token}"}) assert r.status_code == 200 @@ -413,10 +413,10 @@ def test_missing_agent_result_produces_error_status(): assert by_id[f"sub_{i:03d}"]["status"] == "analyzed" -def test_retrigger_on_6th_submission(client): - """After threshold triggers on 5th submission, 6th submission re-triggers with all 6 scored.""" +def test_manual_retrigger_after_more_submissions(client): + """Two trigger calls produce results that include later submissions.""" with patch.object(skill_card, "run", _fake_run_skill): - instance_id, admin_token = _setup_instance(threshold=5) + instance_id, admin_token = _setup_instance() r = client.post("/register", json={"instance_id": instance_id}) user_token = r.json()["user_token"] @@ -428,14 +428,19 @@ def test_retrigger_on_6th_submission(client): headers={"X-Instance-Token": user_token}, ) - r = client.post( + # First trigger covers 5 submissions + r = client.post("/trigger", headers={"X-Instance-Token": admin_token}) + assert r.json()["results_count"] == 5 + + # 6th submission lands; second trigger covers all 6 + client.post( "/submit", json={"submission_id": "sub_006", "idea_text": "Sixth idea"}, headers={"X-Instance-Token": user_token}, ) + r = client.post("/trigger", headers={"X-Instance-Token": admin_token}) assert r.status_code == 200 - assert r.json()["status"] == "received_analysis_complete" - assert r.json()["submissions_count"] == 6 + assert r.json()["results_count"] == 6 r = client.get("/results", headers={"X-Instance-Token": admin_token}) assert len(r.json()["results"]) == 6 @@ -459,26 +464,25 @@ def test_submit_missing_required_field_returns_422(client): def test_cross_user_result_isolation(client): """User A cannot read User B's result even if they know the submission_id.""" with patch.object(skill_card, "run", _fake_run_skill): - instance_id, _ = _setup_instance(threshold=5) + instance_id, admin_token = _setup_instance() # Two distinct users register token_a = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] token_b = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] - # Submit enough to trigger pipeline (5 total, split across users) for i in range(1, 5): client.post( "/submit", json={"submission_id": f"sub_00{i}", "idea_text": f"Idea {i}"}, headers={"X-Instance-Token": token_a}, ) - # User B's submission triggers the pipeline - r = client.post( + client.post( "/submit", json={"submission_id": "sub_005", "idea_text": "User B's idea"}, headers={"X-Instance-Token": token_b}, ) - assert r.json()["status"] == "received_analysis_complete" + + client.post("/trigger", headers={"X-Instance-Token": admin_token}) # User B can read their own result r = client.get("/results/sub_005", headers={"X-Instance-Token": token_b}) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py new file mode 100644 index 0000000..10cbf1a --- /dev/null +++ b/tests/test_scheduler.py @@ -0,0 +1,103 @@ +"""Unit tests for the periodic evaluation scheduler. + +Validates the loop wakes up at the right cadence, ticks the pipeline, +respects end_date, and survives empty cohorts. +""" +from __future__ import annotations +import os +os.environ.setdefault("CONCLAVE_DB_PATH", ":memory:") + +import asyncio +from datetime import datetime, timedelta, timezone +from unittest.mock import AsyncMock, patch + +import pytest + +import storage +from core.models import OperatorConfig +from infra import scheduler + + +@pytest.fixture(autouse=True) +def clear_storage(monkeypatch): + # Other test modules may have set CONCLAVE_DISABLE_SCHEDULER=1 at import time. + # Force-enable for this module so scheduler tasks actually run. + monkeypatch.delenv("CONCLAVE_DISABLE_SCHEDULER", raising=False) + storage.reset_all() + yield + + +def _seed(instance_id: str, freq_seconds: int, end_offset_seconds: int) -> None: + config = OperatorConfig( + criteria={"a": 1.0}, + guidelines="", + instance_id=instance_id, + ) + end_date = (datetime.now(timezone.utc) + timedelta(seconds=end_offset_seconds)).isoformat() + storage.create_instance( + instance_id=instance_id, + skill_name="hackathon_novelty", + config=config.model_dump(), + threshold=999_999, + name="test", + end_date=end_date, + evaluation_frequency_seconds=freq_seconds, + tracks=[{"name": "t", "description_markdown": "x"}], + ) + + +@pytest.mark.asyncio +async def test_scheduler_skips_when_cohort_empty(): + """No submissions → tick fires but pipeline isn't called.""" + _seed("inst-1", freq_seconds=1, end_offset_seconds=10) + + with patch("api.routes._run_pipeline", new=AsyncMock()) as mock_pipeline: + scheduler.start_instance("inst-1") + await asyncio.sleep(1.2) + # Even though tick fired, no submissions → no pipeline call + assert mock_pipeline.await_count == 0 + await scheduler.stop_all() + + +@pytest.mark.asyncio +async def test_scheduler_calls_pipeline_when_cohort_has_data(): + """Submission present → tick fires → pipeline called.""" + _seed("inst-2", freq_seconds=1, end_offset_seconds=10) + storage.upsert_submission("inst-2", "sub-1", {"submission_id": "sub-1", "idea_text": "x"}) + + pipeline_mock = AsyncMock(return_value=1) + with patch("api.routes._run_pipeline", new=pipeline_mock): + scheduler.start_instance("inst-2") + await asyncio.sleep(1.2) + assert pipeline_mock.await_count >= 1 + await scheduler.stop_all() + + +@pytest.mark.asyncio +async def test_scheduler_stops_after_end_date(): + """end_date already past → final tick + exit.""" + _seed("inst-3", freq_seconds=1, end_offset_seconds=-1) + storage.upsert_submission("inst-3", "sub-1", {"submission_id": "sub-1", "idea_text": "x"}) + + pipeline_mock = AsyncMock(return_value=1) + with patch("api.routes._run_pipeline", new=pipeline_mock): + scheduler.start_instance("inst-3") + await asyncio.sleep(0.3) + # Single final tick fires, then loop exits. + assert pipeline_mock.await_count == 1 + # Task should be done. + task = scheduler._tasks.get("inst-3") + assert task is not None + # Give it a moment to settle. + await asyncio.sleep(0.1) + assert task.done() + await scheduler.stop_all() + + +@pytest.mark.asyncio +async def test_scheduler_disabled_via_env(): + """CONCLAVE_DISABLE_SCHEDULER=1 → start_instance is a no-op.""" + _seed("inst-4", freq_seconds=1, end_offset_seconds=10) + with patch.dict(os.environ, {"CONCLAVE_DISABLE_SCHEDULER": "1"}): + scheduler.start_instance("inst-4") + assert "inst-4" not in scheduler._tasks From 2aa02e4b0e576e9880616dff856b69d3622e7d36 Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 19:28:38 +0530 Subject: [PATCH 07/22] Add track alignment, name collision, and confidence to pipeline output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NoveltyResult gains: - track_alignments: dict[str, float] — cosine similarity vs each track - best_fit_track: str | None — argmax of track alignments - cluster_label / cluster_size — surfaced from deterministic layer - confidence: 'low' | 'high' — 'low' when cohort < 5 - name_collisions: list[NameCollision] — fuzzy-matched project name dupes Deterministic layer now computes track alignments via cosine similarity between submission embeddings and operator-supplied track descriptions, plus name collisions via difflib.SequenceMatcher (no new deps). OperatorConfig gains a tracks list. POST /instances populates it from the typed body. ALLOWED_OUTPUT_KEYS expanded; USER_OUTPUT_KEYS rewritten to expose the new participant-facing fields and drop aligned/criteria_scores from the participant view (those are admin-only signals now). run_skill no longer short-circuits on small cohorts — instead tags every result confidence='low' so the agent skill can warn early submitters that scores will firm up. test_run_skill_insufficient_submissions renamed and rewritten to assert this. Submission update flow already worked via storage.upsert_submission's ON CONFLICT REPLACE; pipeline naturally re-evaluates from the latest stored row on each scheduler tick. Phase 6 of pivot/agent-skill. 66 tests pass. --- api/routes.py | 4 +- core/models.py | 1 + skills/hackathon_novelty/__init__.py | 38 ++++++++--- skills/hackathon_novelty/config.py | 23 ++++++- skills/hackathon_novelty/deterministic.py | 79 ++++++++++++++++++++++- skills/hackathon_novelty/models.py | 13 ++++ tests/test_e2e.py | 15 ++++- tests/test_hackathon_novelty.py | 7 +- 8 files changed, 163 insertions(+), 17 deletions(-) diff --git a/api/routes.py b/api/routes.py index 10d6001..ad52c94 100644 --- a/api/routes.py +++ b/api/routes.py @@ -120,10 +120,12 @@ def create_instance_endpoint(body: CreateInstanceRequest, request: Request) -> C raise HTTPException(status_code=422, detail=str(e)) instance_id = str(uuid.uuid4()) + tracks_dump = [t.model_dump() for t in body.tracks] config = OperatorConfig( criteria={"originality": 0.5, "feasibility": 0.5}, guidelines="", instance_id=instance_id, + tracks=tracks_dump, ) storage.create_instance( instance_id=instance_id, @@ -133,7 +135,7 @@ def create_instance_endpoint(body: CreateInstanceRequest, request: Request) -> C name=body.name, end_date=end.isoformat(), evaluation_frequency_seconds=freq_seconds, - tracks=[t.model_dump() for t in body.tracks], + tracks=tracks_dump, ) admin_token = secrets.token_urlsafe(16) diff --git a/core/models.py b/core/models.py index 66f92d6..8328bc5 100644 --- a/core/models.py +++ b/core/models.py @@ -19,6 +19,7 @@ class OperatorConfig(BaseModel): guidelines: str = "" instance_id: str = "default" min_submissions: int = 5 + tracks: list[dict] = [] # [{name, description_markdown}] — phase 6 addition class SkillRequest(BaseModel): diff --git a/skills/hackathon_novelty/__init__.py b/skills/hackathon_novelty/__init__.py index 00e084d..273b532 100644 --- a/skills/hackathon_novelty/__init__.py +++ b/skills/hackathon_novelty/__init__.py @@ -24,17 +24,26 @@ from skills.hackathon_novelty.tools import set_context from skills.hackathon_novelty.agent import run_agent from skills.hackathon_novelty.guardrails import HackathonNoveltyFilter -from skills.hackathon_novelty.config import ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS, MIN_SUBMISSIONS, SIMILARITY_DUPLICATE_THRESHOLD +from skills.hackathon_novelty.config import ( + ALLOWED_OUTPUT_KEYS, + USER_OUTPUT_KEYS, + MIN_SUBMISSIONS, + SIMILARITY_DUPLICATE_THRESHOLD, + LOW_CONFIDENCE_THRESHOLD, +) def run_skill(inputs: list[HackathonSubmission], params: OperatorConfig) -> SkillResponse: - """Full 4-layer pipeline: ingest → deterministic → agent (multi-node graph) → guardrails → response.""" + """Full 4-layer pipeline: ingest → deterministic → agent (multi-node graph) → guardrails → response. - if len(inputs) < params.min_submissions: - return SkillResponse( - skill="hackathon_novelty", - results=[{"submission_id": s.submission_id, "status": "insufficient_submissions"} for s in inputs], - ) + The pipeline runs at any cohort size; results are tagged confidence: "low" when + the cohort is below LOW_CONFIDENCE_THRESHOLD so the agent skill can warn early + submitters that scores will firm up as more submissions land. + """ + if not inputs: + return SkillResponse(skill="hackathon_novelty", results=[]) + + confidence = "low" if len(inputs) < LOW_CONFIDENCE_THRESHOLD else "high" # Layer 0: Ingestion — normalize/extract text from any format normalized = run_ingest(inputs) @@ -42,8 +51,13 @@ def run_skill(inputs: list[HackathonSubmission], params: OperatorConfig) -> Skil if sub.submission_id in normalized: sub.idea_text = normalized[sub.submission_id] - # Layer 1: Deterministic (now uses normalized text for embeddings) - det = run_deterministic(inputs, guidelines=params.guidelines, criteria=params.criteria) + # Layer 1: Deterministic — embeddings, novelty, clustering, track alignment, name collisions + det = run_deterministic( + inputs, + guidelines=params.guidelines, + criteria=params.criteria, + tracks=params.tracks, + ) # Build submissions map and set tool context submissions_map = {s.submission_id: s for s in inputs} @@ -98,6 +112,12 @@ def run_skill(inputs: list[HackathonSubmission], params: OperatorConfig) -> Skil status=ar.get("status", "analyzed") if ar else "error", analysis_depth=ar.get("analysis_depth", "full"), duplicate_of=ar.get("duplicate_of", None), + track_alignments=det["track_alignments"][i], + best_fit_track=det["best_fit_tracks"][i], + cluster_label=det["clusters"][i], + cluster_size=det["cluster_sizes"][i], + confidence=confidence, + name_collisions=det["name_collisions"].get(sid, []), ) results.append(result.model_dump()) diff --git a/skills/hackathon_novelty/config.py b/skills/hackathon_novelty/config.py index 313e4c0..55a7c1d 100644 --- a/skills/hackathon_novelty/config.py +++ b/skills/hackathon_novelty/config.py @@ -32,6 +32,13 @@ "status", "analysis_depth", "duplicate_of", + # Phase 6 + "track_alignments", + "best_fit_track", + "cluster_label", + "cluster_size", + "confidence", + "name_collisions", } SCORE_BOUNDS = { @@ -47,9 +54,21 @@ SIMILARITY_DUPLICATE_THRESHOLD = 0.7 LOW_NOVELTY_THRESHOLD = 0.1 -# Participant-facing output — only Conclave-unique signals. +# Participant-facing output — what individual submitters see for their own submission. # Admin sees ALLOWED_OUTPUT_KEYS (everything). Users see USER_OUTPUT_KEYS. -USER_OUTPUT_KEYS = {"submission_id", "novelty_score", "aligned"} +USER_OUTPUT_KEYS = { + "submission_id", + "novelty_score", + "track_alignments", + "best_fit_track", + "cluster_label", + "cluster_size", + "confidence", + "name_collisions", +} + +# Cohort size below which scores are flagged with confidence: "low". +LOW_CONFIDENCE_THRESHOLD = 5 # Per-node model overrides — set via CONCLAVE_*_MODEL env vars. # Empty string falls back to CONCLAVE_DEFAULT_MODEL (or DeepSeek-V3.1 if unset). diff --git a/skills/hackathon_novelty/deterministic.py b/skills/hackathon_novelty/deterministic.py index 3f9450a..bc46792 100644 --- a/skills/hackathon_novelty/deterministic.py +++ b/skills/hackathon_novelty/deterministic.py @@ -1,5 +1,7 @@ from __future__ import annotations import hashlib +from difflib import SequenceMatcher + import numpy as np from sklearn.metrics.pairwise import cosine_similarity from sklearn.cluster import KMeans @@ -7,6 +9,10 @@ from scipy.stats import rankdata from skills.hackathon_novelty.models import HackathonSubmission +# Two project names with SequenceMatcher.ratio() at or above this threshold +# are flagged as a name collision. +NAME_COLLISION_THRESHOLD = 0.85 + # Singleton — loads model once, reuses across calls _model: SentenceTransformer | None = None _model_load_failed = False @@ -95,10 +101,70 @@ def cluster_submissions(embeddings: np.ndarray) -> list[str]: return [label_names[l] for l in labels] +def _project_name(submission: HackathonSubmission) -> str: + """Best-effort title extraction. First non-empty line, capped at 80 chars.""" + text = (submission.idea_text or "").strip() + if not text: + return "" + first = text.split("\n", 1)[0].strip() + return first[:80] + + +def compute_name_collisions(submissions: list[HackathonSubmission]) -> dict[str, list[dict]]: + """For each submission, return any other submissions with similar project names. + + Uses difflib.SequenceMatcher (no external deps). O(N^2) — fine for hackathon-scale N. + """ + names = [_project_name(s) for s in submissions] + out: dict[str, list[dict]] = {s.submission_id: [] for s in submissions} + for i, a in enumerate(submissions): + if not names[i]: + continue + for j, b in enumerate(submissions): + if i == j or not names[j]: + continue + sim = SequenceMatcher(None, names[i].lower(), names[j].lower()).ratio() + if sim >= NAME_COLLISION_THRESHOLD: + out[a.submission_id].append({ + "other_submission_id": b.submission_id, + "similarity": round(float(sim), 3), + }) + return out + + +def compute_track_alignments( + submission_embeddings: np.ndarray, + tracks: list[dict], +) -> tuple[list[dict[str, float]], list[str | None]]: + """For each submission, score alignment against each track. + + Returns (per_submission_alignments, per_submission_best_fit). Track scores + are cosine similarities clipped to [0, 1]. + """ + n = submission_embeddings.shape[0] + if not tracks: + return [{} for _ in range(n)], [None] * n + + track_texts = [t.get("description_markdown") or t.get("name", "") for t in tracks] + track_names = [t.get("name", f"track_{i}") for i, t in enumerate(tracks)] + track_embeddings = compute_embeddings(track_texts) + sim = cosine_similarity(submission_embeddings, track_embeddings) + sim = np.clip(sim, 0.0, 1.0) + + alignments: list[dict[str, float]] = [] + best_fit: list[str | None] = [] + for i in range(n): + row = sim[i] + alignments.append({track_names[j]: round(float(row[j]), 3) for j in range(len(tracks))}) + best_fit.append(track_names[int(np.argmax(row))]) + return alignments, best_fit + + def run_deterministic( submissions: list[HackathonSubmission], guidelines: str = "", criteria: dict[str, float] | None = None, + tracks: list[dict] | None = None, ) -> dict: """ Full deterministic pipeline. Returns dict with: @@ -106,8 +172,12 @@ def run_deterministic( - sim_matrix: np.ndarray (N, N) - novelty_scores: np.ndarray (N,) - percentiles: np.ndarray (N,) — internal, used by triage_context - - clusters: list[str] (N,) — internal, used by triage_context + - clusters: list[str] (N,) — cluster label per submission + - cluster_sizes: list[int] (N,) - submission_ids: list[str] (N,) + - name_collisions: dict[submission_id -> list[{other_submission_id, similarity}]] + - track_alignments: list[dict[track_name -> score]] (N,) + - best_fit_tracks: list[str | None] (N,) """ texts = [fuse_text(s) for s in submissions] embeddings = compute_embeddings(texts) @@ -115,6 +185,9 @@ def run_deterministic( novelty_scores = compute_novelty_scores(sim_matrix) percentiles = compute_percentiles(novelty_scores) clusters = cluster_submissions(embeddings) + cluster_sizes = [clusters.count(c) for c in clusters] + name_collisions = compute_name_collisions(submissions) + track_alignments, best_fit_tracks = compute_track_alignments(embeddings, tracks or []) return { "embeddings": embeddings, @@ -122,5 +195,9 @@ def run_deterministic( "novelty_scores": novelty_scores, "percentiles": percentiles, "clusters": clusters, + "cluster_sizes": cluster_sizes, "submission_ids": [s.submission_id for s in submissions], + "name_collisions": name_collisions, + "track_alignments": track_alignments, + "best_fit_tracks": best_fit_tracks, } diff --git a/skills/hackathon_novelty/models.py b/skills/hackathon_novelty/models.py index d110590..fc612ce 100644 --- a/skills/hackathon_novelty/models.py +++ b/skills/hackathon_novelty/models.py @@ -26,6 +26,12 @@ class HackathonSubmission(Submission): deck_text: Optional[str] = None +class NameCollision(BaseModel): + """Another submission whose project name is similar to this one.""" + other_submission_id: str + similarity: float = Field(ge=0.0, le=1.0) + + class NoveltyResult(BaseModel): """Final output for one submission after guardrails. This is what leaves the skill.""" submission_id: str @@ -36,3 +42,10 @@ class NoveltyResult(BaseModel): status: str = "analyzed" # "analyzed" | "duplicate" analysis_depth: str = "full" # "full" | "flagged" duplicate_of: Optional[str] = None # submission_id of the original if status="duplicate" + # Phase 6 additions + track_alignments: dict[str, float] = {} # track name → 0-1 alignment + best_fit_track: Optional[str] = None + cluster_label: Optional[str] = None + cluster_size: int = 0 + confidence: str = "high" # "low" when cohort N < 5 + name_collisions: list[NameCollision] = [] diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 8b34f75..cd6a155 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -48,6 +48,12 @@ def _fake_run_skill(inputs, params): "status": "analyzed", "analysis_depth": "full", "duplicate_of": None, + "track_alignments": {"DeFi": 0.4}, + "best_fit_track": "DeFi", + "cluster_label": "A", + "cluster_size": 2, + "confidence": "high", + "name_collisions": [], } for s in inputs ], @@ -130,8 +136,9 @@ def test_full_e2e_workflow(client): body = r.json() assert body["submission_id"] == "sub_001" assert "novelty_score" in body - assert "aligned" in body - # Users should NOT see internal fields + assert "confidence" in body + assert "track_alignments" in body + # Users should NOT see internal/admin fields assert "criteria_scores" not in body assert "status" not in body @@ -399,7 +406,11 @@ def test_missing_agent_result_produces_error_status(): "novelty_scores": np.array([0.5, 0.6, 0.7, 0.8, 0.9]), "percentiles": np.array([20.0, 40.0, 60.0, 80.0, 100.0]), "clusters": ["A", "A", "B", "B", "C"], + "cluster_sizes": [2, 2, 2, 2, 1], "submission_ids": [f"sub_{i:03d}" for i in range(1, 6)], + "name_collisions": {f"sub_{i:03d}": [] for i in range(1, 6)}, + "track_alignments": [{} for _ in range(5)], + "best_fit_tracks": [None] * 5, } with patch("skills.hackathon_novelty.run_ingest", return_value={}), \ diff --git a/tests/test_hackathon_novelty.py b/tests/test_hackathon_novelty.py index f910489..d517092 100644 --- a/tests/test_hackathon_novelty.py +++ b/tests/test_hackathon_novelty.py @@ -142,11 +142,14 @@ def test_run_skill_with_mocked_llm(): assert "criteria_scores" in r -def test_run_skill_insufficient_submissions(): +def test_run_skill_marks_low_confidence_for_small_cohort(): + """Small cohorts (N < LOW_CONFIDENCE_THRESHOLD) get confidence='low'. + The pipeline still runs end-to-end so early submitters see results + even before more submissions land.""" subs = [HackathonSubmission(submission_id="x", idea_text="test")] config = OperatorConfig(criteria={"originality": 1.0}) response = run_skill(subs, config) - assert response.results[0]["status"] == "insufficient_submissions" + assert response.results[0]["confidence"] == "low" def test_filter_strips_extra_keys(): From 1cb6ec98c4eb3b5b3c3d2c09097e14cb925c3b9f Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 19:30:55 +0530 Subject: [PATCH 08/22] Add operator dashboard endpoints and evaluation-run history - New endpoint GET /cohort/aggregates: cohort size, last-evaluation timestamp, cluster distribution, track distribution, name-collision pair count. - New endpoint GET /cohort/timeline: history of every pipeline tick for this instance with per-tick aggregate snapshot (top clusters, top tracks, collision count). - Each pipeline run now records to evaluation_runs storage table with a compact snapshot. - GET /submissions admin response gains idea_title_or_summary (first line, truncated to 80 chars) so the operator can see broadly what's being submitted without raw idea text. - Storage gains record_evaluation_run + list_evaluation_runs. Phase 7 of pivot/agent-skill. 69 tests pass. --- api/routes.py | 88 ++++++++++++++++++++++++++++++++++++++++++++- storage/__init__.py | 5 +++ storage/sqlite.py | 31 ++++++++++++++++ tests/test_e2e.py | 75 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 198 insertions(+), 1 deletion(-) diff --git a/api/routes.py b/api/routes.py index ad52c94..9e539d8 100644 --- a/api/routes.py +++ b/api/routes.py @@ -97,9 +97,39 @@ async def _run_pipeline(instance_id: str) -> int: storage.upsert_result(instance_id, r["submission_id"], r) storage.set_instance_triggered(instance_id, True) + + snapshot = _build_snapshot(response.results) + storage.record_evaluation_run( + instance_id=instance_id, + submission_count=len(response.results), + snapshot=snapshot, + ) return len(response.results) +def _build_snapshot(results: list[dict]) -> dict: + """Aggregate stats captured per evaluation tick for the dashboard timeline.""" + cluster_counts: dict[str, int] = {} + track_counts: dict[str, int] = {} + collisions = 0 + for r in results: + c = r.get("cluster_label") + if c: + cluster_counts[c] = cluster_counts.get(c, 0) + 1 + t = r.get("best_fit_track") + if t: + track_counts[t] = track_counts.get(t, 0) + 1 + collisions += len(r.get("name_collisions") or []) + # Top-3 clusters and tracks for compactness + top_clusters = sorted(cluster_counts.items(), key=lambda kv: kv[1], reverse=True)[:3] + top_tracks = sorted(track_counts.items(), key=lambda kv: kv[1], reverse=True)[:3] + return { + "top_clusters": [{"label": k, "count": v} for k, v in top_clusters], + "top_tracks": [{"track": k, "count": v} for k, v in top_tracks], + "name_collision_pairs": collisions // 2, # each collision counted twice (once per side) + } + + # --- Endpoints --- @router.post("/instances") @@ -378,12 +408,16 @@ def get_submissions(request: Request): meta = [] for sub in subs.values(): + idea_text = sub.get("idea_text") or "" + first_line = idea_text.split("\n", 1)[0].strip() + title = first_line[:80] if first_line else "" meta.append({ "submission_id": sub.get("submission_id", ""), "submitted_at": sub.get("_submitted_at"), - "has_text": bool(sub.get("idea_text")), + "has_text": bool(idea_text), "has_file": bool(sub.get("idea_file")), "has_repo": bool(sub.get("repo_summary")), + "idea_title_or_summary": title, }) return {"submissions": meta} @@ -415,6 +449,58 @@ def get_all_results(request: Request): return {"results": storage.list_results(instance_id)} +@router.get("/cohort/aggregates") +def cohort_aggregates(request: Request): + """Operator-only cohort summary: cluster + track distribution, collision count, + cohort size, last-evaluation timestamp.""" + token_info = _resolve_token(request) + if token_info["role"] != "admin": + raise HTTPException(status_code=403, detail="Only admin can view cohort aggregates") + + instance_id = token_info["instance_id"] + results = storage.list_results(instance_id) + + cluster_counts: dict[str, int] = {} + track_counts: dict[str, int] = {} + collisions = 0 + last_at = None + for r in results: + c = r.get("cluster_label") + if c: + cluster_counts[c] = cluster_counts.get(c, 0) + 1 + t = r.get("best_fit_track") + if t: + track_counts[t] = track_counts.get(t, 0) + 1 + collisions += len(r.get("name_collisions") or []) + + runs = storage.list_evaluation_runs(instance_id) + if runs: + last_at = runs[-1]["ran_at"] + + return { + "cohort_size": storage.count_submissions(instance_id), + "last_evaluation_at": last_at, + "cluster_distribution": [ + {"label": k, "count": v} + for k, v in sorted(cluster_counts.items(), key=lambda kv: kv[1], reverse=True) + ], + "track_distribution": [ + {"track": k, "count": v} + for k, v in sorted(track_counts.items(), key=lambda kv: kv[1], reverse=True) + ], + "name_collision_pairs": collisions // 2, + } + + +@router.get("/cohort/timeline") +def cohort_timeline(request: Request): + """Operator-only history of evaluation ticks for this instance.""" + token_info = _resolve_token(request) + if token_info["role"] != "admin": + raise HTTPException(status_code=403, detail="Only admin can view cohort timeline") + return {"runs": storage.list_evaluation_runs(token_info["instance_id"])} + + @router.get("/results/{submission_id}") def get_results(submission_id: str, request: Request): """ diff --git a/storage/__init__.py b/storage/__init__.py index a229947..f02482a 100644 --- a/storage/__init__.py +++ b/storage/__init__.py @@ -33,6 +33,9 @@ # registrations get_registration_token, set_registration_token, + # evaluation runs + record_evaluation_run, + list_evaluation_runs, ) __all__ = [ @@ -57,4 +60,6 @@ "add_submission_to_token", "get_registration_token", "set_registration_token", + "record_evaluation_run", + "list_evaluation_runs", ] diff --git a/storage/sqlite.py b/storage/sqlite.py index b76a82a..3593a2f 100644 --- a/storage/sqlite.py +++ b/storage/sqlite.py @@ -354,3 +354,34 @@ def set_registration_token(instance_id: str, supabase_user_id: str, token: str) """, (instance_id, supabase_user_id, token), ) + + +# --- Evaluation runs --- + +def record_evaluation_run(instance_id: str, submission_count: int, snapshot: dict | None = None) -> str: + """Record one pipeline tick. Returns the run_id.""" + import uuid as _uuid + run_id = str(_uuid.uuid4()) + _get_conn().execute( + "INSERT INTO evaluation_runs (run_id, instance_id, ran_at, submission_count, data) VALUES (?, ?, ?, ?, ?)", + (run_id, instance_id, _now(), int(submission_count), json.dumps(_to_jsonable(snapshot)) if snapshot else None), + ) + return run_id + + +def list_evaluation_runs(instance_id: str) -> list[dict]: + """Return history of pipeline ticks for an instance, oldest-first.""" + rows = _get_conn().execute( + "SELECT run_id, ran_at, submission_count, data FROM evaluation_runs " + "WHERE instance_id = ? ORDER BY ran_at ASC", + (instance_id,), + ).fetchall() + out = [] + for row in rows: + out.append({ + "run_id": row["run_id"], + "ran_at": row["ran_at"], + "submission_count": row["submission_count"], + "snapshot": json.loads(row["data"]) if row["data"] else None, + }) + return out diff --git a/tests/test_e2e.py b/tests/test_e2e.py index cd6a155..8d0b87e 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -472,6 +472,81 @@ def test_submit_missing_required_field_returns_422(client): assert r.status_code == 422 +def test_cohort_aggregates_admin_only(client): + """GET /cohort/aggregates returns cluster + track distribution, collisions, cohort size.""" + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, admin_token = _setup_instance() + + user_token = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] + for i in range(1, 4): + client.post( + "/submit", + json={"submission_id": f"sub_{i}", "idea_text": f"Idea {i}"}, + headers={"X-Instance-Token": user_token}, + ) + client.post("/trigger", headers={"X-Instance-Token": admin_token}) + + # Admin can read aggregates + r = client.get("/cohort/aggregates", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + body = r.json() + assert body["cohort_size"] == 3 + assert body["last_evaluation_at"] is not None + assert isinstance(body["cluster_distribution"], list) + assert isinstance(body["track_distribution"], list) + assert "name_collision_pairs" in body + + # User cannot + r = client.get("/cohort/aggregates", headers={"X-Instance-Token": user_token}) + assert r.status_code == 403 + + +def test_cohort_timeline_records_each_trigger(client): + """Each /trigger appends to /cohort/timeline.""" + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, admin_token = _setup_instance() + user_token = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] + + client.post( + "/submit", + json={"submission_id": "sub_1", "idea_text": "first"}, + headers={"X-Instance-Token": user_token}, + ) + client.post("/trigger", headers={"X-Instance-Token": admin_token}) + + client.post( + "/submit", + json={"submission_id": "sub_2", "idea_text": "second"}, + headers={"X-Instance-Token": user_token}, + ) + client.post("/trigger", headers={"X-Instance-Token": admin_token}) + + r = client.get("/cohort/timeline", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + runs = r.json()["runs"] + assert len(runs) == 2 + assert runs[0]["submission_count"] == 1 + assert runs[1]["submission_count"] == 2 + + +def test_submissions_includes_idea_title(client): + """Admin sees a sanitized idea_title_or_summary on each submission row.""" + instance_id, admin_token = _setup_instance() + user_token = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] + + client.post( + "/submit", + json={"submission_id": "sub_1", "idea_text": "Decentralized prediction market"}, + headers={"X-Instance-Token": user_token}, + ) + + r = client.get("/submissions", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + rows = r.json()["submissions"] + assert len(rows) == 1 + assert rows[0]["idea_title_or_summary"] == "Decentralized prediction market" + + def test_cross_user_result_isolation(client): """User A cannot read User B's result even if they know the submission_id.""" with patch.object(skill_card, "run", _fake_run_skill): From 974dd6cb6df96070f518292d7a006f2251b79966 Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Tue, 5 May 2026 19:47:09 +0530 Subject: [PATCH 09/22] Add Solana attestation publishing on end_date MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New module infra/solana.py: publishes a SHA-256 of the cohort report via the SPL Memo program. No custom on-chain code needed — the signed transaction with a deterministic memo IS the attestation. Anyone can look up the txn, read the memo, verify the signer pubkey. - hash_report() is order-independent: sorts results by submission_id before hashing, so two ticks over the same data produce the same hash. - Graceful degradation: if CONCLAVE_SOLANA_KEYPAIR is unset, returns a 'local_only' record without hitting the network. Tests rely on this. - Configuration via env: CONCLAVE_SOLANA_KEYPAIR (base58 / JSON-array / base64), CONCLAVE_SOLANA_RPC_URL (default devnet), CONCLAVE_SOLANA_NETWORK. - Scheduler hooks attestation publish into the final end_date tick. - New endpoints: GET /attestations (any valid token) and admin-only POST /attestations/publish for the demo path that doesn't want to wait for end_date. - Storage gains attestations table data column + record/list helpers. - requirements.txt adds solders + solana. Phase 8 of pivot/agent-skill. 74 tests pass. --- api/routes.py | 24 +++++++ infra/scheduler.py | 30 +++++++++ infra/solana.py | 134 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + storage/__init__.py | 5 ++ storage/sqlite.py | 45 +++++++++++++ tests/test_attestation.py | 85 ++++++++++++++++++++++++ 7 files changed, 325 insertions(+) create mode 100644 infra/solana.py create mode 100644 tests/test_attestation.py diff --git a/api/routes.py b/api/routes.py index 9e539d8..7ece4da 100644 --- a/api/routes.py +++ b/api/routes.py @@ -501,6 +501,30 @@ def cohort_timeline(request: Request): return {"runs": storage.list_evaluation_runs(token_info["instance_id"])} +@router.get("/attestations") +def list_attestations(request: Request): + """Public-readable list of on-chain attestations for this instance. + + Anyone with a valid token (admin or user) can read so participants can + verify the enclave published the final report they received.""" + token_info = _resolve_token(request) + return {"attestations": storage.list_attestations(token_info["instance_id"])} + + +@router.post("/attestations/publish") +async def publish_attestation_now(request: Request): + """Admin-only: force an immediate attestation publish over the current cohort. + Useful for the demo path when waiting for end_date isn't practical.""" + token_info = _resolve_token(request) + if token_info["role"] != "admin": + raise HTTPException(status_code=403, detail="Only admin can publish attestations") + instance_id = token_info["instance_id"] + from infra.scheduler import _publish_final_attestation + await _publish_final_attestation(instance_id) + runs = storage.list_attestations(instance_id) + return {"latest": runs[-1] if runs else None} + + @router.get("/results/{submission_id}") def get_results(submission_id: str, request: Request): """ diff --git a/infra/scheduler.py b/infra/scheduler.py index 7f1818e..13f1c0f 100644 --- a/infra/scheduler.py +++ b/infra/scheduler.py @@ -52,6 +52,35 @@ async def _run_pipeline_safely(instance_id: str) -> None: logger.error("scheduler: pipeline failed for instance %s: %s", instance_id, e, exc_info=True) +async def _publish_final_attestation(instance_id: str) -> None: + """Publish the final cohort report hash to Solana devnet.""" + from infra import solana + results = storage.list_results(instance_id) + if not results: + logger.info("scheduler: no results to attest for instance %s", instance_id) + return + report_hash = solana.hash_report(results) + loop = asyncio.get_running_loop() + try: + record = await loop.run_in_executor(None, solana.publish_attestation, report_hash) + except Exception as e: + logger.error("scheduler: solana publish errored for %s: %s", instance_id, e) + return + storage.record_attestation( + instance_id=instance_id, + report_hash=record["report_hash_hex"], + tx_sig=record.get("tx_sig"), + chain=record["chain"], + extra={ + "pubkey": record.get("pubkey"), + "explorer_url": record.get("explorer_url"), + "status": record.get("status"), + "error": record.get("error"), + }, + ) + logger.info("scheduler: attestation recorded for %s status=%s", instance_id, record.get("status")) + + async def _loop_for(instance_id: str) -> None: """Inner loop. Sleeps `evaluation_frequency_seconds`, ticks, repeats until end_date.""" while True: @@ -70,6 +99,7 @@ async def _loop_for(instance_id: str) -> None: if now >= end_date: # Final tick on the way out so the end-of-hackathon report is fresh. await _run_pipeline_safely(instance_id) + await _publish_final_attestation(instance_id) logger.info("scheduler: instance %s reached end_date, exiting", instance_id) return diff --git a/infra/solana.py b/infra/solana.py new file mode 100644 index 0000000..b35ea2b --- /dev/null +++ b/infra/solana.py @@ -0,0 +1,134 @@ +"""Solana devnet attestation publication. + +Publishes a SHA-256 hash of the final cohort report to Solana devnet using the +SPL Memo program. The transaction itself becomes the attestation: anyone can +look it up by signature, read the memo, and verify the signer pubkey is the +enclave's known service key (ideally part of the TDX measurement). + +Why memo and not a custom Anchor program: simpler, no deployment, identical +verification semantics. The signed-tx-with-known-payload pattern is what +matters; on-chain code is unnecessary for a recordkeeping use case. + +Configuration via env: +- CONCLAVE_SOLANA_KEYPAIR — base58-encoded 64-byte secret (solana CLI format) +- CONCLAVE_SOLANA_RPC_URL — defaults to https://api.devnet.solana.com +- CONCLAVE_SOLANA_NETWORK — defaults to "devnet" (used for explorer URL) + +If the keypair env var is unset, publish_attestation() returns a "local_only" +record and skips the network call. This keeps the dev/test path running +without Solana and lets the enclave operator opt in by setting the keypair. +""" +from __future__ import annotations + +import base64 +import hashlib +import json +import logging +import os +from typing import Any + +logger = logging.getLogger(__name__) + +MEMO_PROGRAM_ID = "MemoSq4gqABAXKb96qnH8TysNcWxMyWCqXgDLGmfcHr" +DEFAULT_RPC = "https://api.devnet.solana.com" +DEFAULT_NETWORK = "devnet" + + +def is_configured() -> bool: + return bool(os.environ.get("CONCLAVE_SOLANA_KEYPAIR")) + + +def hash_report(results: list[dict]) -> bytes: + """Deterministic SHA-256 of the cohort report. Sorted by submission_id.""" + sorted_results = sorted(results, key=lambda r: r.get("submission_id", "")) + payload = json.dumps(sorted_results, sort_keys=True, default=str).encode("utf-8") + return hashlib.sha256(payload).digest() + + +def _explorer_url(tx_sig: str, network: str) -> str: + return f"https://explorer.solana.com/tx/{tx_sig}?cluster={network}" + + +def _load_keypair(): + """Decode the configured keypair (base58 or JSON-array form).""" + from solders.keypair import Keypair # type: ignore + + raw = os.environ.get("CONCLAVE_SOLANA_KEYPAIR", "").strip() + if not raw: + raise RuntimeError("CONCLAVE_SOLANA_KEYPAIR not set") + if raw.startswith("["): + # Solana CLI keygen format: JSON array of 64 ints + return Keypair.from_bytes(bytes(json.loads(raw))) + # Try base58 + try: + import base58 # type: ignore + return Keypair.from_bytes(base58.b58decode(raw)) + except Exception: + # Last resort: base64 + return Keypair.from_bytes(base64.b64decode(raw)) + + +def publish_attestation(report_hash: bytes) -> dict[str, Any]: + """Publish a memo-bearing transaction to Solana devnet. + + Returns {tx_sig, pubkey, explorer_url, chain, status, report_hash_hex}. + Status is 'published' on success, 'local_only' if Solana is unconfigured, + or 'failed' if the broadcast errored (we still return locally). + """ + network = os.environ.get("CONCLAVE_SOLANA_NETWORK", DEFAULT_NETWORK) + rpc_url = os.environ.get("CONCLAVE_SOLANA_RPC_URL", DEFAULT_RPC) + report_hash_hex = report_hash.hex() + + if not is_configured(): + logger.info("solana: keypair unconfigured, recording local-only attestation") + return { + "tx_sig": None, + "pubkey": None, + "explorer_url": None, + "chain": f"solana-{network}", + "status": "local_only", + "report_hash_hex": report_hash_hex, + } + + try: + from solana.rpc.api import Client # type: ignore + from solders.instruction import Instruction, AccountMeta # type: ignore + from solders.message import Message # type: ignore + from solders.pubkey import Pubkey # type: ignore + from solders.transaction import Transaction # type: ignore + + keypair = _load_keypair() + client = Client(rpc_url) + memo_payload = f"conclave-attestation:{report_hash_hex}".encode("utf-8") + + instruction = Instruction( + program_id=Pubkey.from_string(MEMO_PROGRAM_ID), + accounts=[AccountMeta(pubkey=keypair.pubkey(), is_signer=True, is_writable=False)], + data=memo_payload, + ) + + recent_blockhash = client.get_latest_blockhash().value.blockhash + message = Message.new_with_blockhash([instruction], keypair.pubkey(), recent_blockhash) + tx = Transaction([keypair], message, recent_blockhash) + resp = client.send_transaction(tx) + tx_sig = str(resp.value) + + return { + "tx_sig": tx_sig, + "pubkey": str(keypair.pubkey()), + "explorer_url": _explorer_url(tx_sig, network), + "chain": f"solana-{network}", + "status": "published", + "report_hash_hex": report_hash_hex, + } + except Exception as e: + logger.error("solana: publish failed: %s", e, exc_info=True) + return { + "tx_sig": None, + "pubkey": None, + "explorer_url": None, + "chain": f"solana-{network}", + "status": "failed", + "report_hash_hex": report_hash_hex, + "error": str(e), + } diff --git a/requirements.txt b/requirements.txt index 1a10176..f2eb1cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,5 @@ pandas langgraph-cli[inmem] pdfplumber python-docx +solders +solana diff --git a/storage/__init__.py b/storage/__init__.py index f02482a..36e6640 100644 --- a/storage/__init__.py +++ b/storage/__init__.py @@ -36,6 +36,9 @@ # evaluation runs record_evaluation_run, list_evaluation_runs, + # attestations + record_attestation, + list_attestations, ) __all__ = [ @@ -62,4 +65,6 @@ "set_registration_token", "record_evaluation_run", "list_evaluation_runs", + "record_attestation", + "list_attestations", ] diff --git a/storage/sqlite.py b/storage/sqlite.py index 3593a2f..fa1292e 100644 --- a/storage/sqlite.py +++ b/storage/sqlite.py @@ -101,6 +101,7 @@ def _init_schema(conn: sqlite3.Connection) -> None: tx_sig TEXT, chain TEXT NOT NULL DEFAULT 'solana-devnet', published_at TEXT NOT NULL, + data TEXT, PRIMARY KEY (instance_id, report_hash) ); """ @@ -385,3 +386,47 @@ def list_evaluation_runs(instance_id: str) -> list[dict]: "snapshot": json.loads(row["data"]) if row["data"] else None, }) return out + + +# --- Attestations --- + +def record_attestation( + instance_id: str, + report_hash: str, + tx_sig: str | None, + chain: str, + extra: dict | None = None, +) -> None: + """Persist one attestation. report_hash is hex-encoded SHA-256.""" + extras_json = json.dumps(_to_jsonable(extra)) if extra else None + _get_conn().execute( + """ + INSERT INTO attestations (instance_id, report_hash, tx_sig, chain, published_at, data) + VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT(instance_id, report_hash) DO UPDATE SET + tx_sig = excluded.tx_sig, + chain = excluded.chain, + published_at = excluded.published_at, + data = excluded.data + """, + (instance_id, report_hash, tx_sig, chain, _now(), extras_json), + ) + + +def list_attestations(instance_id: str) -> list[dict]: + rows = _get_conn().execute( + "SELECT report_hash, tx_sig, chain, published_at, data FROM attestations " + "WHERE instance_id = ? ORDER BY published_at ASC", + (instance_id,), + ).fetchall() + out = [] + for row in rows: + extras = json.loads(row["data"]) if row["data"] else {} + out.append({ + "report_hash": row["report_hash"], + "tx_sig": row["tx_sig"], + "chain": row["chain"], + "published_at": row["published_at"], + **extras, + }) + return out diff --git a/tests/test_attestation.py b/tests/test_attestation.py new file mode 100644 index 0000000..2706ce6 --- /dev/null +++ b/tests/test_attestation.py @@ -0,0 +1,85 @@ +"""Tests for the Solana attestation path. + +By default (no CONCLAVE_SOLANA_KEYPAIR set), publish_attestation runs in +local-only mode and skips the network call. These tests cover that path +plus the API endpoints and storage. The actual devnet broadcast is +exercised manually during the smoke test in phase 9. +""" +from __future__ import annotations +import os +os.environ.setdefault("CONCLAVE_DB_PATH", ":memory:") +os.environ.setdefault("CONCLAVE_DISABLE_SCHEDULER", "1") + +import pytest +from fastapi.testclient import TestClient + +import storage +from infra import solana +from tests.test_e2e import _setup_instance, _fake_run_skill # noqa: F401 +from unittest.mock import patch +from skills.hackathon_novelty import skill_card + + +@pytest.fixture(autouse=True) +def clear_stores(): + storage.reset_all() + yield + + +@pytest.fixture +def client(): + from main import app + return TestClient(app) + + +def test_hash_report_is_deterministic(): + a = [{"submission_id": "x", "novelty_score": 0.5}, {"submission_id": "y", "novelty_score": 0.3}] + b = list(reversed(a)) # same content, different order + assert solana.hash_report(a) == solana.hash_report(b) + + +def test_hash_report_changes_with_payload(): + a = [{"submission_id": "x", "novelty_score": 0.5}] + b = [{"submission_id": "x", "novelty_score": 0.6}] + assert solana.hash_report(a) != solana.hash_report(b) + + +def test_publish_attestation_local_only_when_unconfigured(monkeypatch): + monkeypatch.delenv("CONCLAVE_SOLANA_KEYPAIR", raising=False) + record = solana.publish_attestation(b"\x00" * 32) + assert record["status"] == "local_only" + assert record["tx_sig"] is None + assert record["report_hash_hex"] == "00" * 32 + + +def test_publish_endpoint_records_attestation(client): + """Admin-only POST /attestations/publish records an attestation row even in local_only mode.""" + with patch.object(skill_card, "run", _fake_run_skill): + instance_id, admin_token = _setup_instance() + user_token = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] + client.post( + "/submit", + json={"submission_id": "sub_1", "idea_text": "An idea"}, + headers={"X-Instance-Token": user_token}, + ) + client.post("/trigger", headers={"X-Instance-Token": admin_token}) + + r = client.post("/attestations/publish", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + latest = r.json()["latest"] + assert latest is not None + assert latest["status"] == "local_only" + assert latest["report_hash"] + + # Listed via GET /attestations + r = client.get("/attestations", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + att = r.json()["attestations"] + assert len(att) == 1 + + +def test_publish_endpoint_admin_only(client): + instance_id, _ = _setup_instance() + user_token = client.post("/register", json={"instance_id": instance_id}).json()["user_token"] + r = client.post("/attestations/publish", headers={"X-Instance-Token": user_token}) + assert r.status_code == 403 From 7292a77c29a5b6eadb45a445dd87b5ffdd14867c Mon Sep 17 00:00:00 2001 From: Prakhar Ojha <68009969+prakhar728@users.noreply.github.com> Date: Wed, 6 May 2026 00:29:48 +0530 Subject: [PATCH 10/22] Frontend: drop participant pages, template gallery, procurement components MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete app/templates/, app/access/, app/i/[id]/ — replaced by the agent-skill flow (participants never touch the web UI in the pivot). - Delete component shells used only by procurement / template gallery: template-card, procurement-scorecard, negotiation-panel, dataset-upload-card, procurement-policy-preview, release-token-card, hard-constraints-card, milestone-breakdown, chat-message. - Stub app/page.tsx, app/setup/page.tsx, app/dashboard/[id]/page.tsx so the codebase compiles between phases. Real implementations land in frontend phases 3, 4, 5. lib/api.ts and lib/types.ts still reference procurement types; those get removed in frontend phase 2. Frontend phase 1 of pivot/agent-skill. --- client/apps/web/app/access/page.tsx | 118 --- client/apps/web/app/dashboard/[id]/page.tsx | 721 +------------ client/apps/web/app/i/[id]/page.tsx | 964 ------------------ client/apps/web/app/page.tsx | 480 +-------- client/apps/web/app/setup/page.tsx | 482 +-------- client/apps/web/app/templates/page.tsx | 59 -- client/apps/web/components/chat-message.tsx | 111 -- .../web/components/dataset-upload-card.tsx | 256 ----- .../web/components/hard-constraints-card.tsx | 43 - .../web/components/milestone-breakdown.tsx | 46 - .../apps/web/components/negotiation-panel.tsx | 256 ----- .../components/procurement-policy-preview.tsx | 130 --- .../web/components/procurement-scorecard.tsx | 89 -- .../web/components/release-token-card.tsx | 78 -- client/apps/web/components/template-card.tsx | 163 --- 15 files changed, 27 insertions(+), 3969 deletions(-) delete mode 100644 client/apps/web/app/access/page.tsx delete mode 100644 client/apps/web/app/i/[id]/page.tsx delete mode 100644 client/apps/web/app/templates/page.tsx delete mode 100644 client/apps/web/components/chat-message.tsx delete mode 100644 client/apps/web/components/dataset-upload-card.tsx delete mode 100644 client/apps/web/components/hard-constraints-card.tsx delete mode 100644 client/apps/web/components/milestone-breakdown.tsx delete mode 100644 client/apps/web/components/negotiation-panel.tsx delete mode 100644 client/apps/web/components/procurement-policy-preview.tsx delete mode 100644 client/apps/web/components/procurement-scorecard.tsx delete mode 100644 client/apps/web/components/release-token-card.tsx delete mode 100644 client/apps/web/components/template-card.tsx diff --git a/client/apps/web/app/access/page.tsx b/client/apps/web/app/access/page.tsx deleted file mode 100644 index 0547951..0000000 --- a/client/apps/web/app/access/page.tsx +++ /dev/null @@ -1,118 +0,0 @@ -"use client" - -import * as React from "react" -import { useRouter } from "next/navigation" -import { Lock, ShieldCheck, CircleNotch, ArrowRight } from "@phosphor-icons/react" -import { api, ApiError } from "@/lib/api" -import Link from "next/link" - -export default function AccessPage() { - const router = useRouter() - const [token, setToken] = React.useState("") - const [loading, setLoading] = React.useState(false) - const [error, setError] = React.useState("") - - async function handleAccess() { - const t = token.trim() - if (!t) return - setLoading(true) - setError("") - try { - const { instance_id, role } = await api.resolveToken(t) - if (role !== "admin") { - setError("This token doesn't have admin access. Make sure you're using the admin token issued at setup.") - setLoading(false) - return - } - // Persist token in the same format the dashboard expects - const existing = localStorage.getItem(`ndai_instance_${instance_id}`) - const data = existing ? JSON.parse(existing) : {} - localStorage.setItem( - `ndai_instance_${instance_id}`, - JSON.stringify({ ...data, instance_id, admin_token: t }), - ) - router.push(`/dashboard/${instance_id}`) - } catch (err) { - if (err instanceof ApiError && err.status === 403) { - setError("Invalid or expired token.") - } else { - setError("Could not reach the enclave. Make sure the server is running.") - } - setLoading(false) - } - } - - return ( -
- {/* Nav */} -
-
- -
- -
- NDAI - - - Create new instance - -
-
- - {/* Content */} -
-
- {/* Header */} -
-
- -
-

- Access your dashboard -

-

- Paste the admin token you received when you created the instance. -

-
- - {/* Card */} -
-
- - { setToken(e.target.value); setError("") }} - onKeyDown={(e) => e.key === "Enter" && handleAccess()} - placeholder="adm_…" - autoFocus - className="w-full rounded-xl border border-[#d2d2d7] bg-[#f5f5f7] px-4 py-3 text-sm font-mono text-[#1d1d1f] placeholder:text-[#aeaeb2] focus:outline-none focus:border-primary/50 focus:ring-1 focus:ring-primary/20 transition-all" - /> - {error && ( -

{error}

- )} -
- - -
- -

- Don't have a token?{" "} - - Create an instance - -

-
-
-
- ) -} diff --git a/client/apps/web/app/dashboard/[id]/page.tsx b/client/apps/web/app/dashboard/[id]/page.tsx index d6d65d1..69cac7c 100644 --- a/client/apps/web/app/dashboard/[id]/page.tsx +++ b/client/apps/web/app/dashboard/[id]/page.tsx @@ -1,719 +1,12 @@ -"use client" - -import * as React from "react" -import { use } from "react" -import { - Copy, - Check, - Lightning, - ArrowCounterClockwise, - Export, - ShieldCheck, - CaretDown, -} from "@phosphor-icons/react" -import { EnclaveSigBadge } from "@/components/enclave-sig-badge" -import { StatusPill } from "@/components/status-pill" -import { FieldCell, ResultExpandedRow } from "@/components/result-renderer" -import { HardConstraintsCard } from "@/components/hard-constraints-card" -import { MilestoneBreakdown } from "@/components/milestone-breakdown" -import { ProcurementScorecard } from "@/components/procurement-scorecard" -import { NegotiationPanel } from "@/components/negotiation-panel" -import { ReleaseTokenCard } from "@/components/release-token-card" -import { api } from "@/lib/api" -import type { DisplayMap, NoveltyResult, ProcurementResult, SubmissionMeta } from "@/lib/types" -import { cn } from "@workspace/ui/lib/utils" -import Link from "next/link" -import { ArrowLeft } from "@phosphor-icons/react" - -type Tab = "overview" | "submissions" | "results" | "deals" | "traces" - -export default function DashboardPage({ params }: { params: Promise<{ id: string }> }) { - const { id } = use(params) - const [tab, setTab] = React.useState("overview") - const [adminToken, setAdminToken] = React.useState(null) - const [tokenInput, setTokenInput] = React.useState("") - const [isProcurement, setIsProcurement] = React.useState(false) - - // Hackathon state - const [results, setResults] = React.useState([]) - const [display, setDisplay] = React.useState({}) - const [submissionMetas, setSubmissionMetas] = React.useState([]) - const [triggering, setTriggering] = React.useState(false) - const [triggered, setTriggered] = React.useState(false) - - // Procurement state - const [procResults, setProcResults] = React.useState([]) - const [dealActions, setDealActions] = React.useState>({}) - - const [subCount, setSubCount] = React.useState(0) - const [threshold, setThreshold] = React.useState(5) - const [copied, setCopied] = React.useState(false) - - React.useEffect(() => { - const raw = localStorage.getItem(`ndai_instance_${id}`) - if (raw) { - const data = JSON.parse(raw) - setAdminToken(data.admin_token) - } - }, [id]) - - React.useEffect(() => { - async function fetchStatus() { - try { - const inst = await api.checkInstance(id) - setSubCount(inst.submissions) - setThreshold(inst.threshold) - if (inst.triggered) setTriggered(true) - const proc = inst.skill_name === "confidential_data_procurement" - setIsProcurement(proc) - if (!proc && inst.skill_name) { - api.getSkill(inst.skill_name).then((card) => { - if (card.user_display) setDisplay(card.user_display) - }).catch(() => {}) - } - } catch { - // ignore - } - } - fetchStatus() - const interval = setInterval(fetchStatus, 10000) - return () => clearInterval(interval) - }, [id]) - - React.useEffect(() => { - if (!adminToken) return - api.checkInstance(id).then((inst) => { - if (inst.skill_name === "confidential_data_procurement") { - api.getProcurementResults(adminToken).then((r) => { - if (r.results.length > 0) setProcResults(r.results) - }).catch(() => {}) - } else { - api.getAllResults(adminToken).then((r) => { - if (r.results.length > 0) { - setResults(r.results) - setTriggered(true) - } - }).catch(() => {}) - api.getSubmissions(adminToken).then((r) => { - if (r.submissions.length > 0) setSubmissionMetas(r.submissions) - }).catch(() => {}) - } - }).catch(() => {}) - }, [adminToken]) - - async function runAnalysis() { - if (!adminToken) return - setTriggering(true) - await api.trigger(adminToken) - const [r, s] = await Promise.all([ - api.getAllResults(adminToken), - api.getSubmissions(adminToken), - ]) - setResults(r.results) - setSubmissionMetas(s.submissions) - setTriggered(true) - setTriggering(false) - } - - async function copyLink() { - const url = `${window.location.origin}/i/${id}` - await navigator.clipboard.writeText(url) - setCopied(true) - setTimeout(() => setCopied(false), 2000) - } - - async function handleAccept(procResult: ProcurementResult) { - if (!adminToken) return - await api.acceptDeal(adminToken, procResult.submission_id) - const token = await api.getReleaseToken(adminToken, procResult.submission_id) - setDealActions((a) => ({ ...a, [procResult.submission_id]: "accepted" })) - setProcResults((rs) => - rs.map((r) => - r.submission_id === procResult.submission_id - ? { ...r, release_token: token, negotiation: { ...r.negotiation, state: "accepted" }, settlement: { state: "authorized", amount: r.proposed_payment } } - : r, - ), - ) - } - - async function handleReject(procResult: ProcurementResult) { - if (!adminToken) return - await api.rejectDeal(adminToken, procResult.submission_id) - setDealActions((a) => ({ ...a, [procResult.submission_id]: "rejected" })) - setProcResults((rs) => - rs.map((r) => - r.submission_id === procResult.submission_id - ? { ...r, negotiation: { ...r.negotiation, state: "rejected" }, settlement: { state: "failed" } } - : r, - ), - ) - } - - async function handleRenegotiate(procResult: ProcurementResult, revisedBudget: number) { - if (!adminToken) return - await api.requestNegotiation(adminToken, procResult.submission_id, revisedBudget) - setDealActions((a) => ({ ...a, [procResult.submission_id]: "renegotiating" })) - setProcResults((rs) => - rs.map((r) => - r.submission_id === procResult.submission_id - ? { ...r, negotiation: { state: "requested_by_buyer", revised_budget: revisedBudget, used: true } } - : r, - ), - ) - } - - // Token gate - if (!adminToken) { - return ( -
-
-
- -
-

Enter admin token

-

- Paste the admin token issued when you created this instance. -

- setTokenInput(e.target.value)} - placeholder="adm_…" - className="w-full rounded-xl border border-[#d2d2d7] bg-[#f5f5f7] px-4 py-2.5 text-sm font-mono text-[#1d1d1f] placeholder:text-[#aeaeb2] focus:outline-none focus:border-primary/50 focus:ring-1 focus:ring-primary/20 mb-3 transition-all" - /> - -
-
- ) - } - - const participantUrl = `${typeof window !== "undefined" ? window.location.origin : ""}/i/${id}` - const status = triggered || procResults.length > 0 - ? "complete" - : subCount >= threshold - ? "analyzing" - : "accepting" - - // Tabs vary by protocol - const tabs: Tab[] = isProcurement - ? ["overview", "submissions", "deals", "traces"] - : ["overview", "submissions", "results", "traces"] - - // Procurement deal stats - const acceptedDeals = procResults.filter((r) => r.negotiation.state === "accepted").length - const pendingDeals = procResults.filter((r) => r.negotiation.state === "none").length - const totalPayment = procResults - .filter((r) => r.settlement.state === "authorized") - .reduce((sum, r) => sum + (r.settlement.amount ?? 0), 0) - - return ( -
- {/* Top bar */} -
-
-
-
- - - -
-
-

- {isProcurement ? "Confidential Data Procurement" : "Hackathon Novelty"} -

- -
-

{id}

-
-
- -
- - {/* Tabs */} -
- {tabs.map((t) => ( - - ))} -
-
-
- -
- - {/* ── OVERVIEW ── */} - {tab === "overview" && ( -
- {isProcurement ? ( - <> -
- {[ - { label: "Seller Submissions", value: subCount }, - { label: "Evaluated", value: procResults.length }, - { label: "Deals Closed", value: acceptedDeals }, - { label: "Total Settled", value: totalPayment > 0 ? `$${totalPayment.toLocaleString()}` : "—" }, - ].map(({ label, value }) => ( -
-

{label}

-

{value}

-
- ))} -
- - {procResults.length > 0 && ( -
-

Deal Pipeline

-
- {procResults.map((r) => ( -
- {r.submission_id} -
- ${r.proposed_payment.toLocaleString()} - -
-
- ))} -
-
- )} - - {procResults.length === 0 && ( -
-

No evaluated submissions yet.

-

- Sellers submit datasets via the seller link. Results appear here after the enclave evaluates each one. -

-
- )} - - ) : ( - <> -
- {[ - { label: "Submissions", value: subCount }, - { label: "Threshold", value: threshold }, - { label: "Analyzed", value: triggered ? results.length : "—" }, - { label: "Status", value: triggered ? "Complete" : "Open" }, - ].map(({ label, value }) => ( -
-

{label}

-

{value}

-
- ))} -
-
-
-

Submission progress

- {subCount} / {threshold} — analysis triggers at {threshold} -
-
-
-
-
-
- - {triggered && ( - - )} -
- - )} -
- )} - - {/* ── SUBMISSIONS ── */} - {tab === "submissions" && ( -
-
- {isProcurement - ? "Dataset content is processed inside the enclave only. Raw rows are never visible here." - : "Submission content is processed inside the enclave only. You cannot read raw submissions."} -
-
- - - - {isProcurement - ? ["#", "Submitted at", "Dataset", "Eval status", "Score", "Payment"].map((h) => ( - - )) - : ["#", "Submitted at", "Text", "PDF", "GitHub", "Status"].map((h) => ( - - )) - } - - - - {isProcurement - ? procResults.length > 0 - ? procResults.map((r, i) => ( - - - - - - - - - )) - : Array.from({ length: subCount }).map((_, i) => ( - - - - - - - - - )) - : submissionMetas.length > 0 - ? submissionMetas.map((s, i) => ( - - - - - - - - - )) - : Array.from({ length: subCount }).map((_, i) => ( - - - - - - - - - )) - } - -
{h}{h}
{i + 1} - {new Date(Date.now() - i * 3600000).toLocaleString()} - {r.submission_id} - - - {(r.partial_score * 100).toFixed(0)} - - ${r.proposed_payment.toLocaleString()} -
{i + 1} - {new Date(Date.now() - i * 3600000).toLocaleString()} - - pending -
{i + 1} - {s.submitted_at ? new Date(s.submitted_at).toLocaleString() : "—"} - {s.has_text ? : }{s.has_file ? : }{s.has_repo ? : } - received -
{i + 1} - received -
-
-
- )} - - {/* ── DEALS (procurement) ── */} - {tab === "deals" && isProcurement && ( -
- {procResults.length === 0 ? ( -
-

No evaluated datasets yet.

-

Results appear here automatically after the enclave evaluates each seller submission.

-
- ) : ( - <> -
- - Results signed by enclave -
- {procResults.map((r) => ( - handleAccept(r)} - onReject={() => handleReject(r)} - onRenegotiate={(val) => handleRenegotiate(r, val)} - /> - ))} - - )} -
- )} - - {/* ── RESULTS (hackathon) ── */} - {tab === "results" && !isProcurement && ( -
- {!triggered ? ( -
-

No results yet. Run analysis to see results.

- -
- ) : ( -
-
- - Results signed by enclave -
- -
- )} -
- )} - - {/* ── TRACES ── */} - {tab === "traces" && ( -
-

Trace data will appear here after analysis runs.

-

- {isProcurement - ? "Traces show which evaluation tools ran, output filter pass/fail per constraint, and claim-verification results. No raw dataset content." - : "Traces show which tools Claude called per submission, output filter pass/fail, and jailbreak test results. They contain no raw submission content."} -

-
- )} -
-
- ) -} - -// --------------------------------------------------------------------------- -// Procurement deal card -// --------------------------------------------------------------------------- - -function DealCard({ - result, - onAccept, - onReject, - onRenegotiate, -}: { - result: ProcurementResult - onAccept: () => void - onReject: () => void - onRenegotiate: (revisedBudget: number) => void -}) { - const [expanded, setExpanded] = React.useState(false) - +// Minimal stub — operator cohort dashboard is implemented in frontend phase 5. +// See plans/ui_changes.md §1. +export default function DashboardStub() { return ( -
- {/* Header row */} -
setExpanded((v) => !v)} - > -
- {result.submission_id} - -
-
-
-

Score

-

{(result.partial_score * 100).toFixed(0)}

-
-
-

Proposed

-

${result.proposed_payment.toLocaleString()}

-
- -
+
+
+

Dashboard

+

Operator cohort view lands in phase 5.

- - {/* Expanded deal detail */} - {expanded && ( -
- - - - - {/* Claim results */} - {Object.keys(result.claim_results).length > 0 && ( -
-

Claim Verification

-
- {Object.entries(result.claim_results).map(([claim, passed]) => ( -
- - {passed ? "✓" : "✗"} - - {claim} -
- ))} -
-
- )} - - - - {result.release_token && } - - {result.enclave_signature && ( - - )} -
- )}
) } - -function DealStatusBadge({ - state, - settlement, -}: { - state: string - settlement: string -}) { - if (settlement === "authorized") - return Settled - if (state === "accepted") - return Accepted - if (state === "rejected") - return Rejected - if (["requested_by_buyer", "requested_by_seller", "awaiting_counterparty", "renegotiation_submitted"].includes(state)) - return Renegotiating - return Pending decision -} - -// --------------------------------------------------------------------------- -// Hackathon results table (unchanged) -// --------------------------------------------------------------------------- - -function ResultsTable({ results, display }: { results: NoveltyResult[]; display: DisplayMap }) { - const colFields = Object.entries(display).filter(([, h]) => h.type !== "score_table") - const colCount = 1 + colFields.length - - return ( -
- - - - - {colFields.map(([key, hint]) => ( - - ))} - - - - {results.map((r) => ( - - ))} - -
Submission ID{hint.label}
-
- ) -} - -function HackathonResultRow({ - result, - colFields, - colCount, - display, -}: { - result: NoveltyResult - colFields: [string, import("@/lib/types").DisplayHint][] - colCount: number - display: DisplayMap -}) { - const [expanded, setExpanded] = React.useState(false) - const row = result as unknown as Record - - return ( - <> - setExpanded((v) => !v)} - > - {result.submission_id} - {colFields.map(([key, hint]) => ( - - - - ))} - - {expanded && ( - - - - {result.enclave_signature && ( -
- -
- )} - - - )} - - ) -} diff --git a/client/apps/web/app/i/[id]/page.tsx b/client/apps/web/app/i/[id]/page.tsx deleted file mode 100644 index 2f88dfe..0000000 --- a/client/apps/web/app/i/[id]/page.tsx +++ /dev/null @@ -1,964 +0,0 @@ -"use client" - -import * as React from "react" -import { use } from "react" -import { - Lock, - Check, - X, - GithubLogo, - FilePdf, - ArrowRight, - CircleNotch, - ShieldCheck, -} from "@phosphor-icons/react" -import { AttestationWidget } from "@/components/attestation-widget" -import { EnclaveSigBadge } from "@/components/enclave-sig-badge" -import { ResultDetail } from "@/components/result-renderer" -import { DatasetUploadCard } from "@/components/dataset-upload-card" -import { ProcurementScorecard } from "@/components/procurement-scorecard" -import { HardConstraintsCard } from "@/components/hard-constraints-card" -import { MilestoneBreakdown } from "@/components/milestone-breakdown" -import { NegotiationPanel } from "@/components/negotiation-panel" -import { ReleaseTokenCard } from "@/components/release-token-card" -import { api, ApiError } from "@/lib/api" -import type { - DisplayMap, - NoveltyResult, - ProcurementResult, - NegotiationStatus, - SellerClaim, - SubmitResponse, -} from "@/lib/types" -import { cn } from "@workspace/ui/lib/utils" -import { Suspense } from "react" - -type PageState = - | "login" - | "attest" - | "form" - | "pending" - | "results" - // Procurement-specific - | "uploading" - | "pending_evaluation" - | "evaluation_complete" - | "awaiting_negotiation" - | "released" - | "rejected" - -const TOKEN_CACHE_KEY = (instanceId: string) => `conclave_user_token_${instanceId}` - -function ParticipantContent({ id }: { id: string }) { - const [pageState, setPageState] = React.useState("login") - const [userToken, setUserToken] = React.useState("") - const [instanceMissing, setInstanceMissing] = React.useState(false) - const [isProcurement, setIsProcurement] = React.useState(false) - const [toast, setToast] = React.useState(null) - const [userIdentity, setUserIdentity] = React.useState(null) - - function showToast(msg: string) { - setToast(msg) - setTimeout(() => setToast(null), 4000) - } - - function handleAuthError(err: unknown) { - if (err instanceof ApiError && err.status === 403) { - localStorage.removeItem(TOKEN_CACHE_KEY(id)) - setUserToken("") - setPageState("login") - showToast("Your session has expired. Please log in again.") - } - } - - // --- OTP auth state --- - const [email, setEmail] = React.useState("") - const [otpCode, setOtpCode] = React.useState("") - const [otpSent, setOtpSent] = React.useState(false) - const [authLoading, setAuthLoading] = React.useState(false) - const [authError, setAuthError] = React.useState("") - - async function checkPriorSubmission(token: string, procurementMode: boolean) { - try { - const { submission_ids } = await api.getMySubmissions(token) - if (submission_ids.length === 0) { - setPageState("attest") - return - } - const sid = submission_ids[0]! - setSubmissionId(sid) - if (procurementMode) { - try { - const r = await api.getProcurementResult(token, sid) - setProcResult(r) - setPageState("evaluation_complete") - } catch { - setPageState("pending_evaluation") - } - } else { - try { - const r = await api.getOwnResult(token, sid) - setResult(r) - setPageState("results") - } catch { - const inst = await api.checkInstance(id) - setSubmitResponse({ - submission_id: sid, - status: "received_pending", - submissions_count: inst.submissions, - threshold: inst.threshold, - }) - setPageState("pending") - } - } - } catch (err) { - handleAuthError(err) - if (!(err instanceof ApiError && err.status === 403)) { - setPageState("attest") - } - } - } - - React.useEffect(() => { - const cached = localStorage.getItem(TOKEN_CACHE_KEY(id)) - - api.checkInstance(id).then(async (inst) => { - const proc = inst.skill_name === "confidential_data_procurement" - setIsProcurement(proc) - if (!proc && inst.skill_name) { - const card = await api.getSkill(inst.skill_name).catch(() => null) - if (card?.user_display) setSkillDisplay(card.user_display) - } - if (cached) { - setUserToken(cached) - await checkPriorSubmission(cached, proc) - } - }).catch(() => setInstanceMissing(true)) - - if (cached) { - return - } - import("@/lib/supabase").then(({ supabase }) => { - supabase.auth.getSession().then(async ({ data }) => { - const access_token = data.session?.access_token - if (!access_token) return - setAuthLoading(true) - const user = data.session?.user - if (user) { - const provider = user.app_metadata?.provider ?? "email" - if (provider === "github") { - setUserIdentity(`GitHub: ${user.user_metadata?.user_name ?? user.email}`) - } else if (provider === "google") { - setUserIdentity(`Google: ${user.email}`) - } else { - setUserIdentity(user.email ?? null) - } - } - try { - const { user_token } = await api.verifyToken(access_token, id) - saveToken(user_token) - const inst = await api.checkInstance(id) - const proc = inst.skill_name === "confidential_data_procurement" - if (!proc && inst.skill_name) { - const card = await api.getSkill(inst.skill_name).catch(() => null) - if (card?.user_display) setSkillDisplay(card.user_display) - } - await checkPriorSubmission(user_token, proc) - } catch (err) { - handleAuthError(err) - } - setAuthLoading(false) - }) - }) - }, [id]) - - function saveToken(token: string) { - setUserToken(token) - localStorage.setItem(TOKEN_CACHE_KEY(id), token) - } - - async function handleSendOtp() { - if (!email.trim()) return - setAuthLoading(true) - setAuthError("") - try { - await api.sendOtp(email.trim(), id) - setOtpSent(true) - } catch { - setAuthError("Failed to send OTP. Check the email and try again.") - } - setAuthLoading(false) - } - - async function handleVerifyOtp() { - if (!otpCode.trim()) return - setAuthLoading(true) - setAuthError("") - try { - const { user_token } = await api.verifyOtp(email.trim(), otpCode.trim(), id) - saveToken(user_token) - setUserIdentity(email.trim()) - setPageState("attest") - } catch { - setAuthError("Invalid or expired OTP. Try again.") - } - setAuthLoading(false) - } - - const [skillDisplay, setSkillDisplay] = React.useState({}) - - // Hackathon form state - const [ideaText, setIdeaText] = React.useState("") - const [repoUrl, setRepoUrl] = React.useState("") - const [repoSummary, setRepoSummary] = React.useState(null) - const [repoLoading, setRepoLoading] = React.useState(false) - const [githubConnected, setGithubConnected] = React.useState(false) - const [submitting, setSubmitting] = React.useState(false) - const [submitResponse, setSubmitResponse] = React.useState(null) - const [result, setResult] = React.useState(null) - const [submissionId, setSubmissionId] = React.useState("") - - // Procurement seller state - const [datasetName, setDatasetName] = React.useState("") - const [datasetReference, setDatasetReference] = React.useState("") - const [datasetFile, setDatasetFile] = React.useState(null) - const [metadataFile, setMetadataFile] = React.useState(null) - const [reservePrice, setReservePrice] = React.useState("") - const [sellerClaims, setSellerClaims] = React.useState([]) - const [sellerNote, setSellerNote] = React.useState("") - const [procResult, setProcResult] = React.useState(null) - - // Procurement polling — pending_evaluation: wait for first result - React.useEffect(() => { - if (pageState !== "pending_evaluation" || !userToken || !submissionId) return - const interval = setInterval(async () => { - try { - const r = await api.getProcurementResult(userToken, submissionId) - setProcResult(r) - setPageState("evaluation_complete") - clearInterval(interval) - } catch { - // Not ready yet - } - }, 8000) - return () => clearInterval(interval) - }, [pageState, userToken, submissionId]) - - // Procurement polling — evaluation_complete / awaiting_negotiation: refresh negotiation state - React.useEffect(() => { - if ( - (pageState !== "evaluation_complete" && pageState !== "awaiting_negotiation") || - !userToken || !submissionId - ) return - const interval = setInterval(async () => { - try { - const r = await api.getProcurementResult(userToken, submissionId) - setProcResult(r) - // Advance page state only on authorization (rejection is shown inline) - if (r.settlement.state === "authorized") setPageState("released") - else if (r.negotiation.state === "requested_by_buyer" && pageState !== "evaluation_complete") { - // Buyer responded — bring seller back to evaluation_complete to show action buttons - setPageState("evaluation_complete") - } - } catch { - // Ignore transient errors - } - }, 5000) - return () => clearInterval(interval) - }, [pageState, userToken, submissionId]) - - // Hackathon polling - React.useEffect(() => { - if (pageState !== "pending" || !submitResponse || !userToken) return - const interval = setInterval(async () => { - try { - const r = await api.getOwnResult(userToken, submissionId) - setResult(r) - setPageState("results") - clearInterval(interval) - } catch { - // Not ready yet - } - }, 8000) - return () => clearInterval(interval) - }, [pageState, submitResponse, userToken, submissionId]) - - async function fetchRepo() { - if (!repoUrl || !userToken) return - setRepoLoading(true) - try { - const r = await api.fetchRepo(userToken, repoUrl) - setRepoSummary(r.repo_summary) - } catch { - // ignore - } - setRepoLoading(false) - } - - async function handleHackathonSubmit() { - if (!ideaText.trim() || !userToken) return - setSubmitting(true) - const res = await api.submit(userToken, { - idea_text: ideaText, - repo_summary: repoSummary ?? "", - deck_text: "", - }) - setSubmissionId(res.submission_id) - setSubmitResponse(res) - setSubmitting(false) - setPageState("pending") - } - - async function handleDatasetSubmit() { - if (!datasetName.trim() || !reservePrice || !userToken || !datasetFile) return - setPageState("uploading") - try { - const res = await api.submitDataset( - userToken, - { - dataset_name: datasetName, - dataset_reference: datasetReference || undefined, - seller_claims: sellerClaims, - metadata: {}, - reserve_price: parseFloat(reservePrice.replace(/,/g, "")), - note: sellerNote || undefined, - }, - datasetFile, - metadataFile, - ) - setSubmissionId(res.submission_id) - setPageState("pending_evaluation") - } catch (err) { - handleAuthError(err) - if (!(err instanceof ApiError && err.status === 403)) { - showToast("Upload failed. Please check your file and try again.") - setPageState("form") - } - } - } - - async function handleAccept() { - if (!procResult || !userToken) return - await api.acceptDeal(userToken, procResult.submission_id) - const updated = await api.getProcurementResult(userToken, procResult.submission_id) - setProcResult(updated) - setPageState("released") - } - - async function handleReject() { - if (!procResult || !userToken) return - await api.rejectDeal(userToken, procResult.submission_id) - const updated = await api.getProcurementResult(userToken, procResult.submission_id) - setProcResult(updated) - setPageState("rejected") - } - - async function handleRenegotiate(revisedValue: number) { - if (!procResult || !userToken) return - await api.submitRenegotiation(userToken, procResult.submission_id, revisedValue) - const updated = await api.getProcurementResult(userToken, procResult.submission_id) - setProcResult(updated) - setPageState("awaiting_negotiation") - } - - async function handleLogout() { - const { supabase } = await import("@/lib/supabase") - await supabase.auth.signOut() - localStorage.removeItem(TOKEN_CACHE_KEY(id)) - setUserToken("") - setPageState("login") - setOtpSent(false) - setOtpCode("") - setEmail("") - } - - const canHackathonSubmit = ideaText.trim().length > 20 && !submitting - const canDatasetSubmit = - datasetName.trim().length > 0 && - reservePrice.trim().length > 0 && - !!datasetFile && - pageState === "form" - - if (instanceMissing) { - return ( -
-
-
- ⚠️ -
-

Instance not found

-

- This submission link is invalid or has expired. Ask the organizer for a fresh link. -

-
-
- ) - } - - const procurementSteps = [ - "Login", "Verify", "Submit", "Evaluating", "Result", - ] as const - const procurementStateOrder: PageState[] = [ - "login", "attest", "form", "pending_evaluation", "evaluation_complete", - ] - - const hackathonSteps = ["Login", "Verify", "Submit", "Wait", "Results"] as const - const hackathonStateOrder: PageState[] = ["login", "attest", "form", "pending", "results"] - - const steps = isProcurement ? procurementSteps : hackathonSteps - const stateOrder = isProcurement ? procurementStateOrder : hackathonStateOrder - - return ( -
- {toast && ( -
- - {toast} -
- )} - - {pageState !== "login" && ( -
- {userIdentity && ( - {userIdentity} - )} - -
- )} - -
- {/* Header */} -
-
- - {isProcurement ? "Accepting datasets" : "Accepting submissions"} -
-

- {isProcurement ? "Confidential Data Procurement" : "Hackathon Novelty Scoring"} -

-

- {isProcurement - ? "Submit your dataset for confidential evaluation. Raw rows never leave the enclave before agreement." - : "Submit your idea for anonymous novelty scoring. Your data stays inside the enclave."} -

-
- - {/* Progress steps — hide for terminal procurement states */} - {!["released", "rejected", "awaiting_negotiation"].includes(pageState) && ( -
- {steps.map((label, i) => { - const done = stateOrder.indexOf(pageState) > i - const active = stateOrder.indexOf(pageState) === i - return ( - -
- - {done ? "✓" : i + 1} - - {label} -
- {i < steps.length - 1 && } -
- ) - })} -
- )} - - {/* ── LOGIN ── */} - {pageState === "login" && ( -
- {authLoading ? ( -
- -
- ) : !otpSent ? ( - <> - - -
-
- or -
-
- setEmail(e.target.value)} - onKeyDown={(e) => e.key === "Enter" && handleSendOtp()} - placeholder="you@example.com" - className="w-full rounded-xl border border-[#d2d2d7] bg-white px-4 py-2.5 text-sm text-[#1d1d1f] placeholder:text-[#aeaeb2] focus:outline-none focus:border-primary/50 focus:ring-1 focus:ring-primary/20 transition-all" - /> - {authError &&

{authError}

} - -

- We'll email you a 6-digit code. No password needed. -

- - ) : ( - <> -

- Code sent to {email} -

- setOtpCode(e.target.value.replace(/\D/g, "").slice(0, 6))} - onKeyDown={(e) => e.key === "Enter" && handleVerifyOtp()} - placeholder="123456" - className="w-full rounded-xl border border-[#d2d2d7] bg-white px-4 py-2.5 text-sm font-mono text-center tracking-widest text-[#1d1d1f] placeholder:text-[#aeaeb2] focus:outline-none focus:border-primary/50 focus:ring-1 focus:ring-primary/20 transition-all" - /> - {authError &&

{authError}

} - - - - )} -
- )} - - {/* ── ATTEST ── */} - {pageState === "attest" && ( -
- setPageState("form")} /> -

- The submission form unlocks after enclave verification. -

-
- )} - - {/* ── FORM — Hackathon ── */} - {pageState === "form" && !isProcurement && ( -
-
- Enclave verified -
-
-
- - {ideaText.length > 0 && ( - - Secured - - )} -
-