diff --git a/README.md b/README.md index 86393c3..bba4715 100644 --- a/README.md +++ b/README.md @@ -100,9 +100,10 @@ You can also add this line to your `.bashrc`, `.zshrc`, or environment setup scr ## Installation ```bash -pip install repello-agent-wiz + pip install repello-agent-wiz ``` + ## Prerequisites Before running any analysis commands, you must set your OpenAI API key as an environment variable: @@ -145,7 +146,6 @@ agent-wiz analyze --input agentchat_graph.json ``` This will generate a report like: `autogen_report.md` based on the provided graph and threat modeling frameworks. - __Run agent-wiz --help for more info:__ ```bash usage: agent-wiz [-h] {extract,analyze,visualize} ... diff --git a/pyproject.toml b/pyproject.toml index 1b05297..f077ed1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,8 @@ classifiers = [ dependencies = [ "openai", + "markdown", + "weasyprint", ] [project.scripts] diff --git a/src/repello_agent_wiz/analyzers/css.txt b/src/repello_agent_wiz/analyzers/css.txt new file mode 100644 index 0000000..501d756 --- /dev/null +++ b/src/repello_agent_wiz/analyzers/css.txt @@ -0,0 +1,99 @@ + @page { + size: A3 landscape; + margin-left: 1.5cm; + margin-right: 1.5cm; + margin-top: 1.5cm; + margin-bottom: 2.5cm; + + @bottom-center { + content: "Page " counter(page) " of " counter(pages); /* The page numbering */ + font-family: 'Helvetica', 'Arial', sans-serif; /* Optional: Style the footer text */ + font-size: 9pt; + color: #555; /* Dim color for footer */ + vertical-align: top; /* Align text within the margin box */ + padding-top: 5pt; /* Space above the page number */ + } + } + @page :first { + margin-top: 0; + } + body { + font-family: 'Helvetica', 'Arial', sans-serif; + line-height: 1.4; + font-size: 14pt; + } + img { + /* Prevent horizontal overflow */ + max-width: 110%; + + + /* Prevent excessive vertical size - Adjust vh value as needed */ + /* 85vh means max 85% of the page's viewport height */ + /* This helps prevent images 
from dominating a page */ + /* and running into the footer margin */ + max-height: 85vh; + + /* Try to keep the image on one page */ + page-break-inside: avoid; + + /* Ensure proper block behavior for page breaks */ + display: block; + margin-top: -1em; /* Add some space above image */ + margin-left: -1.9cm; + margin-right: -1.5cm; + margin-bottom: 1em; /* Add some space below image */ + } + h2, h3, h4, h5, h6 { + page-break-after: avoid; + margin-top: 1.5em; + margin-bottom: 0.5em; + } + table { + border-collapse: collapse; + width: 100%; + margin-top: 1em; + margin-bottom: 1em; + page-break-inside: avoid; + font-size: 9pt; + } + th, td { + border: 1px solid #ccc; + padding: 6px; + text-align: left; + vertical-align: top; + word-wrap: break-word; + } + th { + background-color: #f2f2f2; + font-weight: bold; + } + pre { + background-color: #f8f8f8; + border: 1px solid #ccc; + padding: 10px; + overflow-x: auto; + page-break-inside: avoid; + white-space: pre-wrap; + word-wrap: break-word; + } + code { + font-family: monospace; + background-color: #f8f8f8; + padding: 0.2em 0.4em; + border-radius: 3px; + } + ul, ol { + padding-left: 2em; + } + ul ul, ol ol, ul ol, ol ul { + margin-left: 1.5em; + padding-left: 1.5em; + margin-bottom: 0; + } + ul ul ul, ol ol ol /* etc */ { + margin-left: 1.5em; + padding-left: 1.5em; + } + li { + margin-bottom: 0.3em; + } \ No newline at end of file diff --git a/src/repello_agent_wiz/analyzers/sys_prompt.txt b/src/repello_agent_wiz/analyzers/sys_prompt.txt index a079476..e12412d 100644 --- a/src/repello_agent_wiz/analyzers/sys_prompt.txt +++ b/src/repello_agent_wiz/analyzers/sys_prompt.txt @@ -1,60 +1,101 @@ You are an expert in cybersecurity threat modeling. +Your task is to perform a MAESTRO-based threat analysis on the agentic workflow graph provided in JSON format. 
+Your primary goal is to maximize the security insights and actionable recommendations derived solely from the provided JSON structure, node attributes (like source_location), edge conditions, and naming conventions. Be extremely specific in your analysis and recommendations, constantly referencing the relevant JSON elements as evidence for why a control or recommendation is necessary based on the observed structure. -Your task is to perform a MAESTRO-based threat analysis on the following agentic workflow graph. +--- CRITICAL INSTRUCTION: Handling source_location --- ---- + When Referencing Code: You MUST scrutinize the source_location attribute for each relevant node (especially Tool nodes) in the input JSON. -MAESTRO Framework: - + If source_location Exists and is Complete: If a node provides a specific file path and line range (e.g., {"file": "path/to/code.py", "line": 10, "end_line": 20}), you MUST reference this exact location in your report when discussing the code implementation, using the format: path/to/code.py: lines 10-20. State that this specific location requires review/controls. Apply this consistently in all relevant sections (Assets, Entrypoints, Controls, Debt, Threats, Scenarios, Risks, Operations, Recommendations). ---- + If source_location is Missing or Incomplete: If a node lacks the source_location attribute, or if the attribute is present but null, empty, or lacks specific file/line data, you MUST NOT invent or assume a source location. Do NOT use vague phrases like "code at source_location" or "implementation at source_location" for that node. Instead, state that the specific implementation location is 'N/A' (e.g., in the Assets table) or focus recommendations/threats on the tool's function (derived from its name) without pointing to a specific, unavailable code block. 
+ --- END CRITICAL INSTRUCTION --- +MAESTRO Framework: + Agentic Workflow JSON Graph: ---- - +Your task is to generate a detailed and structured MAESTRO threat modeling report strictly in Markdown format. Adhere strictly to the information present in the JSON, especially regarding source_location as per the critical instruction above. When inferring based on names (e.g., data types, sensitivity), explicitly state that it is an inference based on the name. Do NOT add information or controls not derivable from the JSON structure itself. The focus is on analyzing the provided structure and recommending controls pertinent to it. -Your task is to generate a detailed and structured MAESTRO threat modeling report strictly in Markdown format, with the following sections: +MAESTRO Analysis of Agentic Workflow {input_agentic_framework_from_json_here} -MAESTRO Analysis of Agentic Workflow 1. Mission -Describe the high-level objective of the system based on the agent names and functions. Summarize its goal and purpose in a few paragraphs. +* Describe the high-level objective synthesized strictly from the combination of agent names (e.g., PlanningAgent, WebSearchAgent) and the functions of their associated Tools (e.g., search_web_tool, analyze_data) as listed in the JSON nodes. +* Infer the system's goal (e.g., data gathering, analysis, reporting) based on these names. +* Explicitly mention the interaction patterns suggested by the JSON edges and metadata, such as the presence of specific group chat types (SelectorGroupChat, RoundRobinGroupChat) indicated in metadata or edge condition types (member_of_team, group_sequence). 2. Assets -List the key assets in a bullet or table format: - -Agents (by name) - -Their key tools/functions - -Data types being processed - -3. Entrypoints -Identify which nodes or functions act as external or internal entrypoints into the system. Show these as a list or table. - -4. 
Security Controls -Based on the structure, deduce what (if any) security controls might be present or are recommended (e.g. access control, validation, logging). - -5. Threats -Create a detailed table of likely threats. Each row must include: - -Threat - -Likelihood (Low, Medium, High) - -Impact (Low, Medium, High) - -Risk Score (e.g. Medium-High) - -6. Risks -Describe risks in narrative form derived from the threats: what could go wrong, what would be the impact to the system? - -7. Operations -Explain how agents interact at runtime. Suggest monitoring or operational practices to support observability and resilience. - -8. Recommendations -Provide a prioritized list of security improvements or design changes based on the above analysis. - +* Create a detailed table with the following columns, populating data directly from the JSON nodes: +* Agent/Node Name: (name attribute). +* Node Type/Function: (node_type, or function_name if applicable). +* Associated Tools (for Agents): List the name of Tool nodes linked via edges. +* Tool Function (for Tools): The function_name of the Tool node. +* Tool Source Location: Include the specific file: lines start-end if provided in the source_location attribute for the Tool node. If not provided or incomplete, state 'Not specified in JSON'. State that if a location is specified, it points to the implementation requiring review. +* Inferred Data Types: Infer based only on agent/tool names. State this is inferred. +* Inferred Sensitivity Classification: Assign Low/Medium/High based only on inferred data types/function names. State this is inferred. +* Trust Boundaries: Identify boundaries based on JSON structure: Agents using external-implying tools (e.g., names containing 'Search', 'API') cross an External Boundary. Communication via specific edge types (group_sequence) defines an Internal Communication Boundary. Interaction via __start__ suggests a System Input Boundary. + +3. 
Agent Interaction Analysis +* Specific Data Flows/Control Flows: Document the explicitly defined interactions from the JSON edges. List flows clearly (e.g., "Flow from NodeA to NodeB occurs under condition X"). Describe the interaction type (e.g., direct tool call, sequence step) based on the edge's source, target, condition, and metadata. Do NOT simply append the condition type in parentheses like (static). +* Tool Permissions/Access (Inferred): Based on Tool function_names, state the implied necessary permissions (e.g., external network access for tools named 'search', 'fetch'; data access for tools named 'analyze', 'query'). +* External System Access: Explicitly list agents/tools whose names or functions imply external interaction (e.g., WebSearchAgent_search_web_tool, API_Tool). +* Data Transformation Points: Identify agents using tools whose names imply data modification or analysis (e.g., DataAnalystAgent using calculate_metrics_tool). + +4. Entrypoints +* Create a comprehensive table grounded in the JSON structure: +* Entrypoint Description: User Input (via __start__ edges), External Systems/APIs (via tools implying external access), Internal Channels (via defined group_sequence or team edges), Tool Interfaces (Agent -> Tool edges). +* Trust Level (Inferred): Assign based on type (e.g., External Low, User Medium, Internal Medium). +* Potential Attack Vectors (Based on Type & JSON Context): List vectors relevant to each entrypoint type, referencing JSON elements (e.g., Prompt Injection for Assistant agents, SSRF for external tools, Tampering for internal channels, code vulnerabilities within Tool implementations if their specific source_location (file:line-range) is provided in the JSON). + +5. Recommended Security Controls (Based on Workflow Structure) +* Based only on the workflow structure, node types/functions, interaction patterns, and tool source_location data (when available) revealed in the JSON, detail the necessary security controls. 
Focus on what controls are needed because of this specific structure. Structure the recommendations as follows: +* Input Validation: Detail where validation is critical based on JSON (e.g., inputs to external tools, inputs to tools with sensitive-sounding names like 'analyze_stock'). If a specific file:line-range is provided in source_location, state that validation must be implemented robustly at that location. If not provided, recommend validation based on the tool's function. +* Output Sanitization: Specify where outputs need sanitization (e.g., data returned from external tools) before use by other agents. +* Authentication/Authorization: Recommend mechanisms for agents invoking tools (especially those inferred as sensitive) and for securing internal communication channels identified by edges (e.g., group_sequence). +* Secure Communication: Recommend securing the specific internal communication channels identified by edges and their associated metadata (e.g., RoundRobinGroupChat). +* Rate Limiting/Resource Controls: Recommend for agents/tools identified as interacting externally or performing computationally intensive tasks (inferred from names). +* Secrets Management: Based on tool names implying external API use (e.g., GoogleSearchTool, SomeAPITool), recommend secure handling and injection practices. If specific source_location data is provided for the tool, note that the implementation at that location must handle secrets securely. +* Code Security: If source_location providing specific file/lines is available for a Tool (e.g., path/to/code.py: lines X-Y), state the critical need for Security Code Review and SAST scanning of that specific implementation. If not available, recommend code review generally for the tool's function based on its name/type and inferred risks. + +6. 
Technical Debt Analysis +* Identify potential security weaknesses implied by the structure shown in the JSON: +* Potential Bypasses: Focus on risks arising from unspecified validation/sanitization for inputs/outputs of Tool nodes (whose specific implementation location may be file:line-range per source_location, if provided). +* Areas Requiring Scrutiny: List agents/tools highlighted by the JSON structure (external tools, sensitive tools, Assistant agents, the Tool nodes themselves especially if a specific source_location (file:line-range) is provided). +* Potential Architectural Issues: Note risks from unspecified security models for defined interactions (group_sequence, team edges) and reliance on external tools without defined security wrappers, pointing to specific source_location data (if provided) for implementation details that need verification. + +7. Threats +* Create a detailed table, linking threats explicitly to JSON components (name, function_name, node_type, specific source_location data if available, edges). The table MUST have the following columns: +* Threat Description: Be precise, referencing JSON elements (e.g., "Prompt Injection attack against AgentX (function_name: Assistant)", "Potential SSRF in SomeToolName implementation (code at the file:line-range specified in its source_location, if provided)", "Data Tampering on the internal communication channel defined by group_sequence edges between NodeA and NodeB"). +* Affected JSON Element(s): List specific node names or edge descriptions, including specific source_location file/lines if relevant and provided. +* Attack Vector: User Input, Tool Input Parameter, External API Response, Internal Channel (identified by edges). +* Likelihood (Inferred): Assign Low/Medium/High based on inferences from JSON (External interaction, 'Assistant' type, sensitive tool names imply higher risk; presence of specific source_location data can increase focus/risk on that code). State basis is inferred. 
+* Impact (Inferred): Assign Low/Medium/High based on inferred sensitivity of data/function. State basis is inferred. +* Risk Score (Inferred): Assign Low/Medium/High based on Likelihood and Impact. State basis is inferred. +* STRIDE Category: Assign appropriately. + +8. Runtime Threat Scenarios +* Describe scenarios using specific agent names, tool names, interaction patterns from JSON, and specific source_location file/line pointers (if available from JSON): "Crafted user input causes PlanningAgent (Assistant) to generate malicious instructions for SearchAgent, potentially exploiting vulnerabilities in search_tool (code at the file/lines specified in its source_location, if provided)." "Manipulation of the communication on the group_sequence edge between AgentA and AgentB injects false data." "A vulnerability in data_analysis_tool code (at the file/lines specified in its source_location, if provided) triggered by AnalysisAgent leads to incorrect results passed to ReportAgent." + +9. Risks +* Describe risks connecting threats (linked to JSON elements including specific source_location data if available) to business outcomes, highlighting specific interaction paths and code locations where applicable: "Risk of incorrect business decisions if AnalysisAgent uses flawed data from data_analysis_tool (code at file/lines specified in its source_location, if provided) or tampered inputs via internal channels." "Risk of data leakage/compromise if external_api_tool (code at file/lines specified in its source_location, if provided) is vulnerable." "Risk of workflow disruption via manipulation of the communication channel defined by the team edges." + +10. Operations +* Monitoring Requirements: Focus on JSON elements: invocations/outputs of Tool nodes (esp. external/sensitive ones, reference specific source_location data if available), traffic/integrity on internal channels (identified by edges), inputs to Assistant agents. 
+* Logging Strategies: Log agent requests to tools, data on internal channels, errors from tool implementations (correlate with specific source_location if available). +* Alerting Thresholds: Anomalous external connections (external tools), high error rates (from specific tool code identified by source_location, if available), injection patterns ('Assistant' inputs), internal channel deviations. +* Operational Practices: Prioritized, regular security code reviews of the specific files/lines indicated by the source_location attribute, when this attribute is provided and contains specific data in the JSON. If not specified, prioritize review based on tool function/risk. Log aggregation/analysis. + +11. Risk Prioritization Matrix +* Create a matrix prioritizing risks identified in Section 7. +* Base prioritization factors explicitly on JSON evidence: Such as whether a tool interacts externally (inferred from name/function), handles sensitive data (inferred from name/function), the presence of Assistant type agents, the structure of internal communication channels (edges/metadata), and whether a specific code implementation location (file:line-range) is identified via source_location (indicating a concrete code-level risk area). Do not just refer to the process of referencing. + +12. Recommendations +* Provide a prioritized list, making recommendations highly specific and linking them directly back to JSON elements (including specific source_location file/line data when available) as justification. Do NOT use conditional phrasing like "If source_location specifies...". Directly state the recommendation and its justification. +* Code-Level: Example: "Perform mandatory security code review and SAST scanning for the some_external_tool implementation at path/file.py: lines X-Y (location provided in source_location), as its name implies external interaction." 
OR (if location not provided) "Perform security code review for some_external_tool focusing on secure external request handling, as its specific implementation location is not provided in the JSON." Example 2: "Implement robust input validation and output encoding within the some_sensitive_tool implementation at path/other_file.py: lines A-B (location provided in source_location), due to its inferred sensitive function." +* Architectural: Example: "Implement secure transport/message signing for the internal communication channel(s) identified by group_sequence edges.", "Consider sandboxing for processes executing tools like external_tool_A or complex_analysis_tool_B." +* Agent/Tool Specific: Example: "Apply prompt filtering for agents like PlanningAgent identified as Assistant type.", "Ensure secure management and injection of API keys potentially required by SomeAPIAccessTool; review the implementation at path/api_tool.py: lines C-D (if specified in source_location) for secure practices." +* Monitoring/Detection: Example: "Deploy specific monitoring for external calls made by external_tool_A and for traffic integrity on the channel defined by group_sequence edges." +* Circuit Breakers: Example: "Recommend implementing circuit breakers for external calls made by tools like external_tool_A and external_tool_B." 💡 Format the entire report using valid Markdown syntax with proper headings, bullet points, and tables where appropriate. Do not include any introductory or concluding statements — only the report content. 
# Python-Markdown only treats a list item as nested when it is indented by a
# full tab stop (4 spaces by default). Matches the 1-3 leading spaces of a
# shallowly indented bullet so they can be widened to 4.
# NOTE(review): the exact indent width the previous implementation matched
# could not be confirmed; any 1-3 space indent is normalized here.
_SHALLOW_LIST_INDENT = re.compile(r"^ {1,3}(?=[-*+] )")


def process_markdown_input(md_text, now=None):
    """Decorate a raw Markdown report with a banner image and a timestamp.

    The result is laid out as: the Agent-Wiz banner image, a blank line, the
    first line of the report (normally its title), an italic
    "Generated at ... UTC" line, a blank line, and then the rest of the
    report. Bullet items indented by 1-3 spaces are re-indented to 4 spaces
    so that they render as nested lists.

    Args:
        md_text: Markdown source of the report. May be empty, in which case
            the title line is left blank instead of raising ``IndexError``.
        now: Optional ``datetime.datetime`` used for the timestamp. Aware
            values are converted to UTC; naive values are taken as already
            being UTC. Defaults to the current UTC time.

    Returns:
        The decorated Markdown as one newline-joined string.
    """
    image_line = "![](https://raw.githubusercontent.com/Repello-AI/Agent-Wiz/master/assets/agent_wiz.png)"

    if now is None:
        now = datetime.datetime.now(datetime.timezone.utc)
    elif now.tzinfo is not None:
        now = now.astimezone(datetime.timezone.utc)
    # The value is UTC at this point, so the suffix is written literally
    # rather than relying on what %Z renders for the tzinfo object.
    generated_at_line = f"*Generated at {now.strftime('%Y-%m-%d %H:%M:%S')} UTC*"

    lines = md_text.splitlines()
    # Guard against an empty report: there is no title line to hoist.
    first_line = lines[0] if lines else ""
    rest_of_text = "\n".join(lines[1:])

    decorated = (
        f"{image_line}\n\n{first_line}\n{generated_at_line}\n\n{rest_of_text}"
    )

    # The re-indent is applied to every line unconditionally (it does not
    # track whether a line sits inside a fenced code block).
    return "\n".join(
        _SHALLOW_LIST_INDENT.sub("    ", line) for line in decorated.splitlines()
    )
generate_maestro_analysis_report(json_path: str): sys_prompt = sys_prompt_template.replace("", maestro) sys_prompt = sys_prompt.replace("", graph_json) - # Initialize the OpenAI client properly client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - # Use the client instance to create the completion response = client.chat.completions.create( model="gpt-4o", messages=[{"role": "user", "content": sys_prompt}], temperature=0.3 ) - # Extract content and remove markdown code block if present report = response.choices[0].message.content.strip() if report.startswith("```") and report.endswith("```"): report = "\n".join(report.splitlines()[1:-1]).strip() - - output_path = f"{framework}_report.md" + md_input_path = f"AgentChat_report.md" - with open(output_path, "w") as f: + with open(md_input_path, "w", encoding='utf-8') as f: f.write(report) - print(f"[✓] Saved MAESTRO analysis to: {output_path}") + with open(md_input_path, "r", encoding='utf-8') as f: + report_md_raw_content = f.read() + + report_md_content = process_markdown_input(report_md_raw_content) + + + try: + import markdown + from weasyprint import HTML, CSS + from weasyprint.text.fonts import FontConfiguration + + try: + html_content = markdown.markdown( + report_md_content, + extensions=['tables', 'fenced_code', 'sane_lists'] + ) + + font_config = FontConfiguration() + html = HTML(string=html_content) + css = CSS(string=css_style, font_config=font_config) + pdf_output_path = f"{framework}_report.pdf" + html.write_pdf(pdf_output_path, stylesheets=[css], font_config=font_config) + + print(f"[✓] Saved MAESTRO analysis (PDF) to: {pdf_output_path}") + + return pdf_output_path + + except ImportError: + print("[✗] Error: WeasyPrint or Markdown library not installed. 
Cannot generate PDF.") + except Exception as e: + print(f"[✗] Error generating PDF: {e}") + + return md_input_path diff --git a/src/repello_agent_wiz/visualizers/visualizer.py b/src/repello_agent_wiz/visualizers/visualizer.py index c3c3dd5..a563773 100644 --- a/src/repello_agent_wiz/visualizers/visualizer.py +++ b/src/repello_agent_wiz/visualizers/visualizer.py @@ -3,7 +3,6 @@ import importlib.resources as pkg_resources from pathlib import Path - def generate_visualization(json_path: str, open_browser: bool = False): import repello_agent_wiz.templates