diff --git a/ms_agent/app/fin_research.py b/ms_agent/app/fin_research.py index 64da11fe4..1767d3e70 100644 --- a/ms_agent/app/fin_research.py +++ b/ms_agent/app/fin_research.py @@ -2209,6 +2209,36 @@ def create_interface(): opacity: 0.95; } + .main-header .main-intro { + max-width: 1024px; + margin: 1rem auto 0.75rem; + padding: 0.85rem 1.25rem; + background: rgba(15, 23, 42, 0.22); + border-radius: 0.85rem; + border: 1px solid rgba(255, 255, 255, 0.28); + font-size: clamp(0.95rem, 1.4vw, 1.1rem); + line-height: 1.65; + box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.08), + 0 10px 30px rgba(15, 23, 42, 0.18); + backdrop-filter: blur(2px); + } + + .main-header .main-intro span { + display: block; + } + + .main-header .main-intro .cn { + font-weight: 600; + letter-spacing: 0.01em; + } + + .main-header .main-intro .en { + margin-top: 0.35rem; + font-size: clamp(0.9rem, 1.25vw, 1.05rem); + color: rgba(226, 232, 240, 0.95); + letter-spacing: 0.01em; + } + .main-header .powered-by { margin-top: 0.35rem; font-size: clamp(0.85rem, 1.2vw, 1rem); @@ -2826,6 +2856,14 @@ def create_interface(): background: linear-gradient(135deg, #1e40af 0%, #1e3a8a 100%); } + .dark .main-header .main-intro { + background: rgba(15, 23, 42, 0.6); + border-color: rgba(148, 163, 184, 0.35); + box-shadow: inset 0 0 0 1px rgba(59, 130, 246, 0.25), + 0 10px 30px rgba(2, 6, 23, 0.55); + color: rgba(248, 250, 252, 0.95); + } + .dark .status-container { background: #1e293b; border-color: #334155; @@ -2943,6 +2981,10 @@ def create_interface():

📊 FinResearch 金融深度研究

Multi-Agent Financial Research Workflow

+
+ 面向金融研究的多智能体分析引擎,实现从原始市场信号到专业级研究的自动化、端到端金融报告生成。 + A multi-agent analysis engine for financial research that automates the journey from raw market signals to professional-grade insights and end-to-end financial report generation. +

Powered by Readme + | + + Examples +

""") @@ -3046,7 +3094,7 @@ def create_interface(): search_api_key = gr.Textbox( label='搜索引擎 API Key (可选 | Optional)', - placeholder='支持 exa: / serpapi: ', + placeholder='输入格式:exa:xxx 或 serpapi:xxx', type='password' ) @@ -3082,7 +3130,7 @@ def create_interface():
-
⏳ 等待启动... | Waiting to start...
+
⏳ 准备就绪... | Ready for execution...
@@ -3199,10 +3247,10 @@ def create_interface(): 💡 提示 | Tip

- 研究任务通常需要十几分钟时间完成。您可以实时查看右侧的执行状态,了解当前是哪个 Agent 在工作。建议在研究目标中明确指定股票代码、时间范围和关注的分析维度,以获得更精准的结果。 + 研究任务通常需要十几分钟时间完成。您可以实时查看右侧的执行状态,了解当前是哪个 Agent 在工作。建议在研究目标中明确指定股票代码、时间范围和关注的分析维度,以获得更精准的结果。如果希望获得速度更快、更稳定的体验,建议在本地进行部署。 - Research tasks typically take several minutes to complete. You can monitor the execution status on the right to see which agent is working. Specify stock tickers, time ranges, and analysis dimensions for more accurate results. + Research tasks typically take ten minutes or more to complete. You can monitor the execution status on the right to see which agent is working. Specify stock tickers, time ranges, and analysis dimensions for more accurate results. For a faster and more stable experience, we recommend a local deployment.

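The hunks that follow register a new `AgentTool`, which lets one agent expose other agent configs as callable sub-agent tools. A minimal wiring sketch follows — hypothetical, not part of this PR; only the field names mirror `_build_spec()` in the new `agent_tool.py` below, and the sub-agent path and values are placeholders:

```yaml
# Hypothetical sketch, not from this PR: field names follow _build_spec()
# in agent_tool.py; collector.yaml and all values here are placeholders.
tools:
  agent_tools:
    server_name: agent_tools          # optional; defaults to AgentTool.DEFAULT_SERVER
    definitions:
      - tool_name: run_collector
        description: Delegate a data-collection task to the collector sub-agent.
        config_path: ./collector.yaml # relative paths resolve against config.local_dir
        request_field: request        # string argument forwarded as the sub-agent task
        output_mode: final_message    # or: history / raw_json
        max_output_chars: 5000
```

Note that `ToolManager` (patched later in this diff) only instantiates the tool when `config.tools.agent_tools` is present, so existing configs are unaffected.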
diff --git a/ms_agent/tools/__init__.py b/ms_agent/tools/__init__.py index 9d4517403..0f2e2cdf3 100644 --- a/ms_agent/tools/__init__.py +++ b/ms_agent/tools/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from .agent_tool import AgentTool from .code import CodeExecutionTool, SandboxManagerFactory from .filesystem_tool import FileSystemTool from .mcp_client import MCPClient diff --git a/ms_agent/tools/agent_tool.py b/ms_agent/tools/agent_tool.py new file mode 100644 index 000000000..a3a2ec319 --- /dev/null +++ b/ms_agent/tools/agent_tool.py @@ -0,0 +1,330 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import uuid +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import json +from ms_agent.agent.loader import AgentLoader +from ms_agent.llm.utils import Message, Tool +from ms_agent.tools.base import ToolBase +from ms_agent.utils import get_logger +from omegaconf import DictConfig, ListConfig, OmegaConf + +logger = get_logger() + + +def _to_container(value: Any) -> Any: + if isinstance(value, DictConfig): + return OmegaConf.to_container(value, resolve=True) + if isinstance(value, ListConfig): + return OmegaConf.to_container(value, resolve=True) + return value + + +@dataclass +class _AgentToolSpec: + tool_name: str + description: str + parameters: Dict[str, Any] + config_path: Optional[str] + inline_config: Optional[Dict[str, Any]] + server_name: str + tag_prefix: str + input_mode: str + request_field: Optional[str] + input_template: Optional[str] + output_mode: str + max_output_chars: int + trust_remote_code: Optional[bool] + env: Optional[Dict[str, str]] + + +class AgentTool(ToolBase): + """Expose existing ms-agent agents as callable tools.""" + + DEFAULT_SERVER = 'agent_tools' + + def __init__(self, config: DictConfig, **kwargs): + super().__init__(config) + self._trust_remote_code = kwargs.get('trust_remote_code', True) + self._specs: Dict[str, _AgentToolSpec] = {} + self._server_tools: Dict[str, List[Tool]] = {} + self._load_specs() + + @property + def enabled(self) -> bool: + return bool(self._specs) + + def _load_specs(self): + tools_cfg = getattr(self.config, 'tools', DictConfig({})) + agent_tools_cfg = getattr(tools_cfg, 'agent_tools', None) + if agent_tools_cfg is None: + return + + if isinstance(agent_tools_cfg, DictConfig) and hasattr( + agent_tools_cfg, 'definitions'): + definitions = agent_tools_cfg.definitions + server_name = getattr(agent_tools_cfg, 'server_name', + self.DEFAULT_SERVER) + else: + definitions = agent_tools_cfg + server_name = self.DEFAULT_SERVER + + definitions_list: List[Any] + if isinstance(definitions, DictConfig): + definitions_list = [definitions] + elif isinstance(definitions, ListConfig): + definitions_list = list(definitions) + elif isinstance(definitions, list): + definitions_list = definitions + else: + logger.warning('agent_tools configuration is not iterable; skip.') + return + + for idx, spec_cfg in enumerate(definitions_list): + spec = self._build_spec(spec_cfg, server_name, idx) + if spec is None: + continue + if spec.tool_name in self._specs: + logger.warning( + 'Duplicate agent tool name detected: %s, overriding previous definition.', + spec.tool_name) + self._specs[spec.tool_name] = spec + + self._build_server_index() + + def _build_spec(self, cfg: Union[DictConfig, Dict[str, Any]], + default_server, idx: int) -> Optional[_AgentToolSpec]: + cfg = cfg or {} + cfg = cfg if isinstance(cfg, DictConfig) else DictConfig(cfg) 
+ tool_name = getattr(cfg, 'tool_name', None) or getattr( + cfg, 'name', None) + if not tool_name: + logger.warning( + 'agent_tools[%s] missing tool_name/name field, skip.', idx) + return None + + agent_cfg = getattr(cfg, 'agent', None) + config_path = getattr(cfg, 'config_path', None) + inline_cfg = getattr(cfg, 'config', None) + if agent_cfg is not None: + config_path = getattr(agent_cfg, 'config_path', config_path) + inline_cfg = getattr(agent_cfg, 'config', inline_cfg) + inline_cfg = _to_container( + inline_cfg) if inline_cfg is not None else None + + if not config_path and inline_cfg is None: + logger.warning( + 'agent_tools[%s] (%s) missing config_path/config definition.', + idx, tool_name) + return None + + description = getattr(cfg, 'description', + f'Invoke agent "{tool_name}" as a tool.') + parameters = getattr(cfg, 'parameters', None) + if parameters is None: + parameters = { + 'type': 'object', + 'properties': { + 'request': { + 'type': + 'string', + 'description': + f'Task description forwarded to the sub-agent {tool_name}.' + }, + }, + 'required': ['request'], + 'additionalProperties': True, + } + else: + parameters = _to_container(parameters) + + tag_prefix = getattr( + cfg, 'tag_prefix', + f'{getattr(self.config, "tag", "agent")}-{tool_name}-') + + request_field = getattr(cfg, 'request_field', 'request') + input_template = getattr(cfg, 'input_template', None) + input_mode = getattr(cfg, 'input_mode', 'text') + output_mode = getattr(cfg, 'output_mode', 'final_message') + max_chars = int(getattr(cfg, 'max_output_chars', 5000)) + server_name = getattr(cfg, 'server_name', default_server) + trust_remote_code = getattr(cfg, 'trust_remote_code', None) + + env_cfg = getattr(cfg, 'env', None) + env_cfg = _to_container(env_cfg) if env_cfg is not None else None + + if config_path and not os.path.isabs(config_path): + base_dir = getattr(self.config, 'local_dir', None) + if base_dir: + config_path = os.path.normpath( + os.path.join(base_dir, config_path)) + + return _AgentToolSpec( + tool_name=tool_name, + description=description, + parameters=parameters, + config_path=config_path, + inline_config=inline_cfg, + server_name=server_name, + tag_prefix=tag_prefix, + input_mode=input_mode, + request_field=request_field, + input_template=input_template, + output_mode=output_mode, + max_output_chars=max_chars, + trust_remote_code=trust_remote_code, + env=env_cfg, + ) + + def _build_server_index(self): + server_map: Dict[str, List[Tool]] = {} + for spec in self._specs.values(): + server_map.setdefault(spec.server_name, []).append( + Tool( + tool_name=spec.tool_name, + server_name=spec.server_name, + description=spec.description, + parameters=spec.parameters, + )) + self._server_tools = server_map + + async def connect(self): + return None + + async def cleanup(self): + return None + + async def get_tools(self) -> Dict[str, Any]: + return self._server_tools + + async def call_tool(self, server_name: str, *, tool_name: str, + tool_args: dict) -> str: + if tool_name not in self._specs: + raise ValueError(f'Agent tool "{tool_name}" not registered.') + spec = self._specs[tool_name] + if spec.server_name != server_name: + raise ValueError( + f'Agent tool "{tool_name}" is not part of server "{server_name}".' 
+ ) + + payload = self._build_payload(tool_args, spec) + agent = self._build_agent(spec) + messages = await self._run_agent(agent, payload) + return self._format_output(messages, spec) + + def _build_agent(self, spec: _AgentToolSpec): + if spec.inline_config is not None: + config_override = OmegaConf.create(spec.inline_config) + else: + config_override = None + + trust_remote_code = spec.trust_remote_code + if trust_remote_code is None: + trust_remote_code = self._trust_remote_code + + tag = f'{spec.tag_prefix}{uuid.uuid4().hex[:8]}' + agent = AgentLoader.build( + config_dir_or_id=spec.config_path, + config=config_override, + env=spec.env, + tag=tag, + trust_remote_code=trust_remote_code, + ) + + generation_cfg = getattr(agent.config, 'generation_config', + DictConfig({})) + # OmegaConf.update( + # generation_cfg, + # 'stream', + # False, + # merge=True, + # ) + agent.config.generation_config = generation_cfg + return agent + + async def _run_agent(self, agent, payload): + result = await agent.run(payload) + if hasattr(result, '__aiter__'): + history = None + async for chunk in result: + history = chunk + result = history + return result + + def _build_payload(self, tool_args: dict, spec: _AgentToolSpec): + if spec.input_mode == 'messages': + field = spec.request_field or 'messages' + raw_messages = tool_args.get(field) + if not isinstance(raw_messages, list): + raise ValueError( + f'Agent tool "{spec.tool_name}" expects "{field}" to be a list of messages.' + ) + return [ + Message( + role=msg.get('role', 'user'), + content=msg.get('content', ''), + tool_calls=msg.get('tool_calls', []), + tool_call_id=msg.get('tool_call_id'), + name=msg.get('name'), + reasoning_content=msg.get('reasoning_content', ''), + ) for msg in raw_messages # TODO: Change role to user or not + ] + + if spec.input_template: + template_args = defaultdict(lambda: '', tool_args) + try: + return spec.input_template.format_map(template_args) + except Exception as exc: + logger.warning( + 'Failed to render input template for tool %s: %s. 
Falling back to JSON payload.', + spec.tool_name, exc) + + field = spec.request_field or 'request' + if field in tool_args and isinstance(tool_args[field], str): + return tool_args[field] + + return json.dumps(tool_args, ensure_ascii=False, indent=2) + + def _format_output(self, messages: Any, spec: _AgentToolSpec) -> str: + if not isinstance(messages, list): + return self._truncate(str(messages), spec.max_output_chars) + + if spec.output_mode == 'history': + serialized = [self._serialize_message(msg) for msg in messages] + return self._truncate( + json.dumps(serialized, ensure_ascii=False, indent=2), + spec.max_output_chars) + + if spec.output_mode == 'raw_json': + serialized = [msg.to_dict() for msg in messages] # type: ignore + return self._truncate( + json.dumps(serialized, ensure_ascii=False), + spec.max_output_chars) + + # Default: return final assistant message text + for msg in reversed(messages): + if getattr(msg, 'role', '') == 'assistant': + return self._truncate(msg.content or '', spec.max_output_chars) + + return self._truncate(messages[-1].content or '', + spec.max_output_chars) + + def _serialize_message(self, message: Message) -> Dict[str, Any]: + data = message.to_dict() + if data.get('tool_calls'): + for call in data['tool_calls']: + if isinstance(call.get('arguments'), dict): + call['arguments'] = json.dumps( + call['arguments'], ensure_ascii=False) + return data + + @staticmethod + def _truncate(text: str, limit: int) -> str: + if limit <= 0: + return text + if len(text) <= limit: + return text + return text[:limit] + '\n\n[AgentTool truncated output]' diff --git a/ms_agent/tools/docling/doc_loader.py b/ms_agent/tools/docling/doc_loader.py index 7b84c7fcf..5daf1652b 100644 --- a/ms_agent/tools/docling/doc_loader.py +++ b/ms_agent/tools/docling/doc_loader.py @@ -1,7 +1,9 @@ # flake8: noqa +# yapf: disable import ast import os from typing import Dict, Iterator, List, Optional, Tuple, Union +from unittest.mock import patch as mock_patch from docling.backend.html_backend import HTMLDocumentBackend from docling.datamodel.accelerator_options import AcceleratorOptions @@ -21,7 +23,8 @@ download_models_pic_classifier_ms, html_handle_figure, html_handle_image, - patch_easyocr_models) + patch_easyocr_models, + requests_get_with_timeout) from ms_agent.utils.logger import get_logger from ms_agent.utils.patcher import patch from ms_agent.utils.utils import normalize_url_or_file, txt_to_html @@ -196,9 +199,10 @@ def check_url_valid(url: tuple[int, str]) -> tuple[int, str] | None: return None # Try to send a HEAD request to check if the URL is accessible - response = requests.head(_url, timeout=10) + response = requests.head(_url, timeout=(10, 25)) if response.status_code >= 400: - response = requests.get(_url, stream=True, timeout=10) + response = requests.get( + _url, stream=True, timeout=(10, 25)) if response.status_code >= 400: logger.warning( f'URL returned error status {response.status_code}: {_url}' @@ -256,9 +260,10 @@ def check_file_valid(file: tuple[int, str]) -> tuple[int, str] | None: file_paths = [(i, file) for i, file in enumerate(url_or_files) if file and not file.startswith('http')] preprocessed = [] + max_workers = min(8, (os.cpu_count() or 4)) # Step1: Remove urls that cannot be processed - with ThreadPoolExecutor() as executor: + with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [ executor.submit(check_url_valid, url) for url in http_urls ] @@ -268,7 +273,7 @@ def check_file_valid(file: tuple[int, str]) -> tuple[int, str] | None: 
preprocessed.append(result) # Step2: Add file paths that are valid - with ThreadPoolExecutor() as executor: + with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [ executor.submit(check_file_valid, file) for file in file_paths ] @@ -303,6 +308,8 @@ def _postprocess(doc: DoclingDocument) -> Union[DoclingDocument, None]: download_models_pic_classifier_ms) @patch(HTMLDocumentBackend, 'handle_image', html_handle_image) @patch(HTMLDocumentBackend, 'handle_figure', html_handle_figure) + @mock_patch('docling_core.utils.file.requests.get', + requests_get_with_timeout) def load( self, urls_or_files: list[str], diff --git a/ms_agent/tools/docling/patches.py b/ms_agent/tools/docling/patches.py index ac4f45521..b2ec77afd 100644 --- a/ms_agent/tools/docling/patches.py +++ b/ms_agent/tools/docling/patches.py @@ -1,4 +1,5 @@ # flake8: noqa +import sys from pathlib import Path from bs4 import Tag @@ -170,3 +171,18 @@ def patch_easyocr_models(): 'url'] = 'https://modelscope.cn/models/ms-agent/kannada_g2/resolve/master/kannada_g2.zip' recognition_models['gen2']['cyrillic_g2'][ 'url'] = 'https://modelscope.cn/models/ms-agent/cyrillic_g2/resolve/master/cyrillic_g2.zip' + + +def requests_get_with_timeout( + *args, + _original_requests_get=sys.modules['requests'].get, + **kwargs +): # yapf: disable + """ + Wrapper for requests.get that enforces a default timeout if none is provided. + This is used to patch docling_core.utils.file.requests.get only. + """ + if 'timeout' not in kwargs or kwargs['timeout'] is None: + kwargs['timeout'] = (10, 30) + + return _original_requests_get(*args, **kwargs) diff --git a/ms_agent/tools/jina_reader.py b/ms_agent/tools/jina_reader.py new file mode 100644 index 000000000..d8962f617 --- /dev/null +++ b/ms_agent/tools/jina_reader.py @@ -0,0 +1,138 @@ +import asyncio +import random +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field +from typing import Dict, List, Optional +from urllib.error import HTTPError, URLError +from urllib.parse import quote +from urllib.request import Request, urlopen + +DEFAULT_HEADERS: Dict[str, str] = { + 'User-Agent': + 'Mozilla/5.0 (compatible; ms-agent/1.0; +https://example.com)', + 'Accept': 'text/plain; charset=utf-8', + 'Accept-Language': 'en-US,en;q=0.9', +} + + +@dataclass +class JinaReaderConfig: + base_endpoint: str = 'https://r.jina.ai/' + timeout: float = 30.0 + retries: int = 3 + backoff_base: float = 0.8 + backoff_max: float = 8.0 + headers: Dict[str, + str] = field(default_factory=lambda: DEFAULT_HEADERS.copy()) + + +def _build_reader_url(target_url: str, base_endpoint: str) -> str: + encoded_target = quote(target_url, safe=":/?&=%#@!$'*+,;[]()") + base = base_endpoint if base_endpoint.endswith( + '/') else f'{base_endpoint}/' + return f'{base}{encoded_target}' + + +def _postprocess_text(raw_text: str) -> str: + """ + Lightweight cleanup suitable for LLM consumption. + - Normalize line breaks + - Collapse excessive blank lines + - Trim leading/trailing whitespace + """ + if not raw_text: + return '' + text = raw_text.replace('\r\n', '\n').replace('\r', '\n') + # Collapse 3+ consecutive blank lines down to 2 + while '\n\n\n' in text: + text = text.replace('\n\n\n', '\n\n') + return text.strip() + + +def fetch_single_text(url: str, config: JinaReaderConfig) -> str: + """ + Synchronous fetch of a single URL via Jina Reader with retry/backoff and postprocessing. 
+ """ + request_url = _build_reader_url(url, config.base_endpoint) + attempt = 0 + while True: + attempt += 1 + try: + req = Request(request_url, headers=config.headers) + with urlopen(req, timeout=config.timeout) as resp: + data = resp.read() + return _postprocess_text( + data.decode('utf-8', errors='replace')) + except HTTPError as e: + # Retry on 429 and 5xx, otherwise fail fast + status = getattr(e, 'code', None) + if status in (429, 500, 502, 503, + 504) and attempt <= config.retries: + sleep_s = min(config.backoff_max, + config.backoff_base * (2**(attempt - 1))) + sleep_s *= random.uniform(0.7, 1.4) + time.sleep(sleep_s) + continue + return '' + except URLError: + if attempt <= config.retries: + sleep_s = min(config.backoff_max, + config.backoff_base * (2**(attempt - 1))) + sleep_s *= random.uniform(0.7, 1.4) + time.sleep(sleep_s) + continue + return '' + except Exception: + # Unknown error; do not loop excessively + if attempt <= config.retries: + sleep_s = min(config.backoff_max, + config.backoff_base * (2**(attempt - 1))) + sleep_s *= random.uniform(0.7, 1.4) + time.sleep(sleep_s) + continue + return '' + + +async def fetch_texts_via_jina( + urls: List[str], + config: Optional[JinaReaderConfig] = None, + semaphore: Optional[asyncio.Semaphore] = None, + executor: Optional[ThreadPoolExecutor] = None) -> List[str]: + """ + Asynchronously fetch a list of URLs via Jina Reader. + Allows caller-provided concurrency controls (semaphore/executor) to integrate with pipeline resource management. + """ + if not urls: + return [] + cfg = config or JinaReaderConfig() + loop = asyncio.get_event_loop() + + local_sem = semaphore or asyncio.Semaphore(8) + + async def _bound(u: str) -> str: + async with local_sem: + return await loop.run_in_executor(executor, fetch_single_text, u, + cfg) + + tasks = [_bound(u) for u in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + texts: List[str] = [] + for r in results: + if isinstance(r, Exception): + continue + if isinstance(r, str) and r.strip(): + texts.append(r) + return texts + + +if __name__ == '__main__': + urls = [ + 'https://arxiv.org/pdf/2408.09869', + 'https://github.com/modelscope/evalscope', + 'https://www.news.cn/talking/20250530/691e47a5d1a24c82bfa2371d1af40630/c.html', + ] + texts = asyncio.run(fetch_texts_via_jina(urls)) + for text in texts: + print(text) + print('-' * 100) diff --git a/ms_agent/tools/tool_manager.py b/ms_agent/tools/tool_manager.py index 290d0960b..0b5d51eec 100644 --- a/ms_agent/tools/tool_manager.py +++ b/ms_agent/tools/tool_manager.py @@ -10,6 +10,7 @@ import json from ms_agent.llm.utils import Tool, ToolCall +from ms_agent.tools.agent_tool import AgentTool from ms_agent.tools.base import ToolBase from ms_agent.tools.code import CodeExecutionTool, LocalCodeExecutionTool from ms_agent.tools.filesystem_tool import FileSystemTool @@ -77,6 +78,12 @@ def __init__(self, if hasattr(config, 'tools') and hasattr(config.tools, 'financial_data_fetcher'): self.extra_tools.append(FinancialDataFetcher(config)) + if hasattr(config, 'tools') and getattr(config.tools, 'agent_tools', + None): + agent_tool = AgentTool( + config, trust_remote_code=self.trust_remote_code) + if agent_tool.enabled: + self.extra_tools.append(agent_tool) self.tool_call_timeout = getattr(config, 'tool_call_timeout', TOOL_CALL_TIMEOUT) local_dir = self.config.local_dir if hasattr(self.config, diff --git a/ms_agent/utils/utils.py b/ms_agent/utils/utils.py index a2a7da3ee..4c8cd45ae 100644 --- a/ms_agent/utils/utils.py +++ 
b/ms_agent/utils/utils.py @@ -375,7 +375,7 @@ def load_image_from_url_to_pil(url: str) -> 'Image.Image': """ from PIL import Image try: - response = requests.get(url) + response = requests.get(url, timeout=(10, 25)) # Raise an HTTPError for bad responses (4xx or 5xx) response.raise_for_status() image_bytes = BytesIO(response.content) diff --git a/projects/fin_research/aggregator.yaml b/projects/fin_research/aggregator.yaml index dffb0ef38..319566d02 100644 --- a/projects/fin_research/aggregator.yaml +++ b/projects/fin_research/aggregator.yaml @@ -7,12 +7,14 @@ llm: generation_config: stream: true + stream_options: + include_usage: true prompt: system: | - You are an intelligent financial analysis agent. + You are an intelligent financial analysis agent. You solve financial analysis tasks through systematic tool usage and step-by-step reasoning. Your task is to generate a comprehensive financial report by integrating insights from two separate sources: - Financial Data Analysis Report — derived from collected financial metrics, datasets, and quantitative analyses. - Online Sentiment Analysis Report — derived from web-based data, including news, social media, \ @@ -21,8 +23,12 @@ prompt: + Follow the multi-phase workflow defined below. + After completing a phase, you MUST NOT summarize, pause, or ask whether to continue without tool calls. + Immediately and automatically start the next phase. + Phase1: Synthesize Findings and Create Report Outline: - You MUST begin each message in this phase with `[ACT=outline]: ` on the first line to indicate intent. + **You MUST begin each message in this phase with `[ACT=outline]: ` on the first line to indicate intent.** - Synthesize important findings: - Carefully read and interpret the user's original plan for the financial analysis task \ (e.g., focus on industry trends, company performance, investment risks, etc.). @@ -38,11 +44,9 @@ prompt: - You are encouraged to retrieve one or more principles as the foundation for constructing \ the report outline, ensuring the final report's professionalism and refinement. - Output the report outline to a file in the default working directory. - - Avoid redundant tool calls for file queries or reads when the relevant information has \ - already been loaded or provided in the current context. Phase2: Generate the Final Report Chapter by Chapter: - You MUST begin each message in this phase with `[ACT=partial_report]: ` on the first line to indicate intent. + **You MUST begin each message in this phase with `[ACT=partial_report]: ` on the first line to indicate intent.** - Generate the report chapter by chapter according to the outline, for each chapter: - Write the full content for that chapter only and persist it to `.md`. - From Chapter 2 onward, append a **“Mismatch with Prior Chapters”** section listing any inconsistencies \ @@ -52,11 +56,13 @@ prompt: - Save the consolidated results to `cross_chapter_mismatches.md` in the default working directory. Phase3: Consolidate and Finalize the Report: - You MUST begin each message in this phase with `[ACT=final_report]: ` on the first line to indicate intent. + **You MUST begin each message in this phase with `[ACT=final_report]: ` on the first line to indicate intent.** - Integrate all chapters into a coherent final report: - Review the report outline, all chapter files, `cross_chapter_mismatches.md`, and user's original plan for the financial analysis task. - Summarize and resolve all mismatch issues, ensuring consistency in data, scope, and terminology. 
- Search for and retain valuable visual elements from the historical context and available resources in the working directory. + - You are STRONGLY ENCOURAGED to retrieve relevant writing specs before drafting the final report, \ + using the spec tools that are currently visible via the spec_loader server. - Perform self-check and refinement for logical flow, clarity, and professional tone, \ and optionally append a brief **Final_Checklist** section summarizing remaining issues and validations. - Output the final report in markdown formatted text without tool calls. @@ -65,9 +71,12 @@ prompt: - **Every turn MUST include at least one tool call — without exception — unless you are producing the final report. \ Failure to call a tool in any non-final-report turn is considered a violation of the protocol.** - - Retrieve principles depending on the principle_skill server. + - Retrieve principle specs, writing specs, or other available specs through the spec_loader server, \ + based on the spec tools currently exposed to you. - To reduce the number of conversation turns, multiple tools can be invoked simultaneously within a single turn when necessary. - When using the file_system---write_file tool, pass an empty string ('') to the path argument to use the default working directory. + - Avoid redundant tool calls for file queries or reads when the relevant information has \ + already been loaded or provided in the current context. @@ -88,6 +97,7 @@ prompt: - Maintains a professional tone and report-style formatting. - Keep figure and table numbering independent, each following its own consistent sequence. - Please generate the final report in the same language used in the plan (Chinese, English, etc.). + - You are encouraged to follow the writing specs retrieved from the spec_loader server if available. @@ -121,8 +131,10 @@ tools: mcp: false exclude: - create_directory + spec_loader: + mcp: false plugins: - - tools/principle_skill + - tools/spec_loader handler: time_handler diff --git a/projects/fin_research/analyst.yaml b/projects/fin_research/analyst.yaml index 13eb6e873..992750142 100644 --- a/projects/fin_research/analyst.yaml +++ b/projects/fin_research/analyst.yaml @@ -7,6 +7,8 @@ llm: generation_config: stream: true + stream_options: + include_usage: true prompt: @@ -74,17 +76,16 @@ prompt: assumptions (frequency, annualization, risk-free rate, etc.), conclusions, limitations, and next steps. - You must output all file paths as relative paths to the / directory in this phase, \ for example: "./sessions/session_b5e8d412/profitability_trends.png". + - You MUST output the final report in natural language before you stop. - - Use standard OpenAI function calling to invoke tools. \ - Do NOT output code in assistant's natural language output. - - Every turn MUST include at least one tool call, unless you're providing the FINAL summary. + - Use standard OpenAI function calling to invoke tools. Do NOT output code in assistant's natural language output. + - If you use [ACT=code], [ACT=collect], or [ACT=fix], you MUST include at least one tool call in that turn. + - If you use [ACT=report], you MUST output the comprehensive summary in markdown format and MUST NOT call any tools in that turn. - After each tool call, carefully review the output. - State explicitly what you learned and what comes next. - Continue calling tools until you have sufficient evidence to conclude. - - When analysis is complete and you need to provide a comprehensive summary, \ - you can use [ACT=report] without tools and stop.
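The aggregator prompt above now routes principle and writing specs through the `spec_loader` server (defined later in this diff). A hedged sketch of a single tool call as the model might emit it — the `server---tool` naming mirrors the `file_system---write_file` convention used in the prompt, and the argument values are illustrative, not prescribed by this PR:

```json
{
  "name": "spec_loader---load_writing_specs",
  "arguments": {
    "writing_specs": ["structure", "tone", "density"],
    "format": "markdown",
    "include_titles": true
  }
}
```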
@@ -123,6 +124,7 @@ prompt: - If a read fails: rotate encodings; if a column is missing: print df.columns and adjust; if time parsing fails: print sample bad rows and fix. - Keep outputs reproducible and auditable (print what changed and why). - Use shell_executor only for non-destructive tasks (e.g., mkdir -p /sessions/..., ls -l) if shell_executor is AVAILABLE. + - NEVER invent data or results. Only use values from retrieved data or executed code. If data is missing or non-computable, say so explicitly. @@ -153,6 +155,7 @@ prompt: [ACT=report] Purpose: Delivering the final synthesis: data & cleaning summary, key figures (absolute paths), \ metrics & assumptions, conclusions, limitations, next steps. + // Note: This final report MUST appear exactly once in natural language output before the conversation ends. // Note: This is the ONLY case where no tool call follows - final summary only. Example 4 - Error handling: diff --git a/projects/fin_research/collector.yaml b/projects/fin_research/collector.yaml index e33b7d868..5eb27f228 100644 --- a/projects/fin_research/collector.yaml +++ b/projects/fin_research/collector.yaml @@ -7,6 +7,8 @@ llm: generation_config: stream: true + stream_options: + include_usage: true prompt: diff --git a/projects/fin_research/orchestrator.yaml b/projects/fin_research/orchestrator.yaml index 4a69544d1..3a5977ec0 100644 --- a/projects/fin_research/orchestrator.yaml +++ b/projects/fin_research/orchestrator.yaml @@ -7,6 +7,8 @@ llm: generation_config: stream: true + stream_options: + include_usage: true prompt: diff --git a/projects/fin_research/tools/principle_skill.py b/projects/fin_research/tools/principle_skill.py deleted file mode 100644 index 849030aa3..000000000 --- a/projects/fin_research/tools/principle_skill.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# flake8: noqa -import os -from typing import Any, Dict, List, Optional, Tuple - -import json -from ms_agent.llm.utils import Tool -from ms_agent.tools.base import ToolBase -from ms_agent.utils import get_logger - -logger = get_logger() - -PRINCIPLE_GUIDE = """ - -- **MECE (Mutually Exclusive, Collectively Exhaustive) — non-overlapping, no-omission framing** - - **Use for:** Building problem & metric trees, defining scopes and boundaries, avoiding gaps/duplication. - - **Best for:** Kick-off structuring of any report (industry/company/portfolio/risk). - - **Deliverable:** 3-5 first-level dimensions; second-level factors with measurement definitions; a “Problem → Scope → Metrics” blueprint. - -- **Value Chain (Porter) — sources of cost/value** - - **Use for:** Explaining fundamentals and levers behind Gross Margin / ROIC (primary + support activities). - - **Best for:** Company & supply-chain research; cost curve and pass-through analysis. - - **Deliverable:** Stage → Drivers → Bottlenecks → Improvements → Financial impact (quantified to GM/Cash Flow/KPIs). - -- **BCG Growth-Share Matrix (Boston Matrix) — growth x share portfolio positioning** - - **Use for:** Placing multi-business/multi-track items into Star/Cash Cow/Question Mark/Dog to guide resource/weighting decisions. - - **Best for:** Comparing industry sub-segments; managing company business portfolios. - - **Deliverable:** Quadrant mapping; capital/attention flow plan (e.g., from Cows → Stars/Questions); target weights and migration triggers. 
- -- **80/20 (Pareto) — focus on the vital few** - - **Use for:** Selecting the top ~20% drivers that explain most outcomes across metrics/assets/factors; compressing workload. - - **Best for:** Return/risk attribution; metric prioritization; evidence triage. - - **Deliverable:** Top-K key drivers + quantified contributions + tracking KPIs; fold the remainder into “long-tail management.” - -- **SWOT → TOWS — from inventory to action pairing** - - **Use for:** Pairing internal (S/W) and external (O/T) to form SO/WO/ST/WT **actionable strategies** with KPIs. - - **Best for:** Strategy setting, post-investment management, risk hedging and adjustment thresholds. - - **Deliverable:** Action list with owners/KPIs/thresholds and financial mapping (revenue/GM/cash-flow impact). - -- **Pyramid / Minto — conclusion-first presentation wrapper** - - **Use for:** Packaging analysis as “Answer → 3 parallel supports → key evidence/risk hedges” for fast executive reading. - - **Best for:** Executive summaries, IC materials, report front pages. - - **Deliverable:** One-sentence conclusion (direction + range + time frame), three parallel key points, strongest evidence charts. - -""" -ROUTING_GUIDE = """ - -Here are some Heuristic hints for selecting the appropriate principles for the task: -- Need to “frame & define scope”? → Start with **MECE**; if explaining costs/moats, add **Value Chain**. -- Multi-business/multi-track “allocation decisions”? → Use **BCG** for positioning & weights, then **80/20** to focus key drivers. -- Want to turn inventory into **executable actions**? → **SWOT→TOWS** for strategy+KPI and threshold design. -- Delivering to management? → Present the whole piece with **Pyramid**; other principles provide evidence and structural core. - -""" - - -class PrincipleSkill(ToolBase): - """Aggregate access to multiple analysis principles. - - Server name: `principle_skill` - - This tool exposes a single function `load_principles` that loads one or more - principle knowledge files and returns their content to the model. Each - principle provides concise concept definitions and guidance on how to apply - the principle to financial analysis and report writing. The underlying - knowledge is stored as Markdown files and can be configured via - `tools.principle_skill.principle_dir` in the agent config. When not provided, - the tool falls back to `projects/fin_research/tools/principles` - under the current working directory. - - Supported principle identifiers (case-insensitive, synonyms allowed): - - MECE → MECE.md - - Pyramid / Minto / Minto Pyramid → Minto_Pyramid.md - - SWOT → SWOT.md - - Value Chain → Value_Chain.md - - Pareto / 80-20 / 80/20 → Pareto_80-20.md - - Boston Matrix / BCG / Boston Consulting Group → Boston_Matrix.md - """ - - PRINCIPLE_DIR = 'projects/fin_research/tools/principles' - - def __init__(self, config): - super().__init__(config) - tools_cfg = getattr(config, 'tools', - None) if config is not None else None - self.exclude_func(getattr(tools_cfg, 'principle_skill', None)) - - configured_dir = None - if tools_cfg is not None: - configured_dir = getattr(tools_cfg, 'principle_dir', None) - - default_root = os.getcwd() - default_dir = os.path.join(default_root, self.PRINCIPLE_DIR) - - # If a config-specified directory exists, prefer it; else use default. 
- self.principle_dir = configured_dir or default_dir - - # Build a mapping from normalized user inputs to on-disk filenames and display names - self._name_to_file: Dict[str, - Tuple[str, - str]] = self._build_principle_index() - - async def connect(self): - # Warn once if the directory cannot be found; still operate to allow deferred config - if not os.path.isdir(self.principle_dir): - logger.warning_once( - f'[principle_skill] Principle directory not found: {self.principle_dir}. ' - f'Configure tools.principle_skill.principle_dir or ensure default exists.' - ) - - async def _get_tools_inner(self) -> Dict[str, Any]: - tools = { - 'principle_skill': [ - Tool( - tool_name='load_principles', - server_name='principle_skill', - description= - (f'Load one or more analysis principles (concept + how to apply to ' - f'financial analysis) and return their curated Markdown content.\n\n' - f'This is a single-aggregator tool designed to fetch multiple principles ' - f'in one call. Provide a list of requested principles via the "principles" ' - f'parameter. The tool supports common synonyms and is case-insensitive.\n\n' - f'Examples of valid principle identifiers: "MECE", "Pyramid", "Minto", ' - f'"SWOT", "Value Chain", "Pareto", "80-20", "80/20", "Boston Matrix", "BCG".\n\n' - f'When format is "markdown" (default), the tool returns a single combined ' - f'Markdown string (optionally including section titles). When format is ' - f'"json", the tool returns a JSON object mapping principle to content.\n' - f'{PRINCIPLE_GUIDE}\n' - f'{ROUTING_GUIDE}\n'), - parameters={ - 'type': 'object', - 'properties': { - 'principles': { - 'type': - 'array', - 'items': { - 'type': 'string' - }, - 'description': - ('List of principles to load. Case-insensitive; supports synonyms.\n' - 'Allowed identifiers include (non-exhaustive):\n' - '- MECE\n- Pyramid\n- Minto\n- SWOT\n- Value Chain\n' - '- Pareto\n- 80-20\n- 80/20\n- Boston Matrix\n- BCG\n' - ), - }, - 'format': { - 'type': - 'string', - 'enum': ['markdown', 'json'], - 'description': - ('Output format: "markdown" (combined Markdown string) or "json" ' - '(JSON object mapping principle to content). Default: "markdown".' - ), - }, - 'include_titles': { - 'type': - 'boolean', - 'description': - ('When format="markdown", if true, each section is prefixed with a ' - 'Markdown heading of the canonical principle title. Default: true.' - ), - }, - 'join_with': { - 'type': - 'string', - 'description': - ('When format="markdown", the delimiter used to join multiple ' - 'sections. Default: "\n\n---\n\n".'), - }, - 'strict': { - 'type': - 'boolean', - 'description': - ('If true, unknown principles cause an error. If false, unknown ' - 'items are ignored with a note in the output. Default: false.' - ), - }, - }, - 'required': ['principles'], - 'additionalProperties': False, - }, - ) - ] - } - return tools - - async def call_tool(self, server_name: str, *, tool_name: str, - tool_args: dict) -> str: - return await getattr(self, tool_name)(**tool_args) - - async def load_principles( - self, - principles: List[str], - format: str = 'markdown', - include_titles: bool = False, - join_with: str = '\n\n---\n\n', - strict: bool = False, - ) -> str: - """Load requested principle documents and return their content. - - Returns: - str: Markdown string (default) or JSON string mapping principle → content. - """ - - if not principles: - return json.dumps( - { - 'success': False, - 'error': 'No principles provided.' 
- }, - ensure_ascii=False, - indent=2, - ) - - resolved: Dict[str, Tuple[str, str]] = {} - unknown: List[str] = [] - for name in principles: - key = self._normalize_name(name) - if key in self._name_to_file: - resolved[name] = self._name_to_file[key] - else: - unknown.append(name) - - if unknown and strict: - return json.dumps( - { - 'success': - False, - 'error': - 'Unknown principles (strict mode): ' + ', '.join(unknown) - }, - ensure_ascii=False, - indent=2, - ) - - loaded: Dict[str, str] = {} - for original_name, (filename, canonical_title) in resolved.items(): - path = os.path.join(self.principle_dir, filename) - try: - with open(path, 'r') as f: - content = f.read().strip() - loaded[canonical_title] = content - except Exception as e: # noqa - loaded[ - canonical_title] = f'Failed to load {filename}: {str(e)}' - - if not loaded: - return json.dumps( - { - 'success': False, - 'error': 'Failed to load any principles.' - }, - ensure_ascii=False, - indent=2, - ) - - if format == 'json': - payload = { - 'success': True, - 'principles': loaded, - 'unknown': unknown, - 'source_dir': self.principle_dir, - } - return json.dumps(payload, ensure_ascii=False) - - # Default: markdown - sections: List[str] = [] - for title, content in loaded.items(): - if include_titles: - sections.append(f'# {title}\n\n{content}') - else: - sections.append(content) - - if unknown and not strict: - sections.append( - f'> Note: Unknown principles ignored: {", ".join(unknown)}') - - return json.dumps( - { - 'success': True, - 'sections': sections - }, - ensure_ascii=False, - indent=2, - ) - - def _build_principle_index(self) -> Dict[str, Tuple[str, str]]: - """Return mapping from normalized query → (filename, canonical title).""" - entries: List[Tuple[List[str], str, str]] = [ - # synonyms, filename, canonical title - (['mece', 'mutually exclusive and collectively exhaustive'], - 'MECE.md', 'MECE'), - ([ - 'pyramid', 'minto', 'minto pyramid', 'pyramid principle', - 'minto_pyramid' - ], 'Minto_Pyramid.md', 'Pyramid (Minto Pyramid)'), - (['swot', 'swot analysis'], 'SWOT.md', 'SWOT'), - (['value chain', 'value-chain', - 'value_chain'], 'Value_Chain.md', 'Value Chain'), - ([ - 'pareto', '80-20', '80/20', 'pareto 80-20', 'pareto_80-20', - '8020' - ], 'Pareto_80-20.md', 'Pareto (80/20 Rule)'), - ([ - 'boston matrix', 'bcg', 'boston consulting group', - 'boston_matrix', 'boston' - ], 'Boston_Matrix.md', 'Boston Matrix (BCG)'), - ] - - index: Dict[str, Tuple[str, str]] = {} - for synonyms, filename, title in entries: - for s in synonyms: - index[self._normalize_name(s)] = (filename, title) - return index - - @staticmethod - def _normalize_name(name: str) -> str: - s = (name or '').strip().lower() - s = s.replace('_', ' ').replace('-', ' ') - s = ' '.join(s.split()) # collapse whitespace - # normalize 80/20 variants - s = s.replace('80/20', '80-20').replace('80 20', '80-20') - s = s.replace('8020', '80-20') - return s diff --git a/projects/fin_research/tools/spec_constant.py b/projects/fin_research/tools/spec_constant.py new file mode 100644 index 000000000..904f2b9e4 --- /dev/null +++ b/projects/fin_research/tools/spec_constant.py @@ -0,0 +1,106 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# flake8: noqa +# isort: skip_file +# yapf: disable + +WRITING_SPEC_GUIDE = """ + + +- **Structure & Layering — control section depth and hierarchy** + - **Use for:** Deciding how many levels of headings/sections to use in a report. 
+ - **Best for:** Long-form financial/company/industry reports where the model might create 4+ nested levels. + - **Key constraints:** Default to 2–3 heading levels; avoid creating sub-sub-subsections with only 1–2 short paragraphs. + +- **Methodology Exposure — how much to talk about frameworks** + - **Use for:** When you are tempted to write long “Research Methodology” sections or repeatedly mention MECE/SWOT/80-20 in the main text. + - **Best for:** Analyst-style reports where frameworks should be *used* implicitly instead of being “lectured”. + - **Key constraints:** Briefly mention the approach once if needed; do NOT devote full chapters to methods; avoid repeating framework names as slogans. + +- **Bullets & Paragraph Rhythm — bullets vs narrative** + - **Use for:** Deciding when to use bullet points vs continuous paragraphs. + - **Best for:** Sections with many drivers/risks/factors where you may over-bullet every sentence. + - **Key constraints:** Bullets for lists (drivers, risks, recommendations); keep explanatory reasoning in paragraphs; avoid “one sentence per bullet” patterns. + +- **Task Focus & Relevance — stay anchored to the user's question** + - **Use for:** Ensuring that each chapter directly serves the original question instead of drifting into generic industry essays. + - **Best for:** Prompts that ask for specific company/period/comparison/forecast. + - **Key constraints:** Minimize repeated background; tie each section back to the key metrics and drivers required by the task (e.g., profitability, cash-flow quality, competition, forecasts). + +- **Tone & Analyst Voice — sound like a human analyst, not a textbook** + - **Use for:** Choosing phrasing style and presentation voice. + - **Best for:** Sell-side / buy-side style reports, IC memos, investment notes. + - **Key constraints:** Conclusion-first; 2–4 key supporting points; professional but readable; avoid academic jargon and over-formal “methodology lectures”. + +- **Density & Length Control — right amount of detail** + - **Use for:** Controlling report length and pruning low-value content. + - **Best for:** Long multi-chapter outputs where token budget and human attention are limited. + - **Key constraints:** Prioritize conclusions, drivers, and critical numbers; compress or omit peripheral background; avoid repeating the same facts in multiple chapters. + +""" +WRITING_ROUTING_GUIDE = """ + +Heuristics for selecting writing style specs: + +- Report feels too much like an academic paper or you want a clear report skeleton? + → Load **Structure & Layering** + **Tone & Analyst Voice**. + +- You're about to write a long “Research Methodology” chapter or heavily talk about MECE/SWOT/etc.? + → Load **Methodology Exposure** (and follow its constraints strictly). + +- You're using many bullet points and the text starts looking like a checklist? + → Load **Bullets & Paragraph Rhythm** to rebalance bullets vs narrative flow. + +- The user's question is narrow (e.g., “past 4 quarters + next 2 quarters”), but you're expanding a lot on generic industry background? + → Load **Task Focus & Relevance** to keep all chapters anchored to the core task. + +- The answer tends to be very long and repetitive, and you need to compress while preserving value? + → Load **Density & Length Control**; it tells you what to prune and what to keep. 
+ +You can combine multiple specs in one call, e.g.: +- For an analyst-style profitability & forecast report: + → [ "structure", "tone", "methods", "bullets", "focus" ] + +""" + +PRINCIPLE_SPEC_GUIDE = """ + +- **MECE (Mutually Exclusive, Collectively Exhaustive) — non-overlapping, no-omission framing** + - **Use for:** Building problem & metric trees, defining scopes and boundaries, avoiding gaps/duplication. + - **Best for:** Kick-off structuring of any report (industry/company/portfolio/risk). + - **Deliverable:** 3-5 first-level dimensions; second-level factors with measurement definitions; a “Problem → Scope → Metrics” blueprint. + +- **Value Chain (Porter) — sources of cost/value** + - **Use for:** Explaining fundamentals and levers behind Gross Margin / ROIC (primary + support activities). + - **Best for:** Company & supply-chain research; cost curve and pass-through analysis. + - **Deliverable:** Stage → Drivers → Bottlenecks → Improvements → Financial impact (quantified to GM/Cash Flow/KPIs). + +- **BCG Growth-Share Matrix (Boston Matrix) — growth x share portfolio positioning** + - **Use for:** Placing multi-business/multi-track items into Star/Cash Cow/Question Mark/Dog to guide resource/weighting decisions. + - **Best for:** Comparing industry sub-segments; managing company business portfolios. + - **Deliverable:** Quadrant mapping; capital/attention flow plan (e.g., from Cows → Stars/Questions); target weights and migration triggers. + +- **80/20 (Pareto) — focus on the vital few** + - **Use for:** Selecting the top ~20% drivers that explain most outcomes across metrics/assets/factors; compressing workload. + - **Best for:** Return/risk attribution; metric prioritization; evidence triage. + - **Deliverable:** Top-K key drivers + quantified contributions + tracking KPIs; fold the remainder into “long-tail management.” + +- **SWOT → TOWS — from inventory to action pairing** + - **Use for:** Pairing internal (S/W) and external (O/T) to form SO/WO/ST/WT **actionable strategies** with KPIs. + - **Best for:** Strategy setting, post-investment management, risk hedging and adjustment thresholds. + - **Deliverable:** Action list with owners/KPIs/thresholds and financial mapping (revenue/GM/cash-flow impact). + +- **Pyramid / Minto — conclusion-first presentation wrapper** + - **Use for:** Packaging analysis as “Answer → 3 parallel supports → key evidence/risk hedges” for fast executive reading. + - **Best for:** Executive summaries, IC materials, report front pages. + - **Deliverable:** One-sentence conclusion (direction + range + time frame), three parallel key points, strongest evidence charts. + +""" +PRINCIPLE_ROUTING_GUIDE = """ + +Here are some heuristic hints for selecting the appropriate principles for the task: +- Need to “frame & define scope”? → Start with **MECE**; if explaining costs/moats, add **Value Chain**. +- Multi-business/multi-track “allocation decisions”? → Use **BCG** for positioning & weights, then **80/20** to focus key drivers. +- Want to turn inventory into **executable actions**? → **SWOT→TOWS** for strategy+KPI and threshold design. +- Delivering to management? → Present the whole piece with **Pyramid**; other principles provide evidence and structural core. + +""" diff --git a/projects/fin_research/tools/spec_loader.py b/projects/fin_research/tools/spec_loader.py new file mode 100644 index 000000000..6ed8e1e21 --- /dev/null +++ b/projects/fin_research/tools/spec_loader.py @@ -0,0 +1,406 @@ +# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa +import os +from typing import Any, Dict, List, Tuple + +import json +from ms_agent.llm.utils import Tool +from ms_agent.tools.base import ToolBase +from ms_agent.utils import get_logger +from spec_constant import (PRINCIPLE_ROUTING_GUIDE, PRINCIPLE_SPEC_GUIDE, + WRITING_ROUTING_GUIDE, WRITING_SPEC_GUIDE) + +logger = get_logger() + + +class SpecLoader(ToolBase): + """Aggregate access to multiple specs for financial reports. + + Server name: `spec_loader` + + This tool exposes the functions `load_writing_specs` and `load_principle_specs`, + each of which loads one or more spec files and returns their content to the model. + Each spec provides concise rules and examples on how to structure and phrase + financial analysis reports in an analyst-like style. + The underlying knowledge is stored as Markdown files and can be configured via + `tools.spec_loader.spec_dir` in the agent config. When not provided, + the tool falls back to `projects/fin_research/tools/specs` + under the current working directory. + + Supported spec tools (case-insensitive, synonyms allowed): + - writing_specs → writing_specs/xxxx.md + - principle_specs → principle_specs/xxxx.md + """ + + SPEC_DIR = 'projects/fin_research/tools/specs' + + def __init__(self, config): + super().__init__(config) + tools_cfg = getattr(config, 'tools', + None) if config is not None else None + spec_cfg = getattr(tools_cfg, 'spec_loader', + None) if tools_cfg is not None else None + self.exclude_func(spec_cfg) + + configured_dir = getattr(spec_cfg, 'spec_dir', + None) if spec_cfg is not None else None + default_dir = os.path.join(os.getcwd(), self.SPEC_DIR) + self.spec_dir = configured_dir or default_dir + + async def connect(self): + # Warn once if the directory cannot be found; still operate to allow deferred config + if not os.path.isdir(self.spec_dir): + logger.warning_once( + f'Spec directory not found: {self.spec_dir}. ' + f'Configure tools.spec_loader.spec_dir or ensure default exists.' + ) + + async def get_tools(self) -> Dict[str, Any]: + tools: Dict[str, List[Tool]] = { + 'spec_loader': [ + Tool( + tool_name='load_writing_specs', + server_name='spec_loader', + description= + ('Load one or more writing-style specs (rules + examples) and return ' + 'their curated Markdown content. Use this when you are unsure about ' + 'how to structure or phrase a financial report in an analyst-like style.\n\n' + 'Supported spec identifiers (case-insensitive, synonyms allowed):\n' + '- structure → section depth / headings\n' + '- methods → how much to expose MECE/SWOT/etc.\n' + '- bullets → bullets vs paragraphs\n' + '- focus → task focus and relevance\n' + '- tone → analyst-style voice\n' + '- density → length and information density control\n\n' + 'Provide a list of requested writing specs via the "writing_specs" parameter.\n\n' + f'{WRITING_SPEC_GUIDE}\n' + f'{WRITING_ROUTING_GUIDE}\n'), + parameters={ + 'type': 'object', + 'properties': { + 'writing_specs': { + 'type': + 'array', + 'items': { + 'type': 'string' + }, + 'description': + ('List of writing specs to load. Case-insensitive; supports synonyms.\n' + 'Allowed identifiers include (non-exhaustive):\n' + '- structure\n- methods\n- bullets\n- focus\n- tone\n- density\n' + ), + }, + 'format': { + 'type': + 'string', + 'enum': ['markdown', 'json'], + 'description': + ('Output format: "markdown" (combined Markdown string) or "json" ' + '(JSON object mapping spec to content). Default: "markdown".'
+ ), + }, + 'include_titles': { + 'type': + 'boolean', + 'description': + ('When format="markdown", if true, each section is prefixed with a ' + 'Markdown heading of the canonical spec title. Default: false.' + ), + }, + 'join_with': { + 'type': + 'string', + 'description': + ('When format="markdown", the delimiter used to join multiple ' + 'sections. Default: "\n\n---\n\n".'), + }, + 'strict': { + 'type': + 'boolean', + 'description': + ('If true, unknown specs cause an error. If false, unknown items are ' + 'ignored with a note in the output. Default: false.' + ), + }, + }, + 'required': ['writing_specs'], + 'additionalProperties': False, + }, + ), + Tool( + tool_name='load_principle_specs', + server_name='spec_loader', + description= + (f'Load one or more analysis principles (concept + how to apply to ' + f'financial analysis) and return their curated Markdown content.\n\n' + f'This is a single-aggregator tool designed to fetch multiple principles ' + f'in one call. Provide a list of requested principles via the "principles" ' + f'parameter. The tool supports common synonyms and is case-insensitive.\n\n' + f'Examples of valid principle identifiers: "MECE", "Pyramid", "Minto", ' + f'"SWOT", "Value Chain", "Pareto", "80-20", "80/20", "Boston Matrix", "BCG".\n\n' + f'When format is "markdown" (default), the tool returns a single combined ' + f'Markdown string (optionally including section titles). When format is ' + f'"json", the tool returns a JSON object mapping principle to content.\n' + f'{PRINCIPLE_SPEC_GUIDE}\n' + f'{PRINCIPLE_ROUTING_GUIDE}\n'), + parameters={ + 'type': 'object', + 'properties': { + 'principles': { + 'type': + 'array', + 'items': { + 'type': 'string' + }, + 'description': + ('List of principles to load. Case-insensitive; supports synonyms.\n' + 'Allowed identifiers include (non-exhaustive):\n' + '- MECE\n- Pyramid\n- Minto\n- SWOT\n- Value Chain\n' + '- Pareto\n- 80-20\n- 80/20\n- Boston Matrix\n- BCG\n' + ), + }, + 'format': { + 'type': + 'string', + 'enum': ['markdown', 'json'], + 'description': + ('Output format: "markdown" (combined Markdown string) or "json" ' + '(JSON object mapping principle to content). Default: "markdown".' + ), + }, + 'include_titles': { + 'type': + 'boolean', + 'description': + ('When format="markdown", if true, each section is prefixed with a ' + 'Markdown heading of the canonical principle title. Default: false.' + ), + }, + 'join_with': { + 'type': + 'string', + 'description': + ('When format="markdown", the delimiter used to join multiple ' + 'sections. Default: "\n\n---\n\n".'), + }, + 'strict': { + 'type': + 'boolean', + 'description': + ('If true, unknown principles cause an error. If false, unknown ' + 'items are ignored with a note in the output. Default: false.' 
+ ), + }, + }, + 'required': ['principles'], + 'additionalProperties': False, + }, + ) + ] + } + + if hasattr(self, 'exclude_functions') and self.exclude_functions: + tools['spec_loader'] = [ + t for t in tools['spec_loader'] + if t['tool_name'] not in self.exclude_functions + ] + + return tools + + async def call_tool(self, server_name: str, *, tool_name: str, + tool_args: dict) -> str: + return await getattr(self, tool_name)(**tool_args) + + async def load_writing_specs(self, writing_specs: List[str], + **kwargs) -> str: + writing_spec_map = self._build_writing_spec_index() + return await self.load_specs(writing_spec_map, writing_specs, **kwargs) + + async def load_principle_specs(self, principles: List[str], + **kwargs) -> str: + principle_map = self._build_principle_spec_index() + return await self.load_specs(principle_map, principles, **kwargs) + + async def load_specs( + self, + spec_map: Dict[str, Tuple[str, str]], + specs: List[str], + format: str = 'markdown', + include_titles: bool = False, + join_with: str = '\n\n---\n\n', + strict: bool = False, + ) -> str: + """Load requested specs documents and return their content. + + Returns: + str: Markdown string (default) or JSON string mapping spec to content. + """ + + if not specs: + return json.dumps( + { + 'success': False, + 'error': 'No specs provided.' + }, + ensure_ascii=False, + indent=2, + ) + + resolved: Dict[str, Tuple[str, str]] = {} + unknown: List[str] = [] + for name in specs: + key = self._normalize_name(name) + if key in spec_map: + resolved[name] = spec_map[key] + else: + unknown.append(name) + + if unknown and strict: + return json.dumps( + { + 'success': False, + 'error': + 'Unknown specs (strict mode): ' + ', '.join(unknown), + }, + ensure_ascii=False, + indent=2, + ) + + loaded: Dict[str, str] = {} + for _, (filename, canonical_title) in resolved.items(): + path = os.path.join(self.spec_dir, filename) + try: + with open(path, 'r') as f: + content = f.read().strip() + loaded[canonical_title] = content + except Exception as e: # noqa + loaded[ + canonical_title] = f'Failed to load {filename}: {str(e)}' + + if not loaded: + return json.dumps( + { + 'success': False, + 'error': 'Failed to load any specs.' 
+                },
+                ensure_ascii=False,
+                indent=2,
+            )
+
+        if format == 'json':
+            payload = {
+                'success': True,
+                'specs': loaded,
+                'unknown': unknown,
+                'source_dir': self.spec_dir,
+            }
+            return json.dumps(payload, ensure_ascii=False)
+
+        # Default: markdown
+        sections: List[str] = []
+        for title, content in loaded.items():
+            if include_titles:
+                sections.append(f'# {title}\n\n{content}')
+            else:
+                sections.append(content)
+
+        if unknown and not strict:
+            sections.append(
+                f'> Note: Unknown specs ignored: {", ".join(unknown)}')
+
+        return json.dumps(
+            {
+                'success': True,
+                'sections': join_with.join(sections)
+            },
+            ensure_ascii=False,
+            indent=2)
+
+    def _build_writing_spec_index(self) -> Dict[str, Tuple[str, str]]:
+        """Return writing spec mapping from normalized query → (filename, canonical title)."""
+        entries = [
+            # synonyms, filename, canonical title
+            (
+                ['structure', 'structure & layering', 'layering', 'sections'],
+                'writing_specs/Structure_Layering.md',
+                'Structure & Layering',
+            ),
+            (
+                [
+                    'methods', 'methodology', 'framework exposure',
+                    'methodology exposure'
+                ],
+                'writing_specs/Methodology_Exposure.md',
+                'Methodology Exposure',
+            ),
+            (
+                [
+                    'bullets', 'bullet', 'bullets & paragraphs',
+                    'paragraph rhythm'
+                ],
+                'writing_specs/Bullets_Paragraph_Rhythm.md',
+                'Bullets & Paragraph Rhythm',
+            ),
+            (
+                ['focus', 'relevance', 'task focus', 'task focus & relevance'],
+                'writing_specs/Task_Focus_Relevance.md',
+                'Task Focus & Relevance',
+            ),
+            (
+                ['tone', 'voice', 'analyst voice', 'tone & analyst voice'],
+                'writing_specs/Tone_Analyst_Voice.md',
+                'Tone & Analyst Voice',
+            ),
+            (
+                ['density', 'length', 'density & length', 'length control'],
+                'writing_specs/Density_Length_Control.md',
+                'Density & Length Control',
+            ),
+        ]
+
+        index: Dict[str, Tuple[str, str]] = {}
+        for synonyms, filename, title in entries:
+            for s in synonyms:
+                index[self._normalize_name(s)] = (filename, title)
+        return index
+
+    def _build_principle_spec_index(self) -> Dict[str, Tuple[str, str]]:
+        """Return principle spec mapping from normalized query → (filename, canonical title)."""
+        entries: List[Tuple[List[str], str, str]] = [
+            # synonyms, filename, canonical title
+            (['mece', 'mutually exclusive and collectively exhaustive'],
+             'principle_specs/MECE.md', 'MECE'),
+            ([
+                'pyramid', 'minto', 'minto pyramid', 'pyramid principle',
+                'minto_pyramid'
+            ], 'principle_specs/Minto_Pyramid.md', 'Pyramid (Minto Pyramid)'),
+            (['swot', 'swot analysis'], 'principle_specs/SWOT.md', 'SWOT'),
+            (['value chain', 'value-chain',
+              'value_chain'], 'principle_specs/Value_Chain.md', 'Value Chain'),
+            ([
+                'pareto', '80-20', '80/20', 'pareto 80-20', 'pareto_80-20',
+                '8020'
+            ], 'principle_specs/Pareto_80-20.md', 'Pareto (80/20 Rule)'),
+            ([
+                'boston matrix', 'bcg', 'boston consulting group',
+                'boston_matrix', 'boston'
+            ], 'principle_specs/Boston_Matrix.md', 'Boston Matrix (BCG)'),
+        ]
+
+        index: Dict[str, Tuple[str, str]] = {}
+        for synonyms, filename, title in entries:
+            for s in synonyms:
+                index[self._normalize_name(s)] = (filename, title)
+        return index
+
+    @staticmethod
+    def _normalize_name(name: str) -> str:
+        s = (name or '').strip().lower()
+        s = s.replace('_', ' ').replace('-', ' ')
+        s = ' '.join(s.split())  # collapse whitespace
+
+        # Normalize 80/20 variants, specifically for the principle specs.
+        s = s.replace('80/20', '80-20').replace('80 20', '80-20')
+        s = s.replace('8020', '80-20')
+
+        return s
diff --git a/projects/fin_research/tools/principles/Boston_Matrix.md
b/projects/fin_research/tools/specs/principle_specs/Boston_Matrix.md
similarity index 100%
rename from projects/fin_research/tools/principles/Boston_Matrix.md
rename to projects/fin_research/tools/specs/principle_specs/Boston_Matrix.md
diff --git a/projects/fin_research/tools/principles/MECE.md b/projects/fin_research/tools/specs/principle_specs/MECE.md
similarity index 100%
rename from projects/fin_research/tools/principles/MECE.md
rename to projects/fin_research/tools/specs/principle_specs/MECE.md
diff --git a/projects/fin_research/tools/principles/Minto_Pyramid.md b/projects/fin_research/tools/specs/principle_specs/Minto_Pyramid.md
similarity index 100%
rename from projects/fin_research/tools/principles/Minto_Pyramid.md
rename to projects/fin_research/tools/specs/principle_specs/Minto_Pyramid.md
diff --git a/projects/fin_research/tools/principles/Pareto_80-20.md b/projects/fin_research/tools/specs/principle_specs/Pareto_80-20.md
similarity index 100%
rename from projects/fin_research/tools/principles/Pareto_80-20.md
rename to projects/fin_research/tools/specs/principle_specs/Pareto_80-20.md
diff --git a/projects/fin_research/tools/principles/SWOT.md b/projects/fin_research/tools/specs/principle_specs/SWOT.md
similarity index 100%
rename from projects/fin_research/tools/principles/SWOT.md
rename to projects/fin_research/tools/specs/principle_specs/SWOT.md
diff --git a/projects/fin_research/tools/principles/Value_Chain.md b/projects/fin_research/tools/specs/principle_specs/Value_Chain.md
similarity index 100%
rename from projects/fin_research/tools/principles/Value_Chain.md
rename to projects/fin_research/tools/specs/principle_specs/Value_Chain.md
diff --git a/projects/fin_research/tools/specs/writing_specs/Bullets_Paragraph_Rhythm.md b/projects/fin_research/tools/specs/writing_specs/Bullets_Paragraph_Rhythm.md
new file mode 100644
index 000000000..46a796f7e
--- /dev/null
+++ b/projects/fin_research/tools/specs/writing_specs/Bullets_Paragraph_Rhythm.md
@@ -0,0 +1,24 @@
+# Bullets & Paragraph Rhythm
+Balance bullet lists and narrative flow.
+
+## Overview
+Bullets structure lists; paragraphs carry reasoning.
+Over-bulleted text reads as robotic; too few bullets reduce clarity.
+
+## Core Rules
+- Use bullets for **drivers, risks, assumptions, KPIs**.
+- Keep explanations and transitions in paragraphs.
+- Avoid long sections made only of bullets.
+
+## Do
+- Precede/follow bullet lists with short paragraphs (2–4 sentences).
+- Use bullets only when enumerating items.
+
+## Don’t
+- Don’t place every sentence in a bullet.
+- Don’t create sections that contain only a single short bullet point without explanation.
+
+## Checklist
+- [ ] Bullets only where listing is appropriate
+- [ ] Reasoning remains in prose
+- [ ] Bullet sections are not excessively long
diff --git a/projects/fin_research/tools/specs/writing_specs/Density_Length_Control.md b/projects/fin_research/tools/specs/writing_specs/Density_Length_Control.md
new file mode 100644
index 000000000..efe5a3da8
--- /dev/null
+++ b/projects/fin_research/tools/specs/writing_specs/Density_Length_Control.md
@@ -0,0 +1,27 @@
+# Density & Length Control
+Maintain high information value with minimal redundancy.
+
+## Overview
+Long reports risk redundancy.
+This spec ensures conciseness without losing analytical depth.
+
+## Core Rules
+- Prioritize high-signal content directly related to profitability, cash flows, competition, and forecasts
+  (e.g., margins, key drivers, valuation, competitive moves).
+- Compress low-value background while keeping essential macro/policy factors that affect conclusions.
+- One main idea per paragraph; supporting evidence can stay in the same paragraph.
+
+## Do
+- Summarize background when necessary instead of narrating it in full.
+- Consolidate similar facts into a single, high-density paragraph or table.
+- Prune paragraphs that don’t change conclusions, risk assessment, or forecasts.
+
+## Don’t
+- Don’t repeat identical information across chapters unless it is a deliberate recap in the executive summary or conclusion.
+- Don’t include irrelevant company history or generic industry description.
+- Don’t let sections balloon in length without new insights or decision-relevant facts.
+
+## Checklist
+- [ ] No unnecessary repetition of facts across sections (brief recaps in summary/conclusion are allowed)
+- [ ] Each paragraph adds analytical or decision-relevant value
+- [ ] Section length is proportional to its importance for the investment view
diff --git a/projects/fin_research/tools/specs/writing_specs/Methodology_Exposure.md b/projects/fin_research/tools/specs/writing_specs/Methodology_Exposure.md
new file mode 100644
index 000000000..37338b486
--- /dev/null
+++ b/projects/fin_research/tools/specs/writing_specs/Methodology_Exposure.md
@@ -0,0 +1,25 @@
+# Methodology Exposure
+Control how frameworks appear in reports.
+
+## Overview
+Analyst reports should **use** frameworks implicitly, not **discuss** them like a textbook.
+
+## Core Rules
+- As a rule of thumb, mention methodology at most once in the main text, and keep it brief.
+- Only add extra clarification in footnotes or appendices when strictly necessary (e.g., for data sources or backtests).
+- No standalone “Methodology” chapters.
+- Avoid repeated naming of MECE / SWOT / 80-20 / Value Chain.
+- Emphasize **conclusions and drivers**, not the analytical process.
+
+## Do
+- Use frameworks silently to organize logic.
+- Introduce the approach in one sentence if needed.
+
+## Don’t
+- Don’t describe frameworks in detail.
+- Don’t repeatedly cite them or make them focal content.
+
+## Checklist
+- [ ] Frameworks named ≤1 time in the main text
+- [ ] No method-focused chapters
+- [ ] Focus stays on results, not process
diff --git a/projects/fin_research/tools/specs/writing_specs/Structure_Layering.md b/projects/fin_research/tools/specs/writing_specs/Structure_Layering.md
new file mode 100644
index 000000000..b0586167b
--- /dev/null
+++ b/projects/fin_research/tools/specs/writing_specs/Structure_Layering.md
@@ -0,0 +1,28 @@
+# Structure & Layering
+Guidelines for clean, shallow section hierarchy.
+
+## Overview
+Control outline depth so the report reads like an analyst note, not an academic paper.
+Keep structures simple, readable, and coherent.
+Use this shallow hierarchy as the default template; only exceed it in rare, task-specific cases.
+
+## Core Rules
+- **Default to 2–3 heading levels** (Chapter → Section → Subsection).
+- Avoid trivial micro-sections that don’t add structural value. Short, necessary sections (e.g., risk disclaimer) are acceptable.
+- Use **natural paragraphs** for flow; don’t fragment text excessively.
+- Keep a typical pattern: Summary → Core Analysis → Forecast → Risks → Conclusion.
+
+## Do
+- Use headings only for meaningful topic shifts.
+- Merge closely related content.
+- Maintain internal narrative continuity.
+
+## Don’t
+- Don’t use deep hierarchies (1.2.3.4.5).
+- Don’t repeat identical background across sections.
+- Don’t make bullet-only sections.
+
+## Checklist
+- [ ] ≤3 levels used
+- [ ] No redundant micro-sections
+- [ ] Narrative flows smoothly
diff --git a/projects/fin_research/tools/specs/writing_specs/Task_Focus_Relevance.md b/projects/fin_research/tools/specs/writing_specs/Task_Focus_Relevance.md
new file mode 100644
index 000000000..e6d1ad6e6
--- /dev/null
+++ b/projects/fin_research/tools/specs/writing_specs/Task_Focus_Relevance.md
@@ -0,0 +1,27 @@
+# Task Focus & Relevance
+Keep the report anchored to the user’s specific question.
+
+## Overview
+Avoid drifting into long industry essays.
+Ensure all content serves the **company, scope, and time window** defined by the task.
+
+## Core Rules
+- Tie every major section to one or more core analytical dimensions required by the task (e.g., profitability, growth, cash flow, competition, risk, or forecast).
+- Minimize repeated or irrelevant background.
+- Avoid policy histories or macro digressions unless directly relevant.
+- When macro/policy factors are mentioned, immediately link them to specific company metrics or trend changes in the defined time window.
+
+## Do
+- Restate the task in the introduction.
+- Explicitly link each section to the target companies/time frame.
+- Consolidate industry/macro background into **one short background subsection** (e.g., 1–2 paragraphs in the introduction).
+- When later sections need background, **reuse it in one sentence and reference the earlier subsection** (e.g., “As noted in Section 1.1, …”) instead of rewriting the full context.
+
+## Don’t
+- Don’t include unrelated macro/policy background.
+- Don’t duplicate the same context across chapters.
+
+## Checklist
+- [ ] How each section serves the task can be explained in one sentence
+- [ ] Background concise and non-repetitive
+- [ ] No scope creep
diff --git a/projects/fin_research/tools/specs/writing_specs/Tone_Analyst_Voice.md b/projects/fin_research/tools/specs/writing_specs/Tone_Analyst_Voice.md
new file mode 100644
index 000000000..169a8e742
--- /dev/null
+++ b/projects/fin_research/tools/specs/writing_specs/Tone_Analyst_Voice.md
@@ -0,0 +1,26 @@
+# Tone & Analyst Voice
+Write like a professional financial analyst.
+
+## Overview
+Aim for a concise, conclusion-first, evidence-backed voice.
+Avoid academic-style writing or methodology-focused exposition.
+
+## Core Rules
+- Start with clear, top-line conclusions.
+- Use professional but readable language.
+- Calibrate confidence in forecasts (avoid absolute claims).
+- Avoid academic framing and jargon-heavy methodology talk.
+
+## Do
+- Use causal reasoning (“due to…”, “driven by…”).
+- Use domain-specific vocabulary naturally.
+- Keep sentences tight and information-dense.
+
+## Don’t
+- Don’t slip into academic or thesis-like style.
+- Don’t overstate certainty (“will definitely”, “must”).
+
+## Checklist
+- [ ] Conclusion-first structure
+- [ ] Tone = analyst, not textbook
+- [ ] Language concise, factual, and readable
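Review note: the subtlest part of this diff is the synonym routing, because `_normalize_name` flattens hyphens and underscores to spaces *before* re-canonicalizing the 80/20 variants. The sketch below reproduces that lookup path standalone so it can be sanity-checked outside the tool class; the `resolve` helper and the trimmed two-entry table are illustrative stand-ins, not code from this PR.

```python
# Standalone sketch of the synonym routing behind load_principle_specs.
# Mirrors _normalize_name / _build_principle_spec_index from the diff;
# `resolve` and the trimmed INDEX below are illustrative only.
from typing import Dict, Optional, Tuple


def normalize_name(name: str) -> str:
    s = (name or '').strip().lower()
    s = s.replace('_', ' ').replace('-', ' ')
    s = ' '.join(s.split())  # collapse whitespace
    # Re-canonicalize 80/20 variants after '-' was flattened to a space.
    s = s.replace('80/20', '80-20').replace('80 20', '80-20')
    s = s.replace('8020', '80-20')
    return s


# normalized synonym -> (filename, canonical title); trimmed to two entries
INDEX: Dict[str, Tuple[str, str]] = {}
for synonyms, filename, title in [
    (['pareto', '80-20', '80/20', '8020'],
     'principle_specs/Pareto_80-20.md', 'Pareto (80/20 Rule)'),
    (['boston matrix', 'bcg', 'boston'],
     'principle_specs/Boston_Matrix.md', 'Boston Matrix (BCG)'),
]:
    for s in synonyms:
        INDEX[normalize_name(s)] = (filename, title)


def resolve(query: str) -> Optional[Tuple[str, str]]:
    """Look up a user-supplied identifier via the normalized index."""
    return INDEX.get(normalize_name(query))


if __name__ == '__main__':
    # '80/20', '80_20' and 'Pareto' all land on the same canonical spec.
    assert resolve('80/20') == resolve('80_20') == resolve('Pareto')
    assert resolve('BCG')[1] == 'Boston Matrix (BCG)'
    print('synonym routing OK')
```

Note the ordering dependency: because `-` is flattened to a space first, the later `'80 20' -> '80-20'` rewrite is what restores the canonical key; running the 80/20 canonicalization before the hyphen flattening would break lookups for inputs like "80-20".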