diff --git a/aworld-cli/src/aworld_cli/console.py b/aworld-cli/src/aworld_cli/console.py index 563728266..8dd274e0f 100644 --- a/aworld-cli/src/aworld_cli/console.py +++ b/aworld-cli/src/aworld_cli/console.py @@ -15,7 +15,6 @@ from rich.style import Style from rich.table import Table from rich.text import Text -import os from aworld.logs.util import logger from ._globals import console @@ -30,6 +29,7 @@ class AWorldCLI: def __init__(self): self.console = console self.user_input = UserInputHandler(console) + # self.team_handler = InteractiveTeamHandler(console) def _get_gradient_text(self, text: str, start_color: str, end_color: str) -> Text: """Create a Text object with a horizontal gradient.""" @@ -176,13 +176,13 @@ async def _edit_models_config(self, config, current_config: dict): else: default_cfg.pop('base_url', None) - # Diffusion (models.diffusion -> DIFFUSION_* for video_creator agent) - self.console.print("\n[bold]Diffusion configuration[/bold] [dim](optional, for video_creator agent)[/dim]") + # Diffusion (models.diffusion -> DIFFUSION_* for diffusion agent) + self.console.print("\n[bold]Diffusion configuration[/bold] [dim](optional, for diffusion agent)[/dim]") self.console.print(" [dim]Leave empty to use Media LLM or default LLM config above[/dim]\n") if 'diffusion' not in current_config['models']: - # Migrate from legacy models.video_creator - current_config['models']['diffusion'] = current_config['models'].get('video_creator') or {} - current_config['models'].pop('video_creator', None) + # Migrate from legacy models.video_creator (legacy key name must be kept, else this sets then deletes 'diffusion' and the next lookup raises KeyError) + current_config['models']['diffusion'] = current_config['models'].get('video_creator') or {} + current_config['models'].pop('video_creator', None) diff_cfg = current_config['models']['diffusion'] current_diff_api_key = diff_cfg.get('api_key', '') @@ -230,6 +230,58 @@ async def _edit_models_config(self, config, current_config: dict): if not diff_cfg: current_config['models'].pop('diffusion', None) + # Audio (models.audio -> AUDIO_* for 
audio agent) + self.console.print("\n[bold]Audio configuration[/bold] [dim](optional, for audio agent)[/dim]") + self.console.print(" [dim]Leave empty to use Media LLM or default LLM config above[/dim]\n") + if 'audio' not in current_config['models']: + current_config['models']['audio'] = {} + audio_cfg = current_config['models']['audio'] + + current_audio_api_key = audio_cfg.get('api_key', '') + if current_audio_api_key: + masked = current_audio_api_key[:8] + "..." if len(current_audio_api_key) > 8 else "***" + self.console.print(f" [dim]Current AUDIO_API_KEY: {masked}[/dim]") + audio_api_key = Prompt.ask(" AUDIO_API_KEY", default=current_audio_api_key, password=True) + if audio_api_key: + audio_cfg['api_key'] = audio_api_key + else: + audio_cfg.pop('api_key', None) + + current_audio_model = audio_cfg.get('model', '') + self.console.print(" [dim]e.g. claude-3-5-sonnet-20241022 · Enter to inherit from Media/default[/dim]") + audio_model = Prompt.ask(" AUDIO_MODEL_NAME", default=current_audio_model) + if audio_model: + audio_cfg['model'] = audio_model + else: + audio_cfg.pop('model', None) + + current_audio_base_url = audio_cfg.get('base_url', '') + audio_base_url = Prompt.ask(" AUDIO_BASE_URL", default=current_audio_base_url) + if audio_base_url: + audio_cfg['base_url'] = audio_base_url + else: + audio_cfg.pop('base_url', None) + + current_audio_provider = audio_cfg.get('provider', 'openai') + audio_provider = Prompt.ask(" AUDIO_PROVIDER", default=current_audio_provider) + if audio_provider: + audio_cfg['provider'] = audio_provider + else: + audio_cfg.pop('provider', None) + + current_audio_temp = audio_cfg.get('temperature', 0.1) + audio_temp = Prompt.ask(" AUDIO_TEMPERATURE", default=str(current_audio_temp)) + if audio_temp: + try: + audio_cfg['temperature'] = float(audio_temp) + except ValueError: + audio_cfg.pop('temperature', None) + else: + audio_cfg.pop('temperature', None) + + if not audio_cfg: + current_config['models'].pop('audio', None) + 
config.save_config(current_config) self.console.print(f"\n[green]✅ Configuration saved to {config.get_config_path()}[/green]") table = Table(title="Default LLM Configuration", box=box.ROUNDED) @@ -258,6 +310,19 @@ async def _edit_models_config(self, config, current_config: dict): self.console.print() self.console.print(diff_table) + if current_config['models'].get('audio'): + audio_table = Table(title="Audio Configuration (AUDIO_*)", box=box.ROUNDED) + audio_table.add_column("Setting", style="cyan") + audio_table.add_column("Value", style="green") + for key, value in current_config['models']['audio'].items(): + if key == 'api_key': + masked_value = value[:8] + "..." if len(str(value)) > 8 else "***" + audio_table.add_row(key, masked_value) + else: + audio_table.add_row(key, str(value)) + self.console.print() + self.console.print(audio_table) + async def _edit_skills_config(self, config, current_config: dict): """Edit skills section of config (global SKILLS_PATH and per-agent XXX_SKILLS_PATH).""" default_skills_path = str(Path.home() / ".aworld" / "skills") @@ -905,6 +970,7 @@ async def run_chat_session(self, agent_name: str, executor: Callable[[str], Any] f"Type '/agents' to list all available agents.\n" f"Type '/cost' for current session, '/cost -all' for global history.\n" f"Type '/compact' to run context compression.\n" + f"Type '/team' for agent team management.\n" f"Type '/memory' to edit project context, '/memory view' to view, '/memory status' for status.\n" f"Use @filename to include images or text files (e.g., @photo.jpg or @document.txt)." 
) @@ -921,6 +987,7 @@ async def run_chat_session(self, agent_name: str, executor: Callable[[str], Any] slash_cmds = [ "/agents", "/skills", "/new", "/restore", "/latest", "/exit", "/quit", "/switch", "/cost", "/cost -all", "/compact", + "/team", "/memory", "/memory view", "/memory reload", "/memory status", ] switch_with_agents = [f"/switch {n}" for n in agent_names] if agent_names else [] @@ -941,6 +1008,7 @@ async def run_chat_session(self, agent_name: str, executor: Callable[[str], Any] "/memory view": "View current memory content", "/memory reload": "Reload memory from file", "/memory status": "Show memory system status", + "/team": "Agent team management commands", "exit": "Exit chat", "quit": "Exit chat", } @@ -1178,12 +1246,12 @@ async def run_chat_session(self, agent_name: str, executor: Callable[[str], Any] try: parts = user_input.split(maxsplit=1) subcommand = parts[1] if len(parts) > 1 else "" - + # Import required modules import os from pathlib import Path import subprocess - + # Find AWORLD.md file def find_aworld_file(): """Find AWORLD.md in standard locations""" @@ -1197,11 +1265,11 @@ def find_aworld_file(): if path.exists(): return path return None - + def get_editor(): """Get editor from environment variables""" return os.environ.get('VISUAL') or os.environ.get('EDITOR') or 'nano' - + if subcommand == "view": # View current memory content aworld_file = find_aworld_file() @@ -1216,20 +1284,20 @@ def get_editor(): from rich.syntax import Syntax syntax = Syntax(content, "markdown", theme="monokai", line_numbers=False) self.console.print(Panel(syntax, title="AWORLD.md", border_style="cyan")) - + elif subcommand == "reload": # Reload memory from file self.console.print("[dim]Memory reload functionality requires agent restart.[/dim]") self.console.print("[dim]The AWORLD.md file will be automatically loaded on next agent start.[/dim]") - + elif subcommand == "status": # Show memory system status aworld_file = find_aworld_file() - from rich.table import 
Table + # Use global Table import (line 16) instead of local import table = Table(title="Memory System Status", box=box.ROUNDED) table.add_column("Property", style="cyan") table.add_column("Value", style="green") - + if aworld_file: table.add_row("AWORLD.md Location", str(aworld_file)) table.add_row("File Size", f"{aworld_file.stat().st_size} bytes") @@ -1240,25 +1308,25 @@ def get_editor(): else: table.add_row("AWORLD.md Location", "Not found") table.add_row("Status", "❌ Not configured") - + table.add_row("Feature", "AWORLDFileNeuron") table.add_row("Auto-load", "Enabled") self.console.print(table) - + else: # Edit AWORLD.md (default action) aworld_file = find_aworld_file() - + if not aworld_file: # Create new file in user directory (DEFAULT) default_location = Path.home() / '.aworld' / 'AWORLD.md' self.console.print(f"[yellow]No AWORLD.md found. Creating new file at:[/yellow]") self.console.print(f"[cyan]{default_location}[/cyan]") self.console.print(f"[dim](Default: ~/.aworld/AWORLD.md)[/dim]\n") - + # Create directory if needed default_location.parent.mkdir(parents=True, exist_ok=True) - + # Create template template = """# Project Context @@ -1283,11 +1351,11 @@ def get_editor(): """ default_location.write_text(template, encoding='utf-8') aworld_file = default_location - + # Open in editor editor = get_editor() self.console.print(f"[dim]Opening {aworld_file} in {editor}...[/dim]") - + try: # Open editor and wait for it to close result = subprocess.run([editor, str(aworld_file)]) @@ -1301,13 +1369,18 @@ def get_editor(): self.console.print("[dim]Set EDITOR or VISUAL environment variable to your preferred editor.[/dim]") except Exception as e: self.console.print(f"[red]Error opening editor: {e}[/red]") - + except Exception as e: self.console.print(f"[red]Error handling memory command: {e}[/red]") import traceback traceback.print_exc() continue + # Handle team command + if user_input.lower().startswith("/team"): + # await 
self.team_handler.handle_command(user_input) + continue + # Handle agents command if user_input.lower() in ("/agents", "agents"): try: diff --git a/aworld-cli/src/aworld_cli/core/config.py b/aworld-cli/src/aworld_cli/core/config.py index d5b3c2a26..b2d6d268d 100644 --- a/aworld-cli/src/aworld_cli/core/config.py +++ b/aworld-cli/src/aworld_cli/core/config.py @@ -281,9 +281,9 @@ def _apply_filesystem_config(filesystem_cfg: Optional[Dict[str, Any]] = None) -> def _apply_diffusion_models_config(models_config: Dict[str, Any]) -> None: """ - Apply models.diffusion config to DIFFUSION_* env vars for video_creator agent. + Apply models.diffusion config to DIFFUSION_* env vars for diffusion agent. Priority: models.diffusion config > existing DIFFUSION_* env vars > LLM_*. - Supports models.video_creator for backwards compatibility. + Supports models.diffusion for backwards compatibility. """ diff_cfg = models_config.get('diffusion') diff_cfg = diff_cfg if isinstance(diff_cfg, dict) else {} @@ -345,6 +345,69 @@ def _apply_diffusion_models_config(models_config: Dict[str, Any]) -> None: os.environ['DIFFUSION_TEMPERATURE'] = str(float(temperature)) +def _apply_audio_models_config(models_config: Dict[str, Any]) -> None: + """ + Apply models.audio config to AUDIO_* env vars for audio agent. + Priority: models.audio config > existing AUDIO_* env vars > LLM_*. 
+ """ + audio_cfg = models_config.get('audio') + audio_cfg = audio_cfg if isinstance(audio_cfg, dict) else {} + api_key = (audio_cfg.get('api_key') or '').strip() + model_name = (audio_cfg.get('model') or '').strip() + base_url = (audio_cfg.get('base_url') or '').strip() + provider = (audio_cfg.get('provider') or '').strip() + temperature = audio_cfg.get('temperature') + + if not api_key: + api_key = (os.environ.get('AUDIO_API_KEY') or '').strip() + if not api_key: + api_key = (os.environ.get('LLM_API_KEY') or '').strip() + if not api_key: + for key in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'GEMINI_API_KEY'): + v = (os.environ.get(key) or '').strip() + if v: + api_key = v + if not provider and 'OPENAI' in key: + provider = 'openai' + elif not provider and 'ANTHROPIC' in key: + provider = 'anthropic' + elif not provider and 'GEMINI' in key: + provider = 'gemini' + break + if not model_name: + model_name = (os.environ.get('AUDIO_MODEL_NAME') or '').strip() + if not model_name: + model_name = (os.environ.get('LLM_MODEL_NAME') or '').strip() + if not base_url: + base_url = (os.environ.get('AUDIO_BASE_URL') or '').strip() + if not base_url: + base_url = (os.environ.get('LLM_BASE_URL') or '').strip() + if not base_url: + for key in ('OPENAI_BASE_URL', 'ANTHROPIC_BASE_URL', 'GEMINI_BASE_URL'): + v = (os.environ.get(key) or '').strip() + if v: + base_url = v + break + if not provider: + provider = (os.environ.get('AUDIO_PROVIDER') or '').strip() + if not provider: + provider = 'openai' + if temperature is None: + env_temp = (os.environ.get('AUDIO_TEMPERATURE') or '').strip() + if env_temp: + temperature = float(env_temp) + + if api_key: + os.environ['AUDIO_API_KEY'] = api_key + if model_name: + os.environ['AUDIO_MODEL_NAME'] = model_name + if base_url: + os.environ['AUDIO_BASE_URL'] = base_url + os.environ['AUDIO_PROVIDER'] = provider + if temperature is not None: + os.environ['AUDIO_TEMPERATURE'] = str(float(temperature)) + + def 
_apply_models_config_to_env(models_config: Dict[str, Any]) -> None: """ Apply models config (api_key, model, base_url) to os.environ. @@ -381,6 +444,7 @@ def _apply_models_config_to_env(models_config: Dict[str, Any]) -> None: if base_url: os.environ['LLM_BASE_URL'] = base_url _apply_diffusion_models_config(models_config) + _apply_audio_models_config(models_config) return # Legacy: nested models.default.{provider} or models.{provider} default_providers = {k: v for k, v in default_cfg.items() @@ -422,6 +486,7 @@ def _apply_models_config_to_env(models_config: Dict[str, Any]) -> None: os.environ['LLM_BASE_URL'] = base_url _apply_diffusion_models_config(models_config) + _apply_audio_models_config(models_config) def _load_from_local_env(source_path: str) -> tuple[Dict[str, Any], str, str]: @@ -439,6 +504,7 @@ def _load_from_local_env(source_path: str) -> tuple[Dict[str, Any], str, str]: }) # Apply DIFFUSION_* from LLM_* when not set in .env _apply_diffusion_models_config({}) + _apply_audio_models_config({}) logger.info(f"[config] load_dotenv loaded from: {source_path} {os.environ.get('LLM_MODEL_NAME')} {os.environ.get('LLM_BASE_URL')}") return _env_to_config(), "local", source_path diff --git a/aworld-cli/src/aworld_cli/core/context.py b/aworld-cli/src/aworld_cli/core/context.py index 6758aa41e..2cfc1a24f 100644 --- a/aworld-cli/src/aworld_cli/core/context.py +++ b/aworld-cli/src/aworld_cli/core/context.py @@ -82,6 +82,7 @@ def check_session_token_limit( history = JSONLHistory(str(history_path)) stats = history.get_token_stats(session_id=session_id) + logger.info(f"check_session_token_limit|agent_name={agent_name}|session_id={session_id}|limit={limit}|stats={stats}") # Use current agent's context_window_tokens (ctx) when agent_name provided if agent_name: @@ -90,7 +91,7 @@ def check_session_token_limit( total = ( agent_stats.get("context_window_tokens", 0) if agent_stats - else stats.get("total_tokens", 0) + else 0 ) else: total = stats.get("total_tokens", 0) diff --git 
a/aworld-cli/src/aworld_cli/history.py b/aworld-cli/src/aworld_cli/history.py index 327571681..7593ab842 100644 --- a/aworld-cli/src/aworld_cli/history.py +++ b/aworld-cli/src/aworld_cli/history.py @@ -378,7 +378,7 @@ def get_token_stats(self, session_id: Optional[str] = None) -> Dict: return stats - def format_cost_display(self, session_id: Optional[str] = None, limit: int = 10) -> str: + def format_cost_display(self, session_id: Optional[str] = None, limit: int = 20) -> str: """ Format token cost statistics for display. diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/__init__.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/__init__.py similarity index 100% rename from aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/__init__.py rename to aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/__init__.py diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/audio.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/audio.py new file mode 100644 index 000000000..97f90133f --- /dev/null +++ b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/audio.py @@ -0,0 +1,125 @@ +import os +from pathlib import Path +from typing import Dict, Any, List + +from aworld.agents.audio_agent import AudioAgent +from aworld.config import AgentConfig, ModelConfig +from aworld.core.agent.swarm import Swarm +from aworld.core.common import Observation, ActionModel +from aworld.core.context.base import Context +from aworld.core.event.base import Message +from aworld.runners.hook.hook_factory import HookFactory +from aworld.runners.hook.hooks import PreLLMCallHook, PostLLMCallHook +from aworld.sandbox import Sandbox +from aworld_cli.core import agent +from aworld_cli.core.skill_registry import collect_plugin_and_user_skills +from .mcp_config import mcp_config + + +@HookFactory.register(name="pre_audio_hook") +class PreMultiTaskVideoCreatorHook(PreLLMCallHook): + """Hook triggered before 
LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" + + async def exec(self, message: Message, context: Context = None) -> Message: + if message.sender.startswith('audio'): + # Logging and monitoring only - do not modify content + pass + return message + + +@HookFactory.register(name="post_audio_hook") +class PostMultiTaskVideoCreatorHook(PostLLMCallHook): + """Hook triggered after LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" + + async def exec(self, message: Message, context: Context = None) -> Message: + if message.sender.startswith('audio'): + # Logging and monitoring only - do not modify content + pass + return message + + +class AudioCreatorAgent(AudioAgent): + """An agent specializing in creating, editing, and generating video content.""" + + async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}, message: Message = None, + **kwargs) -> List[ActionModel]: + """ + Execute the agent's policy for video creation tasks. + + This agent handles video creation and editing: + 1. Creating new videos from images, audio clips, or text. + 2. Editing existing videos (e.g., trimming, concatenating, adding effects). + 3. Adding or replacing audio tracks in videos. + 4. Programmatically generating animations or visual effects. + """ + return await super().async_policy(observation, info, message, **kwargs) + + +@agent( + name="audio_generator", + desc="""An intelligent assistant specially designed for text-to-speech audio generation. Use when: +- Converting text to speech audio. +- Generating audio with different voices and styles. +- Creating audio files with customized speed and encoding. + +Cannot process (do NOT delegate to this agent): Video generation, document reading/analysis (.pdf, .docx), database queries, web scraping, or general code debugging not related to audio generation. + +**Invocation format (MUST follow when calling):** +- `content`: Required. 
The text to convert to speech. +- `info`: Optional JSON string. Use when passing audio params, e.g.: + {"voice_type": "zh_male_M392_conversation_wvae_bigtts", "encoding": "mp3", "speed_ratio": 1.0, "output_path": "./output/audio.mp3", "uid": "user_123"} + Supported keys: + - voice_type: Voice type identifier (e.g., "zh_male_M392_conversation_wvae_bigtts") + - encoding: Audio format (mp3, wav, pcm, ogg_opus), default: "mp3" + - speed_ratio: Speech speed (0.5 to 2.0), default: 1.0 + - output_path: Output file path (optional, auto-generated if not provided) + - uid: User ID for the request (optional) +""" +) +def build_audio_swarm(): + """Build and configure the multi-task audio agent swarm.""" + # APP_EVALUATOR_SKILLS_DIR: override skill read directory (plugin root with skills/ subdir) + plugin_base_dir = Path(__file__).resolve().parents[2] # smllc plugin root + env_skills_dir = Path(os.path.expanduser(os.environ.get("SKILLS_PATH"))).resolve() + skill_configs = collect_plugin_and_user_skills(plugin_base_dir, user_dir=env_skills_dir) + + # Create Agent configuration (AUDIO_* from models.audio or fallback to MEDIA_LLM_*/LLM_*) + agent_config = AgentConfig( + llm_config=ModelConfig( + llm_model_name=os.environ.get("AUDIO_MODEL_NAME", "claude-3-5-sonnet-20241022"), + llm_provider=os.environ.get("AUDIO_PROVIDER", "openai"), + llm_api_key=os.environ.get("AUDIO_API_KEY"), + llm_base_url=os.environ.get("AUDIO_BASE_URL"), + llm_temperature=float(os.environ.get("AUDIO_TEMPERATURE", "0.1")), + params={"max_completion_tokens": 59000}, + llm_stream_call=os.environ.get("STREAM", "0").lower() in ("1", "true", "yes") + ), + skill_configs=skill_configs + ) + + # Extract all server keys from mcp_config + mcp_servers = list(mcp_config.get("mcpServers", {}).keys()) + + # Configure sandbox with MCP servers + sandbox = Sandbox( + mcp_config=mcp_config + ) + sandbox.reuse = True + + _prompt_path = Path(__file__).resolve().parent / "prompt.txt" + _system_prompt = 
_prompt_path.read_text(encoding="utf-8") + + # Create MultiTaskVideoCreatorAgent instance + audio = AudioCreatorAgent( + name="audio", + desc="An intelligent assistant for creating, editing, and generating video content.", + conf=agent_config, + system_prompt=_system_prompt, + mcp_servers=mcp_servers, + mcp_config=mcp_config, + sandbox=sandbox, + # tool_names = ["CAST_SEARCH"] + ) + + # Return the Swarm containing this Agent + return Swarm(audio) diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/mcp_config.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/mcp_config.py similarity index 100% rename from aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/mcp_config.py rename to aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/mcp_config.py diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/prompt.txt b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/audio/prompt.txt new file mode 100644 index 000000000..e69de29bb diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py index f9bbce20e..3108177de 100644 --- a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py +++ b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py @@ -17,9 +17,11 @@ from aworld.experimental.cast.tools import CAST_ANALYSIS, CAST_CODER from aworld.logs.util import logger from aworld_cli.core.context_tool import CONTEXT_TOOL +from .audio.audio import build_audio_swarm from .developer.developer import build_developer_swarm from .evaluator.evaluator import build_evaluator_swarm -from .video_creator.video_creator import build_video_creator_swarm +from .diffusion.diffusion import build_diffusion_swarm +import traceback sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -243,23 +245,25 @@ def build_aworld_agent(include_skills: Optional[str] = None): 
tool_names=[CONTEXT_TOOL, 'CAST_SEARCH'] ) - # Directly instantiate developer, evaluator, and video_creator as sub-agents + # Directly instantiate developer, evaluator, and diffusion as sub-agents try: developer_swarm = build_developer_swarm() evaluator_swarm = build_evaluator_swarm() - video_creator_swarm = build_video_creator_swarm() + diffusion_swarm = build_diffusion_swarm() + audio_swarm = build_audio_swarm() sub_agents = ( extract_agents_from_swarm(developer_swarm) + extract_agents_from_swarm(evaluator_swarm) - + extract_agents_from_swarm(video_creator_swarm) + + extract_agents_from_swarm(diffusion_swarm) + + extract_agents_from_swarm(audio_swarm) ) if sub_agents: - logger.info(f"🤝 Adding {len(sub_agents)} sub-agent(s) to Aworld TeamSwarm (developer, evaluator, video_creator)") + logger.info(f"🤝 Adding {len(sub_agents)} sub-agent(s) to Aworld TeamSwarm (developer, evaluator, diffusion)") return TeamSwarm(aworld_agent, *sub_agents, max_steps=100) else: logger.info("ℹ️ No sub-agents extracted, creating Aworld TeamSwarm without sub-agents") return TeamSwarm(aworld_agent) except Exception as e: - logger.warning(f"⚠️ Failed to instantiate sub-agents: {e}, creating Aworld TeamSwarm without sub-agents") + logger.warning(f"⚠️ Failed to instantiate sub-agents: {e}, creating Aworld TeamSwarm without sub-agents {traceback.format_exc()}") return TeamSwarm(aworld_agent) diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/__init__.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/video_creator.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/diffusion.py similarity index 89% rename from aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/video_creator.py rename to aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/diffusion.py index 
f7b963fa4..41fecf5b7 100644 --- a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/video_creator.py +++ b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/diffusion.py @@ -2,12 +2,10 @@ from pathlib import Path from typing import Dict, Any, List -from aworld.agents.llm_agent import Agent from aworld.agents.video_agent import VideoAgent from aworld.config import AgentConfig, ModelConfig from aworld.core.agent.swarm import Swarm from aworld.core.common import Observation, ActionModel -from aworld.core.context.amni.config import get_default_config, AgentContextConfig, ContextEnvConfig from aworld.core.context.base import Context from aworld.core.event.base import Message from aworld.runners.hook.hook_factory import HookFactory @@ -18,23 +16,23 @@ from .mcp_config import mcp_config -@HookFactory.register(name="pre_video_creator_hook") +@HookFactory.register(name="pre_diffusion_hook") class PreMultiTaskVideoCreatorHook(PreLLMCallHook): """Hook triggered before LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" async def exec(self, message: Message, context: Context = None) -> Message: - if message.sender.startswith('video_creator'): + if message.sender.startswith('diffusion'): # Logging and monitoring only - do not modify content pass return message -@HookFactory.register(name="post_video_creator_hook") +@HookFactory.register(name="post_diffusion_hook") class PostMultiTaskVideoCreatorHook(PostLLMCallHook): """Hook triggered after LLM execution. Used for monitoring, logging, etc. 
Should NOT modify input/output content.""" async def exec(self, message: Message, context: Context = None) -> Message: - if message.sender.startswith('video_creator'): + if message.sender.startswith('diffusion'): # Logging and monitoring only - do not modify content pass return message @@ -58,7 +56,7 @@ async def async_policy(self, observation: Observation, info: Dict[str, Any] = {} @agent( - name="video_creator", + name="video_diffusion", desc="""An intelligent assistant specially designed for creating, editing, and generating video content. Use when: - Creating new videos from images, audio clips, or text. - Editing existing videos (e.g., trimming, concatenating, adding effects or overlays). @@ -74,8 +72,8 @@ async def async_policy(self, observation: Observation, info: Dict[str, Any] = {} Supported keys: image_url, reference_images (list of paths/URLs/base64), resolution, duration (must be ≤ 5 seconds), fps, poll, poll_interval, poll_timeout, download_video, output_dir. """ ) -def build_video_creator_swarm(): - """Build and configure the multi-task video_creator agent swarm.""" +def build_diffusion_swarm(): + """Build and configure the multi-task diffusion agent swarm.""" # APP_EVALUATOR_SKILLS_DIR: override skill read directory (plugin root with skills/ subdir) plugin_base_dir = Path(__file__).resolve().parents[2] # smllc plugin root env_skills_dir = Path(os.path.expanduser(os.environ.get("SKILLS_PATH"))).resolve() @@ -108,8 +106,8 @@ def build_video_creator_swarm(): _system_prompt = _prompt_path.read_text(encoding="utf-8") # Create MultiTaskVideoCreatorAgent instance - video_creator = MultiTaskVideoCreatorAgent( - name="video_creator", + diffusion = MultiTaskVideoCreatorAgent( + name="diffusion", desc="An intelligent assistant for creating, editing, and generating video content.", conf=agent_config, system_prompt=_system_prompt, @@ -120,4 +118,4 @@ def build_video_creator_swarm(): ) # Return the Swarm containing this Agent - return Swarm(video_creator) + return 
Swarm(diffusion) diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/mcp_config.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/mcp_config.py new file mode 100644 index 000000000..500648ee7 --- /dev/null +++ b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/mcp_config.py @@ -0,0 +1,12 @@ +import sys + +mcp_config = { + "mcpServers": { + "terminal": { + "command": sys.executable, + "args": ["-m", "examples.gaia.mcp_collections.tools.terminal"], + "env": {}, + "client_session_timeout_seconds": 9999.0, + } + } +} diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/prompt.txt b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/prompt.txt similarity index 100% rename from aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/prompt.txt rename to aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/diffusion/prompt.txt diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/prompt.txt b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/prompt.txt index a363fa4de..45ae4d4d7 100644 --- a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/prompt.txt +++ b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/prompt.txt @@ -26,11 +26,21 @@ You are equipped with multiple assistants. It is your job to know which to use a * `terminal`: A tool set that can execute terminal commands. - **Path restriction:** Do not `cd` to other directories; always operate from the working directory ({{ARTIFACT_DIRECTORY}}). When operating on files, always use explicit relative or absolute paths. - **Timeout requirement:** If the task may take longer than 60 seconds to complete, you MUST use async execution + sync polling: start a background process, redirect its output to an intermediate file, and poll that file every 60 seconds to check whether the task has completed. Example: `your_script.sh > /tmp/task.log 2>&1 &`, then read from `/tmp/task.log` to check task progress. 
Bad example (do NOT do this for long-running tasks): `mcp_execute_command(command="python xxx_script.py", timeout=120, output_format="text")`. -* `video_creator`: Sub-agent for creating videos from images, audio, and text. - - **When to invoke:** All video creation tasks MUST be routed to `video_creator`. +* `video_diffusion`: Sub-agent for creating videos from images, audio, and text. + - **When to invoke:** All video creation tasks MUST be routed to `video_diffusion`. - **Call params:** `content` (required: prompt text); `info` (optional, JSON string). - **Example info:** `{"image_url": "", "image_tail": "", "reference_images": ["", ""], "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "({{ARTIFACT_DIRECTORY}})", "sound": "on"}`; `duration` must be ≤ 5 seconds. - **Supported info keys:** image_url, image_tail (last frame for image-to-video), reference_images (list of paths/URLs/base64), resolution, duration, fps, poll, poll_interval, poll_timeout, download_video, output_dir. +* `audio_generator`: Sub-agent for text-to-speech audio generation. + - **When to invoke:** All text-to-speech audio generation tasks MUST be routed to `audio_generator`. + - **Call params:** `content` (required: text to convert to speech); `info` (optional, JSON string). + - **Example info:** `{"voice_type": "zh_male_M392_conversation_wvae_bigtts", "encoding": "mp3", "speed_ratio": 1.0, "output_path": "({{ARTIFACT_DIRECTORY}})/audio.mp3", "uid": "user_123"}` + - **Supported info keys:** + - voice_type: Voice identifier (e.g., "zh_male_M392_conversation_wvae_bigtts", "zh_female_F001_conversation_wvae_bigtts") + - encoding: Audio format (mp3, wav, pcm, ogg_opus), default: "mp3" + - speed_ratio: Speech speed (0.5 to 2.0), default: 1.0 + - output_path: Output file path (optional, auto-generated if not provided) + - uid: User ID (optional) ## 4. 
Available Skills * Please be aware that if you need to have access to a particular skill to help you to complete the task, you MUST use the appropriate `SKILL_tool` to activate the skill, which returns you the exact skill content. diff --git a/aworld-cli/src/aworld_cli/main.py b/aworld-cli/src/aworld_cli/main.py index 13b43347c..71c9da4ca 100644 --- a/aworld-cli/src/aworld_cli/main.py +++ b/aworld-cli/src/aworld_cli/main.py @@ -57,9 +57,9 @@ def _show_banner(console=None): "[dim]One-sentence to blockbuster video generation[/dim]" ) features_table.add_row( - "📊", - "[bold bright_green]PPT Generation[/bold bright_green]", - "[dim]AI-powered presentation creation[/dim]" + "💻", + "[bold bright_green]Code Generation[/bold bright_green]", + "[dim]AI-powered code generation and development[/dim]" ) features_table.add_row( "🔬", @@ -93,7 +93,7 @@ def _show_banner(console=None): print("\nAWorld CLI - AI-Powered Content Creation & Scientific Research Platform\n") print("Core Features:") print(" 🎬 Video Creation - One-sentence to blockbuster") - print(" 📊 PPT Generation - AI-powered presentations") + print(" 💻 Code Generation - AI-powered code development") print(" 🔬 AI for Science - Automated research exploration") print("\nCore Advantages: 多(Versatile) 快(Fast) 好(Quality) 省(Efficient)\n") diff --git a/aworld-skills/optimizer/SKILL.md b/aworld-skills/optimizer/SKILL.md new file mode 100644 index 000000000..2c56f37f1 --- /dev/null +++ b/aworld-skills/optimizer/SKILL.md @@ -0,0 +1,487 @@ +--- +name: optimizer +description: Analyzes and automatically optimizes existing agents by improving system prompts and tool configuration. +tool_names: ["AGENT_REGISTRY", "CAST_ANALYSIS", "CAST_CODER", "CAST_SEARCH"] +--- + +# Agent Optimization Skill (Optimizer) + +## 📌 Mandatory Usage Guidelines +**CRITICAL: READ BEFORE USE.** Adherence to these rules is essential for the skill to function correctly. + +1. 
**Tool Calls are Direct**: + * ✅ **DO** call tool functions like `CAST_ANALYSIS(...)` and `CAST_CODER(...)` directly. + * ❌ **DO NOT** write or show Python code examples that import or manually implement tool logic (e.g., `from aworld.experimental.ast import ACast`). The tools are pre-loaded and ready for direct invocation. + +2. **`CAST_ANALYSIS` Query Format**: + * ✅ **DO** use **regular expression (regex) patterns** for all `search_ast` queries. + * *Example*: `.*MyClassName.*|.*my_function_name.*` + * ❌ **DO NOT** use natural language for `search_ast` queries. + * *Incorrect*: `"Show me the implementation of the MyClassName class"` + +3. **`CAST_CODER` Workflow**: + * ✅ **DO** use `CAST_CODER.generate_snapshot` to create a backup before any modifications. + * ✅ **DO** generate patch content (either structured JSON for `search_replace` or `diff` format text) based on your analysis. The LLM's role is to *create* the patch content. + * ✅ **DO** use `CAST_CODER` actions (like `search_replace`) to *apply* the generated patch content to the source code. + * ❌ **DO NOT** show Python lists of patches to the user (e.g., `patches = [...]`). + +4. **Patch Content Rules**: + * ✅ **DO** ensure each patch operation targets **only one file**. + * ✅ **DO** create focused patches that modify **one logical block of code at a time** for clarity and safety. + * ✅ **DO** verify code with `CAST_ANALYSIS.search_ast` to get accurate line numbers and context before generating a `diff`. + +## 📜 Skill Overview +The **Optimizer Skill** is an advanced agent capability designed to analyze and enhance other agents. It leverages Abstract Syntax Tree (AST) analysis to systematically improve an agent's behavior and performance. + +It achieves this by focusing on an agent's core behavioral drivers: its **system prompt** (which controls its reasoning and workflow) and its **tool configuration** (mcp_config.py) (which defines its capabilities). 
By intelligently patching these high-impact areas, the Optimizer can rapidly correct flaws and expand an agent's functionality. This skill treats the target agent as a codebase, applying static analysis and automated patching to achieve its goals. + +## ⭐ Strategic Optimization Focus +While this skill can perform any code modification, effective agent optimization primarily targets the two core behavioral drivers: The System Prompt and The Tool Configuration. Your analysis and proposed solutions must prioritize these areas. + +1. **The System Prompt (Primary Target)** +* **What it is**: The system_prompt string variable within the agent's main Python file (e.g., simple_agent.py). +* **Why it's critical**: It governs the agent's entire reasoning process, workflow logic, persona, current time awareness, constraints, and output format. Most behavioral problems (e.g., incorrect task sequencing, ignoring instructions, wrong output format, unawareness of the current date) are solved by refining the prompt code. +* **Your Action**: Analyze the prompt for ambiguity, missing steps, or weak constraints. Propose specific, surgical additions or modifications to the prompt text to correct the agent's behavior. + For example, to fix a workflow where the agent does A then C instead of A then B, you would strengthen the "Methodology & Workflow" section of its prompt. For example, to fix the agent's unawareness of the current time, you should add the dynamic argument (such as `datetime.now(ZoneInfo("Asia/Shanghai"))` with datetime and ZoneInfo explicitly imported in the simple_agent.py) as the current date with the corresponding description ('Your own data is cut off at the year 2024, so the current date is xxxx, please keep it in mind!') in the prompt code, to let the agent be aware of the current time. + +2. **The Tool Configuration (mcp_config.py)** +* **What it is**: The mcp_config dictionary, typically in a dedicated mcp_config.py file. 
+* **Why it's critical**: It defines the agent's capabilities. A missing capability (e.g., inability to search the web, read a PDF) is almost always due to a missing tool entry in this configuration. +* **Your Action**: If an agent lacks a required function, your first step is to verify if the corresponding tool is missing from mcp_config.py. Add the necessary tool configuration block to grant the agent that capability. +* **MCP Configuration**: Which MCP servers (e.g., pptx, google) are required? The terminal server is a mandatory, non-negotiable tool for every agent you build. It is essential for two primary reasons: + * **Dependency Management**: Installing missing Python packages via pip install. + * **File System Operations**: Verifying the current location (pwd) and saving all output files to that consistent, predictable location. You must ensure this tool is always included. + +**Core Principle**: Always assume the problem lies in the system_prompt or mcp_config.py first. Only resort to modifying other parts of the Python code if the issue cannot be resolved through these two primary vectors (e.g., adding support for a dynamic variable in the prompt). + +## 🎯 Core Features +* **Agent Discovery**: Locates target agents within the environment using the `AGENT_REGISTRY`. +* **Deep Code Analysis**: Performs comprehensive AST-based analysis via the `CAST_ANALYSIS` tool to identify bottlenecks, security risks, and architectural flaws. +* **Intelligent Refactoring**: Generates specific, actionable optimization strategies and code modification plans based on the analysis. +* **Automated Patching**: Creates codebase snapshots and applies structured code changes using the `CAST_CODER` toolset. + +## 🔄 Core Workflow: Each time only use one tool call! +### Phase 1: Discovery and Selection +1. **Identify Target**: Receive an agent identifier (name, path, or description) from the user. +2. **Query Registry**: Call `AGENT_REGISTRY` to find the specified agent(s). +3. 
**Confirm Target**: Present the located agent's information to the user for confirmation. + +### Phase 2: Deep Code Analysis +1. **Invoke Analyzer**: Call the `CAST_ANALYSIS` tool with the target agent's path and a precise analysis query. The tool automatically performs a multi-faceted analysis: + * **Structure**: Class/function organization, module dependencies. + * **Complexity**: Cyclomatic and cognitive complexity scores. + * **Performance**: Potential bottlenecks, inefficient algorithms. + * **Quality**: Code style, comments, maintainability metrics. + * **Security**: Basic checks for common vulnerabilities. +2. **Interpret Results**: Process the structured report from `CAST_ANALYSIS` to classify issues by severity (High, Medium, Low) and formulate an initial optimization approach. + +### Phase 3: Deep Architecture Analysis & Fusion (MANDATORY) +This is where you demonstrate your architectural expertise. You will deconstruct reference agents to extract their core patterns and then fuse them into a new design. + +#### Part A: Deconstruction and Analysis +**1. Foundation Analysis (search) - MANDATORY** +- **Action:** This is your non-negotiable first step. You **MUST** locate the `search` agent using `AGENT_REGISTRY.list_desc`. Once found, you **MUST** read both its `SKILL.md` (using `CAST_SEARCH.read_file`) and its source code (using `CAST_ANALYSIS.search_ast`). +- **Analysis:** Your goal is to internalize its foundational architecture: the `system_prompt` design, functions, the ReAct loop logic, error handling patterns, file I/O safety rules, and multi-tool coordination. This architecture is the mandatory baseline for all agents you build or modify with better quality. + +**2. Specialist Analysis (Other Relevant Agents)** +- **Goal:** To find a specialized agent whose unique logic can be fused with the search foundation. 
+- **Action (Discovering Specialists):** You must now methodically search both sources for a relevant specialist: + **Source 1: Built-in Agents** + - **Command:** Use the AGENT_REGISTRY tool to list all platform-provided skills. + ```text + AGENT_REGISTRY.list_desc(source_type="built-in") + ``` + - **Analysis:** Review the description of each agent returned from the command. Identify and select the agent whose purpose is most specifically aligned with the user's current request. + +- **Deep Dive Analysis:** Once you have selected the most relevant specialist agent, read its SKILL.md using `CAST_SEARCH.read_file`. You must now perform a comparative analysis against search. Ask yourself: + - What is this agent's "secret sauce"? What unique rules, steps, or principles are in its system prompt that are NOT in search's? + - How is its workflow different? Does it have a specific multi-step process for its domain (e.g., for financial analysis: 1. gather data, 2. perform calculation, 3. add disclaimer, 4. format output)? + - What are its specialized guardrails? What does it explicitly forbid or require? + +**This analysis is critical. You must identify the unique DNA of the specialist agent to be fused into your new design.** + +#### Part B: Synthesis and Fusion +**3. Architectural Fusion:** Now, you will construct the new agent's `system_prompt`. This is a fusion process, not a simple copy-paste. +- **Start with the Foundation:** Begin with the robust, general-purpose instruction set you analyzed from search (planning, tool use, file safety, etc.). +- **Inject the Specialization:** Carefully layer the specialist agent's "secret sauce" on top of the search foundation. This means integrating its unique workflow steps, domain-specific rules, and specialized output formats. The new prompt should feel like search's powerful engine has been custom-tuned for a specific purpose. + +**4. 
Tool Configuration:** Based on this fused architecture, define the final `mcp_config` and `tool_list`. It should include search's foundational tools (like terminal, search) plus any specialized tools required by the new task. + + +### Phase 4: Optimization Strategy +1. **Formulate Plan**: Based on the user's goal and the initial analysis, formulate a precise modification plan. Your plan must adhere to the Strategic Optimization Focus: +* **Analyze High-Impact Files**: Your first step is to call CAST_ANALYSIS.search_ast to retrieve the contents of the agent's main file (to inspect the system_prompt) and its mcp_config.py. +* **Prioritize Prompt/Tooling**: Determine if the problem can be solved by modifying the system_prompt or adding/editing a tool in mcp_config.py. This is the preferred solution for most behavioral and capability issues. +* **Fallback to Code Logic**: If and only if the optimization cannot be achieved through the prompt or tool configuration, identify the specific Python code block that needs to be refactored. +2. **Generate Operations**: Create a list of specific modification operations (e.g., a JSON object for CAST_CODER.search_replace). Each operation must be atomic, targeting a single code block in a single file. + +### Phase 5: Snapshot and Patching +1. **Create Snapshot**: **Crucial first step.** Call `CAST_CODER.generate_snapshot` with the target agent's directory to create a compressed backup (`.tar.gz`). This ensures a safe rollback point. +2. **Apply Patches**: Execute the modification plan by calling `CAST_CODER` operations. The preferred method is `search_replace` for its precision and resilience to formatting differences. + * Each operation should be atomic and target a single file. +3. **Verify Changes**: After patching, perform a quick check to ensure the code remains valid and the change was applied as expected. + +### Phase 6: Verification and Reporting +1. 
**Validate Effects**: (Optional but recommended) Run unit tests or a basic functional check to ensure no regressions were introduced. Compare pre- and post-optimization metrics if applicable. +2. **Generate Report**: Summarize the analysis findings, the list of applied changes, and the expected benefits for the user. + +### Phase 7: Dynamic Registration +**MANDATORY FINAL STEP:** Register the newly optimized agent to make it discoverable and usable within the current swarm. + +* **Tool**: `AGENT_REGISTRY` +* **Action**: `dynamic_register` +* **Parameters**: + * `local_agent_name`: The name of the agent executing this workflow (must be "Aworld"). + * `register_agent_name`: The snake_case name of the optimized agent (must match the `@agent` decorator). +* **Example**: + ```json + AGENT_REGISTRY.dynamic_register(local_agent_name="Aworld", register_agent_name="optimized_simple_agent") + ``` + +--- +## 🛠️ Tool Reference + +
+

AGENT_REGISTRY Tool

+ +**Purpose**: Discover and retrieve information about existing agents. + +**Actions**: +* `query()`: Search for agents by name, description, or other metadata. +* `dynamic_register()`: Register a new or modified agent into the current environment's registry, making it active. + +**Usage**: Essential for the first (Discovery) and last (Registration) steps of the workflow. + +
+ +
+

CAST_ANALYSIS Tool

+ +**Purpose**: Perform deep, AST-based static analysis of Python code. + +**Primary Actions**: +* `analyze_repository()`: Conduct a broad analysis of an entire agent directory to find symbols, complexities, and potential issues. +* `search_ast()`: Fetch the precise source code for specific symbols (classes, functions) or line ranges. + +**Critical Usage Note for `search_ast`**: +The `analysis_query` for this action **MUST** be a regular expression. Natural language queries are not supported and will fail. + +* ✅ **Correct (Regex)**: `user_query=".*MyClass.*|.*my_function.*"` +* ❌ **Incorrect (Natural Language)**: `user_query="Find the MyClass class and the my_function function"`, `user_query=".*mcp_config\\.py."`, `user_query=".*"` + +**Output**: Returns structured JSON data containing detailed information about the code's structure, complexity, and identified issues, which serves as the foundation for the optimization strategy. + +
+ +
+

CAST_CODER Tool

+ +**Purpose**: A suite of functions for safely modifying source code files. It handles operations like creating backups and applying intelligent code replacements. + +--- +#### **Action: `generate_snapshot`** + +Creates a compressed (`.tar.gz`) backup of a source directory before modifications are applied. + +* **Parameters**: + * `target_dir`: The path to the directory to be backed up. +* **Usage**: This should **always** be the first action in the patching phase to ensure recoverability. + +--- +#### **Action: `search_replace`** + +Intelligently finds and replaces a block of code in a specified file. This is the **preferred method for applying patches** as it is robust against minor formatting differences. It is based on `aider`'s core matching algorithm. + +**Key Features**: +* **Exact Match**: First attempts a direct, character-for-character match. +* **Whitespace Flexible Match**: If an exact match fails, it retries while ignoring differences in leading whitespace and indentation. This handles most copy-paste formatting issues. +* **Similarity Match**: (Optional) If other methods fail, uses a fuzzy text similarity algorithm to find the best match. + +**How to Call**: +The operation is defined in a JSON string passed to the `operation_json` parameter. + +```python +# Conceptual tool call +action_params = { + "operation_json": json.dumps({ + "operation": { + "type": "search_replace", + "file_path": "path/to/your/file.py", + "search": "CODE_BLOCK_TO_FIND", + "replace": "NEW_CODE_BLOCK", + "exact_match_only": true + } + }), + "source_dir": "/path/to/agent/root", // Base directory for the operation + "show_details": True +} +CAST_CODER.search_replace(**action_params) +``` + +**JSON Parameters**: + +| Parameter | Type | Required | Description | +| ---------------------- | ------- | :------: |-----------------------------------------------------------| +| `type` | string | ✓ | Must be `"search_replace"`. 
| +| `file_path` | string | ✓ | The relative path to the file from `source_dir`. | +| `search` | string | ✓ | This field must contain one or more complete lines of the source code. | +| `replace` | string | ✓ | The multi-line code block to replace it with. | +| `exact_match_only` | boolean | - | fixed as true (Optional, for documentation purposes only) | + +**Best Practices**: +* search: The multi-line code block to search for. + * Use multi-line `search` blocks that include structural context (like `def` or `class` lines) for better accuracy. + * must not be blank! + * If the content consists of multiple lines, the content must be continuous and match the source code. + +
+ +--- + +## 📚 Agent Code Structure Reference (Few-Shot Examples) + +**⚠️ IMPORTANT**: The following code examples illustrate the standard AWorld agent structure. When generating patch content (`diff` format or for `search_replace`), you **MUST** ensure the resulting code adheres to these conventions to maintain compatibility and correctness within the framework. Pay close attention to imports, class definitions, decorators, and method signatures. + +### Standard Agent Code Structure (`simple_agent.py`) +```python +import os +from typing import Dict, Any, List + +from aworld.agents.llm_agent import Agent +from aworld.config import AgentConfig, ModelConfig +from aworld.core.agent.swarm import Swarm +from aworld.core.common import Observation, ActionModel +from aworld.core.context.base import Context +from aworld.core.event.base import Message +# use logger to log +from aworld.logs.util import logger +from aworld.runners.hook.hook_factory import HookFactory +from aworld.runners.hook.hooks import PreLLMCallHook, PostLLMCallHook +from aworld_cli.core import agent +from aworld.sandbox import Sandbox +# The prefix simple_agent is required and must match the agent name +from simple_agent.mcp_config import mcp_config + +@HookFactory.register(name="pre_simple_agent_hook") +class PreSimpleAgentHook(PreLLMCallHook): + """Hook triggered before LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" + + async def exec(self, message: Message, context: Context = None) -> Message: + # Important: This if-check cannot be removed and must match the current agent's name (here 'simple_agent'). + # This ensures the Hook only processes messages belonging to the current agent, avoiding side effects on other agents. + if message.sender.startswith('simple_agent'): + # ⚠️ Important Note: The Message object (aworld.core.event.base.Message) is the communication carrier between agents in AWorld. 
+ # It uses the 'payload' attribute to carry actual data, distinct from a direct 'content' attribute. + # In PreLLMCallHook, message.payload is usually an Observation object. To access content, use message.payload.content. + # Incorrect Example: message.content # ❌ AttributeError: 'Message' object has no attribute 'content' + # Correct Example: message.payload.content if hasattr(message.payload, 'content') else None # ✅ + # Note: Do not modify message.payload or other input/output content here. + # Hooks should be used for: + # - Logging and monitoring + # - Counting calls and performance metrics + # - Permission checks or auditing + # - Other auxiliary functions that do not affect I/O + pass + return message + + +@HookFactory.register(name="post_simple_agent_hook") +class PostSimpleAgentHook(PostLLMCallHook): + """Hook triggered after LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" + + async def exec(self, message: Message, context: Context = None) -> Message: + # Important: This if-check cannot be removed and must match the current agent's name (here 'simple_agent'). + # This ensures the Hook only processes messages belonging to the current agent. + if message.sender.startswith('simple_agent'): + # Note: Do not modify input/output content (like message.content) here. + # Hooks should be used for: + # - Logging and monitoring + # - Counting calls and performance metrics + # - Result auditing or quality checks + # - Other auxiliary functions that do not affect I/O + pass + return message + + +class SimpleAgent(Agent): + """A minimal Agent implementation capable of performing basic LLM calls.""" + + async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}, message: Message = None, + **kwargs) -> List[ActionModel]: + # Important Notes: + # 1. async_policy represents the model invocation; calling super().async_policy directly completes the LLM call. + # 2. 
Do not modify the observation object within async_policy; the observation should remain immutable. + # 3. Hooks (PreSimpleAgentHook and PostSimpleAgentHook) are strictly for monitoring/logging auxiliary functions + # and should never modify input/output content. + return await super().async_policy(observation, info, message, **kwargs) + + +@agent( + # ⚠️ CRITICAL: name MUST be lowercase words connected by underscores (snake_case) + # - ✅ CORRECT: "simple_agent", "my_custom_agent", "data_processor" + # - ❌ WRONG: "SimpleAgent", "my-agent", "MyAgent", "simpleAgent", "simple agent" + # - name should be unique and match the filename (without .py extension) + name="simple_agent", + desc="A minimal agent that can perform basic LLM calls" +) +def build_simple_swarm(): + # Create Agent configuration + agent_config = AgentConfig( + llm_config=ModelConfig( + llm_model_name=os.environ.get("LLM_MODEL_NAME", "gpt-3.5-turbo"), + llm_provider=os.environ.get("LLM_PROVIDER", "openai"), + llm_api_key=os.environ.get("LLM_API_KEY"), + llm_base_url=os.environ.get("LLM_BASE_URL", "https://api.openai.com/v1"), + llm_temperature=float(os.environ.get("LLM_TEMPERATURE", "0.1")), # temperature = 0.1 is preferred, while the thus built agent is conducting coding or other serious tasks. + params={"max_completion_tokens": 40960} + ) + ) + + # Extract all server keys from mcp_config + mcp_servers = list(mcp_config.get("mcpServers", {}).keys()) + + # Mandatory Use - You must use this. + sandbox = Sandbox( + mcp_config=mcp_config + ) + sandbox.reuse = True + + # Create SimpleAgent instance + simple_agent = SimpleAgent( + name="simple_agent", + desc="A simple AI Agent specific for basic LLM calls and tool execution", + conf=agent_config, + # Note: If the Agent needs to read/write files, remind the agent in the system_prompt to use absolute paths. + # Relative paths should be avoided. Use os.path.abspath() or Path(__file__).parent to resolve paths. 
+ system_prompt="""You are an all-capable AI assistant aimed at solving any task presented by the user. + + """, + mcp_servers=mcp_servers, + mcp_config=mcp_config, + sandbox=sandbox + ) + + # Return the Swarm containing this Agent + return Swarm(simple_agent) +``` + +### Standard MCP Configuration (`mcp_config.py`) +```python +mcp_config = { + "mcpServers": { + "csv": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.mscsv" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "docx": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.msdocx" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "download": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.tools.download" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "xlsx": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.msxlsx" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "image": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.media.image" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "pdf": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.pdf" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "pptx": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.mspptx" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "search": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.tools.search" + ], + "env": { + "GOOGLE_API_KEY": "${GOOGLE_API_KEY}", + "GOOGLE_CSE_ID": "${GOOGLE_CSE_ID}" + }, + "client_session_timeout_seconds": 9999.0 + }, + "terminal": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.tools.terminal" + ] + }, + "txt": { + "command": "python", + "args": [ + 
"-m", + "examples.gaia.mcp_collections.documents.txt" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "ms-playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest", + "--no-sandbox", + "--isolated", + "--output-dir=/tmp/playwright", + "--timeout-action=10000", + ], + "env": { + "PLAYWRIGHT_TIMEOUT": "120000", + "SESSION_REQUEST_CONNECT_TIMEOUT": "120" + } + } + } +} +``` \ No newline at end of file diff --git a/aworld-skills/text2agent/SKILL.md b/aworld-skills/text2agent/SKILL.md new file mode 100644 index 000000000..54bcea3d0 --- /dev/null +++ b/aworld-skills/text2agent/SKILL.md @@ -0,0 +1,490 @@ +--- +name: text2agent +description: Creates new agents from user requirements by generating Python implementation and mcp_config. +tool_names: ["AGENT_REGISTRY", "CAST_SEARCH", "human"] +--- + +## Role: Master Agent Architect + +You are a **Master Agent Architect**. Your purpose is not merely to generate code, but to reverse-engineer the "soul" of successful agents and synthesize new, superior ones. You operate like a master craftsman studying the works of other masters to inform your own creations. + +-- **The "Skeleton" vs. The "Soul"**: Any agent has a "skeleton" (mcp_config, tool_list) and a "soul" (the system_prompt). While you must assemble the skeleton correctly, your true expertise lies in understanding and replicating the soul: the unique logic, guiding principles, workflow, and personality that make an agent effective. **Shallow learning (just copying tools) is a failure. Deep synthesis is your primary directive.** + +-- **Your Process**: You will always start with search as a robust foundational template, but you will then actively seek out and **deconstruct specialized reference agents** to extract their unique "genius." You will then fuse this specialized genius onto the search foundation to create a new agent that is both robust and uniquely suited to its task. 
+ +You have **AGENT_REGISTRY** and **CAST_SEARCH** available. Use them to read **reference agent SKILL.md** from two sources when building a new agent: (1) **platform built-in** skills (e.g. search under the official skills directory), and (2) **user-uploaded** skills under the **SKILLS_PATH** directory (e.g. `~/.aworld/SKILLS/`). Reuse their tool configuration and system prompt patterns to better match user expectations. New agents are still written to `AGENTS_PATH`; reference SKILLs are read-only. + +## The Strict Workflow: Non-Negotiable Process +You MUST follow this sequence for every request. There are no exceptions. Each time only use one tool call! + +### **Step 1: Deep Requirement Analysis (MANDATORY FIRST ACTION)** +**STOP. Before any other action, you MUST perform a deep analysis of the user's request.** This is the most critical step. + +Analyze the user's input to understand: +1. **Core Objective**: What is the primary goal or task for the new agent? What problem does it solve? +2. **Agent Identity**: What are the agent's class name, registration name, and description? +3. **Required Capabilities**: What specific tools, APIs, or data processing functions are needed? +4. **System Prompt**: What core instructions, personality, and tone should guide the agent's behavior? +5. **MCP Configuration**: Which MCP servers (e.g., pptx, google) are required? The terminal server is a mandatory, non-negotiable tool for every agent you build. It is essential for two primary reasons: +* Dependency Management: Installing missing Python packages via pip install. +* File System Operations: Verifying the current location (pwd) and saving all output files to that consistent, predictable location. You must ensure this tool is always included. +6. **Assumptions & Ambiguities**: What did you infer that wasn't explicitly stated? What details are missing or could be interpreted in multiple ways? + +**After completing this analysis, you MUST proceed directly to execution. 
Make reasonable assumptions for any ambiguities.** + +### Step 2: Deep Architecture Analysis & Fusion (MANDATORY) + +This is where you demonstrate your architectural expertise. You will deconstruct reference agents to extract their core patterns and then fuse them into a new design. + +#### Part A: Deconstruction and Analysis +**1. Foundation Analysis (search)** +- **Action:** First, locate the search agent using `AGENT_REGISTRY.list_desc`. +- **Analysis:** Read its SKILL.md using `CAST_SEARCH.read_file`. Your goal is to internalize its foundational architecture: robust ReAct loop, comprehensive error handling, safe file I/O rules, and multi-tool coordination logic. This is your baseline for all new agents. + +**2. Specialist Analysis (Other Relevant Agents)** +- **Goal:** To find a specialized agent whose unique logic can be fused with the search foundation. +- **Action (Discovering Specialists):** You must now methodically search both sources for a relevant specialist: + **Source 1: Built-in Agents** + - **Command:** Use the AGENT_REGISTRY tool to list all platform-provided skills. + ```text + AGENT_REGISTRY.list_desc(source_type="built-in") + ``` + - **Analysis:** Review the description of each agent returned from the command. Identify and select the agent whose purpose is most specifically aligned with the user's current request. + + **Source 2: User-Uploaded Agents** + - **Command:** First, get the user's custom skills path. Then, use CAST_SEARCH to find all SKILL.md files within it. + ```bash + SKILLS_PATH="${SKILLS_PATH:-$HOME/.aworld/SKILLS/}" + CAST_SEARCH.glob_search(pattern='**/SKILL.md', path="$SKILLS_PATH") + ``` + - **Analysis:** Examine the file paths returned by the search. The directory structure (e.g., `.../SKILLS/financial_report_agent/SKILL.md`) is a strong clue to the agent's function. Select the most relevant skill. 
+ +- **Deep Dive Analysis:** Once you have selected the most relevant specialist agent, read its SKILL.md using `CAST_SEARCH.read_file`. You must now perform a comparative analysis against search. Ask yourself: + - What is this agent's "secret sauce"? What unique rules, steps, or principles are in its system prompt that are NOT in search's? + - How is its workflow different? Does it have a specific multi-step process for its domain (e.g., for financial analysis: 1. gather data, 2. perform calculation, 3. add disclaimer, 4. format output)? + - What are its specialized guardrails? What does it explicitly forbid or require? + +**This analysis is critical. You must identify the unique DNA of the specialist agent to be fused into your new design.** + +#### Part B: Synthesis and Fusion +**3. Architectural Fusion:** Now, you will construct the new agent's `system_prompt`. This is a fusion process, not a simple copy-paste. +- **Start with the Foundation:** Begin with the robust, general-purpose instruction set you analyzed from search (planning, tool use, file safety, etc.). +- **Inject the Specialization:** Carefully layer the specialist agent's "secret sauce" on top of the search foundation. This means integrating its unique workflow steps, domain-specific rules, and specialized output formats. +- **Fusion:** The new prompt should feel like the custom-tuned for a specific purpose, with the search foundation as supplement. The new agent's overall `system_prompt` should highly respect the professional and specialized knowledge if found. + +**4. Tool Configuration:** Based on this fused architecture, define the final `mcp_config` and `tool_list`. It should include search's foundational tools (like terminal, search) plus any specialized tools required by the new task. + +**If no reference clearly fits the requirement, skip this step and proceed to Step 3.** + +### **Step 3: Environment and Directory Setup** +1. 
**Create Agent Directory**: Use the determined agent name (in snake_case) to create its directory. + ```bash + AGENTS_PATH="${AGENTS_PATH:-$HOME/.aworld/agents}" + echo "AGENTS_PATH: $AGENTS_PATH" + mkdir -p "$AGENTS_PATH/" + ``` + +### **Step 4: Code Generation (Execution Phase)** +**This is a mandatory execution step. You MUST use terminal commands to write ALL files. Do not output code in your response; write it directly to files.** + +1. **Generate Main Agent File** (`.py`): + ```bash + cat > "${AGENTS_PATH:-$HOME/.aworld/agents}//.py" << 'ENDOFFILE' + # Complete Python agent code goes here... + ENDOFFILE + ``` +2. **Generate MCP Config File** (`mcp_config.py` - if required): + ```bash + cat > "${AGENTS_PATH:-$HOME/.aworld/agents}//mcp_config.py" << 'ENDOFFILE' + # MCP server configuration dictionary goes here... + ENDOFFILE + ``` +3. **Create `__init__.py`**: + ```bash + touch "${AGENTS_PATH:-$HOME/.aworld/agents}//__init__.py" + ``` + +### **Step 5: Verification** +Confirm that all files were created successfully. +```bash +ls -la "${AGENTS_PATH:-$HOME/.aworld/agents}//" +``` + +### **Step 6: Dynamic Registration** +**MANDATORY FINAL STEP: Register the new agent with the current swarm.** Use the `AGENT_REGISTRY` tool. + +* **Action**: `dynamic_register` +* **Parameters**: + * `local_agent_name`: The name of the agent executing this workflow (e.g., "Aworld"). + * `register_agent_name`: The name of the newly generated agent (must match the @agent decorator name, which must be snake_case). + +**Example**: `AGENT_REGISTRY` tool call with params `{"local_agent_name": "Aworld", "register_agent_name": "my_custom_agent"}` + + +### **Step 7: MCP Server Dependency Check and Installation (MANDATORY)** +**After successfully registering the agent, you MUST verify and prepare the operational environment for the newly created agent's tools (MCP servers).** The goal is to ensure all MCP servers can be launched without dependency errors. 
You will use your terminal tool to perform this check. + +7.1 **Identify Target Modules**: First, parse the newly created mcp_config.py to get a list of all MCP server module paths. Use the following command block exactly as written to extract the paths. + + + ```PYTHON_SCRIPT=" + import sys, os + agents_path = os.path.expanduser('${AGENTS_PATH:-$HOME/.aworld/agents}') + agent_path = os.path.join(agents_path, '') + if os.path.isdir(agent_path): + sys.path.insert(0, agent_path) + try: + from mcp_config import mcp_config + for server, config in mcp_config.get('mcpServers', {}).items(): + args = config.get('args', []) + if '-m' in args: + try: + module_index = args.index('-m') + 1 + if module_index < len(args): + print(args[module_index]) + except (ValueError, IndexError): + pass + except (ImportError, ModuleNotFoundError): + # This handles cases where mcp_config.py doesn't exist or is empty. + # No output means no modules to check, which is a valid state. + pass + " + MODULE_PATHS=$(python -c "$PYTHON_SCRIPT") + echo "Modules to check: $MODULE_PATHS" +(Reminder: You MUST replace with the actual folder name from Step 2.) ``` + +7.2 **Iterate and Install Dependencies**: For each identified in the $MODULE_PATHS list, you must perform the following check-and-install loop. +* **A. Attempt a Timed Launch:**: Execute the module using python -m but wrap it in a timeout command. This will attempt to start the server and kill it after 2 seconds. This is a "dry run" to trigger any ModuleNotFoundError. + timeout 2s python -m +* **B. Analyze the Output**: Carefully inspect the stderr from the command's output. Your only concern is the specific error ModuleNotFoundError. + If stderr contains ModuleNotFoundError: No module named '': Proceed to C. + If the command completes (exits with code 0) or is killed by the timeout (exit code 124) WITHOUT a ModuleNotFoundError: The check for this module is considered SUCCESSFUL. You can move on to the next module in your list. 
+ If any other error occurs: Ignore it for now. The goal of this step is solely to resolve Python package dependencies. +* **C. Install the Missing Package**: If a ModuleNotFoundError was detected, parse the `<missing_package_name>` from the error message and immediately install it using pip, with timeout 600. + pip install <missing_package_name> + 7.3 **Repeat the Check**: After a successful installation, you MUST return to Step 7.2.A and re-run the timeout 2s python -m <module_path> command for the SAME module. This is to verify the installation was successful and to check if the module has other, different dependencies that need to be installed. Continue this loop until the launch attempt for the current module no longer produces a ModuleNotFoundError. + +After this loop has been successfully completed for all modules in $MODULE_PATHS, the new agent's environment is confirmed to be ready. + +--- +## 🛠️ Tool Reference + +
+

CAST_SEARCH Tool

+ +**Purpose**: Search and read files inside a given directory. Use it to discover and read **third-party agent SKILL.md** files (reference agents) so you can reuse their tool configuration and system prompt patterns when building the new agent. + +**Scope**: Reference agents come from two read-only sources: +(1) **Platform built-in** — the skills directory that contains subfolders such as `text2agent`, `optimizer`, `search` (each may have a `SKILL.md`); +(2) **User-uploaded** — the directory specified by **SKILLS_PATH** (e.g. `~/.aworld/SKILLS/`), where user-provided skill subfolders and their `SKILL.md` files live. The **new agent** you create is written to `AGENTS_PATH` (e.g. `~/.aworld/agents//`). CAST_SEARCH is for **reading** reference SKILLs from either source only; you do not write to those directories. + +**Primary Actions**: +* **`read_file`**: Read the full or partial content of a file. Use to read a specific reference SKILL (e.g. `file_path` = path to `search/SKILL.md` under the skills root). Parameters: `file_path` (required), `limit`, `offset`, `show_details`. +* **`glob_search`**: Find files by pattern. Use to list available reference SKILLs (e.g. `pattern` = `**/SKILL.md`, `path` = skills root). Parameters: `pattern` (required), `path`, `max_depth`, `max_results`, `show_details`. +* **`grep_search`**: Content search by regex. Use if you need to search inside SKILL files (e.g. for "mcp_config" or "system prompt"). Parameters: `pattern` (required), `path`, `case_sensitive`, `context_lines`, `max_results`, `include_patterns`, `show_details`. + +**Typical flow for Step 2**: For built-in references, use paths from `AGENT_REGISTRY.list_desc` (which returns `file_structure` containing the directory structure); for user-uploaded references, use `CAST_SEARCH.glob_search` with `path` = `SKILLS_PATH` to find `**/SKILL.md`, then call `CAST_SEARCH.read_file` with the chosen SKILL.md path. 
**Read the SKILL.md content carefully and analyze how the skill utilizes files in the `file_structure`** — this understanding is crucial for properly structuring the new agent. **Additionally, read the files listed in the `file_structure` from `AGENT_REGISTRY.list_desc`** (for built-in references) using `CAST_SEARCH.read_file` to get the complete picture of the reference skill's implementation. Extract front matter (mcp tool's usage) and body (system prompt)'s content and logic from SKILL.md, along with relevant code patterns from other files in the file_structure, to construct the new agent's and `system_prompt` and `mcp_config.py` (please strictly refer to **mcp_config.py example** in the following section for the correct and professional mcp_config.py format) or other logic patterns(e.g. scripts) in the generated code. +
+ +
+

AGENT_REGISTRY Tool

+ +**Purpose**: Register the newly created agent with the current swarm so it becomes discoverable and usable. + +**Action**: `dynamic_register` — see **Step 6: Dynamic Registration** for parameters and example. + +
+ +--- +## 🚫 Strict Prohibitions & Requirements 🚫 +* **DO NOT** discuss, plan, or describe what you will do. **EXECUTE IT**. +* **DO NOT** call multiple tools each time**. +* **DO NOT** ask users for more details about the agent to be built. +* **DO NOT** ask for confirmation of file names, paths, or generated code. +* **DO NOT** ask users to confirm plans, todo lists, or execution steps. Only clarify ambiguous requirements. +* **DO NOT** generate code without built-in error handling (try/except) and logging. +* **MUST** use `cat > ... << 'EOF'` for file creation. +* **MUST** generate all required files (`.py`, `mcp_config.py`, `__init__.py`). +* **MUST** use dollar-sign delimiters for all mathematical expressions ($...$ for inline, $$...$$ for block). +* **MUST** use Markdown for all formatting and `code fences` for code. + + + +## Code Generation Standards & Reference +All generated Python code must be valid, follow PEP 8, and adhere to the following structure. + +* **Main Agent File (`.py`)**: + 1. Import necessary modules (`BaseAgent`, `Observation`, `ActionModel`, `Swarm`, `AgentConfig`, `@agent`, etc.). + 2. Define an agent class inheriting from `BaseAgent[Observation, List[ActionModel]]`. + 3. Implement `__init__` and the core `async_policy` logic. + 4. Add the @agent decorator with a name and desc. CRITICAL: The name argument MUST be strictly in snake_case (e.g., simple_agent, NOT SimpleAgent) and all lowercase. This is mandatory for successful registration. + 5. Include a `build__swarm` function that configures and returns a `Swarm` instance containing the agent. It must load MCP servers from `mcp_config.py` if it exists. + +* **MCP Config File (`mcp_config.py`)**: + 1. Define a single dictionary named `mcp_config`. + 2. This dictionary must contain a key `mcpServers` with nested objects for each server configuration. + 3. Each server must have a `command`, `args`, and optionally an `env` block. + 4. 
Ensure mcp_config.py uses environment variable placeholders (e.g., ${VAR}) instead of hardcoded secrets. + 5. Please strictly refer to the **`mcp_config.py`** in the later section for the correct and professional format. + +
+CLICK TO VIEW: Full Code Reference Example (SimpleAgent with MCPs) + +**`simple_agent.py`** +```python +import os +from typing import Dict, Any, List + +from aworld.agents.llm_agent import Agent +from aworld.config import AgentConfig, ModelConfig +from aworld.core.agent.swarm import Swarm +from aworld.core.common import Observation, ActionModel +from aworld.core.context.base import Context +from aworld.core.event.base import Message +# use logger to log +from aworld.logs.util import logger +from aworld.runners.hook.hook_factory import HookFactory +from aworld.runners.hook.hooks import PreLLMCallHook, PostLLMCallHook +from aworld_cli.core import agent +from aworld.sandbox import Sandbox +# The prefix simple_agent is required and must match the agent name +from simple_agent.mcp_config import mcp_config + +@HookFactory.register(name="pre_simple_agent_hook") +class PreSimpleAgentHook(PreLLMCallHook): + """Hook triggered before LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" + + async def exec(self, message: Message, context: Context = None) -> Message: + # Important: This if-check cannot be removed and must match the current agent's name (here 'simple_agent'). + # This ensures the Hook only processes messages belonging to the current agent, avoiding side effects on other agents. + if message.sender.startswith('simple_agent'): + # ⚠️ Important Note: The Message object (aworld.core.event.base.Message) is the communication carrier between agents in AWorld. + # It uses the 'payload' attribute to carry actual data, distinct from a direct 'content' attribute. + # In PreLLMCallHook, message.payload is usually an Observation object. To access content, use message.payload.content. 
+ # Incorrect Example: message.content # ❌ AttributeError: 'Message' object has no attribute 'content' + # Correct Example: message.payload.content if hasattr(message.payload, 'content') else None # ✅ + # Note: Do not modify message.payload or other input/output content here. + # Hooks should be used for: + # - Logging and monitoring + # - Counting calls and performance metrics + # - Permission checks or auditing + # - Other auxiliary functions that do not affect I/O + pass + return message + + +@HookFactory.register(name="post_simple_agent_hook") +class PostSimpleAgentHook(PostLLMCallHook): + """Hook triggered after LLM execution. Used for monitoring, logging, etc. Should NOT modify input/output content.""" + + async def exec(self, message: Message, context: Context = None) -> Message: + # Important: This if-check cannot be removed and must match the current agent's name (here 'simple_agent'). + # This ensures the Hook only processes messages belonging to the current agent. + if message.sender.startswith('simple_agent'): + # Note: Do not modify input/output content (like message.content) here. + # Hooks should be used for: + # - Logging and monitoring + # - Counting calls and performance metrics + # - Result auditing or quality checks + # - Other auxiliary functions that do not affect I/O + pass + return message + + +class SimpleAgent(Agent): + """A minimal Agent implementation capable of performing basic LLM calls.""" + + async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}, message: Message = None, + **kwargs) -> List[ActionModel]: + # Important Notes: + # 1. async_policy represents the model invocation; calling super().async_policy directly completes the LLM call. + # 2. Do not modify the observation object within async_policy; the observation should remain immutable. + # 3. Hooks (PreSimpleAgentHook and PostSimpleAgentHook) are strictly for monitoring/logging auxiliary functions + # and should never modify input/output content. 
+ return await super().async_policy(observation, info, message, **kwargs) + + +@agent( + name="simple_agent", # <--- CHANGED: Must be snake_case (lowercase with underscores) + desc="A minimal agent that can perform basic LLM calls" +) +def build_simple_swarm(): + # Create Agent configuration + agent_config = AgentConfig( + # DO NOT MODIFY: LLM config block below must be preserved exactly (env vars + defaults). + llm_config=ModelConfig( + llm_model_name=os.environ.get("LLM_MODEL_NAME", "gpt-3.5-turbo"), + llm_provider=os.environ.get("LLM_PROVIDER", "openai"), + llm_api_key=os.environ.get("LLM_API_KEY"), + llm_base_url=os.environ.get("LLM_BASE_URL", "https://api.openai.com/v1"), + llm_temperature=float(os.environ.get("LLM_TEMPERATURE", "0.1")), # temperature = 0.1 is preferred, while the thus built agent is conducting coding or other serious tasks. + params={"max_completion_tokens": 40960} + ) + ) + + # Extract all server keys from mcp_config + mcp_servers = list(mcp_config.get("mcpServers", {}).keys()) + + # Mandatory Use - You must use this. + sandbox = Sandbox( + mcp_config=mcp_config + ) + sandbox.reuse = True + + # Create SimpleAgent instance + simple_agent = SimpleAgent( + name="simple_agent", + desc="A simple AI Agent specific for basic LLM calls and tool execution", + conf=agent_config, + # Note: If the Agent needs to read/write files, remind the agent in the system_prompt to use absolute paths. + # Relative paths should be avoided. Use os.path.abspath() or Path(__file__).parent to resolve paths. + system_prompt="""You are an all-capable AI assistant aimed at solving any task presented by the user. + + """, + mcp_servers=mcp_servers, + mcp_config=mcp_config, + sandbox=sandbox + ) + + # Return the Swarm containing this Agent + return Swarm(simple_agent) +``` + +**`mcp_config.py`** you should strictly follow its format while building the new agent's mcp_config.py! 
+```python +mcp_config = { + "mcpServers": { + "csv": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.mscsv" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "docx": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.msdocx" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "download": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.tools.download" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "xlsx": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.msxlsx" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "image": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.media.image" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "pptx": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.mspptx" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "search": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.tools.search" + ], + "env": { + "GOOGLE_API_KEY": "${GOOGLE_API_KEY}", + "GOOGLE_CSE_ID": "${GOOGLE_CSE_ID}" + }, + "client_session_timeout_seconds": 9999.0 + }, + "terminal": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.tools.terminal" + ] + }, + "txt": { + "command": "python", + "args": [ + "-m", + "examples.gaia.mcp_collections.documents.txt" + ], + "env": { + }, + "client_session_timeout_seconds": 9999.0 + }, + "ms-playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest", + "--no-sandbox", + "--isolated", + "--output-dir=/tmp/playwright", + "--timeout-action=10000", + ], + "env": { + "PLAYWRIGHT_TIMEOUT": "120000", + "SESSION_REQUEST_CONNECT_TIMEOUT": "120" + } + } + } +} +``` +
+ + + +## Final Output Format +After successfully completing all steps, provide a concise summary in the following format: + +``` +Requirement Analysis Summary +- Main Objective: [Summary of the agent's purpose] +- Agent Name: [agent_name] +- Agent Class: [AgentClassName] +- Required Capabilities: [List of capabilities] +- MCP Servers: [List of MCP servers] + +Created Agent Directory +- Path: [full_path_to_directory] + +Generated Agent Files +- [agent_name].py: Created successfully. +- mcp_config.py: Created successfully. (or "Not required.")i +- __init__.py: Created successfully. + +Dynamic Registration +- Status: Agent '[register_agent_name]' successfully registered to '[local_agent_name]'s team swarm. + + +Now, please strictly conduct the workflows (step 1 to 7), to build the agent. \ No newline at end of file diff --git a/aworld/agents/audio_agent.py b/aworld/agents/audio_agent.py new file mode 100644 index 000000000..5a9bcf321 --- /dev/null +++ b/aworld/agents/audio_agent.py @@ -0,0 +1,459 @@ +# coding: utf-8 +# Copyright (c) 2025 inclusionAI. +""" +Audio Agent - Agent for text-to-speech audio generation. + +This agent provides a simple interface for converting text to speech using +the Doubao TTS provider. It handles the entire workflow from text input to +audio file generation. 
+ +Example usage: + from aworld.agents.audio_agent import AudioAgent + from aworld.config.conf import AgentConfig + + agent = AudioAgent( + name="audio_gen", + conf=AgentConfig( + llm_provider="doubao_tts", + llm_api_key="YOUR_API_KEY", + llm_base_url="https://your-api-endpoint.com" + ), + default_voice_type="zh_male_M392_conversation_wvae_bigtts", + default_encoding="mp3", + output_dir="./audio_output" + ) + + # Use the agent + from aworld.core.common import Observation + + obs = Observation( + content="ByteDance text-to-speech", + info={ + "voice_type": "zh_female_F001_conversation_wvae_bigtts", + "speed_ratio": 1.2 + } + ) + + result = await agent.async_policy(obs) +""" + +import os +import traceback +import uuid +from typing import Any, Dict, List, Optional + +from aworld.agents.llm_agent import LLMAgent +from aworld.core.agent.base import AgentResult +from aworld.core.common import ActionModel, Observation, Config +from aworld.core.context.base import Context +from aworld.core.event.base import Message, Constants +from aworld.events.util import send_message +from aworld.logs.util import logger +from aworld.models.doubao_tts_provider import DoubaoTTSProvider +from aworld.models.model_response import ModelResponse +from aworld.output.base import Output + + +class AudioAgent(LLMAgent): + """An agent dedicated to text-to-speech audio generation. + + Each invocation is a single-round call: the agent takes text input, + generates speech audio, saves it to a file, and terminates. + No tool-calling loop is entered. + + The text to synthesize is taken from ``Observation.content``. + Additional audio parameters can be supplied via ``Observation.info`` + (keys: ``voice_type``, ``encoding``, ``speed_ratio``, ``output_path``). + Instance-level defaults are used as fallbacks. 
+ + Attributes: + default_voice_type: Default voice type for speech synthesis + default_encoding: Default audio encoding format + default_speed_ratio: Default speech speed ratio + output_dir: Default directory for saving audio files + auto_filename: Whether to auto-generate filenames + """ + + @staticmethod + def _ensure_doubao_tts_config(conf): + """Ensure the config uses doubao_tts provider. + + This method forcibly sets the llm_provider to 'doubao_tts' because + AudioAgent only works with DoubaoTTSProvider. If the user provided + a different provider, it will be overridden with a warning. + + Args: + conf: Input configuration (AgentConfig, dict, or ConfigDict) + + Returns: + A new config object with llm_provider set to 'doubao_tts' + + Raises: + ValueError: If conf is None + """ + from aworld.config.conf import AgentConfig, ModelConfig + + if conf is None: + raise ValueError( + "conf must be provided. Pass an AgentConfig with llm_provider, " + "llm_api_key, and llm_base_url." + ) + + # Check if provider needs to be overridden + original_provider = None + if isinstance(conf, AgentConfig): + original_provider = conf.llm_config.llm_provider + elif hasattr(conf, 'llm_provider'): + original_provider = conf.llm_provider + elif isinstance(conf, dict): + original_provider = conf.get('llm_provider') + + # Log warning if overriding + if original_provider and original_provider != "doubao_tts": + logger.warning( + f"AudioAgent: Overriding llm_provider from '{original_provider}' " + f"to 'doubao_tts'. AudioAgent only works with DoubaoTTSProvider." 
+ ) + + # Create a new AgentConfig with doubao_tts provider + if isinstance(conf, AgentConfig): + # For AgentConfig, we need to modify llm_config + # Get the llm_config dict + llm_config_dict = conf.llm_config.model_dump(exclude_none=True) + llm_config_dict['llm_provider'] = "doubao_tts" + + # Create new ModelConfig + new_llm_config = ModelConfig(**llm_config_dict) + + # Create new AgentConfig with the modified llm_config + conf_dict = conf.model_dump(exclude_none=True) + conf_dict['llm_config'] = new_llm_config + + return AgentConfig(**conf_dict) + elif isinstance(conf, dict): + # Modify dict directly + conf['llm_provider'] = "doubao_tts" + return conf + else: + # For other types (ConfigDict, etc.), try to handle gracefully + logger.warning( + f"AudioAgent: Unexpected config type {type(conf).__name__}. " + f"Attempting to proceed anyway." + ) + return conf + + def __init__( + self, + name: str, + conf: Config | None = None, + desc: str = None, + agent_id: str = None, + *, + # Audio generation defaults (overridable per-call via Observation.info) + default_voice_type: Optional[str] = None, + default_encoding: str = "mp3", + default_speed_ratio: float = 1.0, + output_dir: Optional[str] = None, + auto_filename: bool = True, + **kwargs, + ): + """Initialize AudioAgent. + + Args: + name: Agent name + conf: AgentConfig specifying the TTS provider, API key, and base URL. + Must not be None. The llm_provider will be forcibly set to + 'doubao_tts' regardless of the input value. + desc: Agent description exposed as tool description + agent_id: Explicit agent ID; auto-generated if None + default_voice_type: Default voice type identifier + (e.g., "zh_male_M392_conversation_wvae_bigtts") + default_encoding: Default audio encoding format (mp3, wav, pcm, ogg_opus) + default_speed_ratio: Default speech speed ratio (0.5 to 2.0) + output_dir: Directory to save generated audio files. + Defaults to current working directory. 
+ auto_filename: Whether to auto-generate filenames based on timestamp + and UUID. If False, output_path must be provided per call. + **kwargs: Forwarded to ``LLMAgent.__init__`` + + Raises: + ValueError: If conf is None or invalid + TypeError: If the provider is not DoubaoTTSProvider after initialization + """ + # Validate and ensure doubao_tts config + conf = self._ensure_doubao_tts_config(conf) + + super().__init__( + name=name, + conf=conf, + desc=desc or "Text-to-speech audio generation agent", + agent_id=agent_id, + **kwargs, + ) + + # Verify that the provider is DoubaoTTSProvider + if self.llm and self.llm.provider: + if not isinstance(self.llm.provider, DoubaoTTSProvider): + error_msg = ( + f"[AudioAgent:{self.id()}] Expected DoubaoTTSProvider, " + f"but got {type(self.llm.provider).__name__}. " + f"AudioAgent only works with DoubaoTTSProvider. " + f"Config llm_provider was set to 'doubao_tts', but provider " + f"initialization failed. Please check your provider registry and " + f"ensure DoubaoTTSProvider is properly registered." + ) + logger.error(error_msg) + raise TypeError(error_msg) + else: + error_msg = ( + f"[AudioAgent:{self.id()}] Provider initialization failed. " + f"self.llm or self.llm.provider is None. " + f"Please check your configuration (api_key, base_url)." 
+ ) + logger.error(error_msg) + raise RuntimeError(error_msg) + + self.default_voice_type = default_voice_type or "zh_male_M392_conversation_wvae_bigtts" + self.default_encoding = default_encoding + self.default_speed_ratio = default_speed_ratio + self.output_dir = output_dir or os.getcwd() + self.auto_filename = auto_filename + + # ------------------------------------------------------------------ + # Core policy — single-round audio generation + # ------------------------------------------------------------------ + + async def async_policy( + self, + observation: Observation, + info: Dict[str, Any] = {}, + message: Message = None, + **kwargs, + ) -> List[ActionModel]: + """Single-round audio generation policy. + + Extracts the text and audio parameters from the observation, calls + the TTS provider, saves the audio file, and returns the result as an + ActionModel. The agent is marked as finished immediately so no + further loop iterations occur. + + Args: + observation: Contains the text to synthesize in ``content`` and + optional overrides in ``info`` (voice_type, encoding, + speed_ratio, output_path, uid). + info: Supplementary information dict (merged with + ``observation.info`` if both are non-empty). + message: Incoming event message carrying context. + **kwargs: Additional parameters (unused by this agent). + + Returns: + A single-element list with an ActionModel whose ``policy_info`` + contains the audio generation result dict, or an error description + string on failure. + """ + self.context = message.context if message else None + self._finished = False + + # Merge observation.info and caller-supplied info + obs_info: Dict[str, Any] = dict(observation.info or {}) + obs_info.update(info or {}) + + text = observation.content or "" + if not text: + error_msg = "Empty text provided; audio generation requires non-empty text." 
+ logger.warning(f"[AudioAgent:{self.id()}] {error_msg}") + self._finished = True + return [ActionModel(agent_name=self.id(), policy_info=error_msg)] + + # Resolve audio parameters (observation.info overrides instance defaults) + voice_type: str = obs_info.pop("voice_type", self.default_voice_type) + encoding: str = obs_info.pop("encoding", self.default_encoding) + speed_ratio: float = obs_info.pop("speed_ratio", self.default_speed_ratio) + uid: Optional[str] = obs_info.pop("uid", None) + + # Determine output path + output_path: Optional[str] = obs_info.pop("output_path", None) + if not output_path and self.auto_filename: + # Auto-generate filename + timestamp = uuid.uuid4().hex[:8] + filename = f"audio_{timestamp}.{encoding}" + output_path = os.path.join(self.output_dir, filename) + + if not output_path: + error_msg = ( + "No output_path provided and auto_filename is disabled. " + "Please provide output_path in observation.info or enable auto_filename." + ) + logger.error(f"[AudioAgent:{self.id()}] {error_msg}") + self._finished = True + return [ActionModel(agent_name=self.id(), policy_info=error_msg)] + + # Ensure output directory exists + output_dir = os.path.dirname(output_path) or self.output_dir + os.makedirs(output_dir, exist_ok=True) + + logger.info( + f"[AudioAgent:{self.id()}] Generating audio: " + f"text_length={len(text)}, voice_type={voice_type}, " + f"encoding={encoding}, speed_ratio={speed_ratio}, " + f"output_path={output_path}" + ) + + audio_response: Optional[ModelResponse] = None + try: + audio_response = await self._invoke_audio_generation( + text=text, + voice_type=voice_type, + encoding=encoding, + speed_ratio=speed_ratio, + uid=uid, + output_path=output_path, + context=message.context if message else None, + **obs_info, # Forward any remaining parameters + ) + logger.info(f"[AudioAgent:{self.id()}] Audio generation response: {audio_response}") + except Exception as exc: + error_msg = f"Audio generation failed: {exc}" + logger.error( + 
async def _invoke_audio_generation(
    self,
    text: str,
    voice_type: str,
    encoding: str,
    speed_ratio: float,
    uid: Optional[str],
    output_path: str,
    context: Context = None,
    **extra_kwargs,
) -> ModelResponse:
    """Call the underlying TTS provider.

    Prefers the provider's native async entry point; otherwise runs the
    blocking call in the default thread-pool executor so it does not
    block the event loop.

    Args:
        text: Text to synthesize.
        voice_type: Voice type identifier.
        encoding: Audio encoding format.
        speed_ratio: Speech speed ratio.
        uid: User ID for the request.
        output_path: Path to save the audio file.
        context: Runtime context (unused by the provider, passed through).
        **extra_kwargs: Additional provider-specific parameters.

    Returns:
        ModelResponse with audio data and metadata.

    Raises:
        TypeError: If the configured provider is not a DoubaoTTSProvider.
        AttributeError: If the provider exposes no TTS entry point.
        Any exception raised by the provider itself.
    """
    import asyncio

    provider = self.llm.provider

    # Fail fast with an actionable message when the agent is wired to the
    # wrong provider (conf.llm_provider must be 'doubao_tts').
    if not isinstance(provider, DoubaoTTSProvider):
        raise TypeError(
            f"AudioAgent requires DoubaoTTSProvider, "
            f"but got {type(provider).__name__}. "
            f"Please ensure conf.llm_provider is set to 'doubao_tts'."
        )

    # Defensive guard: DoubaoTTSProvider defines both entry points, but a
    # subclass could theoretically remove them.
    if not hasattr(provider, "text_to_speech") and not hasattr(provider, "atext_to_speech"):
        raise AttributeError(
            f"Provider {type(provider).__name__} does not have "
            f"text_to_speech or atext_to_speech methods."
        )

    # Single keyword bundle shared by both call paths (was duplicated).
    call_kwargs = dict(
        text=text,
        voice_type=voice_type,
        encoding=encoding,
        speed_ratio=speed_ratio,
        uid=uid,
        output_path=output_path,
        **extra_kwargs,
    )

    # Prefer the native async API when available.
    if hasattr(provider, "atext_to_speech"):
        return await provider.atext_to_speech(**call_kwargs)

    # Fall back to the sync API in the default executor. Use
    # get_running_loop(): calling get_event_loop() from inside a coroutine
    # is deprecated since Python 3.10.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: provider.text_to_speech(**call_kwargs),
    )


# ------------------------------------------------------------------
# Override is_agent_finished — always finish after one round
# ------------------------------------------------------------------

def is_agent_finished(self, llm_response: ModelResponse, agent_result: AgentResult) -> bool:
    """AudioAgent always finishes after a single round."""
    self._finished = True
    return True
+ params = {"is_tool_result": True} + policy_result = [ActionModel(agent_name=self.id(), policy_info=result_payload, params=params)] logger.info(f"agent_result: {result_payload}") return policy_result diff --git a/aworld/config/conf.py b/aworld/config/conf.py index 10c8c60c2..68dcc9473 100644 --- a/aworld/config/conf.py +++ b/aworld/config/conf.py @@ -118,7 +118,7 @@ def to_dict(self) -> ConfigDict: class ModelConfig(BaseConfig): model_config = ConfigDict(extra='allow') - llm_provider: str = "openai" + llm_provider: Optional[str] = None # Set to None to allow automatic provider detection llm_model_name: Optional[str] = None llm_temperature: float = 1. llm_base_url: str = "https://api.openai.com/v1" diff --git a/aworld/experimental/cast/tools/cast_analysis_tool.py b/aworld/experimental/cast/tools/cast_analysis_tool.py index a2c18e9f1..1e3d14b1c 100644 --- a/aworld/experimental/cast/tools/cast_analysis_tool.py +++ b/aworld/experimental/cast/tools/cast_analysis_tool.py @@ -44,12 +44,12 @@ class CAstAnalysisAction(ToolAction): required=False, desc="Whether to show detailed analysis information" ), - "enable_dependency_graph": ParamInfo( - name="enable_dependency_graph", - type="boolean", - required=False, - desc="Whether to build dependency graph and PageRank (costly for large repos, ~96s for 1k+ files). Default false. Set to true only when explicitly needed." - ) + # "enable_dependency_graph": ParamInfo( + # name="enable_dependency_graph", + # type="boolean", + # required=False, + # desc="Whether to build dependency graph and PageRank (costly for large repos, ~96s for 1k+ files). Default false. Set to true only when explicitly needed." 
+ # ) }, desc="""Analyze the repository and build the three-tier index: - L1 logic (project structure, call/dependency graph, heatmap) diff --git a/aworld/experimental/cast/tools/cast_coder_tool.py b/aworld/experimental/cast/tools/cast_coder_tool.py index ec9b06b19..51d39a4e8 100644 --- a/aworld/experimental/cast/tools/cast_coder_tool.py +++ b/aworld/experimental/cast/tools/cast_coder_tool.py @@ -151,27 +151,41 @@ def bar(): SEARCH_REPLACE = ToolActionInfo( name="search_replace", input_params={ - "operation_json": ParamInfo( - name="operation_json", + "type": ParamInfo( + name="type", type="string", required=True, - desc="""JSON format precise search and replace operation instruction. Format: -{ - "operation": { - "type": "search_replace", - "file_path": "path/to/your/file.py", - "search": "CODE_BLOCK_TO_FIND", - "replace": "NEW_CODE_BLOCK", - "exact_match_only": true, - "replace_all": false - } -} - -Parameters: type (string, required) Must be "search_replace"; file_path (string, required) Relative path from source_dir; search (string, required) One or more complete lines when modifying existing file, or empty string for full file replacement / creating new file; replace (string, required) Replacement block or full file content; exact_match_only (boolean, optional) Fixed as true; replace_all (boolean, optional) When true, replace all occurrences of search in file; when false, only first match (default: false). - -Full file replacement: Use search="" and put full content in replace - works for both creating new files and replacing entire existing file content. - -Best Practices: When modifying existing file, use multi-line blocks with structural context (def/class) for accuracy. Use replace_all=true when the same block appears multiple times. 
For full file replacement, use search="" and put full content in replace.""" + desc='Operation type, must be "search_replace"' + ), + "file_path": ParamInfo( + name="file_path", + type="string", + required=True, + desc="Relative file path from source_dir" + ), + "search": ParamInfo( + name="search", + type="string", + required=True, + desc='Search block. Use empty string "" for full file replacement / creation.' + ), + "replace": ParamInfo( + name="replace", + type="string", + required=True, + desc="Replacement block or full file content" + ), + "exact_match_only": ParamInfo( + name="exact_match_only", + type="boolean", + required=False, + desc="Whether to use exact match only (default: true)" + ), + "replace_all": ParamInfo( + name="replace_all", + type="boolean", + required=False, + desc="When true, replace all occurrences; otherwise only first match (default: false)" ), "source_dir": ParamInfo( name="source_dir", @@ -425,12 +439,23 @@ async def do_step( action_results.append(action_result) elif action_name == CAstCoderAction.SEARCH_REPLACE.value.name: # Search and replace operation - operation_json = action.params.get("operation_json") + operation_type = action.params.get("type") + file_path = action.params.get("file_path") + search = action.params.get("search") + replace = action.params.get("replace") + exact_match_only = action.params.get("exact_match_only", True) + replace_all = action.params.get("replace_all", False) source_dir_raw = action.params.get("source_dir") show_details = action.params.get("show_details", True) - if not operation_json: - raise ValueError("operation_json is required") + if operation_type != "search_replace": + raise ValueError('type must be "search_replace"') + if not file_path: + raise ValueError("file_path is required") + if search is None: + raise ValueError("search is required") + if replace is None: + raise ValueError("replace is required") if source_dir_raw is None or source_dir_raw == "": raise ValueError( "source_dir is 
class DoubaoTTSProvider(LLMProviderBase):
    """Doubao TTS (Text-to-Speech) provider implementation.

    Interfaces with the Doubao (Volcengine) TTS API to convert text into
    speech. Supports multiple voice types, audio encoding formats, and
    speech speed control. The sync and async request paths share the same
    parameter validation and payload construction helpers so they cannot
    drift apart.

    Attributes:
        DEFAULT_VOICE_TYPE: Default voice type for speech synthesis.
        DEFAULT_ENCODING: Default audio encoding format.
        DEFAULT_SPEED_RATIO: Default speech speed ratio.
        SUPPORTED_ENCODINGS: List of supported audio encoding formats.
    """

    DEFAULT_VOICE_TYPE = "zh_male_M392_conversation_wvae_bigtts"
    DEFAULT_ENCODING = "mp3"
    DEFAULT_SPEED_RATIO = 1.0
    SUPPORTED_ENCODINGS = ["mp3", "wav", "pcm", "ogg_opus"]

    def _init_provider(self) -> LLMHTTPHandler:
        """Initialize the HTTP handler used for Doubao TTS API requests.

        Credentials fall back to the DOUBAO_TTS_API_KEY / DOUBAO_TTS_BASE_URL
        environment variables when not passed to the constructor.

        Returns:
            LLMHTTPHandler: Configured HTTP handler for API requests.

        Raises:
            ValueError: If API key or base URL is not provided.
        """
        api_key = self.api_key or os.getenv("DOUBAO_TTS_API_KEY", "")
        if not api_key:
            raise ValueError(
                "Doubao TTS API key not found. Set the DOUBAO_TTS_API_KEY "
                "environment variable or pass api_key to the constructor."
            )

        base_url = self.base_url or os.getenv("DOUBAO_TTS_BASE_URL", "")
        if not base_url:
            raise ValueError(
                "Doubao TTS base URL not found. Set the DOUBAO_TTS_BASE_URL "
                "environment variable or pass base_url to the constructor."
            )

        self.api_key = api_key
        self.base_url = base_url.rstrip("/")

        return LLMHTTPHandler(
            base_url=self.base_url,
            api_key=api_key,
            model_name=self.model_name or "doubao_tts",
            timeout=self.kwargs.get("timeout", 60),
            max_retries=self.kwargs.get("max_retries", 3),
        )

    def _init_async_provider(self) -> LLMHTTPHandler:
        """Initialize the async provider (reuses the sync HTTP handler)."""
        return self.provider if self.need_sync else self._init_provider()

    @classmethod
    def supported_models(cls) -> list:
        """Return the list of supported TTS model names."""
        return ["doubao_tts"]

    def _resolve_tts_params(
        self,
        text: str,
        voice_type: Optional[str],
        encoding: Optional[str],
        speed_ratio: Optional[float],
        uid: Optional[str],
    ) -> tuple:
        """Apply defaults and validate TTS parameters (shared by sync/async).

        Args:
            text: Text to synthesize (must be non-empty).
            voice_type: Voice type, or None for DEFAULT_VOICE_TYPE.
            encoding: Encoding, or None for DEFAULT_ENCODING.
            speed_ratio: Speed ratio, or None for DEFAULT_SPEED_RATIO.
            uid: User ID, or None to auto-generate one.

        Returns:
            tuple: (voice_type, encoding, speed_ratio, uid) with defaults applied.

        Raises:
            ValueError: If text is empty or encoding is unsupported.
        """
        if not text:
            raise ValueError("Text parameter is required and cannot be empty")

        voice_type = voice_type or self.DEFAULT_VOICE_TYPE
        encoding = encoding or self.DEFAULT_ENCODING
        speed_ratio = speed_ratio if speed_ratio is not None else self.DEFAULT_SPEED_RATIO
        uid = uid or f"uid_{uuid.uuid4().hex[:8]}"

        if encoding not in self.SUPPORTED_ENCODINGS:
            raise ValueError(
                f"Unsupported encoding '{encoding}'. "
                f"Supported encodings: {', '.join(self.SUPPORTED_ENCODINGS)}"
            )

        # Warn (but proceed) on unusual speed ratios. Previously only the
        # sync path warned; sharing the helper makes both paths consistent.
        if not (0.5 <= speed_ratio <= 2.0):
            logger.warning(
                f"Speed ratio {speed_ratio} is outside recommended range [0.5, 2.0]. "
                "Using anyway, but results may be unexpected."
            )

        return voice_type, encoding, speed_ratio, uid

    @staticmethod
    def _build_tts_payload(
        text: str,
        voice_type: str,
        encoding: str,
        speed_ratio: float,
        uid: str,
        reqid: str,
    ) -> Dict[str, Any]:
        """Build the JSON request body for the /v1/genericCall endpoint."""
        return {
            "model": "doubao_tts",
            "method": "/api/v1/tts",
            "app": {
                # Token must be a non-empty string but carries no meaning;
                # cluster must be exactly "volcano_tts".
                "token": "111",
                "cluster": "volcano_tts"
            },
            "user": {
                "uid": uid
            },
            "audio": {
                "voice_type": voice_type,
                "encoding": encoding,
                "speed_ratio": speed_ratio
            },
            "request": {
                "reqid": reqid,
                "text": text,
                "operation": "query"
            }
        }

    def text_to_speech(
        self,
        text: str,
        voice_type: Optional[str] = None,
        encoding: Optional[str] = None,
        speed_ratio: Optional[float] = None,
        uid: Optional[str] = None,
        output_path: Optional[str] = None,
        **kwargs
    ) -> ModelResponse:
        """Convert text to speech using Doubao TTS API (synchronous).

        Args:
            text: Text content to synthesize into speech.
            voice_type: Voice type identifier (default: DEFAULT_VOICE_TYPE).
            encoding: Audio encoding format (mp3, wav, pcm, ogg_opus).
            speed_ratio: Speech speed ratio (0.5 to 2.0, default: 1.0).
            uid: User ID for the request (auto-generated if not provided).
            output_path: Optional path to save the audio file.
            **kwargs: Additional parameters for the API request (e.g. reqid).

        Returns:
            ModelResponse: Response containing audio data and metadata.

        Raises:
            RuntimeError: If the sync provider is not initialized.
            ValueError: If invalid parameters are provided.
            LLMResponseError: If the API request fails.
        """
        if not self.provider:
            raise RuntimeError(
                "Sync provider not initialized. Set 'sync_enabled=True' in the constructor."
            )

        voice_type, encoding, speed_ratio, uid = self._resolve_tts_params(
            text, voice_type, encoding, speed_ratio, uid
        )
        payload = self._build_tts_payload(
            text, voice_type, encoding, speed_ratio, uid,
            reqid=kwargs.get("reqid", str(uuid.uuid4())),
        )

        logger.info(
            f"[DoubaoTTSProvider] Synthesizing speech: "
            f"text_length={len(text)}, voice_type={voice_type}, "
            f"encoding={encoding}, speed_ratio={speed_ratio}"
        )

        try:
            # Make API request, then decode/validate the response.
            response_data = self.provider.sync_call(payload, endpoint="/v1/genericCall")
            return self._parse_tts_response(
                response_data,
                encoding=encoding,
                output_path=output_path
            )
        except Exception as e:
            error_msg = f"Doubao TTS request failed: {e}"
            logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise LLMResponseError(error_msg, "doubao_tts", None)

    async def atext_to_speech(
        self,
        text: str,
        voice_type: Optional[str] = None,
        encoding: Optional[str] = None,
        speed_ratio: Optional[float] = None,
        uid: Optional[str] = None,
        output_path: Optional[str] = None,
        **kwargs
    ) -> ModelResponse:
        """Convert text to speech using Doubao TTS API (asynchronous).

        Args:
            text: Text content to synthesize into speech.
            voice_type: Voice type identifier.
            encoding: Audio encoding format.
            speed_ratio: Speech speed ratio.
            uid: User ID for the request.
            output_path: Optional path to save the audio file.
            **kwargs: Additional parameters (e.g. reqid).

        Returns:
            ModelResponse: Response containing audio data and metadata.

        Raises:
            RuntimeError: If the async provider is not initialized.
            ValueError: If invalid parameters are provided.
            LLMResponseError: If the API request fails.
        """
        if not self.async_provider:
            raise RuntimeError(
                "Async provider not initialized. Set 'async_enabled=True' in the constructor."
            )

        voice_type, encoding, speed_ratio, uid = self._resolve_tts_params(
            text, voice_type, encoding, speed_ratio, uid
        )
        payload = self._build_tts_payload(
            text, voice_type, encoding, speed_ratio, uid,
            reqid=kwargs.get("reqid", str(uuid.uuid4())),
        )

        logger.info(
            f"[DoubaoTTSProvider] Synthesizing speech (async): "
            f"text_length={len(text)}, voice_type={voice_type}, "
            f"encoding={encoding}, speed_ratio={speed_ratio}"
        )

        try:
            # Make async API request, then decode/validate the response.
            response_data = await self.async_provider.async_call(
                payload,
                endpoint="/v1/genericCall"
            )
            return self._parse_tts_response(
                response_data,
                encoding=encoding,
                output_path=output_path
            )
        except Exception as e:
            error_msg = f"Doubao TTS request failed (async): {e}"
            logger.error(f"{error_msg}\n{traceback.format_exc()}")
            raise LLMResponseError(error_msg, "doubao_tts", None)

    def _parse_tts_response(
        self,
        response_data: Dict[str, Any],
        encoding: str,
        output_path: Optional[str] = None
    ) -> ModelResponse:
        """Parse TTS API response and extract audio data.

        Args:
            response_data: Raw API response data.
            encoding: Audio encoding format.
            output_path: Optional path to save the audio file.

        Returns:
            ModelResponse: Parsed response with audio data attached as
            custom attributes (audio_data, audio_encoding,
            audio_duration_ms, output_path).

        Raises:
            LLMResponseError: If response parsing fails or API returns error.
        """
        # code == 3000 is the Doubao TTS success code; anything else is an error.
        code = response_data.get("code")
        if code != 3000:
            error_msg = response_data.get("message", "Unknown error")
            logger.error(
                f"[DoubaoTTSProvider] API error: code={code}, message={error_msg}"
            )
            raise LLMResponseError(
                f"Doubao TTS API error (code {code}): {error_msg}",
                "doubao_tts",
                response_data
            )

        # Audio arrives base64-encoded in the "data" field.
        audio_base64 = response_data.get("data")
        if not audio_base64:
            raise LLMResponseError(
                "No audio data in response",
                "doubao_tts",
                response_data
            )

        try:
            audio_bytes = base64.b64decode(audio_base64)
        except Exception as e:
            raise LLMResponseError(
                f"Failed to decode audio data: {e}",
                "doubao_tts",
                response_data
            )

        # Timing metadata comes back as strings in "addition".
        addition = response_data.get("addition", {})
        duration_ms = addition.get("duration", "0")
        first_pkg_ms = addition.get("first_pkg", "0")

        logger.info(
            f"[DoubaoTTSProvider] Speech synthesis successful: "
            f"audio_size={len(audio_bytes)} bytes, "
            f"duration={duration_ms}ms, "
            f"first_package={first_pkg_ms}ms"
        )

        # Best-effort save: a failed write is logged but does not fail the call.
        if output_path:
            try:
                output_file = Path(output_path)
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_bytes(audio_bytes)
                logger.info(f"[DoubaoTTSProvider] Audio saved to: {output_path}")
            except Exception as e:
                logger.warning(
                    f"[DoubaoTTSProvider] Failed to save audio to {output_path}: {e}"
                )

        # Prefer the server-provided request id; otherwise generate one.
        response_id = response_data.get("request_id") or response_data.get("reqid") or f"tts-{uuid.uuid4().hex[:8]}"

        response = ModelResponse(
            id=response_id,  # Required: unique identifier for this response
            model="doubao_tts",
            content="",  # TTS doesn't have text content
            usage={
                "audio_bytes": len(audio_bytes),
                "duration_ms": int(duration_ms),
                "first_pkg_ms": int(first_pkg_ms)
            },
            finish_reason="success",
            raw_response=response_data
        )

        # Attach audio data as custom attributes for downstream consumers.
        response.audio_data = audio_bytes
        response.audio_encoding = encoding
        response.audio_duration_ms = int(duration_ms)
        response.output_path = output_path

        return response

    # -------------------------------------------------------------------------
    # Abstract method implementations (required by LLMProviderBase)
    # These methods are not used for TTS, but must be implemented
    # -------------------------------------------------------------------------

    def completion(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.0,
        max_tokens: int = None,
        stop: List[str] = None,
        context: Any = None,
        **kwargs
    ) -> ModelResponse:
        """Not implemented for TTS provider.

        DoubaoTTSProvider is a text-to-speech provider and does not support
        text completion. Use text_to_speech() method instead.

        Raises:
            NotImplementedError: Always raised as this method is not applicable.
        """
        raise NotImplementedError(
            "DoubaoTTSProvider is a TTS provider and does not support completion(). "
            "Use text_to_speech() method instead."
        )

    def postprocess_response(self, response: Any) -> ModelResponse:
        """Not implemented for TTS provider.

        DoubaoTTSProvider uses custom response processing in text_to_speech()
        and atext_to_speech() methods.

        Raises:
            NotImplementedError: Always raised as this method is not applicable.
        """
        raise NotImplementedError(
            "DoubaoTTSProvider uses custom response processing. "
            "This method is not used."
        )
+ ) diff --git a/aworld/models/llm.py b/aworld/models/llm.py index fcfe46ef9..3dd94073c 100644 --- a/aworld/models/llm.py +++ b/aworld/models/llm.py @@ -51,6 +51,7 @@ "azure_openai": AzureOpenAIProvider, "ant": AntProvider, "together_video": TogetherVideoProvider, + "doubao_tts": None, # Lazy loaded to avoid circular import } # --------------------------------------------------------------------------- @@ -288,7 +289,22 @@ def _identify_provider(self, provider: str = None, base_url: str = None, model_n """ identified_provider = "openai" - # 1. Match base_url against endpoint patterns (covers both LLM and video providers) + # 1. FIRST: Check explicit provider (highest priority) + all_providers = {**PROVIDER_CLASSES, **VIDEO_PROVIDER_CLASSES} + if provider: + if provider in all_providers: + logger.info( + f"Using explicit provider: {provider}" + ) + return provider + else: + logger.warning( + f"Explicit provider '{provider}' not found in registry. " + f"Available providers: {list(all_providers.keys())}. " + f"Falling back to auto-detection." + ) + + # 2. SECOND: Match base_url against endpoint patterns (covers both LLM and video providers) if base_url: for p, patterns in ENDPOINT_PATTERNS.items(): if any(pattern in base_url for pattern in patterns): @@ -298,8 +314,8 @@ def _identify_provider(self, provider: str = None, base_url: str = None, model_n ) return identified_provider - # 2. Match model_name — video registry takes priority over LLM model names - if model_name and not base_url: + # 3. THIRD: Match model_name — video registry takes priority over LLM model names + if model_name: # Check video model registry first video_provider = _match_video_registry(model_name) if video_provider: @@ -317,14 +333,9 @@ def _identify_provider(self, provider: str = None, base_url: str = None, model_n ) break - # 3. 
Explicit provider overrides the auto-detected result - all_providers = {**PROVIDER_CLASSES, **VIDEO_PROVIDER_CLASSES} - if provider and provider in all_providers and identified_provider != provider: - logger.debug( - f"Provider mismatch: {provider} != {identified_provider}, using {provider} as provider" - ) - identified_provider = provider - + # 4. FOURTH: Default fallback + if identified_provider == "openai" and not provider and not base_url and not model_name: + logger.debug("No provider information provided, using default: openai") return identified_provider def _create_provider(self, **kwargs): @@ -344,7 +355,13 @@ def _create_provider(self, **kwargs): if self.provider_name in VIDEO_PROVIDER_CLASSES: self.provider = VIDEO_PROVIDER_CLASSES[self.provider_name](**kwargs) elif self.provider_name in PROVIDER_CLASSES: - self.provider = PROVIDER_CLASSES[self.provider_name](**kwargs) + provider_class = PROVIDER_CLASSES[self.provider_name] + # Lazy load DoubaoTTSProvider to avoid circular import + if provider_class is None and self.provider_name == "doubao_tts": + from aworld.models.doubao_tts_provider import DoubaoTTSProvider + provider_class = DoubaoTTSProvider + PROVIDER_CLASSES[self.provider_name] = provider_class + self.provider = provider_class(**kwargs) else: raise ValueError( f"Unknown provider '{self.provider_name}'. " diff --git a/aworld/models/together_video_provider.py b/aworld/models/together_video_provider.py index 80551ae23..da02fa3ec 100644 --- a/aworld/models/together_video_provider.py +++ b/aworld/models/together_video_provider.py @@ -139,7 +139,7 @@ def _init_provider(self): import_package("together") from together import Together - api_key = self.api_key or os.getenv("TOGETHER_API_KEY", "") + api_key = self.api_key or os.getenv("DIFFUSION_API_KEY") or os.getenv("TOGETHER_API_KEY", "") if not api_key: raise ValueError( "Together API key not found. Set the TOGETHER_API_KEY environment "