From dd07b0d86e155ed5adb4a2d02185b592633324af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=AE=B7=E9=91=AB?= <vinci@yinxindeMacBook-Air.local>
Date: Wed, 26 Nov 2025 15:18:56 +0800
Subject: [PATCH] feat: add diff tool and support for chrome-devtools

---
 ms_agent/agent/llm_agent.py                   |  8 +-
 ms_agent/tools/filesystem_tool.py             | 94 ++++++++++++++++++-
 projects/code_scratch/architecture.yaml       |  6 +-
 .../code_scratch/callbacks/eval_callback.py   | 50 ++++------
 projects/code_scratch/coding.yaml             |  2 +-
 projects/code_scratch/refine.yaml             | 49 ++++++----
 6 files changed, 158 insertions(+), 51 deletions(-)

diff --git a/ms_agent/agent/llm_agent.py b/ms_agent/agent/llm_agent.py
index af7d42c63..82ed4272b 100644
--- a/ms_agent/agent/llm_agent.py
+++ b/ms_agent/agent/llm_agent.py
@@ -405,7 +405,7 @@ def handle_new_response(self, messages: List[Message],
         assert response_message is not None, 'No response message generated from LLM.'
         if response_message.tool_calls:
             self.log_output('[tool_calling]:')
-            for tool_call in response_message.tool_calls:
+            for idx, tool_call in enumerate(response_message.tool_calls):
                 tool_call = deepcopy(tool_call)
                 if isinstance(tool_call['arguments'], str):
                     try:
@@ -413,6 +413,12 @@ def handle_new_response(self, messages: List[Message],
                             tool_call['arguments'])
                     except json.decoder.JSONDecodeError:
                         pass
+                if tool_call['arguments'] is None:
+                    response_message.tool_calls[idx]['arguments'] = {
+                        '__error__':
+                        'Original arguments were None, replaced by default.'
+                    }
+
                 self.log_output(
                     json.dumps(tool_call, ensure_ascii=False, indent=4))
 
diff --git a/ms_agent/tools/filesystem_tool.py b/ms_agent/tools/filesystem_tool.py
index 03dc966ae..3d6b1992a 100644
--- a/ms_agent/tools/filesystem_tool.py
+++ b/ms_agent/tools/filesystem_tool.py
@@ -5,8 +5,9 @@
 
 from ms_agent.llm.utils import Tool
 from ms_agent.tools.base import ToolBase
-from ms_agent.utils import get_logger
+from ms_agent.utils import MAX_CONTINUE_RUNS, get_logger, retry
 from ms_agent.utils.constants import DEFAULT_OUTPUT_DIR
+from openai import OpenAI
 
 logger = get_logger()
 
@@ -21,6 +22,12 @@ def __init__(self, config, **kwargs):
         super(FileSystemTool, self).__init__(config)
         self.exclude_func(getattr(config.tools, 'file_system', None))
         self.output_dir = getattr(config, 'output_dir', DEFAULT_OUTPUT_DIR)
+        if 'edit_file' not in self.exclude_functions:
+            self.edit_file_config = getattr(config.tools.file_system,
+                                            'edit_file_config', None)
+            self.client = OpenAI(
+                api_key=self.edit_file_config.api_key,
+                base_url=self.edit_file_config.base_url)
         self.trust_remote_code = kwargs.get('trust_remote_code', False)
         self.allow_read_all_files = getattr(
             getattr(config.tools, 'file_system', {}), 'allow_read_all_files',
@@ -125,6 +132,65 @@ async def get_tools(self):
                         'required': ['path'],
                         'additionalProperties': False
                     }),
+                Tool(
+                    tool_name='edit_file',
+                    server_name='file_system',
+                    description=
+                    ('Use this tool to make an edit to an existing file.\n\n'
+                     'This will be read by a less intelligent model, which will quickly apply the edit. '
+                     'You should make it clear what the edit is, while also minimizing the unchanged code you write.\n'
+                     'When writing the edit, you should specify each edit in sequence, with the special comment '
+                     '// ... existing code ... to represent unchanged code in between edited lines.\n\n'
+                     'For example:\n\n// ... existing code ...\nFIRST_EDIT\n// ... existing code ...\n'
+                     'SECOND_EDIT\n// ... existing code ...\nTHIRD_EDIT\n// ... existing code ...\n\n'
+                     'You should still bias towards repeating as few lines of the original file '
+                     'as possible to convey the change.\n'
+                     'But, each edit should contain minimally sufficient context of unchanged lines '
+                     "around the code you're editing to resolve ambiguity.\n"
+                     'DO NOT omit spans of pre-existing code (or comments) without using the '
+                     '// ... existing code ... comment to indicate its absence. '
+                     'If you omit the existing code comment, the model may inadvertently delete these lines.\n'
+                     'If you plan on deleting a section, you must provide context before and after to delete it. '
+                     'If the initial code is ```code \\n Block 1 \\n Block 2 \\n Block 3 \\n code```, '
+                     'and you want to remove Block 2, you would output '
+                     '```// ... existing code ... \\n Block 1 \\n  Block 3 \\n // ... existing code ...```.\n'
+                     'Make sure it is clear what the edit should be, and where it should be applied.\n'
+                     'Make edits to a file in a single edit_file call '
+                     'instead of multiple edit_file calls to the same file. '
+                     'The apply model can handle many distinct edits at once.'
+                     ),
+                    parameters={
+                        'type': 'object',
+                        'properties': {
+                            'path': {
+                                'type': 'string',
+                                'description':
+                                'Path of the target file to modify.'
+                            },
+                            'instructions': {
+                                'type':
+                                'string',
+                                'description':
+                                ('A single sentence instruction describing '
+                                 'what you are going to do for the sketched edit. '
+                                 'This is used to assist the less intelligent model in applying the edit. '
+                                 'Use the first person to describe what you are going to do. '
+                                 'Use it to disambiguate uncertainty in the edit.'
+                                 )
+                            },
+                            'code_edit': {
+                                'type':
+                                'string',
+                                'description':
+                                ('Specify ONLY the precise lines of code that you wish to edit. '
+                                 'NEVER specify or write out unchanged code. '
+                                 'Instead, represent all unchanged code using the comment of the language '
+                                 "you're editing in - example: // ... existing code ..."
+                                 )
+                            }
+                        },
+                        'required': ['path', 'instructions', 'code_edit']
+                    }),
             ]
         }
         return {
@@ -267,3 +333,52 @@ async def list_files(self, path: str = None):
         except Exception as e:
             return f'List files of <{path or "root path"}> failed, error: ' + str(
                 e)
+
+    @retry(max_attempts=MAX_CONTINUE_RUNS, delay=1.0)
+    async def edit_file(self,
+                        path: str = None,
+                        instructions: str = None,
+                        code_edit: str = None):
+        """Apply a sketched edit to an existing file via the diff model.
+
+        The current file content, the instruction and the sketched edit are
+        sent to `self.edit_file_config.diff_model`; the model's reply is
+        written back as the new content of the file.
+
+        Args:
+            path: Path of the target file, joined onto `self.output_dir`.
+            instructions: One-sentence description of the intended edit.
+            code_edit: The edit sketch, with unchanged code elided.
+
+        Returns:
+            A status string reporting success or the failure reason.
+        """
+        # NOTE(review): every exception is converted to a return value below,
+        # so `@retry` presumably never sees a failure -- confirm intent.
+        try:
+            file_path = os.path.join(self.output_dir, path)
+            with open(file_path, 'r') as f:
+                initial_code = f.read()
+            # The file handle is closed before the (slow) model request.
+            response = self.client.chat.completions.create(
+                model=self.edit_file_config.diff_model,
+                messages=[{
+                    'role':
+                    'user',
+                    'content':
+                    (f'<instruction>{instructions}</instruction>\n'
+                     f'<code>{initial_code}</code>\n'
+                     f'<update>{code_edit}</update>')
+                }])
+            merged_code = response.choices[0].message.content
+            # Validate before opening with 'w': opening truncates the file,
+            # so a missing reply would otherwise wipe the original content.
+            if merged_code is None:
+                return (f'Edit file <{path}> failed, error: '
+                        'diff model returned no content.')
+
+            with open(file_path, 'w') as f:
+                f.write(merged_code)
+            return f'Edit file <{path}> successfully.'
+        except Exception as e:
+            return f'Edit file <{path}> failed, error: ' + str(e)
diff --git a/projects/code_scratch/architecture.yaml b/projects/code_scratch/architecture.yaml
index 76936bac1..c790a05a5 100644
--- a/projects/code_scratch/architecture.yaml
+++ b/projects/code_scratch/architecture.yaml
@@ -1,6 +1,6 @@
 llm:
   service: openai
-  model: claude-sonnet-4-5-20250929
+  model: claude-haiku-4-5-20251001
   openai_api_key:
   openai_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
 
@@ -53,6 +53,10 @@ prompt:
 callbacks:
   - callbacks/artifact_callback
 
+tools:
+  file_system:
+    mcp: false
+
 max_chat_round: 1
 
 tool_call_timeout: 30000
diff --git a/projects/code_scratch/callbacks/eval_callback.py b/projects/code_scratch/callbacks/eval_callback.py
index 0757bcb96..2511d61af 100644
--- a/projects/code_scratch/callbacks/eval_callback.py
+++ b/projects/code_scratch/callbacks/eval_callback.py
@@ -4,7 +4,6 @@
 from contextlib import contextmanager
 from typing import List, Optional
 
-from file_parser import extract_code_blocks
 from ms_agent.agent.runtime import Runtime
 from ms_agent.callbacks import Callback
 from ms_agent.llm.utils import Message
@@ -26,6 +25,7 @@ def __init__(self, config: DictConfig):
         self.compile_round = 300
         self.cur_round = 0
         self.last_issue_length = 0
+        self.devtool_prompt = getattr(config.prompt, 'devtool', None)
 
     async def on_task_begin(self, runtime: Runtime, messages: List[Message]):
         self.omit_intermediate_messages(messages)
@@ -87,25 +87,17 @@ def check_install():
     @staticmethod
     def check_runtime():
         try:
-            os.system('pkill -f node')
-            if os.getcwd().endswith('backend'):
-                result = subprocess.run(['npm', 'run', 'dev'],
-                                        capture_output=True,
-                                        text=True,
-                                        timeout=5,
-                                        stdin=subprocess.DEVNULL)
-            else:
-                result = subprocess.run(['npm', 'run', 'build'],
-                                        capture_output=True,
-                                        text=True,
-                                        check=True)
+            result = subprocess.run(['npm', 'run', 'dev'],
+                                    capture_output=True,
+                                    text=True,
+                                    timeout=5,
+                                    stdin=subprocess.DEVNULL)
         except subprocess.CalledProcessError as e:
             output = EvalCallback._parse_e_msg(e)
         except subprocess.TimeoutExpired as e:
             output = EvalCallback._parse_e_msg(e)
         else:
             output = result.stdout + '\n' + result.stderr
-        os.system('pkill -f node')
         return output
 
     def _run_compile(self):
@@ -139,12 +131,21 @@ async def on_generate_response(self, runtime: Runtime,
         self.last_issue_length = len(messages) - 3 - self.last_issue_length
         self.omit_intermediate_messages(messages)
         query = self.get_compile_feedback('frontend').strip()
+
+        # compile -> devtools
         if not query:
-            human_feedback = True
-            query = self.get_human_feedback().strip()
+            feedback_type = 'devtools'
+            query = self.devtool_prompt
+            self.devtool_prompt = 'Use chrome-devtools to thoroughly test again'
         else:
-            human_feedback = False
+            feedback_type = 'compling'
             logger.warn(f'[Compile Feedback]: {query}]')
+
+        # devtools -> human
+        if not query:
+            feedback_type = 'human'
+            query = self.get_human_feedback().strip()
+
         if not query:
             self.feedback_ended = True
             feedback = (
@@ -153,22 +154,11 @@ async def on_generate_response(self, runtime: Runtime,
         else:
             all_local_files = await self.file_system.list_files()
             feedback = (
-                f'Feedback from {"human" if human_feedback else "compling"}: {query}\n'
+                f'Feedback from {feedback_type}: {query}\n'
                 f'The files on the local system of this project: {all_local_files}\n'
                 f'Now please analyze and fix this issue:\n')
         messages.append(Message(role='user', content=feedback))
-
-    async def on_tool_call(self, runtime: Runtime, messages: List[Message]):
-        design, _ = extract_code_blocks(
-            messages[-1].content, target_filename='design.txt')
-        if len(design) > 0:
-            front, design = messages[-1].content.split(
-                '```text: design.txt', maxsplit=1)
-            design, end = design.rsplit('```', 1)
-            design = design.strip()
-            if design:
-                messages[2].content = await self.do_arch_update(
-                    runtime=runtime, messages=messages, updated_arch=design)
+        logger.info(messages)
 
     async def after_tool_call(self, runtime: Runtime, messages: List[Message]):
         runtime.should_stop = runtime.should_stop and self.feedback_ended
diff --git a/projects/code_scratch/coding.yaml b/projects/code_scratch/coding.yaml
index d138ad1aa..298f82864 100644
--- a/projects/code_scratch/coding.yaml
+++ b/projects/code_scratch/coding.yaml
@@ -1,6 +1,6 @@
 llm:
   service: openai
-  model: claude-sonnet-4-5-20250929
+  model: claude-haiku-4-5-20251001
   openai_api_key:
   openai_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
 
diff --git a/projects/code_scratch/refine.yaml b/projects/code_scratch/refine.yaml
index a2ec5e96b..47c0decef 100644
--- a/projects/code_scratch/refine.yaml
+++ b/projects/code_scratch/refine.yaml
@@ -1,6 +1,6 @@
 llm:
   service: openai
-  model: claude-sonnet-4-5-20250929
+  model: claude-haiku-4-5-20251001
   openai_api_key:
   openai_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
 
@@ -40,21 +40,7 @@ prompt:
        * Do a minimum change in case that the normal code is damaged, if you are doing a break change, change related files also
        * Fix other issues you discover while reading the code files, and these issues need to be ones where you have identified the root cause
 
-    4. Express your thinking in concise and clear language. When you fix files, you should use the following format:
-
-    ```type: filename
-    text
-    ```
-
-    for example:
-    ```javascript: frontend/index.js
-    your code here
-    ```
-
-    `javascript: frontend/index.js` will be used as the filename. If you are fixing a file, you need to:
-      * Read the target file
-      * Follow the original data structures and file imports, do not break it(you may read more files depends on)
-      * Then output the complete fixed code of the file.
+    4. Express your thinking in concise and clear language. When you fix files, you should use the edit_file tool
 
     If you only output code snippets to demonstrate your conclusions, you can use standard code blocks:
 
@@ -66,8 +52,27 @@ prompt:
 
     Let's begin:
 
+  devtool: |
+    Use chrome-devtools to thoroughly test the generated frontend and backend code:
+    * List all console messages using list_console_messages to identify JavaScript errors, warnings, or logs
+    * Get detailed error information using get_console_message for each error or warning found
+    * List network requests using list_network_requests to check if API calls are successful, verify HTTP status codes, and identify failed requests
+    * Get detailed network request/response information using get_network_request to analyze request headers, payloads, and response data
+    * Take a snapshot of the page to understand the current UI state and available interactive elements
+    * Test the implemented functionality by:
+        - Clicking on interactive elements (buttons, links, forms) using click tool
+        - Filling out forms using fill or fill_form tools to test user input workflows
+        - Navigating between pages to verify routing works correctly
+        - Testing keyboard interactions using press_key when necessary
+    * Take screenshots at critical steps to document the UI state and verify visual correctness
+    * Analyze the feedback from all these operations to identify:
+        - Console errors (e.g., undefined variables, import errors, runtime exceptions)
+        - Network failures (e.g., 404/500 errors, CORS issues, timeout problems)
+        - UI/UX issues (e.g., broken layouts, missing elements, non-functional buttons)
+        - Logic errors (e.g., incorrect data display, failed form submissions)
+    * Use this comprehensive feedback to help the refine model better understand and fix the issues
+
 callbacks:
-  - callbacks/artifact_callback
   - callbacks/eval_callback
 
 tools:
@@ -77,6 +82,16 @@ tools:
       - create_directory
       - write_file
       - list_files
+    edit_file_config:
+      diff_model: morph-v3-fast
+      api_key:
+      base_url: https://api.morphllm.com/v1
+
+  chrome-devtools:
+    mcp: true
+    command: "npx"
+    args: ["-y", "chrome-devtools-mcp@latest"]
+    transport: "stdio"
 
 max_chat_round: 100