Commit 37b3820

Merge pull request #230 from hetaoBackend/fix/gpt-model
feat(apis): add qwen-vl apis && fix gpt-4o & o1 model use in apis.py
2 parents 14b579f + 7416b12 commit 37b3820

5 files changed: +182 −11 lines changed

README.md

Lines changed: 8 additions & 1 deletion
````diff
@@ -20,7 +20,7 @@ ome
 
 ## Key Features
 - **Compatibility**: Designed for various multimodal models.
-- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3 and LLaVa.**
+- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.**
 - **Future Plans**: Support for additional models.
 
 ## Demo
@@ -76,6 +76,13 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a
 operate -m claude-3
 ```
 
+#### Try qwen `-m qwen-vl`
+Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it.
+
+```
+operate -m qwen-vl
+```
+
 #### Try LLaVa Hosted Through Ollama `-m llava`
 If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
 *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview*
````

operate/config.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -43,6 +43,9 @@ def __init__(self):
         self.anthropic_api_key = (
             None  # instance variables are backups in case saving to a `.env` fails
         )
+        self.qwen_api_key = (
+            None  # instance variables are backups in case saving to a `.env` fails
+        )
 
     def initialize_openai(self):
         if self.verbose:
@@ -66,6 +69,29 @@ def initialize_openai(self):
         client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
         return client
 
+    def initialize_qwen(self):
+        if self.verbose:
+            print("[Config][initialize_qwen]")
+
+        if self.qwen_api_key:
+            if self.verbose:
+                print("[Config][initialize_qwen] using cached qwen_api_key")
+            api_key = self.qwen_api_key
+        else:
+            if self.verbose:
+                print(
+                    "[Config][initialize_qwen] no cached qwen_api_key, try to get from env."
+                )
+            api_key = os.getenv("QWEN_API_KEY")
+
+        client = OpenAI(
+            api_key=api_key,
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+        )
+        client.api_key = api_key
+        client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+        return client
+
     def initialize_google(self):
         if self.google_api_key:
             if self.verbose:
@@ -121,6 +147,7 @@ def validation(self, model, voice_mode):
         self.require_api_key(
             "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3"
         )
+        self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl")
 
     def require_api_key(self, key_name, key_description, is_required):
         key_exists = bool(os.environ.get(key_name))
@@ -147,6 +174,8 @@ def prompt_and_save_api_key(self, key_name, key_description):
             self.google_api_key = key_value
         elif key_name == "ANTHROPIC_API_KEY":
             self.anthropic_api_key = key_value
+        elif key_name == "QWEN_API_KEY":
+            self.qwen_api_key = key_value
         self.save_api_key_to_env(key_name, key_value)
         load_dotenv()  # Reload environment variables
         # Update the instance attribute with the new key
```
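The new `initialize_qwen` method reuses the `openai` client against DashScope's OpenAI-compatible endpoint rather than a Qwen-specific SDK. A minimal standalone sketch of what the resulting client amounts to (assuming `QWEN_API_KEY` is exported and the `openai` package is installed; the text-only test prompt is a placeholder):

```python
import os

from openai import OpenAI

# Same endpoint and key lookup as Config.initialize_qwen()
client = OpenAI(
    api_key=os.getenv("QWEN_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Placeholder request to verify the client works; apis.py sends
# multimodal (text + screenshot) messages to this same model.
response = client.chat.completions.create(
    model="qwen2.5-vl-72b-instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)
```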

operate/models/apis.py

Lines changed: 123 additions & 3 deletions
```diff
@@ -24,7 +24,7 @@
     get_label_coordinates,
 )
 from operate.utils.ocr import get_text_coordinates, get_text_element
-from operate.utils.screenshot import capture_screen_with_cursor
+from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot
 from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET
 
 # Load configuration
@@ -37,6 +37,9 @@ async def get_next_action(model, messages, objective, session_id):
     print("[Self-Operating Computer][get_next_action] model", model)
     if model == "gpt-4":
         return call_gpt_4o(messages), None
+    if model == "qwen-vl":
+        operation = await call_qwen_vl_with_ocr(messages, objective, model)
+        return operation, None
     if model == "gpt-4-with-som":
         operation = await call_gpt_4o_labeled(messages, objective, model)
         return operation, None
@@ -136,6 +139,123 @@ def call_gpt_4o(messages):
         return call_gpt_4o(messages)
 
 
+async def call_qwen_vl_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_qwen_vl_with_ocr]")
+
+    # Capture a screenshot, send it to the model, and post-process the reply
+    try:
+        time.sleep(1)
+        client = config.initialize_qwen()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        # Call the function to capture the screen with the cursor
+        raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png")
+        capture_screen_with_cursor(raw_screenshot_filename)
+
+        # Compress the screenshot to reduce its size
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg")
+        compress_screenshot(raw_screenshot_filename, screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text",
+                 "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="qwen2.5-vl-72b-instruct",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        # used later for the messages
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_qwen_vl_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # add `coordinates` to `content`
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_qwen_vl_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_qwen_vl_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_qwen_vl_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
 def call_gemini_pro_vision(messages, objective):
     """
     Get the next action for Self-Operating Computer using Gemini Pro Vision
@@ -227,7 +347,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
     messages.append(vision_message)
 
     response = client.chat.completions.create(
-        model="o1",
+        model="gpt-4o",
        messages=messages,
     )
 
@@ -340,7 +460,7 @@ async def call_o1_with_ocr(messages, objective, model):
     messages.append(vision_message)
 
     response = client.chat.completions.create(
-        model="gpt-4o",
+        model="o1",
         messages=messages,
     )
 
```
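Besides adding `call_qwen_vl_with_ocr`, the last two hunks un-swap the hard-coded model names so that `call_gpt_4o_with_ocr` requests `gpt-4o` and `call_o1_with_ocr` requests `o1`, matching the commit title. The new function's parsing logic implies a contract with the model: it must return a JSON array of operations, and for `click` operations it supplies only the target `text`, which EasyOCR then resolves to screen coordinates. A hypothetical illustration of that post-processing (the reply and coordinate values are invented for the example):

```python
import json

# A cleaned model reply, as call_qwen_vl_with_ocr expects it after clean_json()
raw_reply = '[{"operation": "click", "text": "Sign in"}]'
operations = json.loads(raw_reply)

for op in operations:
    if op.get("operation") == "click":
        # In apis.py, get_text_element() finds op["text"] among the EasyOCR
        # results and get_text_coordinates() converts the match to x/y,
        # which are attached to the operation before it is returned.
        op["x"], op["y"] = 0.42, 0.17  # stand-ins for the OCR lookup
    # non-click operations pass through unchanged

print(operations)
# [{'operation': 'click', 'text': 'Sign in', 'x': 0.42, 'y': 0.17}]
```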

operate/models/prompts.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -213,16 +213,16 @@ def get_system_prompt(model, objective):
     """
 
     if platform.system() == "Darwin":
-        cmd_string = "command"
-        os_search_str = ["command", "space"]
+        cmd_string = "\"command\""
+        os_search_str = "[\"command\", \"space\"]"
         operating_system = "Mac"
     elif platform.system() == "Windows":
-        cmd_string = "ctrl"
-        os_search_str = ["win"]
+        cmd_string = "\"ctrl\""
+        os_search_str = "[\"win\"]"
         operating_system = "Windows"
     else:
-        cmd_string = "ctrl"
-        os_search_str = ["win"]
+        cmd_string = "\"ctrl\""
+        os_search_str = "[\"win\"]"
         operating_system = "Linux"
 
     if model == "gpt-4-with-som":
@@ -232,7 +232,7 @@ def get_system_prompt(model, objective):
             os_search_str=os_search_str,
             operating_system=operating_system,
         )
-    elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3":
+    elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
 
         prompt = SYSTEM_PROMPT_OCR.format(
             objective=objective,
```
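The quoting change matters because these values are spliced into the system prompt with `str.format`, and the prompt shows the model literal JSON examples: a raw Python list renders through `str()` with single quotes, which is not valid JSON. A small demonstration (the template line is a simplified stand-in for the real `SYSTEM_PROMPT_OCR` text):

```python
# Simplified stand-in for a JSON example inside SYSTEM_PROMPT_OCR
template = '{{ "operation": "press", "keys": {os_search_str} }}'

# Before the fix: the list renders with single quotes -- not valid JSON
print(template.format(os_search_str=["command", "space"]))
# { "operation": "press", "keys": ['command', 'space'] }

# After the fix: the pre-quoted string drops in as proper JSON
print(template.format(os_search_str='["command", "space"]'))
# { "operation": "press", "keys": ["command", "space"] }
```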

operate/utils/screenshot.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -25,3 +25,18 @@ def capture_screen_with_cursor(file_path):
         subprocess.run(["screencapture", "-C", file_path])
     else:
         print(f"The platform you're using ({user_platform}) is not currently supported")
+
+
+def compress_screenshot(raw_screenshot_filename, screenshot_filename):
+    with Image.open(raw_screenshot_filename) as img:
+        # Check if the image has an alpha channel (transparency)
+        if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
+            # Create a white background image
+            background = Image.new('RGB', img.size, (255, 255, 255))
+            # Paste the image onto the background, using the alpha channel as mask
+            background.paste(img, mask=img.split()[3])  # 3 is the alpha channel
+            # Save the result as JPEG
+            background.save(screenshot_filename, 'JPEG', quality=85)  # Adjust quality as needed
+        else:
+            # If no alpha channel, simply convert and save
+            img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85)
```
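JPEG has no alpha channel, so `compress_screenshot` flattens transparent captures onto a white background before saving, and the `quality=85` re-encode shrinks the base64 payload that apis.py embeds in the vision message. Note the function relies on `Image` from Pillow, which this diff does not import, so the module presumably imports it already. A quick standalone sanity check of the size reduction (a sketch assuming Pillow is installed and a PNG capture exists at the hypothetical path below):

```python
import os

from PIL import Image

raw, jpeg = "raw_screenshot.png", "screenshot.jpeg"  # hypothetical paths

# Mirror of the no-alpha branch of compress_screenshot()
with Image.open(raw) as img:
    img.convert("RGB").save(jpeg, "JPEG", quality=85)

print(f"{os.path.getsize(raw)} -> {os.path.getsize(jpeg)} bytes")
```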
