Commit 37b3820

Merge pull request #230 from hetaoBackend/fix/gpt-model
feat(apis): add qwen-vl apis && fix gpt-4o & o1 model use in apis.py
2 parents 14b579f + 7416b12 commit 37b3820

5 files changed: +182 −11 lines changed

README.md

Lines changed: 8 additions & 1 deletion
````diff
@@ -20,7 +20,7 @@ ome
 
 ## Key Features
 - **Compatibility**: Designed for various multimodal models.
-- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3 and LLaVa.**
+- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.**
 - **Future Plans**: Support for additional models.
 
 ## Demo
@@ -76,6 +76,13 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a
 operate -m claude-3
 ```
 
+#### Try qwen `-m qwen-vl`
+Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it.
+
+```
+operate -m qwen-vl
+```
+
 #### Try LLaVa Hosted Through Ollama `-m llava`
 If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
 *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview*
````

operate/config.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -43,6 +43,9 @@ def __init__(self):
         self.anthropic_api_key = (
             None  # instance variables are backups in case saving to a `.env` fails
         )
+        self.qwen_api_key = (
+            None  # instance variables are backups in case saving to a `.env` fails
+        )
 
     def initialize_openai(self):
         if self.verbose:
@@ -66,6 +69,29 @@ def initialize_openai(self):
         client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
         return client
 
+    def initialize_qwen(self):
+        if self.verbose:
+            print("[Config][initialize_qwen]")
+
+        if self.qwen_api_key:
+            if self.verbose:
+                print("[Config][initialize_qwen] using cached qwen_api_key")
+            api_key = self.qwen_api_key
+        else:
+            if self.verbose:
+                print(
+                    "[Config][initialize_qwen] no cached qwen_api_key, try to get from env."
+                )
+            api_key = os.getenv("QWEN_API_KEY")
+
+        client = OpenAI(
+            api_key=api_key,
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+        )
+        client.api_key = api_key
+        client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+        return client
+
     def initialize_google(self):
         if self.google_api_key:
             if self.verbose:
@@ -121,6 +147,7 @@ def validation(self, model, voice_mode):
         self.require_api_key(
             "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3"
         )
+        self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl")
 
     def require_api_key(self, key_name, key_description, is_required):
         key_exists = bool(os.environ.get(key_name))
@@ -147,6 +174,8 @@ def prompt_and_save_api_key(self, key_name, key_description):
             self.google_api_key = key_value
         elif key_name == "ANTHROPIC_API_KEY":
             self.anthropic_api_key = key_value
+        elif key_name == "QWEN_API_KEY":
+            self.qwen_api_key = key_value
         self.save_api_key_to_env(key_name, key_value)
         load_dotenv()  # Reload environment variables
         # Update the instance attribute with the new key
```
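The new `initialize_qwen` method reuses the `openai` client against DashScope's OpenAI-compatible endpoint rather than a Qwen-specific SDK. A minimal standalone sketch of what the resulting client amounts to (assuming `QWEN_API_KEY` is exported and the `openai` package is installed; the text-only test prompt is a placeholder):

```python
import os

from openai import OpenAI

# Same endpoint and key lookup as Config.initialize_qwen()
client = OpenAI(
    api_key=os.getenv("QWEN_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Placeholder request to verify the client works; apis.py sends
# multimodal (text + screenshot) messages to this same model.
response = client.chat.completions.create(
    model="qwen2.5-vl-72b-instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)
```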

operate/models/apis.py

Lines changed: 123 additions & 3 deletions
```diff
@@ -24,7 +24,7 @@
     get_label_coordinates,
 )
 from operate.utils.ocr import get_text_coordinates, get_text_element
-from operate.utils.screenshot import capture_screen_with_cursor
+from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot
 from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET
 
 # Load configuration
@@ -37,6 +37,9 @@ async def get_next_action(model, messages, objective, session_id):
     print("[Self-Operating Computer][get_next_action] model", model)
     if model == "gpt-4":
         return call_gpt_4o(messages), None
+    if model == "qwen-vl":
+        operation = await call_qwen_vl_with_ocr(messages, objective, model)
+        return operation, None
     if model == "gpt-4-with-som":
         operation = await call_gpt_4o_labeled(messages, objective, model)
         return operation, None
@@ -136,6 +139,123 @@ def call_gpt_4o(messages):
         return call_gpt_4o(messages)
 
 
+async def call_qwen_vl_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_qwen_vl_with_ocr]")
+
+    # Capture a screenshot, send it to the model, and post-process the reply
+    try:
+        time.sleep(1)
+        client = config.initialize_qwen()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        # Call the function to capture the screen with the cursor
+        raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png")
+        capture_screen_with_cursor(raw_screenshot_filename)
+
+        # Compress the screenshot to reduce its size
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg")
+        compress_screenshot(raw_screenshot_filename, screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text",
+                 "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="qwen2.5-vl-72b-instruct",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        # used later for the messages
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_qwen_vl_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # add `coordinates` to `content`
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_qwen_vl_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_qwen_vl_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_qwen_vl_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
 def call_gemini_pro_vision(messages, objective):
     """
     Get the next action for Self-Operating Computer using Gemini Pro Vision
@@ -227,7 +347,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
     messages.append(vision_message)
 
     response = client.chat.completions.create(
-        model="o1",
+        model="gpt-4o",
        messages=messages,
     )
 
@@ -340,7 +460,7 @@ async def call_o1_with_ocr(messages, objective, model):
     messages.append(vision_message)
 
     response = client.chat.completions.create(
-        model="gpt-4o",
+        model="o1",
         messages=messages,
     )
 
```
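Besides adding `call_qwen_vl_with_ocr`, the last two hunks un-swap the hard-coded model names so that `call_gpt_4o_with_ocr` requests `gpt-4o` and `call_o1_with_ocr` requests `o1`, matching the commit title. The new function's parsing logic implies a contract with the model: it must return a JSON array of operations, and for `click` operations it supplies only the target `text`, which EasyOCR then resolves to screen coordinates. A hypothetical illustration of that post-processing (the reply and coordinate values are invented for the example):

```python
import json

# A cleaned model reply, as call_qwen_vl_with_ocr expects it after clean_json()
raw_reply = '[{"operation": "click", "text": "Sign in"}]'
operations = json.loads(raw_reply)

for op in operations:
    if op.get("operation") == "click":
        # In apis.py, get_text_element() finds op["text"] among the EasyOCR
        # results and get_text_coordinates() converts the match to x/y,
        # which are attached to the operation before it is returned.
        op["x"], op["y"] = 0.42, 0.17  # stand-ins for the OCR lookup
    # non-click operations pass through unchanged

print(operations)
# [{'operation': 'click', 'text': 'Sign in', 'x': 0.42, 'y': 0.17}]
```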

operate/models/prompts.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -213,16 +213,16 @@ def get_system_prompt(model, objective):
     """
 
     if platform.system() == "Darwin":
-        cmd_string = "command"
-        os_search_str = ["command", "space"]
+        cmd_string = "\"command\""
+        os_search_str = "[\"command\", \"space\"]"
         operating_system = "Mac"
     elif platform.system() == "Windows":
-        cmd_string = "ctrl"
-        os_search_str = ["win"]
+        cmd_string = "\"ctrl\""
+        os_search_str = "[\"win\"]"
         operating_system = "Windows"
     else:
-        cmd_string = "ctrl"
-        os_search_str = ["win"]
+        cmd_string = "\"ctrl\""
+        os_search_str = "[\"win\"]"
         operating_system = "Linux"
 
     if model == "gpt-4-with-som":
@@ -232,7 +232,7 @@ def get_system_prompt(model, objective):
             os_search_str=os_search_str,
             operating_system=operating_system,
         )
-    elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3":
+    elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
 
         prompt = SYSTEM_PROMPT_OCR.format(
             objective=objective,
```
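The quoting change matters because these values are spliced into the system prompt with `str.format`, and the prompt shows the model literal JSON examples: a raw Python list renders through `str()` with single quotes, which is not valid JSON. A small demonstration (the template line is a simplified stand-in for the real `SYSTEM_PROMPT_OCR` text):

```python
# Simplified stand-in for a JSON example inside SYSTEM_PROMPT_OCR
template = '{{ "operation": "press", "keys": {os_search_str} }}'

# Before the fix: the list renders with single quotes -- not valid JSON
print(template.format(os_search_str=["command", "space"]))
# { "operation": "press", "keys": ['command', 'space'] }

# After the fix: the pre-quoted string drops in as proper JSON
print(template.format(os_search_str='["command", "space"]'))
# { "operation": "press", "keys": ["command", "space"] }
```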

operate/utils/screenshot.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -25,3 +25,18 @@ def capture_screen_with_cursor(file_path):
         subprocess.run(["screencapture", "-C", file_path])
     else:
         print(f"The platform you're using ({user_platform}) is not currently supported")
+
+
+def compress_screenshot(raw_screenshot_filename, screenshot_filename):
+    with Image.open(raw_screenshot_filename) as img:
+        # Check if the image has an alpha channel (transparency)
+        if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
+            # Create a white background image
+            background = Image.new('RGB', img.size, (255, 255, 255))
+            # Paste the image onto the background, using the alpha channel as mask
+            background.paste(img, mask=img.split()[3])  # 3 is the alpha channel
+            # Save the result as JPEG
+            background.save(screenshot_filename, 'JPEG', quality=85)  # Adjust quality as needed
+        else:
+            # If no alpha channel, simply convert and save
+            img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85)
```
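JPEG has no alpha channel, so `compress_screenshot` flattens transparent captures onto a white background before saving, and the `quality=85` re-encode shrinks the base64 payload that apis.py embeds in the vision message. Note the function relies on `Image` from Pillow, which this diff does not import, so the module presumably imports it already. A quick standalone sanity check of the size reduction (a sketch assuming Pillow is installed and a PNG capture exists at the hypothetical path below):

```python
import os

from PIL import Image

raw, jpeg = "raw_screenshot.png", "screenshot.jpeg"  # hypothetical paths

# Mirror of the no-alpha branch of compress_screenshot()
with Image.open(raw) as img:
    img.convert("RGB").save(jpeg, "JPEG", quality=85)

print(f"{os.path.getsize(raw)} -> {os.path.getsize(jpeg)} bytes")
```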
