8
8
9
9
from PIL import Image
10
10
from ultralytics import YOLO
11
- import google . generativeai as genai
11
+
12
12
from operate .config import Config
13
13
from operate .exceptions import ModelNotRecognizedException
14
14
from operate .utils .screenshot import (
35
35
36
36
# Load configuration
37
37
VERBOSE = Config ().verbose
38
+ config = Config ()
38
39
39
40
40
41
async def get_next_action (model , messages , objective , session_id ):
42
+ if VERBOSE :
43
+ print ("[Self-Operating Computer][get_next_action]" )
44
+ print ("[Self-Operating Computer][get_next_action] model" , model )
41
45
if model == "gpt-4" :
42
46
return call_gpt_4_vision_preview (messages ), None
43
47
if model == "gpt-4-with-som" :
@@ -52,11 +56,10 @@ async def get_next_action(model, messages, objective, session_id):
52
56
53
57
54
58
def call_gpt_4_vision_preview (messages ):
55
- config = Config ()
56
- client = config .initialize_openai ()
57
59
if VERBOSE :
58
60
print ("[Self Operating Computer][get_next_action][call_gpt_4_v]" )
59
61
time .sleep (1 )
62
+ client = config .initialize_openai ()
60
63
try :
61
64
screenshots_dir = "screenshots"
62
65
if not os .path .exists (screenshots_dir ):
@@ -137,7 +140,10 @@ def call_gemini_pro_vision(messages, objective):
137
140
"""
138
141
Get the next action for Self-Operating Computer using Gemini Pro Vision
139
142
"""
140
- config = Config ()
143
+ if VERBOSE :
144
+ print (
145
+ "[Self Operating Computer][call_gemini_pro_vision]" ,
146
+ )
141
147
# sleep for a second
142
148
time .sleep (1 )
143
149
try :
@@ -152,11 +158,18 @@ def call_gemini_pro_vision(messages, objective):
152
158
time .sleep (1 )
153
159
prompt = get_system_prompt (objective )
154
160
155
- model = genai .GenerativeModel ("gemini-pro-vision" )
161
+ model = config .initialize_google ()
162
+ if VERBOSE :
163
+ print ("[Self Operating Computer][call_gemini_pro_vision] model" , model )
156
164
157
165
response = model .generate_content ([prompt , Image .open (screenshot_filename )])
158
166
159
167
content = response .text [1 :]
168
+ if VERBOSE :
169
+ print (
170
+ "[Self Operating Computer][call_gemini_pro_vision] response" , response
171
+ )
172
+ print ("[Self Operating Computer][call_gemini_pro_vision] content" , content )
160
173
161
174
content = json .loads (content )
162
175
if VERBOSE :
@@ -176,9 +189,8 @@ def call_gemini_pro_vision(messages, objective):
176
189
177
190
178
191
async def call_gpt_4_vision_preview_labeled (messages , objective ):
179
- config = Config ()
180
- client = config .initialize_openai ()
181
192
time .sleep (1 )
193
+ client = config .initialize_openai ()
182
194
try :
183
195
yolo_model = YOLO ("./operate/models/weights/best.pt" ) # Load your trained model
184
196
screenshots_dir = "screenshots"
0 commit comments