Skip to content

Commit bd5774d

Browse files
authored
Merge pull request #279 from benxu3/async-interpreter
01-rewrite
2 parents 84d5b17 + ef48e9c commit bd5774d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+7339
-8317
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ If you want to run local speech-to-text using Whisper, you must install Rust. Fo
127127

128128
## Customizations
129129

130-
To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in `i.py`. This file sets up an interpreter, and is powered by Open Interpreter.
130+
To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in the `profiles` directory under the `server` directory. Each profile sets up an interpreter, and is powered by Open Interpreter.
131+
132+
To specify the text-to-speech service for the 01 `base_device.py`, set `interpreter.tts` in a profile to one of "openai" for OpenAI, "elevenlabs" for ElevenLabs, or "coqui" for Coqui (local). For the 01 Light, set `SPEAKER_SAMPLE_RATE` to 24000 for Coqui (local) or 22050 for OpenAI TTS. We currently don't support ElevenLabs TTS on the 01 Light.
131133

132134
## Ubuntu Dependencies
133135

software/poetry.lock

+4,011-1,833
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

software/pyproject.toml

+16-2
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,27 @@ psutil = "^5.9.8"
2828
typer = "^0.9.0"
2929
platformdirs = "^4.2.0"
3030
rich = "^13.7.1"
31-
open-interpreter = {extras = ["os"], version = "^0.2.5"}
32-
dateparser = "^1.2.0"
3331
pytimeparse = "^1.1.8"
3432
python-crontab = "^3.0.0"
3533
inquirer = "^3.2.4"
3634
pyqrcode = "^1.2.1"
35+
realtimestt = "^0.1.12"
36+
realtimetts = "^0.4.1"
37+
keyboard = "^0.13.5"
38+
pyautogui = "^0.9.54"
39+
ctranslate2 = "4.1.0"
40+
py3-tts = "^3.5"
41+
elevenlabs = "1.2.2"
42+
groq = "^0.5.0"
43+
open-interpreter = {extras = ["os"], version = "^0.2.6"}
44+
litellm = "1.35.35"
45+
openai = "1.30.5"
46+
pywebview = "*"
47+
pyobjc = "*"
3748

49+
sentry-sdk = "^2.4.0"
50+
plyer = "^2.1.0"
51+
pywinctl = "^0.3"
3852
[build-system]
3953
requires = ["poetry-core"]
4054
build-backend = "poetry.core.masonry.api"

software/pytest.ini

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; Config for Pytest Runner.
22
; suppress Deprecation Warning and User Warning to not spam the interface, but check periodically
3+
34
[pytest]
45
python_files = tests.py test_*.py
56
filterwarnings =

software/source/clients/base_device.py

+68-33
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
load_dotenv() # take environment variables from .env.
44

5+
import subprocess
56
import os
67
import sys
78
import asyncio
@@ -46,7 +47,7 @@
4647
CHUNK = 1024 # Record in chunks of 1024 samples
4748
FORMAT = pyaudio.paInt16 # 16 bits per sample
4849
CHANNELS = 1 # Mono
49-
RATE = 44100 # Sample rate
50+
RATE = 16000 # Sample rate
5051
RECORDING = False # Flag to control recording state
5152
SPACEBAR_PRESSED = False # Flag to track spacebar press state
5253

@@ -60,12 +61,18 @@
6061
# Specify OS
6162
current_platform = get_system_info()
6263

64+
6365
def is_win11():
    """Return True when the Windows build number is 22000 or higher.

    Build 22000 is the threshold this module uses to identify Windows 11.
    `sys.getwindowsversion` only exists on Windows, so on every other
    platform this returns False instead of raising AttributeError.
    """
    get_version = getattr(sys, "getwindowsversion", None)
    if get_version is None:
        # Not running on Windows at all.
        return False
    return get_version().build >= 22000
6567

68+
6669
def is_win10():
    """Return True when running on Windows 10 (and not Windows 11).

    The check is best-effort: any ordinary error raised while probing the
    platform is treated as "not Windows 10". Only `Exception` is caught so
    that KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        return (
            platform.system() == "Windows"
            and "10" in platform.version()
            # is_win11() is consulted as well, so a build it reports as
            # Windows 11 is never counted as Windows 10.
            and not is_win11()
        )
    except Exception:
        return False
7178

@@ -80,9 +87,10 @@ class Device:
8087
def __init__(self):
    """Set up the device's initial, empty runtime state."""
    # Connection / service settings; tts_service is filled in later from
    # the server's "config" message.
    self.server_url = ""
    self.tts_service = ""

    # Keyboard tracking: keys currently held down, plus a separate flag
    # for left-Ctrl used by the Windows Ctrl+C fallback.
    self.pressed_keys = set()
    self.ctrl_pressed = False

    # Media buffers: images captured from the camera, and audio waiting
    # to be played back by play_audiosegments().
    self.captured_images = []
    self.audiosegments = asyncio.Queue()
8694

8795
def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
8896
"""Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
@@ -144,11 +152,25 @@ def queue_all_captured_images(self):
144152

145153
async def play_audiosegments(self):
146154
"""Plays them sequentially."""
155+
156+
mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
157+
mpv_process = subprocess.Popen(
158+
mpv_command,
159+
stdin=subprocess.PIPE,
160+
stdout=subprocess.DEVNULL,
161+
stderr=subprocess.DEVNULL,
162+
)
163+
147164
while True:
148165
try:
149-
for audio in self.audiosegments:
166+
audio = await self.audiosegments.get()
167+
168+
if self.tts_service == "elevenlabs":
169+
mpv_process.stdin.write(audio) # type: ignore
170+
mpv_process.stdin.flush() # type: ignore
171+
else:
150172
play(audio)
151-
self.audiosegments.remove(audio)
173+
152174
await asyncio.sleep(0.1)
153175
except asyncio.exceptions.CancelledError:
154176
# This happens once at the start?
@@ -267,19 +289,18 @@ def toggle_recording(self, state):
267289
def on_press(self, key):
268290
"""Detect spacebar press and Ctrl+C combination."""
269291
self.pressed_keys.add(key) # Add the pressed key to the set
270-
271292

272293
if keyboard.Key.space in self.pressed_keys:
273294
self.toggle_recording(True)
274-
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char('c')} <= self.pressed_keys:
295+
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
275296
logger.info("Ctrl+C pressed. Exiting...")
276297
kill_process_tree()
277298
os._exit(0)
278-
299+
279300
# Windows alternative to the above
280301
if key == keyboard.Key.ctrl_l:
281302
self.ctrl_pressed = True
282-
303+
283304
try:
284305
if key.vk == 67 and self.ctrl_pressed:
285306
logger.info("Ctrl+C pressed. Exiting...")
@@ -289,17 +310,17 @@ def on_press(self, key):
289310
except:
290311
pass
291312

292-
293-
294313
def on_release(self, key):
    """Handle a key release.

    Stops recording when the spacebar is released, captures a camera image
    when 'c' is released (if the camera is enabled), and keeps the
    pressed-key bookkeeping up to date.
    """
    # The key is no longer held down, so stop tracking it.
    self.pressed_keys.discard(key)

    # Clear the left-Ctrl flag used by the Windows Ctrl+C fallback.
    if key == keyboard.Key.ctrl_l:
        self.ctrl_pressed = False

    if key == keyboard.Key.space:
        self.toggle_recording(False)
        return

    if CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
        self.fetch_image_from_camera()
304325

305326
async def message_sender(self, websocket):
@@ -332,35 +353,48 @@ async def exec_ws_communication(websocket):
332353
chunk = await websocket.recv()
333354

334355
logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
356+
# print("received chunk from server")
335357

336358
if type(chunk) == str:
337359
chunk = json.loads(chunk)
338360

339-
message = accumulator.accumulate(chunk)
361+
if chunk.get("type") == "config":
362+
self.tts_service = chunk.get("tts_service")
363+
continue
364+
365+
if self.tts_service == "elevenlabs":
366+
message = chunk
367+
else:
368+
message = accumulator.accumulate(chunk)
369+
340370
if message == None:
341371
# Will be None until we have a full message ready
342372
continue
343373

344374
# At this point, we have our message
345-
346-
if message["type"] == "audio" and message["format"].startswith("bytes"):
375+
if isinstance(message, bytes) or (
376+
message["type"] == "audio" and message["format"].startswith("bytes")
377+
):
347378
# Convert bytes to audio file
348-
349-
audio_bytes = message["content"]
350-
351-
# Create an AudioSegment instance with the raw data
352-
audio = AudioSegment(
353-
# raw audio data (bytes)
354-
data=audio_bytes,
355-
# signed 16-bit little-endian format
356-
sample_width=2,
357-
# 16,000 Hz frame rate
358-
frame_rate=16000,
359-
# mono sound
360-
channels=1,
361-
)
362-
363-
self.audiosegments.append(audio)
379+
if self.tts_service == "elevenlabs":
380+
audio_bytes = message
381+
audio = audio_bytes
382+
else:
383+
audio_bytes = message["content"]
384+
385+
# Create an AudioSegment instance with the raw data
386+
audio = AudioSegment(
387+
# raw audio data (bytes)
388+
data=audio_bytes,
389+
# signed 16-bit little-endian format
390+
sample_width=2,
391+
# 16,000 Hz frame rate
392+
frame_rate=22050,
393+
# mono sound
394+
channels=1,
395+
)
396+
397+
await self.audiosegments.put(audio)
364398

365399
# Run the code if that's the client's job
366400
if os.getenv("CODE_RUNNER") == "client":
@@ -369,7 +403,7 @@ async def exec_ws_communication(websocket):
369403
code = message["content"]
370404
result = interpreter.computer.run(language, code)
371405
send_queue.put(result)
372-
406+
373407
if is_win10():
374408
logger.info("Windows 10 detected")
375409
# Workaround for Windows 10 not latching to the websocket server.
@@ -399,6 +433,7 @@ async def start_async(self):
399433

400434
# Start watching the kernel if it's your job to do that
401435
if os.getenv("CODE_RUNNER") == "client":
436+
# client is not running code!
402437
asyncio.create_task(put_kernel_messages_into_queue(send_queue))
403438

404439
asyncio.create_task(self.play_audiosegments())

0 commit comments

Comments
 (0)