2
2
3
3
load_dotenv () # take environment variables from .env.
4
4
5
+ import subprocess
5
6
import os
6
7
import sys
7
8
import asyncio
46
47
CHUNK = 1024 # Record in chunks of 1024 samples
47
48
FORMAT = pyaudio .paInt16 # 16 bits per sample
48
49
CHANNELS = 1 # Mono
49
- RATE = 44100 # Sample rate
50
+ RATE = 16000 # Sample rate
50
51
RECORDING = False # Flag to control recording state
51
52
SPACEBAR_PRESSED = False # Flag to track spacebar press state
52
53
60
61
# Specify OS
61
62
current_platform = get_system_info ()
62
63
64
+
63
65
def is_win11 ():
64
66
return sys .getwindowsversion ().build >= 22000
65
67
68
+
66
69
def is_win10 ():
67
70
try :
68
- return platform .system () == "Windows" and "10" in platform .version () and not is_win11 ()
71
+ return (
72
+ platform .system () == "Windows"
73
+ and "10" in platform .version ()
74
+ and not is_win11 ()
75
+ )
69
76
except :
70
77
return False
71
78
@@ -80,9 +87,10 @@ class Device:
80
87
def __init__ (self ):
81
88
self .pressed_keys = set ()
82
89
self .captured_images = []
83
- self .audiosegments = []
90
+ self .audiosegments = asyncio . Queue ()
84
91
self .server_url = ""
85
92
self .ctrl_pressed = False
93
+ self .tts_service = ""
86
94
87
95
def fetch_image_from_camera (self , camera_index = CAMERA_DEVICE_INDEX ):
88
96
"""Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
@@ -144,11 +152,25 @@ def queue_all_captured_images(self):
144
152
145
153
async def play_audiosegments (self ):
146
154
"""Plays them sequentially."""
155
+
156
+ mpv_command = ["mpv" , "--no-cache" , "--no-terminal" , "--" , "fd://0" ]
157
+ mpv_process = subprocess .Popen (
158
+ mpv_command ,
159
+ stdin = subprocess .PIPE ,
160
+ stdout = subprocess .DEVNULL ,
161
+ stderr = subprocess .DEVNULL ,
162
+ )
163
+
147
164
while True :
148
165
try :
149
- for audio in self .audiosegments :
166
+ audio = await self .audiosegments .get ()
167
+
168
+ if self .tts_service == "elevenlabs" :
169
+ mpv_process .stdin .write (audio ) # type: ignore
170
+ mpv_process .stdin .flush () # type: ignore
171
+ else :
150
172
play (audio )
151
- self . audiosegments . remove ( audio )
173
+
152
174
await asyncio .sleep (0.1 )
153
175
except asyncio .exceptions .CancelledError :
154
176
# This happens once at the start?
@@ -267,19 +289,18 @@ def toggle_recording(self, state):
267
289
def on_press (self , key ):
268
290
"""Detect spacebar press and Ctrl+C combination."""
269
291
self .pressed_keys .add (key ) # Add the pressed key to the set
270
-
271
292
272
293
if keyboard .Key .space in self .pressed_keys :
273
294
self .toggle_recording (True )
274
- elif {keyboard .Key .ctrl , keyboard .KeyCode .from_char ('c' )} <= self .pressed_keys :
295
+ elif {keyboard .Key .ctrl , keyboard .KeyCode .from_char ("c" )} <= self .pressed_keys :
275
296
logger .info ("Ctrl+C pressed. Exiting..." )
276
297
kill_process_tree ()
277
298
os ._exit (0 )
278
-
299
+
279
300
# Windows alternative to the above
280
301
if key == keyboard .Key .ctrl_l :
281
302
self .ctrl_pressed = True
282
-
303
+
283
304
try :
284
305
if key .vk == 67 and self .ctrl_pressed :
285
306
logger .info ("Ctrl+C pressed. Exiting..." )
@@ -289,17 +310,17 @@ def on_press(self, key):
289
310
except :
290
311
pass
291
312
292
-
293
-
294
313
def on_release (self , key ):
295
314
"""Detect spacebar release and 'c' key press for camera, and handle key release."""
296
- self .pressed_keys .discard (key ) # Remove the released key from the key press tracking set
315
+ self .pressed_keys .discard (
316
+ key
317
+ ) # Remove the released key from the key press tracking set
297
318
298
319
if key == keyboard .Key .ctrl_l :
299
320
self .ctrl_pressed = False
300
321
if key == keyboard .Key .space :
301
322
self .toggle_recording (False )
302
- elif CAMERA_ENABLED and key == keyboard .KeyCode .from_char ('c' ):
323
+ elif CAMERA_ENABLED and key == keyboard .KeyCode .from_char ("c" ):
303
324
self .fetch_image_from_camera ()
304
325
305
326
async def message_sender (self , websocket ):
@@ -332,35 +353,48 @@ async def exec_ws_communication(websocket):
332
353
chunk = await websocket .recv ()
333
354
334
355
logger .debug (f"Got this message from the server: { type (chunk )} { chunk } " )
356
+ # print("received chunk from server")
335
357
336
358
if type (chunk ) == str :
337
359
chunk = json .loads (chunk )
338
360
339
- message = accumulator .accumulate (chunk )
361
+ if chunk .get ("type" ) == "config" :
362
+ self .tts_service = chunk .get ("tts_service" )
363
+ continue
364
+
365
+ if self .tts_service == "elevenlabs" :
366
+ message = chunk
367
+ else :
368
+ message = accumulator .accumulate (chunk )
369
+
340
370
if message == None :
341
371
# Will be None until we have a full message ready
342
372
continue
343
373
344
374
# At this point, we have our message
345
-
346
- if message ["type" ] == "audio" and message ["format" ].startswith ("bytes" ):
375
+ if isinstance (message , bytes ) or (
376
+ message ["type" ] == "audio" and message ["format" ].startswith ("bytes" )
377
+ ):
347
378
# Convert bytes to audio file
348
-
349
- audio_bytes = message ["content" ]
350
-
351
- # Create an AudioSegment instance with the raw data
352
- audio = AudioSegment (
353
- # raw audio data (bytes)
354
- data = audio_bytes ,
355
- # signed 16-bit little-endian format
356
- sample_width = 2 ,
357
- # 16,000 Hz frame rate
358
- frame_rate = 16000 ,
359
- # mono sound
360
- channels = 1 ,
361
- )
362
-
363
- self .audiosegments .append (audio )
379
+ if self .tts_service == "elevenlabs" :
380
+ audio_bytes = message
381
+ audio = audio_bytes
382
+ else :
383
+ audio_bytes = message ["content" ]
384
+
385
+ # Create an AudioSegment instance with the raw data
386
+ audio = AudioSegment (
387
+ # raw audio data (bytes)
388
+ data = audio_bytes ,
389
+ # signed 16-bit little-endian format
390
+ sample_width = 2 ,
391
+ # 16,000 Hz frame rate
392
+ frame_rate = 22050 ,
393
+ # mono sound
394
+ channels = 1 ,
395
+ )
396
+
397
+ await self .audiosegments .put (audio )
364
398
365
399
# Run the code if that's the client's job
366
400
if os .getenv ("CODE_RUNNER" ) == "client" :
@@ -369,7 +403,7 @@ async def exec_ws_communication(websocket):
369
403
code = message ["content" ]
370
404
result = interpreter .computer .run (language , code )
371
405
send_queue .put (result )
372
-
406
+
373
407
if is_win10 ():
374
408
logger .info ("Windows 10 detected" )
375
409
# Workaround for Windows 10 not latching to the websocket server.
@@ -399,6 +433,7 @@ async def start_async(self):
399
433
400
434
# Start watching the kernel if it's your job to do that
401
435
if os .getenv ("CODE_RUNNER" ) == "client" :
436
+ # client is not running code!
402
437
asyncio .create_task (put_kernel_messages_into_queue (send_queue ))
403
438
404
439
asyncio .create_task (self .play_audiosegments ())
0 commit comments