Skip to content

Replace mimic3 with piper #686

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,6 @@ doku/*
**/workspace_status.json

.pytest_cache/

# tts model
*/bitbots_tts/model/*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest **/bitbots_tts/model/ to ignore the whole directory at any depth (in case we move it at some point).

61 changes: 41 additions & 20 deletions bitbots_misc/bitbots_tts/bitbots_tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#!/usr/bin/env python3

import os
import subprocess
import time
import io
import traceback
import wave
from pathlib import Path

import numpy as np
import rclpy
import requests
from ament_index_python import get_package_prefix
import sounddevice as sd
from piper import PiperVoice
from rcl_interfaces.msg import Parameter, SetParametersResult
from rclpy.callback_groups import MutuallyExclusiveCallbackGroup
from rclpy.executors import MultiThreadedExecutor
Expand All @@ -16,6 +17,12 @@

from bitbots_msgs.msg import Audio

# Load the Piper voice
bb_tts_dir = Path(__file__).parent.parent / "model" # TODO: check how to get nice relative paths
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depending on where you want to locate the model folder, you should probably use
get_package_prefix or get_package_share_path, which would give you e.g. ~/colcon_ws/install/share/bitbots_tts.

The current solution should also work, but I would then use an absolute path to ensure that it will work when building without symlinks.

model_path = bb_tts_dir / "en_US-lessac-medium.onnx"
config_path = bb_tts_dir / "en_US-lessac-medium.onnx.json"
voice = PiperVoice.load(model_path, config_path=config_path, use_cuda=False)


def speak(text: str, publisher: Publisher, priority: int = 20, speaking_active: bool = True) -> None:
"""Utility method which can be used by other classes to easily publish a message."""
Expand All @@ -27,10 +34,35 @@ def speak(text: str, publisher: Publisher, priority: int = 20, speaking_active:


def say(text: str) -> None:
"""Start the shell `say.sh` script to output given text with mimic3. Beware: this is blocking."""
script_path = os.path.join(get_package_prefix("bitbots_tts"), "lib/bitbots_tts/say.sh")
process = subprocess.Popen((script_path, text))
process.wait()
"""Use piper for speech synthesis and audio playback.
This is also used for speaking the IP address during startup."""
synthesize_args = {
"length_scale": 1.0, # Phoneme length, if lower -> faster
"noise_scale": 0.667, # Generator noise, if lower -> more robotic
"noise_w": 0.8, # Phoneme width noise, if lower -> more robotic
"sentence_silence": 0.1, # seconds of silence after each sentence
}
with io.BytesIO() as buffer:
with wave.open(buffer, "wb") as wav_file:
voice.synthesize(text, wav_file, **synthesize_args)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would mean that we will generate a new wav file each time?
mimic was implicitly caching the already generated voices, which was also the reason for the web server if I remember correctly.

Have you tried how long the wav generation takes?

Copy link
Contributor Author

@val-ba val-ba May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It runs in real time on my laptop, but I have not tried it on the robot yet. Maybe we should cache them again though... just to be sure.


buffer.seek(0)
with wave.open(buffer, "rb") as wav:
framerate = wav.getframerate()
sampwidth = wav.getsampwidth()
nchannels = wav.getnchannels()
nframes = wav.getnframes()
audio_bytes = wav.readframes(nframes)

# bytes to np array
dtype_map = {1: np.int8, 2: np.int16, 4: np.int32}
if sampwidth not in dtype_map:
raise ValueError(f"Unsupported sample width: {sampwidth}")
audio = np.frombuffer(audio_bytes, dtype=dtype_map[sampwidth])
if nchannels > 1:
audio = audio.reshape(-1, nchannels)

sd.play(audio, samplerate=framerate, blocking=True)


class Speaker(Node):
Expand Down Expand Up @@ -62,17 +94,6 @@ def __init__(self) -> None:
# Subscribe to the speak topic
self.create_subscription(Audio, "speak", self.speak_cb, 10, callback_group=MutuallyExclusiveCallbackGroup())

# Wait for the mimic server to start
while True:
try:
requests.get("http://localhost:59125")
break
except requests.exceptions.ConnectionError:
# log once per second that the server is not yet available
self.get_logger().info("Waiting for mimic server to start...", throttle_duration_sec=2.0)
time.sleep(0.5)
pass

# Start processing the queue
self.create_timer(0.1, self.run_speaker, callback_group=MutuallyExclusiveCallbackGroup())

Expand Down
1 change: 1 addition & 0 deletions bitbots_misc/bitbots_tts/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
("share/ament_index/resource_index/packages", ["resource/" + package_name]),
("share/" + package_name + "/config", glob.glob("config/*.yaml")),
("share/" + package_name + "/launch", glob.glob("launch/*.launch")),
("share/" + package_name + "/model", glob.glob("model/*")),
],
install_requires=[
"setuptools",
Expand Down
Loading