This repository was archived by the owner on Oct 6, 2025. It is now read-only.

Pipeline seems to randomly hang after loading the wav2text model #31

@ethereal-engineer

Description

I'd like to gather more debugging information on this, but when running the full pipeline locally, the last line of output is the INFO message below:

DEBUG:rhasspy3.core:Loading config from /home/doc/rhasspy3/rhasspy3/configuration.yaml
DEBUG:rhasspy3.core:Loading config from /home/doc/rhasspy3/config/configuration.yaml
DEBUG:rhasspy3.program:mic_adapter_raw.py ['--samples-per-chunk', '1024', '--rate', '16000', '--width', '2', '--channels', '1', 'arecord -q -D "default" -r 16000 -c 1 -f S16_LE -t raw -']
DEBUG:rhasspy3.program:asr_adapter_wav2text.py ['script/wav2text --language en "/home/doc/rhasspy3/config/data/asr/faster-whisper/tiny-int8" "{wav_file}"']
DEBUG:rhasspy3.program:.venv/bin/python3 ['bin/porcupine_stream.py', '--model', 'grasshopper_linux.ppn']
DEBUG:rhasspy3.wake:detect: processing audio
DEBUG:rhasspy3.wake:detect: Detection(name='grasshopper_linux', timestamp=17743906487598)
DEBUG:rhasspy3.program:vad_adapter_raw.py ['--rate', '16000', '--width', '2', '--channels', '1', '--samples-per-chunk', '512', 'script/speech_prob "share/silero_vad.onnx"']
DEBUG:rhasspy3.vad:segment: processing audio
DEBUG:rhasspy3.vad:segment: speaking started
DEBUG:rhasspy3.vad:segment: speaking ended
INFO:faster_whisper_wav2text:Model loaded
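
To narrow this down, one thing worth checking is whether faster-whisper can load the same model and transcribe a known recording outside the pipeline. A minimal sketch, assuming faster-whisper is importable from the program's virtualenv and using the model directory from the DEBUG line above; test.wav is just a placeholder for any short 16 kHz mono recording:

from faster_whisper import WhisperModel

# Same model directory the pipeline loads, per the log above.
# compute_type="int8" is an assumption based on the tiny-int8 model name.
model = WhisperModel(
    "/home/doc/rhasspy3/config/data/asr/faster-whisper/tiny-int8",
    device="cpu",
    compute_type="int8",
)

# test.wav is a placeholder: any short 16 kHz mono WAV with speech.
segments, info = model.transcribe("test.wav", language="en")
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")

If that prints a transcript promptly, the model itself is fine and the hang is somewhere in the pipeline plumbing.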

This is my configuration.yaml, largely copied from the default, with modifications for testing and voice selection:

programs:

  # -----------
  # Audio input
  # -----------
  mic:

    # apt-get install alsa-utils
    arecord:
      command: |
        arecord -q -D "${device}" -r 16000 -c 1 -f S16_LE -t raw -
      adapter: |
        mic_adapter_raw.py --samples-per-chunk 1024 --rate 16000 --width 2 --channels 1
      template_args:
        device: "default"

    # https://people.csail.mit.edu/hubert/pyaudio/docs/
    pyaudio:
      command: |
        script/events

    # https://python-sounddevice.readthedocs.io
    sounddevice:
      command: |
        script/events

    # apt-get install gstreamer1.0-tools gstreamer1.0-plugins-base
    gstreamer_udp:
      command: |
        gst-launch-1.0 -v udpsrc address=${address} port=${port} ! rawaudioparse use-sink-caps=false format=pcm pcm-format=${format} sample-rate=${rate} num-channels=${channels} ! audioconvert ! audioresample ! volume volume=3.0 ! level ! fdsink fd=1 sync=false
      template_args:
        format: s16le
        rate: 16000
        channels: 1
        address: "0.0.0.0"
        port: 5000
      adapter: |
        mic_adapter_raw.py --samples-per-chunk 1024 --rate 16000 --width 2 --channels 1

    udp_raw:
      command: |
        bin/udp_raw.py --host ${host} --port ${port}
      template_args:
        host: 0.0.0.0
        port: 5000

  # -------------------
  # Wake word detection
  # -------------------
  wake:

    # https://github.com/Picovoice/porcupine
    # Models: see script/list_models
    porcupine1:
      command: |
        .venv/bin/python3 bin/porcupine_stream.py --model "${model}"
      template_args:
        model: "grasshopper_linux.ppn"

    # https://github.com/Kitt-AI/snowboy
    # Models included in share/
    # Custom wake word: https://github.com/rhasspy/snowboy-seasalt
    snowboy:
      command: |
        .venv/bin/python3 bin/snowboy_raw_text.py --model "${model}"
      adapter: |
        wake_adapter_raw.py
      template_args:
        model: "share/snowboy.umdl"

    # https://github.com/mycroftAI/mycroft-precise
    # Model included in share/
    precise-lite:
      command: |
        .venv/bin/python3 bin/precise.py "${model}"
      adapter: |
        wake_adapter_raw.py
      template_args:
        model: "share/hey_mycroft.tflite"

    # TODO: snowman
    # https://github.com/Thalhammer/snowman/

  # ------------------------
  # Voice activity detection
  # ------------------------
  vad:

    # https://github.com/snakers4/silero-vad
    # Model included in share/
    silero:
      command: |
        script/speech_prob "${model}"
      adapter: |
        vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 512
      template_args:
        model: "share/silero_vad.onnx"

    # https://pypi.org/project/webrtcvad/
    webrtcvad:
      command: |
        script/speech_prob ${sensitivity}
      adapter: |
        vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 480
      template_args:
        sensitivity: 3

    # Uses rms energy threshold.
    # For testing only.
    energy:
      command: |
        bin/energy_speech_prob.py --threshold ${threshold} --width 2 --samples-per-chunk 1024
      adapter: |
        vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 1024
      template_args:
        threshold: 300

  # --------------
  # Speech to text
  # --------------
  asr:

    # https://alphacephei.com/vosk/
    # Models: https://alphacephei.com/vosk/models
    vosk:
      command: |
        script/raw2text "${model}"
      adapter: |
        asr_adapter_raw2text.py --rate 16000 --width 2 --channels 1
      template_args:
        model: "${data_dir}/vosk-model-small-en-us-0.15"

    # Run server: asr vosk
    vosk.client:
      command: |
        client_unix_socket.py var/run/vosk.socket

    # https://stt.readthedocs.io
    # Models: https://coqui.ai/models/
    coqui-stt:
      command: |
        script/raw2text "${model}"
      adapter: |
        asr_adapter_raw2text.py --rate 16000 --width 2 --channels 1
      template_args:
        model: "${data_dir}/english_v1.0.0-large-vocab"

    # Run server: asr coqui-stt
    coqui-stt.client:
      command: |
        client_unix_socket.py var/run/coqui-stt.socket

    # https://github.com/cmusphinx/pocketsphinx
    # Models: https://github.com/synesthesiam/voice2json-profiles
    pocketsphinx:
      command: |
        script/raw2text "${model}"
      adapter: |
        asr_adapter_raw2text.py --rate 16000 --width 2 --channels 1
      template_args:
        model: "${data_dir}/en-us_pocketsphinx-cmu"

    # Run server: asr pocketsphinx
    pocketsphinx.client:
      command: |
        client_unix_socket.py var/run/pocketsphinx.socket

    # https://github.com/openai/whisper
    # Models: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large
    # Languages: af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,
    # eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,
    # ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,
    # pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,
    # ur,uz,vi,yi,yo,zh
    whisper:
      command: |
        script/wav2text --language ${language} --model-directory "${data_dir}" "${model}" "{wav_file}"
      adapter: |
        asr_adapter_wav2text.py
      template_args:
        language: "en"
        model: "tiny.en"

    # Run server: asr whisper
    whisper.client:
      command: |
        client_unix_socket.py var/run/whisper.socket

    # https://github.com/ggerganov/whisper.cpp/
    # Models: https://huggingface.co/datasets/ggerganov/whisper.cpp
    whisper-cpp:
      command: |
        script/wav2text "${model}" "{wav_file}"
      adapter: |
        asr_adapter_wav2text.py
      template_args:
        model: "${data_dir}/ggml-tiny.en.bin"

    # Run server: asr whisper-cpp
    whisper-cpp.client:
      command: |
        client_unix_socket.py var/run/whisper-cpp.socket

    # https://github.com/guillaumekln/faster-whisper/
    # Models: https://github.com/rhasspy/models/releases/tag/v1.0
    # (asr_faster-whisper-*)
    faster-whisper:
      command: |
        script/wav2text --language ${language} "${model}" "{wav_file}"
      adapter: |
        asr_adapter_wav2text.py
      template_args:
        model: "${data_dir}/tiny-int8"
        language: "en"

    # Run server: asr faster-whisper
    faster-whisper.client:
      command: |
        client_unix_socket.py var/run/faster-whisper.socket


  # --------------
  # Text to speech
  # --------------
  tts:

    # https://github.com/rhasspy/piper/
    # Models: https://github.com/piper/piper/releases/tag/v0.0.2
    piper:
      command: |
        bin/piper --model "${model}" --output_file -
      adapter: |
        tts_adapter_text2wav.py
      template_args:
        model: "${data_dir}/en-us-libritts-high.onnx"
      install:
        command: |
          script/setup.py --destination '${program_dir}/bin'
        check_file: "${program_dir}/bin/piper"
        download:
          command: |
            script/download.py --destination '${data_dir}' '${model}'
        downloads:
          en-us_libritts:
            description: "U.S. English voice"
            check_file: "${data_dir}/en-us-libritts-high.onnx"


    # Run server: tts piper
    piper.client:
      command: |
        client_unix_socket.py var/run/piper.socket

    # https://github.com/rhasspy/larynx/
    # Models: https://rhasspy.github.io/larynx/
    larynx:
      command: |
        .venv/bin/larynx --voices-dir "${data_dir}" --voice "${voice}"
      adapter: |
        tts_adapter_text2wav.py
      template_args:
        voice: "en-us"

    # Run server: tts larynx
    larynx.client:
      command: |
        bin/larynx_client.py ${url} ${voice}
      template_args:
        url: "http://localhost:5002/process"
        voice: "en-us"
      adapter: |
        tts_adapter_text2wav.py

    # https://github.com/espeak-ng/espeak-ng/
    # apt-get install espeak-ng
    espeak-ng:
      command: |
        espeak-ng -v "${voice}" --stdin -w "{temp_file}"
      adapter: |
        tts_adapter_text2wav.py --temp_file
      template_args:
        voice: "en-us"

    # http://www.festvox.org/flite/
    # Models: https://github.com/rhasspy/models/releases/tag/v1.0
    # (tts_flite-*)
    flite:
      command: |
        flite -voice "${voice}" -o "{temp_file}"
      template_args:
        voice: "cmu_us_slt"
      adapter: |
        tts_adapter_text2wav.py --temp_file

    # http://www.cstr.ed.ac.uk/projects/festival/
    # apt-get install festival festival-<lang>
    festival:
      command: |
        text2wave -o "{temp_file}" -eval "(voice_${voice})"
      template_args:
        voice: "cmu_us_slt_arctic_hts"
      adapter: |
        tts_adapter_text2wav.py --temp_file

    # https://tts.readthedocs.io
    # Models: see script/list_models
    coqui-tts:
      command: |
        .venv/bin/tts --model_name "${model}" --out_path "{temp_file}" --text "{text}"
      adapter: |
        tts_adapter_text2wav.py --temp_file --text
      template_args:
        model: "tts_models/en/ljspeech/vits"
        speaker_id: ""

    coqui-tts.client:
      command: |
        tts_adapter_http.py "${url}" --param speaker_id "${speaker_id}"
      template_args:
        url: "http://localhost:5002/api/tts"
        speaker_id: ""

    # http://mary.dfki.de/
    # Models: https://github.com/synesthesiam/opentts/releases/tag/v2.1
    # (marytts-voices.tar.gz)
    marytts:
      command: |
        bin/marytts.py "${url}" "${voice}"
      template_args:
        url: "http://localhost:59125/process"
        voice: "cmu-slt-hsmm"
      adapter: |
        tts_adapter_text2wav.py

    # https://github.com/mycroftAI/mimic3
    # Models: https://mycroftai.github.io/mimic3-voices/
    mimic3:
      command: |
        .venv/bin/mimic3 --voices-dir "${data_dir}" --voice "${voice}" --stdout
      adapter: |
        tts_adapter_text2wav.py
      template_args:
        voice: "apope"

    # Run server: tts mimic3
    mimic3.client:
      command: |
        client_unix_socket.py var/run/mimic3.socket

  # ------------------
  # Intent recognition
  # ------------------
  intent:

    # Simple regex matching
    regex:
      command: |
        bin/regex.py -i TurnOn "turn on (the )?(?P<name>.+)"

    # TODO: fsticuffs
    # https://github.com/rhasspy/rhasspy-nlu

    # TODO: hassil
    # https://github.com/home-assistant/hassil

    # TODO: rapidfuzz
    # https://github.com/rhasspy/rhasspy-fuzzywuzzy

    # TODO: snips-nlu
    # https://snips-nlu.readthedocs.io

  # ---------------
  # Intent Handling
  # ---------------
  handle:

    # Text only: repeats transcript back
    repeat:
      command: |
        cat
      shell: true
      adapter: |
        handle_adapter_text.py

    # Text only: send to HA Assist
    # https://www.home-assistant.io/docs/assist
    #
    # 1. Change server url
    # 2. Put long-lived access token in etc/token
    home_assistant:
      command: |
        bin/converse.py --language "${language}" "${url}" "${token_file}"
      adapter: |
        handle_adapter_text.py
      template_args:
        url: "http://10.0.0.153:8123/api/conversation/process"
        token_file: "${data_dir}/token"
        language: "en"

    # Intent only: answer English date/time requests
    date_time:
      command: |
        bin/date_time.py
      adapter: |
        handle_adapter_text.py

    # Intent only: produces canned response to regex intent system
    test:
      command: |
        name="$(jq -r .slots.name)"
        echo "Turned on ${name}."
      shell: true
      adapter: |
        handle_adapter_json.py

  # ------------
  # Audio output
  # ------------
  snd:

    # apt-get install alsa-utils
    aplay:
      command: |
        aplay -q -D "${device}" -r 22050 -f S16_LE -c 1 -t raw
      adapter: |
        snd_adapter_raw.py --rate 22050 --width 2 --channels 1
      template_args:
        device: "default"

    udp_raw:
      command: |
        bin/udp_raw.py --host "${host}" --port ${port}
      template_args:
        host: "127.0.0.1"
        port: 5001

  # -------------------
  # Remote base station
  # -------------------
  remote:

    # Sample tool to communicate with websocket API.
    # Use rhasspy3/bin/satellite_run.py
    websocket:
      command: |
        script/run "${uri}"
      template_args:
        uri: "ws://localhost:13331/pipeline/asr-tts"

# -----------------------------------------------------------------------------

servers:
  asr:
    vosk:
      command: |
        script/server "${model}"
      template_args:
        model: "${data_dir}/vosk-model-small-en-us-0.15"

    coqui-stt:
      command: |
        script/server "${model}"
      template_args:
        model: "${data_dir}/english_v1.0.0-large-vocab"

    pocketsphinx:
      command: |
        script/server "${model}"
      template_args:
        model: "${data_dir}/en-us_pocketsphinx-cmu"

    whisper:
      command: |
        script/server --model-directory "${data_dir}" --language ${language} --device ${device} ${model}
      template_args:
        language: "en"
        model: "tiny.en"
        device: "cpu"  # or cuda

    whisper-cpp:
      command: |
        script/server "${model}"
      template_args:
        model: "${data_dir}/ggml-tiny.en.bin"

    faster-whisper:
      command: |
        script/server --language ${language} --device ${device} "${model}"
      template_args:
        language: "en"
        model: "${data_dir}/tiny-int8"
        device: "cpu"  # or cuda

  tts:
    mimic3:
      command: |
        script/server --voice "${voice}" "${data_dir}"
      template_args:
        voice: "en_US/ljspeech_low"

    piper:
      command: |
        script/server "${model}"
      template_args:
        model: "${data_dir}/en-us-libritts-high.onnx"

    larynx:
      command: |
        script/server --voices-dir "${data_dir}" --host "${host}"
      template_args:
        host: "127.0.0.1"

    coqui-tts:
      command: |
        script/server


# -----------------------------------------------------------------------------

# Example satellites
# satellites:
#   default:
#     mic:
#       name: arecord
#     wake:
#       name: porcupine1
#     remote:
#       name: websocket
#     snd:
#       name: aplay

# -----------------------------------------------------------------------------

# Example pipelines
pipelines:

  # English (default)
  default:
    mic:
      name: arecord
    wake:
      name: porcupine1
    vad:
      name: silero
    asr:
      name: faster-whisper
    # intent:
    #   name: regex
    handle:
      name: repeat
    #  name: home_assistant
    tts:
      name: piper
    snd:
      name: aplay

  # # German
  # de:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: de
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/de-thorsten-low.onnx"

  # # French
  # fr:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: fr
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/fr-siwis-low.onnx"

  # # Spanish
  # es:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: es
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/es-carlfm-low.onnx"

  # # Italian
  # it:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: it
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/it-riccardo_fasol-low.onnx"

  # # Catalan
  # ca:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: ca
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/ca-upc_ona-low.onnx"

  # # Danish
  # da:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: da
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/da-nst_talesyntese-medium.onnx"

  # # Dutch
  # nl:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: nl
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/nl-nathalie-low.onnx"

  # # Norwegian
  # no:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: no
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/no-talesyntese-medium.onnx"

  # # Ukrainian
  # uk:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: uk
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/uk-lada-low.onnx"

  # # Vietnamese
  # vi:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: vi
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/vi-vivos-low.onnx"

  # # Chinese
  # zh:
  #   inherit: default
  #   asr:
  #     name: faster-whisper
  #     template_args:
  #       language: zh
  #   tts:
  #     name: piper
  #     template_args:
  #       model: "${data_dir}/zh-cn-huayan-low.onnx"
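
If the model transcribes fine on its own, the next step would be checking whether script/wav2text hangs when invoked exactly as the pipeline invokes it (see the asr_adapter_wav2text.py line in the log), or only when driven through the adapter. A rough isolation sketch, with the caveat that the program directory path is my assumption about where the faster-whisper program is installed, and test.wav is again a placeholder:

import subprocess

# The same command line the pipeline launches (see the
# asr_adapter_wav2text.py entry in the log), with {wav_file} replaced
# by a real recording. test.wav is a placeholder.
cmd = (
    'script/wav2text --language en '
    '"/home/doc/rhasspy3/config/data/asr/faster-whisper/tiny-int8" '
    'test.wav'
)

try:
    result = subprocess.run(
        cmd,
        shell=True,
        # Assumed install location of the faster-whisper program; adjust
        # to wherever script/wav2text actually lives on this machine.
        cwd="/home/doc/rhasspy3/config/programs/asr/faster-whisper",
        capture_output=True,
        text=True,
        timeout=120,
    )
    print("stdout:", result.stdout)
    print("stderr:", result.stderr)
except subprocess.TimeoutExpired:
    print("script/wav2text hung on its own; the problem is likely in the asr program, not the pipeline")

A timeout here would point at the asr program itself; a quick result would suggest the hang is in how the pipeline consumes its output.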
