From 37992bd8a2974b2d19ed6ba9a989a2bf94839dfa Mon Sep 17 00:00:00 2001 From: Madhusudanan Kandasamy Date: Fri, 8 Jan 2016 09:58:42 +0530 Subject: [PATCH 1/7] google speech API V2 needs audio file in mono format --- VoiceCommand/speech-recog.sh | 2 +- VoiceCommand/voicecommand.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VoiceCommand/speech-recog.sh b/VoiceCommand/speech-recog.sh index 09f45f3..95c064b 100755 --- a/VoiceCommand/speech-recog.sh +++ b/VoiceCommand/speech-recog.sh @@ -32,6 +32,6 @@ done #sox -r 16000 -t alsa $hardware /dev/shm/out.flac silence 1 0.3 1% 1 0.5 1% #wget -q -U "rate=16000" -O - --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" "http://www.google.com/speech-api/v1/recognize?lang=en&client=Mozilla/5.0" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }' #arecord -D $hardware -f cd -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; wget -O - -o /dev/null --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" http://www.google.com/speech-api/v1/recognize?lang="$lang" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }' -arecord -D $hardware -f cd -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'Mozilla/5.0' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&client=Mozilla/5.0" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n' +arecord -D $hardware -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'Mozilla/5.0' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&client=Mozilla/5.0" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n' rm /dev/shm/out.flac diff --git a/VoiceCommand/voicecommand.cpp b/VoiceCommand/voicecommand.cpp index 442abd1..5b033fd 100644 --- a/VoiceCommand/voicecommand.cpp +++ b/VoiceCommand/voicecommand.cpp @@ -31,7 +31,7 @@ inline float GetVolume(string recordHW, string com_duration, bool nullout) { float vol = 0.0f; string run = "arecord -D "; run += recordHW; - run += " -f cd -t wav -d "; + run += " -t wav -d "; run += com_duration; run += " -r 16000 /dev/shm/noise.wav"; if(nullout) From 18d48d539b5ceb5caade2e9bb74589ea92e53601 Mon Sep 17 00:00:00 2001 From: Jeramy Morrill Date: Wed, 27 Apr 2016 12:39:31 -0400 Subject: [PATCH 2/7] removed no longer functioning google tts and replaced with @irsx02 solution for pico2wave --- VoiceCommand/tts | 65 ++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/VoiceCommand/tts b/VoiceCommand/tts index 7bbcfe7..1f77ca7 100755 --- a/VoiceCommand/tts +++ b/VoiceCommand/tts @@ -1,42 +1,31 @@ #!/bin/bash +#since google ended TTS, this wrapper-script replaces tts with pico2wave. +#version 0.2 -now rudimentarily handles language -l param. -#for the Raspberry Pi, we need to insert some sort of FILLER here since it cuts off the first bit of audio - -string=$@ -lang="en" -if [ "$1" == "-l" ] ; then - lang="$2" - string=`echo "$string" | sed -r 's/^.{6}//'` +if [ $# -lt 1 ] +then #no argument entered - i need something to say + /usr/bin/pico2wave -w /tmp/tempsound.wav "I have nothing to say." + /usr/bin/aplay -q /tmp/tempsound.wav + rm /tmp/tempsound.wav + exit 0 fi -#empty the original file -echo "" > "/dev/shm/speak.mp3" - -len=${#string} -while [ $len -ge 100 ] ; -do - #lets split this up so that its a maximum of 99 characters - tmp=${string:0:100} - string=${string:100} - - #now we need to make sure there aren't split words, let's find the last space and the string after it - lastspace=${tmp##* } - tmplen=${#lastspace} - - #here we are shortening the tmp string - tmplen=`expr 100 - $tmplen` - tmp=${tmp:0:tmplen} - - #now we concatenate and the string is reconstructed - string="$lastspace$string" - len=${#string} - - #get the first 100 characters - wget -q -U Mozilla -O "/dev/shm/tmp.mp3" "https://translate.google.com/translate_tts?tl=${lang}&q=$tmp&ie=UTF-8&total=1&idx=0&client=t" - cat "/dev/shm/tmp.mp3" >> "/dev/shm/speak.mp3" -done -#this will get the last remnants -wget -q -U Mozilla -O "/dev/shm/tmp.mp3" "https://translate.google.com/translate_tts?tl=${lang}&q=$string&ie=UTF-8&total=1&idx=0&client=t" -cat "/dev/shm/tmp.mp3" >> "/dev/shm/speak.mp3" -#now we finally say the whole thing -cat "/dev/shm/speak.mp3" | mpg123 - 1>>/dev/shm/voice.log 2>>/dev/shm/voice.log +if [ "$1" = "-l" ] #-l in event where user explicitly defines language. +then # Note: always assumes $2 is 'en' or a valid language option. + lang=$2 + if [ $lang = "en" ] #TODO: cant find the real source of en, but if + then # i see 'en' I'm hard coding en-US. + lang="en-US" #US English, mofo, do you speak it + fi + shift 2 + speech=$@ + /usr/bin/pico2wave -l $lang -w /tmp/tempsound.wav "$speech" + /usr/bin/aplay -q /tmp/tempsound.wav + rm /tmp/tempsound.wav + exit 0 +else #else lets go straight to speech-output + speech=$@ + /usr/bin/pico2wave -w /tmp/tempsound.wav "$speech" + /usr/bin/aplay -q /tmp/tempsound.wav + rm /tmp/tempsound.wav +fi From f433c18b5b5ad5dddda971c46856779d44de3e9c Mon Sep 17 00:00:00 2001 From: Jeramy Morrill Date: Wed, 27 Apr 2016 17:05:21 -0400 Subject: [PATCH 3/7] fixing key --- VoiceCommand/speech-recog.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VoiceCommand/speech-recog.sh b/VoiceCommand/speech-recog.sh index 95c064b..201a0c4 100755 --- a/VoiceCommand/speech-recog.sh +++ b/VoiceCommand/speech-recog.sh @@ -32,6 +32,6 @@ done #sox -r 16000 -t alsa $hardware /dev/shm/out.flac silence 1 0.3 1% 1 0.5 1% #wget -q -U "rate=16000" -O - --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" "http://www.google.com/speech-api/v1/recognize?lang=en&client=Mozilla/5.0" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }' #arecord -D $hardware -f cd -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; wget -O - -o /dev/null --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" http://www.google.com/speech-api/v1/recognize?lang="$lang" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }' -arecord -D $hardware -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'Mozilla/5.0' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&client=Mozilla/5.0" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n' +arecord -D $hardware -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'Mozilla/5.0' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyALX8AqZZBN-TjQlu8WKAkkNw9Go5NxfQs&client=Mozilla/5.0" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n' rm /dev/shm/out.flac From ecf144af50ad3c8fc2ad38358e37d98ab38b0b71 Mon Sep 17 00:00:00 2001 From: Jeramy Morrill Date: Wed, 27 Apr 2016 17:08:48 -0400 Subject: [PATCH 4/7] fixing key --- VoiceCommand/speech-recog.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VoiceCommand/speech-recog.sh b/VoiceCommand/speech-recog.sh index 201a0c4..cc0f7a4 100755 --- a/VoiceCommand/speech-recog.sh +++ b/VoiceCommand/speech-recog.sh @@ -32,6 +32,6 @@ done #sox -r 16000 -t alsa $hardware /dev/shm/out.flac silence 1 0.3 1% 1 0.5 1% #wget -q -U "rate=16000" -O - --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" "http://www.google.com/speech-api/v1/recognize?lang=en&client=Mozilla/5.0" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }' #arecord -D $hardware -f cd -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; wget -O - -o /dev/null --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" http://www.google.com/speech-api/v1/recognize?lang="$lang" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }' -arecord -D $hardware -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'Mozilla/5.0' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyALX8AqZZBN-TjQlu8WKAkkNw9Go5NxfQs&client=Mozilla/5.0" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n' +arecord -D $hardware -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'chromium' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyALX8AqZZBN-TjQlu8WKAkkNw9Go5NxfQs&client=chromium" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n' rm /dev/shm/out.flac From 74d78a87690c9e5a86dd8c46ffaf708a75aed2a8 Mon Sep 17 00:00:00 2001 From: Jeramy Morrill Date: Wed, 27 Apr 2016 17:16:13 -0400 Subject: [PATCH 5/7] adding test --- VoiceCommand/speech-rec-test.sh | 126 ++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 VoiceCommand/speech-rec-test.sh diff --git a/VoiceCommand/speech-rec-test.sh b/VoiceCommand/speech-rec-test.sh new file mode 100644 index 0000000..607477c --- /dev/null +++ b/VoiceCommand/speech-rec-test.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Usage info +show_help() { +cat << EOF + Usage: ${0##*/} [-h] [-i INFILE] [-d DURATION] [-r RATE] [-l LANGUAGE] [-k KEY] + + Record an utterance and send audio data to Google for speech recognition. + + -h|--help display this help and exit. + -i|--input INFILE use INFILE instead of recording a stream with sox or parecord. + -d|--duration FLOAT recoding duration in seconds (Default: 3). + -l|--language STRING set transcription language (Default: en_US). + Other languages: fr_FR, de_DE, es_ES, ... + -r|--rate INTEGER Sampling rate of audio data (Default: 16000, if data is to be recorded). + If -i|--input is used, the sampling rate must be supplied by the user. + -k|--key STRING Google Speech Recognition Key. + +EOF +} + +DURATION=3 +LANGUAGE=en_US +# Please replace this with your own key +KEY=AIzaSyAcalCzUvPmmJ7CZBFOEWx2Z1ZSn4Vs1gg + + +record() { + DURATION=$1 + SRATE=$2 + INFILE=$3 + + if hash rec 2>/dev/null; then + # try to record audio with sox + rec -q -c 1 -r $SRATE $INFILE trim 0 $DURATION + else + # fallback to parecord + timeout $DURATION parecord $INFILE --file-format=flac --rate=$SRATE --channels=1 + fi +} + +# parse parameters +while [[ $# -ge 1 ]] +do + key="$1" + case $key in + -h|--help) + show_help + exit 0 + ;; + -i|--input) + INFILE="$2" + shift + ;; + -d|--duration) + DURATION="$2" + shift + ;; + -r|--rate) + SRATE=$2 + shift + ;; + -l|--language) + LANGUAGE="$2" + shift + ;; + -k|--key) + KEY="$2" + shift + ;; + *) + echo "Unknown parameter '$key'. Type $0 -h for more information." + exit 1 + ;; + esac + shift +done + +if [[ ! "$DURATION" ]] + then + echo "ERROR: empty or invalid value for duration." + exit 1 +fi + +if [[ ! "$LANGUAGE" ]] + then + echo "ERROR: empty value for language." + exit 1 +fi + +if [[ ! "$INFILE" ]] + then + INFILE=record_`date "+%Y%b%d_%H-%M-%S"`.flac + if [[ ! "$SRATE" ]] + then + SRATE=16000 + fi + echo "Say something..." + echo "" + record $DURATION $SRATE $INFILE + +else + if [[ ! "$SRATE" ]] + then + >&2 echo "ERROR: no sampling rate specified for input file." + exit 1 + fi + + echo "Try to recognize speech from file $INFILE" + echo "" +fi + +RESULT=`wget -q --post-file $INFILE --header="Content-Type: audio/x-flac; rate=$SRATE" -O - "https://www.google.com/speech-api/v2/recognize?client=chromium&lang=$LANGUAGE&key=$KEY"` + +FILTERED=`echo "$RESULT" | grep "transcript.*}" | sed 's/,/\n/g;s/[{,},"]//g;s/\[//g;s/\]//g;s/:/: /g' | grep -o -i -e "transcript.*" -e "confidence:.*"` + +if [[ ! "$FILTERED" ]] + then + >&2 echo "Google was unable to recognize any speech in audio data" +else + echo "Recognition result:" + echo "" + echo "$FILTERED" +fi + +exit 0 From 606d67158e7f2019154cad9f87ae58bf5571e6e1 Mon Sep 17 00:00:00 2001 From: Jeramy Morrill Date: Wed, 27 Apr 2016 17:16:40 -0400 Subject: [PATCH 6/7] adding test --- VoiceCommand/speech-rec-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VoiceCommand/speech-rec-test.sh b/VoiceCommand/speech-rec-test.sh index 607477c..eb0ab9a 100644 --- a/VoiceCommand/speech-rec-test.sh +++ b/VoiceCommand/speech-rec-test.sh @@ -22,7 +22,7 @@ EOF DURATION=3 LANGUAGE=en_US # Please replace this with your own key -KEY=AIzaSyAcalCzUvPmmJ7CZBFOEWx2Z1ZSn4Vs1gg +KEY=AIzaSyALX8AqZZBN-TjQlu8WKAkkNw9Go5NxfQs record() { From 7e2dc6094df50bd4f49714c6e31ffb4f5fc9421b Mon Sep 17 00:00:00 2001 From: Jeramy Morrill Date: Wed, 27 Apr 2016 17:18:32 -0400 Subject: [PATCH 7/7] adding test --- VoiceCommand/speech-rec-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VoiceCommand/speech-rec-test.sh b/VoiceCommand/speech-rec-test.sh index eb0ab9a..e61d581 100644 --- a/VoiceCommand/speech-rec-test.sh +++ b/VoiceCommand/speech-rec-test.sh @@ -32,7 +32,7 @@ record() { if hash rec 2>/dev/null; then # try to record audio with sox - rec -q -c 1 -r $SRATE $INFILE trim 0 $DURATION + arecord -q -c 1 -r $SRATE $INFILE trim 0 $DURATION else # fallback to parecord timeout $DURATION parecord $INFILE --file-format=flac --rate=$SRATE --channels=1