Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions VoiceCommand/speech-rec-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/bin/bash

# Print the usage summary for this script to stdout.
# The script name shown in the first line is $0 with any leading
# directory components stripped.
show_help() {
  cat << EOF
Usage: ${0##*/} [-h] [-i INFILE] [-d DURATION] [-r RATE] [-l LANGUAGE] [-k KEY]

Record an utterance and send audio data to Google for speech recognition.

-h|--help display this help and exit.
-i|--input INFILE use INFILE instead of recording a stream with sox or parecord.
-d|--duration FLOAT recording duration in seconds (Default: 3).
-l|--language STRING set transcription language (Default: en_US).
Other languages: fr_FR, de_DE, es_ES, ...
-r|--rate INTEGER Sampling rate of audio data (Default: 16000, if data is to be recorded).
If -i|--input is used, the sampling rate must be supplied by the user.
-k|--key STRING Google Speech Recognition Key.

EOF
}

# Default recording length in seconds; overridden by -d|--duration.
DURATION=3
# Default transcription language; overridden by -l|--language.
LANGUAGE=en_US
# Default API key, sent as the "key" query parameter of the recognition
# request; overridden by -k|--key.
# Please replace this with your own key
KEY=AIzaSyALX8AqZZBN-TjQlu8WKAkkNw9Go5NxfQs


# Record a single-channel utterance into a file.
# Arguments:
#   $1 - recording duration in seconds
#   $2 - sampling rate
#   $3 - path of the audio file to write
# Uses sox's `rec` front end when it is available; otherwise falls back to
# `parecord`, which is stopped by `timeout` after the requested duration.
record() {
  # Keep the arguments local so the caller's globals are not overwritten.
  local duration=$1
  local srate=$2
  local outfile=$3

  if command -v rec > /dev/null 2>&1; then
    # "trim 0 <duration>" is a sox effect, so it must be given to `rec`
    # (the command that was probed for), not to `arecord`.
    rec -q -c 1 -r "$srate" "$outfile" trim 0 "$duration"
  else
    # fallback to parecord
    timeout "$duration" parecord "$outfile" --file-format=flac --rate="$srate" --channels=1
  fi
}

# Parse the command line one option at a time. The option name is consumed
# first, so inside each arm "$1" is that option's value; value-taking
# options consume it with a second shift.
while (( $# > 0 )); do
  key=$1
  shift
  case "$key" in
    -h | --help)
      show_help
      exit 0
      ;;
    -i | --input)
      INFILE=$1
      shift
      ;;
    -d | --duration)
      DURATION=$1
      shift
      ;;
    -r | --rate)
      SRATE=$1
      shift
      ;;
    -l | --language)
      LANGUAGE=$1
      shift
      ;;
    -k | --key)
      KEY=$1
      shift
      ;;
    *)
      echo "Unknown parameter '$key'. Type $0 -h for more information."
      exit 1
      ;;
  esac
done

# Reject empty option values before anything is recorded or uploaded.
# Diagnostics are written to stderr, matching the script's other error
# messages, so they do not mix with the recognition output on stdout.
if [[ -z "$DURATION" ]]; then
  echo "ERROR: empty or invalid value for duration." >&2
  exit 1
fi

if [[ -z "$LANGUAGE" ]]; then
  echo "ERROR: empty value for language." >&2
  exit 1
fi

# Decide where the audio comes from: record a fresh utterance when no input
# file was given, otherwise use the supplied file as-is.
if [[ -z "$INFILE" ]]; then
  # Record into a timestamped file in the current directory.
  INFILE="record_$(date "+%Y%b%d_%H-%M-%S").flac"
  # 16000 is the default sampling rate for recorded data.
  SRATE=${SRATE:-16000}
  echo "Say something..."
  echo ""
  # Quoted so that each value stays a single argument.
  record "$DURATION" "$SRATE" "$INFILE"

else
  # The rate of an existing file cannot be guessed; the user must supply it.
  if [[ -z "$SRATE" ]]; then
    >&2 echo "ERROR: no sampling rate specified for input file."
    exit 1
  fi

  # Fail early with a clear message instead of posting a missing file and
  # reporting that no speech was recognized.
  if [[ ! -f "$INFILE" || ! -r "$INFILE" ]]; then
    >&2 echo "ERROR: cannot read input file '$INFILE'."
    exit 1
  fi

  echo "Try to recognize speech from file $INFILE"
  echo ""
fi

# POST the audio file to the recognition endpoint and capture the response
# body. "$INFILE" is quoted so paths containing spaces are passed intact.
RESULT=$(wget -q --post-file "$INFILE" \
  --header="Content-Type: audio/x-flac; rate=$SRATE" -O - \
  "https://www.google.com/speech-api/v2/recognize?client=chromium&lang=$LANGUAGE&key=$KEY")

# Keep only response lines that contain a transcript, put each
# comma-separated field on its own line, strip braces, quotes and brackets,
# and print just the transcript and confidence fields.
FILTERED=$(printf '%s\n' "$RESULT" \
  | grep "transcript.*}" \
  | sed 's/,/\n/g;s/[{,},"]//g;s/\[//g;s/\]//g;s/:/: /g' \
  | grep -o -i -e "transcript.*" -e "confidence:.*")

if [[ -z "$FILTERED" ]]; then
  >&2 echo "Google was unable to recognize any speech in audio data"
else
  echo "Recognition result:"
  echo ""
  echo "$FILTERED"
fi

exit 0
2 changes: 1 addition & 1 deletion VoiceCommand/speech-recog.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ done
#sox -r 16000 -t alsa $hardware /dev/shm/out.flac silence 1 0.3 1% 1 0.5 1%
#wget -q -U "rate=16000" -O - --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" "http://www.google.com/speech-api/v1/recognize?lang=en&client=Mozilla/5.0" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }'
#arecord -D $hardware -f cd -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; wget -O - -o /dev/null --post-file /dev/shm/out.flac --header="Content-Type: audio/x-flac; rate=16000" http://www.google.com/speech-api/v1/recognize?lang="$lang" | sed -e 's/[{}]/''/g'| awk -v k="text" '{n=split($0,a,","); for (i=1; i<=n; i++) print a[i]; exit }' | awk -F: 'NR==3 { print $3; exit }'
arecord -D $hardware -f cd -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'Mozilla/5.0' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&client=Mozilla/5.0" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n'
arecord -D $hardware -t wav -d $duration -r 16000 | flac - -f --best --sample-rate 16000 -o /dev/shm/out.flac 1>/dev/shm/voice.log 2>/dev/shm/voice.log; curl -X POST --data-binary @/dev/shm/out.flac --user-agent 'chromium' --header 'Content-Type: audio/x-flac; rate=16000;' "https://www.google.com/speech-api/v2/recognize?output=json&lang=$lang&key=AIzaSyALX8AqZZBN-TjQlu8WKAkkNw9Go5NxfQs&client=chromium" | sed -e 's/[{}]/''/g' | awk -F":" '{print $4}' | awk -F"," '{print $1}' | tr -d '\n'

rm /dev/shm/out.flac
65 changes: 27 additions & 38 deletions VoiceCommand/tts
Original file line number Diff line number Diff line change
@@ -1,42 +1,31 @@
#!/bin/bash
#since google ended TTS, this wrapper-script replaces tts with pico2wave.
#version 0.2 -now rudimentarily handles language -l param.

#for the Raspberry Pi, we need to insert some sort of FILLER here since it cuts off the first bit of audio

string=$@
lang="en"
if [ "$1" == "-l" ] ; then
lang="$2"
string=`echo "$string" | sed -r 's/^.{6}//'`
if [ $# -lt 1 ]
then #no argument entered - i need something to say
/usr/bin/pico2wave -w /tmp/tempsound.wav "I have nothing to say."
/usr/bin/aplay -q /tmp/tempsound.wav
rm /tmp/tempsound.wav
exit 0
fi

#empty the original file
echo "" > "/dev/shm/speak.mp3"

len=${#string}
while [ $len -ge 100 ] ;
do
#lets split this up so that its a maximum of 99 characters
tmp=${string:0:100}
string=${string:100}

#now we need to make sure there aren't split words, let's find the last space and the string after it
lastspace=${tmp##* }
tmplen=${#lastspace}

#here we are shortening the tmp string
tmplen=`expr 100 - $tmplen`
tmp=${tmp:0:tmplen}

#now we concatenate and the string is reconstructed
string="$lastspace$string"
len=${#string}

#get the first 100 characters
wget -q -U Mozilla -O "/dev/shm/tmp.mp3" "https://translate.google.com/translate_tts?tl=${lang}&q=$tmp&ie=UTF-8&total=1&idx=0&client=t"
cat "/dev/shm/tmp.mp3" >> "/dev/shm/speak.mp3"
done
#this will get the last remnants
wget -q -U Mozilla -O "/dev/shm/tmp.mp3" "https://translate.google.com/translate_tts?tl=${lang}&q=$string&ie=UTF-8&total=1&idx=0&client=t"
cat "/dev/shm/tmp.mp3" >> "/dev/shm/speak.mp3"
#now we finally say the whole thing
cat "/dev/shm/speak.mp3" | mpg123 - 1>>/dev/shm/voice.log 2>>/dev/shm/voice.log
if [ "$1" = "-l" ] #-l in event where user explicitly defines language.
then # Note: always assumes $2 is 'en' or a valid language option.
lang=$2
if [ $lang = "en" ] #TODO: cant find the real source of en, but if
then # i see 'en' I'm hard coding en-US.
lang="en-US" #US English, mofo, do you speak it
fi
shift 2
speech=$@
/usr/bin/pico2wave -l $lang -w /tmp/tempsound.wav "$speech"
/usr/bin/aplay -q /tmp/tempsound.wav
rm /tmp/tempsound.wav
exit 0
else #else lets go straight to speech-output
speech=$@
/usr/bin/pico2wave -w /tmp/tempsound.wav "$speech"
/usr/bin/aplay -q /tmp/tempsound.wav
rm /tmp/tempsound.wav
fi
2 changes: 1 addition & 1 deletion VoiceCommand/voicecommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ inline float GetVolume(string recordHW, string com_duration, bool nullout) {
float vol = 0.0f;
string run = "arecord -D ";
run += recordHW;
run += " -f cd -t wav -d ";
run += " -t wav -d ";
run += com_duration;
run += " -r 16000 /dev/shm/noise.wav";
if(nullout)
Expand Down