Merge pull request #48 from NavodPeiris/dev

fixed empty transcription outputs when speaker is not speaking
NavodPeiris · Aug 25, 2024 · 66dd0f6 · 66dd0f6
2 parents 1b09984 + 5c3c29a
commit 66dd0f6
Show file tree

Hide file tree

Showing 7 changed files with 22 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -86,15 +86,13 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
 ### Transcription example:
 
 ```
-from speechlib import Transcriptor
-
 file = "obama_zach.wav"  # your audio file
-voices_folder = "voices" # voices folder containing voice samples for recognition
+voices_folder = "" # voices folder containing voice samples for recognition
 language = "en"          # language code
 log_folder = "logs"      # log folder for storing transcripts
 modelSize = "tiny"     # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
 quantization = False   # setting this 'True' may speed up the process but lower the accuracy
-ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
 
 # quantization only works on faster-whisper
 transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
@@ -111,6 +109,9 @@ res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
 # use a huggingface whisper model
 res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
 
+# use assembly ai model
+res = transcriptor.assemby_ai_model("your api key")
+
 res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
 ```
 

diff --git a/examples/.gitignore b/examples/.gitignore
@@ -7,3 +7,6 @@ __pycache__
 logs
 greek_convo_short.mp3
 greek_convo_short.wav
+my_test.py
+greek_convo.mp3
+greek_convo.wav
diff --git a/examples/transcribe.py b/examples/transcribe.py
@@ -1,8 +1,8 @@
 from speechlib import Transcriptor
 
-file = "greek_convo_short.mp3"  # your audio file
+file = "obama_zach.wav"  # your audio file
 voices_folder = "" # voices folder containing voice samples for recognition
-language = "el"          # language code
+language = "en"          # language code
 log_folder = "logs"      # log folder for storing transcripts
 modelSize = "tiny"     # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
 quantization = False   # setting this 'True' may speed up the process but lower the accuracy
@@ -12,16 +12,16 @@
 transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
 
 # use normal whisper
-#res = transcriptor.whisper()
+res = transcriptor.whisper()
 
 # use faster-whisper (simply faster)
-#res = transcriptor.faster_whisper()
+res = transcriptor.faster_whisper()
 
 # use a custom trained whisper model
-#res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
+res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
 
 # use a huggingface whisper model
-#res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
+res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
 
 # use assembly ai model
 res = transcriptor.assemby_ai_model("your api key")
diff --git a/library.md b/library.md
@@ -70,15 +70,13 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
 ### Transcription example:
 
 ```
-from speechlib import Transcriptor
-
 file = "obama_zach.wav"  # your audio file
-voices_folder = "voices" # voices folder containing voice samples for recognition
+voices_folder = "" # voices folder containing voice samples for recognition
 language = "en"          # language code
 log_folder = "logs"      # log folder for storing transcripts
 modelSize = "tiny"     # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
 quantization = False   # setting this 'True' may speed up the process but lower the accuracy
-ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
 
 # quantization only works on faster-whisper
 transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
@@ -95,7 +93,8 @@ res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
 # use a huggingface whisper model
 res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
 
-res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
+# use assembly ai model
+res = transcriptor.assemby_ai_model("your api key")
 ```
 
 #### if you don't want speaker names: keep voices_folder as an empty string ""

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="speechlib",
-    version="1.1.8",  
+    version="1.1.9",  
     description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. This library also contain audio preprocessor functions.",
     packages=find_packages(),
     long_description=long_description,

diff --git a/setup_instruction.md b/setup_instruction.md
@@ -9,7 +9,7 @@ for publishing:
     pip install twine
 
 for install locally for testing:
-    pip install dist/speechlib-1.1.8-py3-none-any.whl
+    pip install dist/speechlib-1.1.9-py3-none-any.whl
 
 finally run:
     twine upload dist/*

diff --git a/speechlib/write_log_file.py b/speechlib/write_log_file.py
@@ -24,7 +24,8 @@ def write_log_file(common_segments, log_folder, file_name, language):
         text = segment[2]
         speaker = segment[3]
 
-        entry += f"{speaker} ({start} : {end}) : {text}\n"
+        if text != "" and text != None:
+            entry += f"{speaker} ({start} : {end}) : {text}\n"
 
     lf.write(bytes(entry.encode('utf-8')))      
     lf.close()