Skip to content

Commit

Permalink
Merge pull request #48 from NavodPeiris/dev
Browse files Browse the repository at this point in the history
fixed empty transcription outputs when speaker is not speaking
  • Loading branch information
NavodPeiris authored Aug 25, 2024
2 parents 1b09984 + 5c3c29a commit 66dd0f6
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 18 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,13 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
### Transcription example:

```
from speechlib import Transcriptor
file = "obama_zach.wav" # your audio file
voices_folder = "voices" # voices folder containing voice samples for recognition
voices_folder = "" # voices folder containing voice samples for recognition
language = "en" # language code
log_folder = "logs" # log folder for storing transcripts
modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
quantization = False # setting this 'True' may speed up the process but lower the accuracy
ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@3.1 on huggingface
ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@3.1 on huggingface
# quantization only works on faster-whisper
transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
Expand All @@ -111,6 +109,9 @@ res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
# use a huggingface whisper model
res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
# use assembly ai model
res = transcriptor.assemby_ai_model("your api key")
res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
```

Expand Down
3 changes: 3 additions & 0 deletions examples/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ __pycache__
logs
greek_convo_short.mp3
greek_convo_short.wav
my_test.py
greek_convo.mp3
greek_convo.wav
12 changes: 6 additions & 6 deletions examples/transcribe.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from speechlib import Transcriptor

file = "greek_convo_short.mp3" # your audio file
file = "obama_zach.wav" # your audio file
voices_folder = "" # voices folder containing voice samples for recognition
language = "el" # language code
language = "en" # language code
log_folder = "logs" # log folder for storing transcripts
modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
quantization = False # setting this 'True' may speed up the process but lower the accuracy
Expand All @@ -12,16 +12,16 @@
transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)

# use normal whisper
#res = transcriptor.whisper()
res = transcriptor.whisper()

# use faster-whisper (simply faster)
#res = transcriptor.faster_whisper()
res = transcriptor.faster_whisper()

# use a custom trained whisper model
#res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")

# use a huggingface whisper model
#res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")

# use assembly ai model
res = transcriptor.assemby_ai_model("your api key")
9 changes: 4 additions & 5 deletions library.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,13 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
### Transcription example:

```
from speechlib import Transcriptor
file = "obama_zach.wav" # your audio file
voices_folder = "voices" # voices folder containing voice samples for recognition
voices_folder = "" # voices folder containing voice samples for recognition
language = "en" # language code
log_folder = "logs" # log folder for storing transcripts
modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
quantization = False # setting this 'True' may speed up the process but lower the accuracy
ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@3.1 on huggingface
ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@3.1 on huggingface
# quantization only works on faster-whisper
transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
Expand All @@ -95,7 +93,8 @@ res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
# use a huggingface whisper model
res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
# use assembly ai model
res = transcriptor.assemby_ai_model("your api key")
```

#### if you don't want speaker names: keep voices_folder as an empty string ""
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="speechlib",
version="1.1.8",
version="1.1.9",
description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. This library also contain audio preprocessor functions.",
packages=find_packages(),
long_description=long_description,
Expand Down
2 changes: 1 addition & 1 deletion setup_instruction.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ for publishing:
pip install twine

for installing locally for testing:
pip install dist/speechlib-1.1.8-py3-none-any.whl
pip install dist/speechlib-1.1.9-py3-none-any.whl

finally run:
twine upload dist/*
Expand Down
3 changes: 2 additions & 1 deletion speechlib/write_log_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def write_log_file(common_segments, log_folder, file_name, language):
text = segment[2]
speaker = segment[3]

entry += f"{speaker} ({start} : {end}) : {text}\n"
if text != "" and text != None:
entry += f"{speaker} ({start} : {end}) : {text}\n"

lf.write(bytes(entry.encode('utf-8')))
lf.close()
Expand Down

0 comments on commit 66dd0f6

Please sign in to comment.