diff --git a/.gitignore b/.gitignore
index 5957730..b4d9460 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,12 @@ venv
 build
 dist
 speechlib.egg-info
-.env
\ No newline at end of file
+.env
+
+*.swp
+*.swo
+
+# By default, do not include these files in version control
+# Override this by using 'git add -f'
+*.wav
+*.mp3
diff --git a/README.md b/README.md
index d92fe32..7a97444 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,9 @@ Transcriptor method takes 7 arguments.
 
 4. model size ("tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3")
 
-5. ACCESS_TOKEN: huggingface acccess token (also get permission to access `pyannote/speaker-diarization@2.1`)
+5. ACCESS_TOKEN: huggingface access token
+   1. Permission to access `pyannote/speaker-diarization@2.1` and `pyannote/segmentation`
+   2. Token requires permission for 'Read access to contents of all public gated repos you can access'
 
 6. voices_folder (contains speaker voice samples for speaker recognition)
 
@@ -86,13 +88,16 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
 
 ### Transcription example:
 
 ```
+import os
+from speechlib import Transcriptor
+
 file = "obama_zach.wav" # your audio file
 voices_folder = "" # voices folder containing voice samples for recognition
 language = "en" # language code
 log_folder = "logs" # log folder for storing transcripts
 modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
 quantization = False # setting this 'True' may speed up the process but lower the accuracy
-ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+ACCESS_TOKEN = "huggingface api key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
 # quantization only works on faster-whisper
 transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
@@ -110,7 +115,7 @@ res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
 res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
 
 # use assembly ai model
-res = transcriptor.assemby_ai_model("your api key")
+res = transcriptor.assemby_ai_model("assemblyAI api key")
 
 res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
 ```
@@ -211,4 +216,4 @@ This library uses following huggingface models:
 #### https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb
 #### https://huggingface.co/Ransaka/whisper-tiny-sinhala-20k-8k-steps-v2
 
-#### https://huggingface.co/pyannote/speaker-diarization
\ No newline at end of file
+#### https://huggingface.co/pyannote/speaker-diarization
diff --git a/examples/.gitignore b/examples/.gitignore
index 8e174ff..811a5c3 100644
--- a/examples/.gitignore
+++ b/examples/.gitignore
@@ -9,4 +9,5 @@ greek_convo_short.mp3
 greek_convo_short.wav
 my_test.py
 greek_convo.mp3
-greek_convo.wav
\ No newline at end of file
+greek_convo.wav
+.env
\ No newline at end of file
diff --git a/examples/transcribe.py b/examples/transcribe.py
index fc86d75..5723e96 100644
--- a/examples/transcribe.py
+++ b/examples/transcribe.py
@@ -1,3 +1,4 @@
+import os
 from speechlib import Transcriptor
 
 file = "obama_zach.wav" # your audio file
@@ -6,7 +7,7 @@ log_folder = "logs" # log folder for storing transcripts
 modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
 quantization = False # setting this 'True' may speed up the process but lower the accuracy
 
-ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+ACCESS_TOKEN = "huggingface api key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
 
 # quantization only works on faster-whisper
 transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
@@ -24,4 +25,4 @@
 res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
 
 # use assembly ai model
-res = transcriptor.assemby_ai_model("your api key")
\ No newline at end of file
+res = transcriptor.assemby_ai_model("assemblyAI api key")
diff --git a/library.md b/library.md
index aba74a4..b7af440 100644
--- a/library.md
+++ b/library.md
@@ -70,13 +70,16 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
 
 ### Transcription example:
 
 ```
+import os
+from speechlib import Transcriptor
+
 file = "obama_zach.wav" # your audio file
 voices_folder = "" # voices folder containing voice samples for recognition
 language = "en" # language code
 log_folder = "logs" # log folder for storing transcripts
 modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
 quantization = False # setting this 'True' may speed up the process but lower the accuracy
-ACCESS_TOKEN = "your hf key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+ACCESS_TOKEN = "huggingface api key" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
 # quantization only works on faster-whisper
 transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
@@ -94,7 +97,9 @@ res = transcriptor.custom_whisper("D:/whisper_tiny_model/tiny.pt")
 res = transcriptor.huggingface_model("Jingmiao/whisper-small-chinese_base")
 
 # use assembly ai model
-res = transcriptor.assemby_ai_model("your api key")
+res = transcriptor.assemby_ai_model("assemblyAI api key")
+
+res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
 ```
 
 #### if you don't want speaker names: keep voices_folder as an empty string ""
diff --git a/requirements.txt b/requirements.txt
index 5893b85..476b184 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-transformers==4.36.2
-torch==2.1.2
-torchaudio==2.1.2
-pydub==0.25.1
-pyannote.audio==3.1.1
-speechbrain==0.5.16
-accelerate==0.26.1
-faster-whisper==0.10.1
-openai-whisper==20231117
\ No newline at end of file
+transformers>=4.36.2, <5.0.0
+torch>=2.1.2, <3.0.0
+torchaudio>=2.1.2, <3.0.0
+pydub>=0.25.1, <1.0.0
+pyannote.audio>=3.1.1, <4.0.0
+speechbrain>=0.5.16, <1.0.0
+accelerate>=0.26.1, <1.0.0
+faster-whisper>=0.10.1, <1.0.0
+openai-whisper>=20231117, <20240927
diff --git a/setup.py b/setup.py
index 0dbef84..c8bb6df 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="speechlib",
-    version="1.1.9",
+    version="1.1.10",
     description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. This library also contain audio preprocessor functions.",
     packages=find_packages(),
     long_description=long_description,
@@ -19,7 +19,7 @@
         "Programming Language :: Python :: 3.10",
         "Operating System :: OS Independent",
     ],
 
-    install_requires=["transformers", "torch", "torchaudio", "pydub", "pyannote.audio", "speechbrain==0.5.16", "accelerate", "faster-whisper", "openai-whisper", "assemblyai"],
+    install_requires=["transformers>=4.36.2, <5.0.0", "torch>=2.1.2, <3.0.0", "torchaudio>=2.1.2, <3.0.0", "pydub>=0.25.1, <1.0.0", "pyannote.audio>=3.1.1, <4.0.0", "speechbrain>=0.5.16, <1.0.0", "accelerate>=0.26.1, <1.0.0", "faster-whisper>=0.10.1, <1.0.0", "openai-whisper>=20231117, <20240927", "assemblyai"],
     python_requires=">=3.8",
 )
diff --git a/setup_instruction.md b/setup_instruction.md
index a07ae94..bb32cda 100644
--- a/setup_instruction.md
+++ b/setup_instruction.md
@@ -9,7 +9,7 @@ for publishing:
     pip install twine
 
 for install locally for testing:
-    pip install dist/speechlib-1.1.9-py3-none-any.whl
+    pip install dist/speechlib-1.1.10-py3-none-any.whl
 
 finally run:
     twine upload dist/*
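
A note on the `import os` additions: together with the new `.env` entry in examples/.gitignore, they suggest the HuggingFace token is meant to come from the environment rather than being hardcoded in source. A minimal sketch of that pattern follows; the `HF_TOKEN` variable name is an assumption for illustration, not something speechlib defines, and the constructor arguments mirror the README example above.

```
# Sketch only: read the HuggingFace access token from the environment.
# "HF_TOKEN" is an assumed variable name, not part of the library.
import os

from speechlib import Transcriptor

ACCESS_TOKEN = os.environ.get("HF_TOKEN")
if not ACCESS_TOKEN:
    raise RuntimeError("Set HF_TOKEN in your shell or a local .env file")

# Same 7 arguments as in the README example above.
transcriptor = Transcriptor(
    "obama_zach.wav",  # audio file
    "logs",            # log folder for storing transcripts
    "en",              # language code
    "tiny",            # model size
    ACCESS_TOKEN,      # token now comes from the environment, not source code
    "",                # voices_folder; empty string skips speaker naming
    False,             # quantization
)
```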
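
On the requirements.txt change: moving from exact pins (`==`) to bounded ranges (`>=lower, <upper`) lets pip resolve newer compatible releases while still blocking the next major version. A quick way to sanity-check one of the new specifiers, using the `packaging` library (`pip install packaging` if it is not already present):

```
# Verify which versions satisfy the new bounded torch constraint.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=2.1.2, <3.0.0")  # torch line from requirements.txt

print(Version("2.1.2") in spec)  # True:  the previously pinned version still qualifies
print(Version("2.4.1") in spec)  # True:  newer 2.x releases are now allowed
print(Version("3.0.0") in spec)  # False: the next major version is excluded
```

The same check applies to each dependency in the list, since every entry now follows the `>=tested_version, <next_major` pattern.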