feat: code for whisper-large-v3 #548

Closed
wants to merge 9 commits
Changes from 7 commits
21 changes: 21 additions & 0 deletions .gitignore
@@ -13,3 +13,24 @@ venv/
 # Ignore IDE, Editor Files
 .idea/
 .vscode/
+
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
3 changes: 2 additions & 1 deletion faster_whisper/tokenizer.py
@@ -108,7 +108,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str:
     def split_to_word_tokens(
         self, tokens: List[int]
     ) -> Tuple[List[str], List[List[int]]]:
-        if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
             # These languages don't typically use spaces, so it is difficult to split words
             # without morpheme analysis. Here, we instead split words at any
             # position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ def split_tokens_on_spaces(
     "yi",
     "yo",
     "zh",
+    "yue",
 )
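
For context, the unicode-point splitting strategy that the comment above describes can be sketched as follows. This is a simplified illustration, not the library's implementation (the real logic lives in split_to_word_tokens and also handles replacement characters that genuinely occur in the text); `decode` stands in for the tokenizer's decode method:

from typing import Callable, List, Tuple


def split_at_valid_unicode(
    tokens: List[int], decode: Callable[[List[int]], str]
) -> Tuple[List[str], List[List[int]]]:
    # Accumulate tokens until they decode to valid text. A token sequence that
    # ends mid-character decodes to the replacement character U+FFFD, so a
    # clean decode marks a safe split position.
    words: List[str] = []
    word_tokens: List[List[int]] = []
    current: List[int] = []
    for token in tokens:
        current.append(token)
        decoded = decode(current)
        if "\ufffd" not in decoded:
            words.append(decoded)
            word_tokens.append(current)
            current = []
    return words, word_tokens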
28 changes: 25 additions & 3 deletions faster_whisper/transcribe.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 import logging
 import os
 import zlib
@@ -92,8 +93,8 @@ def __init__(

         Args:
           model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
-            small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
-            model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
+            small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
+            converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
             When a size or a model ID is configured, the converted model is downloaded
             from the Hugging Face Hub.
           device: Device to use for computation ("cpu", "cuda", "auto").
@@ -113,6 +114,9 @@ def __init__(
             are saved in the standard Hugging Face cache directory.
           local_files_only: If True, avoid downloading the file and return the path to the
             local cached file if it exists.
+          feature_size: Number of mel filters to use for feature extraction. If not set,
+            the number of mel filters is inferred from the model version. The first release
+            used 80 bins, but the large-v3 model uses 128 bins.
         """
         self.logger = get_logger()

Review comment on the feature_size docstring: Not used anymore.

@@ -142,7 +146,25 @@ def __init__(
             "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
         )

-        self.feature_extractor = FeatureExtractor()
+        feature_extractor_file = os.path.join(model_path, "preprocessor_config.json")
+        if os.path.isfile(feature_extractor_file):
+            with open(feature_extractor_file, "r") as f:
+                config = json.load(f)
+            feat_kwargs = {
+                k: config[k]
+                for k in [
+                    "n_fft",
+                    "hop_length",
+                    "feature_size",
+                    "sampling_rate",
+                    "chunk_length",
+                ]
+                if k in config
+            }
+        else:
+            feat_kwargs = {}
+
+        self.feature_extractor = FeatureExtractor(**feat_kwargs)
         self.num_samples_per_token = self.feature_extractor.hop_length * 2
         self.frames_per_second = (
             self.feature_extractor.sampling_rate // self.feature_extractor.hop_length

Review comment on the preprocessor_config.json loading: Maybe move that into a specific method? And use the n_mels from ct2 as a fallback?

Author: I'll try to get to this tomorrow; my bandwidth is extremely limited with the American holidays at the moment.

Collaborator, on lines +155 to +161: Minor remark: could you make these parameters less hard-coded in this new method? They come from this class: https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/feature_extractor.py#L8-L12
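
A sketch of what the suggested refactor could look like: the config loading moves into its own helper, and the accepted keys are derived from FeatureExtractor's constructor signature instead of being hard-coded. The helper name is hypothetical, and this assumes the five parameters above are declared as keyword arguments on FeatureExtractor.__init__:

import inspect
import json
import os

from faster_whisper.feature_extractor import FeatureExtractor


def _load_feature_extractor_kwargs(model_path: str) -> dict:
    # Hypothetical helper: read preprocessor_config.json if present and keep
    # only the keys that FeatureExtractor's constructor actually accepts.
    config_file = os.path.join(model_path, "preprocessor_config.json")
    if not os.path.isfile(config_file):
        return {}
    with open(config_file, "r") as f:
        config = json.load(f)
    valid_keys = set(inspect.signature(FeatureExtractor.__init__).parameters)
    valid_keys.discard("self")
    return {k: v for k, v in config.items() if k in valid_keys}


# Inside WhisperModel.__init__, the construction then reduces to:
# self.feature_extractor = FeatureExtractor(**_load_feature_extractor_kwargs(model_path))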
4 changes: 3 additions & 1 deletion faster_whisper/utils.py
@@ -21,6 +21,7 @@
     "large-v1": "guillaumekln/faster-whisper-large-v1",
     "large-v2": "guillaumekln/faster-whisper-large-v2",
     "large": "guillaumekln/faster-whisper-large-v2",
+    "large-v3": "bababababooey/faster-whisper-large-v3",
 }

Review comment on the large-v3 entry: Who owns that? Could Systran create its own hub, to keep ownership of the models and to easily update old models and upload new ones?

Contributor:
> who is owning that?
Some user posted that link on the issues. Best would be to keep the model under an official account like "systran", and maybe move all the models there.

nguyendc-systran (Collaborator), Nov 22, 2023: Good point. We are waiting for the release on CTranslate2, and will then push the new converted model tomorrow (with the fix from OpenNMT/CTranslate2#1546) to the Systran organization.

Collaborator: FYI, several models are now available under the Systran organization: https://huggingface.co/Systran (including large-v3, converted by the latest CTranslate2, 3.22.0).

Contributor: @nguyendc-systran thank you! So essentially we are just waiting for @stillmatic to make the last fixes you mentioned before the merge?

Reviewer: @Purfview I think you accidentally pasted the same URL.

Contributor: @AvivSham Fixed it.

blackpolarz, Nov 23, 2023: Not sure if I missed anything, but in tokenizer.json there is a difference at token 50363: "nospeech" vs. "nocaptions". The same difference is in vocabulary.json. The CTranslate2 large-v2 model uses "nocaptions", which matches what flyingleaf is using, while hf-large-v3 uses "nospeech", which matches what Systran is using.

nguyendc-systran (Collaborator), Nov 23, 2023:
> @nguyendc-systran thank you! So essentially we are just waiting for @stillmatic to make the last fixes you mentioned before the merge?
IMHO, yes. Another point that may be interesting/relevant is this benchmark: #548 (comment). Not sure whether @funboarder13920 has had a chance to look at that?

Reviewer: Yep, the inference results between openai/hf/faster_whisper are not exactly the same, but they are very similar. I guess I was witnessing the differences between v2 and v3.
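
A quick way to verify the token-50363 difference described above, assuming both converted models have been downloaded locally and that vocabulary.json is a plain list indexed by token id (the directory names are placeholders):

import json

for model_dir in ("faster-whisper-large-v2", "faster-whisper-large-v3"):
    with open(f"{model_dir}/vocabulary.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
    # Expected: "nocaptions" for large-v2 and "nospeech" for large-v3.
    print(model_dir, vocab[50363])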


@@ -50,7 +51,7 @@ def download_model(

    Args:
      size_or_id: Size of the model to download from https://huggingface.co/guillaumekln
        (tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2,
-        large), or a CTranslate2-converted model ID from the Hugging Face Hub
+        large, large-v3), or a CTranslate2-converted model ID from the Hugging Face Hub
        (e.g. guillaumekln/faster-whisper-large-v2).
      output_dir: Directory where the model should be saved. If not set, the model is saved in
        the cache directory.
@@ -76,6 +77,7 @@ def download_model(

     allow_patterns = [
         "config.json",
+        "preprocessor_config.json",
         "model.bin",
         "tokenizer.json",
         "vocabulary.*",
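
Putting the pieces together: once the converted model is available under the alias above, using large-v3 should look identical to the existing models. A sketch based on the library's documented usage (the audio path is a placeholder):

from faster_whisper import WhisperModel

# "large-v3" resolves through the _MODELS table to a CTranslate2 conversion on
# the Hugging Face Hub; preprocessor_config.json is now downloaded as well, so
# the 128-bin mel features are configured automatically.
model = WhisperModel("large-v3", device="cuda", compute_type="float16")

segments, info = model.transcribe("audio.mp3", beam_size=5)
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))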