diff --git a/narrative-audio-system/.gitignore b/narrative-audio-system/.gitignore new file mode 100644 index 0000000..e9b1d13 --- /dev/null +++ b/narrative-audio-system/.gitignore @@ -0,0 +1,29 @@ +# Claude Code internals +.claude/ + +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.egg-info/ +dist/ +build/ +.eggs/ +.venv/ +venv/ +env/ + +# Model checkpoints (large binary files) +*.pt +*.bin +*.onnx +*.ckpt + +# Cached embeddings (regenerate locally) +*_cache.npz + +# OS +.DS_Store +Thumbs.db diff --git a/narrative-audio-system/README.md b/narrative-audio-system/README.md new file mode 100644 index 0000000..9faf477 --- /dev/null +++ b/narrative-audio-system/README.md @@ -0,0 +1,332 @@ +# TableTalk Narrative Audio System + +This repository contains the Technical Evaluation Test for the **GSoC 2026 HumanAI: TableTalk** project. It implements an end-to-end pipeline for processing, classifying, and retrieving narrative audio for interactive storytelling. + +## 📄 Final Submission Documents +* **[Technical Report (PDF)](./TableTalk%20Narrative%20Audio%20System_%20Technical%20Report.pdf)** - Detailed analysis of methodology, results, and storytelling heuristics. +* **[Implementation Roadmap](./TableTalk%20Narrative%20Audio%20System_%20Technical%20Report.pdf#page=4)** - 12-week GSoC project plan. + +--- + +## 🌟 Key Results +* **Task 3 (Transcription):** Achieved an average **Word Error Rate (WER) of 16.67%** using OpenAI Whisper. +* **Task 2 (Classification):** Successfully trained a neural model to **38.6% accuracy** on the RAVDESS subset, identifying key markers for "Calm" vs. "Fearful" tones. +* **Bonus Task:** Developed a storytelling detection heuristic using **Mean Pitch Variation (123.01 Hz)** and **Pause Ratios (0.952)**. + +--- + +## 🛠️ Setup & Installation + +### 1. System Requirements +This project requires **FFmpeg** for audio processing. +* **macOS:** `brew install ffmpeg` +* **Ubuntu/Linux:** `sudo apt install ffmpeg` +* **Windows:** Install via [ffmpeg.org](https://ffmpeg.org/download.html) and add to PATH. + +### 2. Python Environment +```bash +# Clone the repository +git clone [YOUR_REPO_URL] +cd [REPO_NAME] + +# Install dependencies +pip install -r requirements.txt +``` + +> **Note:** `sounddevice` (added in Step 1) requires PortAudio. +> * **macOS:** `brew install portaudio` +> * **Ubuntu/Linux:** `sudo apt install portaudio19-dev` +> * **Windows:** included automatically via the pip wheel. 
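+
+### 3. Sanity Check (Optional)
+
+Before running the pipeline, you can confirm that FFmpeg and PortAudio are reachable. A minimal check (`shutil.which` and `sounddevice.query_devices` are standard APIs; this script is illustrative only and not part of the repository):
+
+```python
+import shutil
+
+import sounddevice as sd
+
+# FFmpeg must be discoverable on PATH for the audio preprocessing steps.
+print("ffmpeg:", shutil.which("ffmpeg") or "NOT FOUND — install it first")
+
+# query_devices() fails loudly if PortAudio is missing or misconfigured.
+inputs = [d["name"] for d in sd.query_devices() if d["max_input_channels"] > 0]
+print(f"{len(inputs)} input device(s) found:", ", ".join(inputs) or "none")
+```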
+
+---
+
+## Project Structure
+
+- `run_pipeline.py`: End-to-end pipeline for all required tasks and bonus analysis
+- `task0_audio_capture/audio_capture.py`: **Step 1** — real-time microphone capture & streaming
+- `step2_vad/vad.py`: **Step 2** — Voice Activity Detection; labels frames as speech/silence and emits timestamped segments
+- `task1_audio_pipeline/audio_pipeline.py`: Task 1 audio preprocessing and feature extraction
+- `task2_tone_classification/train_classifier.py`: Task 2 tone classification model training and evaluation
+- `task3_transcription/whisper_transcriber.py`: Task 3 batch transcription and WER measurement
+- `task4_audio_retrieval/retrieval_prototype.py`: Task 4 retrieval prototype (filtering + semantic ranking)
+- `task_bonus_storytelling/storytelling_analysis.py`: Bonus storytelling feature analysis and scoring
+- `examples/`: Input recordings, labels, and generated output artifacts
+
+---
+
+## Task Summary
+
+### Step 1: Audio Capture & Streaming
+
+The capture module (`task0_audio_capture/audio_capture.py`) provides real-time microphone input as the first stage of the pipeline:
+
+1. Opens a microphone stream via `sounddevice.InputStream` with a callback that fires every `chunk_size` samples
+2. Each callback pushes raw PCM samples into a thread-safe `RollingBuffer` (circular deque)
+3. Downstream steps read from the buffer without blocking audio capture
+
+Key parameters:
+
+| Parameter | Default | Effect |
+|-----------|---------|--------|
+| Sample rate | 16,000 Hz | Standard for speech |
+| Chunk size | 1,024 samples | 64 ms per frame |
+| Max buffer | 30 s | Rolling window; oldest frames dropped |
+
+Chunk size trade-offs: 512 samples = 32 ms (lower latency, higher CPU); 2048 samples = 128 ms (lower CPU, higher latency).
+
+**Standalone usage:**
+
+```bash
+# Record 5 seconds and print stats
+python task0_audio_capture/audio_capture.py --duration 5 --chunk 1024 --rate 16000
+
+# Save captured audio to a WAV file
+python task0_audio_capture/audio_capture.py --duration 5 --chunk 1024 --rate 16000 --save examples/captured_audio.wav
+
+# List available microphone devices
+python task0_audio_capture/audio_capture.py --list-devices
+```
+
+**Library usage:**
+
+```python
+import time
+
+from task0_audio_capture.audio_capture import AudioCaptureStream, RollingBuffer
+
+buf = RollingBuffer(max_seconds=10, sample_rate=16000)
+with AudioCaptureStream(sample_rate=16000, chunk_size=1024, buffer=buf) as stream:
+    time.sleep(5)  # capture runs in the background for 5 seconds
+audio = buf.get_audio()  # 1-D float32 numpy array
+```
+
+When `run_pipeline.py` is executed without an explicit audio path argument, Step 1 automatically captures 5 seconds from the microphone and passes the recording on to Task 1 and the downstream tasks. If no microphone is available, the pipeline falls back to a pre-recorded file.
+
+Primary output: `examples/captured_audio.wav`
+
+---
+
+### Step 2: Voice Activity Detection (VAD)
+
+The VAD module (`step2_vad/vad.py`) labels every 10–30 ms audio frame as speech or non-speech and groups consecutive speech frames into timestamped `SpeechSegment` objects. Only detected speech windows are forwarded to the transcriber — silence is never sent, which prevents wasted compute and mid-sentence cuts.
+
+**Why it matters:** Whisper and similar models need clean speech boundaries to produce coherent transcripts. Sending raw silence causes hallucinations or empty outputs; cutting too early truncates words.
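+
+A minimal sketch of this gating idea (the `detect_speech_segments` helper is real and shown under *Library usage* below; `transcribe` is a hypothetical stand-in for the Task 3 call):
+
+```python
+from step2_vad.vad import detect_speech_segments
+
+def transcribe(pcm):  # placeholder for the Task 3 Whisper call (hypothetical)
+    return "<transcript>"
+
+def transcribe_speech_only(audio, sample_rate=16000):
+    """Send only VAD-detected speech windows to the transcriber."""
+    texts = []
+    for seg in detect_speech_segments(audio, sample_rate=sample_rate,
+                                      aggressiveness=2):
+        # seg.audio holds the padded float32 PCM for one utterance;
+        # silence between segments never reaches the transcriber.
+        texts.append(transcribe(seg.audio))
+    return texts
+```
+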
+ +**Backends (auto-selected)** + +| Backend | Latency | Accuracy | Requirement | +|---------|---------|----------|-------------| +| `webrtcvad` | < 1 ms/frame | Good (rule-based) | `pip install webrtcvad` | +| `silero-vad` | ~ 5 ms/frame | Better (neural) | `torch` + internet (auto-download) | +| `energy` | < 0.1 ms/frame | Basic fallback | none | + +**Key parameters** + +| Parameter | Default | Effect | +|-----------|---------|--------| +| Frame duration | 20 ms | VAD resolution (10/20/30 ms) | +| Aggressiveness | 2 | webrtcvad mode 0–3 (3 = most aggressive noise filtering) | +| Speech pad | 300 ms | Pre-speech buffer to avoid clipping first syllable | +| Silence pad | 400 ms | Post-speech tolerance — allows brief pauses within an utterance | +| Min speech | 250 ms | Discard clicks and noise bursts shorter than this | + +**Example output** +``` +[VAD] Speech started at T = 1.200 s +[VAD] Speech ended at T = 3.840 s (duration 2.640 s) +[VAD] Detected 1 speech segment(s) +[VAD] Total speech : 2.640 s / 5.056 s (52.2%) + [ 1] 1.200s -> 3.840s (2.640 s) +``` + +**Standalone usage:** + +```bash +# Run VAD on a WAV file +python step2_vad/vad.py --input examples/captured_audio.wav --mode 2 --frame-ms 20 + +# Run VAD on live microphone input (10 seconds) +python step2_vad/vad.py --duration 10 --mode 2 + +# Force a specific backend +python step2_vad/vad.py --input audio.wav --backend webrtcvad +``` + +**Library usage:** + +```python +from step2_vad.vad import detect_speech_segments + +segments = detect_speech_segments(audio_array, sample_rate=16000, aggressiveness=2) +for seg in segments: + print(f"Speech {seg.start:.2f}s - {seg.end:.2f}s ({seg.duration:.2f}s)") + # seg.audio is the float32 PCM for that utterance +``` + +In `run_pipeline.py`, Step 2 runs immediately after Step 1 and prints a per-segment breakdown before passing audio to Task 1. + +--- + +### Task 1: Audio Processing Pipeline + +The Task 1 pipeline: + +1. Loads `.wav` files from the input directory +2. Normalizes audio amplitude for consistent feature extraction +3. Segments audio into fixed windows when needed +4. Extracts machine-learning-ready features + +Extracted features include: + +1. MFCC coefficients +2. Pitch (fundamental frequency summary) +3. Spectral centroid +4. RMS energy +5. Duration + +Primary outputs: + +- `examples/task1_features_dataset.csv` +- `examples/normalized_audio/` + +### Task 2: Narrative Tone Classification + +The classifier uses MFCC-based features and a feedforward neural network to predict emotional tone labels. The training pipeline includes: + +1. Stratified train/test split +2. Feature standardization using train-set statistics +3. Neural model training with cross-entropy loss +4. Test evaluation + +Reported metrics: + +1. Accuracy +2. Weighted F1 score +3. Per-class report + +### Task 3: AI-Based Transcription + +The transcription module uses OpenAI Whisper to: + +1. Transcribe multiple recordings in batch +2. Save transcripts to a text output file +3. Measure transcription quality with Word Error Rate (WER) on a subset + +Primary output: + +- `examples/transcripts.txt` + +### Task 4: Narrative Audio Retrieval (TableTalk Simulation) + +The retrieval system uses a hybrid strategy: + +1. Structured filtering from query constraints (duration, energy, pitch, tone) +2. Semantic ranking over generated recording descriptions + +Example queries: + +1. `calm narration longer than 4 seconds` +2. `high-energy speech` +3. 
`dramatic dialogue` + +### Bonus: Storytelling Audio Analysis + +The bonus module analyzes several recordings for storytelling-related cues: + +1. Pacing and pauses +2. Pitch variation +3. Energy dynamics +4. Sentence-length characteristics from transcripts + +It also computes a heuristic `storytelling_score` and ranks clips by storytelling-like expressiveness. + +Primary output: + +- `examples/storytelling_analysis.csv` + +--- + +## Run Instructions + +### Run the full pipeline + +From the `narrative-audio-system/` directory: + +```bash +python run_pipeline.py examples/03-01-04-02-01-01-11.wav +``` + +### Run tasks individually + +Step 1 (audio capture): + +```bash +python task0_audio_capture/audio_capture.py --duration 5 --chunk 1024 --rate 16000 --save examples/captured_audio.wav +``` + +Step 2 (voice activity detection): + +```bash +python step2_vad/vad.py --input examples/captured_audio.wav --mode 2 --frame-ms 20 +``` + +Task 1: + +```bash +python task1_audio_pipeline/audio_pipeline.py +``` + +Task 2: + +```bash +python task2_tone_classification/train_classifier.py +``` + +Task 3: + +```bash +python task3_transcription/whisper_transcriber.py +``` + +Task 4: + +```bash +python task4_audio_retrieval/retrieval_prototype.py +``` + +Bonus task: + +```bash +python task_bonus_storytelling/storytelling_analysis.py +``` + +--- + +## Example Outputs + +Generated artifacts include: + +1. `examples/captured_audio.wav` (Step 1 — live microphone recording) +2. `examples/task1_features_dataset.csv` +3. `examples/normalized_audio/` +4. `examples/transcripts.txt` +5. `examples/storytelling_analysis.csv` + +Console outputs include: + +1. Task 2 test metrics (accuracy, weighted F1, class report) +2. Task 3 WER summary +3. Task 4 retrieval results for sample queries +4. Bonus storytelling summary and top-ranked clips + +--- + +## Approach and Discussion + +This project is designed as a practical, reproducible end-to-end prototype for narrative audio processing. + +- Task 1 converts raw recordings into structured numerical features. +- Task 2 demonstrates tone classification from audio-derived features. +- Task 3 provides scalable transcription with measurable quality. +- Task 4 combines explicit filtering with semantic retrieval for narrative-style queries. +- The bonus task explores prosodic and transcript-level cues for distinguishing storytelling narration from conversational speech. + +Current limitations include dataset scale, CPU transcription speed, and the heuristic nature of storytelling scoring. Future improvements include pretrained audio embeddings, stronger ranking objectives, and dedicated storytelling annotations. \ No newline at end of file diff --git a/narrative-audio-system/TableTalk Narrative Audio System_ Technical Report.pdf b/narrative-audio-system/TableTalk Narrative Audio System_ Technical Report.pdf new file mode 100644 index 0000000..f061c89 Binary files /dev/null and b/narrative-audio-system/TableTalk Narrative Audio System_ Technical Report.pdf differ diff --git a/narrative-audio-system/emotion_classifier/classifier.py b/narrative-audio-system/emotion_classifier/classifier.py new file mode 100644 index 0000000..2fb3382 --- /dev/null +++ b/narrative-audio-system/emotion_classifier/classifier.py @@ -0,0 +1,676 @@ +""" +Step 5 — Tone / Emotion Classification +======================================== +Classifies the emotional tone of an Utterance in parallel with Step 4 +transcription. 
Both steps consume the same audio buffer from Step 3 and
+are dispatched concurrently via ThreadPoolExecutor so neither waits on the
+other.
+
+Pipeline position
+-----------------
+    [Step 3 Utterance]
+         |
+         +-----> Step 4 Transcriber  (Whisper, ~300–600 ms)
+         |
+         +-----> Step 5 Classifier   (MFCC + MLP, ~5–15 ms)   <-- this module
+         |
+    [merged result: text + emotion]
+
+Backends
+--------
+    mfcc-mlp (default)
+        Extract 13 MFCC coefficients with librosa (~5 ms), run them through
+        the 2-layer feedforward network from Task 2 (~10 ms on CPU). Fast
+        enough to never be the bottleneck.
+
+    wav2vec2 / hubert (research direction)
+        Classifiers fine-tuned on wav2vec2 or HuBERT embeddings generalise
+        far better to real microphone audio than raw MFCCs. These embeddings
+        capture phonetic and prosodic context across a full utterance.
+        Recommended for production use with Matthew.
+        Not loaded by default — enable with backend="wav2vec2" and
+        pip install transformers.
+
+Output
+------
+    EmotionResult(label="tense", confidence=0.82,
+                  start=1.2, end=3.84, latency_ms=12.4)
+
+Usage (standalone demo)
+-----------------------
+    python emotion_classifier/classifier.py --input examples/captured_audio.wav
+    python emotion_classifier/classifier.py --train    # retrain on RAVDESS
+
+Usage (library)
+---------------
+    from emotion_classifier.classifier import EmotionClassifier, EmotionResult
+
+    clf = EmotionClassifier()
+    result = clf.classify(utterance)
+    print(result.label, result.confidence)
+
+Parallel usage with Step 4
+--------------------------
+    from concurrent.futures import ThreadPoolExecutor
+    from emotion_classifier.classifier import EmotionClassifier
+    from transcriber.streaming_transcriber import Transcriber
+
+    clf = EmotionClassifier()
+    trs = Transcriber()
+
+    with ThreadPoolExecutor(max_workers=2) as pool:
+        t_future = pool.submit(trs.transcribe, utterance)
+        e_future = pool.submit(clf.classify, utterance)
+        transcript = t_future.result()
+        emotion = e_future.result()
+"""
+
+import argparse
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+# ---------------------------------------------------------------------------
+# Optional heavy imports
+# ---------------------------------------------------------------------------
+
+try:
+    import torch
+    import torch.nn as nn
+    _TORCH_AVAILABLE = True
+except ImportError:
+    _TORCH_AVAILABLE = False
+    torch = None
+
+    # Stand-in so the module still imports (and the _ToneNet class statement
+    # below still evaluates) without torch; the mfcc-mlp backend re-checks
+    # _TORCH_AVAILABLE and raises a clear ImportError before any use.
+    class nn:
+        Module = object
+
+try:
+    import librosa
+    _LIBROSA_AVAILABLE = True
+except ImportError:
+    _LIBROSA_AVAILABLE = False
+
+
+# ---------------------------------------------------------------------------
+# Data class
+# ---------------------------------------------------------------------------
+
+@dataclass
+class EmotionResult:
+    """
+    Tone / emotion prediction for one utterance.
+
+    Attributes
+    ----------
+    label : str
+        Predicted emotion label (e.g. "calm", "angry", "tense").
+    confidence : float
+        Softmax probability of the top prediction (0–1).
+    start : float
+        Utterance start time in seconds.
+    end : float
+        Utterance end time in seconds.
+    latency_ms : float
+        Wall-clock time for feature extraction + inference (ms).
+    all_scores : dict
+        Softmax probability for every class: {"calm": 0.7, "angry": 0.2, ...}
+    backend : str
+        Which backend produced this result.
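+
+    Examples
+    --------
+    >>> r = EmotionResult(label="calm", confidence=0.91, start=1.0, end=3.5)
+    >>> r.duration
+    2.5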
+ """ + label: str + confidence: float + start: float = 0.0 + end: float = 0.0 + latency_ms: float = 0.0 + all_scores: Dict[str, float] = field(default_factory=dict) + backend: str = "mfcc-mlp" + + @property + def duration(self) -> float: + return self.end - self.start + + def __repr__(self) -> str: + return ( + f'EmotionResult(label={self.label!r}, confidence={self.confidence:.2f}, ' + f'start={self.start:.3f}s, end={self.end:.3f}s, ' + f'latency={self.latency_ms:.1f}ms, backend={self.backend!r})' + ) + + +# --------------------------------------------------------------------------- +# Neural network (matches Task 2 architecture) +# --------------------------------------------------------------------------- + +class _ToneNet(nn.Module): + """2-layer feedforward classifier — identical to Task 2 ToneClassifier.""" + + def __init__(self, input_dim: int, hidden_dim: int, num_classes: int): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(hidden_dim, num_classes), + ) + + def forward(self, x): + return self.net(x) + + +# --------------------------------------------------------------------------- +# MFCC feature extraction +# --------------------------------------------------------------------------- + +def _extract_mfcc(audio: np.ndarray, sample_rate: int = 16000, + n_mfcc: int = 13) -> np.ndarray: + """Return mean MFCC vector (shape: [n_mfcc]) from a float32 audio array.""" + if not _LIBROSA_AVAILABLE: + raise ImportError("librosa not installed. Run: pip install librosa") + mfcc = librosa.feature.mfcc(y=audio.astype(np.float32), + sr=sample_rate, n_mfcc=n_mfcc) + return mfcc.mean(axis=1).astype(np.float32) + + +# --------------------------------------------------------------------------- +# Core classifier +# --------------------------------------------------------------------------- + +class EmotionClassifier: + """ + Classifies emotional tone from a raw audio array or Utterance object. + + Parameters + ---------- + model_path : str | Path | None + Path to a saved checkpoint (.pt) produced by save(). If None and + train_on_init=True, the model is trained on the RAVDESS examples. + label_map_path : str | Path | None + Path to emotion_labels.json. Defaults to examples/emotion_labels.json. + backend : str + "mfcc-mlp" — fast MFCC + MLP (default). + "wav2vec2" — requires transformers; better on real mic audio. + "hubert" — requires transformers; similar to wav2vec2. + n_mfcc : int + Number of MFCC coefficients (must match trained model). + hidden_dim : int + MLP hidden layer width. + sample_rate : int + Expected audio sample rate. + train_on_init : bool + If True and no model_path, auto-train on RAVDESS examples at startup. + verbose : bool + Print status messages. + """ + + RAVDESS_EMOTIONS = [ + "neutral", "calm", "happy", "sad", + "angry", "fearful", "disgust", "surprised", + ] + + def __init__( + self, + model_path: Optional[str] = None, + label_map_path: Optional[str] = None, + backend: str = "mfcc-mlp", + n_mfcc: int = 13, + hidden_dim: int = 64, + sample_rate: int = 16000, + train_on_init: bool = True, + verbose: bool = True, + ): + if backend not in ("mfcc-mlp", "wav2vec2", "hubert"): + raise ValueError(f"Unknown backend {backend!r}. 
" + "Choose 'mfcc-mlp', 'wav2vec2', or 'hubert'.") + + self.backend = backend + self.n_mfcc = n_mfcc + self.hidden_dim = hidden_dim + self.sample_rate = sample_rate + self.verbose = verbose + + self._model = None + self._class_names: List[str] = [] + self._feature_mean: Optional[np.ndarray] = None + self._feature_std: Optional[np.ndarray] = None + self._transformer_pipeline = None + + # Resolve label map path + root = Path(__file__).resolve().parent.parent + self._label_map_path = Path(label_map_path) if label_map_path else ( + root / "examples" / "emotion_labels.json" + ) + + if backend == "mfcc-mlp": + self._init_mfcc_mlp(model_path, train_on_init) + else: + self._init_transformer(backend) + + # ------------------------------------------------------------------ + # Initialisation helpers + # ------------------------------------------------------------------ + + def _init_mfcc_mlp(self, model_path, train_on_init: bool) -> None: + if not _TORCH_AVAILABLE: + raise ImportError("torch not installed. Run: pip install torch") + + if model_path and Path(model_path).exists(): + self._load_checkpoint(model_path) + elif train_on_init and self._label_map_path.exists(): + self._train_from_examples() + elif train_on_init: + if self.verbose: + print(f"[Classifier] label map not found at {self._label_map_path}; " + "using untrained model with RAVDESS class names.") + self._class_names = self.RAVDESS_EMOTIONS + self._feature_mean = np.zeros(self.n_mfcc, dtype=np.float32) + self._feature_std = np.ones(self.n_mfcc, dtype=np.float32) + self._model = _ToneNet(self.n_mfcc, self.hidden_dim, len(self._class_names)) + else: + raise RuntimeError("No model_path provided and train_on_init=False.") + + def _init_transformer(self, backend: str) -> None: + """ + Research direction: wav2vec2 / HuBERT embeddings. + + These models capture rich phonetic and prosodic context and + generalise far better to real microphone audio than raw MFCCs. + Recommended for production with real performer audio. + """ + try: + from transformers import pipeline as hf_pipeline + task = "audio-classification" + model_id = ( + "facebook/wav2vec2-base" + if backend == "wav2vec2" + else "facebook/hubert-base-ls960" + ) + self._transformer_pipeline = hf_pipeline( + task, model=model_id, device=-1 + ) + self._class_names = [] # set by the HF model + if self.verbose: + print(f"[Classifier] Loaded {backend} pipeline ({model_id})") + except ImportError: + raise ImportError( + f"transformers not installed for backend={backend!r}.\n" + " pip install transformers\n" + "Note: wav2vec2/HuBERT are research-direction backends — " + "use mfcc-mlp for production." 
+ ) + + # ------------------------------------------------------------------ + # Training + # ------------------------------------------------------------------ + + def _train_from_examples(self) -> None: + import json + from sklearn.model_selection import train_test_split + + if self.verbose: + print(f"[Classifier] Training on {self._label_map_path} ...") + + with open(self._label_map_path, "r", encoding="utf-8-sig") as f: + label_map = json.load(f) + + audio_root = self._label_map_path.parent + features, labels = [], [] + for filename, emotion in label_map.items(): + wav = audio_root / filename + if not wav.is_file(): + continue + try: + import soundfile as sf + audio, sr = sf.read(str(wav), dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != self.sample_rate: + audio = librosa.resample(audio, orig_sr=sr, + target_sr=self.sample_rate) + vec = _extract_mfcc(audio, self.sample_rate, self.n_mfcc) + features.append(vec) + labels.append(emotion.strip().lower()) + except Exception: + continue + + if not features: + raise RuntimeError("No features could be extracted from examples/.") + + unique, encoded = np.unique(labels, return_inverse=True) + self._class_names = [str(c).strip().title() for c in unique.tolist()] + + X = np.array(features, dtype=np.float32) + self._feature_mean = X.mean(axis=0) + self._feature_std = X.std(axis=0) + 1e-6 + X_norm = (X - self._feature_mean) / self._feature_std + + if len(set(labels)) > 1: + X_tr, _, y_tr, _ = train_test_split( + X_norm, encoded, test_size=0.2, random_state=42, stratify=encoded + ) + else: + X_tr, y_tr = X_norm, encoded + + self._model = _ToneNet(self.n_mfcc, self.hidden_dim, len(self._class_names)) + criterion = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-3) + X_tensor = torch.tensor(X_tr, dtype=torch.float32) + y_tensor = torch.tensor(y_tr, dtype=torch.long) + + self._model.train() + for epoch in range(30): + logits = self._model(X_tensor) + loss = criterion(logits, y_tensor) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + self._model.eval() + if self.verbose: + print( + f"[Classifier] Trained on {len(features)} samples " + f"({len(self._class_names)} classes): " + f"{', '.join(self._class_names)}" + ) + + # ------------------------------------------------------------------ + # Checkpoint I/O + # ------------------------------------------------------------------ + + def save(self, path: str) -> None: + """Save model weights + normalisation stats to a .pt file.""" + torch.save({ + "model_state": self._model.state_dict(), + "class_names": self._class_names, + "feature_mean": self._feature_mean, + "feature_std": self._feature_std, + "n_mfcc": self.n_mfcc, + "hidden_dim": self.hidden_dim, + }, path) + if self.verbose: + print(f"[Classifier] Saved to {path}") + + def _load_checkpoint(self, path: str) -> None: + ckpt = torch.load(path, map_location="cpu") + self._class_names = ckpt["class_names"] + self._feature_mean = ckpt["feature_mean"] + self._feature_std = ckpt["feature_std"] + n_mfcc = ckpt.get("n_mfcc", self.n_mfcc) + hidden_dim = ckpt.get("hidden_dim", self.hidden_dim) + self._model = _ToneNet(n_mfcc, hidden_dim, len(self._class_names)) + self._model.load_state_dict(ckpt["model_state"]) + self._model.eval() + if self.verbose: + print(f"[Classifier] Loaded checkpoint from {path}") + + # ------------------------------------------------------------------ + # Inference + # ------------------------------------------------------------------ + 
+ def classify_array( + self, audio: np.ndarray, sample_rate: int = 16000, + start: float = 0.0, end: float = 0.0, + ) -> EmotionResult: + """ + Classify a raw float32 PCM array. + + Parameters + ---------- + audio : np.ndarray + 1-D float32 array at `sample_rate`. + sample_rate : int + start, end : float + Source timestamps for result metadata. + + Returns + ------- + EmotionResult + """ + if audio.ndim > 1: + audio = audio.flatten() + audio = audio.astype(np.float32) + + if sample_rate != self.sample_rate and _LIBROSA_AVAILABLE: + audio = librosa.resample(audio, orig_sr=sample_rate, + target_sr=self.sample_rate) + + t0 = time.perf_counter() + + if self.backend == "mfcc-mlp": + label, confidence, all_scores = self._infer_mfcc_mlp(audio) + else: + label, confidence, all_scores = self._infer_transformer(audio) + + latency_ms = (time.perf_counter() - t0) * 1000 + + result = EmotionResult( + label=label, + confidence=confidence, + start=start, + end=end, + latency_ms=latency_ms, + all_scores=all_scores, + backend=self.backend, + ) + + if self.verbose: + print( + f"[Classifier] {start:.3f}s -> {end:.3f}s " + f"({latency_ms:.1f}ms) " + f"{label!r} conf={confidence:.2f}" + ) + return result + + def classify(self, utterance) -> EmotionResult: + """ + Classify an Utterance object (from Step 3). + + Parameters + ---------- + utterance : Utterance + Must have .audio, .start, .end attributes. + + Returns + ------- + EmotionResult + """ + return self.classify_array( + audio=utterance.audio, + sample_rate=self.sample_rate, + start=utterance.start, + end=utterance.end, + ) + + def _infer_mfcc_mlp(self, audio: np.ndarray) -> Tuple[str, float, Dict]: + vec = _extract_mfcc(audio, self.sample_rate, self.n_mfcc) + norm = (vec - self._feature_mean) / self._feature_std + tensor = torch.tensor(norm, dtype=torch.float32).unsqueeze(0) + self._model.eval() + with torch.no_grad(): + logits = self._model(tensor) + probs = torch.softmax(logits, dim=1).squeeze(0).numpy() + idx = int(np.argmax(probs)) + label = self._class_names[idx] + all_scores = {name: float(probs[i]) + for i, name in enumerate(self._class_names)} + return label, float(probs[idx]), all_scores + + def _infer_transformer(self, audio: np.ndarray) -> Tuple[str, float, Dict]: + results = self._transformer_pipeline( + {"raw": audio, "sampling_rate": self.sample_rate} + ) + top = max(results, key=lambda r: r["score"]) + all_scores = {r["label"]: r["score"] for r in results} + return top["label"], top["score"], all_scores + + +# --------------------------------------------------------------------------- +# Parallel processor — Step 4 + Step 5 concurrently +# --------------------------------------------------------------------------- + +class ParallelProcessor: + """ + Runs Step 4 (Transcriber) and Step 5 (EmotionClassifier) concurrently + on the same Utterance using a 2-thread pool. + + Both steps read the utterance's audio array independently — no locking + needed since the array is read-only during inference. + + Parameters + ---------- + transcriber : Transcriber + A loaded Step 4 Transcriber instance. + classifier : EmotionClassifier + A loaded Step 5 EmotionClassifier instance. + verbose : bool + Print combined results to stdout. + """ + + def __init__(self, transcriber, classifier, verbose: bool = True): + self._transcriber = transcriber + self._classifier = classifier + self.verbose = verbose + + def process(self, utterance): + """ + Run transcription and classification in parallel. 
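+
+        Both futures are submitted together and awaited before returning,
+        so wall time is roughly max(transcription, classification) rather
+        than their sum.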
+ + Returns + ------- + (TranscriptionResult, EmotionResult) + """ + with ThreadPoolExecutor(max_workers=2) as pool: + t_future = pool.submit(self._transcriber.transcribe, utterance) + e_future = pool.submit(self._classifier.classify, utterance) + transcript = t_future.result() + emotion = e_future.result() + + if self.verbose: + print( + f"[Parallel] {utterance.start:.3f}s -> {utterance.end:.3f}s\n" + f" text : \"{transcript.text}\"\n" + f" emotion : {emotion.label!r} ({emotion.confidence:.2f})\n" + f" latencies: transcribe={transcript.latency_ms:.0f}ms " + f"classify={emotion.latency_ms:.1f}ms" + ) + return transcript, emotion + + def process_all(self, utterances) -> List[Tuple]: + """Process every utterance and return [(TranscriptionResult, EmotionResult)].""" + return [self.process(u) for u in utterances] + + +# --------------------------------------------------------------------------- +# Convenience function +# --------------------------------------------------------------------------- + +def classify_utterances( + utterances, + label_map_path: Optional[str] = None, + verbose: bool = True, +) -> List[EmotionResult]: + """ + One-call helper: classify a list of Utterances. + + Returns + ------- + List[EmotionResult] + """ + clf = EmotionClassifier( + label_map_path=label_map_path, + verbose=verbose, + ) + return [clf.classify(u) for u in utterances] + + +# --------------------------------------------------------------------------- +# CLI demo +# --------------------------------------------------------------------------- + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Step 5 — Tone/Emotion Classification demo" + ) + parser.add_argument("--input", default=None, metavar="FILE.WAV", + help="WAV file to classify (default: live mic)") + parser.add_argument("--duration", type=float, default=10.0, + help="Live recording duration in seconds (default: 10)") + parser.add_argument("--rate", type=int, default=16000) + parser.add_argument("--backend", default="mfcc-mlp", + choices=["mfcc-mlp", "wav2vec2", "hubert"], + help="Classification backend (default: mfcc-mlp)") + parser.add_argument("--model", default=None, metavar="CHECKPOINT.pt", + help="Path to saved classifier checkpoint") + parser.add_argument("--train", action="store_true", + help="Retrain on RAVDESS examples and save to classifier.pt") + parser.add_argument("--vad-mode", type=int, default=2, choices=[0, 1, 2, 3]) + parser.add_argument("--parallel", action="store_true", + help="Run Step 4 transcription + Step 5 in parallel") + return parser.parse_args() + + +def main(): + args = _parse_args() + root = Path(__file__).resolve().parent.parent + for mod in ("vad_engine", "utterance_buffer", "task0_audio_capture", "transcriber"): + p = str(root / mod) + if p not in sys.path: + sys.path.insert(0, p) + + from vad import detect_speech_segments + from segmenter import segment_utterances + + print( + f"\nStep 5 — Tone / Emotion Classification\n" + f" Backend : {args.backend}\n" + f" Model : {args.model or 'auto-train from examples/'}\n" + f" Parallel : {args.parallel}\n" + ) + + # Load or capture audio + if args.input: + import soundfile as sf + audio, sr = sf.read(args.input, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != args.rate: + audio = librosa.resample(audio, orig_sr=sr, target_sr=args.rate) + print(f"[Classifier] Loaded {args.input} ({len(audio)/args.rate:.2f} s)\n") + else: + from audio_capture import record_for_duration + print(f"[Classifier] Recording 
{args.duration:.1f} s ...") + audio = record_for_duration(duration=args.duration, sample_rate=args.rate) + + # VAD + segmentation + vad_segs = detect_speech_segments( + audio, sample_rate=args.rate, aggressiveness=args.vad_mode, verbose=False + ) + utterances = segment_utterances( + vad_segs, strategy="pause_triggered", sample_rate=args.rate, verbose=False + ) + print(f"[Classifier] {len(utterances)} utterance(s)\n") + + # Train/save if requested + clf = EmotionClassifier( + model_path=args.model, + backend=args.backend, + verbose=True, + ) + if args.train: + clf.save("classifier.pt") + + if args.parallel: + from streaming_transcriber import Transcriber + from parallel_processor import ParallelProcessor + trs = Transcriber(model_size="tiny", verbose=False) + proc = ParallelProcessor(trs, clf, verbose=True) + for utt in utterances: + proc.process(utt) + else: + for utt in utterances: + clf.classify(utt) + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/examples/03-01-01-01-01-01-01.wav b/narrative-audio-system/examples/03-01-01-01-01-01-01.wav new file mode 100644 index 0000000..b2d81e1 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-01-01-01.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-01-01-04.wav b/narrative-audio-system/examples/03-01-01-01-01-01-04.wav new file mode 100644 index 0000000..a11cfe5 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-01-01-11.wav b/narrative-audio-system/examples/03-01-01-01-01-01-11.wav new file mode 100644 index 0000000..e3f10eb Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-01-02-01.wav b/narrative-audio-system/examples/03-01-01-01-01-02-01.wav new file mode 100644 index 0000000..d9957a9 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-01-02-01.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-01-02-04.wav b/narrative-audio-system/examples/03-01-01-01-01-02-04.wav new file mode 100644 index 0000000..0433d36 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-01-02-11.wav b/narrative-audio-system/examples/03-01-01-01-01-02-11.wav new file mode 100644 index 0000000..19d4129 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-02-01-01.wav b/narrative-audio-system/examples/03-01-01-01-02-01-01.wav new file mode 100644 index 0000000..4cf9e6c Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-02-01-01.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-02-01-04.wav b/narrative-audio-system/examples/03-01-01-01-02-01-04.wav new file mode 100644 index 0000000..f587bce Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-02-01-11.wav b/narrative-audio-system/examples/03-01-01-01-02-01-11.wav new file mode 100644 index 0000000..b432719 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-02-02-01.wav b/narrative-audio-system/examples/03-01-01-01-02-02-01.wav new file mode 100644 index 0000000..ab85ed0 Binary files /dev/null and 
b/narrative-audio-system/examples/03-01-01-01-02-02-01.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-02-02-04.wav b/narrative-audio-system/examples/03-01-01-01-02-02-04.wav new file mode 100644 index 0000000..7e714f4 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-01-01-02-02-11.wav b/narrative-audio-system/examples/03-01-01-01-02-02-11.wav new file mode 100644 index 0000000..96a1ad5 Binary files /dev/null and b/narrative-audio-system/examples/03-01-01-01-02-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-01-01-01.wav b/narrative-audio-system/examples/03-01-02-01-01-01-01.wav new file mode 100644 index 0000000..090452a Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-01-01-01.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-01-01-04.wav b/narrative-audio-system/examples/03-01-02-01-01-01-04.wav new file mode 100644 index 0000000..cba77ca Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-01-01-11.wav b/narrative-audio-system/examples/03-01-02-01-01-01-11.wav new file mode 100644 index 0000000..8c33364 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-01-02-01.wav b/narrative-audio-system/examples/03-01-02-01-01-02-01.wav new file mode 100644 index 0000000..d6a31b4 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-01-02-01.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-01-02-04.wav b/narrative-audio-system/examples/03-01-02-01-01-02-04.wav new file mode 100644 index 0000000..9379369 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-01-02-11.wav b/narrative-audio-system/examples/03-01-02-01-01-02-11.wav new file mode 100644 index 0000000..f382106 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-02-01-01.wav b/narrative-audio-system/examples/03-01-02-01-02-01-01.wav new file mode 100644 index 0000000..8c38e63 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-02-01-01.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-02-01-04.wav b/narrative-audio-system/examples/03-01-02-01-02-01-04.wav new file mode 100644 index 0000000..825b4fe Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-02-01-11.wav b/narrative-audio-system/examples/03-01-02-01-02-01-11.wav new file mode 100644 index 0000000..47fcb53 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-02-02-04.wav b/narrative-audio-system/examples/03-01-02-01-02-02-04.wav new file mode 100644 index 0000000..7c8bc27 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-01-02-02-11.wav b/narrative-audio-system/examples/03-01-02-01-02-02-11.wav new file mode 100644 index 0000000..96f61a1 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-01-02-02-11.wav differ diff --git 
a/narrative-audio-system/examples/03-01-02-02-01-01-04.wav b/narrative-audio-system/examples/03-01-02-02-01-01-04.wav new file mode 100644 index 0000000..4031d8d Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-01-01-11.wav b/narrative-audio-system/examples/03-01-02-02-01-01-11.wav new file mode 100644 index 0000000..a76b20a Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-01-02-04.wav b/narrative-audio-system/examples/03-01-02-02-01-02-04.wav new file mode 100644 index 0000000..2d1fbd7 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-01-02-11.wav b/narrative-audio-system/examples/03-01-02-02-01-02-11.wav new file mode 100644 index 0000000..cb8577e Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-02-01-04.wav b/narrative-audio-system/examples/03-01-02-02-02-01-04.wav new file mode 100644 index 0000000..ab6682b Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-02-01-11.wav b/narrative-audio-system/examples/03-01-02-02-02-01-11.wav new file mode 100644 index 0000000..bd79aa4 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-02-02-04.wav b/narrative-audio-system/examples/03-01-02-02-02-02-04.wav new file mode 100644 index 0000000..ff07231 Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-02-02-02-02-11.wav b/narrative-audio-system/examples/03-01-02-02-02-02-11.wav new file mode 100644 index 0000000..b0a07cc Binary files /dev/null and b/narrative-audio-system/examples/03-01-02-02-02-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-01-01-04.wav b/narrative-audio-system/examples/03-01-03-01-01-01-04.wav new file mode 100644 index 0000000..a1d9c87 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-01-01-11.wav b/narrative-audio-system/examples/03-01-03-01-01-01-11.wav new file mode 100644 index 0000000..9e338d5 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-01-02-04.wav b/narrative-audio-system/examples/03-01-03-01-01-02-04.wav new file mode 100644 index 0000000..49328ff Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-01-02-11.wav b/narrative-audio-system/examples/03-01-03-01-01-02-11.wav new file mode 100644 index 0000000..313b0eb Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-02-01-04.wav b/narrative-audio-system/examples/03-01-03-01-02-01-04.wav new file mode 100644 index 0000000..d5468d9 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-02-01-11.wav 
b/narrative-audio-system/examples/03-01-03-01-02-01-11.wav new file mode 100644 index 0000000..13a5dc1 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-02-02-04.wav b/narrative-audio-system/examples/03-01-03-01-02-02-04.wav new file mode 100644 index 0000000..90a35bd Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-01-02-02-11.wav b/narrative-audio-system/examples/03-01-03-01-02-02-11.wav new file mode 100644 index 0000000..c13275e Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-01-02-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-01-01-04.wav b/narrative-audio-system/examples/03-01-03-02-01-01-04.wav new file mode 100644 index 0000000..2f89e8d Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-01-01-11.wav b/narrative-audio-system/examples/03-01-03-02-01-01-11.wav new file mode 100644 index 0000000..4d501a8 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-01-02-04.wav b/narrative-audio-system/examples/03-01-03-02-01-02-04.wav new file mode 100644 index 0000000..12a2976 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-01-02-11.wav b/narrative-audio-system/examples/03-01-03-02-01-02-11.wav new file mode 100644 index 0000000..8e644c8 Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-02-01-04.wav b/narrative-audio-system/examples/03-01-03-02-02-01-04.wav new file mode 100644 index 0000000..8f9901b Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-02-01-11.wav b/narrative-audio-system/examples/03-01-03-02-02-01-11.wav new file mode 100644 index 0000000..fd362dd Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-02-02-04.wav b/narrative-audio-system/examples/03-01-03-02-02-02-04.wav new file mode 100644 index 0000000..4d4f94a Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-03-02-02-02-11.wav b/narrative-audio-system/examples/03-01-03-02-02-02-11.wav new file mode 100644 index 0000000..c09863c Binary files /dev/null and b/narrative-audio-system/examples/03-01-03-02-02-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-01-01-04.wav b/narrative-audio-system/examples/03-01-04-01-01-01-04.wav new file mode 100644 index 0000000..5939ff9 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-01-01-11.wav b/narrative-audio-system/examples/03-01-04-01-01-01-11.wav new file mode 100644 index 0000000..2651c65 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-01-02-04.wav b/narrative-audio-system/examples/03-01-04-01-01-02-04.wav new file mode 100644 index 
0000000..137f3ae Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-01-02-11.wav b/narrative-audio-system/examples/03-01-04-01-01-02-11.wav new file mode 100644 index 0000000..4eae1c6 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-02-01-04.wav b/narrative-audio-system/examples/03-01-04-01-02-01-04.wav new file mode 100644 index 0000000..3815618 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-02-01-11.wav b/narrative-audio-system/examples/03-01-04-01-02-01-11.wav new file mode 100644 index 0000000..ce2ae07 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-02-02-04.wav b/narrative-audio-system/examples/03-01-04-01-02-02-04.wav new file mode 100644 index 0000000..f0b30db Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-01-02-02-11.wav b/narrative-audio-system/examples/03-01-04-01-02-02-11.wav new file mode 100644 index 0000000..5718d7e Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-01-02-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-01-01-04.wav b/narrative-audio-system/examples/03-01-04-02-01-01-04.wav new file mode 100644 index 0000000..249333e Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-01-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-01-01-11.wav b/narrative-audio-system/examples/03-01-04-02-01-01-11.wav new file mode 100644 index 0000000..ba35e4b Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-01-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-01-02-04.wav b/narrative-audio-system/examples/03-01-04-02-01-02-04.wav new file mode 100644 index 0000000..50e6deb Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-01-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-01-02-11.wav b/narrative-audio-system/examples/03-01-04-02-01-02-11.wav new file mode 100644 index 0000000..fc446f6 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-01-02-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-02-01-04.wav b/narrative-audio-system/examples/03-01-04-02-02-01-04.wav new file mode 100644 index 0000000..e1b2ade Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-02-01-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-02-01-11.wav b/narrative-audio-system/examples/03-01-04-02-02-01-11.wav new file mode 100644 index 0000000..0c137f1 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-02-01-11.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-02-02-04.wav b/narrative-audio-system/examples/03-01-04-02-02-02-04.wav new file mode 100644 index 0000000..0732c00 Binary files /dev/null and b/narrative-audio-system/examples/03-01-04-02-02-02-04.wav differ diff --git a/narrative-audio-system/examples/03-01-04-02-02-02-11.wav b/narrative-audio-system/examples/03-01-04-02-02-02-11.wav new file mode 100644 index 0000000..4dcf209 Binary files /dev/null and 
b/narrative-audio-system/examples/03-01-04-02-02-02-11.wav differ
[Binary additions omitted: examples/03-01-05-*.wav, 03-01-06-*.wav, 03-01-07-*.wav, and 03-01-08-*.wav (RAVDESS clips for actors 04 and 11) plus examples/captured_audio.wav; each entry is a "Binary files differ" record with no textual diff.]
diff --git a/narrative-audio-system/examples/demo_output.srt b/narrative-audio-system/examples/demo_output.srt
new file mode 100644
index 0000000..7ee2ec5
--- /dev/null
+++ b/narrative-audio-system/examples/demo_output.srt
@@ -0,0 +1,16 @@
+1
+00:00:00,000 --> 00:00:02,500
+[calm] The forest was quiet that night.
+
+2
+00:00:03,100 --> 00:00:04,800
+[tense] Until the branch snapped.
+
+3
+00:00:05,200 --> 00:00:07,100
+[fearful] She ran without looking back.
+
+4
+00:00:20,000 --> 00:00:22,200
+[happy] The morning light brought relief.
+
diff --git a/narrative-audio-system/examples/emotion_labels.json b/narrative-audio-system/examples/emotion_labels.json
new file mode 100644
index 0000000..fb10867
--- /dev/null
+++ b/narrative-audio-system/examples/emotion_labels.json
@@ -0,0 +1,129 @@
+{
+  "03-01-01-01-01-01-01.wav": "neutral",
+  "03-01-01-01-01-01-04.wav": "neutral",
+  "03-01-01-01-01-01-11.wav": "neutral",
+  "03-01-01-01-01-02-01.wav": "neutral",
+  "03-01-01-01-01-02-04.wav": "neutral",
+  "03-01-01-01-01-02-11.wav": "neutral",
+  "03-01-01-01-02-01-01.wav": "neutral",
+  "03-01-01-01-02-01-04.wav": "neutral",
+  "03-01-01-01-02-01-11.wav": "neutral",
+  "03-01-01-01-02-02-01.wav": "neutral",
+  "03-01-01-01-02-02-04.wav": "neutral",
+  "03-01-01-01-02-02-11.wav": "neutral",
+  "03-01-02-01-01-01-01.wav": "calm",
+  "03-01-02-01-01-01-04.wav": "calm",
+  "03-01-02-01-01-01-11.wav": "calm",
+  "03-01-02-01-01-02-01.wav": "calm",
+  "03-01-02-01-01-02-04.wav": "calm",
+  "03-01-02-01-01-02-11.wav": "calm",
+  "03-01-02-01-02-01-01.wav": "calm",
+  "03-01-02-01-02-01-04.wav": "calm",
+  "03-01-02-01-02-01-11.wav": "calm",
+  "03-01-02-01-02-02-04.wav": "calm",
+  "03-01-02-01-02-02-11.wav": "calm",
+  "03-01-02-02-01-01-04.wav": "calm",
+  "03-01-02-02-01-01-11.wav": "calm",
+  "03-01-02-02-01-02-04.wav": "calm",
+  "03-01-02-02-01-02-11.wav": "calm",
+  "03-01-02-02-02-01-04.wav": "calm",
+  "03-01-02-02-02-01-11.wav": "calm",
+  "03-01-02-02-02-02-04.wav": "calm",
+  "03-01-02-02-02-02-11.wav": "calm",
+  "03-01-03-01-01-01-04.wav": "happy",
+  "03-01-03-01-01-01-11.wav": "happy",
+  "03-01-03-01-01-02-04.wav": "happy",
+  "03-01-03-01-01-02-11.wav": "happy",
+  "03-01-03-01-02-01-04.wav": "happy",
+  "03-01-03-01-02-01-11.wav": "happy",
+  "03-01-03-01-02-02-04.wav": "happy",
+  "03-01-03-01-02-02-11.wav": "happy",
+  "03-01-03-02-01-01-04.wav": "happy",
+  "03-01-03-02-01-01-11.wav": "happy",
+  "03-01-03-02-01-02-04.wav": "happy",
+  "03-01-03-02-01-02-11.wav": "happy",
+  "03-01-03-02-02-01-04.wav": "happy",
+  "03-01-03-02-02-01-11.wav": "happy",
+  "03-01-03-02-02-02-04.wav": "happy",
+  "03-01-03-02-02-02-11.wav": "happy",
+  "03-01-04-01-01-01-04.wav": "sad",
+  "03-01-04-01-01-01-11.wav": "sad",
+  "03-01-04-01-01-02-04.wav": "sad",
+  "03-01-04-01-01-02-11.wav": "sad",
+  "03-01-04-01-02-01-04.wav": "sad",
+  "03-01-04-01-02-01-11.wav": "sad",
+  "03-01-04-01-02-02-04.wav": "sad",
+  "03-01-04-01-02-02-11.wav": "sad",
+  "03-01-04-02-01-01-04.wav": "sad",
+  "03-01-04-02-01-01-11.wav": "sad",
+  "03-01-04-02-01-02-04.wav": "sad",
+  "03-01-04-02-01-02-11.wav": "sad",
+  "03-01-04-02-02-01-04.wav": "sad",
+  "03-01-04-02-02-01-11.wav": "sad",
+  "03-01-04-02-02-02-04.wav": "sad",
+  "03-01-04-02-02-02-11.wav": "sad",
+  "03-01-05-01-01-01-04.wav": "angry",
+  "03-01-05-01-01-01-11.wav": "angry",
+  "03-01-05-01-01-02-04.wav": "angry",
+  "03-01-05-01-01-02-11.wav": "angry",
+  "03-01-05-01-02-01-04.wav": "angry",
+  "03-01-05-01-02-01-11.wav": "angry",
+  "03-01-05-01-02-02-04.wav": "angry",
+  "03-01-05-01-02-02-11.wav": "angry",
+  "03-01-05-02-01-01-04.wav": "angry",
+  "03-01-05-02-01-01-11.wav": "angry",
+  "03-01-05-02-01-02-04.wav": "angry",
+  "03-01-05-02-01-02-11.wav": "angry",
+  "03-01-05-02-02-01-04.wav": "angry",
+  "03-01-05-02-02-01-11.wav": "angry",
+  "03-01-05-02-02-02-04.wav": "angry",
+  "03-01-05-02-02-02-11.wav": "angry",
+  "03-01-06-01-01-01-04.wav": "fearful",
+  "03-01-06-01-01-01-11.wav": "fearful",
+  "03-01-06-01-01-02-04.wav": "fearful",
+  "03-01-06-01-01-02-11.wav": "fearful",
+  "03-01-06-01-02-01-04.wav": "fearful",
+  "03-01-06-01-02-01-11.wav": "fearful",
+  "03-01-06-01-02-02-04.wav": "fearful",
+  "03-01-06-01-02-02-11.wav": "fearful",
+  "03-01-06-02-01-01-04.wav": "fearful",
+  "03-01-06-02-01-01-11.wav": "fearful",
+  "03-01-06-02-01-02-04.wav": "fearful",
+  "03-01-06-02-01-02-11.wav": "fearful",
+  "03-01-06-02-02-01-04.wav": "fearful",
+  "03-01-06-02-02-01-11.wav": "fearful",
+  "03-01-06-02-02-02-04.wav": "fearful",
+  "03-01-06-02-02-02-11.wav": "fearful",
+  "03-01-07-01-01-01-04.wav": "disgust",
+  "03-01-07-01-01-01-11.wav": "disgust",
+  "03-01-07-01-01-02-04.wav": "disgust",
+  "03-01-07-01-01-02-11.wav": "disgust",
+  "03-01-07-01-02-01-04.wav": "disgust",
+  "03-01-07-01-02-01-11.wav": "disgust",
+  "03-01-07-01-02-02-04.wav": "disgust",
+  "03-01-07-01-02-02-11.wav": "disgust",
+  "03-01-07-02-01-01-04.wav": "disgust",
+  "03-01-07-02-01-01-11.wav": "disgust",
+  "03-01-07-02-01-02-04.wav": "disgust",
+  "03-01-07-02-01-02-11.wav": "disgust",
+  "03-01-07-02-02-01-04.wav": "disgust",
+  "03-01-07-02-02-01-11.wav": "disgust",
+  "03-01-07-02-02-02-04.wav": "disgust",
+  "03-01-07-02-02-02-11.wav": "disgust",
+  "03-01-08-01-01-01-04.wav": "surprised",
+  "03-01-08-01-01-01-11.wav": "surprised",
+  "03-01-08-01-01-02-04.wav": "surprised",
+  "03-01-08-01-01-02-11.wav": "surprised",
+  "03-01-08-01-02-01-04.wav": "surprised",
+  "03-01-08-01-02-01-11.wav": "surprised",
+  "03-01-08-01-02-02-04.wav": "surprised",
+  "03-01-08-01-02-02-11.wav": "surprised",
+  "03-01-08-02-01-01-04.wav": "surprised",
+  "03-01-08-02-01-01-11.wav": "surprised",
+  "03-01-08-02-01-02-04.wav": "surprised",
+  "03-01-08-02-01-02-11.wav": "surprised",
+  "03-01-08-02-02-01-04.wav": "surprised",
+  "03-01-08-02-02-01-11.wav": "surprised",
+  "03-01-08-02-02-02-04.wav": "surprised"
+}
[Binary additions omitted: examples/input2.wav and the preprocessed copies under examples/normalized_audio/ (03-01-01-*.wav through 03-01-08-*.wav, input2.wav, processed_audio.wav, sample_audio.wav); each entry is a "Binary files differ" record with no textual diff.]
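The clips under `examples/normalized_audio/` mirror the raw recordings after Task 1 preprocessing. The actual logic lives in `task1_audio_pipeline/audio_pipeline.py`; purely as an illustration of the idea, a hypothetical peak-normalization pass might look like this (the function name, target peak, and the `soundfile` dependency are assumptions, not the repository's implementation):

```python
import numpy as np
import soundfile as sf

def peak_normalize(in_path: str, out_path: str, peak: float = 0.95) -> None:
    """Scale a clip so its largest sample sits at `peak` of full scale."""
    audio, sr = sf.read(in_path, dtype="float32")
    max_amp = np.max(np.abs(audio))
    if max_amp > 0:  # avoid dividing by zero on silent clips
        audio = audio * (peak / max_amp)
    sf.write(out_path, audio, sr)

peak_normalize("examples/sample_audio.wav",
               "examples/normalized_audio/sample_audio.wav")
```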
diff --git a/narrative-audio-system/examples/output.srt b/narrative-audio-system/examples/output.srt
new file mode 100644
index 0000000..242bef6
--- /dev/null
+++ b/narrative-audio-system/examples/output.srt
@@ -0,0 +1,16 @@
+1
+00:00:01,000 --> 00:00:03,740
+[Disgust] Dogs are sitting by the door.
+
+2
+00:00:05,440 --> 00:00:07,700
+[Angry] Kids are talking about a door.
+
+3
+00:00:09,600 --> 00:00:12,160
+[Angry] Kids are talking by the door.
+
+4
+00:00:13,880 --> 00:00:16,540
+[Angry] Kids are talking by the door.
+
diff --git a/narrative-audio-system/examples/output_angry.srt b/narrative-audio-system/examples/output_angry.srt
new file mode 100644
index 0000000..e813cef
--- /dev/null
+++ b/narrative-audio-system/examples/output_angry.srt
@@ -0,0 +1,4 @@
+1
+00:00:00,780 --> 00:00:03,000
+[Happy] Dogs are sitting by the door.
+
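Both `.srt` artifacts pair Task 3 transcripts with Task 2 tone tags in standard SubRip form: a numeric cue index, a `HH:MM:SS,mmm --> HH:MM:SS,mmm` time range, and a `[emotion] text` payload line. A minimal sketch of the timestamp arithmetic and cue layout follows; the function names are illustrative rather than the repository's API:

```python
def srt_timestamp(seconds: float) -> str:
    """Format seconds as the SubRip HH:MM:SS,mmm timestamp."""
    ms = round(seconds * 1000)
    h, ms = divmod(ms, 3_600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1_000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def write_srt(cues, path: str) -> None:
    """cues: iterable of (start_s, end_s, emotion, text) tuples."""
    with open(path, "w", encoding="utf-8") as f:
        for i, (start, end, emotion, text) in enumerate(cues, start=1):
            f.write(f"{i}\n{srt_timestamp(start)} --> {srt_timestamp(end)}\n")
            f.write(f"[{emotion}] {text}\n\n")

# Reproduces the first cue of output.srt above.
write_srt([(1.0, 3.74, "Disgust", "Dogs are sitting by the door.")], "out.srt")
```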
diff --git a/narrative-audio-system/examples/processed_audio.wav b/narrative-audio-system/examples/processed_audio.wav
new file mode 100644
index 0000000..50dce86
Binary files /dev/null and b/narrative-audio-system/examples/processed_audio.wav differ
diff --git a/narrative-audio-system/examples/sample_audio.wav b/narrative-audio-system/examples/sample_audio.wav
new file mode 100644
index 0000000..abf3978
Binary files /dev/null and b/narrative-audio-system/examples/sample_audio.wav differ
diff --git a/narrative-audio-system/examples/storytelling_analysis.csv b/narrative-audio-system/examples/storytelling_analysis.csv
new file mode 100644
index 0000000..585edfb
--- /dev/null
+++ b/narrative-audio-system/examples/storytelling_analysis.csv
@@ -0,0 +1,9 @@
+filename,transcript,duration_seconds,tempo_bpm,pause_ratio,pause_events,pitch_mean_hz,pitch_std_hz,energy_mean,energy_std,energy_dynamic_range,word_count,sentence_count,avg_sentence_words,max_sentence_words,storytelling_score
+03-01-01-01-01-01-01.wav,Kids are talking by the door.,3.3033125,156.25,0.9516908212560387,3,267.0305984870316,142.3334142857007,0.0021737448405474424,0.003347895573824644,0.006854305077217761,6,1,6.0,6,49.65
+03-01-01-01-01-01-04.wav,Kids are talking by the door.,3.3033125,144.23076923076923,0.9420289855072463,3,321.2472546577193,98.68349404954499,0.0023810872808098793,0.003555573523044586,0.008169263228774072,6,1,6.0,6,29.88
+03-01-01-01-01-01-11.wav,Kids are talking by the door.,3.1365,170.45454545454547,0.9898477157360406,1,182.08479750808337,135.69343514278123,0.0017422254895791411,0.0024538985453546047,0.005624081086716618,6,1,6.0,6,48.22
+03-01-01-01-01-02-01.wav,Kids are talking by the door.,3.3366875,144.23076923076923,0.9473684210526315,3,257.914998939382,146.12262952905198,0.0023219622671604156,0.003586930688470602,0.007503410851813899,6,1,6.0,6,56.52
+03-01-01-01-01-02-04.wav,Kids are talking by the door.,3.3700625,144.23076923076923,0.9383886255924171,3,308.02964854572264,106.9491687631,0.0026571406051516533,0.0038461738731712103,0.008434683084487915,6,1,6.0,6,36.36
+03-01-01-01-01-02-11.wav,Kids are talking by the door.,3.103125,170.45454545454547,0.9639175257731959,2,147.45442111024389,118.9600294179293,0.0024527707137167454,0.003387857461348176,0.008178922370461809,6,1,6.0,6,51.37
+03-01-01-01-02-01-01.wav,Dogs are sitting by the door.,3.2699375,125.0,0.9365853658536586,2,272.5633234391762,143.07343874895489,0.002737249480560422,0.004259149543941021,0.008850548467989938,6,1,6.0,6,63.02
+03-01-01-01-02-01-04.wav,Dogs are sitting by the door.,3.2699375,110.29411764705883,0.9463414634146341,1,326.6884042851927,92.25911778869441,0.0024951701052486897,0.003717334009706974,0.007492591347545385,6,1,6.0,6,21.04
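`storytelling_analysis.csv` above records, per clip, the prosodic measurements that the bonus heuristic scores against: duration, tempo, pause statistics, pitch mean and variation, energy dynamics, simple word/sentence counts, and the resulting `storytelling_score`. As a sketch of how two of those column groups could be measured with `librosa` — not the repository's exact definitions; the silence threshold, pitch search range, and function name here are assumptions:

```python
import numpy as np
import librosa

def pause_and_pitch_stats(path: str, silence_db: float = -40.0):
    """Rough recreation of two storytelling_analysis.csv column groups:
    a pause ratio (fraction of frames below an energy threshold) and
    pitch mean/std in Hz over voiced frames."""
    y, sr = librosa.load(path, sr=16000)

    # Energy-based silence detection: frames quieter than `silence_db`
    # relative to the clip's peak RMS count as pauses.
    rms = librosa.feature.rms(y=y)[0]
    db = librosa.amplitude_to_db(rms, ref=np.max)
    pause_ratio = float(np.mean(db < silence_db))

    # Fundamental-frequency track; statistics over voiced frames only.
    f0, voiced, _ = librosa.pyin(y, fmin=65.0, fmax=400.0, sr=sr)
    voiced_f0 = f0[voiced & ~np.isnan(f0)]
    if voiced_f0.size == 0:
        return pause_ratio, float("nan"), float("nan")
    return pause_ratio, float(np.mean(voiced_f0)), float(np.std(voiced_f0))
```

Thresholding frame-level RMS in decibels relative to the clip's own peak makes the pause measure robust to absolute recording level, which matters when clips come from different RAVDESS actors.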
diff --git a/narrative-audio-system/examples/task1_features_dataset.csv b/narrative-audio-system/examples/task1_features_dataset.csv
new file mode 100644
index 0000000..07dfadd
--- /dev/null
+++ b/narrative-audio-system/examples/task1_features_dataset.csv
@@ -0,0 +1,266 @@
+filename,segment_index,segment_start_seconds,segment_end_seconds,pitch_mean_hz,spectral_centroid_mean_hz,energy_rms_mean,duration_seconds,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13
+03-01-01-01-01-01-01.wav,0,0.0,2.0,178.588885836821,2724.6734402441316,0.08016898483037949,2.0,-326.23321533203125,46.76079559326172,-0.46243584156036377,20.338775634765625,3.1789629459381104,-4.1954450607299805,-8.44412612915039,-13.19424057006836,-10.783696174621582,1.5778523683547974,-6.739585876464844,0.22726143896579742,-7.01843786239624
+03-01-01-01-01-01-01.wav,1,2.0,3.3033125,87.48215024615514,2355.5668434582867,0.02324485033750534,1.3033125,-493.8385009765625,69.29346466064453,4.9426679611206055,8.456997871398926,5.350954055786133,1.4756207466125488,4.339920997619629,-6.2479753494262695,-1.7090104818344116,3.527707815170288,0.032793693244457245,6.992000579833984,-2.1898324489593506
+03-01-01-01-01-01-04.wav,0,0.0,2.0,316.78836709916897,2709.758680795267,0.08762798458337784,2.0,-321.6363525390625,24.70099449157715,-10.235174179077148,8.326663970947266,-16.84180450439453,-4.63582181930542,-19.308855056762695,-9.12726879119873,-7.405836582183838,-0.9235712289810181,-10.784112930297852,-3.315835475921631,-1.3973186016082764
+03-01-01-01-01-01-04.wav,1,2.0,3.3033125,142.48250799882393,2277.579101957611,0.02466418966650963,1.3033125,-450.1189880371094,72.01712799072266,-3.4863929748535156,10.673605918884277,-0.7182149887084961,0.3517339825630188,-10.442238807678223,-8.971763610839844,-4.048787593841553,-4.386592388153076,-7.805049896240234,0.8120966553688049,-3.6503689289093018
+03-01-01-01-01-01-11.wav,0,0.0,2.0,185.9743336899019,1939.105130920993,0.09078972041606903,2.0,-315.00091552734375,57.425174713134766,21.265771865844727,23.48418426513672,2.551715612411499,5.678776264190674,-0.7073147296905518,-0.4354777932167053,-4.176716327667236,4.403389930725098,1.7731071710586548,-0.004588176496326923,0.8076240420341492
+03-01-01-01-01-01-11.wav,1,2.0,3.1365,92.13570728776239,1708.925195878347,0.018418453633785248,1.1365,-477.0003356933594,81.51944732666016,25.80394172668457,20.599092483520508,14.149641990661621,15.12916374206543,9.63064193725586,4.705631732940674,5.28181266784668,8.328924179077148,1.461713194847107,3.990903615951538,3.379585027694702
+03-01-01-01-01-02-01.wav,0,0.0,2.0,131.2770224679088,2481.1815515482535,0.0739835873246193,2.0,-335.97393798828125,42.35063552856445,-3.4928717613220215,26.418832778930664,1.036199688911438,-0.49771609902381897,-8.30732250213623,-13.273736000061035,-6.122884273529053,2.304130792617798,-6.418825149536133,0.49388179183006287,-6.682273864746094
+03-01-01-01-01-02-01.wav,1,2.0,3.3366875,104.55112590187242,2291.9722585589543,0.020772716030478477,1.3366875,-495.6155090332031,69.77498626708984,1.233256459236145,11.637816429138184,3.9091978073120117,0.9101210832595825,1.3218164443969727,-10.166982650756836,-4.045330047607422,4.0963616371154785,-0.6898884177207947,5.145030975341797,-2.0149753093719482
+03-01-01-01-01-02-04.wav,0,0.0,2.0,290.1477577865435,1685.450499419671,0.08503424376249313,2.0,-345.0837097167969,27.671024322509766,-10.05530834197998,4.101249694824219,-20.780668258666992,-8.73783016204834,-22.114398956298828,-13.651819229125977,-10.14856243133545,-0.9894282817840576,-11.550782203674316,-2.577463388442993,-2.1706864833831787
+03-01-01-01-01-02-04.wav,1,2.0,3.3700625,134.29729477648291,2358.816979736315,0.02955794148147106,1.3700625,-449.63739013671875,62.877723693847656,-4.784057140350342,6.172255516052246,-3.4298031330108643,-1.0655124187469482,-11.0552978515625,-12.291620254516602,-5.703583717346191,-6.278961181640625,-10.435406684875488,-3.0884058475494385,-6.013157844543457
+03-01-01-01-01-02-11.wav,0,0.0,2.0,128.19870266282328,2015.0590423828792,0.11080612987279892,2.0,-301.3302917480469,55.628658294677734,16.976951599121094,20.359066009521484,5.420993328094482,8.218027114868164,0.2884330749511719,-3.5020909309387207,-3.7285118103027344,6.403403282165527,0.15825892984867096,-1.9494662284851074,-0.27465757727622986
+03-01-01-01-01-02-11.wav,1,2.0,3.103125,84.03287907085044,1591.4213622600305,0.031760282814502716,1.103125,-444.5193786621094,87.44483947753906,24.562089920043945,16.163915634155273,8.657357215881348,11.020808219909668,8.709056854248047,2.9456098079681396,3.7296438217163086,2.7054455280303955,0.008344732224941254,4.461965084075928,7.193838596343994
+03-01-01-01-02-01-01.wav,0,0.0,2.0,121.24239877356472,2442.9315875220277,0.06772448867559433,2.0,-351.25482177734375,50.28335952758789,0.1955319494009018,20.799560546875,-1.9057170152664185,0.31869304180145264,-10.329431533813477,-12.052262306213379,-8.281633377075195,0.553893506526947,-5.395098686218262,0.32533881068229675,-7.054675102233887
+03-01-01-01-02-01-01.wav,1,2.0,3.2699375,128.50312756366816,2295.219470390889,0.0177219919860363,1.2699375,-521.1871337890625,65.41764068603516,0.7151052355766296,6.912772178649902,6.200511932373047,-0.1257707178592682,1.0562329292297363,-8.460538864135742,-6.824549198150635,3.908190965652466,-1.3207035064697266,6.446503639221191,-3.9856643676757812
+03-01-01-01-02-01-04.wav,0,0.0,2.0,318.3427915887052,1151.6866102806891,0.06560920923948288,2.0,-380.4938659667969,34.58183670043945,-10.320574760437012,-1.5566487312316895,-15.965919494628906,-6.94363260269165,-22.616596221923828,-12.029969215393066,-10.835005760192871,-3.5027732849121094,-14.136722564697266,-3.655608654022217,-3.8451080322265625
+03-01-01-01-02-01-04.wav,1,2.0,3.2699375,145.9341251205826,2477.327363396565,0.017321432009339333,1.2699375,-503.62030029296875,64.60649871826172,-7.688084602355957,5.092654228210449,1.0711842775344849,-1.3634650707244873,-12.065872192382812,-9.213019371032715,-4.304131507873535,-6.396979808807373,-8.4738187789917,-3.756631374359131,-6.6620306968688965
+03-01-01-01-02-01-11.wav,0,0.0,2.0,109.28382630848725,2063.157075771337,0.10774519294500351,2.0,-284.5903015136719,49.1012077331543,21.660179138183594,24.106582641601562,1.5176997184753418,5.762819290161133,-1.9542316198349,-2.8181118965148926,-3.5353951454162598,3.643528938293457,-3.725379467010498,0.12117968499660492,-2.7272071838378906
+03-01-01-01-02-01-11.wav,1,2.0,3.20325,103.35284287803593,1657.6010748231265,0.03518737852573395,1.20325,-444.195068359375,84.62288665771484,23.60184097290039,16.349925994873047,13.956123352050781,11.243021965026855,6.632582664489746,0.5195470452308655,4.7422990798950195,6.172863960266113,1.7384328842163086,2.332470655441284,2.5655264854431152
+03-01-01-01-02-02-01.wav,0,0.0,2.0,141.70341711167376,2364.9080431494012,0.0599389486014843,2.0,-350.62457275390625,46.28947067260742,1.5573328733444214,20.285804748535156,1.6168731451034546,2.3464956283569336,-14.012611389160156,-15.251558303833008,-8.0021390914917,1.3913860321044922,-4.821979522705078,0.04383794963359833,-6.812401294708252
+03-01-01-01-02-02-01.wav,1,2.0,3.169875,80.1928000092652,2151.8790813499645,0.012474696151912212,1.169875,-531.572265625,79.01878356933594,7.294417858123779,10.865447998046875,8.253436088562012,4.2535600662231445,3.5569705963134766,-3.940551280975342,-5.758218288421631,3.479810953140259,2.337900400161743,3.3677244186401367,-0.8547723889350891
+03-01-01-01-02-02-04.wav,0,0.0,2.0,299.8690767504587,1305.1008409481688,0.07042372226715088,2.0,-353.8250732421875,35.57277297973633,-7.138857841491699,-1.7167699337005615,-14.928560256958008,-7.118234634399414,-22.73600959777832,-13.617918014526367,-11.278852462768555,-3.201580762863159,-13.644271850585938,-4.272712707519531,-3.231738567352295
+03-01-01-01-02-02-04.wav,1,2.0,3.2365625,128.52517268287082,2491.3004366687874,0.019141117110848427,1.2365625,-481.64154052734375,62.53649139404297,-4.082533836364746,5.5055670738220215,-1.3391047716140747,-0.4219205677509308,-10.036417961120605,-6.144951820373535,-3.8124101161956787,-4.521069049835205,-8.86375904083252,-2.895448923110962,-5.71052360534668
+03-01-01-01-02-02-11.wav,0,0.0,2.0,130.9665493163101,2035.6566873311144,0.113831527531147,2.0,-288.7147216796875,54.08222198486328,27.891284942626953,27.316770553588867,6.4252095222473145,7.880927562713623,-2.7461607456207275,-1.7530049085617065,-1.0458862781524658,8.648914337158203,-2.003612518310547,0.6410534381866455,-0.6825321912765503
+03-01-01-01-02-02-11.wav,1,2.0,3.169875,105.58813892157193,1161.716378099626,0.041574876755476,1.169875,-405.6978454589844,93.92706298828125,28.220314025878906,23.556245803833008,15.337194442749023,9.840193748474121,9.304327011108398,2.67087984085083,4.725010395050049,7.604446887969971,0.9459906816482544,1.9424113035202026,1.0348501205444336
+03-01-02-01-01-01-01.wav,0,0.0,2.0,138.97319832291865,2624.6754185315067,0.07403399795293808,2.0,-332.9742431640625,39.84423828125,7.872986316680908,19.393980026245117,1.4974530935287476,-1.6170400381088257,-7.1613450050354,-6.411840438842773,-9.161490440368652,0.16758504509925842,-5.352791786193848,-0.33697542548179626,-4.417688369750977
+03-01-02-01-01-01-01.wav,1,2.0,3.536875,98.92135608826861,2091.6228260674125,0.0529865100979805,1.536875,-392.40667724609375,81.6540298461914,-2.6018528938293457,14.290205955505371,7.265993595123291,1.2801817655563354,-2.9481027126312256,-10.732390403747559,-8.74203109741211,4.819553375244141,-3.209376096725464,3.7413063049316406,-5.8930745124816895
+03-01-02-01-01-01-04.wav,0,0.0,2.0,272.9133235236289,1777.352865101984,0.07753674685955048,2.0,-332.59381103515625,25.029598236083984,-7.258495330810547,3.0362296104431152,-21.99466323852539,-7.603668689727783,-18.55204200744629,-14.375964164733887,-12.465581893920898,-1.2492984533309937,-11.58926010131836,-2.942899703979492,-3.3411173820495605
+03-01-02-01-01-01-04.wav,1,2.0,3.8038125,163.75594972807733,2049.480702064087,0.051248010247945786,1.8038125,-352.64117431640625,72.19734954833984,-17.779333114624023,2.164456605911255,-7.5112409591674805,-3.752314805984497,-17.558618545532227,-10.216546058654785,-3.562737464904785,-2.9070003032684326,-11.9208984375,0.345994234085083,-3.9198648929595947
+03-01-02-01-01-01-11.wav,0,0.0,2.0,101.52784244646101,2128.2676998891748,0.09894546866416931,2.0,-281.92657470703125,52.69913864135742,22.722888946533203,25.962116241455078,12.758915901184082,10.74604606628418,1.7391350269317627,2.041903257369995,-1.1037626266479492,9.063058853149414,-0.47275885939598083,-1.2772489786148071,0.4613531827926636
+03-01-02-01-01-01-11.wav,1,2.0,3.2699375,83.43902919778931,1491.3442595759632,0.04361274093389511,1.2699375,-351.38641357421875,81.5423583984375,23.70468521118164,24.880598068237305,13.469842910766602,18.955360412597656,10.317151069641113,2.6286773681640625,8.414249420166016,10.436882019042969,0.16896286606788635,3.2192306518554688,4.266334533691406
+03-01-02-01-01-02-01.wav,0,0.0,2.0,139.3959130413348,2558.1562440135413,0.06657815724611282,2.0,-342.43927001953125,46.132389068603516,5.946919918060303,17.103506088256836,1.9655689001083374,-2.7409212589263916,-10.364460945129395,-5.729950904846191,-8.849225997924805,1.2454606294631958,-4.618634223937988,-1.7397279739379883,-4.078120708465576
+03-01-02-01-01-02-01.wav,1,2.0,3.603625,128.65116654084173,2042.5335623587475,0.04128437861800194,1.603625,-413.7818603515625,87.6463623046875,-7.557804107666016,14.197264671325684,8.96012020111084,-1.3465089797973633,-2.3055059909820557,-10.258780479431152,-7.957047462463379,4.471856594085693,-2.9266607761383057,3.2877166271209717,-7.445009708404541
+03-01-02-01-01-02-04.wav,0,0.0,2.0,270.2214688555103,1882.7660455086768,0.10667797923088074,2.0,-295.7073059082031,28.13778305053711,-14.445466995239258,5.166926383972168,-15.004693984985352,-7.947369575500488,-18.262483596801758,-10.358986854553223,-10.461140632629395,2.01426362991333,-11.17650318145752,0.07256702333688736,-2.3685402870178223
+03-01-02-01-01-02-04.wav,1,2.0,3.7370625,189.2715991475877,2038.8166138985391,0.08541154861450195,1.7370625,-312.68597412109375,66.38277435302734,-21.830142974853516,5.374668598175049,-9.187885284423828,-3.316880702972412,-16.529966354370117,-8.06390380859375,-2.572246551513672,-4.426701068878174,-13.073153495788574,-2.2640392780303955,-4.34673547744751
+03-01-02-01-01-02-11.wav,0,0.0,2.0,145.02851855652315,1924.6445942741343,0.11592262983322144,2.0,-258.25653076171875,60.07208251953125,22.365331649780273,29.872304916381836,12.573526382446289,7.998469352722168,-0.190562903881073,0.49577757716178894,-2.685732841491699,8.041031837463379,-3.599891424179077,-0.06675325334072113,0.29896804690361023
+03-01-02-01-01-02-11.wav,1,2.0,3.3366875,80.4388909348587,1379.8761057931702,0.06885460019111633,1.3366875,-330.2194519042969,92.45807647705078,25.575876235961914,26.00798988342285,18.652647018432617,21.07358741760254,5.598350524902344,3.1858484745025635,5.913046360015869,8.08588695526123,2.9743595123291016,1.2409536838531494,1.2088737487792969
+03-01-02-01-02-01-01.wav,0,0.0,2.0,141.84115940614043,2681.2680985117086,0.09061802178621292,2.0,-339.3749694824219,45.85186004638672,4.307216167449951,16.08075523376465,-1.7307237386703491,5.010533809661865,-9.430890083312988,-6.8915581703186035,-9.775934219360352,0.22212204337120056,-2.6577115058898926,0.04814799502491951,-5.171707630157471
+03-01-02-01-02-01-01.wav,1,2.0,3.5035,110.58649723831904,2069.5827664663834,0.040294498205184937,1.5035,-420.68212890625,83.07872009277344,-5.624032020568848,13.506547927856445,7.647958755493164,-1.4172641038894653,-1.4604969024658203,-11.222357749938965,-5.668199062347412,6.750625133514404,-1.625653862953186,2.1960272789001465,-5.587202548980713
+03-01-02-01-02-01-04.wav,0,0.0,2.0,310.5959813046325,1650.8861145320082,0.11561737209558487,2.0,-315.6680603027344,21.62952995300293,-5.662458896636963,-1.155848503112793,-17.029342651367188,-5.518489360809326,-20.006725311279297,-12.23824691772461,-11.161088943481445,-2.012948989868164,-10.379551887512207,-2.019110679626465,-6.92291784286499
+03-01-02-01-02-01-04.wav,1,2.0,3.637,180.15393939103527,1882.0603573888634,0.09185730665922165,1.637,-306.3761901855469,70.70098876953125,-5.880964279174805,10.212546348571777,-4.839133262634277,-1.0759228467941284,-13.395913124084473,-7.792226314544678,-2.030442237854004,-1.3887914419174194,-6.8379411697387695,2.807562828063965,-2.7535593509674072
+03-01-02-01-02-01-11.wav,0,0.0,2.0,149.6241226087458,2284.7214306827655,0.08602135628461838,2.0,-316.6400451660156,52.977684020996094,26.69873809814453,19.27825164794922,3.419567584991455,7.164426803588867,-6.027472019195557,1.641207218170166,-1.9351872205734253,8.457205772399902,-3.005784273147583,-0.6950250864028931,-2.0671136379241943
+03-01-02-01-02-01-11.wav,1,2.0,3.3366875,123.3756322306829,1725.9067675723709,0.023496873676776886,1.3366875,-436.1048583984375,82.02992248535156,22.332927703857422,25.457868576049805,14.542909622192383,15.835136413574219,1.3828500509262085,-0.9877203106880188,3.873716354370117,6.823614120483398,2.023822069168091,1.438289999961853,0.6548346877098083
+03-01-02-01-02-02-04.wav,0,0.0,2.0,262.52007075631667,2203.070387739065,0.11210247874259949,2.0,-288.60394287109375,38.6985969543457,-10.415671348571777,-2.7527782917022705,-14.222702980041504,-7.155354976654053,-29.132009506225586,-17.745718002319336,-8.85026741027832,-3.5252127647399902,-10.035228729248047,-4.986659049987793,0.4648420214653015
+03-01-02-01-02-02-04.wav,1,2.0,3.57025,204.80070069171575,2161.67782045035,0.05875110626220703,1.57025,-324.8187561035156,65.45091247558594,-23.146589279174805,2.8886351585388184,-7.138541221618652,-3.556797504425049,-19.641263961791992,-5.954727649688721,-2.7085201740264893,-6.333738327026367,-12.395163536071777,-2.8981244564056396,-4.9603962898254395
+03-01-02-01-02-02-11.wav,0,0.0,2.0,155.39439786855485,2395.936896577372,0.11438129097223282,2.0,-277.3493347167969,42.9885368347168,32.41060256958008,24.519878387451172,12.800294876098633,9.432316780090332,-4.430509567260742,0.721528947353363,-4.841867923736572,8.509064674377441,-0.0336187444627285,-2.437434673309326,-4.113060474395752
+03-01-02-01-02-02-11.wav,1,2.0,3.670375,77.26105989363076,1550.1733840347044,0.06866852939128876,1.670375,-313.7107238769531,83.7369155883789,17.779911041259766,19.307220458984375,17.34206199645996,11.671277046203613,11.165751457214355,3.426084041595459,3.954230785369873,15.236869812011719,2.0482349395751953,1.9934430122375488,0.6718320250511169
+03-01-02-02-01-01-04.wav,0,0.0,2.0,300.71352611366694,2179.0214368067395,0.08617118000984192,2.0,-320.3878173828125,13.791049003601074,-6.761970520019531,1.5979595184326172,-11.763920783996582,-5.621468544006348,-14.032815933227539,-7.287460803985596,-9.224806785583496,3.501925468444824,-11.065774917602539,-0.3672276437282562,-4.105432510375977
+03-01-02-02-01-01-04.wav,1,2.0,3.970625,187.71292422381066,1868.053583755135,0.0756058618426323,1.970625,-310.18377685546875,72.42354583740234,-15.378680229187012,0.06124235689640045,-10.133023262023926,-6.15255880355835,-21.334209442138672,-12.457300186157227,-4.628190994262695,-7.550532817840576,-15.411029815673828,-5.499993324279785,-6.739752769470215
+03-01-02-02-01-01-11.wav,0,0.0,2.0,138.3024416374878,2397.839702301931,0.08779377490282059,2.0,-274.70208740234375,42.055789947509766,25.605436325073242,28.377506256103516,10.695869445800781,7.69234561920166,-0.801300048828125,2.852418899536133,-1.2179495096206665,8.087082862854004,-1.1673705577850342,1.6694869995117188,2.120560884475708
+03-01-02-02-01-01-11.wav,1,2.0,3.8371875,115.33203716670627,1403.99522068885,0.06198151409626007,1.8371875,-315.3460388183594,98.78353881835938,13.272139549255371,18.95085906982422,17.50581169128418,10.94090747833252,9.27294921875,1.4211710691452026,3.484584093093872,14.567352294921875,3.544367551803589,-1.9847428798675537,0.5652109384536743
+03-01-02-02-01-02-04.wav,0,0.0,2.0,221.16240503334987,2519.2228096299186,0.09850332885980606,2.0,-254.57907104492188,34.84590530395508,-19.06911849975586,4.767786979675293,-15.232219696044922,-5.850257873535156,-27.345733642578125,-13.50413990020752,-8.265342712402344,-1.106380820274353,-10.374796867370605,1.6108026504516602,-0.6253413558006287
+03-01-02-02-01-02-04.wav,1,2.0,3.8038125,168.77154347374332,1942.384196461203,0.06314166635274887,1.8038125,-332.4313659667969,77.4096908569336,-22.457992553710938,6.189947128295898,-5.811801910400391,-8.60655403137207,-25.602312088012695,-13.218365669250488,-4.1051716804504395,-6.805030345916748,-15.565406799316406,-1.2130703926086426,-4.33976411819458
+03-01-02-02-01-02-11.wav,0,0.0,2.0,138.83889472533576,2362.5550765701614,0.07832021266222,2.0,-292.605712890625,37.84592819213867,20.251901626586914,29.16940689086914,9.314868927001953,7.549283504486084,2.1079118251800537,5.195241928100586,-2.945939779281616,7.801243782043457,-0.37221792340278625,0.3588894307613373,1.386872410774231
+03-01-02-02-01-02-11.wav,1,2.0,3.737125,143.22741223821558,1775.8991737316687,0.050129689276218414,1.737125,-347.1787414550781,71.75977325439453,20.33930206298828,21.017681121826172,16.627246856689453,15.946891784667969,6.869697570800781,4.088644027709961,-0.030933119356632233,11.360662460327148,-3.203401565551758,1.29893159866333,-0.8159822225570679
+03-01-02-02-02-01-04.wav,0,0.0,2.0,274.87639900088465,2200.6896958434986,0.11122656613588333,2.0,-271.47125244140625,27.852115631103516,-4.5541462898254395,7.196249008178711,-16.301618576049805,-3.521852493286133,-27.102636337280273,-10.935294151306152,-9.355050086975098,-3.3283562660217285,-6.725698471069336,0.5577610731124878,0.15538620948791504
+03-01-02-02-02-01-04.wav,1,2.0,3.8038125,221.8391990907985,1960.1476896299712,0.06817680597305298,1.8038125,-281.292724609375,74.20549774169922,-26.286096572875977,1.356717586517334,-8.882223129272461,-3.8232553005218506,-24.38109016418457,-4.919862747192383,-2.0476584434509277,-2.5717380046844482,-16.416013717651367,0.46644753217697144,-5.086089611053467
+03-01-02-02-02-01-11.wav,0,0.0,2.0,130.25391520639812,2134.9915509608973,0.10139583051204681,2.0,-287.2449035644531,44.729549407958984,29.335697174072266,25.37057876586914,10.006050109863281,9.598000526428223,0.32185477018356323,5.240334510803223,0.3114464282989502,8.830157279968262,0.1611434519290924,1.26975417137146,0.24611887335777283
+03-01-02-02-02-01-11.wav,1,2.0,3.7704375,94.25671687507104,1605.9025727251706,0.05858486890792847,1.7704375,-324.0408630371094,80.87618255615234,16.864273071289062,22.99178695678711,16.952774047851562,10.460261344909668,8.582651138305664,-0.010029622353613377,1.461936593055725,12.326231002807617,1.8484876155853271,0.6534474492073059,1.4487583637237549
+03-01-02-02-02-02-04.wav,0,0.0,2.0,277.8418813090929,3040.198448511092,0.08680131286382675,2.0,-348.2205505371094,20.195640563964844,1.137039303779602,-7.289501190185547,-13.563227653503418,-6.642622470855713,-24.37421989440918,-13.076068878173828,-9.835598945617676,-4.876302242279053,-13.644156455993652,-1.121888279914856,-9.83757495880127
+03-01-02-02-02-02-04.wav,1,2.0,4.0,164.87167999657652,2133.397577331089,0.057902269065380096,2.0,-322.66363525390625,68.20384979248047,-15.833243370056152,5.61210298538208,-10.412519454956055,-12.630175590515137,-18.371458053588867,-6.348515033721924,-6.293242454528809,-8.225470542907715,-15.854364395141602,-2.375243663787842,-8.244455337524414
+03-01-02-02-02-02-04.wav,2,4.0,4.037375,60.97955229563807,2691.3248664619978,0.0002271097182529047,0.037375,-579.127685546875,56.64247512817383,5.38715934753418,2.904694080352783,0.026749014854431152,-0.5227835774421692,-18.11581039428711,3.538428783416748,-2.48935866355896,-8.320703506469727,-15.285904884338379,6.752540588378906,1.6035826206207275
+03-01-02-02-02-02-11.wav,0,0.0,2.0,143.10382966085794,2099.263784989751,0.12492924183607101,2.0,-274.5059814453125,43.33036422729492,26.695453643798828,24.743345260620117,8.422881126403809,9.696285247802734,-0.06681991368532181,2.208062171936035,-1.3066011667251587,9.238665580749512,1.9802864789962769,2.2047770023345947,-0.10918759554624557
+03-01-02-02-02-02-11.wav,1,2.0,3.8371875,108.34289232841338,1524.4315551676061,0.08068341016769409,1.8371875,-307.2749938964844,85.77062225341797,15.169873237609863,20.789749145507812,14.788567543029785,10.711036682128906,6.334443092346191,1.2114198207855225,1.9655916690826416,15.429126739501953,2.1797261238098145,0.747633695602417,-0.8672667145729065
+03-01-03-01-01-01-04.wav,0,0.0,2.0,313.89438322607106,1423.3491995047555,0.0703497976064682,2.0,-360.6978759765625,22.4379940032959,-19.612977981567383,5.051761150360107,-20.609127044677734,-4.811404705047607,-17.658634185791016,-11.059807777404785,-8.0164155960083,-5.356622219085693,-13.009753227233887,-1.6346378326416016,-2.1302547454833984
+03-01-03-01-01-01-04.wav,1,2.0,3.5035,156.33286881668383,2160.493163846424,0.04020378738641739,1.5035,-424.7059020996094,61.793643951416016,-10.868915557861328,9.96372127532959,-6.8887200355529785,-5.1178998947143555,-15.60423469543457,-8.98997688293457,-4.586822032928467,-7.604952335357666,-10.88852596282959,-4.8143229484558105,-5.832519054412842
+03-01-03-01-01-01-11.wav,0,0.0,2.0,145.69833705011902,1933.6227632278706,0.1143844723701477,2.0,-296.46417236328125,52.63474655151367,5.9099650382995605,15.691596984863281,3.172943115234375,1.7613612413406372,-10.131665229797363,-4.580685138702393,-8.265885353088379,6.776510715484619,-0.8714508414268494,-3.416653633117676,-0.48406171798706055
+03-01-03-01-01-01-11.wav,1,2.0,3.103125,118.93562920051097,1549.0693944219597,0.019924744963645935,1.103125,-476.8978271484375,83.31061553955078,21.97693634033203,17.885116577148438,10.107175827026367,12.65427303314209,5.418586730957031,4.695735931396484,1.861811637878418,3.7819266319274902,1.9156649112701416,6.708371162414551,6.64117956161499
+03-01-03-01-01-02-04.wav,0,0.0,2.0,347.2215637015257,2867.6476426444233,0.06219087541103363,2.0,-356.33642578125,18.925350189208984,-22.586353302001953,11.333677291870117,-21.296920776367188,-0.7946994304656982,-17.211732864379883,-11.851272583007812,-7.022258281707764,-4.280230522155762,-10.405708312988281,-3.6238982677459717,-3.1900227069854736
+03-01-03-01-01-02-04.wav,1,2.0,3.536875,153.7121740965311,2178.382001697036,0.050002072006464005,1.536875,-422.5859069824219,58.772151947021484,-18.120868682861328,6.083489894866943,-8.537676811218262,-9.027320861816406,-14.588154792785645,-10.991568565368652,-0.9526762962341309,-7.0054240226745605,-13.029614448547363,-3.457726001739502,-4.437857627868652
+03-01-03-01-01-02-11.wav,0,0.0,2.0,138.59689072822482,2024.6147158163228,0.11908993870019913,2.0,-292.5427551269531,51.42747116088867,9.636781692504883,12.22082233428955,6.032324314117432,1.0662883520126343,-5.186850070953369,-2.6903183460235596,-4.674124717712402,5.921320915222168,-2.392207384109497,-0.8302657008171082,-1.3246536254882812
+03-01-03-01-01-02-11.wav,1,2.0,3.20325,93.89506089572974,1514.0941765069804,0.029863590374588966,1.20325,-457.23236083984375,89.8324203491211,22.053224563598633,15.470651626586914,15.070770263671875,9.240734100341797,6.7443060874938965,0.5024259686470032,2.021967649459839,6.254479885101318,2.5452880859375,2.4481050968170166,2.975670337677002
+03-01-03-01-02-01-04.wav,0,0.0,2.0,309.7090620322344,2919.0769093693843,0.0917610302567482,2.0,-328.2183532714844,19.374303817749023,-12.499343872070312,4.321529388427734,-16.962329864501953,-2.07327938079834,-14.533994674682617,-11.759320259094238,-10.645106315612793,-3.3801915645599365,-9.22982406616211,-4.855827808380127,-5.027313709259033
+03-01-03-01-02-01-04.wav,1,2.0,3.5035,138.09827301136525,2196.0864588639843,0.04151729866862297,1.5035,-407.1700439453125,74.33428192138672,-12.69478702545166,18.600252151489258,-6.147936820983887,-1.6857634782791138,-11.769222259521484,-6.548990249633789,-3.370927572250366,-4.2385334968566895,-8.584839820861816,-0.5211012959480286,-4.348726272583008
+03-01-03-01-02-01-11.wav,0,0.0,2.0,139.50148030530312,2218.1839086750415,0.1171211302280426,2.0,-289.5130615234375,46.9098014831543,13.172592163085938,11.78228759765625,7.104658126831055,1.1905370950698853,-8.424988746643066,-5.114298343658447,-6.3015851974487305,5.846525192260742,-2.501131534576416,0.5383551716804504,-3.3916592597961426
+03-01-03-01-02-01-11.wav,1,2.0,3.169875,91.07734131779831,1517.4949208413377,0.01878146454691887,1.169875,-484.6451110839844,82.5411376953125,22.325971603393555,15.33569049835205,14.266702651977539,8.00390338897705,5.6215739250183105,2.309443473815918,0.27250656485557556,4.9372124671936035,2.97786283493042,7.989272594451904,5.044548034667969
+03-01-03-01-02-02-04.wav,0,0.0,2.0,319.67731533760224,2645.351565571967,0.10505407303571701,2.0,-322.43450927734375,27.801910400390625,-23.92668342590332,5.980008602142334,-18.987525939941406,-6.701750755310059,-19.98502540588379,-10.593759536743164,-7.026776313781738,-5.929182052612305,-10.80537223815918,-2.71766996383667,-5.791987895965576
+03-01-03-01-02-02-04.wav,1,2.0,3.4368125,148.13845141473084,2212.772530238113,0.038724761456251144,1.4368125,-433.18133544921875,65.25921630859375,-3.5685625076293945,16.380048751831055,-3.1488590240478516,-1.331668496131897,-9.782159805297852,-4.451996803283691,-1.5929136276245117,-4.368785381317139,-4.648131370544434,-0.8194416761398315,-3.986135959625244
+03-01-03-01-02-02-11.wav,0,0.0,2.0,157.06443315978123,2136.8637525842805,0.08875583112239838,2.0,-327.567138671875,46.23662567138672,14.471890449523926,16.26172637939453,2.796196222305298,1.8643797636032104,-6.368246555328369,-0.8531189560890198,-8.02707576751709,5.245889663696289,-2.0257763862609863,-2.4798078536987305,-1.8514991998672485
+03-01-03-01-02-02-11.wav,1,2.0,3.236625,162.32818351364423,1699.5803193804743,0.028433041647076607,1.236625,-474.8220520019531,79.57780456542969,19.285581588745117,15.014379501342773,7.493269920349121,7.553818702697754,5.552371501922607,-2.223597764968872,-1.5048879384994507,1.5407040119171143,0.41252538561820984,0.19419948756694794,1.8285019397735596
+03-01-03-02-01-01-04.wav,0,0.0,2.0,343.9557125911603,2480.07864400806,0.06077555567026138,2.0,-352.6283874511719,11.951218605041504,-15.707866668701172,3.8229501247406006,-16.247894287109375,-1.2437751293182373,-13.860664367675781,-11.14493465423584,-7.274830341339111,-2.54215145111084,-7.6773176193237305,3.3986997604370117,6.966787338256836
+03-01-03-02-01-01-04.wav,1,2.0,3.8705625,270.52933622189806,2196.787586152483,0.0611569844186306,1.8705625,-324.3765869140625,45.47675323486328,-33.28219985961914,7.86802339553833,-12.412361145019531,-4.326955795288086,-20.591550827026367,-12.325614929199219,3.8888697624206543,-1.338129997253418,-16.00057601928711,0.05271092429757118,-0.06838227808475494
+03-01-03-02-01-01-11.wav,0,0.0,2.0,241.68557783664852,2338.0919328074465,0.06125815585255623,2.0,-348.54486083984375,27.229703903198242,2.735081911087036,5.12820291519165,-2.8651480674743652,1.655658483505249,1.9929471015930176,-1.8414515256881714,-6.99594259262085,5.248111248016357,-2.506830930709839,2.1795220375061035,-1.6979340314865112
+03-01-03-02-01-01-11.wav,1,2.0,4.0,182.35142981207588,1328.3182886868167,0.0566105879843235,2.0,-355.7282409667969,71.61316680908203,3.253627061843872,5.325247287750244,0.3641473650932312,1.3006373643875122,-6.451554775238037,-9.830913543701172,-12.099263191223145,2.297755241394043,-2.080684185028076,-0.9034194946289062,-5.73782205581665
+03-01-03-02-01-01-11.wav,2,4.0,4.037375,214.56664053022894,1749.6715856713258,0.00012287269055377692,0.037375,-714.7958984375,84.78958129882812,-10.226837158203125,4.10399866104126,13.159863471984863,17.442420959472656,-2.6215291023254395,-8.194025039672852,-5.720182418823242,11.759020805358887,10.516225814819336,7.969577789306641,-2.5156819820404053
+03-01-03-02-01-02-04.wav,0,0.0,2.0,351.0050238893814,1734.1395275419336,0.07135360687971115,2.0,-344.28314208984375,8.998815536499023,-21.936134338378906,6.886715412139893,-22.45901107788086,0.6819639205932617,-16.82242774963379,-8.53795051574707,-7.188840866088867,-5.847837448120117,-10.29442310333252,-0.11206334829330444,7.200186729431152
+03-01-03-02-01-02-04.wav,1,2.0,3.57025,233.08833917614794,2289.6103530392497,0.039737313985824585,1.57025,-456.3483581542969,29.926780700683594,-20.674131393432617,6.740911483764648,-8.550545692443848,-4.669867515563965,-8.438279151916504,-6.849808216094971,1.3971514701843262,-5.164373874664307,-10.148392677307129,-6.3534836769104,-5.4278435707092285
+03-01-03-02-01-02-11.wav,0,0.0,2.0,167.0449082589228,2189.1311514930085,0.06018915772438049,2.0,-344.2229309082031,27.583375930786133,2.8972840309143066,8.62272834777832,-0.5058807730674744,-0.852640688419342,-0.2947503626346588,-1.4490814208984375,-6.322990417480469,5.4951863288879395,-1.6133601665496826,1.3283395767211914,-1.3143515586853027
+03-01-03-02-01-02-11.wav,1,2.0,3.637,268.9551111400267,1026.4711042021115,0.04528796672821045,1.637,-411.6828918457031,63.93273162841797,0.35574495792388916,12.000986099243164,-0.9921448826789856,3.418606758117676,-0.30889052152633667,-4.9474077224731445,-3.776638984680176,1.675912857055664,0.34972378611564636,0.9503543376922607,-1.4914995431900024
+03-01-03-02-02-01-04.wav,0,0.0,2.0,339.20860493898635,2599.356926307906,0.06913036853075027,2.0,-356.9355773925781,6.325675964355469,-14.247918128967285,2.1443207263946533,-19.77629852294922,-5.09390115737915,-20.28952407836914,-11.840059280395508,-8.90635871887207,-5.609671592712402,-11.88076114654541,-5.982938289642334,-6.941657543182373
+03-01-03-02-02-01-04.wav,1,2.0,3.603625,164.61515427929865,2316.6958731189243,0.05309377610683441,1.603625,-412.4335632324219,33.061370849609375,-24.169208526611328,3.74415922164917,-12.765158653259277,-8.914702415466309,-12.66076946258545,-8.85634708404541,-2.770601511001587,-7.145648956298828,-3.828894853591919,-3.125007152557373,-3.2331411838531494
+03-01-03-02-02-01-11.wav,0,0.0,2.0,194.94163729093228,2557.7521343645,0.07512685656547546,2.0,-341.6503601074219,27.836591720581055,-0.4039490222930908,1.3301820755004883,-6.539399147033691,-0.4045422375202179,-7.949774742126465,-7.892493724822998,-12.833248138427734,1.3211159706115723,-3.91982102394104,-1.5063138008117676,-4.994744300842285
+03-01-03-02-02-01-11.wav,1,2.0,3.8705625,174.71296328160275,1589.9681471170932,0.08659303188323975,1.8705625,-341.5682373046875,60.92420959472656,-7.849349021911621,5.388680458068848,-4.472975730895996,-2.1023473739624023,-3.0474674701690674,-9.81772518157959,-10.469208717346191,-3.1890058517456055,-5.151464462280273,-0.9803825616836548,-9.376049041748047
+03-01-03-02-02-02-04.wav,0,0.0,2.0,309.43567667373054,2701.265748678711,0.058403778821229935,2.0,-365.8179931640625,11.99437427520752,-23.527692794799805,0.3820120692253113,-21.818603515625,-12.216320037841797,-23.781057357788086,-9.086688041687012,-9.914897918701172,-9.76362133026123,-13.64648723602295,-2.8553240299224854,-0.8191871047019958
+03-01-03-02-02-02-04.wav,1,2.0,3.7704375,181.9379336013823,2388.0729448960483,0.049131009727716446,1.7704375,-387.11676025390625,27.29121971130371,-30.8309326171875,8.325027465820312,-14.622809410095215,-6.073557376861572,-19.898603439331055,-7.3414812088012695,-2.3235888481140137,-7.306766033172607,-16.495874404907227,-1.5022475719451904,-0.8942161798477173
+03-01-03-02-02-02-11.wav,0,0.0,2.0,145.67298684546333,2137.9414943500083,0.07899358868598938,2.0,-334.014892578125,30.406005859375,-1.3965747356414795,2.3630011081695557,-4.2133002281188965,-2.528085708618164,-8.161624908447266,-2.817340850830078,-13.207850456237793,0.9924692511558533,-4.681720733642578,-2.076826572418213,-7.251155376434326
+03-01-03-02-02-02-11.wav,1,2.0,3.5035,106.86349400926888,1433.8688640866483,0.028135046362876892,1.5035,-475.0118713378906,71.82131958007812,14.688838005065918,9.267245292663574,3.963174343109131,3.8992347717285156,-1.1158983707427979,-4.939794540405273,-5.2511420249938965,-0.8016747236251831,0.09831294417381287,-0.28301379084587097,-1.734737515449524
+03-01-04-01-01-01-04.wav,0,0.0,2.0,246.59766946185093,2654.190468486748,0.07714861631393433,2.0,-343.6870422363281,25.681177139282227,-8.19739818572998,7.443789482116699,-16.537508010864258,-4.469664573669434,-17.351837158203125,-12.569978713989258,-12.595602989196777,-2.1974802017211914,-9.64492416381836,-5.364454746246338,-2.5446717739105225
+03-01-04-01-01-01-04.wav,1,2.0,3.3700625,130.25902040091836,2220.030741337126,0.021229421719908714,1.3700625,-442.9205627441406,76.31024932861328,7.973983287811279,11.5736665725708,-1.1392834186553955,3.3512673377990723,-7.645279407501221,-7.8084282875061035,-5.55615758895874,-4.434726238250732,-10.065082550048828,-3.463913679122925,-4.23417329788208
+03-01-04-01-01-01-11.wav,0,0.0,2.0,118.44569369343029,1977.976374417781,0.09573084861040115,2.0,-288.3418884277344,48.146976470947266,14.375565528869629,25.3568172454834,8.423913955688477,6.557493209838867,2.8473432064056396,2.5264060497283936,-1.6789754629135132,9.121051788330078,-0.7665844559669495,-0.34954726696014404,-1.7798048257827759
+03-01-04-01-01-01-11.wav,1,2.0,3.3366875,135.1017713596685,1899.9795275907793,0.036091048270463943,1.3366875,-404.2327880859375,71.03899383544922,23.305809020996094,17.659114837646484,14.27704906463623,8.943025588989258,8.822347640991211,-3.55244517326355,1.5656083822250366,7.2818284034729,-1.0549516677856445,4.640462398529053,-1.593440294265747
+03-01-04-01-01-02-04.wav,0,0.0,2.0,318.65770649851254,1856.622207941884,0.08304453641176224,2.0,-331.877197265625,19.210987091064453,-15.841056823730469,1.5763018131256104,-19.436309814453125,-3.670773983001709,-20.527780532836914,-12.922039031982422,-7.979604721069336,-2.2694945335388184,-11.587867736816406,-6.72341251373291,2.405419111251831
+03-01-04-01-01-02-04.wav,1,2.0,3.4034375,144.45094048804833,2184.924923155724,0.03228653967380524,1.4034375,-453.0372009277344,50.259525299072266,-2.794950246810913,5.94485330581665,-3.6377029418945312,-1.3953229188919067,-8.427248001098633,-11.151012420654297,-3.3517816066741943,-5.5600972175598145,-6.576553821563721,-3.585169792175293,-1.6490894556045532
+03-01-04-01-01-02-11.wav,0,0.0,2.0,129.94857639303288,2008.9091387184112,0.09542423486709595,2.0,-289.9569091796875,48.03932571411133,26.66802406311035,26.843624114990234,9.033681869506836,9.540336608886719,2.6720666885375977,0.22156764566898346,-4.965453147888184,8.524873733520508,3.183885097503662,2.305820941925049,-0.1922633945941925
+03-01-04-01-01-02-11.wav,1,2.0,3.603625,115.46918702895535,1611.4927023123323,0.043078433722257614,1.603625,-378.8557434082031,84.5497817993164,22.893918991088867,24.701217651367188,11.7017822265625,9.629472732543945,6.961574554443359,-0.3257126212120056,-0.05962833762168884,8.352657318115234,0.48197320103645325,0.5729473233222961,0.8916585445404053
+03-01-04-01-02-01-04.wav,0,0.0,2.0,328.2561848301136,2015.6451004975881,0.08691266179084778,2.0,-328.5535583496094,11.638191223144531,-19.51532745361328,3.8935739994049072,-23.092695236206055,-8.864470481872559,-23.72160530090332,-13.882638931274414,-12.498912811279297,-6.937950611114502,-12.180954933166504,-6.328004837036133,-2.6147823333740234
+03-01-04-01-02-01-04.wav,1,2.0,3.4368125,136.06001584814322,2136.6487666156445,0.04239312931895256,1.4368125,-422.7125549316406,50.19347381591797,-3.86000919342041,7.3270263671875,-4.97373628616333,-4.59082555770874,-8.161698341369629,-11.793839454650879,-2.210847854614258,-5.51120662689209,-6.753924369812012,-1.9143002033233643,-0.49986881017684937
+03-01-04-01-02-01-11.wav,0,0.0,2.0,161.4543818434702,2479.3065646406717,0.09122627228498459,2.0,-313.7047119140625,39.33191680908203,24.28578758239746,15.304031372070312,0.08699574321508408,7.607512474060059,-2.184546709060669,3.8277342319488525,-6.934556484222412,4.77084493637085,-2.9063289165496826,-2.701937675476074,-5.015694618225098
+03-01-04-01-02-01-11.wav,1,2.0,3.536875,85.01226105964399,1505.3670990911155,0.03706894442439079,1.536875,-401.7234191894531,86.21585845947266,25.67719841003418,24.035085678100586,11.597518920898438,9.150483131408691,4.606579303741455,-3.0978686809539795,-2.2989256381988525,6.776859760284424,-3.4144906997680664,-0.13661684095859528,-1.9852347373962402
+03-01-04-01-02-02-04.wav,0,0.0,2.0,320.9209923800123,2325.0718944178816,0.07503142207860947,2.0,-351.5185241699219,7.861972332000732,-17.965482711791992,-1.4134879112243652,-24.031396865844727,-10.50727653503418,-22.68246078491211,-15.681770324707031,-9.91816520690918,-8.179045677185059,-10.618880271911621,-5.424752712249756,0.2532978057861328
+03-01-04-01-02-02-04.wav,1,2.0,3.470125,148.37279413440572,2439.957710672844,0.0274199191480875,1.470125,-477.0052795410156,46.956764221191406,-4.990860462188721,8.127117156982422,-9.751545906066895,-4.5114569664001465,-10.670354843139648,-13.039355278015137,-2.0674166679382324,-8.158378601074219,-7.7007832527160645,-3.1877057552337646,-4.057948112487793
+03-01-04-01-02-02-11.wav,0,0.0,2.0,128.22781722624507,2224.7073260061716,0.10142908990383148,2.0,-318.661376953125,49.96324157714844,26.050230026245117,17.92837142944336,3.946835994720459,7.741450786590576,1.1137269735336304,0.3269278407096863,-4.907079696655273,5.0113606452941895,-1.307640552520752,-0.5157000422477722,-2.3263099193573
+03-01-04-01-02-02-11.wav,1,2.0,3.4368125,139.30949981785028,1808.4953544044001,0.03345181420445442,1.4368125,-423.66546630859375,76.3949203491211,22.103992462158203,20.391162872314453,14.798301696777344,12.317658424377441,6.267289161682129,-2.358036994934082,3.276542901992798,5.801900863647461,-1.557154655456543,0.061211057007312775,-0.9650381207466125
+03-01-04-02-01-01-04.wav,0,0.0,2.0,295.1132951044027,2158.4312527097995,0.06423383206129074,2.0,-347.3856506347656,5.192654609680176,-17.65821647644043,6.615476131439209,-18.681123733520508,-8.36837100982666,-21.499530792236328,-10.735624313354492,-6.8444390296936035,-1.8449122905731201,-8.207245826721191,6.369760513305664,16.45961570739746
+03-01-04-02-01-01-04.wav,1,2.0,3.670375,198.02593851696827,2350.4427708966155,0.07846741378307343,1.670375,-377.8055725097656,29.406269073486328,-29.306974411010742,0.9215864539146423,-16.969385147094727,-12.546680450439453,-14.979703903198242,-14.196649551391602,1.1851489543914795,-7.14601993560791,-5.193272590637207,-5.76570987701416,-2.9099972248077393
+03-01-04-02-01-01-11.wav,0,0.0,2.0,168.173693433941,2322.31716454696,0.06991058588027954,2.0,-333.9661865234375,35.61948776245117,1.7892282009124756,9.056804656982422,-2.2475242614746094,0.9137851595878601,-6.9901652336120605,-2.5364811420440674,-9.640933990478516,3.3685433864593506,-1.880822777748108,0.1813458949327469,-2.369485378265381
+03-01-04-02-01-01-11.wav,1,2.0,3.3366875,121.06561455954939,1652.8663823722115,0.031051717698574066,1.3366875,-471.81854248046875,76.74095916748047,17.33856773376465,5.668045997619629,7.899608612060547,5.194745063781738,0.410684734582901,-3.063692331314087,-5.926632881164551,-1.60221266746521,-3.1313023567199707,0.664747953414917,-0.6392941474914551
+03-01-04-02-01-02-04.wav,0,0.0,2.0,242.77217800103082,2767.35579013592,0.05341871455311775,2.0,-381.0261535644531,5.191656589508057,-11.866175651550293,1.6898242235183716,-13.538690567016602,0.40487658977508545,-13.548657417297363,-11.939569473266602,-5.769067764282227,4.097617149353027,2.631758689880371,13.174717903137207,19.067441940307617
+03-01-04-02-01-02-04.wav,1,2.0,3.7704375,179.4881946001395,2241.040133096812,0.08260268718004227,1.7704375,-353.3396911621094,25.33582878112793,-23.21554946899414,-6.574950218200684,-19.702423095703125,-10.682599067687988,-13.263450622558594,-9.179414749145508,0.9853244423866272,-3.6432108879089355,-7.538877964019775,-2.785616397857666,6.142146110534668
+03-01-04-02-01-02-11.wav,0,0.0,2.0,168.52340653544377,2140.273093437073,0.069302998483181,2.0,-299.6136169433594,43.80134582519531,4.60139274597168,15.086034774780273,0.3363977372646332,-3.451456069946289,-11.093853950500488,-4.564574241638184,-12.47055435180664,1.6301919221878052,-4.77737283706665,-2.181586742401123,-6.080482006072998
+03-01-04-02-01-02-11.wav,1,2.0,3.603625,207.8832846457116,1115.7105627062035,0.05746918544173241,1.603625,-414.1717529296875,80.42064666748047,8.807519912719727,8.123883247375488,3.263343334197998,2.089939832687378,0.19911335408687592,-7.29738187789917,-5.255602836608887,-0.08961759507656097,-3.040428876876831,2.6466734409332275,-2.1636033058166504
+03-01-04-02-02-01-04.wav,0,0.0,2.0,287.3138843412857,2669.6684780358687,0.06698774546384811,2.0,-353.5552673339844,-12.809871673583984,-27.816946029663086,-2.1156105995178223,-24.875091552734375,-13.647027015686035,-24.751920700073242,-12.601019859313965,-9.017953872680664,-2.4082188606262207,4.77329158782959,21.38982391357422,22.648252487182617
+03-01-04-02-02-01-04.wav,1,2.0,3.7704375,219.09592887978448,2406.8474732060204,0.07530523836612701,1.7704375,-374.870849609375,13.537918090820312,-36.85417556762695,-7.564351558685303,-22.181957244873047,-16.66897964477539,-21.036062240600586,-14.6729736328125,-2.881587505340576,-8.629679679870605,-8.864974975585938,2.7918853759765625,10.896246910095215
+03-01-04-02-02-01-11.wav,0,0.0,2.0,165.24900597699292,2340.5433146919117,0.08734350651502609,2.0,-319.6329650878906,34.3584098815918,5.298957347869873,11.057165145874023,-10.047348022460938,-0.4392334520816803,-12.40809440612793,-10.162284851074219,-14.151596069335938,-1.7558236122131348,-1.7967736721038818,-5.317577838897705,-3.7814340591430664
+03-01-04-02-02-01-11.wav,1,2.0,3.3700625,113.76507370048212,1632.949451333151,0.039589714258909225,1.3700625,-442.47021484375,63.352725982666016,6.575443744659424,5.450774192810059,2.707271099090576,5.701967239379883,1.2757102251052856,-6.648138523101807,-3.7302768230438232,-2.005391836166382,-2.2423512935638428,0.307251513004303,-0.8260974287986755
+03-01-04-02-02-02-04.wav,0,0.0,2.0,259.1234603493429,3189.1499134276114,0.06770717352628708,2.0,-337.98712158203125,-8.28558349609375,-24.014347076416016,3.7229371070861816,-22.90056800842285,-11.159173965454102,-21.57026481628418,-13.129855155944824,-13.189984321594238,-6.368785858154297,-8.419583320617676,10.013981819152832,19.122581481933594
+03-01-04-02-02-02-04.wav,1,2.0,3.7704375,204.36253389232644,2298.960500350469,0.06507766991853714,1.7704375,-384.1033630371094,23.99105453491211,-29.687057495117188,0.9497560858726501,-18.350570678710938,-11.559846878051758,-14.406006813049316,-11.998906135559082,-1.6526072025299072,-8.660837173461914,-9.44095516204834,-3.6027908325195312,2.453336477279663
+03-01-04-02-02-02-11.wav,0,0.0,2.0,210.49436759807355,2583.23623139312,0.07511214166879654,2.0,-278.85125732421875,29.357715606689453,3.0822057723999023,8.532763481140137,-8.524861335754395,1.3718996047973633,-16.542339324951172,-9.903515815734863,-17.288330078125,0.3300653100013733,-5.554318904876709,-6.242571830749512,-6.8913493156433105
+03-01-04-02-02-02-11.wav,1,2.0,3.3366875,146.10872486317075,1240.682674854132,0.02147248573601246,1.3366875,-515.7483520507812,83.54413604736328,24.85146141052246,13.243768692016602,8.47493839263916,7.709936141967773,6.254560947418213,-2.033493995666504,-4.440410614013672,-2.0202200412750244,-1.4588419198989868,2.040079355239868,3.372127056121826
+03-01-05-01-01-01-04.wav,0,0.0,2.0,327.38522760194417,2027.9789236568997,0.04924533888697624,2.0,-373.4667663574219,13.658203125,-10.13676643371582,3.987750291824341,-16.334381103515625,-3.2872722148895264,-16.59564971923828,-9.260554313659668,-9.553159713745117,-3.8162715435028076,-10.53119945526123,-1.8013484477996826,-6.405682563781738
+03-01-05-01-01-01-04.wav,1,2.0,3.70375,140.6707680416173,2188.9569010684886,0.03712725639343262,1.70375,-424.6220703125,49.55451965332031,-8.8579683303833,3.074756145477295,-10.599187850952148,-4.618495464324951,-14.89064884185791,-10.440834999084473,-4.755117893218994,-2.307276725769043,-11.058977127075195,-0.6118641495704651,-7.763554096221924
+03-01-05-01-01-01-11.wav,0,0.0,2.0,180.87870460152857,2218.151550581432,0.076350137591362,2.0,-299.6778259277344,28.012727737426758,4.484274387359619,7.198110580444336,-4.601395606994629,2.5296895503997803,-2.617412805557251,-2.4591727256774902,-7.339141368865967,4.547694683074951,-0.8822067975997925,0.3548066318035126,-1.3893502950668335
+03-01-05-01-01-01-11.wav,1,2.0,3.603625,121.83027247119455,1410.7495047807156,0.056207142770290375,1.603625,-395.08428955078125,73.99697875976562,9.264659881591797,14.295963287353516,2.598414659500122,3.908998489379883,2.2919397354125977,-4.521177768707275,-1.185792326927185,2.0117413997650146,-4.731218338012695,-0.515739381313324,0.15930777788162231
+03-01-05-01-01-02-04.wav,0,0.0,2.0,319.3380958434885,1933.5023521446105,0.051175106316804886,2.0,-382.3902893066406,12.302967071533203,-13.503360748291016,2.1148440837860107,-20.469453811645508,1.377089023590088,-18.812149047851562,-9.02299976348877,-8.869918823242188,-0.8518781661987305,-9.30241584777832,-0.20419079065322876,2.0332674980163574
+03-01-05-01-01-02-04.wav,1,2.0,3.603625,141.3166854300788,2273.988665693338,0.04879850149154663,1.603625,-398.6744384765625,37.48045349121094,-11.127248764038086,-0.27399736642837524,-13.130827903747559,-4.57865571975708,-10.144596099853516,-9.783415794372559,-1.5363057851791382,-3.756859302520752,-7.374958515167236,-3.1538546085357666,-4.287727355957031
+03-01-05-01-01-02-11.wav,0,0.0,2.0,139.43534721716665,2340.0970061641447,0.08190084993839264,2.0,-292.16796875,37.101837158203125,8.093192100524902,10.13796615600586,-1.312697410583496,1.8354414701461792,-2.209028720855713,-2.4594171047210693,-7.57356071472168,7.635856628417969,-1.515189290046692,3.1189332008361816,-3.896780014038086
+03-01-05-01-01-02-11.wav,1,2.0,3.4701875,99.10070321905549,1672.84128409282,0.030190328136086464,1.4701875,-456.67535400390625,81.08716583251953,28.425474166870117,12.471488952636719,8.632009506225586,8.917922019958496,2.7836718559265137,-4.362534523010254,-0.9674966931343079,1.8910797834396362,-4.446438312530518,1.4319547414779663,0.8634172677993774
+03-01-05-01-02-01-04.wav,0,0.0,2.0,322.12798399122704,1978.153645842665,0.088046133518219,2.0,-322.9593811035156,12.28531551361084,-16.331558227539062,-0.7185400724411011,-23.749677658081055,-7.01417875289917,-25.611223220825195,-12.543758392333984,-12.913448333740234,-4.764581680297852,-14.101927757263184,-8.47906494140625,-8.063068389892578
+03-01-05-01-02-01-04.wav,1,2.0,3.470125,140.6489288485144,2247.87043104742,0.05251740291714668,1.470125,-420.37359619140625,43.23752212524414,-8.941458702087402,1.7555745840072632,-7.022336959838867,-2.9415087699890137,-8.6713228225708,-11.415824890136719,-2.9830117225646973,-2.017019271850586,-6.458825588226318,-2.5225493907928467,-4.903705596923828
+03-01-05-01-02-01-11.wav,0,0.0,2.0,136.23639628018068,2200.4242672708056,0.0865732952952385,2.0,-319.03399658203125,28.469045639038086,0.5335016250610352,2.3448984622955322,-5.960607051849365,0.9758838415145874,-6.418370723724365,-5.1383795738220215,-12.517765045166016,5.587411880493164,-5.40684700012207,-0.5886973738670349,-4.576890468597412
+03-01-05-01-02-01-11.wav,1,2.0,3.470125,101.123176359148,1521.69720455207,0.03973411023616791,1.470125,-441.7464904785156,72.09577941894531,13.536334037780762,11.769730567932129,3.4424362182617188,7.80856466293335,-0.3845994770526886,-4.585522651672363,-2.439993381500244,-1.203991174697876,-2.4548118114471436,0.7396267652511597,2.282087564468384
+03-01-05-01-02-02-04.wav,0,0.0,2.0,287.5713816405087,3150.1292443653692,0.07285816222429276,2.0,-332.51171875,8.685052871704102,-6.571447372436523,0.7250041365623474,-17.237253189086914,-5.999104976654053,-19.43523406982422,-9.928736686706543,-6.586658477783203,-2.583523988723755,-9.528058052062988,-3.8674488067626953,-5.835935592651367
+03-01-05-01-02-02-04.wav,1,2.0,3.7370625,140.9459641922201,2162.91783122333,0.08104632794857025,1.7370625,-349.3989562988281,53.000732421875,-17.67957878112793,1.0775376558303833,-14.345553398132324,-3.311594009399414,-16.50115966796875,-12.384832382202148,-5.354934215545654,-2.3770751953125,-10.772753715515137,-1.4763072729110718,-4.6505045890808105
+03-01-05-01-02-02-11.wav,0,0.0,2.0,127.7995299736333,2419.012365419899,0.07290171086788177,2.0,-326.7621154785156,23.003236770629883,7.360030651092529,4.116421699523926,-4.288506507873535,2.666226863861084,-6.578274726867676,-2.569841146469116,-11.162626266479492,5.61484956741333,-6.254976749420166,1.9166003465652466,-2.735081672668457
+03-01-05-01-02-02-11.wav,1,2.0,3.737125,125.71222768189517,1534.155332946571,0.04033771902322769,1.737125,-415.59375,75.57086944580078,9.410017967224121,8.86000919342041,1.3436968326568604,4.501826286315918,-5.308351993560791,-5.5684123039245605,-6.956740379333496,1.0499290227890015,-4.592956066131592,-1.9087128639221191,-2.7445759773254395
+03-01-05-02-01-01-04.wav,0,0.0,2.0,302.8992540636044,2326.3746043376022,0.04004644230008125,2.0,-383.1864013671875,-0.9175071120262146,-13.277976989746094,3.486726760864258,-19.529203414916992,4.297869682312012,-20.041839599609375,-6.566892147064209,-9.033699989318848,-2.242572069168091,-3.4614758491516113,7.507369041442871,9.703285217285156
+03-01-05-02-01-01-04.wav,1,2.0,3.7704375,180.60333708928928,2264.005935422716,0.05249333381652832,1.7704375,-388.3847351074219,25.972639083862305,-23.055986404418945,-6.690561771392822,-17.94710922241211,1.1932315826416016,-14.863532066345215,-7.0672993659973145,-7.099475860595703,-2.358464479446411,-10.094216346740723,-5.803025245666504,-1.3728777170181274
+03-01-05-02-01-01-11.wav,0,0.0,2.0,229.58278379469127,2465.8529648553813,0.05906923860311508,2.0,-330.449951171875,20.861743927001953,-2.8199570178985596,6.211389541625977,-10.564045906066895,-2.231304168701172,-2.6834802627563477,-2.2420594692230225,-11.225852012634277,2.107940912246704,-2.1024296283721924,-1.8630801439285278,-4.926120758056641
+03-01-05-02-01-01-11.wav,1,2.0,3.536875,123.70157697402132,1453.663836623404,0.03744297847151756,1.536875,-414.5981750488281,50.59490203857422,5.247931480407715,3.697274684906006,-3.1438710689544678,0.8449963927268982,0.3301088809967041,-1.5290824174880981,-6.983205795288086,2.030332088470459,-4.0961408615112305,1.0052086114883423,-0.506290078163147
+03-01-05-02-01-02-04.wav,0,0.0,2.0,318.6078712290376,2030.9909694060686,0.05086717754602432,2.0,-364.7076721191406,0.44567975401878357,-17.13477325439453,0.5685601234436035,-21.466970443725586,2.3671605587005615,-21.57683753967285,-6.852382183074951,-8.350382804870605,-2.179473400115967,-2.9946610927581787,9.4613618850708,9.08614730834961
+03-01-05-02-01-02-04.wav,1,2.0,3.8705625,198.41613164718532,2351.2948182863083,0.04445463418960571,1.8705625,-390.45758056640625,26.89476776123047,-15.741376876831055,-5.192067623138428,-17.488508224487305,-1.780648946762085,-15.537479400634766,-9.132845878601074,-3.6197757720947266,-2.366626739501953,-6.415895462036133,-4.74663782119751,-3.7940948009490967
+03-01-05-02-01-02-11.wav,0,0.0,2.0,236.83726545796125,2449.1879385886086,0.08786851167678833,2.0,-309.53912353515625,13.353867530822754,-8.232752799987793,1.0974446535110474,-13.024919509887695,-3.031804323196411,-3.9564971923828125,-2.2695415019989014,-8.419303894042969,0.9139037728309631,-2.612409830093384,0.7668357491493225,-3.4187755584716797
+03-01-05-02-01-02-11.wav,1,2.0,3.970625,226.81902612514872,1667.0077912973131,0.07460115849971771,1.970625,-344.6249694824219,41.54207229614258,-1.2142657041549683,-1.2879611253738403,-11.78940200805664,-0.6842732429504395,-4.871687412261963,-6.89944314956665,-17.25240707397461,2.0109622478485107,-6.183462142944336,-1.9726698398590088,-1.2383675575256348
+03-01-05-02-02-01-04.wav,0,0.0,2.0,291.16477674190975,2962.203440250344,0.034694306552410126,2.0,-397.97161865234375,-3.1336917877197266,-15.747743606567383,3.7568411827087402,-23.86280059814453,-5.644118785858154,-29.07207489013672,-6.927679061889648,-6.964547157287598,-2.6422290802001953,-10.649101257324219,-0.0823802649974823,-5.837791919708252
+03-01-05-02-02-01-04.wav,1,2.0,3.7036875,204.0190232810834,2589.6166563587835,0.04149215668439865,1.7036875,-416.8114929199219,20.805994033813477,-22.615671157836914,-1.5385910272598267,-18.4955997467041,-0.44827958941459656,-13.550721168518066,-8.32087230682373,-3.98392915725708,-1.0329828262329102,-9.559337615966797,-4.418592929840088,-3.162343740463257
+03-01-05-02-02-01-11.wav,0,0.0,2.0,287.69452717330677,2338.4321815643543,0.07415662705898285,2.0,-306.8961181640625,21.226953506469727,-2.57617449760437,5.584602355957031,-15.602991104125977,-1.2539325952529907,-1.4438828229904175,-4.438048839569092,-9.878153800964355,0.6169691681861877,-5.388490676879883,-0.4694405198097229,-5.42858362197876
+03-01-05-02-02-01-11.wav,1,2.0,3.9373125,250.3899600540738,1242.8087391150812,0.05953017994761467,1.9373125,-355.3957824707031,54.814186096191406,0.7931679487228394,7.368144512176514,-11.417122840881348,-1.500274896621704,-0.24349431693553925,-2.2252132892608643,-11.996467590332031,3.761370897293091,-4.316097736358643,1.2864288091659546,-0.41525742411613464
+03-01-05-02-02-02-04.wav,0,0.0,2.0,301.2865438146614,3176.4344044213194,0.041721004992723465,2.0,-403.1885070800781,-7.394679546356201,-17.21027946472168,-9.894639015197754,-24.362117767333984,-5.634559631347656,-27.038774490356445,-11.798036575317383,-14.16134262084961,-4.227100849151611,-4.1533308029174805,7.861622333526611,3.095031499862671
+03-01-05-02-02-02-04.wav,1,2.0,3.8371875,224.2862561119802,2459.6864720814633,0.04985956847667694,1.8371875,-411.743408203125,25.02092742919922,-24.56584358215332,-8.544779777526855,-21.463966369628906,1.2212529182434082,-17.392637252807617,-4.524984359741211,-6.64066743850708,-1.428339958190918,-10.844377517700195,-7.634349346160889,-6.723967552185059
+03-01-05-02-02-02-11.wav,0,0.0,2.0,297.020985870716,2552.342975780412,0.07447861135005951,2.0,-318.8964538574219,6.743276119232178,-11.251167297363281,-2.1471073627471924,-15.906993865966797,-0.8692690134048462,-7.563671588897705,-5.9184465408325195,-13.815118789672852,1.5829240083694458,-5.8600592613220215,-2.6688241958618164,-5.364259719848633
+03-01-05-02-02-02-11.wav,1,2.0,3.7704375,250.05904063917245,1785.3202456362649,0.05635599419474602,1.7704375,-366.94232177734375,36.531673431396484,-4.605257511138916,-2.1875274181365967,-14.54489803314209,0.9258597493171692,-2.263331890106201,-1.550330400466919,-12.757298469543457,2.327756404876709,-6.651440143585205,-2.6801393032073975,-1.783522605895996
+03-01-06-01-01-01-04.wav,0,0.0,2.0,306.78217826474395,2016.2630892562936,0.08066418021917343,2.0,-309.80224609375,13.261346817016602,-10.423105239868164,3.343951463699341,-17.55144691467285,-3.2585079669952393,-21.34317398071289,-17.0620059967041,-7.760460376739502,-5.643137454986572,-12.84768295288086,-2.867659091949463,-1.4729089736938477 +03-01-06-01-01-01-04.wav,1,2.0,3.3033125,203.6500814618615,2438.468242328277,0.03693363443017006,1.3033125,-445.06787109375,42.03498840332031,-22.193220138549805,-1.6142206192016602,-4.861809253692627,-1.009694218635559,-13.102607727050781,-10.938583374023438,-0.6470962166786194,-3.872626304626465,-5.7693328857421875,-1.8632732629776,-1.559570550918579 +03-01-06-01-01-01-11.wav,0,0.0,2.0,152.67478419434116,2269.4420293067337,0.0871507003903389,2.0,-295.2144470214844,42.80385208129883,12.814994812011719,16.02806854248047,3.595675468444824,5.514930248260498,1.668544888496399,2.499724864959717,-6.553030490875244,9.167662620544434,0.4218757748603821,3.2597780227661133,-1.9567856788635254 +03-01-06-01-01-01-11.wav,1,2.0,3.470125,123.99067699369358,1559.0400572752471,0.057890232652425766,1.470125,-374.5049133300781,75.06321716308594,20.44590187072754,12.332348823547363,7.721335411071777,10.380717277526855,2.3276548385620117,-4.857304573059082,2.3742125034332275,6.70945405960083,-0.8173170685768127,1.321578860282898,-2.2447879314422607 +03-01-06-01-01-02-04.wav,0,0.0,2.0,333.03279348576103,2541.367804118421,0.08959632366895676,2.0,-300.6609802246094,12.070099830627441,-14.416753768920898,9.2869234085083,-20.7595272064209,-2.419527053833008,-17.673749923706055,-13.394733428955078,-9.178630828857422,-6.96176815032959,-11.291629791259766,-2.922175645828247,-2.3270926475524902 +03-01-06-01-01-02-04.wav,1,2.0,3.236625,204.17747648619212,2512.912342177141,0.04125910997390747,1.236625,-439.76763916015625,39.95492935180664,-25.313180923461914,-0.6439070701599121,-6.389616966247559,-4.330966472625732,-11.91030216217041,-8.866453170776367,0.007274980656802654,-5.900266170501709,-8.312395095825195,-3.58341383934021,-4.426645755767822 +03-01-06-01-01-02-11.wav,0,0.0,2.0,133.4296521114663,2196.2941966982376,0.07380365580320358,2.0,-326.3988952636719,40.624332427978516,15.226056098937988,13.342649459838867,3.0435869693756104,4.124807834625244,2.770411252975464,1.6154816150665283,-7.734796524047852,5.7728986740112305,0.8560507893562317,1.42386794090271,-1.319604516029358 +03-01-06-01-01-02-11.wav,1,2.0,3.5035,124.98561329562305,1631.6392101663093,0.05571300536394119,1.5035,-383.1345520019531,78.53659057617188,13.42722225189209,7.044027328491211,1.3191070556640625,10.01479721069336,2.205416202545166,-6.426492214202881,-3.3612446784973145,1.7525101900100708,-2.5924646854400635,0.33792752027511597,-3.893073797225952 +03-01-06-01-02-01-04.wav,0,0.0,2.0,325.41281223706,1891.8258304686021,0.07943794876337051,2.0,-325.47711181640625,13.552258491516113,-15.443033218383789,6.073848247528076,-16.837053298950195,-6.023374557495117,-23.48003578186035,-14.08348560333252,-7.916420936584473,-8.133931159973145,-12.453349113464355,-6.465914249420166,-3.051323413848877 +03-01-06-01-02-01-04.wav,1,2.0,3.470125,208.7680186220876,2260.0102035920827,0.05648466944694519,1.470125,-413.524658203125,37.8563117980957,-16.975709915161133,1.0510835647583008,-5.426395893096924,-4.561521530151367,-10.462301254272461,-10.089357376098633,0.332927942276001,-4.871004104614258,-4.580060005187988,-3.849310874938965,-2.0543596744537354 
+03-01-06-01-02-01-11.wav,0,0.0,2.0,177.9418183576899,2223.6547254364878,0.09665529429912567,2.0,-303.69549560546875,46.28656768798828,6.421526908874512,13.190156936645508,1.5045112371444702,4.038480281829834,-14.586697578430176,-10.73043155670166,-10.25666332244873,3.0958471298217773,-5.675781726837158,-3.0657520294189453,-6.114075660705566 +03-01-06-01-02-01-11.wav,1,2.0,3.169875,105.3601863108782,1612.5107572779734,0.03668121621012688,1.169875,-449.865234375,73.01142883300781,15.078768730163574,11.459774017333984,6.460553169250488,8.55411148071289,2.761017322540283,-4.0404815673828125,2.4078164100646973,0.30491843819618225,1.476739764213562,4.553004741668701,2.755526065826416 +03-01-06-01-02-02-04.wav,0,0.0,2.0,337.4352148623985,1820.277394867826,0.08409572392702103,2.0,-317.4143981933594,14.415360450744629,-10.370922088623047,2.424628257751465,-14.282709121704102,-5.577842712402344,-17.37600326538086,-14.2545804977417,-10.23987102508545,-7.257964134216309,-12.362595558166504,-6.33252477645874,-3.326204538345337 +03-01-06-01-02-02-04.wav,1,2.0,3.470125,172.27663877784224,2413.278080520095,0.054350629448890686,1.470125,-404.2124328613281,36.732749938964844,-17.789060592651367,-1.5924781560897827,-8.074488639831543,-2.649597644805908,-11.567508697509766,-15.128379821777344,-0.9342721104621887,-4.738570690155029,-7.724216461181641,-2.189412832260132,-1.2629461288452148 +03-01-06-01-02-02-11.wav,0,0.0,2.0,142.51989189641756,2057.2481747778843,0.08356029540300369,2.0,-319.1393737792969,49.88963317871094,7.732165336608887,12.354706764221191,0.7755268216133118,5.189607620239258,-10.348817825317383,-5.484432697296143,-8.606098175048828,3.0736918449401855,-2.6143789291381836,-2.862011194229126,-2.9539337158203125 +03-01-06-01-02-02-11.wav,1,2.0,3.20325,112.78374047077754,1688.9067712662747,0.03229539468884468,1.20325,-460.5317687988281,69.71916961669922,14.171246528625488,10.975422859191895,6.0746259689331055,5.537847995758057,3.9471306800842285,-2.858426332473755,2.303842306137085,1.0745056867599487,-1.539931058883667,2.6757943630218506,3.8588597774505615 +03-01-06-02-01-01-04.wav,0,0.0,2.0,293.1938443617395,2254.1339774332496,0.08014912158250809,2.0,-285.98052978515625,-5.709850311279297,-13.644229888916016,7.070859432220459,-14.9621000289917,-0.7343652844429016,-20.352176666259766,-10.710731506347656,-6.331356525421143,3.034193277359009,4.7943902015686035,22.39273452758789,19.679418563842773 +03-01-06-02-01-01-04.wav,1,2.0,3.9373125,209.0326272250231,2607.5436648628115,0.06811373680830002,1.9373125,-324.22344970703125,13.741759300231934,-22.34154510498047,-1.8338146209716797,-4.392963409423828,0.7936248183250427,-19.75562858581543,-9.661606788635254,4.5439581871032715,-2.7274656295776367,-10.480875015258789,11.315162658691406,17.04572868347168 +03-01-06-02-01-01-11.wav,0,0.0,2.0,216.37647217076932,2122.7726675447075,0.08231304585933685,2.0,-307.22576904296875,30.639305114746094,-7.242466449737549,12.086467742919922,-5.102632522583008,-1.1031299829483032,-7.8221540451049805,-10.801338195800781,-11.03674030303955,0.8652855157852173,-2.4020071029663086,-3.857846260070801,-6.641984462738037 +03-01-06-02-01-01-11.wav,1,2.0,3.20325,167.53341427663963,1832.544655603711,0.05202523246407509,1.20325,-413.797607421875,52.6944694519043,-3.7502200603485107,-1.604874610900879,-6.414364337921143,-1.8863261938095093,0.31189337372779846,-8.900352478027344,-7.326277732849121,5.272609710693359,-1.2707712650299072,3.2444794178009033,-4.105482578277588 
+03-01-06-02-01-02-04.wav,0,0.0,2.0,312.2471584868254,2441.8894063997814,0.0620209276676178,2.0,-341.409423828125,1.9239040613174438,-17.378097534179688,5.209112167358398,-17.669889450073242,1.0934182405471802,-18.227344512939453,-12.355354309082031,-9.095817565917969,-5.209580421447754,1.7240389585494995,16.815536499023438,20.8653621673584 +03-01-06-02-01-02-04.wav,1,2.0,3.4034375,163.67259956378425,2394.9235512239084,0.055552270263433456,1.4034375,-399.6239318847656,25.906641006469727,-20.45304298400879,-13.459087371826172,-8.5833740234375,-5.519660949707031,-9.100050926208496,-12.568615913391113,-7.218804836273193,-3.490159511566162,-7.258735179901123,1.9634075164794922,6.968835353851318 +03-01-06-02-01-02-11.wav,0,0.0,2.0,216.62614336751494,2327.632974977604,0.08362298458814621,2.0,-304.91168212890625,26.122426986694336,-5.754750728607178,9.850165367126465,-1.9395387172698975,-0.3074832558631897,-7.140223503112793,-5.519281387329102,-7.430243492126465,0.7371425032615662,-5.512473106384277,-3.8704116344451904,-1.6053990125656128 +03-01-06-02-01-02-11.wav,1,2.0,3.20325,165.54952631741145,1727.946971425224,0.04747815430164337,1.20325,-406.3780517578125,49.35755157470703,-5.799562454223633,1.283958911895752,-4.361151218414307,-2.2428512573242188,4.816504955291748,-8.527152061462402,-4.3569464683532715,5.754046440124512,-4.1706647872924805,-0.5262102484703064,-4.996908187866211 +03-01-06-02-02-01-04.wav,0,0.0,2.0,319.51355010238393,1953.3970488312477,0.0747455582022667,2.0,-324.37451171875,-0.2626131474971771,-22.50490951538086,-4.379366397857666,-25.402385711669922,-6.096787929534912,-25.707822799682617,-10.013043403625488,-11.85422420501709,-10.603432655334473,-8.076491355895996,6.72917366027832,13.89796257019043 +03-01-06-02-02-01-04.wav,1,2.0,3.603625,194.6025705188124,2282.8551578436404,0.0600157305598259,1.603625,-388.12548828125,28.486791610717773,-14.456225395202637,-12.934503555297852,-8.096693992614746,-6.368546962738037,-11.260902404785156,-10.816306114196777,-7.463244438171387,-3.3537495136260986,-9.718453407287598,3.400089979171753,2.771303176879883 +03-01-06-02-02-01-11.wav,0,0.0,2.0,248.38346361943226,2324.056008608115,0.0740819126367569,2.0,-318.190673828125,18.860397338867188,-10.594371795654297,5.347851753234863,-12.767224311828613,-3.625286817550659,-10.228910446166992,-12.467012405395508,-11.604262351989746,-2.239495038986206,-4.440187454223633,-5.601541996002197,-5.1523613929748535 +03-01-06-02-02-01-11.wav,1,2.0,3.06975,216.90053129499745,1619.890163291222,0.017585743218660355,1.06975,-463.88134765625,67.7839126586914,-12.240486145019531,5.457740306854248,4.932949066162109,4.87262487411499,-5.429851531982422,-9.323894500732422,-5.566490650177002,10.54136848449707,-3.6156632900238037,0.7354975342750549,-6.111631393432617 +03-01-06-02-02-02-04.wav,0,0.0,2.0,282.0166258310902,3075.9790325809254,0.06818339228630066,2.0,-357.7315368652344,3.327270030975342,-24.45810317993164,-7.781130313873291,-23.44863510131836,-7.349776268005371,-21.431968688964844,-11.069540977478027,-11.223608016967773,-7.52615213394165,-7.881936550140381,9.276908874511719,14.90772533416748 +03-01-06-02-02-02-04.wav,1,2.0,3.57025,206.81369970337204,2158.9501154633454,0.053642015904188156,1.57025,-409.5507507324219,23.161264419555664,-15.896666526794434,-5.509647846221924,-5.498566150665283,-6.234498977661133,-10.178742408752441,-7.251359462738037,-1.788515567779541,-1.6318784952163696,-5.750824451446533,-0.29957813024520874,-2.0047719478607178 
+03-01-06-02-02-02-11.wav,0,0.0,2.0,207.77430422896504,2654.867717384266,0.08155544102191925,2.0,-300.23876953125,14.723053932189941,-6.979039192199707,-1.541084885597229,-9.480707168579102,-2.5862908363342285,-9.726521492004395,-6.666759490966797,-13.735881805419922,0.006889668758958578,-5.781299591064453,-3.6234848499298096,-5.744877815246582 +03-01-06-02-02-02-11.wav,1,2.0,3.1365,239.6823953378193,1553.997325718513,0.03971640765666962,1.1365,-413.16595458984375,65.71157836914062,-10.3418607711792,-1.1566612720489502,-0.6463285088539124,5.780195713043213,-4.390703201293945,-12.503409385681152,-3.101219892501831,5.950231552124023,-0.8447856903076172,0.4998561441898346,-3.273749351501465 +03-01-07-01-01-01-04.wav,0,0.0,2.0,313.70171504292693,1788.0838756144708,0.05670942738652229,2.0,-345.7050476074219,8.925172805786133,-4.629213809967041,3.681886911392212,-5.493284225463867,-3.875479221343994,-8.285903930664062,-5.594346523284912,-7.03719425201416,0.7008481025695801,-8.700522422790527,1.8301491737365723,-2.0376498699188232 +03-01-07-01-01-01-04.wav,1,2.0,3.9039375,157.76636570548212,1862.05413647909,0.07422689348459244,1.9039375,-355.2874755859375,75.81635284423828,-9.920315742492676,1.0437160730361938,-10.42973804473877,-0.74956876039505,-9.926289558410645,-16.129533767700195,-7.224276065826416,-2.3049588203430176,-12.4436674118042,-2.360395669937134,-4.58487606048584 +03-01-07-01-01-01-11.wav,0,0.0,2.0,142.85619177189147,2413.7188934504284,0.06954529136419296,2.0,-334.25335693359375,28.527673721313477,12.223357200622559,16.648088455200195,0.5512009263038635,4.697068214416504,-1.9246525764465332,0.6025041341781616,-7.028212070465088,5.691813945770264,-1.7778891324996948,2.402184247970581,-2.311136484146118 +03-01-07-01-01-01-11.wav,1,2.0,3.670375,110.84510146928326,1553.46375470902,0.05262935534119606,1.670375,-392.12847900390625,86.22246551513672,19.31668472290039,16.160797119140625,0.62662672996521,10.125365257263184,0.46197330951690674,-7.686026573181152,-3.754988670349121,9.30398941040039,-4.352052688598633,-4.383857727050781,1.6412123441696167 +03-01-07-01-01-02-04.wav,0,0.0,2.0,278.14001375674246,2125.6828916874383,0.0807012990117073,2.0,-312.5983581542969,20.821691513061523,-9.083954811096191,6.284525394439697,-14.118943214416504,-7.513768196105957,-16.364669799804688,-9.217775344848633,-6.863193035125732,-3.347733974456787,-10.304388046264648,-1.2163314819335938,-2.012247323989868 +03-01-07-01-01-02-04.wav,1,2.0,3.9039375,165.56495291026832,2095.5756803555832,0.07111901789903641,1.9039375,-353.8242492675781,58.06777572631836,-7.021966934204102,-1.3575012683868408,-6.187948226928711,-10.62136459350586,-16.638322830200195,-13.546971321105957,-1.7329310178756714,-5.440281391143799,-10.387444496154785,-0.677355170249939,-6.351956367492676 +03-01-07-01-01-02-11.wav,0,0.0,2.0,179.25507103247676,2360.751488069649,0.07520055770874023,2.0,-325.7285461425781,33.09927749633789,6.079253196716309,18.412254333496094,-0.15826274454593658,2.8435070514678955,-1.1581206321716309,-0.6721655130386353,-6.28264856338501,7.004185676574707,-1.9110455513000488,1.5989936590194702,-3.1627590656280518 +03-01-07-01-01-02-11.wav,1,2.0,3.603625,145.8326114381732,1539.756759436671,0.045807015150785446,1.603625,-410.5924377441406,78.30447387695312,16.110057830810547,16.22471046447754,-1.5866400003433228,10.892930030822754,-0.7843810319900513,-8.486939430236816,-2.289764165878296,1.3462156057357788,-4.609572887420654,-1.0069870948791504,-0.12306982278823853 
+03-01-07-01-02-01-04.wav,0,0.0,2.0,295.83957160376076,2743.1263127420325,0.06984855979681015,2.0,-359.461669921875,14.7476224899292,-6.454494953155518,-0.8156270980834961,-9.179076194763184,-7.63378381729126,-14.682360649108887,-9.06096076965332,-8.120123863220215,-2.7252655029296875,-6.649759292602539,-2.451965808868408,-7.310558795928955 +03-01-07-01-02-01-04.wav,1,2.0,3.7704375,187.02912887251256,2161.3603359833496,0.06444688141345978,1.7704375,-388.9900817871094,48.61040496826172,-21.850399017333984,1.605230689048767,-11.032414436340332,-7.09192419052124,-16.313644409179688,-13.12462043762207,-2.008820056915283,-7.3801116943359375,-10.991021156311035,-1.9516315460205078,-4.203358173370361 +03-01-07-01-02-01-11.wav,0,0.0,2.0,147.39151389748133,2408.627073372708,0.0735476091504097,2.0,-338.0740966796875,41.949459075927734,13.105673789978027,6.527790069580078,-1.8737549781799316,4.155055999755859,-4.335562705993652,2.8110511302948,-7.078166484832764,6.436975479125977,-3.3857576847076416,0.38682737946510315,-2.012866735458374 +03-01-07-01-02-01-11.wav,1,2.0,3.470125,99.70904971958564,1561.2362313730512,0.03305754065513611,1.470125,-446.98394775390625,87.64935302734375,20.130346298217773,12.001776695251465,5.055832386016846,9.490494728088379,1.3745687007904053,-3.693845272064209,-2.0977680683135986,5.583228588104248,-4.321413993835449,-1.7328729629516602,2.4900670051574707 +03-01-07-01-02-02-04.wav,0,0.0,2.0,227.4220447099981,2764.4206694754475,0.0497601293027401,2.0,-387.8316345214844,16.06930923461914,-7.248125076293945,12.483139038085938,-6.473812103271484,-4.815539360046387,-13.102639198303223,-6.1870503425598145,-4.199073314666748,-0.08585584908723831,-2.768948554992676,1.7357232570648193,-3.474271535873413 +03-01-07-01-02-02-04.wav,1,2.0,3.8038125,166.5109651612911,2133.1885638863014,0.06347320973873138,1.8038125,-397.43743896484375,39.538761138916016,-13.873971939086914,1.5888334512710571,-7.231462478637695,-6.80908727645874,-10.046553611755371,-4.772775173187256,1.5143826007843018,-7.600419044494629,-8.175728797912598,0.8360610604286194,-1.2492905855178833 +03-01-07-01-02-02-11.wav,0,0.0,2.0,150.64967641565022,2380.65932526893,0.0720701590180397,2.0,-342.5452880859375,46.267822265625,22.186738967895508,6.926543235778809,-6.223309516906738,2.002708673477173,-4.389280319213867,-0.24482636153697968,-10.18549919128418,4.367856502532959,-7.671464443206787,-2.4826772212982178,-5.33331298828125 +03-01-07-01-02-02-11.wav,1,2.0,3.670375,118.72944295535,1655.936627049647,0.03870738297700882,1.670375,-426.3661804199219,91.30435943603516,15.818625450134277,11.745901107788086,0.6390100717544556,7.067781925201416,0.9563366174697876,-7.966103553771973,-5.972777843475342,3.0237631797790527,-3.7563977241516113,-1.2918611764907837,-1.411568284034729 +03-01-07-02-01-01-04.wav,0,0.0,2.0,334.32920146120233,2199.792122331502,0.05545669049024582,2.0,-358.0952453613281,4.769309043884277,-7.08318567276001,7.6854352951049805,-9.327964782714844,-1.420122504234314,-12.494488716125488,-6.113469123840332,-7.7448906898498535,1.0211946964263916,-7.0751142501831055,0.3641333281993866,-4.055054664611816 +03-01-07-02-01-01-04.wav,1,2.0,4.0,188.1520826086529,2112.8792066433452,0.07516069710254669,2.0,-359.9874572753906,36.499568939208984,-10.73851203918457,-1.154361605644226,-12.561651229858398,-3.3726389408111572,-14.562493324279785,-12.69137954711914,-4.740489959716797,-6.11183500289917,-8.321897506713867,-3.3370795249938965,-2.9719839096069336 
+03-01-07-02-01-01-04.wav,2,4.0,4.004,400.0,3054.5468926200383,8.578735105402302e-06,0.004,-883.3241577148438,36.206092834472656,2.927947998046875,23.680801391601562,15.355688095092773,-6.211264133453369,-7.312870025634766,-16.82854461669922,-10.753623008728027,9.138456344604492,7.720074653625488,2.5639572143554688,-8.733549118041992 +03-01-07-02-01-01-11.wav,0,0.0,2.0,160.41959975908802,2499.366914450544,0.07182429730892181,2.0,-319.43524169921875,25.152095794677734,12.429325103759766,13.404263496398926,0.2898772656917572,3.3920395374298096,-1.022460699081421,-3.305320978164673,-5.631554126739502,9.902243614196777,-1.4005733728408813,3.9732160568237305,-0.9615638256072998 +03-01-07-02-01-01-11.wav,1,2.0,3.8038125,179.95081425070435,1436.6659594502296,0.049009423702955246,1.8038125,-377.60693359375,78.90148162841797,10.651004791259766,18.42891502380371,-0.788801908493042,9.935188293457031,-2.3813135623931885,-5.995907306671143,-4.889713287353516,6.837211608886719,-4.134640693664551,-2.528414249420166,0.71917325258255 +03-01-07-02-01-02-04.wav,0,0.0,2.0,286.944613046104,2339.2900468814005,0.034548331052064896,2.0,-396.72821044921875,11.941397666931152,-9.322027206420898,3.3464460372924805,-10.746517181396484,-1.8658294677734375,-13.149420738220215,-11.043389320373535,-9.382475852966309,1.1047439575195312,-8.095297813415527,-1.6922552585601807,0.34706684947013855 +03-01-07-02-01-02-04.wav,1,2.0,4.0,198.97056288506138,2281.4847025433824,0.0870153084397316,2.0,-350.48828125,32.81420135498047,-11.015276908874512,-3.7487220764160156,-11.34693431854248,-6.121781826019287,-10.465276718139648,-11.841336250305176,-5.7660441398620605,-4.703052043914795,-7.411703586578369,-2.9167943000793457,-7.3725762367248535 +03-01-07-02-01-02-04.wav,2,4.0,4.037375,225.0,3069.0586261759736,3.515080607030541e-05,0.037375,-756.1676635742188,37.821197509765625,3.909214496612549,8.59792423248291,2.040383815765381,-0.5880686044692993,-5.664913654327393,3.726027488708496,-1.4813634157180786,-1.6063334941864014,-5.144487380981445,2.1740007400512695,0.38880887627601624 +03-01-07-02-01-02-11.wav,0,0.0,2.0,141.4961997916545,2391.425900946831,0.09233174473047256,2.0,-285.4321594238281,31.700716018676758,13.363629341125488,10.778984069824219,2.0188567638397217,2.215636968612671,0.28604039549827576,-1.0156539678573608,-6.060900688171387,8.111956596374512,-4.661165714263916,2.78678035736084,-0.8366832137107849 +03-01-07-02-01-02-11.wav,1,2.0,3.8038125,138.43667644136144,1433.3861565653397,0.14013469219207764,1.8038125,-270.9975891113281,77.77015686035156,12.5728178024292,24.50947380065918,0.3976019620895386,8.575764656066895,0.5355286002159119,-8.485663414001465,0.8774642944335938,5.859609603881836,-5.013097763061523,-3.2898380756378174,-0.6294165253639221 +03-01-07-02-02-01-04.wav,0,0.0,2.0,282.7618745929363,2676.663673553774,0.06913299858570099,2.0,-358.7235107421875,3.5329501628875732,-9.1003999710083,-0.4325486421585083,-13.508228302001953,-9.524754524230957,-17.928897857666016,-12.558183670043945,-11.644725799560547,-3.734349250793457,-11.964974403381348,-2.7983205318450928,-5.8494133949279785 +03-01-07-02-02-01-04.wav,1,2.0,3.8705625,213.89744465650242,2118.1993698606693,0.08532345294952393,1.8705625,-342.07421875,50.28699493408203,-21.357637405395508,-1.4254662990570068,-9.801515579223633,-8.17960262298584,-20.298402786254883,-22.561208724975586,-1.2991610765457153,-8.0504732131958,-16.041452407836914,0.49738481640815735,-3.4314403533935547 
+03-01-07-02-02-01-11.wav,0,0.0,2.0,135.45890244210747,2225.023152114631,0.10019166767597198,2.0,-307.1332702636719,45.204769134521484,22.918886184692383,2.157177686691284,-4.09177303314209,2.47171950340271,-3.0397143363952637,-6.2445807456970215,-11.82957935333252,7.59118127822876,-4.00540828704834,-0.375591516494751,-0.7072678208351135 +03-01-07-02-02-01-11.wav,1,2.0,3.7370625,104.72517823335932,1546.8070704810068,0.06668157875537872,1.7370625,-363.57720947265625,85.52788543701172,6.908878803253174,11.489130973815918,-0.3763155937194824,9.033109664916992,-2.299283981323242,-7.364531993865967,-5.239547252655029,2.5838770866394043,-2.015160322189331,-3.738489866256714,0.13396775722503662 +03-01-07-02-02-02-04.wav,0,0.0,2.0,322.1673872767562,1973.3167676885007,0.06615860015153885,2.0,-369.9344787597656,11.475577354431152,-4.6091485023498535,-2.347909927368164,-15.27956771850586,-6.255988121032715,-20.157426834106445,-13.305789947509766,-12.274813652038574,-6.994823932647705,-13.310661315917969,-2.0259933471679688,-10.679781913757324 +03-01-07-02-02-02-04.wav,1,2.0,3.9373125,237.40529424616736,2301.0041340339103,0.06293591856956482,1.9373125,-371.55462646484375,42.72618865966797,-20.09590721130371,-10.317861557006836,-13.031425476074219,-12.705349922180176,-20.755186080932617,-18.14283561706543,-3.3179383277893066,-6.988387107849121,-13.327107429504395,-2.6493687629699707,-8.830869674682617 +03-01-07-02-02-02-11.wav,0,0.0,2.0,140.32001442382008,2344.7764882584197,0.09590615332126617,2.0,-278.77337646484375,33.153770446777344,10.932390213012695,2.6835200786590576,-6.6745123863220215,1.133581280708313,-5.875377655029297,-3.851161241531372,-7.1721014976501465,3.8273983001708984,-4.185055732727051,-3.297926902770996,-2.419076681137085 +03-01-07-02-02-02-11.wav,1,2.0,3.7370625,136.4488501278932,1524.9613091047306,0.043871376663446426,1.7370625,-398.50927734375,84.99547576904297,13.187294006347656,12.426745414733887,-0.05753038451075554,6.587801933288574,0.6113552451133728,-5.902890682220459,0.1550912857055664,3.1020028591156006,-4.531911849975586,-4.930241584777832,-1.776760220527649 +03-01-08-01-01-01-04.wav,0,0.0,2.0,256.52002753250787,2474.3123708139437,0.09098857641220093,2.0,-310.8125305175781,30.896995544433594,-18.502592086791992,3.753711462020874,-20.244592666625977,-7.865196228027344,-21.5999755859375,-14.344779014587402,-12.210617065429688,-7.041738986968994,-12.969225883483887,-3.4095852375030518,-2.1011545658111572 +03-01-08-01-01-01-04.wav,1,2.0,3.236625,142.03549501706343,2437.4516593120075,0.03771942853927612,1.236625,-457.6457214355469,42.63995361328125,-7.222878456115723,-0.10107184946537018,-4.49341344833374,-4.186634063720703,-6.280453205108643,-6.555435657501221,-0.26522213220596313,-2.8939738273620605,-9.332796096801758,-4.845641613006592,-7.760766506195068 +03-01-08-01-01-01-11.wav,0,0.0,2.0,239.7903505601707,2093.4438604567567,0.08640586584806442,2.0,-323.29559326171875,41.51077651977539,5.524759769439697,6.658907413482666,-2.1045351028442383,1.8369603157043457,-6.520719051361084,-4.827006816864014,-7.815563201904297,5.552852153778076,-1.178594708442688,-1.9283888339996338,-0.9695978760719299 +03-01-08-01-01-01-11.wav,1,2.0,3.06975,80.34753957734448,1752.3937229752814,0.0058084153570234776,1.06975,-547.6866455078125,75.72825622558594,26.685762405395508,16.99043083190918,15.829490661621094,14.57465648651123,8.32219123840332,4.678691864013672,7.091757774353027,9.179476737976074,-0.2822272777557373,5.481870174407959,5.1575236320495605 
+03-01-08-01-01-02-04.wav,0,0.0,2.0,306.08795945073115,2096.6429771589055,0.08579803258180618,2.0,-329.4913330078125,24.00587272644043,-17.08820152282715,5.160009860992432,-23.577667236328125,-4.516942501068115,-23.843400955200195,-14.114188194274902,-9.904322624206543,-9.400261878967285,-12.429073333740234,-4.4987616539001465,0.5132216215133667 +03-01-08-01-01-02-04.wav,1,2.0,3.3366875,141.66441889381198,2330.4077829252888,0.05521271750330925,1.3366875,-433.8041687011719,43.128475189208984,-11.735908508300781,-1.16703200340271,-7.898702621459961,-6.359818935394287,-9.613377571105957,-9.69340705871582,-1.9234108924865723,-2.19419527053833,-11.29611873626709,-4.112993240356445,-8.13676643371582 +03-01-08-01-01-02-11.wav,0,0.0,2.0,282.93954354858,2143.47303818252,0.0676179751753807,2.0,-346.5728759765625,38.00560760498047,0.45380210876464844,2.156839609146118,-3.0179758071899414,-2.4584155082702637,-10.662857055664062,-4.680314540863037,-10.812180519104004,0.712113618850708,-4.112285614013672,-3.8880133628845215,-2.2064802646636963 +03-01-08-01-01-02-11.wav,1,2.0,3.1365,111.5583025989282,1752.5259754156025,0.014926635660231113,1.1365,-531.5885009765625,76.78509521484375,20.840152740478516,6.08271598815918,8.234146118164062,6.981453895568848,8.98116683959961,1.8801953792572021,3.08819842338562,3.356051206588745,0.3431189954280853,-0.4595314562320709,3.6251161098480225 +03-01-08-01-02-01-04.wav,0,0.0,2.0,290.3029836812309,2093.1201532626037,0.08806238323450089,2.0,-323.4251708984375,15.686527252197266,-13.548895835876465,-4.732872009277344,-23.313554763793945,-10.061650276184082,-25.655485153198242,-15.07900619506836,-11.296182632446289,-9.760794639587402,-13.230636596679688,-3.9917023181915283,-2.284627676010132 +03-01-08-01-02-01-04.wav,1,2.0,3.637,131.24938831693447,2057.461500957642,0.03734976053237915,1.637,-446.66973876953125,70.02312469482422,-6.4602155685424805,7.753377437591553,-6.6042585372924805,-4.649283409118652,-10.776281356811523,-6.9947309494018555,-1.8211899995803833,-6.499031066894531,-11.28720474243164,-3.7328147888183594,-4.423350811004639 +03-01-08-01-02-01-11.wav,0,0.0,2.0,173.13667291021844,2070.6509672213306,0.0813944935798645,2.0,-322.0059509277344,36.08997344970703,2.2631406784057617,7.769818305969238,-1.2400180101394653,4.00009822845459,-4.686900615692139,-7.7083635330200195,-9.622976303100586,5.80183219909668,-3.5160460472106934,-1.8388134241104126,-5.328710556030273 +03-01-08-01-02-01-11.wav,1,2.0,3.169875,147.16653164334815,1468.001491540294,0.022823991253972054,1.169875,-494.13739013671875,63.39122009277344,20.90901756286621,8.914956092834473,4.511730194091797,4.488795757293701,8.391698837280273,-1.4985896348953247,0.7449954152107239,0.4074539244174957,-0.6359730362892151,1.0144641399383545,3.0157687664031982 +03-01-08-01-02-02-04.wav,0,0.0,2.0,284.0261167145382,1896.8805826781738,0.07600099593400955,2.0,-338.5450134277344,23.85748863220215,-20.99330711364746,-4.036044120788574,-24.61848258972168,-8.088560104370117,-28.891014099121094,-16.395793914794922,-9.982800483703613,-8.822924613952637,-15.280698776245117,-3.4844162464141846,-0.6002260446548462 +03-01-08-01-02-02-04.wav,1,2.0,3.43675,134.80837749980967,2301.449992243505,0.043798211961984634,1.43675,-444.3231201171875,46.866085052490234,-11.597003936767578,-0.4340647757053375,-11.912421226501465,-3.918531656265259,-10.486879348754883,-11.364969253540039,-0.3440147936344147,-7.013936996459961,-12.557909965515137,-6.294259071350098,-4.45320463180542 
+03-01-08-01-02-02-11.wav,0,0.0,2.0,248.2825989998704,2212.5223273204774,0.09713932126760483,2.0,-269.5213928222656,48.44368362426758,-6.511382102966309,-0.7030473947525024,-4.540550231933594,-1.5327768325805664,-15.321633338928223,-18.83152961730957,-11.376692771911621,2.491663932800293,-6.839995861053467,-2.2637650966644287,-7.069101333618164 +03-01-08-01-02-02-11.wav,1,2.0,2.969625,99.43125757763195,1816.4702149801285,0.0021653773728758097,0.969625,-579.3497924804688,73.63827514648438,16.107131958007812,18.65012550354004,11.909124374389648,14.176650047302246,7.396320819854736,-0.552416980266571,-0.6183729767799377,12.198795318603516,0.5836794376373291,2.536336660385132,1.229585886001587 +03-01-08-02-01-01-04.wav,0,0.0,2.0,325.3446224185826,1725.9116114719677,0.057467687875032425,2.0,-351.398193359375,13.631999015808105,-19.420581817626953,2.427349328994751,-21.161088943481445,-0.9016553163528442,-19.07534408569336,-13.427007675170898,-7.055447101593018,-4.060153484344482,-7.301610946655273,5.794939041137695,2.6429171562194824 +03-01-08-02-01-01-04.wav,1,2.0,3.670375,192.98369625272755,2269.8391005176595,0.056082721799612045,1.670375,-415.9051513671875,32.130943298339844,-24.568180084228516,-6.9300456047058105,-18.927825927734375,-6.243340492248535,-17.20754623413086,-11.31255054473877,1.5010075569152832,-6.49379301071167,-4.633881568908691,-0.43302980065345764,1.079548716545105 +03-01-08-02-01-01-11.wav,0,0.0,2.0,243.66287359252036,2188.9835980239095,0.09627990424633026,2.0,-300.6611633300781,33.11962127685547,-1.6057499647140503,5.572286128997803,-5.606652736663818,-2.887505531311035,-9.708866119384766,-8.635379791259766,-10.28033447265625,0.43761733174324036,-3.41400146484375,-5.017430305480957,-4.497383117675781 +03-01-08-02-01-01-11.wav,1,2.0,3.06975,69.4661759397575,1522.473202317346,0.0070076449774205685,1.06975,-565.2642822265625,81.98078155517578,21.857707977294922,15.207307815551758,9.250645637512207,10.726428985595703,9.589344024658203,5.985095024108887,8.153105735778809,9.571640968322754,3.970273971557617,2.091641902923584,3.4034924507141113 +03-01-08-02-01-02-04.wav,0,0.0,2.0,322.3728617657793,2087.2725347413507,0.07361870259046555,2.0,-333.6669616699219,11.343953132629395,-16.803462982177734,0.3729467988014221,-22.15609359741211,-7.360009670257568,-26.35776710510254,-10.101801872253418,-1.0004398822784424,-0.8266094326972961,-1.9225088357925415,5.046638488769531,0.32877251505851746 +03-01-08-02-01-02-04.wav,1,2.0,3.57025,177.29895579392604,2416.466435397728,0.07515338808298111,1.57025,-358.8617248535156,27.478925704956055,-20.71054458618164,-8.193100929260254,-12.891867637634277,-6.562589168548584,-15.139906883239746,-11.134721755981445,-3.5103538036346436,-5.0408735275268555,-7.340667724609375,-1.3712949752807617,-1.2326964139938354 +03-01-08-02-01-02-11.wav,0,0.0,2.0,219.07852523671403,2367.9740100791614,0.08375223726034164,2.0,-273.0134582519531,26.857240676879883,-2.16780424118042,3.1224985122680664,-6.424581050872803,-1.773982286453247,-7.750455856323242,-7.901342391967773,-11.66847038269043,4.605861663818359,-7.012662887573242,-5.6356024742126465,-5.6429572105407715 +03-01-08-02-01-02-11.wav,1,2.0,3.103125,117.10468225548783,1602.2323399813722,0.01852160133421421,1.103125,-507.36639404296875,58.37172317504883,16.52630615234375,5.823297500610352,6.569292068481445,4.704503059387207,9.012459754943848,0.5807802081108093,1.4731007814407349,2.3423776626586914,0.0651915967464447,1.229411005973816,-0.47389939427375793 
+03-01-08-02-02-01-04.wav,0,0.0,2.0,314.62076759451014,1861.9825502186713,0.09593053162097931,2.0,-303.8385925292969,16.900964736938477,-15.522872924804688,-0.3669223487377167,-21.24480438232422,-7.596333980560303,-22.72417449951172,-9.40095329284668,-7.062468528747559,-6.678309440612793,-4.641213893890381,3.293290615081787,0.04352813959121704 +03-01-08-02-02-01-04.wav,1,2.0,3.470125,144.07010458118563,2202.547740738081,0.06558921188116074,1.470125,-380.51495361328125,35.1405143737793,-18.835390090942383,-3.858624219894409,-10.322066307067871,-7.607975959777832,-12.72431755065918,-4.3813371658325195,-3.709280490875244,-3.38680100440979,-7.093520641326904,-1.2667877674102783,0.5732070207595825 +03-01-08-02-02-01-11.wav,0,0.0,2.0,184.48971934303495,2344.93871513892,0.10316289961338043,2.0,-290.6114807128906,28.574459075927734,1.1512490510940552,2.8620810508728027,-5.342883586883545,-2.134117841720581,-9.9960355758667,-9.647096633911133,-10.79529857635498,2.80367374420166,-6.116196632385254,-4.874912738800049,-3.5786755084991455 +03-01-08-02-02-01-11.wav,1,2.0,3.06975,128.67487623047705,1625.8694687704267,0.016520733013749123,1.06975,-525.0234985351562,67.99554443359375,16.374744415283203,6.861120223999023,10.207650184631348,5.608453273773193,10.158432960510254,0.9121835231781006,0.5120887160301208,3.411001682281494,1.8272567987442017,2.853531837463379,5.073111534118652 +03-01-08-02-02-02-04.wav,0,0.0,2.0,333.6480350919308,1720.0137953938668,0.07594458758831024,2.0,-358.2447814941406,17.0312557220459,-16.271696090698242,-1.3339849710464478,-23.42181968688965,-8.194574356079102,-26.539106369018555,-14.154330253601074,-11.60094928741455,-10.724595069885254,-7.926241397857666,1.6645560264587402,-2.0727477073669434 +03-01-08-02-02-02-04.wav,1,2.0,3.536875,170.00461134502976,2377.6190922522605,0.048754580318927765,1.536875,-410.58367919921875,28.646774291992188,-18.027633666992188,-10.329991340637207,-13.569453239440918,-6.895887851715088,-20.322832107543945,-5.555948734283447,-4.587707996368408,-6.388387203216553,-11.483903884887695,-4.106753826141357,-1.3468512296676636 +03-01-08-02-02-02-11.wav,0,0.0,2.0,225.89891000364824,2590.5941343993654,0.075254887342453,2.0,-303.6177673339844,18.640228271484375,2.4817516803741455,3.5802488327026367,-4.315466403961182,-4.253835678100586,-8.625144958496094,-10.31455135345459,-10.85571002960205,1.922488808631897,-8.430068969726562,-5.139867305755615,-2.735318899154663 +03-01-08-02-02-02-11.wav,1,2.0,3.06975,92.22930960369851,1697.8733993415622,0.010263322852551937,1.06975,-584.2264404296875,75.87106323242188,24.280128479003906,9.574267387390137,8.17524528503418,6.938128471374512,11.510385513305664,1.6770517826080322,4.933045387268066,3.026773691177368,-1.4833271503448486,1.5095634460449219,3.909019947052002 +input2.wav,0,0.0,2.0,318.0277662697333,2136.425943997111,0.10887834429740906,2.0,-339.5455017089844,6.151329040527344,-15.222455978393555,6.990254878997803,-22.399429321289062,-8.330525398254395,-25.271900177001953,-10.125433921813965,-5.788905620574951,-5.094250202178955,-14.859763145446777,-3.446547508239746,-7.320960521697998 +input2.wav,1,2.0,4.0,312.6176591088032,1896.3881279094917,0.16863708198070526,2.0,-288.9301452636719,39.44871520996094,-31.610321044921875,5.158578395843506,-32.230377197265625,-1.2366830110549927,-25.1152286529541,-14.537945747375488,-3.5065839290618896,-11.606106758117676,-15.485522270202637,-10.376522064208984,-6.404701232910156 
+input2.wav,2,4.0,4.3710625,135.20904295781332,2980.430798169887,0.0001156322832684964,0.3710625,-661.8627319335938,40.94028854370117,1.590044617652893,5.361208438873291,-5.706419467926025,7.078088760375977,-2.7384145259857178,1.3892229795455933,-6.1543869972229,-2.8032751083374023,-6.853562831878662,0.6416921019554138,-3.8219070434570312 +processed_audio.wav,0,0.0,2.0,313.9114402278174,2047.8974265463457,0.09593942016363144,2.0,-303.81964111328125,16.920330047607422,-15.501830101013184,-0.3510083854198456,-21.223356246948242,-7.578132629394531,-22.70688819885254,-9.38455581665039,-7.043789863586426,-6.661175727844238,-4.630411148071289,3.308720827102661,0.06195393204689026 +processed_audio.wav,1,2.0,3.470125,147.51936907513718,2524.7456981241407,0.06560914218425751,1.470125,-380.46307373046875,35.207847595214844,-18.770246505737305,-3.7890093326568604,-10.251666069030762,-7.549962520599365,-12.66173267364502,-4.312536716461182,-3.6539535522460938,-3.3128247261047363,-7.0255584716796875,-1.2007673978805542,0.6314259171485901 +sample_audio.wav,0,0.0,2.0,314.62076759451014,1861.9825502186713,0.09593053162097931,2.0,-303.8385925292969,16.900964736938477,-15.522872924804688,-0.3669223487377167,-21.24480438232422,-7.596333980560303,-22.72417449951172,-9.40095329284668,-7.062468528747559,-6.678309440612793,-4.641213893890381,3.293290615081787,0.04352813959121704 +sample_audio.wav,1,2.0,3.470125,144.07010458118563,2202.547740738081,0.06558921188116074,1.470125,-380.51495361328125,35.1405143737793,-18.835390090942383,-3.858624219894409,-10.322066307067871,-7.607975959777832,-12.72431755065918,-4.3813371658325195,-3.709280490875244,-3.38680100440979,-7.093520641326904,-1.2667877674102783,0.5732070207595825 diff --git a/narrative-audio-system/examples/transcripts.txt b/narrative-audio-system/examples/transcripts.txt new file mode 100644 index 0000000..09426d6 --- /dev/null +++ b/narrative-audio-system/examples/transcripts.txt @@ -0,0 +1,10 @@ +03-01-01-01-01-01-01.wav Kids are talking by the door. +03-01-01-01-01-01-04.wav Kids are talking by the door. +03-01-01-01-01-01-11.wav Kids are talking by the door. +03-01-01-01-01-02-01.wav Kids are talking by the door. +03-01-01-01-01-02-04.wav Kids are talking by the door. +03-01-01-01-01-02-11.wav Kids are talking by the door. +03-01-01-01-02-01-01.wav Dogs are sitting by the door. +03-01-01-01-02-01-04.wav Dogs are sitting by the door. +03-01-01-01-02-01-11.wav Dogs are sitting by the door. +03-01-01-01-02-02-01.wav Dogs are sitting by the door. diff --git a/narrative-audio-system/output_generator/atmosphere.py b/narrative-audio-system/output_generator/atmosphere.py new file mode 100644 index 0000000..62c76b7 --- /dev/null +++ b/narrative-audio-system/output_generator/atmosphere.py @@ -0,0 +1,332 @@ +""" +Track B — Atmospheric Audio Suggestion +======================================== +Maps the current emotion label to an ambient soundscape query, runs it +through the Task 4 retrieval system (or a curated fallback library), +and schedules a smooth crossfade to the new atmosphere. 
+
+Signal flow
+-----------
+    EmotionResult.label ("tense")
+          |
+          v
+    AtmosphereMapper  ->  query string: "tense forest night ambience"
+          |
+          v
+    RetrievalBridge   ->  ranked audio clips from Task 4 index
+          |
+          v
+    CrossfadeSchedule ->  {clip, fade_in_s=2.0, lag_s=6.0}
+          |
+          v
+    broadcast via WebSocket  {"type": "atmosphere", "query": ..., "clip": ...}
+
+Lag and crossfade
+-----------------
+    A 5–8 s lag between speech detection and atmosphere change is intentional —
+    it feels natural to an audience, matching how a film score would respond
+    to dialogue. The crossfade itself is 2 s by default.
+
+Usage (library)
+---------------
+    from output_generator.atmosphere import AtmosphereMapper, CrossfadeScheduler
+
+    mapper = AtmosphereMapper()
+    scheduler = CrossfadeScheduler(lag_s=6.0, fade_s=2.0)
+
+    query = mapper.query_for("tense")              # inspect the label->query mapping
+    schedule = scheduler.schedule(emotion_result)  # scheduler builds its query internally
+    print(schedule)
+"""
+
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional
+
+# ---------------------------------------------------------------------------
+# Tone -> ambient query mapping
+# ---------------------------------------------------------------------------
+
+# Primary queries: chosen to work well with Task 4 semantic ranking
+TONE_QUERIES: Dict[str, str] = {
+    "calm": "calm gentle wind soft water ambient",
+    "neutral": "neutral quiet indoor room tone",
+    "happy": "upbeat bright cheerful outdoor birds",
+    "sad": "sad melancholy quiet rain distant",
+    "angry": "urgent high-energy dramatic intense",
+    "fearful": "tense dark forest night ambient",
+    "tense": "tense forest night ambience suspense",
+    "disgust": "dark low rumble ominous underground",
+    "surprised": "sudden bright stab high-energy reveal",
+}
+
+# Curated fallback descriptions when no retrieval index is available
+FALLBACK_AMBIENT: Dict[str, dict] = {
+    "calm": {"description": "gentle wind, soft water", "bpm": None, "energy": "low"},
+    "neutral": {"description": "quiet room tone, light hum", "bpm": None, "energy": "low"},
+    "happy": {"description": "birdsong, light breeze", "bpm": 110, "energy": "medium"},
+    "sad": {"description": "distant rain, sparse piano", "bpm": 60, "energy": "low"},
+    "angry": {"description": "driving percussion, wind", "bpm": 140, "energy": "high"},
+    "fearful": {"description": "dark forest, distant owl, creak", "bpm": None, "energy": "medium"},
+    "tense": {"description": "tense forest night, branch snap", "bpm": None, "energy": "medium"},
+    "disgust": {"description": "low rumble, dripping, echo", "bpm": None, "energy": "low"},
+    "surprised": {"description": "bright orchestral stab, rush", "bpm": 120, "energy": "high"},
+}
+
+
+class AtmosphereMapper:
+    """
+    Maps an emotion label to an ambient soundscape query string.
+
+    Parameters
+    ----------
+    custom_queries : dict | None
+        Override default label->query mappings.
+ verbose : bool + """ + + def __init__(self, custom_queries: Optional[Dict[str, str]] = None, + verbose: bool = True): + self._queries = {**TONE_QUERIES} + if custom_queries: + self._queries.update(custom_queries) + self.verbose = verbose + + def query_for(self, label: str) -> str: + """Return the retrieval query string for an emotion label.""" + query = self._queries.get(label.lower(), f"{label.lower()} ambient atmosphere") + if self.verbose: + print(f"[Atmosphere] label={label!r} query={query!r}") + return query + + def fallback_for(self, label: str) -> dict: + """Return a curated fallback description when no retrieval index exists.""" + return FALLBACK_AMBIENT.get(label.lower(), + {"description": f"{label} ambient", "energy": "low"}) + + +# --------------------------------------------------------------------------- +# Retrieval bridge — connects to Task 4 +# --------------------------------------------------------------------------- + +class RetrievalBridge: + """ + Wraps the Task 4 retrieval system (retrieval_prototype.py) so Track B + can query the existing RAVDESS-based index. + + Falls back gracefully to curated descriptions when the index is not built. + + Parameters + ---------- + features_csv : str | None + Path to task1_features_dataset.csv + emotion_labels_json : str | None + Path to emotion_labels.json + top_k : int + Number of candidates to retrieve per query. + """ + + def __init__( + self, + features_csv: Optional[str] = None, + emotion_labels_json: Optional[str] = None, + top_k: int = 3, + verbose: bool = True, + ): + self.top_k = top_k + self.verbose = verbose + self._index = None + self._search_fn = None + self._print_fn = None + + root = Path(__file__).resolve().parent.parent + csv = Path(features_csv) if features_csv else root / "examples" / "task1_features_dataset.csv" + json_path = Path(emotion_labels_json) if emotion_labels_json else root / "examples" / "emotion_labels.json" + + self._try_load_index(csv, json_path) + + def _try_load_index(self, csv: Path, json_path: Path) -> None: + if not csv.exists(): + if self.verbose: + print(f"[Atmosphere] Retrieval index not found at {csv}; using fallback descriptions.") + return + try: + import sys + p = str(csv.parent.parent / "task4_audio_retrieval") + if p not in sys.path: + sys.path.insert(0, p) + from retrieval_prototype import build_index, search, print_results + self._index = build_index( + features_csv=str(csv), + emotion_labels_json=str(json_path), + ) + self._search_fn = search + self._print_fn = print_results + if self.verbose: + print(f"[Atmosphere] Retrieval index loaded ({len(self._index)} records)") + except Exception as exc: + if self.verbose: + print(f"[Atmosphere] Could not load retrieval index ({exc}); using fallback.") + + def search(self, query: str) -> List[dict]: + """ + Run a semantic query against the Task 4 index. + Returns a list of result dicts, or an empty list on fallback. + """ + if self._search_fn is None or self._index is None: + return [] + try: + return self._search_fn(query, self._index, top_k=self.top_k) + except Exception: + return [] + + @property + def available(self) -> bool: + return self._index is not None + + +# --------------------------------------------------------------------------- +# Crossfade schedule +# --------------------------------------------------------------------------- + +@dataclass +class CrossfadeSchedule: + """ + Instruction for the audio engine to crossfade to a new atmosphere. 
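+
+    Serialised via to_dict() for the WebSocket broadcast, a schedule looks
+    like, for example (clip and description depend on the retrieval index):
+
+        {"type": "atmosphere", "emotion_label": "tense",
+         "query": "tense forest night ambience suspense",
+         "suggested_clip": "...", "suggested_description": "...",
+         "fade_in_s": 2.0, "lag_s": 6.0}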
+
+    Attributes
+    ----------
+    emotion_label : str
+        The emotion that triggered this atmosphere change.
+    query : str
+        The search query used.
+    suggested_clip : str
+        Filename or description of the suggested ambient clip.
+    suggested_description : str
+        Human-readable description of the soundscape.
+    fade_in_s : float
+        Crossfade duration in seconds.
+    lag_s : float
+        Delay before the crossfade begins (natural dramatic lag).
+    scheduled_at : float
+        Monotonic time.perf_counter() timestamp taken when this was created.
+    retrieval_results : list
+        Raw retrieval results (may be empty if using fallback).
+    """
+    emotion_label: str
+    query: str
+    suggested_clip: str
+    suggested_description: str
+    fade_in_s: float = 2.0
+    lag_s: float = 6.0
+    scheduled_at: float = field(default_factory=time.perf_counter)
+    retrieval_results: List[dict] = field(default_factory=list)
+
+    def to_dict(self) -> dict:
+        return {
+            "type": "atmosphere",
+            "emotion_label": self.emotion_label,
+            "query": self.query,
+            "suggested_clip": self.suggested_clip,
+            "suggested_description": self.suggested_description,
+            "fade_in_s": self.fade_in_s,
+            "lag_s": self.lag_s,
+        }
+
+    def __repr__(self) -> str:
+        return (
+            f"CrossfadeSchedule(emotion={self.emotion_label!r}, "
+            f"clip={self.suggested_clip!r}, "
+            f"fade={self.fade_in_s}s, lag={self.lag_s}s)"
+        )
+
+
+class CrossfadeScheduler:
+    """
+    Decides whether to issue a new atmosphere change and builds the schedule.
+
+    Avoids spamming changes by enforcing a minimum time between switches
+    (cooldown_s). Only issues a new schedule when the emotion label changes
+    from the previous one.
+
+    Parameters
+    ----------
+    lag_s : float
+        Delay before crossfade begins (seconds).
+    fade_s : float
+        Crossfade duration (seconds).
+    cooldown_s : float
+        Minimum interval between atmosphere changes.
+    verbose : bool
+    """
+
+    def __init__(
+        self,
+        lag_s: float = 6.0,
+        fade_s: float = 2.0,
+        cooldown_s: float = 8.0,
+        verbose: bool = True,
+    ):
+        self.lag_s = lag_s
+        self.fade_s = fade_s
+        self.cooldown_s = cooldown_s
+        self.verbose = verbose
+        self._last_label: Optional[str] = None
+        self._last_change_time: float = float("-inf")  # so the first schedule is never in cooldown
+        self._mapper = AtmosphereMapper(verbose=False)
+        self._bridge = RetrievalBridge(verbose=verbose)
+
+    def schedule(self, emotion_result) -> Optional[CrossfadeSchedule]:
+        """
+        Evaluate whether an atmosphere change is warranted for this emotion.
+
+        Returns a CrossfadeSchedule if a change should happen, else None.
+
+        Parameters
+        ----------
+        emotion_result : EmotionResult
+        """
+        label = emotion_result.label.lower()
+        now = time.perf_counter()
+
+        # Suppress change if same emotion or cooldown not elapsed
+        same_label = (label == self._last_label)
+        in_cooldown = (now - self._last_change_time) < self.cooldown_s
+
+        if same_label or in_cooldown:
+            if self.verbose:
+                print(f"[Atmosphere] No change (same={same_label}, cooldown={in_cooldown})")
+            return None
+
+        query = self._mapper.query_for(label)
+        results = self._bridge.search(query)
+
+        if results:
+            top = results[0]
+            clip = top.get("filename", "unknown")
+            description = top.get("description", query)
+        else:
+            fallback = self._mapper.fallback_for(label)
+            clip = fallback["description"]
+            description = fallback["description"]
+
+        schedule = CrossfadeSchedule(
+            emotion_label=label,
+            query=query,
+            suggested_clip=clip,
+            suggested_description=description,
+            fade_in_s=self.fade_s,
+            lag_s=self.lag_s,
+            retrieval_results=results,
+        )
+
+        self._last_label = label
+        self._last_change_time = now
+
+        if self.verbose:
+            print(
+                f"[Atmosphere] Schedule -> {description!r} "
+                f"(lag={self.lag_s}s, fade={self.fade_s}s)"
+            )
+        return schedule
diff --git a/narrative-audio-system/output_generator/captions.py b/narrative-audio-system/output_generator/captions.py
new file mode 100644
index 0000000..fb6fab1
--- /dev/null
+++ b/narrative-audio-system/output_generator/captions.py
@@ -0,0 +1,336 @@
+"""
+Track A — Accessibility Captions
+==================================
+Formats (TranscriptionResult, EmotionResult) pairs into annotated caption
+lines and distributes them over multiple channels simultaneously:
+
+    stdout    — always active; plain-text fallback
+    WebSocket — async server on ws://localhost:8765; browser overlays /
+                OBS browser source connect here
+    SRT file  — standard subtitle file for post-production / archiving
+
+Caption format
+--------------
+    [calm]  "The forest was quiet that night."
+    [tense] "Until the branch snapped."
+
+Each caption is also serialised as JSON over the WebSocket so the browser
+frontend can style tone labels with CSS classes:
+
+    {
+        "type": "caption",
+        "label": "tense",
+        "text": "Until the branch snapped.",
+        "start": 4.12,
+        "end": 5.88,
+        "color": "#e05c5c"
+    }
+
+Usage (library)
+---------------
+    from output_generator.captions import CaptionFormatter, SRTWriter
+
+    fmt = CaptionFormatter()
+    line = fmt.format(transcript_result, emotion_result)
+    print(line.render())  # "[calm] The forest was quiet..."
+
+    writer = SRTWriter("output.srt")
+    writer.write(line)
+
+WebSocket server (async)
+------------------------
+    import asyncio
+    from output_generator.captions import CaptionBroadcaster
+
+    async def main():
+        server = CaptionBroadcaster(host="localhost", port=8765)
+        await server.start()
+        # push captions from another coroutine:
+        await server.send_caption(line)
+
+    asyncio.run(main())
+"""
+
+import asyncio
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+
+# ---------------------------------------------------------------------------
+# Tone colour palette — maps emotion labels to hex colours for the overlay
+# ---------------------------------------------------------------------------
+
+TONE_COLOURS: Dict[str, str] = {
+    "neutral": "#a8b4c0",
+    "calm": "#7ec8a0",
+    "happy": "#f7c948",
+    "sad": "#6b9bcf",
+    "angry": "#e05c5c",
+    "fearful": "#c07ecf",
+    "disgust": "#8a9e6b",
+    "surprised": "#f0a045",
+    "tense": "#e07a5f",  # alias used in narrative context
+    "unknown": "#ffffff",
+}
+
+
+def _tone_colour(label: str) -> str:
+    return TONE_COLOURS.get(label.lower(), TONE_COLOURS["unknown"])
+
+
+# ---------------------------------------------------------------------------
+# Data class
+# ---------------------------------------------------------------------------
+
+@dataclass
+class CaptionLine:
+    """
+    One annotated caption unit, ready for display.
+
+    Attributes
+    ----------
+    label : str
+        Emotion / tone label (e.g. "calm", "tense").
+    text : str
+        Transcribed utterance text.
+    start : float
+        Utterance start time in seconds.
+    end : float
+        Utterance end time in seconds.
+    confidence : float
+        Emotion classifier confidence (0-1).
+    color : str
+        Hex colour string for the tone label.
+    index : int
+        Sequential caption index (1-based, for SRT).
+    """
+    label: str
+    text: str
+    start: float
+    end: float
+    confidence: float = 0.0
+    color: str = "#ffffff"
+    index: int = 1
+
+    @property
+    def duration(self) -> float:
+        return self.end - self.start
+
+    def render(self, width: int = 10) -> str:
+        """Plain-text caption line, e.g. '[calm] "The forest was quiet."'"""
+        tag = f"[{self.label}]"
+        return f"{tag:<{width}} \"{self.text}\""
+
+    def to_dict(self) -> dict:
+        return {
+            "type": "caption",
+            "index": self.index,
+            "label": self.label,
+            "text": self.text,
+            "start": round(self.start, 3),
+            "end": round(self.end, 3),
+            "confidence": round(self.confidence, 3),
+            "color": self.color,
+        }
+
+    def to_srt_block(self) -> str:
+        """Return an SRT-formatted block for this caption."""
+        def _ts(s: float) -> str:
+            h = int(s // 3600)
+            m = int((s % 3600) // 60)
+            sec = s % 60
+            return f"{h:02d}:{m:02d}:{sec:06.3f}".replace(".", ",")
+
+        return (
+            f"{self.index}\n"
+            f"{_ts(self.start)} --> {_ts(self.end)}\n"
+            f"[{self.label}] {self.text}\n"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Formatter
+# ---------------------------------------------------------------------------
+
+class CaptionFormatter:
+    """
+    Combines a TranscriptionResult and EmotionResult into a CaptionLine.
+
+    Parameters
+    ----------
+    min_text_length : int
+        Captions shorter than this (characters) are skipped. Filters out
+        noise hallucinations from Whisper ("...", single punctuation, etc.)
+    verbose : bool
+        Print each formatted caption to stdout.
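+
+    Example
+    -------
+    A minimal sketch; transcript and emotion stand in for the Step 4/5
+    result objects (only .text, .start, .end, .label and .confidence
+    are assumed):
+
+        fmt = CaptionFormatter(verbose=False)
+        line = fmt.format(transcript, emotion)
+        if line is not None:          # None when the text was too short
+            print(line.render())      # [tense]    "Until the branch snapped."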
+    """
+
+    def __init__(self, min_text_length: int = 2, verbose: bool = True):
+        self.min_text_length = min_text_length
+        self.verbose = verbose
+        self._index = 0
+
+    def format(self, transcript, emotion) -> Optional[CaptionLine]:
+        """
+        Build a CaptionLine from Step 4 + Step 5 results.
+
+        Parameters
+        ----------
+        transcript : TranscriptionResult
+        emotion : EmotionResult
+
+        Returns
+        -------
+        CaptionLine | None (None if text is too short to display)
+        """
+        text = transcript.text.strip()
+        if len(text) < self.min_text_length:
+            return None
+
+        self._index += 1
+        line = CaptionLine(
+            label=emotion.label,
+            text=text,
+            start=transcript.start,
+            end=transcript.end,
+            confidence=emotion.confidence,
+            color=_tone_colour(emotion.label),
+            index=self._index,
+        )
+
+        if self.verbose:
+            print(f"[Caption] {line.render()}")
+
+        return line
+
+    def reset_index(self) -> None:
+        self._index = 0
+
+
+# ---------------------------------------------------------------------------
+# SRT writer
+# ---------------------------------------------------------------------------
+
+class SRTWriter:
+    """
+    Appends CaptionLine objects to an SRT subtitle file.
+
+    Parameters
+    ----------
+    path : str | Path
+        Output file path. The file is created (and any existing content
+        cleared) when the writer is constructed; captions are then appended
+        one at a time.
+    verbose : bool
+    """
+
+    def __init__(self, path: str = "output.srt", verbose: bool = True):
+        self.path = Path(path)
+        self.verbose = verbose
+        self._count = 0
+        # Clear existing file on open
+        self.path.write_text("", encoding="utf-8")
+
+    def write(self, line: CaptionLine) -> None:
+        with self.path.open("a", encoding="utf-8") as f:
+            f.write(line.to_srt_block() + "\n")
+        self._count += 1
+        if self.verbose:
+            print(f"[SRT] Wrote caption {line.index} -> {self.path}")
+
+    def write_all(self, lines: List[CaptionLine]) -> None:
+        for line in lines:
+            if line is not None:
+                self.write(line)
+
+    @property
+    def count(self) -> int:
+        return self._count
+
+
+# ---------------------------------------------------------------------------
+# WebSocket broadcaster (async)
+# ---------------------------------------------------------------------------
+
+class CaptionBroadcaster:
+    """
+    Async WebSocket server that broadcasts caption JSON to all connected
+    clients. Browser overlays and OBS browser sources connect to
+    ws://localhost:<port>.
+
+    Parameters
+    ----------
+    host : str
+    port : int
+    verbose : bool
+
+    Example
+    -------
+    Browser JS:
+        const ws = new WebSocket("ws://localhost:8765");
+        ws.onmessage = e => {
+            const d = JSON.parse(e.data);
+            if (d.type === "caption") showCaption(d.label, d.text, d.color);
+            if (d.type === "atmosphere") suggestAmbience(d.query);
+        };
+    """
+
+    def __init__(self, host: str = "localhost", port: int = 8765,
+                 verbose: bool = True):
+        try:
+            import websockets
+            self._websockets = websockets
+        except ImportError:
+            raise ImportError("websockets not installed. 
Run: pip install websockets") + + self.host = host + self.port = port + self.verbose = verbose + self._clients: Set = set() + self._server = None + + async def _handler(self, websocket) -> None: + self._clients.add(websocket) + if self.verbose: + print(f"[WS] Client connected ({len(self._clients)} total)") + try: + await websocket.wait_closed() + finally: + self._clients.discard(websocket) + if self.verbose: + print(f"[WS] Client disconnected ({len(self._clients)} remaining)") + + async def start(self) -> None: + """Start the WebSocket server (non-blocking coroutine).""" + self._server = await self._websockets.serve( + self._handler, self.host, self.port + ) + if self.verbose: + print(f"[WS] Caption server listening on ws://{self.host}:{self.port}") + + async def stop(self) -> None: + if self._server: + self._server.close() + await self._server.wait_closed() + if self.verbose: + print("[WS] Caption server stopped") + + async def broadcast(self, payload: dict) -> None: + """Send a JSON payload to all connected clients.""" + if not self._clients: + return + message = json.dumps(payload) + await asyncio.gather( + *[client.send(message) for client in list(self._clients)], + return_exceptions=True, + ) + + async def send_caption(self, line: CaptionLine) -> None: + await self.broadcast(line.to_dict()) + + async def send_raw(self, payload: dict) -> None: + await self.broadcast(payload) + + @property + def connected_clients(self) -> int: + return len(self._clients) diff --git a/narrative-audio-system/output_generator/output_generator.py b/narrative-audio-system/output_generator/output_generator.py new file mode 100644 index 0000000..d200668 --- /dev/null +++ b/narrative-audio-system/output_generator/output_generator.py @@ -0,0 +1,355 @@ +""" +Step 6 — Output Generation (Tracks A + B combined) +==================================================== +Receives (TranscriptionResult, EmotionResult) pairs from the parallel +Steps 4+5 processor and fans out to both output tracks simultaneously: + + Track A — Accessibility Captions + Format annotated caption line -> stdout + SRT file + WebSocket broadcast + + Track B — Atmospheric Audio Suggestion + Map emotion label -> retrieval query -> crossfade schedule -> + WebSocket broadcast (browser/OBS picks this up and triggers audio engine) + +Both tracks are dispatched concurrently via ThreadPoolExecutor so neither +blocks the other. + +Full pipeline +------------- + [Step 1] Mic capture + [Step 2] VAD + [Step 3] Utterance segmentation + [Step 4+5] Parallel transcription + emotion classification + | + v + [Step 6] OutputGenerator <-- this module + | + +---> Track A: [calm] "The forest was quiet that night." + | -> stdout / SRT / WebSocket ws://localhost:8765 + | + +---> Track B: "calm" -> "calm gentle wind soft water" + -> CrossfadeSchedule (lag=6s, fade=2s) + -> WebSocket ws://localhost:8765 + +WebSocket message types +----------------------- + Caption: { "type":"caption", "label":"calm", "text":"...", "color":"#7ec8a0", ... 
} + Atmosphere: { "type":"atmosphere", "query":"calm gentle wind...", "fade_in_s":2, "lag_s":6 } + +Usage (standalone demo) +----------------------- + python output_generator/output_generator.py --input examples/captured_audio.wav + python output_generator/output_generator.py --input audio.wav --ws # with WebSocket + +Usage (library) +--------------- + from output_generator.output_generator import OutputGenerator + + gen = OutputGenerator(srt_path="session.srt", enable_websocket=False) + for transcript, emotion in parallel_processor.process_all(utterances): + gen.process(transcript, emotion) + gen.close() +""" + +import asyncio +import sys +import threading +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import List, Optional, Tuple + +# --------------------------------------------------------------------------- +# Internal imports +# --------------------------------------------------------------------------- + +from captions import CaptionFormatter, SRTWriter, CaptionBroadcaster, CaptionLine +from atmosphere import AtmosphereMapper, CrossfadeScheduler, CrossfadeSchedule + + +# --------------------------------------------------------------------------- +# OutputGenerator +# --------------------------------------------------------------------------- + +class OutputGenerator: + """ + Fans out (TranscriptionResult, EmotionResult) to Track A and Track B + concurrently. + + Parameters + ---------- + srt_path : str | None + If set, captions are also appended to this SRT file. + enable_websocket : bool + Start a WebSocket server on ws://localhost:. + ws_host : str + ws_port : int + lag_s : float + Atmosphere crossfade lag in seconds. + fade_s : float + Atmosphere crossfade duration in seconds. + cooldown_s : float + Minimum seconds between atmosphere changes. + min_caption_length : int + Captions shorter than this are dropped. 
+ verbose : bool + """ + + def __init__( + self, + srt_path: Optional[str] = None, + enable_websocket: bool = False, + ws_host: str = "localhost", + ws_port: int = 8765, + lag_s: float = 6.0, + fade_s: float = 2.0, + cooldown_s: float = 8.0, + min_caption_length: int = 2, + verbose: bool = True, + ): + self.verbose = verbose + + # Track A + self._formatter = CaptionFormatter( + min_text_length=min_caption_length, verbose=verbose + ) + self._srt_writer = SRTWriter(srt_path, verbose=verbose) if srt_path else None + self._captions: List[CaptionLine] = [] + + # Track B + self._scheduler = CrossfadeScheduler( + lag_s=lag_s, fade_s=fade_s, cooldown_s=cooldown_s, verbose=verbose + ) + self._atmospheres: List[CrossfadeSchedule] = [] + + # WebSocket (optional) + self._broadcaster: Optional[CaptionBroadcaster] = None + self._ws_loop: Optional[asyncio.AbstractEventLoop] = None + self._ws_thread: Optional[threading.Thread] = None + + if enable_websocket: + self._start_websocket_server(ws_host, ws_port) + + # ------------------------------------------------------------------ + # WebSocket server (runs in a background daemon thread) + # ------------------------------------------------------------------ + + def _start_websocket_server(self, host: str, port: int) -> None: + """Launch the WebSocket server in a background thread.""" + self._ws_loop = asyncio.new_event_loop() + self._broadcaster = CaptionBroadcaster(host=host, port=port, + verbose=self.verbose) + + def _run(): + asyncio.set_event_loop(self._ws_loop) + self._ws_loop.run_until_complete(self._broadcaster.start()) + self._ws_loop.run_forever() + + self._ws_thread = threading.Thread(target=_run, daemon=True) + self._ws_thread.start() + + def _ws_broadcast(self, payload: dict) -> None: + """Thread-safe fire-and-forget broadcast to WebSocket clients.""" + if self._broadcaster is None or self._ws_loop is None: + return + asyncio.run_coroutine_threadsafe( + self._broadcaster.broadcast(payload), self._ws_loop + ) + + # ------------------------------------------------------------------ + # Core processing + # ------------------------------------------------------------------ + + def _track_a(self, transcript, emotion) -> Optional[CaptionLine]: + """Track A: format caption, write SRT, broadcast.""" + line = self._formatter.format(transcript, emotion) + if line is None: + return None + self._captions.append(line) + if self._srt_writer: + self._srt_writer.write(line) + self._ws_broadcast(line.to_dict()) + return line + + def _track_b(self, emotion) -> Optional[CrossfadeSchedule]: + """Track B: schedule atmosphere change, broadcast.""" + schedule = self._scheduler.schedule(emotion) + if schedule is None: + return None + self._atmospheres.append(schedule) + self._ws_broadcast(schedule.to_dict()) + return schedule + + def process(self, transcript, emotion) -> Tuple[Optional[CaptionLine], + Optional[CrossfadeSchedule]]: + """ + Process one (TranscriptionResult, EmotionResult) pair through + both output tracks concurrently. + + Returns + ------- + (CaptionLine | None, CrossfadeSchedule | None) + """ + with ThreadPoolExecutor(max_workers=2) as pool: + a_future = pool.submit(self._track_a, transcript, emotion) + b_future = pool.submit(self._track_b, emotion) + caption = a_future.result() + schedule = b_future.result() + return caption, schedule + + def process_all(self, pairs) -> List[Tuple]: + """ + Process a list of (TranscriptionResult, EmotionResult) pairs. 
+ + Returns + ------- + List[(CaptionLine | None, CrossfadeSchedule | None)] + """ + return [self.process(tr, er) for tr, er in pairs] + + # ------------------------------------------------------------------ + # Session summary + # ------------------------------------------------------------------ + + def summary(self) -> str: + lines = [ + f"\n{'='*55}", + f" Session Summary", + f"{'='*55}", + f" Captions generated : {len(self._captions)}", + f" Atmosphere changes : {len(self._atmospheres)}", + ] + if self._srt_writer: + lines.append(f" SRT file : {self._srt_writer.path}") + lines.append(f"\n Full transcript:") + for cap in self._captions: + lines.append(f" {cap.render()}") + if self._atmospheres: + lines.append(f"\n Atmosphere log:") + for atm in self._atmospheres: + lines.append( + f" [{atm.emotion_label}] -> {atm.suggested_description!r}" + f" (lag={atm.lag_s}s, fade={atm.fade_in_s}s)" + ) + lines.append(f"{'='*55}") + return "\n".join(lines) + + def close(self) -> None: + """Print session summary and stop the WebSocket server.""" + print(self.summary()) + if self._ws_loop and self._broadcaster: + asyncio.run_coroutine_threadsafe( + self._broadcaster.stop(), self._ws_loop + ) + + @property + def captions(self) -> List[CaptionLine]: + return list(self._captions) + + @property + def atmosphere_log(self) -> List[CrossfadeSchedule]: + return list(self._atmospheres) + + +# --------------------------------------------------------------------------- +# CLI demo +# --------------------------------------------------------------------------- + +def _parse_args(): + import argparse + parser = argparse.ArgumentParser( + description="Step 6 — Output Generation demo" + ) + parser.add_argument("--input", default=None, metavar="FILE.WAV", + help="WAV file to process (default: live mic)") + parser.add_argument("--duration", type=float, default=10.0) + parser.add_argument("--rate", type=int, default=16000) + parser.add_argument("--srt", default="examples/output.srt", + help="SRT output file (default: examples/output.srt)") + parser.add_argument("--ws", action="store_true", + help="Enable WebSocket server on ws://localhost:8765") + parser.add_argument("--lag", type=float, default=6.0) + parser.add_argument("--fade", type=float, default=2.0) + parser.add_argument("--vad-mode", type=int, default=2) + parser.add_argument("--ws-wait", type=float, default=3.0, + help="Seconds to wait after WS server starts before processing " + "(gives browser time to connect). Default: 3") + parser.add_argument("--linger", type=float, default=15.0, + help="Seconds to keep WS server alive after processing " + "(so atmosphere lag effects play out). 
Default: 15") + return parser.parse_args() + + +def main(): + args = _parse_args() + root = Path(__file__).resolve().parent.parent + for mod in ("vad_engine", "utterance_buffer", "task0_audio_capture", + "transcriber", "emotion_classifier"): + p = str(root / mod) + if p not in sys.path: + sys.path.insert(0, p) + + from vad import detect_speech_segments + from segmenter import segment_utterances + from streaming_transcriber import Transcriber + from classifier import EmotionClassifier, ParallelProcessor + + print( + f"\nStep 6 — Output Generation\n" + f" Track A : captions -> stdout + {args.srt}\n" + f" Track B : atmosphere suggestions (lag={args.lag}s, fade={args.fade}s)\n" + f" WebSocket: {'ws://localhost:8765' if args.ws else 'disabled'}\n" + ) + + # Start WebSocket server FIRST so browser can connect before processing begins + gen = OutputGenerator( + srt_path=args.srt, + enable_websocket=args.ws, + lag_s=args.lag, + fade_s=args.fade, + verbose=True, + ) + + if args.ws: + import time as _time + print(f"[Output] WebSocket ready — open http://localhost:8000/overlay.html") + print(f"[Output] Waiting {args.ws_wait}s for browser to connect...\n") + _time.sleep(args.ws_wait) + + # Load audio + if args.input: + import soundfile as sf + audio, sr = sf.read(args.input, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != args.rate: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=args.rate) + print(f"[Output] Loaded {args.input} ({len(audio)/args.rate:.2f}s)\n") + else: + from audio_capture import record_for_duration + audio = record_for_duration(duration=args.duration, sample_rate=args.rate) + + # Steps 1-5 + vad_segs = detect_speech_segments(audio, sample_rate=args.rate, + aggressiveness=args.vad_mode, verbose=False) + utterances = segment_utterances(vad_segs, verbose=False) + print(f"[Output] {len(utterances)} utterance(s)\n") + + transcriber = Transcriber(model_size="tiny", verbose=False) + classifier = EmotionClassifier(verbose=False) + proc = ParallelProcessor(transcriber, classifier, verbose=False) + pairs = proc.process_all(utterances) + + # Step 6 — output + gen.process_all(pairs) + + if args.ws and args.linger > 0: + import time as _time + print(f"\n[Output] Lingering {args.linger}s so browser atmosphere effects play out...") + _time.sleep(args.linger) + + gen.close() + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/output_generator/overlay.html b/narrative-audio-system/output_generator/overlay.html new file mode 100644 index 0000000..6a86b1f --- /dev/null +++ b/narrative-audio-system/output_generator/overlay.html @@ -0,0 +1,407 @@ + + + + + + Narrative Audio Overlay + + + + + +
+<!-- overlay.html (407 lines): browser/OBS caption + atmosphere overlay.
+     The page's markup, CSS, and JS were lost in extraction; only the page
+     title ("Narrative Audio Overlay") and the "🎶 Atmosphere" banner survive.
+     The page connects to ws://localhost:8765 and renders incoming
+     "caption" and "atmosphere" messages. -->
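Since the overlay source is unrecoverable here, a minimal stand-in subscriber can eyeball the same broadcast. This is a sketch, not part of the repo: it assumes the `websockets` package and the default `ws://localhost:8765` endpoint, and the message fields follow the message-type docstrings above.

+```python
+# Sketch: print caption/atmosphere messages from the Step 6 broadcaster.
+import asyncio
+import json
+
+import websockets  # same dependency CaptionBroadcaster imports
+
+
+async def watch(uri: str = "ws://localhost:8765") -> None:
+    async with websockets.connect(uri) as ws:
+        async for raw in ws:
+            msg = json.loads(raw)
+            if msg.get("type") == "caption":
+                print(f"[{msg['label']}] {msg['text']}")
+            elif msg.get("type") == "atmosphere":
+                print(f"~ {msg['query']} (fade {msg['fade_in_s']}s, lag {msg['lag_s']}s)")
+
+
+if __name__ == "__main__":
+    asyncio.run(watch())
+```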
+ + + + diff --git a/narrative-audio-system/requirements.txt b/narrative-audio-system/requirements.txt new file mode 100644 index 0000000..2a5f5bb --- /dev/null +++ b/narrative-audio-system/requirements.txt @@ -0,0 +1,10 @@ +librosa==0.10.1 +soundfile==0.12.1 +sounddevice>=0.4.6 +webrtcvad>=2.0.10 +faster-whisper>=1.0.0 +torch==2.11.0 +scikit-learn==1.4.1 +openai-whisper==20231117 +sentence-transformers==2.6.1 +numpy==1.26.4 \ No newline at end of file diff --git a/narrative-audio-system/run_pipeline.py b/narrative-audio-system/run_pipeline.py new file mode 100644 index 0000000..86ec870 --- /dev/null +++ b/narrative-audio-system/run_pipeline.py @@ -0,0 +1,358 @@ +import whisper +import librosa +import soundfile as sf +import shutil +import sys +import json +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent + + +def _add_module_path(folder_name): + module_path = str(PROJECT_ROOT / folder_name) + if module_path not in sys.path: + sys.path.insert(0, module_path) + + +_add_module_path("task0_audio_capture") +from audio_capture import AudioCaptureStream, RollingBuffer, record_for_duration + +_add_module_path("vad_engine") +from vad import detect_speech_segments, VADProcessor + +_add_module_path("utterance_buffer") +from segmenter import UtteranceSegmenter, segment_utterances + +_add_module_path("transcriber") +from streaming_transcriber import Transcriber, StreamingTranscriber, transcribe_utterances + +_add_module_path("emotion_classifier") +from classifier import EmotionClassifier, ParallelProcessor + +_add_module_path("task1_audio_pipeline") +from audio_pipeline import build_feature_dataset + +_add_module_path("task3_transcription") +from whisper_transcriber import transcribe_directory, measure_accuracy + +_add_module_path("task4_audio_retrieval") +from retrieval_prototype import build_index as build_retrieval_index, search as retrieval_search, print_results as print_retrieval_results + +_add_module_path("task_bonus_storytelling") +from storytelling_analysis import analyze_storytelling, discuss_storytelling_signals +import torch +import torch.nn as nn +import torch.optim as optim +from sklearn.metrics import accuracy_score, classification_report, f1_score +from sklearn.model_selection import train_test_split +import numpy as np + + +def process_audio(input_path, output_path, sample_rate=16000, n_mfcc=13): + waveform, loaded_sample_rate = librosa.load(input_path, sr=sample_rate) + mfcc_features = librosa.feature.mfcc(y=waveform, sr=loaded_sample_rate, n_mfcc=n_mfcc) + print("Task 1: MFCC feature shape:", mfcc_features.shape) + sf.write(output_path, waveform, loaded_sample_rate) + return mfcc_features + + +def extract_mfcc_vector(audio_path, sample_rate=16000, n_mfcc=13): + waveform, loaded_sample_rate = librosa.load(str(audio_path), sr=sample_rate) + mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=loaded_sample_rate, n_mfcc=n_mfcc) + return mfcc_matrix.mean(axis=1) + + +class SimpleClassifier(nn.Module): + def __init__(self, input_dim, hidden_dim, num_classes): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(hidden_dim, num_classes), + ) + + def forward(self, input_features): + return self.net(input_features) + + +def train_classifier(features, labels, num_epochs=30, test_size=0.2, random_seed=42): + label_array = np.asarray(labels) + unique_classes, encoded_labels = np.unique(label_array, return_inverse=True) + class_names = [str(n).strip().title() for n in unique_classes.tolist()] + 
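+    # np.unique returns the sorted class names and an integer code per sample.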
print("Task 2: Emotions:", ", ".join(class_names)) + + feature_array = np.asarray(features, dtype=np.float32) + + X_train, X_test, y_train, y_test = train_test_split( + feature_array, encoded_labels, test_size=test_size, + random_state=random_seed, stratify=encoded_labels + ) + + feature_mean = X_train.mean(axis=0) + feature_std = X_train.std(axis=0) + 1e-6 + X_train = (X_train - feature_mean) / feature_std + X_test_norm = (X_test - feature_mean) / feature_std + + classifier_model = SimpleClassifier(X_train.shape[1], hidden_dim=64, num_classes=len(unique_classes)) + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(classifier_model.parameters(), lr=1e-3) + + train_tensor = torch.tensor(X_train, dtype=torch.float32) + label_tensor = torch.tensor(y_train, dtype=torch.long) + + classifier_model.train() + for epoch in range(num_epochs): + logits = classifier_model(train_tensor) + loss = criterion(logits, label_tensor) + optimizer.zero_grad() + loss.backward() + optimizer.step() + if epoch in {0, num_epochs - 1} or (epoch + 1) % 10 == 0: + preds = torch.argmax(logits, dim=1).numpy() + acc = accuracy_score(y_train, preds) + f1 = f1_score(y_train, preds, average="weighted", zero_division=0) + print(f"Task 2 - Epoch {epoch + 1:3d}: loss={loss.item():.4f} train_acc={acc:.3f} train_f1={f1:.3f}") + + # Held-out test evaluation + classifier_model.eval() + with torch.no_grad(): + test_preds = torch.argmax(classifier_model(torch.tensor(X_test_norm, dtype=torch.float32)), dim=1).numpy() + test_acc = accuracy_score(y_test, test_preds) + test_f1 = f1_score(y_test, test_preds, average="weighted", zero_division=0) + print(f"Task 2 - Test Accuracy: {test_acc:.3f} Weighted F1: {test_f1:.3f}") + print("Task 2 - Per-class report:") + print(classification_report(y_test, test_preds, target_names=class_names, zero_division=0)) + + return classifier_model, unique_classes, feature_mean, feature_std + + +def predict_emotion(model, class_names, feature_mean, feature_std, mfcc_vector): + standardized_vector = (np.asarray(mfcc_vector, dtype=np.float32) - feature_mean) / feature_std + input_tensor = torch.tensor(standardized_vector, dtype=torch.float32).unsqueeze(0) + with torch.no_grad(): + logits = model(input_tensor) + predicted_index = int(torch.argmax(logits, dim=1).item()) + return str(class_names[predicted_index]).strip().title() + + +def load_labeled_mfcc_features(label_map_path, sample_rate=16000, n_mfcc=13): + with Path(label_map_path).open("r", encoding="utf-8-sig") as fp: + filename_to_emotion = json.load(fp) + + feature_rows = [] + emotion_labels = [] + audio_root = Path(label_map_path).parent + + for filename, emotion in filename_to_emotion.items(): + audio_path = audio_root / filename + if not audio_path.is_file(): + continue + + mfcc_vector = extract_mfcc_vector(audio_path, sample_rate=sample_rate, n_mfcc=n_mfcc) + feature_rows.append(mfcc_vector) + emotion_labels.append(emotion) + + if not feature_rows: + raise ValueError(f"No labeled audio features could be loaded from {label_map_path}") + + return np.asarray(feature_rows, dtype=np.float32), emotion_labels + + +def transcribe_audio(input_path, model_size="tiny"): + if shutil.which("ffmpeg") is None: + fallback_transcript = "Transcription unavailable because ffmpeg is not installed." 
+ print("Task 3: Transcription (fallback):", fallback_transcript) + return fallback_transcript + + whisper_model = whisper.load_model(model_size) + try: + transcription_result = whisper_model.transcribe(input_path) + transcript_text = transcription_result["text"] + print("Task 3: Transcription:", transcript_text) + return transcript_text + except FileNotFoundError: + fallback_transcript = "Transcription unavailable because ffmpeg is not installed." + print("Task 3: Transcription (fallback):", fallback_transcript) + return fallback_transcript + + + +if __name__ == "__main__": + input_audio_path = "examples/sample_audio.wav" + if len(sys.argv) == 2: + arg_path = Path(sys.argv[1]) + if arg_path.parent == Path("."): + input_audio_path = str(Path("examples") / arg_path.name) + else: + input_audio_path = str(arg_path) + + processed_audio_path = "examples/processed_audio.wav" + + # ------------------------------------------------------------------ + # Step 1 — Audio Capture & Streaming + # ------------------------------------------------------------------ + print("Step 1: Audio Capture & Streaming") + capture_duration = 5.0 # seconds to record from the microphone + captured_audio_path = Path("examples/captured_audio.wav") + try: + captured = record_for_duration(duration=capture_duration, verbose=True) + import soundfile as sf + sf.write(str(captured_audio_path), captured, 16000) + print( + f"Step 1: captured {len(captured)} samples " + f"({len(captured)/16000:.2f} s) → {captured_audio_path}" + ) + # Use the live capture as the pipeline's input if no explicit path given + if len(sys.argv) < 2: + input_audio_path = str(captured_audio_path) + except ImportError as exc: + print(f"Step 1: skipping live capture ({exc}). Using pre-recorded file.") + except Exception as exc: + print(f"Step 1: microphone unavailable ({exc}). 
Using pre-recorded file.") + + # ------------------------------------------------------------------ + # Step 2 — Voice Activity Detection + # ------------------------------------------------------------------ + print("\nStep 2: Voice Activity Detection") + try: + import soundfile as _sf_vad + _vad_audio, _vad_sr = _sf_vad.read(input_audio_path, dtype="float32", always_2d=False) + if _vad_audio.ndim > 1: + _vad_audio = _vad_audio.mean(axis=1) + speech_segments = detect_speech_segments( + _vad_audio, + sample_rate=_vad_sr, + frame_ms=20, + aggressiveness=2, + verbose=True, + ) + total_speech = sum(s.duration for s in speech_segments) + total_dur = len(_vad_audio) / _vad_sr + print( + f"Step 2: {len(speech_segments)} segment(s) detected — " + f"{total_speech:.2f}s speech / {total_dur:.2f}s total " + f"({100*total_speech/max(total_dur,1e-6):.1f}%)" + ) + for i, seg in enumerate(speech_segments, 1): + print(f" [{i:2d}] {seg.start:.3f}s -> {seg.end:.3f}s ({seg.duration:.3f}s)") + except Exception as exc: + print(f"Step 2: VAD skipped ({exc})") + speech_segments = [] + + # ------------------------------------------------------------------ + # Step 3 — Buffering & Utterance Segmentation + # ------------------------------------------------------------------ + print("\nStep 3: Buffering & Utterance Segmentation") + try: + utterances = segment_utterances( + speech_segments, + strategy="pause_triggered", + pause_s=0.4, + max_utterance_s=8.0, + sample_rate=16000, + verbose=True, + ) + print(f"Step 3: {len(utterances)} utterance(s) ready for transcriber") + for i, u in enumerate(utterances, 1): + print( + f" [{i:2d}] {u.start:.3f}s -> {u.end:.3f}s " + f"span={u.duration:.3f}s vad_segs={u.num_vad_segments}" + ) + except Exception as exc: + print(f"Step 3: segmentation skipped ({exc})") + utterances = [] + + # ------------------------------------------------------------------ + # Steps 4 + 5 — Transcription & Emotion Classification (parallel) + # ------------------------------------------------------------------ + print("\nSteps 4+5: Streaming Transcription + Emotion Classification (parallel)") + transcription_results = [] + emotion_results = [] + try: + if utterances: + transcriber = Transcriber(model_size="tiny", verbose=False) + classifier = EmotionClassifier(verbose=False) + processor = ParallelProcessor(transcriber, classifier, verbose=True) + paired = processor.process_all(utterances) + transcription_results = [t for t, _ in paired] + emotion_results = [e for _, e in paired] + full_text = " ".join(r.text for r in transcription_results if r.text) + print(f"\nSteps 4+5: Full transcript : {full_text!r}") + for i, (tr, er) in enumerate(paired, 1): + avg_lat = (tr.latency_ms + er.latency_ms) / 2 + print(f" [{i:2d}] \"{tr.text}\" | {er.label} ({er.confidence:.2f}) " + f"[transcribe={tr.latency_ms:.0f}ms classify={er.latency_ms:.1f}ms]") + else: + print("Steps 4+5: no utterances, skipping.") + except Exception as exc: + print(f"Steps 4+5: skipped ({exc})") + + task1_output_csv = Path("examples/task1_features_dataset.csv") + task1_normalized_dir = Path("examples/normalized_audio") + print("Task 1: running full audio feature extraction pipeline...") + build_feature_dataset( + input_dir="examples", + output_csv=str(task1_output_csv), + normalized_dir=str(task1_normalized_dir), + ) + + process_audio(input_audio_path, processed_audio_path) + + task3_transcript_file = Path("examples/transcripts.txt") + print("\nTask 3: transcribing recordings (first 10 files) ...") + all_transcripts = transcribe_directory( + 
input_dir="examples", + output_txt=str(task3_transcript_file), + model_size="tiny", + max_files=10, + ) + measure_accuracy(all_transcripts, max_samples=10) + + input_filename = Path(input_audio_path).name + transcript_text = all_transcripts.get(input_filename) or transcribe_audio(input_audio_path) + + label_map_file = Path("examples") / "emotion_labels.json" + predicted_input_emotion = "Unknown" + if label_map_file.is_file(): + training_features, training_labels = load_labeled_mfcc_features(label_map_file) + print( + f"Task 2: loaded {len(training_labels)} labeled samples across " + f"{len(set(training_labels))} emotions from {label_map_file}." + ) + trained_model, class_names, feature_mean, feature_std = train_classifier(training_features, training_labels) + input_mfcc_vector = extract_mfcc_vector(input_audio_path) + predicted_input_emotion = predict_emotion( + trained_model, + class_names, + feature_mean, + feature_std, + input_mfcc_vector, + ) + print(f"Task 2: Predicted emotion for input audio: {predicted_input_emotion}") + else: + print(f"Task 2: label map not found at {label_map_file}, skipping classifier training.") + + print("\nTask 4: building retrieval index from audio features ...") + retrieval_records = build_retrieval_index( + features_csv=str(task1_output_csv), + emotion_labels_json=str(label_map_file), + ) + print(f"Task 4: index contains {len(retrieval_records)} recordings.") + for query in [ + "calm narration longer than 4 seconds", + "high-energy speech", + "dramatic dialogue", + ]: + results = retrieval_search(query, retrieval_records, top_k=3) + print_retrieval_results(query, results) + + print("\nBonus: storytelling audio analysis on selected recordings ...") + bonus_rows = analyze_storytelling( + input_dir="examples", + output_csv="examples/storytelling_analysis.csv", + max_files=8, + model_size="tiny", + ) + discuss_storytelling_signals(bonus_rows) diff --git a/narrative-audio-system/task0_audio_capture/audio_capture.py b/narrative-audio-system/task0_audio_capture/audio_capture.py new file mode 100644 index 0000000..a6dbe39 --- /dev/null +++ b/narrative-audio-system/task0_audio_capture/audio_capture.py @@ -0,0 +1,381 @@ +""" +Step 1 — Audio Capture & Streaming +==================================== +Continuously reads audio from the microphone in small chunks (frames) using +a stream callback that fires every ~30–128 ms with a new buffer of raw PCM +samples. Accumulated chunks are stored in a thread-safe rolling buffer so +that downstream processing steps (feature extraction, classification, etc.) +can consume audio without blocking capture. 
+ +Usage (standalone demo): + python audio_capture.py --duration 5 --chunk 1024 --rate 16000 + +Usage (as a library): + from audio_capture import AudioCaptureStream, RollingBuffer + + buf = RollingBuffer(max_seconds=10, sample_rate=16000) + with AudioCaptureStream(sample_rate=16000, chunk_size=1024, buffer=buf) as stream: + time.sleep(5) # capture for 5 seconds + audio = buf.get_audio() # numpy array of all captured samples +""" + +import argparse +import queue +import threading +import time +from collections import deque +from typing import Optional + +import numpy as np + +try: + import sounddevice as sd + _SOUNDDEVICE_AVAILABLE = True +except ImportError: + _SOUNDDEVICE_AVAILABLE = False + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +DEFAULT_SAMPLE_RATE = 16000 # Hz — standard for speech processing +DEFAULT_CHUNK_SIZE = 1024 # samples (~64 ms at 16 kHz) +DEFAULT_CHANNELS = 1 # mono +DTYPE = np.float32 # PCM format expected by librosa / whisper + + +# --------------------------------------------------------------------------- +# RollingBuffer — accumulates raw PCM frames from the callback thread +# --------------------------------------------------------------------------- + +class RollingBuffer: + """ + Thread-safe circular buffer that stores raw PCM samples. + + Parameters + ---------- + max_seconds : float + Maximum audio duration to hold in memory. Older frames are dropped + once the buffer is full (rolling window semantics). + sample_rate : int + Audio sample rate in Hz (must match the capture stream). + """ + + def __init__(self, max_seconds: float = 30.0, sample_rate: int = DEFAULT_SAMPLE_RATE): + max_frames = int(max_seconds * sample_rate) + self._buffer: deque = deque(maxlen=max_frames) + self._lock = threading.Lock() + self.sample_rate = sample_rate + + # ------------------------------------------------------------------ + # Internal API — called from the capture callback (audio thread) + # ------------------------------------------------------------------ + + def push(self, chunk: np.ndarray) -> None: + """Append a new PCM chunk (1-D float32 array) to the buffer.""" + flat = chunk.flatten().astype(DTYPE) + with self._lock: + self._buffer.extend(flat.tolist()) + + # ------------------------------------------------------------------ + # Public API — called from the consumer thread + # ------------------------------------------------------------------ + + def get_audio(self) -> np.ndarray: + """Return a copy of all buffered samples as a 1-D float32 array.""" + with self._lock: + return np.array(self._buffer, dtype=DTYPE) + + def clear(self) -> None: + """Discard all buffered audio.""" + with self._lock: + self._buffer.clear() + + @property + def duration_seconds(self) -> float: + """Current buffered audio duration in seconds.""" + with self._lock: + return len(self._buffer) / self.sample_rate + + @property + def num_samples(self) -> int: + """Current number of buffered samples.""" + with self._lock: + return len(self._buffer) + + +# --------------------------------------------------------------------------- +# AudioCaptureStream — wraps sounddevice InputStream with callback +# --------------------------------------------------------------------------- + +class AudioCaptureStream: + """ + Continuously captures microphone audio in small fixed-size chunks. 
+ + Each chunk is delivered to `buffer.push()` from the audio callback thread + as soon as it arrives, providing low-latency streaming behaviour. + + Parameters + ---------- + sample_rate : int + Capture sample rate in Hz. + chunk_size : int + Number of samples per callback invocation. + 512 → ~32 ms (lower latency, higher CPU) + 1024 → ~64 ms (balanced default) + 2048 → ~128 ms (higher latency, lower CPU) + channels : int + Number of input channels (1 = mono recommended for speech). + buffer : RollingBuffer | None + Where captured chunks are stored. A new RollingBuffer is created + automatically when None is passed. + device : int | str | None + sounddevice device index or name. None = system default. + on_chunk : callable | None + Optional hook called with each raw chunk array (audio thread context). + """ + + def __init__( + self, + sample_rate: int = DEFAULT_SAMPLE_RATE, + chunk_size: int = DEFAULT_CHUNK_SIZE, + channels: int = DEFAULT_CHANNELS, + buffer: Optional[RollingBuffer] = None, + device=None, + on_chunk=None, + ): + if not _SOUNDDEVICE_AVAILABLE: + raise ImportError( + "sounddevice is not installed. Run: pip install sounddevice" + ) + + self.sample_rate = sample_rate + self.chunk_size = chunk_size + self.channels = channels + self.device = device + self.on_chunk = on_chunk + + self.buffer: RollingBuffer = buffer or RollingBuffer( + max_seconds=30.0, sample_rate=sample_rate + ) + + self._stream: Optional[sd.InputStream] = None + self._chunk_count: int = 0 + self._error_queue: queue.Queue = queue.Queue() + + # ------------------------------------------------------------------ + # Callback — runs in the PortAudio audio thread (must be non-blocking) + # ------------------------------------------------------------------ + + def _callback(self, indata: np.ndarray, frames: int, time_info, status) -> None: + """sounddevice callback: fires every `chunk_size` samples.""" + if status: + # Put status flags into the error queue; don't block here. + self._error_queue.put_nowait(str(status)) + + self.buffer.push(indata.copy()) + self._chunk_count += 1 + + if self.on_chunk is not None: + self.on_chunk(indata.copy()) + + # ------------------------------------------------------------------ + # Lifecycle helpers + # ------------------------------------------------------------------ + + def start(self) -> "AudioCaptureStream": + """Open the microphone stream and start capturing.""" + self._chunk_count = 0 + self._stream = sd.InputStream( + samplerate=self.sample_rate, + blocksize=self.chunk_size, + channels=self.channels, + dtype=DTYPE, + device=self.device, + callback=self._callback, + ) + self._stream.start() + chunk_ms = round(self.chunk_size / self.sample_rate * 1000) + print( + f"[AudioCapture] Stream opened — " + f"rate={self.sample_rate} Hz, " + f"chunk={self.chunk_size} samples ({chunk_ms} ms), " + f"channels={self.channels}" + ) + return self + + def stop(self) -> None: + """Stop and close the microphone stream.""" + if self._stream is not None: + self._stream.stop() + self._stream.close() + self._stream = None + print( + f"[AudioCapture] Stream closed — " + f"{self._chunk_count} chunks captured, " + f"{self.buffer.duration_seconds:.2f} s buffered." 
+ ) + + @property + def is_active(self) -> bool: + return self._stream is not None and self._stream.active + + # ------------------------------------------------------------------ + # Context-manager support + # ------------------------------------------------------------------ + + def __enter__(self) -> "AudioCaptureStream": + return self.start() + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.stop() + + # ------------------------------------------------------------------ + # Convenience — drain any reported PortAudio status warnings + # ------------------------------------------------------------------ + + def drain_errors(self) -> list: + errors = [] + while not self._error_queue.empty(): + errors.append(self._error_queue.get_nowait()) + return errors + + +# --------------------------------------------------------------------------- +# Public helper: record_for_duration +# --------------------------------------------------------------------------- + +def record_for_duration( + duration: float, + sample_rate: int = DEFAULT_SAMPLE_RATE, + chunk_size: int = DEFAULT_CHUNK_SIZE, + channels: int = DEFAULT_CHANNELS, + device=None, + verbose: bool = True, +) -> np.ndarray: + """ + Block until `duration` seconds of audio have been captured, then return + the full recording as a 1-D float32 numpy array. + + Parameters + ---------- + duration : float + Recording length in seconds. + sample_rate : int + Microphone sample rate in Hz. + chunk_size : int + Callback chunk size in samples. + channels : int + Number of input channels. + device : int | str | None + sounddevice device index or name. + verbose : bool + Print progress dots while recording. + + Returns + ------- + np.ndarray + 1-D float32 array of raw PCM samples, shape (duration * sample_rate,). 
+ """ + buf = RollingBuffer(max_seconds=duration + 2.0, sample_rate=sample_rate) + with AudioCaptureStream( + sample_rate=sample_rate, + chunk_size=chunk_size, + channels=channels, + buffer=buf, + device=device, + ) as stream: + target_samples = int(duration * sample_rate) + if verbose: + print(f"[AudioCapture] Recording for {duration:.1f} s ...", end="", flush=True) + while buf.num_samples < target_samples: + errs = stream.drain_errors() + for e in errs: + print(f"\n[AudioCapture] Warning: {e}") + if verbose: + print(".", end="", flush=True) + time.sleep(0.05) + if verbose: + print(" done.") + return buf.get_audio() + + +# --------------------------------------------------------------------------- +# CLI demo +# --------------------------------------------------------------------------- + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Step 1 — Audio Capture & Streaming demo" + ) + parser.add_argument( + "--duration", type=float, default=5.0, + help="Recording duration in seconds (default: 5)" + ) + parser.add_argument( + "--chunk", type=int, default=DEFAULT_CHUNK_SIZE, + help="Chunk size in samples (default: 1024 → 64 ms at 16 kHz)" + ) + parser.add_argument( + "--rate", type=int, default=DEFAULT_SAMPLE_RATE, + help="Sample rate in Hz (default: 16000)" + ) + parser.add_argument( + "--channels", type=int, default=DEFAULT_CHANNELS, + help="Number of input channels (default: 1)" + ) + parser.add_argument( + "--device", default=None, + help="sounddevice device index or name (default: system default)" + ) + parser.add_argument( + "--save", default=None, metavar="OUT.WAV", + help="Save captured audio to a WAV file" + ) + parser.add_argument( + "--list-devices", action="store_true", + help="Print available audio devices and exit" + ) + return parser.parse_args() + + +def main(): + args = _parse_args() + + if not _SOUNDDEVICE_AVAILABLE: + print("ERROR: sounddevice is not installed. Run: pip install sounddevice") + return + + if args.list_devices: + print(sd.query_devices()) + return + + chunk_ms = round(args.chunk / args.rate * 1000) + print( + f"\nStep 1 — Audio Capture & Streaming\n" + f" Sample rate : {args.rate} Hz\n" + f" Chunk size : {args.chunk} samples ({chunk_ms} ms per frame)\n" + f" Channels : {args.channels}\n" + f" Duration : {args.duration} s\n" + ) + + audio = record_for_duration( + duration=args.duration, + sample_rate=args.rate, + chunk_size=args.chunk, + channels=args.channels, + device=args.device, + ) + + print(f"\n[AudioCapture] Captured {len(audio)} samples ({len(audio)/args.rate:.3f} s)") + print(f"[AudioCapture] Shape: {audio.shape} dtype: {audio.dtype}") + print(f"[AudioCapture] Amplitude range: [{audio.min():.4f}, {audio.max():.4f}]") + print(f"[AudioCapture] RMS level: {float(np.sqrt(np.mean(audio**2))):.6f}") + + if args.save: + import soundfile as sf + sf.write(args.save, audio, args.rate) + print(f"[AudioCapture] Saved to {args.save}") + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/task1_audio_pipeline/TASK1_DELIVERABLES.md b/narrative-audio-system/task1_audio_pipeline/TASK1_DELIVERABLES.md new file mode 100644 index 0000000..c3c133f --- /dev/null +++ b/narrative-audio-system/task1_audio_pipeline/TASK1_DELIVERABLES.md @@ -0,0 +1,54 @@ +# Task 1 Deliverables: Audio Processing Pipeline + +This folder contains the implementation and outputs for Task 1. 
+ +## 1) Python code for audio processing + +Implementation file: +- `task1_audio_pipeline/audio_pipeline.py` + +The script performs: +- Audio loading from a directory of `.wav` files +- Peak normalization (`librosa.util.normalize`) +- Optional fixed-window segmentation (`--segment-seconds`) +- Feature extraction for each segment +- Export of a structured CSV dataset suitable for ML + +## 2) Description of extracted features + +Per segment, the output dataset includes: +- `filename`: source audio file name +- `segment_index`: index of segment within the source file +- `segment_start_seconds`: segment start timestamp +- `segment_end_seconds`: segment end timestamp +- `pitch_mean_hz`: mean pitch from `librosa.yin` (Hz) +- `spectral_centroid_mean_hz`: mean spectral centroid (Hz) +- `energy_rms_mean`: mean RMS energy +- `duration_seconds`: segment duration in seconds +- `mfcc_1 ... mfcc_13`: mean MFCC coefficients + +## 3) Example output dataset + +Example dataset file: +- `examples/task1_features_dataset_sample.csv` + +To generate a full dataset from all available `.wav` files: + +```bash +python task1_audio_pipeline/audio_pipeline.py \ + --input-dir ../examples \ + --output-csv ../examples/task1_features_dataset.csv \ + --normalized-dir ../examples/normalized_audio \ + --segment-seconds 2.0 +``` + +For a small quick-run sample: + +```bash +python task1_audio_pipeline/audio_pipeline.py \ + --input-dir ../examples \ + --output-csv ../examples/task1_features_dataset_sample.csv \ + --normalized-dir ../examples/normalized_audio_sample \ + --segment-seconds 2.0 \ + --max-files 5 +``` diff --git a/narrative-audio-system/task1_audio_pipeline/audio_pipeline.py b/narrative-audio-system/task1_audio_pipeline/audio_pipeline.py new file mode 100644 index 0000000..ec8419b --- /dev/null +++ b/narrative-audio-system/task1_audio_pipeline/audio_pipeline.py @@ -0,0 +1,160 @@ +import argparse +import csv +from pathlib import Path + +import librosa +import numpy as np +import soundfile as sf + + +def load_and_normalize(audio_path, sample_rate=16000): + """Load audio and normalize peak amplitude into [-1, 1].""" + waveform, loaded_sample_rate = librosa.load(str(audio_path), sr=sample_rate) + normalized_waveform = librosa.util.normalize(waveform) + return normalized_waveform, loaded_sample_rate + + +def segment_audio(waveform, sample_rate, segment_seconds=2.0): + """Split waveform into fixed-size segments; return one segment if disabled.""" + if segment_seconds is None or segment_seconds <= 0: + return [(0.0, len(waveform) / float(sample_rate), waveform)] + + segment_size = int(segment_seconds * sample_rate) + if segment_size <= 0 or len(waveform) <= segment_size: + return [(0.0, len(waveform) / float(sample_rate), waveform)] + + segments = [] + for start in range(0, len(waveform), segment_size): + end = min(start + segment_size, len(waveform)) + chunk = waveform[start:end] + if len(chunk) == 0: + continue + start_sec = start / float(sample_rate) + end_sec = end / float(sample_rate) + segments.append((start_sec, end_sec, chunk)) + return segments + + +def extract_features(segment_waveform, sample_rate=16000, n_mfcc=13): + """Extract MFCC + pitch + spectral centroid + energy + duration features.""" + mfcc_matrix = librosa.feature.mfcc(y=segment_waveform, sr=sample_rate, n_mfcc=n_mfcc) + mfcc_mean = mfcc_matrix.mean(axis=1) + + f0 = librosa.yin(segment_waveform, fmin=50, fmax=400, sr=sample_rate) + voiced_f0 = f0[np.isfinite(f0)] + pitch_mean = float(np.mean(voiced_f0)) if voiced_f0.size > 0 else 0.0 + + 
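+    # Spectral centroid ≈ perceived brightness; RMS ≈ loudness. Both are
+    # averaged over frames to give one value per segment.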
spectral_centroid = librosa.feature.spectral_centroid(y=segment_waveform, sr=sample_rate) + spectral_centroid_mean = float(np.mean(spectral_centroid)) + + rms_energy = librosa.feature.rms(y=segment_waveform) + energy_mean = float(np.mean(rms_energy)) + + duration_seconds = len(segment_waveform) / float(sample_rate) + + feature_row = { + "pitch_mean_hz": pitch_mean, + "spectral_centroid_mean_hz": spectral_centroid_mean, + "energy_rms_mean": energy_mean, + "duration_seconds": float(duration_seconds), + } + for index, value in enumerate(mfcc_mean, start=1): + feature_row[f"mfcc_{index}"] = float(value) + + return feature_row + + +def build_feature_dataset( + input_dir, + output_csv, + normalized_dir=None, + sample_rate=16000, + n_mfcc=13, + segment_seconds=2.0, + max_files=None, +): + """Process a folder of audio files and write a structured ML-ready CSV.""" + input_dir = Path(input_dir) + output_csv = Path(output_csv) + normalized_dir = Path(normalized_dir) if normalized_dir else None + + if normalized_dir: + normalized_dir.mkdir(parents=True, exist_ok=True) + output_csv.parent.mkdir(parents=True, exist_ok=True) + + audio_files = sorted(input_dir.glob("*.wav")) + if max_files is not None: + audio_files = audio_files[: max(0, int(max_files))] + + dataset_rows = [] + for audio_file in audio_files: + normalized_waveform, loaded_sample_rate = load_and_normalize(audio_file, sample_rate=sample_rate) + + if normalized_dir: + normalized_path = normalized_dir / audio_file.name + sf.write(str(normalized_path), normalized_waveform, loaded_sample_rate) + + segments = segment_audio( + waveform=normalized_waveform, + sample_rate=loaded_sample_rate, + segment_seconds=segment_seconds, + ) + for segment_index, (start_sec, end_sec, segment_waveform) in enumerate(segments): + row = { + "filename": audio_file.name, + "segment_index": segment_index, + "segment_start_seconds": float(start_sec), + "segment_end_seconds": float(end_sec), + } + row.update(extract_features(segment_waveform, sample_rate=loaded_sample_rate, n_mfcc=n_mfcc)) + dataset_rows.append(row) + + if not dataset_rows: + raise ValueError(f"No .wav files found in {input_dir}") + + fieldnames = list(dataset_rows[0].keys()) + with output_csv.open("w", newline="", encoding="utf-8") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(dataset_rows) + + print(f"Wrote {len(dataset_rows)} rows to {output_csv}") + return dataset_rows + + +def parse_args(): + parser = argparse.ArgumentParser(description="Task 1: audio processing pipeline") + parser.add_argument("--input-dir", default="../examples", help="Directory containing .wav files") + parser.add_argument( + "--output-csv", + default="../examples/task1_features_dataset.csv", + help="Path to save extracted feature dataset CSV", + ) + parser.add_argument( + "--normalized-dir", + default="../examples/normalized_audio", + help="Directory to save normalized audio files", + ) + parser.add_argument("--sample-rate", type=int, default=16000) + parser.add_argument("--n-mfcc", type=int, default=13) + parser.add_argument("--segment-seconds", type=float, default=2.0) + parser.add_argument("--max-files", type=int, default=None, help="Optional cap for quick demo runs") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + script_dir = Path(__file__).resolve().parent + input_dir = (script_dir / args.input_dir).resolve() + output_csv = (script_dir / args.output_csv).resolve() + normalized_dir = (script_dir / 
args.normalized_dir).resolve() if args.normalized_dir else None + + build_feature_dataset( + input_dir=input_dir, + output_csv=output_csv, + normalized_dir=normalized_dir, + sample_rate=args.sample_rate, + n_mfcc=args.n_mfcc, + segment_seconds=args.segment_seconds, + max_files=args.max_files, + ) diff --git a/narrative-audio-system/task1_audio_pipeline/requirements.txt b/narrative-audio-system/task1_audio_pipeline/requirements.txt new file mode 100644 index 0000000..c979641 --- /dev/null +++ b/narrative-audio-system/task1_audio_pipeline/requirements.txt @@ -0,0 +1,3 @@ +librosa==0.10.1 +numpy==1.26.4 +soundfile==0.12.1 \ No newline at end of file diff --git a/narrative-audio-system/task2_tone_classification/requirements.txt b/narrative-audio-system/task2_tone_classification/requirements.txt new file mode 100644 index 0000000..706cee8 --- /dev/null +++ b/narrative-audio-system/task2_tone_classification/requirements.txt @@ -0,0 +1,4 @@ +torch==2.11.0 +scikit-learn==1.4.1 +numpy==1.26.4 +librosa==0.10.1 \ No newline at end of file diff --git a/narrative-audio-system/task2_tone_classification/train_classifier.py b/narrative-audio-system/task2_tone_classification/train_classifier.py new file mode 100644 index 0000000..77f7c91 --- /dev/null +++ b/narrative-audio-system/task2_tone_classification/train_classifier.py @@ -0,0 +1,166 @@ +import argparse +import json +from pathlib import Path + +import librosa +import numpy as np +import torch +from torch import nn +from torch.utils.data import DataLoader, TensorDataset +from sklearn.metrics import accuracy_score, classification_report, f1_score +from sklearn.model_selection import train_test_split + + +class ToneClassifier(nn.Module): + def __init__(self, input_dim, hidden_dim, num_classes): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(hidden_dim, num_classes), + ) + + def forward(self, input_features): + return self.net(input_features) + + +def extract_mfcc_vector(audio_path, sample_rate=16000, n_mfcc=13): + waveform, loaded_sr = librosa.load(str(audio_path), sr=sample_rate) + mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=loaded_sr, n_mfcc=n_mfcc) + return mfcc_matrix.mean(axis=1).astype(np.float32) + + +def load_dataset(label_map_path, sample_rate=16000, n_mfcc=13): + """Load MFCC features and integer-encoded labels from an emotion JSON map.""" + with Path(label_map_path).open("r", encoding="utf-8-sig") as fp: + filename_to_emotion = json.load(fp) + + audio_root = Path(label_map_path).parent + features, raw_labels = [], [] + for filename, emotion in filename_to_emotion.items(): + audio_path = audio_root / filename + if not audio_path.is_file(): + continue + features.append(extract_mfcc_vector(audio_path, sample_rate, n_mfcc)) + raw_labels.append(emotion.strip().title()) + + if not features: + raise ValueError(f"No audio files found relative to {label_map_path}") + + class_names, encoded = np.unique(raw_labels, return_inverse=True) + return np.array(features, dtype=np.float32), encoded, class_names.tolist() + + +def train_model(train_loader, model, criterion, optimizer, num_epochs=30): + model.train() + for epoch in range(num_epochs): + epoch_loss, all_preds, all_true = 0.0, [], [] + for batch_x, batch_y in train_loader: + logits = model(batch_x) + loss = criterion(logits, batch_y) + optimizer.zero_grad() + loss.backward() + optimizer.step() + epoch_loss += loss.item() + all_preds.extend(torch.argmax(logits, dim=1).tolist()) + all_true.extend(batch_y.tolist()) + 
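+        # Log on the first epoch, every tenth epoch, and the final epoch.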
+ if epoch in {0, num_epochs - 1} or (epoch + 1) % 10 == 0: + acc = accuracy_score(all_true, all_preds) + f1 = f1_score(all_true, all_preds, average="weighted", zero_division=0) + print(f" Epoch {epoch + 1:3d}: loss={epoch_loss:.4f} train_acc={acc:.3f} train_f1={f1:.3f}") + + +def evaluate_model(model, X_test, y_test, class_names): + model.eval() + with torch.no_grad(): + logits = model(torch.tensor(X_test)) + preds = torch.argmax(logits, dim=1).numpy() + + acc = accuracy_score(y_test, preds) + f1 = f1_score(y_test, preds, average="weighted", zero_division=0) + report = classification_report(y_test, preds, target_names=class_names, zero_division=0) + return acc, f1, report, preds + + +def discuss_results(acc, f1, class_names, y_test, preds): + print("\n--- Discussion of Results ---") + print( + f"The model achieved {acc:.1%} test accuracy and a weighted F1 of {f1:.3f} " + f"on {len(y_test)} held-out samples across {len(class_names)} emotion classes." + ) + if acc >= 0.5: + print("Performance is above random chance (12.5% for 8 classes), showing the MFCC") + print("features carry signal. A deeper model or data augmentation could improve further.") + else: + print("Performance is modest, likely due to the small dataset size (~127 samples).") + print("Pretrained audio embeddings (e.g. wav2vec2) would be a stronger baseline.") + confused_pairs = [ + (class_names[true_idx], class_names[pred_idx]) + for true_idx, pred_idx in zip(y_test, preds) + if true_idx != pred_idx + ] + if confused_pairs: + from collections import Counter + top = Counter(confused_pairs).most_common(3) + print("Most common confusions:", ", ".join(f"{a}→{b}" for (a, b), _ in top)) + print("----------------------------\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Task 2: Narrative Tone Classification") + parser.add_argument( + "--label-map", + default="../examples/emotion_labels.json", + help="Path to emotion_labels.json", + ) + parser.add_argument("--sample-rate", type=int, default=16000) + parser.add_argument("--n-mfcc", type=int, default=13) + parser.add_argument("--hidden-dim", type=int, default=64) + parser.add_argument("--epochs", type=int, default=30) + parser.add_argument("--test-size", type=float, default=0.2) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + torch.manual_seed(args.seed) + np.random.seed(args.seed) + + label_map_path = (Path(__file__).resolve().parent / args.label_map).resolve() + print(f"\nLoading dataset from {label_map_path} ...") + features, labels, class_names = load_dataset( + label_map_path, sample_rate=args.sample_rate, n_mfcc=args.n_mfcc + ) + print(f"Loaded {len(labels)} samples, {len(class_names)} classes: {', '.join(class_names)}") + + # Train / test split + X_train, X_test, y_train, y_test = train_test_split( + features, labels, test_size=args.test_size, random_state=args.seed, stratify=labels + ) + + # Z-score normalisation (fit on train only) + mean = X_train.mean(axis=0) + std = X_train.std(axis=0) + 1e-6 + X_train = (X_train - mean) / std + X_test = (X_test - mean) / std + + train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train, dtype=torch.long)) + train_loader = DataLoader(train_ds, batch_size=16, shuffle=True) + + model = ToneClassifier( + input_dim=features.shape[1], hidden_dim=args.hidden_dim, num_classes=len(class_names) + ) + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + + print(f"\n--- Training ({args.epochs} epochs, {len(X_train)} train 
/ {len(X_test)} test) ---") + train_model(train_loader, model, criterion, optimizer, num_epochs=args.epochs) + + print("\n--- Test Evaluation ---") + acc, f1, report, preds = evaluate_model(model, X_test, y_test, class_names) + print(f"Test Accuracy : {acc:.3f}") + print(f"Weighted F1 : {f1:.3f}") + print("\nPer-class Report:") + print(report) + + discuss_results(acc, f1, class_names, y_test, preds) diff --git a/narrative-audio-system/task3_transcription/requirements.txt b/narrative-audio-system/task3_transcription/requirements.txt new file mode 100644 index 0000000..d4648be --- /dev/null +++ b/narrative-audio-system/task3_transcription/requirements.txt @@ -0,0 +1,3 @@ +openai-whisper==20231117 +torch==2.11.0 +numpy==1.26.4 \ No newline at end of file diff --git a/narrative-audio-system/task3_transcription/whisper_transcriber.py b/narrative-audio-system/task3_transcription/whisper_transcriber.py new file mode 100644 index 0000000..06d0fd5 --- /dev/null +++ b/narrative-audio-system/task3_transcription/whisper_transcriber.py @@ -0,0 +1,177 @@ +"""Task 3: batch Whisper transcription with simple WER evaluation.""" + +import argparse +import shutil +from pathlib import Path + +import whisper + + +RAVDESS_STATEMENTS = { + "01": "Kids are talking by the door", + "02": "Dogs are sitting by the door", +} + + +def _ravdess_reference(filename: str) -> str | None: + """Return ground-truth text for a RAVDESS filename, or None if unknown.""" + parts = Path(filename).stem.split("-") + if len(parts) >= 5: + return RAVDESS_STATEMENTS.get(parts[4]) + return None + + +def _edit_distance(ref_tokens: list[str], hyp_tokens: list[str]) -> int: + """Standard dynamic-programming edit distance.""" + n, m = len(ref_tokens), len(hyp_tokens) + dp = list(range(m + 1)) + for i in range(1, n + 1): + prev, dp[0] = dp[0], i + for j in range(1, m + 1): + prev, dp[j] = dp[j], ( + prev if ref_tokens[i - 1] == hyp_tokens[j - 1] + else 1 + min(prev, dp[j], dp[j - 1]) + ) + return dp[m] + + +def word_error_rate(reference: str, hypothesis: str) -> float: + ref_tokens = reference.lower().split() + hyp_tokens = hypothesis.lower().split() + if not ref_tokens: + return 0.0 + return _edit_distance(ref_tokens, hyp_tokens) / len(ref_tokens) + + +def load_model(model_size: str = "tiny"): + if shutil.which("ffmpeg") is None: + print("WARNING: ffmpeg not found — Whisper may fail to decode audio.") + return whisper.load_model(model_size) + + +def transcribe_file(model, audio_path: str) -> str: + result = model.transcribe(str(audio_path)) + return result["text"].strip() + + +def transcribe_directory( + input_dir: str, + output_txt: str, + model_size: str = "tiny", + max_files: int | None = None, +) -> dict[str, str]: + """Transcribe `.wav` files and write `filenametranscript` lines.""" + input_path = Path(input_dir) + audio_files = sorted(input_path.glob("*.wav")) + if max_files: + audio_files = audio_files[:max_files] + + if not audio_files: + raise ValueError(f"No .wav files found in {input_dir}") + + print(f"\nLoading Whisper '{model_size}' model ...") + model = load_model(model_size) + + transcripts: dict[str, str] = {} + output_path = Path(output_txt) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with output_path.open("w", encoding="utf-8") as out_fp: + for audio_file in audio_files: + text = transcribe_file(model, audio_file) + transcripts[audio_file.name] = text + out_fp.write(f"{audio_file.name}\t{text}\n") + print(f" {audio_file.name}: {text}") + + print(f"\nTranscripts saved to {output_path} ({len(transcripts)} 
files)") + return transcripts + + +def measure_accuracy( + transcripts: dict[str, str], + max_samples: int = 20, +) -> None: + """Compute WER on up to `max_samples` files with known references.""" + wer_scores: list[float] = [] + evaluated: list[tuple[str, str, str, float]] = [] + + for filename, hypothesis in list(transcripts.items())[:max_samples]: + reference = _ravdess_reference(filename) + if reference is None: + continue + wer = word_error_rate(reference, hypothesis) + wer_scores.append(wer) + evaluated.append((filename, reference, hypothesis, wer)) + + if not wer_scores: + print("\nNo ground-truth references available for accuracy measurement.") + return + + avg_wer = sum(wer_scores) / len(wer_scores) + + print(f"\n--- Transcription Accuracy ({len(wer_scores)} samples) ---") + for filename, ref, hyp, wer in evaluated: + print(f" File : {filename}") + print(f" Ref : {ref}") + print(f" Hyp : {hyp}") + print(f" WER : {wer:.2%}") + print() + print(f" Average WER : {avg_wer:.2%}") + + _discuss_quality(avg_wer, len(wer_scores)) + + +def _discuss_quality(avg_wer: float, n_samples: int) -> None: + print("\n--- Discussion of Transcription Quality ---") + print( + f"Whisper (tiny) achieved an average WER of {avg_wer:.1%} on {n_samples} RAVDESS samples." + ) + if avg_wer <= 0.05: + print("This is near-perfect transcription — the sentences are short, clear, and in English.") + elif avg_wer <= 0.20: + print("Good accuracy. Minor errors (dropped/swapped words) occur under emotional prosody.") + else: + print("Moderate WER, likely caused by strong emotional expressiveness distorting phonemes.") + print( + "Using the 'base' or 'small' Whisper model instead of 'tiny' would further reduce WER." + ) + print( + "For production use, speaker diarisation and language-model rescoring would help further." 
+ ) + print("------------------------------------------\n") + + + if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Task 3: Whisper batch transcription") + parser.add_argument("--input-dir", default="../examples", help="Directory of .wav files") + parser.add_argument( + "--output-txt", + default="../examples/transcripts.txt", + help="Output file for transcripts (TSV: filename\ttranscript)", + ) + parser.add_argument( + "--model-size", + default="tiny", + choices=["tiny", "base", "small", "medium", "large"], + ) + parser.add_argument("--max-files", type=int, default=None, help="Limit files for quick runs") + parser.add_argument( + "--accuracy-samples", + type=int, + default=20, + help="Number of files to use for WER evaluation", + ) + args = parser.parse_args() + + script_dir = Path(__file__).resolve().parent + input_dir = (script_dir / args.input_dir).resolve() + output_txt = (script_dir / args.output_txt).resolve() + + transcripts = transcribe_directory( + input_dir=str(input_dir), + output_txt=str(output_txt), + model_size=args.model_size, + max_files=args.max_files, + ) + + measure_accuracy(transcripts, max_samples=args.accuracy_samples) \ No newline at end of file diff --git a/narrative-audio-system/task4_audio_retrieval/requirements.txt b/narrative-audio-system/task4_audio_retrieval/requirements.txt new file mode 100644 index 0000000..3c93730 --- /dev/null +++ b/narrative-audio-system/task4_audio_retrieval/requirements.txt @@ -0,0 +1,4 @@ +sentence-transformers==2.6.1 +torch==2.11.0 +numpy==1.26.4 +scikit-learn==1.4.1 \ No newline at end of file diff --git a/narrative-audio-system/task4_audio_retrieval/retrieval_prototype.py b/narrative-audio-system/task4_audio_retrieval/retrieval_prototype.py new file mode 100644 index 0000000..3824992 --- /dev/null +++ b/narrative-audio-system/task4_audio_retrieval/retrieval_prototype.py @@ -0,0 +1,187 @@ +"""Task 4: filter + semantic ranking retrieval for narrative audio queries.""" + +import argparse +import csv +import json +import re +from pathlib import Path + +import numpy as np + +try: + from sentence_transformers import SentenceTransformer, util as st_util + _ST_AVAILABLE = True +except ImportError: + _ST_AVAILABLE = False + +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.feature_extraction.text import TfidfVectorizer + +EMOTION_TO_NARRATIVE = { + "Calm": "calm narration", + "Happy": "upbeat cheerful dialogue", + "Sad": "sorrowful emotional narration", + "Angry": "urgent high-energy dramatic speech", + "Fearful": "tense suspenseful narration", + "Disgust": "intense dramatic emphasis", + "Surprised": "excited emphatic dialogue", + "Neutral": "neutral flat narration", +} + +ENERGY_THRESHOLDS = {"low": 0.02, "high": 0.06} +PITCH_THRESHOLDS = {"low": 120.0, "high": 200.0} + + +def build_index(features_csv, emotion_labels_json=None): + features_csv_path = Path(features_csv) + if not features_csv_path.is_file(): + raise FileNotFoundError( + f"Task 1 feature CSV not found at {features_csv_path}. " + "Run Task 1 first to generate it."
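+ # Expected Task 1 CSV columns (consumed below): filename, duration_seconds, + # energy_rms_mean, pitch_mean_hz; one row per analysis window, aggregated per file.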
+ ) + + file_rows = {} + with features_csv_path.open("r", encoding="utf-8") as fp: + for row in csv.DictReader(fp): + file_rows.setdefault(row["filename"], []).append(row) + + emotion_map = {} + if emotion_labels_json and Path(emotion_labels_json).is_file(): + with Path(emotion_labels_json).open("r", encoding="utf-8-sig") as fp: + raw = json.load(fp) + emotion_map = {k: v.strip().title() for k, v in raw.items()} + + records = [] + for filename, rows in file_rows.items(): + duration = sum(float(r["duration_seconds"]) for r in rows) + energy = float(np.mean([float(r["energy_rms_mean"]) for r in rows])) + pitch = float(np.mean([float(r["pitch_mean_hz"]) for r in rows])) + + energy_label = ( + "low" if energy < ENERGY_THRESHOLDS["low"] else + "high" if energy >= ENERGY_THRESHOLDS["high"] else + "medium" + ) + pitch_label = ( + "low" if pitch < PITCH_THRESHOLDS["low"] else + "high" if pitch >= PITCH_THRESHOLDS["high"] else + "medium" + ) + + emotion = emotion_map.get(filename, "Unknown") + narrative = EMOTION_TO_NARRATIVE.get(emotion, "speech") + description = ( + f"{narrative}, {duration:.1f}s duration, " + f"{energy_label}-energy, pitch {pitch:.0f}Hz" + ) + + records.append({ + "filename": filename, + "duration": duration, + "energy": energy, + "pitch": pitch, + "energy_label": energy_label, + "pitch_label": pitch_label, + "emotion": emotion, + "narrative": narrative, + "description": description, + }) + + return records + + + def _apply_filters(records, query): + q = query.lower() + filtered = list(records) + + m = re.search(r"longer\s+than\s+([\d.]+)\s*s", q) + if m: + filtered = [r for r in filtered if r["duration"] > float(m.group(1))] + + m = re.search(r"shorter\s+than\s+([\d.]+)\s*s", q) + if m: + filtered = [r for r in filtered if r["duration"] < float(m.group(1))] + + if re.search(r"\bhigh[- ]energy\b|\benerget\w+\b|\bloud\b", q): + filtered = [r for r in filtered if r["energy_label"] == "high"] + elif re.search(r"\bquiet\b|(?:^|\W)low[- ]energy\b|\bsoft\b|\bsubdued\b", q): + filtered = [r for r in filtered if r["energy_label"] == "low"] + + for emotion_key in EMOTION_TO_NARRATIVE: + if emotion_key.lower() in q: + filtered = [r for r in filtered if r["emotion"] == emotion_key] + break + + if re.search(r"\bhigh\s+pitch\b|\bhigh-pitched\b", q): + filtered = [r for r in filtered if r["pitch_label"] == "high"] + elif re.search(r"(?:^|\W)low\s+pitch\b|\blow-pitched\b|\bdeep\b", q): + filtered = [r for r in filtered if r["pitch_label"] == "low"] + + return filtered + + + def _rank_semantically(query, candidates, top_k): + if not candidates: + return [] + + descriptions = [r["description"] for r in candidates] + + if _ST_AVAILABLE: + model = SentenceTransformer("all-MiniLM-L6-v2") + corpus_emb = model.encode(descriptions, convert_to_tensor=True) + query_emb = model.encode(query, convert_to_tensor=True) + scores = st_util.cos_sim(query_emb, corpus_emb)[0].tolist() + else: + vec = TfidfVectorizer() + matrix = vec.fit_transform(descriptions + [query]) + scores = cosine_similarity(matrix[-1], matrix[:-1])[0].tolist() + + ranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True) + return [{"score": round(s, 3), **r} for s, r in ranked[:top_k]] + + + def search(query, records, top_k=5): + """Filter by structured constraints, then rank survivors semantically.""" + candidates = _apply_filters(records, query) + if not candidates: + candidates = records + return _rank_semantically(query, candidates, top_k=top_k) + + + def print_results(query, results): + print(f'\nQuery : "{query}"') + if
not results: + print(" No matching recordings found.") + return + for i, r in enumerate(results, 1): + score_str = f" (score={r['score']:.3f})" if "score" in r else "" + print(f" {i}. {r['filename']}{score_str}\n {r['description']}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Task 4: Narrative Audio Retrieval") + parser.add_argument("--features-csv", default="../examples/task1_features_dataset.csv") + parser.add_argument("--emotion-labels", default="../examples/emotion_labels.json") + parser.add_argument("--query", default=None, help="Custom query (optional)") + parser.add_argument("--top-k", type=int, default=5) + args = parser.parse_args() + + script_dir = Path(__file__).resolve().parent + features_csv = (script_dir / args.features_csv).resolve() + emotion_json = (script_dir / args.emotion_labels).resolve() + + print("Building retrieval index from Task 1 features ...") + records = build_index(str(features_csv), str(emotion_json)) + print(f"Index contains {len(records)} recordings.\n") + + example_queries = [ + "calm narration longer than 4 seconds", + "high-energy speech", + "dramatic dialogue", + "sad quiet voice", + "angry short clip shorter than 3s", + ] + + for q in ([args.query] if args.query else example_queries): + results = search(q, records, top_k=args.top_k) + print_results(q, results) diff --git a/narrative-audio-system/task_bonus_storytelling/storytelling_analysis.py b/narrative-audio-system/task_bonus_storytelling/storytelling_analysis.py new file mode 100644 index 0000000..7c55a34 --- /dev/null +++ b/narrative-audio-system/task_bonus_storytelling/storytelling_analysis.py @@ -0,0 +1,191 @@ +import argparse +import csv +import re +import sys +from pathlib import Path + +import librosa +import numpy as np + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from task3_transcription.whisper_transcriber import load_model, transcribe_file + + +def _min_max_normalize(values): + values = np.asarray(values, dtype=float) + if values.size == 0: + return values + low = float(np.min(values)) + high = float(np.max(values)) + if high - low < 1e-9: + return np.zeros_like(values) + return (values - low) / (high - low) + + +def add_storytelling_scores(rows): + """Add a 0-100 storytelling score using weighted expressive features.""" + if not rows: + return rows + + pitch_var = _min_max_normalize([float(r["pitch_std_hz"]) for r in rows]) + energy_dyn = _min_max_normalize([float(r["energy_dynamic_range"]) for r in rows]) + pause_dyn = _min_max_normalize([float(r["pause_ratio"]) for r in rows]) + sentence_len = _min_max_normalize([float(r["avg_sentence_words"]) for r in rows]) + + for idx, row in enumerate(rows): + score_0_1 = ( + 0.35 * pitch_var[idx] + + 0.30 * energy_dyn[idx] + + 0.20 * pause_dyn[idx] + + 0.15 * sentence_len[idx] + ) + row["storytelling_score"] = round(float(score_0_1 * 100.0), 2) + + return rows + + +def extract_storytelling_features(audio_path, sample_rate=16000): + """Compute pacing/pauses, pitch variation, and energy dynamics for one file.""" + waveform, sr = librosa.load(str(audio_path), sr=sample_rate) + + duration_seconds = len(waveform) / float(sr) + tempo_bpm = float(librosa.feature.tempo(y=waveform, sr=sr)[0]) + + rms = librosa.feature.rms(y=waveform, frame_length=1024, hop_length=256)[0] + silence_threshold = max(0.01, float(np.percentile(rms, 20))) + silence_mask = rms < silence_threshold + silence_ratio = float(np.mean(silence_mask)) if 
len(silence_mask) else 0.0 + pause_events = int(np.sum((~silence_mask[:-1]) & (silence_mask[1:]))) if len(silence_mask) > 1 else 0 + + f0 = librosa.yin(waveform, fmin=50, fmax=400, sr=sr) + voiced_f0 = f0[np.isfinite(f0)] + pitch_mean_hz = float(np.mean(voiced_f0)) if voiced_f0.size else 0.0 + pitch_std_hz = float(np.std(voiced_f0)) if voiced_f0.size else 0.0 + + energy_mean = float(np.mean(rms)) if len(rms) else 0.0 + energy_std = float(np.std(rms)) if len(rms) else 0.0 + energy_dynamic_range = float(np.percentile(rms, 90) - np.percentile(rms, 10)) if len(rms) else 0.0 + + return { + "duration_seconds": float(duration_seconds), + "tempo_bpm": tempo_bpm, + "pause_ratio": silence_ratio, + "pause_events": pause_events, + "pitch_mean_hz": pitch_mean_hz, + "pitch_std_hz": pitch_std_hz, + "energy_mean": energy_mean, + "energy_std": energy_std, + "energy_dynamic_range": energy_dynamic_range, + } + + +def sentence_length_features(transcript_text): + """Approximate sentence-length metrics from transcript punctuation.""" + normalized = transcript_text.strip() + if not normalized: + return { + "word_count": 0, + "sentence_count": 0, + "avg_sentence_words": 0.0, + "max_sentence_words": 0, + } + + words = re.findall(r"\b[\w']+\b", normalized) + sentence_chunks = [s.strip() for s in re.split(r"[.!?]+", normalized) if s.strip()] + sentence_lengths = [len(re.findall(r"\b[\w']+\b", s)) for s in sentence_chunks] + + return { + "word_count": len(words), + "sentence_count": len(sentence_chunks), + "avg_sentence_words": float(np.mean(sentence_lengths)) if sentence_lengths else 0.0, + "max_sentence_words": int(max(sentence_lengths)) if sentence_lengths else 0, + } + + +def analyze_storytelling(input_dir, output_csv, max_files=12, model_size="tiny"): + """Analyze a subset of recordings and save storytelling-oriented features to CSV.""" + input_dir = Path(input_dir) + output_csv = Path(output_csv) + output_csv.parent.mkdir(parents=True, exist_ok=True) + + audio_files = sorted(input_dir.glob("*.wav"))[:max_files] + if not audio_files: + raise ValueError(f"No .wav files found in {input_dir}") + + print(f"Bonus task: loading Whisper '{model_size}' for transcript-based sentence features...") + whisper_model = load_model(model_size) + + rows = [] + for audio_file in audio_files: + transcript = transcribe_file(whisper_model, audio_file) + audio_feats = extract_storytelling_features(audio_file) + text_feats = sentence_length_features(transcript) + + row = { + "filename": audio_file.name, + "transcript": transcript, + **audio_feats, + **text_feats, + } + rows.append(row) + print(f"Bonus task: analyzed {audio_file.name}") + + rows = add_storytelling_scores(rows) + + fieldnames = list(rows[0].keys()) + with output_csv.open("w", newline="", encoding="utf-8") as fp: + writer = csv.DictWriter(fp, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + print(f"Bonus task: wrote {len(rows)} rows to {output_csv}") + return rows + + +def discuss_storytelling_signals(rows): + """Print a short discussion of which features can signal storytelling narration.""" + pitch_std = np.array([float(r["pitch_std_hz"]) for r in rows], dtype=float) + pause_ratio = np.array([float(r["pause_ratio"]) for r in rows], dtype=float) + energy_range = np.array([float(r["energy_dynamic_range"]) for r in rows], dtype=float) + sentence_len = np.array([float(r["avg_sentence_words"]) for r in rows], dtype=float) + + print("\nBonus discussion: storytelling vs conversational speech") + print("- Pacing and pauses: higher pause ratio or more 
pause events can indicate narrative phrasing and dramatic timing.") + print("- Pitch variation: larger pitch_std_hz usually suggests expressive storytelling rather than flat conversational delivery.") + print("- Energy dynamics: larger energy_dynamic_range often reflects emphasis and emotional arcs in stories.") + print("- Sentence length: longer average sentence length can indicate narration; shorter fragments can indicate dialogue exchanges.") + print("\nObserved on this subset:") + print(f"- Mean pitch variation (std Hz): {np.mean(pitch_std):.2f}") + print(f"- Mean pause ratio: {np.mean(pause_ratio):.3f}") + print(f"- Mean energy dynamic range: {np.mean(energy_range):.4f}") + print(f"- Mean average sentence length: {np.mean(sentence_len):.2f} words") + + ranked = sorted(rows, key=lambda r: float(r.get("storytelling_score", 0.0)), reverse=True) + print("\nTop storytelling-like clips (heuristic score):") + for row in ranked[:3]: + print(f"- {row['filename']}: score={float(row['storytelling_score']):.2f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Bonus: Storytelling audio analysis") + parser.add_argument("--input-dir", default="../examples", help="Directory containing .wav files") + parser.add_argument( + "--output-csv", + default="../examples/storytelling_analysis.csv", + help="Path to output storytelling analysis CSV", + ) + parser.add_argument("--max-files", type=int, default=12) + parser.add_argument("--model-size", default="tiny", choices=["tiny", "base", "small", "medium", "large"]) + args = parser.parse_args() + + script_dir = Path(__file__).resolve().parent + rows = analyze_storytelling( + input_dir=(script_dir / args.input_dir).resolve(), + output_csv=(script_dir / args.output_csv).resolve(), + max_files=args.max_files, + model_size=args.model_size, + ) + discuss_storytelling_signals(rows) diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/angry_8s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/angry_8s.wav new file mode 100644 index 0000000..e10c66c Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/angry_8s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/calm_6s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/calm_6s.wav new file mode 100644 index 0000000..159e4f9 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/calm_6s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/disgust_7s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/disgust_7s.wav new file mode 100644 index 0000000..9b93f9f Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/disgust_7s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/fearful_7s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/fearful_7s.wav new file mode 100644 index 0000000..d384560 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/fearful_7s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/happy_7s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/happy_7s.wav new file mode 100644 index 0000000..a8bcc13 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/happy_7s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/manifest.json b/narrative-audio-system/tests/fixtures/emotion_audio/manifest.json new file mode 100644 index 0000000..b0b81ef --- /dev/null +++ 
b/narrative-audio-system/tests/fixtures/emotion_audio/manifest.json @@ -0,0 +1,72 @@ +{ + "description": "Synthetic emotion audio fixtures for Step 5 classifier tests. Each file is designed with acoustic properties matching the MFCC fingerprint of its target emotion class.", + "acoustic_design": { + "calm": "f0=120Hz, soft energy, dark spectrum, slow rhythm", + "happy": "f0=200Hz, bright harmonics, rising inflections", + "angry": "f0=175Hz, high energy, hard-clipped, fast bursts", + "sad": "f0=90Hz, falling contour, sparse energy, slow", + "fearful": "f0=175Hz, breathy, trembling amplitude, irregular", + "neutral": "f0=150Hz, flat, constant energy, balanced spectrum", + "surprised": "f0=260Hz onset falling to 160Hz, short burst", + "disgust": "f0=105Hz, low-freq growl, staccato, dark tilt" + }, + "note": "Classifier predictions on synthetic audio may not always match the target label \u2014 the model was trained on RAVDESS human speech. Tests assert structural correctness and soft acoustic constraints, not exact label matching.", + "files": [ + { + "filename": "calm_6s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\calm_6s.wav", + "label": "calm", + "duration_s": 6.25, + "sample_rate": 16000 + }, + { + "filename": "happy_7s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\happy_7s.wav", + "label": "happy", + "duration_s": 5.61, + "sample_rate": 16000 + }, + { + "filename": "angry_8s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\angry_8s.wav", + "label": "angry", + "duration_s": 5.08, + "sample_rate": 16000 + }, + { + "filename": "sad_6s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\sad_6s.wav", + "label": "sad", + "duration_s": 6.7, + "sample_rate": 16000 + }, + { + "filename": "fearful_7s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\fearful_7s.wav", + "label": "fearful", + "duration_s": 5.604, + "sample_rate": 16000 + }, + { + "filename": "neutral_6s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\neutral_6s.wav", + "label": "neutral", + "duration_s": 5.75, + "sample_rate": 16000 + }, + { + "filename": "surprised_5s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\surprised_5s.wav", + "label": "surprised", + "duration_s": 4.75, + "sample_rate": 16000 + }, + { + "filename": "disgust_7s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\emotion_audio\\disgust_7s.wav", + "label": "disgust", + "duration_s": 6.233, + "sample_rate": 16000 + } + ] +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/neutral_6s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/neutral_6s.wav new file mode 100644 index 0000000..516cd90 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/neutral_6s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/sad_6s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/sad_6s.wav new file mode 100644 index 
0000000..6bf76f5 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/sad_6s.wav differ diff --git a/narrative-audio-system/tests/fixtures/emotion_audio/surprised_5s.wav b/narrative-audio-system/tests/fixtures/emotion_audio/surprised_5s.wav new file mode 100644 index 0000000..c7632b0 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/emotion_audio/surprised_5s.wav differ diff --git a/narrative-audio-system/tests/fixtures/manifest.csv b/narrative-audio-system/tests/fixtures/manifest.csv new file mode 100644 index 0000000..1a04a24 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/manifest.csv @@ -0,0 +1,9 @@ +# VAD test fixtures manifest +# name, expected_segments, duration_s, description +silence_only.wav,0,3.000,3 s pure silence expect 0 segments +speech_only.wav,1,3.000,3 s continuous speech expect 1 segment +short_burst.wav,0,1.600,0.1 s speech burst (below min threshold) expect 0 segments +speech_gap_speech.wav,2,4.000,1 s speech, 1 s silence, 1 s speech expect 2 segments +multi_segment.wav,4,7.100,4 speech islands in ~7 s expect 4 segments +noisy_speech.wav,1,3.000,Speech + background noise expect >=1 segment +quiet_speech.wav,1,2.600,Low-amplitude speech (amplitude=0.08) expect >=1 segment diff --git a/narrative-audio-system/tests/fixtures/multi_segment.wav b/narrative-audio-system/tests/fixtures/multi_segment.wav new file mode 100644 index 0000000..33ca3e9 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/multi_segment.wav differ diff --git a/narrative-audio-system/tests/fixtures/noisy_speech.wav b/narrative-audio-system/tests/fixtures/noisy_speech.wav new file mode 100644 index 0000000..69c94a0 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/noisy_speech.wav differ diff --git a/narrative-audio-system/tests/fixtures/output/atmosphere_schedules.json b/narrative-audio-system/tests/fixtures/output/atmosphere_schedules.json new file mode 100644 index 0000000..79856aa --- /dev/null +++ b/narrative-audio-system/tests/fixtures/output/atmosphere_schedules.json @@ -0,0 +1,78 @@ +[ + { + "pair_index": 0, + "label": "calm", + "suppressed": false, + "schedule": { + "type": "atmosphere", + "emotion_label": "calm", + "query": "calm gentle wind soft water ambient", + "suggested_clip": "gentle wind, soft water", + "suggested_description": "gentle wind, soft water", + "fade_in_s": 2.0, + "lag_s": 6.0 + } + }, + { + "pair_index": 1, + "label": "tense", + "suppressed": false, + "schedule": { + "type": "atmosphere", + "emotion_label": "tense", + "query": "tense forest night ambience suspense", + "suggested_clip": "tense forest night, branch snap", + "suggested_description": "tense forest night, branch snap", + "fade_in_s": 2.0, + "lag_s": 6.0 + } + }, + { + "pair_index": 2, + "label": "fearful", + "suppressed": false, + "schedule": { + "type": "atmosphere", + "emotion_label": "fearful", + "query": "tense dark forest night ambient", + "suggested_clip": "dark forest, distant owl, creak", + "suggested_description": "dark forest, distant owl, creak", + "fade_in_s": 2.0, + "lag_s": 6.0 + } + }, + { + "pair_index": 3, + "label": "happy", + "suppressed": false, + "schedule": { + "type": "atmosphere", + "emotion_label": "happy", + "query": "upbeat bright cheerful outdoor birds", + "suggested_clip": "birdsong, light breeze", + "suggested_description": "birdsong, light breeze", + "fade_in_s": 2.0, + "lag_s": 6.0 + } + }, + { + "pair_index": 4, + "label": "happy", + "suppressed": true, + "schedule": null + }, + { + 
"pair_index": 5, + "label": "neutral", + "suppressed": false, + "schedule": { + "type": "atmosphere", + "emotion_label": "neutral", + "query": "neutral quiet indoor room tone", + "suggested_clip": "quiet room tone, light hum", + "suggested_description": "quiet room tone, light hum", + "fade_in_s": 2.0, + "lag_s": 6.0 + } + } +] \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/output/caption_lines.json b/narrative-audio-system/tests/fixtures/output/caption_lines.json new file mode 100644 index 0000000..ecb1f13 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/output/caption_lines.json @@ -0,0 +1,52 @@ +[ + { + "type": "caption", + "index": 1, + "label": "calm", + "text": "The forest was quiet that night.", + "start": 0.0, + "end": 2.5, + "confidence": 0.88, + "color": "#7ec8a0" + }, + { + "type": "caption", + "index": 2, + "label": "tense", + "text": "Until the branch snapped.", + "start": 3.1, + "end": 4.8, + "confidence": 0.79, + "color": "#e07a5f" + }, + { + "type": "caption", + "index": 3, + "label": "fearful", + "text": "She ran without looking back.", + "start": 5.2, + "end": 7.1, + "confidence": 0.82, + "color": "#c07ecf" + }, + { + "type": "caption", + "index": 4, + "label": "happy", + "text": "The morning light brought relief.", + "start": 15.0, + "end": 17.2, + "confidence": 0.76, + "color": "#f7c948" + }, + { + "type": "caption", + "index": 5, + "label": "happy", + "text": "She smiled at the sunrise.", + "start": 17.5, + "end": 19.0, + "confidence": 0.81, + "color": "#f7c948" + } +] \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/output/mock_pairs.json b/narrative-audio-system/tests/fixtures/output/mock_pairs.json new file mode 100644 index 0000000..807b433 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/output/mock_pairs.json @@ -0,0 +1,146 @@ +[ + { + "transcript": { + "text": "The forest was quiet that night.", + "start": 0.0, + "end": 2.5, + "latency_ms": 180.0, + "backend": "faster-whisper", + "confidence": 0.95, + "language": "en" + }, + "emotion": { + "label": "calm", + "confidence": 0.88, + "start": 0.0, + "end": 2.5, + "latency_ms": 12.0, + "backend": "mfcc-mlp", + "all_scores": { + "calm": 0.88, + "neutral": 0.08, + "happy": 0.04 + } + } + }, + { + "transcript": { + "text": "Until the branch snapped.", + "start": 3.1, + "end": 4.8, + "latency_ms": 190.0, + "backend": "faster-whisper", + "confidence": 0.91, + "language": "en" + }, + "emotion": { + "label": "tense", + "confidence": 0.79, + "start": 3.1, + "end": 4.8, + "latency_ms": 11.5, + "backend": "mfcc-mlp", + "all_scores": { + "tense": 0.79, + "fearful": 0.15, + "neutral": 0.06 + } + } + }, + { + "transcript": { + "text": "She ran without looking back.", + "start": 5.2, + "end": 7.1, + "latency_ms": 185.0, + "backend": "faster-whisper", + "confidence": 0.93, + "language": "en" + }, + "emotion": { + "label": "fearful", + "confidence": 0.82, + "start": 5.2, + "end": 7.1, + "latency_ms": 13.0, + "backend": "mfcc-mlp", + "all_scores": { + "fearful": 0.82, + "tense": 0.12, + "angry": 0.06 + } + } + }, + { + "transcript": { + "text": "The morning light brought relief.", + "start": 15.0, + "end": 17.2, + "latency_ms": 175.0, + "backend": "faster-whisper", + "confidence": 0.97, + "language": "en" + }, + "emotion": { + "label": "happy", + "confidence": 0.76, + "start": 15.0, + "end": 17.2, + "latency_ms": 10.5, + "backend": "mfcc-mlp", + "all_scores": { + "happy": 0.76, + "calm": 0.18, + "neutral": 0.06 + } + } + }, + { + "transcript": { + "text": 
"She smiled at the sunrise.", + "start": 17.5, + "end": 19.0, + "latency_ms": 170.0, + "backend": "faster-whisper", + "confidence": 0.94, + "language": "en" + }, + "emotion": { + "label": "happy", + "confidence": 0.81, + "start": 17.5, + "end": 19.0, + "latency_ms": 11.0, + "backend": "mfcc-mlp", + "all_scores": { + "happy": 0.81, + "calm": 0.14, + "neutral": 0.05 + } + } + }, + { + "transcript": { + "text": ".", + "start": 20.0, + "end": 20.3, + "latency_ms": 155.0, + "backend": "faster-whisper", + "confidence": 0.4, + "language": "en" + }, + "emotion": { + "label": "neutral", + "confidence": 0.6, + "start": 20.0, + "end": 20.3, + "latency_ms": 9.0, + "backend": "mfcc-mlp", + "all_scores": { + "neutral": 0.6, + "calm": 0.3, + "happy": 0.1 + } + } + } +] \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/output/srt_expected.srt b/narrative-audio-system/tests/fixtures/output/srt_expected.srt new file mode 100644 index 0000000..7eea7ad --- /dev/null +++ b/narrative-audio-system/tests/fixtures/output/srt_expected.srt @@ -0,0 +1,19 @@ +1 +00:00:00,000 --> 00:00:02,500 +[calm] The forest was quiet that night. + +2 +00:00:03,100 --> 00:00:04,800 +[tense] Until the branch snapped. + +3 +00:00:05,200 --> 00:00:07,100 +[fearful] She ran without looking back. + +4 +00:00:15,000 --> 00:00:17,200 +[happy] The morning light brought relief. + +5 +00:00:17,500 --> 00:00:19,000 +[happy] She smiled at the sunrise. diff --git a/narrative-audio-system/tests/fixtures/quiet_speech.wav b/narrative-audio-system/tests/fixtures/quiet_speech.wav new file mode 100644 index 0000000..bcd87b7 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/quiet_speech.wav differ diff --git a/narrative-audio-system/tests/fixtures/segmenter/empty.json b/narrative-audio-system/tests/fixtures/segmenter/empty.json new file mode 100644 index 0000000..1d2e633 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/empty.json @@ -0,0 +1,11 @@ +{ + "segments": [], + "meta": { + "expected_utterances": 0, + "strategies": [ + "pause_triggered", + "fixed_window" + ], + "note": "no segments -> no utterances" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/fixed_window_exact.json b/narrative-audio-system/tests/fixtures/segmenter/fixed_window_exact.json new file mode 100644 index 0000000..ed2bc30 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/fixed_window_exact.json @@ -0,0 +1,27 @@ +{ + "segments": [ + { + "start": 0.0, + "end": 1.0, + "duration_s": 1.0 + }, + { + "start": 1.5, + "end": 2.5, + "duration_s": 1.0 + }, + { + "start": 3.0, + "end": 4.0, + "duration_s": 1.0 + } + ], + "meta": { + "expected_utterances": 1, + "window_s": 3.0, + "strategies": [ + "fixed_window" + ], + "note": "3\u00d71s = 3s speech == window_s -> 1 utt on 3rd feed" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/fixed_window_overflow.json b/narrative-audio-system/tests/fixtures/segmenter/fixed_window_overflow.json new file mode 100644 index 0000000..e7b7eef --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/fixed_window_overflow.json @@ -0,0 +1,32 @@ +{ + "segments": [ + { + "start": 0.0, + "end": 1.0, + "duration_s": 1.0 + }, + { + "start": 1.5, + "end": 2.5, + "duration_s": 1.0 + }, + { + "start": 3.0, + "end": 4.0, + "duration_s": 1.0 + }, + { + "start": 4.5, + "end": 5.5, + "duration_s": 1.0 + } + ], + "meta": { + "expected_utterances": 2, + "window_s": 3.0, + "strategies": [ + 
"fixed_window" + ], + "note": "4\u00d71s=4s, window=3s -> emit at 3s, flush 1s -> 2 utts" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/four_mixed_gaps.json b/narrative-audio-system/tests/fixtures/segmenter/four_mixed_gaps.json new file mode 100644 index 0000000..fdc2b42 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/four_mixed_gaps.json @@ -0,0 +1,32 @@ +{ + "segments": [ + { + "start": 0.5, + "end": 1.3, + "duration_s": 0.8 + }, + { + "start": 1.5, + "end": 2.3, + "duration_s": 0.8 + }, + { + "start": 2.9, + "end": 3.7, + "duration_s": 0.8 + }, + { + "start": 3.85, + "end": 4.65, + "duration_s": 0.8 + } + ], + "meta": { + "expected_utterances": 2, + "pause_s": 0.4, + "strategies": [ + "pause_triggered" + ], + "note": "short/long/short gaps -> 2 utterances" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/safety_valve.json b/narrative-audio-system/tests/fixtures/segmenter/safety_valve.json new file mode 100644 index 0000000..41b73bd --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/safety_valve.json @@ -0,0 +1,23 @@ +{ + "segments": [ + { + "start": 0.0, + "end": 4.5, + "duration_s": 4.5 + }, + { + "start": 5.0, + "end": 9.5, + "duration_s": 4.5 + } + ], + "meta": { + "expected_utterances": 2, + "pause_s": 0.4, + "max_utterance_s": 8.0, + "strategies": [ + "pause_triggered" + ], + "note": "gap triggers split before max_utterance_s; 2 utts" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/single_segment.json b/narrative-audio-system/tests/fixtures/segmenter/single_segment.json new file mode 100644 index 0000000..f4f7c26 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/single_segment.json @@ -0,0 +1,17 @@ +{ + "segments": [ + { + "start": 1.0, + "end": 3.0, + "duration_s": 2.0 + } + ], + "meta": { + "expected_utterances": 1, + "strategies": [ + "pause_triggered", + "fixed_window" + ], + "note": "1 seg -> 1 utterance always" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/single_tiny.json b/narrative-audio-system/tests/fixtures/segmenter/single_tiny.json new file mode 100644 index 0000000..059c41c --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/single_tiny.json @@ -0,0 +1,17 @@ +{ + "segments": [ + { + "start": 0.5, + "end": 0.6, + "duration_s": 0.1 + } + ], + "meta": { + "expected_utterances": 1, + "strategies": [ + "pause_triggered", + "fixed_window" + ], + "note": "0.1s segment \u2014 segmenter emits; filtering is VAD's job" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/two_long_gap.json b/narrative-audio-system/tests/fixtures/segmenter/two_long_gap.json new file mode 100644 index 0000000..880d2a7 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/segmenter/two_long_gap.json @@ -0,0 +1,23 @@ +{ + "segments": [ + { + "start": 0.5, + "end": 1.5, + "duration_s": 1.0 + }, + { + "start": 2.3, + "end": 3.3, + "duration_s": 1.0 + } + ], + "meta": { + "expected_utterances": 2, + "gap_s": 0.8, + "pause_s": 0.4, + "strategies": [ + "pause_triggered" + ], + "note": "gap 0.8s > pause_s 0.4s -> split" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/segmenter/two_short_gap.json b/narrative-audio-system/tests/fixtures/segmenter/two_short_gap.json new file mode 100644 index 0000000..107e5f3 --- /dev/null +++ 
b/narrative-audio-system/tests/fixtures/segmenter/two_short_gap.json @@ -0,0 +1,23 @@ +{ + "segments": [ + { + "start": 0.5, + "end": 1.5, + "duration_s": 1.0 + }, + { + "start": 1.7, + "end": 2.7, + "duration_s": 1.0 + } + ], + "meta": { + "expected_utterances": 1, + "gap_s": 0.2, + "pause_s": 0.4, + "strategies": [ + "pause_triggered" + ], + "note": "gap 0.2s < pause_s 0.4s -> merged" + } +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/short_burst.wav b/narrative-audio-system/tests/fixtures/short_burst.wav new file mode 100644 index 0000000..881a1f2 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/short_burst.wav differ diff --git a/narrative-audio-system/tests/fixtures/silence_only.wav b/narrative-audio-system/tests/fixtures/silence_only.wav new file mode 100644 index 0000000..311b0cf Binary files /dev/null and b/narrative-audio-system/tests/fixtures/silence_only.wav differ diff --git a/narrative-audio-system/tests/fixtures/speech_gap_speech.wav b/narrative-audio-system/tests/fixtures/speech_gap_speech.wav new file mode 100644 index 0000000..8e7105c Binary files /dev/null and b/narrative-audio-system/tests/fixtures/speech_gap_speech.wav differ diff --git a/narrative-audio-system/tests/fixtures/speech_only.wav b/narrative-audio-system/tests/fixtures/speech_only.wav new file mode 100644 index 0000000..cd2da52 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/speech_only.wav differ diff --git a/narrative-audio-system/tests/fixtures/transcriber/edge/long_speech.wav b/narrative-audio-system/tests/fixtures/transcriber/edge/long_speech.wav new file mode 100644 index 0000000..a179bbc Binary files /dev/null and b/narrative-audio-system/tests/fixtures/transcriber/edge/long_speech.wav differ diff --git a/narrative-audio-system/tests/fixtures/transcriber/edge/manifest.json b/narrative-audio-system/tests/fixtures/transcriber/edge/manifest.json new file mode 100644 index 0000000..685379a --- /dev/null +++ b/narrative-audio-system/tests/fixtures/transcriber/edge/manifest.json @@ -0,0 +1,28 @@ +{ + "files": [ + { + "filename": "silence_3s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\transcriber\\edge\\silence_3s.wav", + "duration_s": 3.0, + "note": "3 s pure silence -> transcriber returns empty/whitespace text" + }, + { + "filename": "noise_3s.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\transcriber\\edge\\noise_3s.wav", + "duration_s": 3.0, + "note": "broadband noise -> transcriber may hallucinate; result must still be str" + }, + { + "filename": "tiny_speech.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\transcriber\\edge\\tiny_speech.wav", + "duration_s": 0.15, + "note": "150 ms speech clip -> result must be str without crashing" + }, + { + "filename": "long_speech.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\tests\\fixtures\\transcriber\\edge\\long_speech.wav", + "duration_s": 8.6, + "note": "8.6 s speech -> tests longer utterance handling" + } + ] +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/transcriber/edge/noise_3s.wav b/narrative-audio-system/tests/fixtures/transcriber/edge/noise_3s.wav new file mode 100644 index 0000000..2613ac3 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/transcriber/edge/noise_3s.wav
differ diff --git a/narrative-audio-system/tests/fixtures/transcriber/edge/silence_3s.wav b/narrative-audio-system/tests/fixtures/transcriber/edge/silence_3s.wav new file mode 100644 index 0000000..37e1d82 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/transcriber/edge/silence_3s.wav differ diff --git a/narrative-audio-system/tests/fixtures/transcriber/edge/tiny_speech.wav b/narrative-audio-system/tests/fixtures/transcriber/edge/tiny_speech.wav new file mode 100644 index 0000000..baaebc4 Binary files /dev/null and b/narrative-audio-system/tests/fixtures/transcriber/edge/tiny_speech.wav differ diff --git a/narrative-audio-system/tests/fixtures/transcriber/integration/ravdess_manifest.json b/narrative-audio-system/tests/fixtures/transcriber/integration/ravdess_manifest.json new file mode 100644 index 0000000..f2ada38 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/transcriber/integration/ravdess_manifest.json @@ -0,0 +1,46 @@ +{ + "description": "RAVDESS files with known ground-truth text", + "ground_truth_note": "Whisper tiny may paraphrase. Tests check for keyword presence, not exact match.", + "keywords": { + "01": [ + "kids", + "talking", + "door" + ], + "02": [ + "dogs", + "sitting", + "door" + ] + }, + "files": [ + { + "filename": "03-01-01-01-01-01-01.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\examples\\03-01-01-01-01-01-01.wav", + "statement_code": "01", + "ground_truth": "Kids are talking by the door", + "start_s": 0.0 + }, + { + "filename": "03-01-01-01-01-01-04.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\examples\\03-01-01-01-01-01-04.wav", + "statement_code": "01", + "ground_truth": "Kids are talking by the door", + "start_s": 0.0 + }, + { + "filename": "03-01-01-01-02-01-01.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\examples\\03-01-01-01-02-01-01.wav", + "statement_code": "02", + "ground_truth": "Dogs are sitting by the door", + "start_s": 0.0 + }, + { + "filename": "03-01-01-01-02-01-04.wav", + "filepath": "C:\\Users\\EvonHoEvonHo\\humanai-foundation.github.io\\narrative-audio-system\\examples\\03-01-01-01-02-01-04.wav", + "statement_code": "02", + "ground_truth": "Dogs are sitting by the door", + "start_s": 0.0 + } + ] +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/transcriber/unit/empty_utterances.json b/narrative-audio-system/tests/fixtures/transcriber/unit/empty_utterances.json new file mode 100644 index 0000000..78435b5 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/transcriber/unit/empty_utterances.json @@ -0,0 +1,6 @@ +{ + "name": "empty_utterances", + "utterances": [], + "note": "no utterances -> no results, full_transcript is empty", + "expected_results": 0 +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/transcriber/unit/single_utterance.json b/narrative-audio-system/tests/fixtures/transcriber/unit/single_utterance.json new file mode 100644 index 0000000..1b7b8a7 --- /dev/null +++ b/narrative-audio-system/tests/fixtures/transcriber/unit/single_utterance.json @@ -0,0 +1,12 @@ +{ + "name": "single_utterance", + "utterances": [ + { + "start": 1.2, + "end": 3.84, + "duration_s": 2.64 + } + ], + "note": "1 utterance -> 1 TranscriptionResult", + "expected_results": 1 +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/transcriber/unit/three_utterances.json 
b/narrative-audio-system/tests/fixtures/transcriber/unit/three_utterances.json new file mode 100644 index 0000000..20515ed --- /dev/null +++ b/narrative-audio-system/tests/fixtures/transcriber/unit/three_utterances.json @@ -0,0 +1,22 @@ +{ + "name": "three_utterances", + "utterances": [ + { + "start": 0.5, + "end": 2.5, + "duration_s": 2.0 + }, + { + "start": 3.1, + "end": 5.2, + "duration_s": 2.1 + }, + { + "start": 6.0, + "end": 7.8, + "duration_s": 1.8 + } + ], + "note": "3 utterances -> 3 results; full_transcript joins all", + "expected_results": 3 +} \ No newline at end of file diff --git a/narrative-audio-system/tests/fixtures/transcriber/unit/zero_duration_edge.json b/narrative-audio-system/tests/fixtures/transcriber/unit/zero_duration_edge.json new file mode 100644 index 0000000..6a9b84d --- /dev/null +++ b/narrative-audio-system/tests/fixtures/transcriber/unit/zero_duration_edge.json @@ -0,0 +1,12 @@ +{ + "name": "zero_duration_edge", + "utterances": [ + { + "start": 2.0, + "end": 2.0, + "duration_s": 0.0 + } + ], + "note": "zero-duration utterance should not crash", + "expected_results": 1 +} \ No newline at end of file diff --git a/narrative-audio-system/tests/generate_captured_audio.py b/narrative-audio-system/tests/generate_captured_audio.py new file mode 100644 index 0000000..a41dabc --- /dev/null +++ b/narrative-audio-system/tests/generate_captured_audio.py @@ -0,0 +1,106 @@ +""" +Generate captured_audio.wav for pipeline testing by concatenating real +RAVDESS speech files — one per emotion — with 0.8 s silence between them. + +RAVDESS filename convention (3rd field = emotion): + 01=neutral 02=calm 03=happy 04=sad + 05=angry 06=fearful 07=disgust 08=surprised + +Selected clips (one speaker, both statements): + calm -> 03-01-02-01-02-01-01.wav + happy -> 03-01-03-01-01-01-01.wav + angry -> 03-01-05-01-02-01-01.wav (tense/angry) + fearful -> 03-01-06-01-01-01-01.wav + +Each clip is resampled to 16 kHz mono and RMS-normalised to a consistent +loudness before concatenation so VAD fires evenly on all segments. + +Output: examples/captured_audio.wav + +Run: + python tests/generate_captured_audio.py +""" + +from pathlib import Path + +import librosa +import numpy as np +import soundfile as sf + +ROOT = Path(__file__).resolve().parent.parent +EX_DIR = ROOT / "examples" +OUT_PATH = EX_DIR / "captured_audio.wav" +SR = 16000 +PAUSE_S = 0.8 # silence between utterances +TARGET_RMS = 0.18 + + +def _load_resample(path: Path) -> np.ndarray: + """Load a wav file, convert to 16 kHz mono float32.""" + audio, sr = sf.read(str(path), dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != SR: + audio = librosa.resample(audio, orig_sr=sr, target_sr=SR) + return audio.astype(np.float32) + + +def _rms_normalise(audio: np.ndarray, target: float = TARGET_RMS) -> np.ndarray: + rms = np.sqrt(np.mean(audio ** 2)) + if rms < 1e-9: + return audio + return (audio * (target / rms)).astype(np.float32) + + +# --------------------------------------------------------------------------- +# Pick one representative file per emotion from what exists in examples/ +# Fallback: scan for any file matching the emotion code if preferred missing. 
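+ # Worked example of the filename convention (values per the RAVDESS scheme): + # 03-01-02-01-02-01-01.wav -> modality=03 (audio-only), channel=01 (speech), + # emotion=02 (calm), intensity=01 (normal), statement=02 ("Dogs are sitting + # by the door"), repetition=01, actor=01.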
+# --------------------------------------------------------------------------- + +def _pick(emotion_code: str) -> Path: + """Return first available file for this emotion code (field 3 = 1-indexed).""" + pattern = f"03-01-{emotion_code:02d}-*.wav" + matches = sorted(EX_DIR.glob(pattern)) + if not matches: + raise FileNotFoundError( + f"No RAVDESS file matching {pattern} in {EX_DIR}.\n" + f"Download RAVDESS or run: python examples/download_ravdess.py" + ) + # Prefer statement 2 ("children are talking") actor 1 if available + preferred = [m for m in matches if m.name.endswith("-02-01-01.wav")] + return preferred[0] if preferred else matches[0] + + +CLIPS = [ + ("calm", _pick(2)), + ("happy", _pick(3)), + ("angry", _pick(5)), + ("fearful", _pick(6)), +] + +# --------------------------------------------------------------------------- +# Assemble +# --------------------------------------------------------------------------- + +silence = np.zeros(int(PAUSE_S * SR), dtype=np.float32) +lead_in = np.zeros(int(0.3 * SR), dtype=np.float32) + +audio = lead_in +for i, (label, path) in enumerate(CLIPS): + seg = _rms_normalise(_load_resample(path)) + audio = np.concatenate([audio, seg]) + if i < len(CLIPS) - 1: + audio = np.concatenate([audio, silence]) + +audio = np.concatenate([audio, np.zeros(int(0.3 * SR), dtype=np.float32)]) + +sf.write(str(OUT_PATH), audio, SR, subtype="PCM_16") + +total_s = len(audio) / SR +print(f"[generate_captured_audio] Wrote {OUT_PATH}") +print(f" Duration : {total_s:.2f}s | {len(CLIPS)} utterances " + f"(+{PAUSE_S}s gaps)") +for label, path in CLIPS: + seg = _rms_normalise(_load_resample(path)) + rms = np.sqrt(np.mean(seg ** 2)) + print(f" [{label:<7}] {path.name} ({len(seg)/SR:.2f}s, RMS={rms:.4f})") diff --git a/narrative-audio-system/tests/generate_emotion_audio.py b/narrative-audio-system/tests/generate_emotion_audio.py new file mode 100644 index 0000000..8d9a285 --- /dev/null +++ b/narrative-audio-system/tests/generate_emotion_audio.py @@ -0,0 +1,399 @@ +""" +Generate long synthetic audio test files with distinct emotional tones +for testing the Step 5 EmotionClassifier. + +Each file is 6–8 s long and is designed with acoustic properties that +match the MFCC fingerprint each emotion class is known to produce: + + calm — steady low pitch (~120 Hz), soft energy, slow regular rhythm + happy — higher pitch (~200 Hz), upward inflections, brighter harmonics + angry — high energy, harsh upper harmonics, fast irregular bursts + sad — low pitch (~90 Hz), falling contour, sparse energy + fearful — breathy, mid-high pitch (~180 Hz), erratic amplitude + neutral — flat pitch (~150 Hz), constant moderate energy + surprised — sudden high-pitch onset (~260 Hz), short burst then decay + disgust — low-mid pitch, strong low-frequency noise, staccato bursts + +Files written to tests/fixtures/emotion_audio/: + calm_6s.wav happy_7s.wav angry_8s.wav + sad_6s.wav fearful_7s.wav neutral_6s.wav + surprised_5s.wav disgust_7s.wav + +A manifest (manifest.json) records the expected label for each file. 
+ +Run: + python tests/generate_emotion_audio.py +""" + +import json +from pathlib import Path + +import numpy as np +import soundfile as sf +from numpy.fft import rfft, irfft + +SAMPLE_RATE = 16000 +OUT_DIR = Path(__file__).parent / "fixtures" / "emotion_audio" +OUT_DIR.mkdir(parents=True, exist_ok=True) + +RNG = np.random.default_rng(seed=777) + + +# --------------------------------------------------------------------------- +# Low-level audio primitives +# --------------------------------------------------------------------------- + +def _silence(duration_s: float) -> np.ndarray: + n = int(duration_s * SAMPLE_RATE) + return RNG.normal(0, 5e-5, n).astype(np.float32) + + +def _bandpass(signal: np.ndarray, low_hz: float, high_hz: float) -> np.ndarray: + """Zero-phase bandpass via FFT zeroing.""" + spectrum = rfft(signal) + freqs = np.fft.rfftfreq(len(signal), d=1.0 / SAMPLE_RATE) + spectrum[(freqs < low_hz) | (freqs > high_hz)] = 0 + return irfft(spectrum, n=len(signal)).astype(np.float32) + + +def _voiced( + duration_s: float, + f0: float = 150.0, + f0_vibrato_hz: float = 0.0, + f0_vibrato_depth: float = 0.0, + harmonic_gains: tuple = (1.0, 0.6, 0.4, 0.25, 0.15, 0.1), + amplitude: float = 0.35, + spectral_tilt: float = 0.0, # dB/octave, negative = darker + formant_noise_amp: float = 0.04, + formant_low: float = 400.0, + formant_high: float = 3500.0, + breathiness: float = 0.0, # additive breathiness (0-1) +) -> np.ndarray: + """ + Synthesise voiced speech-like audio. + + Parameters control the acoustic features that distinguish emotions: + f0 — fundamental frequency (pitch) + f0_vibrato_* — slow pitch modulation (expressiveness) + harmonic_gains — relative amplitude of harmonics + spectral_tilt — spectral slope (harsh = more high-freq energy) + breathiness — unvoiced noise mixed into the signal + """ + n = int(duration_s * SAMPLE_RATE) + t = np.linspace(0, duration_s, n, endpoint=False) + + # Pitch contour with optional vibrato + if f0_vibrato_hz > 0: + f0_contour = f0 + f0_vibrato_depth * np.sin(2 * np.pi * f0_vibrato_hz * t) + else: + f0_contour = np.full(n, f0) + + # Cumulative phase for each harmonic (handles variable f0) + dt = 1.0 / SAMPLE_RATE + phase = np.cumsum(2 * np.pi * f0_contour * dt) + + signal = np.zeros(n, dtype=np.float64) + for k, gain in enumerate(harmonic_gains, start=1): + # Apply spectral tilt: gain_k *= k^(tilt/20) + tilt_factor = (k ** (spectral_tilt / 20.0)) if spectral_tilt != 0 else 1.0 + signal += gain * tilt_factor * np.sin(k * phase) + + # Formant noise + formant_noise = RNG.normal(0, formant_noise_amp, n) + formant_noise = _bandpass(formant_noise.astype(np.float32), formant_low, formant_high) + signal += formant_noise + + # Breathiness (wideband noise, softer) + if breathiness > 0: + breath = RNG.normal(0, breathiness, n).astype(np.float32) + breath = _bandpass(breath, 1000, 8000) + signal += breath + + signal = signal.astype(np.float32) + peak = np.abs(signal).max() + if peak > 0: + signal /= peak + signal *= amplitude + + # Fade edges + fade = min(int(0.015 * SAMPLE_RATE), n // 8) + ramp = np.linspace(0, 1, fade, dtype=np.float32) + signal[:fade] *= ramp + signal[-fade:] *= ramp[::-1] + + return signal + + +def _amplitude_envelope(signal: np.ndarray, envelope: np.ndarray) -> np.ndarray: + """Multiply signal by a pre-built amplitude envelope (same length).""" + env = np.interp( + np.linspace(0, 1, len(signal)), + np.linspace(0, 1, len(envelope)), + envelope, + ).astype(np.float32) + return (signal * env).astype(np.float32) + + +def _phrase_rhythm( + 
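+ # Builds [phrase][pause] num_phrases times (the trailing pause is kept).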
phrase_duration_s: float, + pause_duration_s: float, + num_phrases: int, + voiced_fn, + ) -> np.ndarray: + """Produce speech-like phrases separated by short pauses.""" + parts = [] + for _ in range(num_phrases): + parts.append(voiced_fn(phrase_duration_s)) + parts.append(_silence(pause_duration_s)) + return np.concatenate(parts).astype(np.float32) + + + def _save(name: str, audio: np.ndarray, label: str) -> dict: + path = OUT_DIR / name + sf.write(str(path), audio, SAMPLE_RATE) + duration = len(audio) / SAMPLE_RATE + print(f" {name:<28} {duration:.2f}s label={label!r}") + return { + "filename": name, + "filepath": str(path.resolve()), + "label": label, + "duration_s": round(duration, 3), + "sample_rate": SAMPLE_RATE, + } + + + # --------------------------------------------------------------------------- + # Emotion-specific audio generators + # --------------------------------------------------------------------------- + + def make_calm() -> np.ndarray: + """ + Calm narration: steady low pitch (120 Hz), soft regular rhythm, + minimal spectral energy above 2 kHz, gentle vibrato. + Duration: ~6 s + """ + def phrase(d): + return _voiced(d, f0=120.0, f0_vibrato_hz=4.5, f0_vibrato_depth=3.0, + harmonic_gains=(1.0, 0.55, 0.3, 0.15, 0.08, 0.04), + amplitude=0.28, spectral_tilt=-2.0, + formant_noise_amp=0.02, formant_high=2500.0) + + return _phrase_rhythm(0.9, 0.35, 5, phrase) + + + def make_happy() -> np.ndarray: + """ + Happy speech: elevated pitch (200 Hz), bright upper harmonics, + upward-inflected phrases, bouncy rhythm. + Duration: ~5.6 s + """ + parts = [] + pitches = [195.0, 210.0, 200.0, 220.0, 205.0, 215.0] + for i, f0 in enumerate(pitches): + seg = _voiced(0.75, f0=f0, f0_vibrato_hz=5.0, f0_vibrato_depth=8.0, + harmonic_gains=(1.0, 0.7, 0.55, 0.4, 0.3, 0.2), + amplitude=0.38, spectral_tilt=1.5, + formant_noise_amp=0.05, formant_high=4000.0) + # Rising envelope on each phrase + env = np.linspace(0.6, 1.0, len(seg)).astype(np.float32) + parts.append(seg * env) + parts.append(_silence(0.22 if i % 2 == 0 else 0.15)) + return np.concatenate(parts).astype(np.float32) + + + def make_angry() -> np.ndarray: + """ + Angry speech: high energy, harsh upper harmonics (spectral_tilt +4), + fast irregular bursts, clipped amplitude shape. + Duration: ~5 s + """ + parts = [] + phrase_lens = [0.55, 0.4, 0.7, 0.45, 0.6, 0.5, 0.65, 0.4] + pause_lens = [0.1, 0.08, 0.12, 0.09, 0.1, 0.08, 0.11, 0.15] + for plen, slen in zip(phrase_lens, pause_lens): + f0 = float(RNG.uniform(165, 195)) + seg = _voiced(plen, f0=f0, f0_vibrato_hz=0.0, + harmonic_gains=(1.0, 0.8, 0.65, 0.55, 0.45, 0.4), + amplitude=0.55, spectral_tilt=4.0, + formant_noise_amp=0.09, formant_low=300.0, formant_high=5000.0) + # Hard-clip to simulate vocal strain + seg = np.clip(seg, -0.45, 0.45).astype(np.float32) + parts.append(seg) + parts.append(_silence(slen)) + return np.concatenate(parts).astype(np.float32) + + + def make_sad() -> np.ndarray: + """ + Sad speech: low pitch (90 Hz), falling contour on each phrase, + sparse energy, slow pace, dark spectral tilt.
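+ Phrase lengths grow 0.95 -> 1.25 s and pauses 0.50 -> 0.65 s, giving + about 4.4 s of voiced audio plus 2.3 s of silence in total.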
+ Duration: ~6.7 s + """ + parts = [] + for i in range(4): + dur = 0.95 + i * 0.1 + seg = _voiced(dur, f0=92.0, f0_vibrato_hz=2.5, f0_vibrato_depth=2.0, + harmonic_gains=(1.0, 0.45, 0.22, 0.1, 0.05, 0.02), + amplitude=0.22, spectral_tilt=-3.5, + formant_noise_amp=0.015, formant_high=2000.0) + # Falling amplitude envelope + env = np.linspace(1.0, 0.35, len(seg)).astype(np.float32) + parts.append(seg * env) + parts.append(_silence(0.5 + i * 0.05)) + return np.concatenate(parts).astype(np.float32) + + + def make_fearful() -> np.ndarray: + """ + Fearful speech: high mid pitch (175 Hz), breathy, erratic amplitude + (trembling), irregular phrase lengths, high breathiness. + Duration: ~5.5 s + """ + parts = [] + for i in range(7): + dur = float(RNG.uniform(0.45, 0.85)) + f0 = float(RNG.uniform(160, 190)) + amp = float(RNG.uniform(0.18, 0.38)) + seg = _voiced(dur, f0=f0, f0_vibrato_hz=8.0, f0_vibrato_depth=12.0, + harmonic_gains=(1.0, 0.5, 0.35, 0.2, 0.15, 0.1), + amplitude=amp, spectral_tilt=0.5, + formant_noise_amp=0.04, formant_high=3500.0, + breathiness=0.12) + # Trembling: fast amplitude modulation + t = np.linspace(0, dur, len(seg)) + tremor = (1.0 + 0.25 * np.sin(2 * np.pi * 7.5 * t)).astype(np.float32) + parts.append((seg * tremor).astype(np.float32)) + parts.append(_silence(float(RNG.uniform(0.08, 0.25)))) + return np.concatenate(parts).astype(np.float32) + + + def make_neutral() -> np.ndarray: + """ + Neutral narration: flat pitch (150 Hz), constant moderate energy, + regular phrase structure, balanced spectrum. + Duration: ~6 s + """ + def phrase(d): + return _voiced(d, f0=150.0, f0_vibrato_hz=0.0, + harmonic_gains=(1.0, 0.55, 0.35, 0.2, 0.12, 0.07), + amplitude=0.32, spectral_tilt=0.0, + formant_noise_amp=0.03, formant_high=3000.0) + + return _phrase_rhythm(0.85, 0.30, 5, phrase) + + + def make_surprised() -> np.ndarray: + """ + Surprised exclamation: sudden high-pitch burst (260 Hz) at onset, + rapid decay in pitch and energy. + Duration: ~4.75 s + """ + parts = [] + # Opening burst — very high pitch + burst = _voiced(0.5, f0=265.0, + harmonic_gains=(1.0, 0.75, 0.6, 0.5, 0.4, 0.35), + amplitude=0.55, spectral_tilt=2.5, + formant_noise_amp=0.07, formant_high=5000.0) + parts.append(burst) + parts.append(_silence(0.15)) + + # Trailing phrases — pitch falls after the initial surprise + falling_f0 = [240.0, 215.0, 195.0, 175.0, 162.0] + for f0 in falling_f0: + seg = _voiced(0.6, f0=f0, + harmonic_gains=(1.0, 0.65, 0.45, 0.3, 0.2, 0.12), + amplitude=0.38, spectral_tilt=1.0, + formant_high=3500.0) + parts.append(seg) + parts.append(_silence(0.22)) + return np.concatenate(parts).astype(np.float32) + + + def make_disgust() -> np.ndarray: + """ + Disgust: low-mid pitch (105 Hz), strong low-frequency noise (growl), + staccato bursts, heavy spectral tilt emphasising lower harmonics.
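+ Six bursts of 0.55–0.85 s with 0.18–0.35 s gaps (total varies with the + RNG draw; the committed fixture is 6.23 s).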
+ Duration: ~7 s + """ + parts = [] + for i in range(6): + dur = float(RNG.uniform(0.55, 0.85)) + seg = _voiced(dur, f0=105.0, f0_vibrato_hz=3.0, f0_vibrato_depth=4.0, + harmonic_gains=(1.0, 0.5, 0.3, 0.18, 0.1, 0.06), + amplitude=0.42, spectral_tilt=-1.5, + formant_noise_amp=0.06, formant_low=100.0, formant_high=2200.0) + # Low growl layer + growl = RNG.normal(0, 0.08, len(seg)).astype(np.float32) + growl = _bandpass(growl, 60.0, 350.0) + seg = np.clip(seg + growl * 0.35, -0.8, 0.8).astype(np.float32) + parts.append(seg) + parts.append(_silence(float(RNG.uniform(0.18, 0.35)))) + return np.concatenate(parts).astype(np.float32) + + +# --------------------------------------------------------------------------- +# Registry and writer +# --------------------------------------------------------------------------- + +# Per-emotion RMS targets preserve relative energy ordering. +# angry > neutral/happy > calm/fearful > sad (matching real RAVDESS levels). +EMOTION_GENERATORS = [ + ("calm_6s.wav", "calm", make_calm, 0.18), + ("happy_7s.wav", "happy", make_happy, 0.26), + ("angry_8s.wav", "angry", make_angry, 0.38), + ("sad_6s.wav", "sad", make_sad, 0.12), + ("fearful_7s.wav", "fearful", make_fearful, 0.20), + ("neutral_6s.wav", "neutral", make_neutral, 0.24), + ("surprised_5s.wav", "surprised", make_surprised, 0.30), + ("disgust_7s.wav", "disgust", make_disgust, 0.28), +] + + +def _rms_normalise(audio: np.ndarray, target_rms: float) -> np.ndarray: + current = float(np.sqrt(np.mean(audio ** 2))) + if current > 1e-8: + audio = audio * (target_rms / current) + return np.clip(audio, -0.95, 0.95).astype(np.float32) + + +def main(): + print(f"Generating emotion audio fixtures in {OUT_DIR}/\n") + manifest_entries = [] + for filename, label, generator, target_rms in EMOTION_GENERATORS: + audio = generator() + audio = _rms_normalise(audio, target_rms) + entry = _save(filename, audio, label) + manifest_entries.append(entry) + + manifest = { + "description": ( + "Synthetic emotion audio fixtures for Step 5 classifier tests. " + "Each file is designed with acoustic properties matching the " + "MFCC fingerprint of its target emotion class." + ), + "acoustic_design": { + "calm": "f0=120Hz, soft energy, dark spectrum, slow rhythm", + "happy": "f0=200Hz, bright harmonics, rising inflections", + "angry": "f0=175Hz, high energy, hard-clipped, fast bursts", + "sad": "f0=90Hz, falling contour, sparse energy, slow", + "fearful": "f0=175Hz, breathy, trembling amplitude, irregular", + "neutral": "f0=150Hz, flat, constant energy, balanced spectrum", + "surprised": "f0=260Hz onset falling to 160Hz, short burst", + "disgust": "f0=105Hz, low-freq growl, staccato, dark tilt", + }, + "note": ( + "Classifier predictions on synthetic audio may not always match " + "the target label — the model was trained on RAVDESS human speech. " + "Tests assert structural correctness and soft acoustic constraints, " + "not exact label matching." 
+ ), + "files": manifest_entries, + } + manifest_path = OUT_DIR / "manifest.json" + manifest_path.write_text(json.dumps(manifest, indent=2)) + print(f"\n Manifest -> {manifest_path}") + print(f"\nDone -- {len(manifest_entries)} emotion audio files ready.") + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/tests/generate_output_fixtures.py b/narrative-audio-system/tests/generate_output_fixtures.py new file mode 100644 index 0000000..e56a09f --- /dev/null +++ b/narrative-audio-system/tests/generate_output_fixtures.py @@ -0,0 +1,298 @@ +""" +Generate synthetic fixtures for Step 6 — Output Generator tests. + +Produces: + tests/fixtures/output/ + mock_pairs.json — list of (transcript, emotion) pair dicts + srt_expected.srt — expected SRT output for the mock pairs + caption_lines.json — expected CaptionLine.to_dict() for each pair + atmosphere_schedules.json — expected CrossfadeSchedule.to_dict() for each pair + +Run: + python tests/generate_output_fixtures.py +""" + +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +FIXTURE_DIR = ROOT / "tests" / "fixtures" / "output" +FIXTURE_DIR.mkdir(parents=True, exist_ok=True) + +# --------------------------------------------------------------------------- +# Mock (transcript, emotion) pairs +# --------------------------------------------------------------------------- + +MOCK_PAIRS = [ + { + "transcript": { + "text": "The forest was quiet that night.", + "start": 0.0, + "end": 2.5, + "latency_ms": 180.0, + "backend": "faster-whisper", + "confidence": 0.95, + "language": "en", + }, + "emotion": { + "label": "calm", + "confidence": 0.88, + "start": 0.0, + "end": 2.5, + "latency_ms": 12.0, + "backend": "mfcc-mlp", + "all_scores": {"calm": 0.88, "neutral": 0.08, "happy": 0.04}, + }, + }, + { + "transcript": { + "text": "Until the branch snapped.", + "start": 3.1, + "end": 4.8, + "latency_ms": 190.0, + "backend": "faster-whisper", + "confidence": 0.91, + "language": "en", + }, + "emotion": { + "label": "tense", + "confidence": 0.79, + "start": 3.1, + "end": 4.8, + "latency_ms": 11.5, + "backend": "mfcc-mlp", + "all_scores": {"tense": 0.79, "fearful": 0.15, "neutral": 0.06}, + }, + }, + { + "transcript": { + "text": "She ran without looking back.", + "start": 5.2, + "end": 7.1, + "latency_ms": 185.0, + "backend": "faster-whisper", + "confidence": 0.93, + "language": "en", + }, + "emotion": { + "label": "fearful", + "confidence": 0.82, + "start": 5.2, + "end": 7.1, + "latency_ms": 13.0, + "backend": "mfcc-mlp", + "all_scores": {"fearful": 0.82, "tense": 0.12, "angry": 0.06}, + }, + }, + { + "transcript": { + "text": "The morning light brought relief.", + "start": 15.0, + "end": 17.2, + "latency_ms": 175.0, + "backend": "faster-whisper", + "confidence": 0.97, + "language": "en", + }, + "emotion": { + "label": "happy", + "confidence": 0.76, + "start": 15.0, + "end": 17.2, + "latency_ms": 10.5, + "backend": "mfcc-mlp", + "all_scores": {"happy": 0.76, "calm": 0.18, "neutral": 0.06}, + }, + }, + # Duplicate emotion within cooldown — should NOT produce a new atmosphere + { + "transcript": { + "text": "She smiled at the sunrise.", + "start": 17.5, + "end": 19.0, + "latency_ms": 170.0, + "backend": "faster-whisper", + "confidence": 0.94, + "language": "en", + }, + "emotion": { + "label": "happy", + "confidence": 0.81, + "start": 17.5, + "end": 19.0, + "latency_ms": 11.0, + "backend": "mfcc-mlp", + "all_scores": {"happy": 0.81, "calm": 0.14, "neutral": 0.05}, + }, + }, + # Very short transcript 
— should be filtered out by CaptionFormatter + { + "transcript": { + "text": ".", + "start": 20.0, + "end": 20.3, + "latency_ms": 155.0, + "backend": "faster-whisper", + "confidence": 0.40, + "language": "en", + }, + "emotion": { + "label": "neutral", + "confidence": 0.60, + "start": 20.0, + "end": 20.3, + "latency_ms": 9.0, + "backend": "mfcc-mlp", + "all_scores": {"neutral": 0.60, "calm": 0.30, "happy": 0.10}, + }, + }, +] + +# --------------------------------------------------------------------------- +# Expected CaptionLine dicts (skip the too-short entry at index 5) +# --------------------------------------------------------------------------- + +TONE_COLOURS = { + "neutral": "#a8b4c0", + "calm": "#7ec8a0", + "happy": "#f7c948", + "sad": "#6b9bcf", + "angry": "#e05c5c", + "fearful": "#c07ecf", + "disgust": "#8a9e6b", + "surprised": "#f0a045", + "tense": "#e07a5f", + "unknown": "#ffffff", +} + +def _tone_colour(label): + return TONE_COLOURS.get(label.lower(), TONE_COLOURS["unknown"]) + + +def _ts(s: float) -> str: + h = int(s // 3600) + m = int((s % 3600) // 60) + sec = s % 60 + return f"{h:02d}:{m:02d}:{sec:06.3f}".replace(".", ",") + + +caption_lines = [] +srt_lines = [] +caption_idx = 0 + +for pair in MOCK_PAIRS: + tr = pair["transcript"] + em = pair["emotion"] + if len(tr["text"].strip()) < 2: + continue + caption_idx += 1 + cl = { + "type": "caption", + "index": caption_idx, + "label": em["label"], + "text": tr["text"].strip(), + "start": round(tr["start"], 3), + "end": round(tr["end"], 3), + "confidence": round(em["confidence"], 3), + "color": _tone_colour(em["label"]), + } + caption_lines.append(cl) + + srt_block = ( + f"{caption_idx}\n" + f"{_ts(tr['start'])} --> {_ts(tr['end'])}\n" + f"[{em['label']}] {tr['text'].strip()}\n" + ) + srt_lines.append(srt_block) + + +# --------------------------------------------------------------------------- +# Expected atmosphere schedules +# (cooldown logic: pairs[1] tense, pairs[2] fearful trigger; pairs[4] happy +# immediately follows pairs[3] happy and is within cooldown_s=8s — suppressed) +# --------------------------------------------------------------------------- + +TONE_QUERIES = { + "calm": "calm gentle wind soft water ambient", + "neutral": "neutral quiet indoor room tone", + "happy": "upbeat bright cheerful outdoor birds", + "sad": "sad melancholy quiet rain distant", + "angry": "urgent high-energy dramatic intense", + "fearful": "tense dark forest night ambient", + "tense": "tense forest night ambience suspense", + "disgust": "dark low rumble ominous underground", + "surprised": "sudden bright stab high-energy reveal", +} + +FALLBACK_AMBIENT = { + "calm": {"description": "gentle wind, soft water", "energy": "low"}, + "neutral": {"description": "quiet room tone, light hum", "energy": "low"}, + "happy": {"description": "birdsong, light breeze", "energy": "medium"}, + "sad": {"description": "distant rain, sparse piano", "energy": "low"}, + "angry": {"description": "driving percussion, wind", "energy": "high"}, + "fearful": {"description": "dark forest, distant owl, creak","energy": "medium"}, + "tense": {"description": "tense forest night, branch snap","energy": "medium"}, + "disgust": {"description": "low rumble, dripping, echo", "energy": "low"}, + "surprised": {"description": "bright orchestral stab, rush", "energy": "high"}, +} + +# Pairs that trigger an atmosphere change (label changes OR first time): +# pairs[0]: calm (first) -> triggers +# pairs[1]: tense -> triggers (label changed) +# pairs[2]: fearful -> triggers (label 
changed)
# pairs[3]: happy -> triggers (label changed)
# pairs[4]: happy -> suppressed (same label, within ~2s of pairs[3] which is << 8s cooldown)
# pairs[5]: neutral (.) -> caption filtered, atmosphere still evaluated — label changed but text filtered

atmosphere_schedules = []
last_label = None

# Note: in reality cooldown is time-based; for the fixture we just record which
# ones WOULD trigger assuming the test processes them quickly (within 8s each).
# The fixture documents the expected output, not the timing.

for pair_index, pair in enumerate(MOCK_PAIRS):
    em = pair["emotion"]
    label = em["label"].lower()
    query = TONE_QUERIES.get(label, f"{label} ambient atmosphere")
    fb = FALLBACK_AMBIENT.get(label, {"description": f"{label} ambient", "energy": "low"})
    sched = {
        "type": "atmosphere",
        "emotion_label": label,
        "query": query,
        "suggested_clip": fb["description"],
        "suggested_description": fb["description"],
        "fade_in_s": 2.0,
        "lag_s": 6.0,
    }
    atmosphere_schedules.append({
        "pair_index": pair_index,
        "label": label,
        "suppressed": (label == last_label),   # simplified: time-based cooldown not modelled here
        "schedule": sched if label != last_label else None,
    })
    last_label = label


# ---------------------------------------------------------------------------
# Write fixtures
# ---------------------------------------------------------------------------

pairs_path = FIXTURE_DIR / "mock_pairs.json"
pairs_path.write_text(json.dumps(MOCK_PAIRS, indent=2), encoding="utf-8")
print(f"[Output fixtures] Wrote {pairs_path} ({len(MOCK_PAIRS)} pairs)")

captions_path = FIXTURE_DIR / "caption_lines.json"
captions_path.write_text(json.dumps(caption_lines, indent=2), encoding="utf-8")
print(f"[Output fixtures] Wrote {captions_path} ({len(caption_lines)} captions)")

srt_path = FIXTURE_DIR / "srt_expected.srt"
srt_path.write_text("\n".join(srt_lines), encoding="utf-8")
print(f"[Output fixtures] Wrote {srt_path}")

atm_path = FIXTURE_DIR / "atmosphere_schedules.json"
atm_path.write_text(json.dumps(atmosphere_schedules, indent=2), encoding="utf-8")
print(f"[Output fixtures] Wrote {atm_path} ({len(atmosphere_schedules)} entries)")

print("\n[Output fixtures] Done.")
diff --git a/narrative-audio-system/tests/generate_segmenter_fixtures.py b/narrative-audio-system/tests/generate_segmenter_fixtures.py
new file mode 100644
index 0000000..7337085
--- /dev/null
+++ b/narrative-audio-system/tests/generate_segmenter_fixtures.py
@@ -0,0 +1,192 @@
"""
Generate synthetic SpeechSegment fixtures for Step 3 segmenter tests.

Unlike the VAD fixtures (which are WAV files run through webrtcvad), these
fixtures are pre-built SpeechSegment objects with *exact, known* gaps and
durations. This removes VAD uncertainty from the segmenter tests so every
assertion is deterministic.

Fixtures are serialised as JSON to tests/fixtures/segmenter/:
    Each JSON file contains a list of segment dicts:
        {"start": float, "end": float, "duration_s": float}
    Audio is regenerated at load-time from duration_s so we avoid large
    binary files in the repo.
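
    One way a test might rebuild dummy audio for a segment (illustrative
    sketch only, not the project's actual loader):

        seg = {"start": 0.5, "end": 2.5, "duration_s": 2.0}
        n = int(seg["duration_s"] * SAMPLE_RATE)    # 32000 samples at 16 kHz
        audio = np.zeros(n, dtype=np.float32)       # content is irrelevant here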

Scenarios
---------
  single_segment        — 1 segment (2 s) -> 1 utterance always
  two_short_gap         — 2 segs, gap=0.2 s -> 1 utt (gap < pause_s)
  two_long_gap          — 2 segs, gap=0.8 s -> 2 utts (gap > pause_s)
  four_mixed_gaps       — 4 segs, alternating gaps -> 2 utts
  safety_valve          — 2 segs of 4.5 s, gap=0.5 s -> 2 utts (split lands before max_utterance_s=8)
  fixed_window_exact    — 3 segs × 1 s = 3 s speech -> 1 utt at window_s=3.0
  fixed_window_overflow — 4 segs × 1 s = 4 s speech -> 2 utts at window_s=3.0
  empty                 — 0 segments -> 0 utterances
  single_tiny           — 1 seg of 0.1 s -> 1 utt (segmenter doesn't filter)

Run:
    python tests/generate_segmenter_fixtures.py
"""

import json
from pathlib import Path
import numpy as np

SAMPLE_RATE = 16000
OUT_DIR = Path(__file__).parent / "fixtures" / "segmenter"
OUT_DIR.mkdir(parents=True, exist_ok=True)

RNG = np.random.default_rng(seed=7)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def make_segment_dict(start: float, duration: float) -> dict:
    """Return a plain-dict representation of a SpeechSegment."""
    return {
        "start": round(start, 4),
        "end": round(start + duration, 4),
        "duration_s": round(duration, 4),
    }


def save(name: str, segments: list, meta: dict):
    """Write fixture JSON and print a summary line."""
    payload = {"segments": segments, "meta": meta}
    path = OUT_DIR / f"{name}.json"
    path.write_text(json.dumps(payload, indent=2))
    total_speech = sum(s["duration_s"] for s in segments)
    print(
        f" {name + '.json':<35} {len(segments)} segment(s) "
        f"{total_speech:.2f}s speech "
        f"-> {meta['expected_utterances']} utt(s) [{meta['note']}]"
    )


# ---------------------------------------------------------------------------
# Fixture definitions
# Each entry: (name, segments_list, meta_dict)
# ---------------------------------------------------------------------------

FIXTURES = []

# 1. Single 2-second segment — always 1 utterance regardless of strategy
FIXTURES.append(("single_segment", [
    make_segment_dict(start=1.0, duration=2.0),
], {
    "expected_utterances": 1,
    "strategies": ["pause_triggered", "fixed_window"],
    "note": "1 seg -> 1 utterance always",
}))

# 2. Two segments, short gap (0.2 s) — pause_triggered merges them
FIXTURES.append(("two_short_gap", [
    make_segment_dict(start=0.5, duration=1.0),
    make_segment_dict(start=1.7, duration=1.0),   # gap = 0.2 s
], {
    "expected_utterances": 1,
    "gap_s": 0.2,
    "pause_s": 0.4,
    "strategies": ["pause_triggered"],
    "note": "gap 0.2s < pause_s 0.4s -> merged",
}))

# 3. Two segments, long gap (0.8 s) — pause_triggered splits them
FIXTURES.append(("two_long_gap", [
    make_segment_dict(start=0.5, duration=1.0),
    make_segment_dict(start=2.3, duration=1.0),   # gap = 0.8 s
], {
    "expected_utterances": 2,
    "gap_s": 0.8,
    "pause_s": 0.4,
    "strategies": ["pause_triggered"],
    "note": "gap 0.8s > pause_s 0.4s -> split",
}))

# 4. Four segments with alternating short/long gaps
#    gaps: short(0.2s), long(0.6s), short(0.15s)
#    -> segs 0+1 merge, segs 2+3 merge -> 2 utterances
FIXTURES.append(("four_mixed_gaps", [
    make_segment_dict(start=0.5, duration=0.8),
    make_segment_dict(start=1.5, duration=0.8),    # gap = 0.2 s (short)
    make_segment_dict(start=2.9, duration=0.8),    # gap = 0.6 s (long) -> boundary
    make_segment_dict(start=3.85, duration=0.8),   # gap = 0.15 s (short)
], {
    "expected_utterances": 2,
    "pause_s": 0.4,
    "strategies": ["pause_triggered"],
    "note": "short/long/short gaps -> 2 utterances",
}))

# 5. Safety valve — two 4.5 s segments, 9 s of speech in total.
#    A single 10 s segment would not exercise the valve cleanly: the valve
#    fires on *accumulated* speech, so a segment arriving all at once is not
#    split mid-segment and simply emits afterwards. Here the 0.5 s gap
#    (> pause_s) splits the stream first, giving 2 utterances.
FIXTURES.append(("safety_valve", [
    make_segment_dict(start=0.0, duration=4.5),
    make_segment_dict(start=5.0, duration=4.5),   # gap=0.5 > pause_s; total=9s
], {
    "expected_utterances": 2,
    "pause_s": 0.4,
    "max_utterance_s": 8.0,
    "strategies": ["pause_triggered"],
    "note": "gap triggers split before max_utterance_s; 2 utts",
}))

# 6. Fixed-window exact: 3 × 1 s segments -> exactly window_s=3.0 on third
FIXTURES.append(("fixed_window_exact", [
    make_segment_dict(start=0.0, duration=1.0),
    make_segment_dict(start=1.5, duration=1.0),
    make_segment_dict(start=3.0, duration=1.0),
], {
    "expected_utterances": 1,
    "window_s": 3.0,
    "strategies": ["fixed_window"],
    "note": "3×1s = 3s speech == window_s -> 1 utt on 3rd feed",
}))

# 7. Fixed-window overflow: 4 × 1 s segments, window_s=3.0 -> emit at 3 s, flush 1 s
FIXTURES.append(("fixed_window_overflow", [
    make_segment_dict(start=0.0, duration=1.0),
    make_segment_dict(start=1.5, duration=1.0),
    make_segment_dict(start=3.0, duration=1.0),
    make_segment_dict(start=4.5, duration=1.0),
], {
    "expected_utterances": 2,
    "window_s": 3.0,
    "strategies": ["fixed_window"],
    "note": "4×1s=4s, window=3s -> emit at 3s, flush 1s -> 2 utts",
}))

# 8. Empty input — no segments
FIXTURES.append(("empty", [], {
    "expected_utterances": 0,
    "strategies": ["pause_triggered", "fixed_window"],
    "note": "no segments -> no utterances",
}))

# 9. Single tiny segment (100 ms) — segmenter does not filter by duration
FIXTURES.append(("single_tiny", [
    make_segment_dict(start=0.5, duration=0.1),
], {
    "expected_utterances": 1,
    "strategies": ["pause_triggered", "fixed_window"],
    "note": "0.1s segment — segmenter emits; filtering is VAD's job",
}))


# ---------------------------------------------------------------------------
# Write fixtures
# ---------------------------------------------------------------------------

def main():
    print(f"Generating segmenter fixtures in {OUT_DIR}/\n")
    for name, segments, meta in FIXTURES:
        save(name, segments, meta)
    print(f"\nDone — {len(FIXTURES)} fixture files written.")


if __name__ == "__main__":
    main()
diff --git a/narrative-audio-system/tests/generate_test_audio.py b/narrative-audio-system/tests/generate_test_audio.py
new file mode 100644
index 0000000..2fd073d
--- /dev/null
+++ b/narrative-audio-system/tests/generate_test_audio.py
@@ -0,0 +1,209 @@
"""
Generate synthetic WAV test fixtures for Step 2 VAD tests.

Each file has a known ground-truth layout of speech and silence so that
test_vad.py can assert exactly which segments the VAD should detect.
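
One way a test might consume the manifest written below (illustrative
sketch only; the real assertions live in test_vad.py):

    for row in (Path("tests/fixtures") / "manifest.csv").read_text().splitlines():
        if row.startswith("#"):
            continue
        # maxsplit=3 keeps commas inside the free-text description intact
        name, expected, duration_s, description = row.split(",", 3)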

Files produced in tests/fixtures/:
  silence_only.wav      — 3 s pure silence → 0 segments
  speech_only.wav       — 3 s continuous speech → 1 segment
  short_burst.wav       — 0.1 s speech + silence → 0 segments (below min)
  speech_gap_speech.wav — 1 s speech, 1 s silence, 1 s speech → 2 segments
  multi_segment.wav     — 4 speech islands in ~7 s → 4 segments
  noisy_speech.wav      — speech buried in background noise → >=1 segment
  quiet_speech.wav      — low-amplitude speech → >=1 segment

Run:
    python tests/generate_test_audio.py
"""

from pathlib import Path
import numpy as np
import soundfile as sf

SAMPLE_RATE = 16000
OUT_DIR = Path(__file__).parent / "fixtures"
OUT_DIR.mkdir(exist_ok=True)

RNG = np.random.default_rng(seed=42)


# ---------------------------------------------------------------------------
# Primitive builders
# ---------------------------------------------------------------------------

def silence(duration_s: float, sr: int = SAMPLE_RATE) -> np.ndarray:
    """Near-zero noise floor — amplitude well below VAD energy threshold."""
    n = int(duration_s * sr)
    return RNG.normal(0, 1e-5, n).astype(np.float32)


def speech(duration_s: float, sr: int = SAMPLE_RATE, amplitude: float = 0.4) -> np.ndarray:
    """
    Synthetic speech: fundamental + harmonics + formant-band noise.
    Mimics voiced speech well enough for webrtcvad to label as speech.
    """
    n = int(duration_s * sr)
    t = np.linspace(0, duration_s, n, endpoint=False)

    f0 = 160.0  # fundamental (Hz) — within the typical adult voice range
    signal = np.zeros(n, dtype=np.float32)

    # First 6 harmonics with falling amplitude
    for k, gain in enumerate([1.0, 0.6, 0.4, 0.3, 0.2, 0.15], start=1):
        signal += gain * np.sin(2 * np.pi * f0 * k * t).astype(np.float32)

    # Formant-band noise (500–3500 Hz) to simulate fricatives
    noise = RNG.normal(0, 0.05, n).astype(np.float32)
    # Brick-wall bandpass in the frequency domain: zero bins outside 500–3500 Hz
    from numpy.fft import rfft, irfft
    spectrum = rfft(noise)
    freqs = np.fft.rfftfreq(n, d=1.0 / sr)
    spectrum[(freqs < 500) | (freqs > 3500)] = 0
    noise = irfft(spectrum, n=n).astype(np.float32)
    signal += noise

    # Normalise then scale
    peak = np.abs(signal).max()
    if peak > 0:
        signal /= peak
    signal *= amplitude

    # Mild amplitude envelope (avoids hard clicks at edges)
    fade = int(0.01 * sr)
    ramp = np.linspace(0, 1, fade, dtype=np.float32)
    signal[:fade] *= ramp
    signal[-fade:] *= ramp[::-1]

    return signal


def concat(*segments) -> np.ndarray:
    return np.concatenate(segments).astype(np.float32)


def save(name: str, audio: np.ndarray, sr: int = SAMPLE_RATE) -> Path:
    path = OUT_DIR / name
    sf.write(str(path), audio, sr)
    duration = len(audio) / sr
    print(f" Wrote {path.name:35s} {duration:.2f} s {len(audio)} samples")
    return path


# ---------------------------------------------------------------------------
# Fixture definitions
# ---------------------------------------------------------------------------

FIXTURES = {}  # name -> {"audio", "expected_segments", "description", "duration_s"}


def _define(name, audio, expected_segments, description):
    FIXTURES[name] = {
        "audio": audio,
        "expected_segments": expected_segments,
        "description": description,
        "duration_s": len(audio) / SAMPLE_RATE,
    }


# 1. Pure silence — nothing should be detected
_define(
    "silence_only.wav",
    concat(silence(3.0)),
    expected_segments=0,
    description="3 s pure silence — expect 0 segments",
)

# 2. Continuous speech — one long segment
_define(
    "speech_only.wav",
    concat(speech(3.0)),
    expected_segments=1,
    description="3 s continuous speech — expect 1 segment",
)

# 3. Very short burst (100 ms) — below min_speech_ms=250, should be discarded
_define(
    "short_burst.wav",
    concat(silence(0.5), speech(0.10), silence(1.0)),
    expected_segments=0,
    description="0.1 s speech burst (below min threshold) — expect 0 segments",
)

# 4. Speech / pause / speech — two clearly separated utterances
_define(
    "speech_gap_speech.wav",
    concat(silence(0.5), speech(1.0), silence(1.0), speech(1.0), silence(0.5)),
    expected_segments=2,
    description="1 s speech, 1 s silence, 1 s speech — expect 2 segments",
)

# 5. Four speech islands — robust multi-segment detection
_define(
    "multi_segment.wav",
    concat(
        silence(0.5),
        speech(0.8),   # segment 1
        silence(0.8),
        speech(1.2),   # segment 2
        silence(1.0),
        speech(0.6),   # segment 3
        silence(0.7),
        speech(1.0),   # segment 4
        silence(0.5),
    ),
    expected_segments=4,
    description="4 speech islands in ~7 s — expect 4 segments",
)

# 6. Speech with background noise — VAD must still detect it.
#    Built stepwise: leading silence, then speech with additive noise,
#    then trailing silence.
_noisy_parts = [silence(0.5)]
_noisy_clean = speech(2.0, amplitude=0.3)
_noisy_noise = RNG.normal(0, 0.08, len(_noisy_clean)).astype(np.float32)
_noisy_parts.append(np.clip(_noisy_clean + _noisy_noise, -1.0, 1.0).astype(np.float32))
_noisy_parts.append(silence(0.5))
_define(
    "noisy_speech.wav",
    concat(*_noisy_parts),
    expected_segments=1,
    description="Speech + background noise — expect >=1 segment",
)

# 7. Low-amplitude speech — soft voice
_define(
    "quiet_speech.wav",
    concat(silence(0.3), speech(2.0, amplitude=0.08), silence(0.3)),
    expected_segments=1,
    description="Low-amplitude speech (amplitude=0.08) — expect >=1 segment",
)


# ---------------------------------------------------------------------------
# Write all fixtures
# ---------------------------------------------------------------------------

def main():
    print(f"Generating VAD test fixtures in {OUT_DIR}/\n")
    manifest_lines = [
        "# VAD test fixtures manifest",
        "# name, expected_segments, duration_s, description",
    ]

    for name, info in FIXTURES.items():
        save(name, info["audio"])
        manifest_lines.append(
            f"{name},{info['expected_segments']},{info['duration_s']:.3f},"
            f"{info['description']}"
        )

    manifest_path = OUT_DIR / "manifest.csv"
    manifest_path.write_text("\n".join(manifest_lines) + "\n")
    print(f"\n Manifest written to {manifest_path}")
    print(f"\nDone — {len(FIXTURES)} fixture files ready.")


if __name__ == "__main__":
    main()
diff --git a/narrative-audio-system/tests/generate_transcriber_fixtures.py b/narrative-audio-system/tests/generate_transcriber_fixtures.py
new file mode 100644
index 0000000..da5f3a4
--- /dev/null
+++ b/narrative-audio-system/tests/generate_transcriber_fixtures.py
@@ -0,0 +1,222 @@
"""
Generate test fixtures for Step 4 Streaming Transcription tests.

Three fixture categories
------------------------

1. unit_fixtures/ (JSON only, no audio — pure API/structural tests)
   Synthetic Utterance descriptors with known start/end/duration so
   TranscriptionResult metadata can be validated without running a model.

2. integration_fixtures/ (JSON manifests pointing to real WAV files)
   Selects a small subset of RAVDESS examples with known ground-truth text.
+ RAVDESS filenames encode the spoken statement: + position 5 (0-indexed 4) == "01" -> "Kids are talking by the door" + position 5 (0-indexed 4) == "02" -> "Dogs are sitting by the door" + Used by integration tests that actually invoke the transcriber model. + +3. edge_fixtures/ (WAV files for boundary-condition tests) + silence_3s.wav — 3 s of pure silence + noise_3s.wav — 3 s of broadband noise (no speech) + tiny_speech.wav — 50 ms synthetic speech (very short) + long_speech.wav — 8 s continuous synthetic speech + +Run: + python tests/generate_transcriber_fixtures.py +""" + +import json +from pathlib import Path + +import numpy as np +import soundfile as sf + +SAMPLE_RATE = 16000 +EXAMPLES_DIR = Path(__file__).parent.parent / "examples" +OUT_ROOT = Path(__file__).parent / "fixtures" / "transcriber" +UNIT_DIR = OUT_ROOT / "unit" +INTEG_DIR = OUT_ROOT / "integration" +EDGE_DIR = OUT_ROOT / "edge" + +for d in (UNIT_DIR, INTEG_DIR, EDGE_DIR): + d.mkdir(parents=True, exist_ok=True) + +RNG = np.random.default_rng(seed=99) + +STATEMENT_TEXT = { + "01": "Kids are talking by the door", + "02": "Dogs are sitting by the door", +} + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _silence(duration_s: float) -> np.ndarray: + return RNG.normal(0, 1e-5, int(duration_s * SAMPLE_RATE)).astype(np.float32) + + +def _speech(duration_s: float, amplitude: float = 0.35) -> np.ndarray: + n = int(duration_s * SAMPLE_RATE) + t = np.linspace(0, duration_s, n, endpoint=False) + sig = np.zeros(n, dtype=np.float32) + for k, gain in enumerate([1.0, 0.6, 0.4, 0.3, 0.2], start=1): + sig += gain * np.sin(2 * np.pi * 160.0 * k * t).astype(np.float32) + sig /= np.abs(sig).max() + 1e-9 + sig *= amplitude + fade = min(int(0.01 * SAMPLE_RATE), n // 4) + ramp = np.linspace(0, 1, fade, dtype=np.float32) + sig[:fade] *= ramp + sig[-fade:] *= ramp[::-1] + return sig + + +def _write_wav(name: str, audio: np.ndarray, directory: Path) -> Path: + path = directory / name + sf.write(str(path), audio, SAMPLE_RATE) + return path + + +def _ravdess_statement(filename: str) -> str: + """Parse RAVDESS filename to get ground-truth spoken text.""" + parts = Path(filename).stem.split("-") + if len(parts) >= 5: + return STATEMENT_TEXT.get(parts[4], "") + return "" + + +# --------------------------------------------------------------------------- +# 1. 
Unit fixtures — pure JSON, no audio files +# --------------------------------------------------------------------------- + +def generate_unit_fixtures(): + fixtures = [ + { + "name": "single_utterance", + "utterances": [{"start": 1.2, "end": 3.84, "duration_s": 2.64}], + "note": "1 utterance -> 1 TranscriptionResult", + "expected_results": 1, + }, + { + "name": "three_utterances", + "utterances": [ + {"start": 0.5, "end": 2.5, "duration_s": 2.0}, + {"start": 3.1, "end": 5.2, "duration_s": 2.1}, + {"start": 6.0, "end": 7.8, "duration_s": 1.8}, + ], + "note": "3 utterances -> 3 results; full_transcript joins all", + "expected_results": 3, + }, + { + "name": "empty_utterances", + "utterances": [], + "note": "no utterances -> no results, full_transcript is empty", + "expected_results": 0, + }, + { + "name": "zero_duration_edge", + "utterances": [{"start": 2.0, "end": 2.0, "duration_s": 0.0}], + "note": "zero-duration utterance should not crash", + "expected_results": 1, + }, + ] + for fx in fixtures: + path = UNIT_DIR / f"{fx['name']}.json" + path.write_text(json.dumps(fx, indent=2)) + print(f" unit/{fx['name']}.json ({fx['expected_results']} result(s)) [{fx['note']}]") + + +# --------------------------------------------------------------------------- +# 2. Integration fixtures — real RAVDESS audio with known text +# --------------------------------------------------------------------------- + +def generate_integration_fixtures(): + if not EXAMPLES_DIR.exists(): + print(" [SKIP] examples/ not found — skipping integration fixtures") + return + + # Pick 2 neutral files per statement (statement 01 and 02) + selected = [] + for stmt_code, text in STATEMENT_TEXT.items(): + matches = sorted(EXAMPLES_DIR.glob(f"03-01-*-*-{stmt_code}-*-*.wav"))[:2] + for wav in matches: + selected.append({ + "filename": wav.name, + "filepath": str(wav.resolve()), + "statement_code": stmt_code, + "ground_truth": text, + # RAVDESS duration is typically 3-4 s + "start_s": 0.0, + }) + + manifest = { + "description": "RAVDESS files with known ground-truth text", + "ground_truth_note": ( + "Whisper tiny may paraphrase. " + "Tests check for keyword presence, not exact match." + ), + "keywords": { + "01": ["kids", "talking", "door"], + "02": ["dogs", "sitting", "door"], + }, + "files": selected, + } + path = INTEG_DIR / "ravdess_manifest.json" + path.write_text(json.dumps(manifest, indent=2)) + print(f" integration/ravdess_manifest.json ({len(selected)} file(s))") + + +# --------------------------------------------------------------------------- +# 3. 
Edge fixtures — WAV files for boundary conditions +# --------------------------------------------------------------------------- + +def generate_edge_fixtures(): + edges = [ + ("silence_3s.wav", _silence(3.0), + "3 s pure silence -> transcriber returns empty/whitespace text"), + ("noise_3s.wav", RNG.normal(0, 0.15, int(3.0 * SAMPLE_RATE)).astype(np.float32), + "broadband noise -> transcriber may hallucinate; result must still be str"), + ("tiny_speech.wav", np.concatenate([_silence(0.05), _speech(0.05), _silence(0.05)]), + "50 ms speech clip -> result must be str without crashing"), + ("long_speech.wav", np.concatenate([_silence(0.3), _speech(8.0), _silence(0.3)]), + "8 s speech -> tests longer utterance handling"), + ] + + manifest_entries = [] + for name, audio, note in edges: + path = _write_wav(name, audio, EDGE_DIR) + duration = len(audio) / SAMPLE_RATE + manifest_entries.append({ + "filename": name, + "filepath": str(path.resolve()), + "duration_s": round(duration, 3), + "note": note, + }) + print(f" edge/{name:<25} {duration:.2f} s [{note[:55]}...]") + + manifest = {"files": manifest_entries} + (EDGE_DIR / "manifest.json").write_text(json.dumps(manifest, indent=2)) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + print(f"Generating transcriber fixtures in {OUT_ROOT}/\n") + + print("Unit fixtures:") + generate_unit_fixtures() + + print("\nIntegration fixtures:") + generate_integration_fixtures() + + print("\nEdge fixtures:") + generate_edge_fixtures() + + print(f"\nDone.") + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/tests/test_emotion_classifier.py b/narrative-audio-system/tests/test_emotion_classifier.py new file mode 100644 index 0000000..66443af --- /dev/null +++ b/narrative-audio-system/tests/test_emotion_classifier.py @@ -0,0 +1,596 @@ +""" +Step 5 EmotionClassifier — test suite +====================================== +Tests are grouped into four tiers: + + Unit No model, no audio — validates EmotionResult fields, + class invariants, and edge-case inputs. + + Acoustic Load each synthetic emotion WAV and assert that the MFCC + features extracted from it have the expected acoustic + character (pitch proxy, energy, spectral brightness). + These tests are model-independent — they validate the + fixtures, not the classifier. + + Classifier Run the trained MFCC-MLP on all 8 emotion WAV files. + Assert structural correctness (result type, latency, + confidence range) and soft acoustic constraints + (e.g. angry > calm in energy, happy > sad in pitch proxy). + + Parallel Verify that ParallelProcessor runs Steps 4+5 concurrently + and that both results arrive without data corruption. 
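
For example, the fixture generator targets RMS 0.38 for angry versus 0.18
for calm, so the Acoustic tier can assert RMS(angry) > RMS(calm) without
ever loading the model.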
+ +Run: + python tests/generate_emotion_audio.py # once + python tests/test_emotion_classifier.py + python tests/test_emotion_classifier.py --verbose + python tests/test_emotion_classifier.py --skip-parallel # skip Whisper +""" + +import argparse +import json +import sys +import traceback +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Optional + +import numpy as np + +# --------------------------------------------------------------------------- +# Path setup +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent +for mod in ("emotion_classifier", "transcriber", "utterance_buffer", + "vad_engine", "task0_audio_capture"): + p = str(REPO_ROOT / mod) + if p not in sys.path: + sys.path.insert(0, p) + +from classifier import EmotionClassifier, EmotionResult, ParallelProcessor # noqa + +FIXTURES_DIR = Path(__file__).parent / "fixtures" / "emotion_audio" +SAMPLE_RATE = 16000 + +# --------------------------------------------------------------------------- +# Shared fixtures +# --------------------------------------------------------------------------- + +def _load_manifest(): + path = FIXTURES_DIR / "manifest.json" + if not path.exists(): + raise FileNotFoundError( + f"{path} -- run generate_emotion_audio.py first" + ) + return json.loads(path.read_text()) + + +def _load_wav(filepath: str) -> np.ndarray: + import soundfile as sf + audio, sr = sf.read(filepath, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != SAMPLE_RATE: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE) + return audio.astype(np.float32) + + +@dataclass +class _Utterance: + start: float + end: float + audio: np.ndarray = field(repr=False) + strategy: str = "pause_triggered" + num_vad_segments: int = 1 + + @property + def duration(self): + return self.end - self.start + + +def _make_utterance(audio: np.ndarray, start: float = 0.0) -> _Utterance: + end = start + len(audio) / SAMPLE_RATE + return _Utterance(start=start, end=end, audio=audio) + + +# Shared classifier — trained once for the whole test run +_CLF: Optional[EmotionClassifier] = None + +def _get_clf() -> EmotionClassifier: + global _CLF + if _CLF is None: + _CLF = EmotionClassifier(verbose=False) + return _CLF + + +# --------------------------------------------------------------------------- +# Test runner +# --------------------------------------------------------------------------- + +PASS, FAIL, SKIP = "PASS", "FAIL", "SKIP" +_results = [] + + +def run_test(label: str, fn, skip: bool = False): + if skip: + _results.append((label, SKIP, "skipped by flag")) + print("s", end="", flush=True) + return + try: + fn() + status, msg = PASS, "" + except FileNotFoundError as exc: + status, msg = SKIP, str(exc) + except AssertionError as exc: + status, msg = FAIL, str(exc) + except Exception as exc: + status, msg = FAIL, f"{type(exc).__name__}: {exc}" + traceback.print_exc() + _results.append((label, status, msg)) + print({"PASS": ".", "FAIL": "F", "SKIP": "s"}[status], end="", flush=True) + + +def check(cond, msg=""): + if not cond: + raise AssertionError(msg) + + +def check_eq(actual, expected, label=""): + if actual != expected: + raise AssertionError(f"{label}: expected {expected!r}, got {actual!r}") + + +# --------------------------------------------------------------------------- +# UNIT TESTS +# --------------------------------------------------------------------------- + 
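# Pattern note (illustrative — `test_example` below is not part of the suite):
# a new check is a plain function built on check()/check_eq(),
#
#     def test_example():
#         check_eq(1 + 1, 2, "arithmetic sanity")
#
# which is then registered in _build_registry() further down, e.g. as
# ("unit | example", test_example).
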
+def test_emotion_result_fields(): + r = EmotionResult(label="calm", confidence=0.75, start=1.0, end=3.5, + latency_ms=12.3, all_scores={"calm": 0.75, "angry": 0.25}, + backend="mfcc-mlp") + check_eq(r.label, "calm") + check_eq(r.confidence, 0.75) + check_eq(round(r.duration, 6), 2.5) + check_eq(r.backend, "mfcc-mlp") + check("calm" in r.all_scores) + + +def test_result_repr_contains_label(): + r = EmotionResult(label="angry", confidence=0.82, start=0, end=2, latency_ms=10) + check("angry" in repr(r)) + check("0.82" in repr(r)) + + +def test_classifier_invalid_backend(): + try: + EmotionClassifier(backend="nonexistent", verbose=False) + raise AssertionError("Expected ValueError") + except ValueError: + pass + + +def test_classifier_class_names_populated(): + clf = _get_clf() + check(len(clf._class_names) > 0, "class_names should not be empty after init") + + +def test_classifier_feature_stats_set(): + clf = _get_clf() + check(clf._feature_mean is not None, "feature_mean should be set") + check(clf._feature_std is not None, "feature_std should be set") + check(clf._feature_mean.shape[0] == clf.n_mfcc, + f"feature_mean length {clf._feature_mean.shape[0]} != n_mfcc {clf.n_mfcc}") + + +def test_classify_array_returns_emotion_result(): + audio = np.random.default_rng(0).normal(0, 0.3, SAMPLE_RATE).astype(np.float32) + result = _get_clf().classify_array(audio, sample_rate=SAMPLE_RATE) + check(isinstance(result, EmotionResult), "return type") + + +def test_classify_result_label_in_class_names(): + audio = np.random.default_rng(1).normal(0, 0.3, SAMPLE_RATE).astype(np.float32) + result = _get_clf().classify_array(audio) + clf_names_lower = [n.lower() for n in _get_clf()._class_names] + check(result.label.lower() in clf_names_lower, + f"label {result.label!r} not in class names {clf_names_lower}") + + +def test_classify_confidence_in_range(): + audio = np.random.default_rng(2).normal(0, 0.3, SAMPLE_RATE).astype(np.float32) + result = _get_clf().classify_array(audio) + check(0.0 <= result.confidence <= 1.0, + f"confidence {result.confidence} not in [0, 1]") + + +def test_classify_all_scores_sum_to_one(): + audio = np.random.default_rng(3).normal(0, 0.3, SAMPLE_RATE).astype(np.float32) + result = _get_clf().classify_array(audio) + total = sum(result.all_scores.values()) + check(abs(total - 1.0) < 1e-4, f"all_scores sum = {total:.6f}, expected ~1.0") + + +def test_classify_latency_positive(): + audio = np.random.default_rng(4).normal(0, 0.3, SAMPLE_RATE).astype(np.float32) + result = _get_clf().classify_array(audio) + check(result.latency_ms > 0, f"latency_ms={result.latency_ms}") + + +def test_classify_timestamps_from_utterance(): + audio = np.zeros(SAMPLE_RATE, dtype=np.float32) + utt = _make_utterance(audio, start=3.7) + result = _get_clf().classify(utt) + check_eq(result.start, 3.7, "result.start") + check_eq(round(result.end, 3), round(3.7 + 1.0, 3), "result.end") + + +def test_classify_2d_audio_no_crash(): + audio = np.zeros((SAMPLE_RATE, 2), dtype=np.float32) + result = _get_clf().classify_array(audio) + check(isinstance(result, EmotionResult)) + + +def test_classify_silence_no_crash(): + audio = np.zeros(SAMPLE_RATE * 3, dtype=np.float32) + result = _get_clf().classify_array(audio) + check(isinstance(result.label, str)) + + +def test_classify_very_short_clip_no_crash(): + audio = np.zeros(160, dtype=np.float32) # 10ms + result = _get_clf().classify_array(audio) + check(isinstance(result.label, str)) + + +def test_model_reuse(): + clf = _get_clf() + model_id = id(clf._model) + audio = 
np.zeros(SAMPLE_RATE, dtype=np.float32) + clf.classify_array(audio) + clf.classify_array(audio) + check_eq(id(clf._model), model_id, "model object reused") + + +# --------------------------------------------------------------------------- +# ACOUSTIC TESTS — validate fixture audio properties (model-independent) +# --------------------------------------------------------------------------- + +def _mfcc_features(audio: np.ndarray) -> np.ndarray: + import librosa + return librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=13).mean(axis=1) + + +def _rms_energy(audio: np.ndarray) -> float: + return float(np.sqrt(np.mean(audio ** 2))) + + +def _spectral_centroid_mean(audio: np.ndarray) -> float: + import librosa + return float(librosa.feature.spectral_centroid( + y=audio, sr=SAMPLE_RATE).mean()) + + +def _f0_proxy(audio: np.ndarray) -> float: + """Rough pitch proxy via autocorrelation peak in 80-300 Hz range.""" + corr = np.correlate(audio, audio, mode="full") + corr = corr[len(corr) // 2:] + min_lag = int(SAMPLE_RATE / 300) + max_lag = int(SAMPLE_RATE / 80) + peak_lag = min_lag + int(np.argmax(corr[min_lag:max_lag])) + return SAMPLE_RATE / peak_lag if peak_lag > 0 else 0.0 + + +def _load_emotion_audio(label: str) -> np.ndarray: + manifest = _load_manifest() + entry = next((f for f in manifest["files"] if f["label"] == label), None) + if entry is None: + raise FileNotFoundError(f"No fixture for label={label!r}") + return _load_wav(entry["filepath"]) + + +def test_acoustic_angry_higher_energy_than_calm(): + """RMS targets in generator: angry=0.38, calm=0.18 — angry must be higher.""" + angry = _rms_energy(_load_emotion_audio("angry")) + calm = _rms_energy(_load_emotion_audio("calm")) + check(angry > calm, + f"angry RMS={angry:.4f} should be > calm RMS={calm:.4f}") + + +def test_acoustic_happy_higher_energy_than_sad(): + """ + Generator uses RMS targets: happy=0.26, sad=0.12. + Happy must have noticeably higher RMS energy than sad. + """ + happy_e = _rms_energy(_load_emotion_audio("happy")) + sad_e = _rms_energy(_load_emotion_audio("sad")) + check(happy_e > sad_e * 1.5, + f"happy RMS={happy_e:.4f} should be > 1.5x sad RMS={sad_e:.4f}") + + +def test_acoustic_surprised_higher_energy_than_sad(): + """ + Surprised has RMS target 0.30 vs sad 0.12 — surprised must have higher energy. + (Pitch proxy via autocorrelation is unreliable on short mixed-pitch clips.) 
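    Here "RMS" is sqrt(mean(x**2)) as computed by _rms_energy above.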
+ """ + surprised_e = _rms_energy(_load_emotion_audio("surprised")) + sad_e = _rms_energy(_load_emotion_audio("sad")) + check(surprised_e > sad_e, + f"surprised RMS={surprised_e:.4f} should be > sad RMS={sad_e:.4f}") + + +def test_acoustic_sad_lowest_energy(): + sad_e = _rms_energy(_load_emotion_audio("sad")) + angry_e = _rms_energy(_load_emotion_audio("angry")) + neutral_e = _rms_energy(_load_emotion_audio("neutral")) + check(sad_e < angry_e, f"sad RMS={sad_e:.4f} should be < angry={angry_e:.4f}") + check(sad_e < neutral_e, f"sad RMS={sad_e:.4f} should be < neutral={neutral_e:.4f}") + + +def test_acoustic_all_files_above_vad_threshold(): + """Every emotion file must have RMS well above silence (1e-4).""" + manifest = _load_manifest() + for entry in manifest["files"]: + audio = _load_wav(entry["filepath"]) + rms = _rms_energy(audio) + check(rms > 1e-3, + f"{entry['filename']}: RMS={rms:.6f} too low (silence-like)") + + +def test_acoustic_all_files_correct_sample_rate(): + import soundfile as sf + manifest = _load_manifest() + for entry in manifest["files"]: + _, sr = sf.read(entry["filepath"], dtype="float32") + check_eq(sr, SAMPLE_RATE, f"{entry['filename']} sample_rate") + + +def test_acoustic_file_durations(): + """Each file should be between 3 s and 10 s.""" + manifest = _load_manifest() + for entry in manifest["files"]: + dur = entry["duration_s"] + check(3.0 <= dur <= 10.0, + f"{entry['filename']} duration {dur:.2f}s out of [3, 10]s range") + + +# --------------------------------------------------------------------------- +# CLASSIFIER TESTS — run model on all 8 emotion files +# --------------------------------------------------------------------------- + +def test_classifier_returns_result_for_every_emotion(): + manifest = _load_manifest() + clf = _get_clf() + for entry in manifest["files"]: + audio = _load_wav(entry["filepath"]) + result = clf.classify_array(audio) + check(isinstance(result, EmotionResult), + f"{entry['label']}: expected EmotionResult") + check(isinstance(result.label, str) and len(result.label) > 0, + f"{entry['label']}: label is empty") + + +def test_classifier_latency_under_200ms(): + """MFCC + MLP inference must complete in under 200 ms for all files.""" + manifest = _load_manifest() + clf = _get_clf() + for entry in manifest["files"]: + audio = _load_wav(entry["filepath"]) + result = clf.classify_array(audio) + check(result.latency_ms < 200, + f"{entry['label']}: latency {result.latency_ms:.1f}ms > 200ms") + + +def test_classifier_confidence_always_in_range(): + manifest = _load_manifest() + clf = _get_clf() + for entry in manifest["files"]: + audio = _load_wav(entry["filepath"]) + result = clf.classify_array(audio) + check(0.0 <= result.confidence <= 1.0, + f"{entry['label']}: confidence {result.confidence} out of [0,1]") + + +def test_classifier_all_scores_sum_to_one_per_file(): + manifest = _load_manifest() + clf = _get_clf() + for entry in manifest["files"]: + audio = _load_wav(entry["filepath"]) + result = clf.classify_array(audio) + total = sum(result.all_scores.values()) + check(abs(total - 1.0) < 1e-3, + f"{entry['label']}: all_scores sum={total:.6f}") + + +def test_classifier_top_score_matches_confidence(): + manifest = _load_manifest() + clf = _get_clf() + for entry in manifest["files"]: + audio = _load_wav(entry["filepath"]) + result = clf.classify_array(audio) + if result.all_scores: + top_score = max(result.all_scores.values()) + check(abs(top_score - result.confidence) < 1e-4, + f"{entry['label']}: top_score={top_score:.4f} != 
confidence={result.confidence:.4f}") + + +def test_classifier_soft_angry_high_confidence(): + """Angry audio has strong MFCC cues — classifier should pick a high-confidence label.""" + audio = _load_wav(str(FIXTURES_DIR / "angry_8s.wav")) + result = _get_clf().classify_array(audio) + check(result.confidence >= 0.10, + f"angry: confidence={result.confidence:.2f} unexpectedly low (model might be untrained)") + + +def test_classifier_soft_calm_vs_angry_energy(): + """ + Regardless of label, the top score for calm audio should be for a + 'quieter' emotion (calm/neutral/sad) more often than angry/fearful. + This is a soft heuristic, not an exact match. + """ + calm_audio = _load_wav(str(FIXTURES_DIR / "calm_6s.wav")) + angry_audio = _load_wav(str(FIXTURES_DIR / "angry_8s.wav")) + clf = _get_clf() + calm_result = clf.classify_array(calm_audio) + angry_result = clf.classify_array(angry_audio) + # At minimum both should return valid, non-empty labels + check(len(calm_result.label) > 0, "calm: empty label") + check(len(angry_result.label) > 0, "angry: empty label") + + +def test_classifier_long_audio_consistent(): + """Running the same file twice must return identical results (deterministic).""" + audio = _load_wav(str(FIXTURES_DIR / "neutral_6s.wav")) + clf = _get_clf() + r1 = clf.classify_array(audio) + r2 = clf.classify_array(audio) + check_eq(r1.label, r2.label, "determinism: label") + check(abs(r1.confidence - r2.confidence) < 1e-5, "determinism: confidence") + + +# --------------------------------------------------------------------------- +# PARALLEL TESTS — Steps 4 + 5 together (requires Whisper model) +# --------------------------------------------------------------------------- + +def test_parallel_returns_both_results(skip_parallel: bool): + if skip_parallel: + raise FileNotFoundError("parallel tests skipped by --skip-parallel") + from streaming_transcriber import Transcriber + audio = _load_wav(str(FIXTURES_DIR / "neutral_6s.wav")) + utt = _make_utterance(audio) + trs = Transcriber(model_size="tiny", verbose=False) + clf = _get_clf() + proc = ParallelProcessor(trs, clf, verbose=False) + transcript, emotion = proc.process(utt) + from streaming_transcriber import TranscriptionResult + check(isinstance(transcript, TranscriptionResult), "transcript type") + check(isinstance(emotion, EmotionResult), "emotion type") + + +def test_parallel_timestamps_consistent(skip_parallel: bool): + if skip_parallel: + raise FileNotFoundError("parallel tests skipped by --skip-parallel") + from streaming_transcriber import Transcriber + audio = _load_wav(str(FIXTURES_DIR / "calm_6s.wav")) + utt = _make_utterance(audio, start=5.0) + trs = Transcriber(model_size="tiny", verbose=False) + clf = _get_clf() + proc = ParallelProcessor(trs, clf, verbose=False) + transcript, emotion = proc.process(utt) + check_eq(transcript.start, 5.0, "transcript.start") + check_eq(emotion.start, 5.0, "emotion.start") + + +def test_parallel_process_all_length(skip_parallel: bool): + if skip_parallel: + raise FileNotFoundError("parallel tests skipped by --skip-parallel") + from streaming_transcriber import Transcriber + manifest = _load_manifest() + utts = [_make_utterance(_load_wav(e["filepath"]), start=float(i * 8)) + for i, e in enumerate(manifest["files"][:3])] + trs = Transcriber(model_size="tiny", verbose=False) + clf = _get_clf() + proc = ParallelProcessor(trs, clf, verbose=False) + pairs = proc.process_all(utts) + check_eq(len(pairs), 3, "process_all length") + for i, (tr, er) in enumerate(pairs): + check_eq(tr.start, 
utts[i].start, f"pair[{i}] transcript start")
        check_eq(er.start, utts[i].start, f"pair[{i}] emotion start")


# ---------------------------------------------------------------------------
# Test registry & runner
# ---------------------------------------------------------------------------

def _build_registry(skip_parallel: bool):
    unit = [
        ("unit | EmotionResult fields", test_emotion_result_fields),
        ("unit | repr contains label+confidence", test_result_repr_contains_label),
        ("unit | invalid backend raises", test_classifier_invalid_backend),
        ("unit | class_names populated", test_classifier_class_names_populated),
        ("unit | feature stats set", test_classifier_feature_stats_set),
        ("unit | classify_array returns result", test_classify_array_returns_emotion_result),
        ("unit | label in class_names", test_classify_result_label_in_class_names),
        ("unit | confidence in [0,1]", test_classify_confidence_in_range),
        ("unit | all_scores sum to 1", test_classify_all_scores_sum_to_one),
        ("unit | latency_ms > 0", test_classify_latency_positive),
        ("unit | timestamps from utterance", test_classify_timestamps_from_utterance),
        ("unit | 2-D audio no crash", test_classify_2d_audio_no_crash),
        ("unit | silence no crash", test_classify_silence_no_crash),
        ("unit | 10ms clip no crash", test_classify_very_short_clip_no_crash),
        ("unit | model reused across calls", test_model_reuse),
    ]
    acoustic = [
        ("acoustic | angry > calm energy", test_acoustic_angry_higher_energy_than_calm),
        ("acoustic | happy higher energy than sad", test_acoustic_happy_higher_energy_than_sad),
        ("acoustic | surprised higher energy than sad", test_acoustic_surprised_higher_energy_than_sad),
        ("acoustic | sad lowest energy", test_acoustic_sad_lowest_energy),
        ("acoustic | all files above VAD floor", test_acoustic_all_files_above_vad_threshold),
        ("acoustic | all files 16kHz", test_acoustic_all_files_correct_sample_rate),
        ("acoustic | durations 3-10s", test_acoustic_file_durations),
    ]
    classifier = [
        ("clf | result for every emotion", test_classifier_returns_result_for_every_emotion),
        ("clf | latency < 200ms per file", test_classifier_latency_under_200ms),
        ("clf | confidence in [0,1] per file", test_classifier_confidence_always_in_range),
        ("clf | all_scores sum=1 per file", test_classifier_all_scores_sum_to_one_per_file),
        ("clf | top score matches confidence", test_classifier_top_score_matches_confidence),
        ("clf | angry high confidence", test_classifier_soft_angry_high_confidence),
        ("clf | calm vs angry labels valid", test_classifier_soft_calm_vs_angry_energy),
        ("clf | deterministic on same input", test_classifier_long_audio_consistent),
    ]
    parallel = [
        ("parallel | returns both results",
         lambda: test_parallel_returns_both_results(skip_parallel)),
        ("parallel | timestamps consistent",
         lambda: test_parallel_timestamps_consistent(skip_parallel)),
        ("parallel | process_all length",
         lambda: test_parallel_process_all_length(skip_parallel)),
    ]
    return unit, acoustic, classifier, parallel


def main(verbose: bool = False, skip_parallel: bool = False):
    if not FIXTURES_DIR.exists() or not any(FIXTURES_DIR.glob("*.wav")):
        print(
            "No emotion audio fixtures found. 
Generate them first:\n" + " python tests/generate_emotion_audio.py\n" + ) + sys.exit(1) + + unit, acoustic, classifier, parallel = _build_registry(skip_parallel) + all_tests = unit + acoustic + classifier + parallel + print(f"Running {len(all_tests)} emotion classifier tests\n") + if skip_parallel: + print(" --skip-parallel: Whisper parallel tests will be skipped\n") + print("Legend: . = pass F = fail s = skip\n") + + for label, fn in all_tests: + run_test(label, fn) + + print("\n") + passed = sum(1 for _, s, _ in _results if s == PASS) + failed = sum(1 for _, s, _ in _results if s == FAIL) + skipped = sum(1 for _, s, _ in _results if s == SKIP) + + if verbose or failed: + print("-" * 66) + for label, status, msg in _results: + line = f" [{status}] {label}" + if msg: + line += f"\n {msg}" + print(line) + print("-" * 66) + + print(f"\nResults: {passed} passed, {failed} failed, {skipped} skipped\n") + sys.exit(0 if failed == 0 else 1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument("--skip-parallel", "-s", action="store_true", + help="Skip tests that require the Whisper model") + args = parser.parse_args() + main(verbose=args.verbose, skip_parallel=args.skip_parallel) diff --git a/narrative-audio-system/tests/test_output_generator.py b/narrative-audio-system/tests/test_output_generator.py new file mode 100644 index 0000000..04b04a1 --- /dev/null +++ b/narrative-audio-system/tests/test_output_generator.py @@ -0,0 +1,805 @@ +""" +Tests for Step 6 — Output Generator +===================================== +Tests cover: + Track A (captions.py) + - CaptionLine: rendering, SRT formatting, to_dict schema + - CaptionFormatter: short-text filtering, index auto-increment + - SRTWriter: file creation, correct block format + - CaptionBroadcaster: async start/stop, broadcast to mock client + + Track B (atmosphere.py) + - AtmosphereMapper: known + unknown labels + - CrossfadeSchedule: to_dict schema + - CrossfadeScheduler: first-time trigger, cooldown suppression, label-change trigger + - RetrievalBridge: graceful fallback when CSV absent + + Combined (output_generator.py) + - OutputGenerator.process: returns (CaptionLine, CrossfadeSchedule) tuple + - OutputGenerator.process_all: batch processing + - OutputGenerator.summary: contains captions + atmosphere entries + - OutputGenerator parallel dispatch: both tracks run concurrently + +Run: + python tests/test_output_generator.py + python tests/test_output_generator.py --verbose + python tests/test_output_generator.py --skip-ws # skip WebSocket tests +""" + +import argparse +import asyncio +import json +import sys +import tempfile +import time +import unittest +from dataclasses import dataclass +from pathlib import Path +from typing import Optional +from unittest.mock import MagicMock, patch + +ROOT = Path(__file__).resolve().parent.parent +FIXTURE_DIR = ROOT / "tests" / "fixtures" / "output" + +# --------------------------------------------------------------------------- +# Path setup +# --------------------------------------------------------------------------- + +for mod_dir in ("output_generator",): + p = str(ROOT / mod_dir) + if p not in sys.path: + sys.path.insert(0, p) + +# --------------------------------------------------------------------------- +# Lightweight mock result objects (mimic Step 4 + Step 5 outputs) +# --------------------------------------------------------------------------- + +@dataclass +class _MockTranscript: + text: str + start: 
float + end: float + latency_ms: float = 180.0 + backend: str = "faster-whisper" + confidence: float = 0.90 + language: str = "en" + + +@dataclass +class _MockEmotion: + label: str + confidence: float + start: float + end: float + latency_ms: float = 12.0 + backend: str = "mfcc-mlp" + all_scores: dict = None + + def __post_init__(self): + if self.all_scores is None: + self.all_scores = {self.label: self.confidence} + + +def _load_pairs(): + """Load mock_pairs.json and build _MockTranscript / _MockEmotion objects.""" + with open(FIXTURE_DIR / "mock_pairs.json", encoding="utf-8") as f: + raw = json.load(f) + pairs = [] + for item in raw: + tr = _MockTranscript(**item["transcript"]) + em = _MockEmotion(**item["emotion"]) + pairs.append((tr, em)) + return pairs + + +# =========================================================================== +# Track A — captions.py +# =========================================================================== + +class TestCaptionLine(unittest.TestCase): + + def setUp(self): + from captions import CaptionLine + self.CaptionLine = CaptionLine + + def _make(self, label="calm", text="The forest was quiet.", start=0.0, end=2.5, + confidence=0.88, color="#7ec8a0", index=1): + return self.CaptionLine(label=label, text=text, start=start, end=end, + confidence=confidence, color=color, index=index) + + def test_render_contains_label(self): + line = self._make(label="tense", text="Until the branch snapped.") + rendered = line.render() + self.assertIn("[tense]", rendered) + self.assertIn("Until the branch snapped.", rendered) + + def test_render_label_padded(self): + line = self._make(label="calm") + rendered = line.render(width=10) + # Tag should be left-justified within width + self.assertTrue(rendered.startswith("[calm]")) + + def test_duration_property(self): + line = self._make(start=1.0, end=3.5) + self.assertAlmostEqual(line.duration, 2.5, places=5) + + def test_to_dict_has_required_keys(self): + line = self._make() + d = line.to_dict() + for key in ("type", "index", "label", "text", "start", "end", + "confidence", "color"): + self.assertIn(key, d, f"Missing key: {key}") + + def test_to_dict_type_is_caption(self): + self.assertEqual(self._make().to_dict()["type"], "caption") + + def test_to_dict_values(self): + line = self._make(label="calm", text="Hello.", start=1.0, end=2.0, + confidence=0.88, color="#7ec8a0", index=3) + d = line.to_dict() + self.assertEqual(d["label"], "calm") + self.assertEqual(d["text"], "Hello.") + self.assertAlmostEqual(d["start"], 1.0, places=2) + self.assertAlmostEqual(d["end"], 2.0, places=2) + self.assertEqual(d["color"], "#7ec8a0") + self.assertEqual(d["index"], 3) + + def test_to_srt_block_format(self): + line = self._make(start=0.0, end=2.5, index=1, label="calm", + text="The forest was quiet.") + block = line.to_srt_block() + lines = block.strip().splitlines() + self.assertEqual(lines[0], "1") # sequence number + self.assertIn("-->", lines[1]) # timestamp line + self.assertIn(",", lines[1]) # SRT uses comma decimal + self.assertIn("[calm]", lines[2]) # label in subtitle + self.assertIn("The forest was quiet.", lines[2]) # text + + def test_srt_timestamp_zero(self): + line = self._make(start=0.0, end=2.5) + block = line.to_srt_block() + self.assertIn("00:00:00,000 --> 00:00:02,500", block) + + def test_srt_timestamp_nonzero(self): + line = self._make(start=62.3, end=65.1) + block = line.to_srt_block() + self.assertIn("00:01:02,300 --> 00:01:05,100", block) + + +class TestCaptionFormatter(unittest.TestCase): + + def setUp(self): + 
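# NOTE (reviewer guidance, not part of the verified implementation): the tests + # in this class exercise the following assumed CaptionFormatter contract, + # pieced together from the assertions below: format() strips the transcript + # text first; if the stripped text is shorter than min_text_length it returns + # None and the running caption index does not advance; otherwise it emits a + # CaptionLine whose index is one greater than that of the last emitted caption.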
from captions import CaptionFormatter + self.fmt = CaptionFormatter(min_text_length=2, verbose=False) + + def _pair(self, text="Hello world.", label="calm", start=0.0, end=2.0, + confidence=0.9): + tr = _MockTranscript(text=text, start=start, end=end) + em = _MockEmotion(label=label, confidence=confidence, start=start, end=end) + return tr, em + + def test_format_returns_caption_line(self): + from captions import CaptionLine + tr, em = self._pair() + line = self.fmt.format(tr, em) + self.assertIsInstance(line, CaptionLine) + + def test_format_short_text_returns_none(self): + tr, em = self._pair(text=".") + self.assertIsNone(self.fmt.format(tr, em)) + + def test_format_empty_text_returns_none(self): + tr, em = self._pair(text="") + self.assertIsNone(self.fmt.format(tr, em)) + + def test_format_strips_whitespace(self): + tr, em = self._pair(text=" Hello world. ") + line = self.fmt.format(tr, em) + self.assertEqual(line.text, "Hello world.") + + def test_index_auto_increments(self): + tr1, em1 = self._pair(text="First sentence.", start=0.0, end=1.0) + tr2, em2 = self._pair(text="Second sentence.", start=1.5, end=3.0) + line1 = self.fmt.format(tr1, em1) + line2 = self.fmt.format(tr2, em2) + self.assertEqual(line1.index, 1) + self.assertEqual(line2.index, 2) + + def test_index_skips_filtered(self): + tr_short, em_short = self._pair(text=".") + tr_ok, em_ok = self._pair(text="Valid caption here.") + self.fmt.format(tr_short, em_short) # filtered + line = self.fmt.format(tr_ok, em_ok) + # Index is still 1 because filtered entries don't increment + self.assertEqual(line.index, 1) + + def test_reset_index(self): + tr, em = self._pair(text="Hello world.") + self.fmt.format(tr, em) + self.fmt.format(tr, em) + self.fmt.reset_index() + line = self.fmt.format(tr, em) + self.assertEqual(line.index, 1) + + def test_colour_assigned(self): + tr, em = self._pair(label="tense") + line = self.fmt.format(tr, em) + self.assertEqual(line.color, "#e07a5f") + + def test_colour_unknown_label(self): + tr, em = self._pair(label="mystery_emotion") + line = self.fmt.format(tr, em) + self.assertEqual(line.color, "#ffffff") + + +class TestSRTWriter(unittest.TestCase): + + def setUp(self): + from captions import CaptionLine, SRTWriter + self.CaptionLine = CaptionLine + self.SRTWriter = SRTWriter + + def _make_line(self, idx=1, label="calm", text="Hello.", start=0.0, end=2.0): + return self.CaptionLine(label=label, text=text, start=start, end=end, + confidence=0.9, color="#7ec8a0", index=idx) + + def test_creates_file(self): + with tempfile.NamedTemporaryFile(suffix=".srt", delete=False) as f: + path = f.name + writer = self.SRTWriter(path, verbose=False) + writer.write(self._make_line()) + self.assertTrue(Path(path).exists()) + Path(path).unlink(missing_ok=True) + + def test_write_content(self): + with tempfile.NamedTemporaryFile(suffix=".srt", delete=False, + mode="w") as f: + path = f.name + writer = self.SRTWriter(path, verbose=False) + line = self._make_line(label="tense", text="Branch snapped.") + writer.write(line) + content = Path(path).read_text(encoding="utf-8") + self.assertIn("1", content) + self.assertIn("[tense]", content) + self.assertIn("Branch snapped.", content) + Path(path).unlink(missing_ok=True) + + def test_write_multiple_blocks(self): + with tempfile.NamedTemporaryFile(suffix=".srt", delete=False, + mode="w") as f: + path = f.name + writer = self.SRTWriter(path, verbose=False) + writer.write(self._make_line(idx=1, text="First.")) + writer.write(self._make_line(idx=2, text="Second.", start=3.0, 
end=5.0)) + content = Path(path).read_text(encoding="utf-8") + self.assertIn("First.", content) + self.assertIn("Second.", content) + self.assertEqual(writer.count, 2) + Path(path).unlink(missing_ok=True) + + def test_clears_existing_file(self): + with tempfile.NamedTemporaryFile(suffix=".srt", delete=False, + mode="w", encoding="utf-8") as f: + f.write("OLD CONTENT\n") + path = f.name + writer = self.SRTWriter(path, verbose=False) + content = Path(path).read_text(encoding="utf-8") + self.assertNotIn("OLD CONTENT", content) + Path(path).unlink(missing_ok=True) + + def test_write_all(self): + with tempfile.NamedTemporaryFile(suffix=".srt", delete=False, + mode="w") as f: + path = f.name + writer = self.SRTWriter(path, verbose=False) + lines = [ + self._make_line(idx=1, text="First sentence."), + None, # should be skipped + self._make_line(idx=2, text="Third sentence.", start=4.0, end=6.0), + ] + writer.write_all(lines) + self.assertEqual(writer.count, 2) + Path(path).unlink(missing_ok=True) + + def test_fixture_srt_matches_expected(self): + """Verify generator produced valid SRT blocks.""" + srt_file = FIXTURE_DIR / "srt_expected.srt" + if not srt_file.exists(): + self.skipTest("Fixture not found — run generate_output_fixtures.py") + content = srt_file.read_text(encoding="utf-8") + # At least 3 proper timestamp lines + self.assertGreaterEqual(content.count("-->"), 3) + + +class TestCaptionBroadcaster(unittest.IsolatedAsyncioTestCase): + + async def test_start_and_stop(self): + """CaptionBroadcaster starts and stops without error.""" + try: + from captions import CaptionBroadcaster + except ImportError: + self.skipTest("websockets not installed") + broadcaster = CaptionBroadcaster(host="localhost", port=18765, verbose=False) + await broadcaster.start() + await broadcaster.stop() + + async def test_broadcast_no_clients(self): + """Broadcast with no clients connected is a no-op.""" + try: + from captions import CaptionBroadcaster + except ImportError: + self.skipTest("websockets not installed") + broadcaster = CaptionBroadcaster(host="localhost", port=18766, verbose=False) + await broadcaster.start() + # Should not raise + await broadcaster.broadcast({"type": "caption", "label": "calm", "text": "test"}) + await broadcaster.stop() + + async def test_connected_clients_count(self): + """connected_clients starts at 0.""" + try: + from captions import CaptionBroadcaster + except ImportError: + self.skipTest("websockets not installed") + broadcaster = CaptionBroadcaster(host="localhost", port=18767, verbose=False) + await broadcaster.start() + self.assertEqual(broadcaster.connected_clients, 0) + await broadcaster.stop() + + +# =========================================================================== +# Track B — atmosphere.py +# =========================================================================== + +class TestAtmosphereMapper(unittest.TestCase): + + def setUp(self): + from atmosphere import AtmosphereMapper + self.mapper = AtmosphereMapper(verbose=False) + + def test_known_label_calm(self): + q = self.mapper.query_for("calm") + self.assertIn("calm", q.lower()) + + def test_known_label_tense(self): + q = self.mapper.query_for("tense") + self.assertIn("tense", q.lower()) + + def test_unknown_label_fallback(self): + q = self.mapper.query_for("unicorn_emotion") + self.assertIn("unicorn_emotion", q.lower()) + + def test_case_insensitive(self): + q1 = self.mapper.query_for("CALM") + q2 = self.mapper.query_for("calm") + self.assertEqual(q1, q2) + + def test_fallback_for_known(self): + fb = 
self.mapper.fallback_for("sad") + self.assertIn("description", fb) + self.assertIn("energy", fb) + + def test_fallback_for_unknown(self): + fb = self.mapper.fallback_for("mystery") + self.assertIn("description", fb) + + def test_custom_queries_override(self): + from atmosphere import AtmosphereMapper + mapper = AtmosphereMapper(custom_queries={"calm": "custom calm override"}, verbose=False) + self.assertEqual(mapper.query_for("calm"), "custom calm override") + + def test_all_labels_have_queries(self): + from atmosphere import TONE_QUERIES + for label in ("calm", "neutral", "happy", "sad", "angry", + "fearful", "tense", "disgust", "surprised"): + q = self.mapper.query_for(label) + self.assertIsInstance(q, str) + self.assertGreater(len(q), 0) + + +class TestCrossfadeSchedule(unittest.TestCase): + + def setUp(self): + from atmosphere import CrossfadeSchedule + self.CrossfadeSchedule = CrossfadeSchedule + + def _make(self, label="calm", query="calm ambient", clip="gentle wind", + desc="gentle wind, soft water", fade=2.0, lag=6.0): + return self.CrossfadeSchedule( + emotion_label=label, query=query, + suggested_clip=clip, suggested_description=desc, + fade_in_s=fade, lag_s=lag, + ) + + def test_to_dict_has_required_keys(self): + d = self._make().to_dict() + for key in ("type", "emotion_label", "query", "suggested_clip", + "suggested_description", "fade_in_s", "lag_s"): + self.assertIn(key, d, f"Missing key: {key}") + + def test_to_dict_type_is_atmosphere(self): + self.assertEqual(self._make().to_dict()["type"], "atmosphere") + + def test_to_dict_values(self): + d = self._make(label="tense", fade=3.0, lag=5.0).to_dict() + self.assertEqual(d["emotion_label"], "tense") + self.assertAlmostEqual(d["fade_in_s"], 3.0) + self.assertAlmostEqual(d["lag_s"], 5.0) + + def test_repr(self): + s = repr(self._make(label="fearful")) + self.assertIn("fearful", s) + + def test_scheduled_at_is_float(self): + sched = self._make() + self.assertIsInstance(sched.scheduled_at, float) + self.assertGreater(sched.scheduled_at, 0.0) + + +class TestRetrievalBridge(unittest.TestCase): + + def test_graceful_fallback_missing_csv(self): + from atmosphere import RetrievalBridge + bridge = RetrievalBridge(features_csv="/nonexistent/path.csv", verbose=False) + self.assertFalse(bridge.available) + results = bridge.search("calm ambient") + self.assertEqual(results, []) + + def test_returns_empty_list_on_fallback(self): + from atmosphere import RetrievalBridge + bridge = RetrievalBridge(features_csv=None, verbose=False) + results = bridge.search("tense forest") + self.assertIsInstance(results, list) + + +class TestCrossfadeScheduler(unittest.TestCase): + + def setUp(self): + from atmosphere import CrossfadeScheduler + # Short cooldown so we can test both suppress and trigger quickly + self.scheduler = CrossfadeScheduler(lag_s=6.0, fade_s=2.0, + cooldown_s=0.1, verbose=False) + + def _emotion(self, label, start=0.0, end=2.0): + return _MockEmotion(label=label, confidence=0.8, start=start, end=end) + + def test_first_call_returns_schedule(self): + from atmosphere import CrossfadeSchedule + result = self.scheduler.schedule(self._emotion("calm")) + self.assertIsInstance(result, CrossfadeSchedule) + + def test_same_label_within_cooldown_suppressed(self): + # Use a scheduler with long cooldown + from atmosphere import CrossfadeScheduler + sched = CrossfadeScheduler(lag_s=6.0, fade_s=2.0, cooldown_s=60.0, verbose=False) + sched.schedule(self._emotion("calm")) + result = sched.schedule(self._emotion("calm")) + self.assertIsNone(result) + + 
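# NOTE: the trigger/suppress tests in this class assume a decision rule along + # these lines inside CrossfadeScheduler.schedule(); this is a sketch only, and + # the attribute/helper names (_last_label, _last_time, _build_schedule) are + # assumptions rather than the verified implementation: + # + # now = time.monotonic() + # in_cooldown = (now - self._last_time) < self.cooldown_s + # if emotion.label == self._last_label and in_cooldown: + # return None # repeat label inside the cooldown window: suppress + # self._last_label, self._last_time = emotion.label, now + # return self._build_schedule(emotion) # first call, new label, or cooldown over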
def test_different_label_triggers_even_in_cooldown(self): + from atmosphere import CrossfadeScheduler, CrossfadeSchedule + sched = CrossfadeScheduler(lag_s=6.0, fade_s=2.0, cooldown_s=60.0, verbose=False) + sched.schedule(self._emotion("calm")) + result = sched.schedule(self._emotion("tense")) + self.assertIsInstance(result, CrossfadeSchedule) + + def test_same_label_after_cooldown_triggers(self): + from atmosphere import CrossfadeScheduler, CrossfadeSchedule + sched = CrossfadeScheduler(lag_s=6.0, fade_s=2.0, cooldown_s=0.05, verbose=False) + sched.schedule(self._emotion("calm")) + time.sleep(0.1) # wait for cooldown + result = sched.schedule(self._emotion("calm")) + self.assertIsInstance(result, CrossfadeSchedule) + + def test_schedule_label_stored(self): + from atmosphere import CrossfadeSchedule + result = self.scheduler.schedule(self._emotion("angry")) + self.assertEqual(result.emotion_label, "angry") + + def test_schedule_fade_and_lag(self): + result = self.scheduler.schedule(self._emotion("sad")) + self.assertAlmostEqual(result.fade_in_s, 2.0) + self.assertAlmostEqual(result.lag_s, 6.0) + + def test_sequence_calm_tense_fearful(self): + from atmosphere import CrossfadeSchedule + r1 = self.scheduler.schedule(self._emotion("calm")) + time.sleep(0.15) + r2 = self.scheduler.schedule(self._emotion("tense")) + time.sleep(0.15) + r3 = self.scheduler.schedule(self._emotion("fearful")) + self.assertIsInstance(r1, CrossfadeSchedule) + self.assertIsInstance(r2, CrossfadeSchedule) + self.assertIsInstance(r3, CrossfadeSchedule) + self.assertEqual(r1.emotion_label, "calm") + self.assertEqual(r2.emotion_label, "tense") + self.assertEqual(r3.emotion_label, "fearful") + + +# =========================================================================== +# Combined — output_generator.py +# =========================================================================== + +class TestOutputGenerator(unittest.TestCase): + + def setUp(self): + sys.path.insert(0, str(ROOT / "output_generator")) + from output_generator import OutputGenerator + self.OutputGenerator = OutputGenerator + + def _gen(self, srt_path=None, cooldown_s=0.05): + return self.OutputGenerator( + srt_path=srt_path, + enable_websocket=False, + cooldown_s=cooldown_s, + verbose=False, + ) + + def _pair(self, text="The forest was quiet.", label="calm", + start=0.0, end=2.5, confidence=0.88): + tr = _MockTranscript(text=text, start=start, end=end) + em = _MockEmotion(label=label, confidence=confidence, start=start, end=end) + return tr, em + + # --- process() return type --- + + def test_process_returns_tuple(self): + gen = self._gen() + tr, em = self._pair() + result = gen.process(tr, em) + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + + def test_process_caption_type(self): + from captions import CaptionLine + gen = self._gen() + tr, em = self._pair() + caption, _ = gen.process(tr, em) + self.assertIsInstance(caption, CaptionLine) + + def test_process_atmosphere_type(self): + from atmosphere import CrossfadeSchedule + gen = self._gen() + tr, em = self._pair() + _, schedule = gen.process(tr, em) + self.assertIsInstance(schedule, CrossfadeSchedule) + + def test_process_short_text_caption_is_none(self): + gen = self._gen() + tr, em = self._pair(text=".") + caption, _ = gen.process(tr, em) + self.assertIsNone(caption) + + # --- captions property --- + + def test_captions_accumulated(self): + gen = self._gen(cooldown_s=0.0) + pairs = [ + self._pair(text="First sentence.", label="calm", start=0.0, end=2.0), + 
self._pair(text="Second sentence.", label="tense", start=3.0, end=5.0), + ] + for tr, em in pairs: + gen.process(tr, em) + self.assertEqual(len(gen.captions), 2) + + def test_captions_property_is_copy(self): + gen = self._gen() + gen.process(*self._pair(text="Hello world.")) + caps = gen.captions + caps.clear() + self.assertEqual(len(gen.captions), 1) + + # --- atmosphere_log property --- + + def test_atmosphere_log_accumulated(self): + gen = self._gen(cooldown_s=0.0) + for label in ("calm", "tense", "fearful"): + tr, em = self._pair(text="Some text here.", label=label) + gen.process(tr, em) + time.sleep(0.06) + self.assertGreaterEqual(len(gen.atmosphere_log), 1) + + # --- process_all() --- + + def test_process_all_returns_list(self): + gen = self._gen(cooldown_s=0.0) + pairs = [ + self._pair(text="Sentence one.", label="calm", start=0.0, end=2.0), + self._pair(text="Sentence two.", label="tense", start=3.0, end=5.0), + ] + results = gen.process_all(pairs) + self.assertIsInstance(results, list) + self.assertEqual(len(results), 2) + + def test_process_all_each_result_is_tuple(self): + gen = self._gen(cooldown_s=0.0) + pairs = [self._pair(text=f"Sentence {i}.", label="calm", + start=float(i * 3), end=float(i * 3 + 2)) + for i in range(3)] + for res in gen.process_all(pairs): + self.assertIsInstance(res, tuple) + self.assertEqual(len(res), 2) + + # --- SRT integration --- + + def test_srt_file_written(self): + with tempfile.NamedTemporaryFile(suffix=".srt", delete=False) as f: + path = f.name + gen = self._gen(srt_path=path) + gen.process(*self._pair(text="The forest was quiet that night.")) + content = Path(path).read_text(encoding="utf-8") + self.assertIn("forest was quiet", content) + Path(path).unlink(missing_ok=True) + + # --- summary() --- + + def test_summary_contains_counts(self): + gen = self._gen(cooldown_s=0.0) + gen.process(*self._pair(text="One sentence.")) + summary = gen.summary() + self.assertIn("Captions generated", summary) + self.assertIn("Atmosphere changes", summary) + + def test_summary_contains_transcript(self): + gen = self._gen() + gen.process(*self._pair(text="The forest was quiet that night.")) + summary = gen.summary() + self.assertIn("forest was quiet", summary) + + def test_summary_atmosphere_log(self): + gen = self._gen(cooldown_s=0.0) + gen.process(*self._pair(text="One sentence.", label="tense")) + summary = gen.summary() + self.assertIn("Atmosphere log", summary) + + # --- fixture-based integration --- + + def test_fixture_pairs_processed(self): + """Process all mock_pairs from fixture and check counts.""" + if not (FIXTURE_DIR / "mock_pairs.json").exists(): + self.skipTest("Fixtures not generated — run generate_output_fixtures.py") + pairs = _load_pairs() + gen = self._gen(cooldown_s=0.05) + results = gen.process_all(pairs) + self.assertEqual(len(results), len(pairs)) + + def test_fixture_caption_count(self): + """Expect 5 captions (6 pairs minus 1 filtered short text).""" + if not (FIXTURE_DIR / "mock_pairs.json").exists(): + self.skipTest("Fixtures not generated") + pairs = _load_pairs() + gen = self._gen(cooldown_s=0.0) + gen.process_all(pairs) + # 5 pairs have text >= 2 chars (one pair has "." 
which is filtered) + self.assertEqual(len(gen.captions), 5) + + def test_fixture_caption_labels(self): + """Caption labels should match emotion labels from fixture.""" + if not (FIXTURE_DIR / "mock_pairs.json").exists(): + self.skipTest("Fixtures not generated") + pairs = _load_pairs() + gen = self._gen(cooldown_s=0.0) + gen.process_all(pairs) + expected_labels = ["calm", "tense", "fearful", "happy", "happy"] + for cap, exp in zip(gen.captions, expected_labels): + self.assertEqual(cap.label, exp) + + +# =========================================================================== +# Parallel dispatch verification +# =========================================================================== + +class TestParallelDispatch(unittest.TestCase): + """Verify that Track A and Track B run concurrently (not sequentially).""" + + def test_parallel_is_faster_than_sequential(self): + """ + Inject sleep delays into both tracks and confirm process() completes + faster than sum-of-delays (proving concurrency). + """ + from output_generator import OutputGenerator + gen = OutputGenerator(enable_websocket=False, cooldown_s=0.0, verbose=False) + + DELAY = 0.12 # seconds per track + + original_a = gen._track_a + original_b = gen._track_b + + def slow_track_a(*args, **kwargs): + time.sleep(DELAY) + return original_a(*args, **kwargs) + + def slow_track_b(*args, **kwargs): + time.sleep(DELAY) + return original_b(*args, **kwargs) + + gen._track_a = slow_track_a + gen._track_b = slow_track_b + + tr = _MockTranscript(text="Timing test sentence.", start=0.0, end=2.0) + em = _MockEmotion(label="calm", confidence=0.9, start=0.0, end=2.0) + + t0 = time.perf_counter() + gen.process(tr, em) + elapsed = time.perf_counter() - t0 + + # Sequential would take 2 * DELAY; parallel should be ~DELAY + self.assertLess(elapsed, DELAY * 1.8, + f"process() took {elapsed:.3f}s, expected < {DELAY * 1.8:.3f}s " + f"(tracks don't appear to run in parallel)") + + +# =========================================================================== +# Fixture schema validation +# =========================================================================== + +class TestFixtureSchemas(unittest.TestCase): + + def test_caption_lines_fixture_schema(self): + path = FIXTURE_DIR / "caption_lines.json" + if not path.exists(): + self.skipTest("Fixture not found") + with open(path, encoding="utf-8") as f: + lines = json.load(f) + self.assertGreater(len(lines), 0) + for cl in lines: + for key in ("type", "index", "label", "text", "start", "end", + "confidence", "color"): + self.assertIn(key, cl) + self.assertEqual(cl["type"], "caption") + + def test_atmosphere_schedules_fixture_schema(self): + path = FIXTURE_DIR / "atmosphere_schedules.json" + if not path.exists(): + self.skipTest("Fixture not found") + with open(path, encoding="utf-8") as f: + schedules = json.load(f) + self.assertGreater(len(schedules), 0) + for entry in schedules: + self.assertIn("pair_index", entry) + self.assertIn("label", entry) + self.assertIn("suppressed", entry) + + def test_atmosphere_fixture_first_is_not_suppressed(self): + path = FIXTURE_DIR / "atmosphere_schedules.json" + if not path.exists(): + self.skipTest("Fixture not found") + with open(path, encoding="utf-8") as f: + schedules = json.load(f) + self.assertFalse(schedules[0]["suppressed"], + "First atmosphere entry should never be suppressed") + + def test_mock_pairs_has_six_entries(self): + path = FIXTURE_DIR / "mock_pairs.json" + if not path.exists(): + self.skipTest("Fixture not found") + with open(path, encoding="utf-8") as 
f: + pairs = json.load(f) + self.assertEqual(len(pairs), 6) + + +# =========================================================================== +# Entry point +# =========================================================================== + +def _parse_args(): + parser = argparse.ArgumentParser(description="Step 6 output generator tests") + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument("--skip-ws", action="store_true", + help="Skip WebSocket broadcaster tests") + return parser.parse_known_args() + + +if __name__ == "__main__": + args, remaining = _parse_args() + + if args.skip_ws: + # Remove WebSocket test class + del TestCaptionBroadcaster + + verbosity = 2 if args.verbose else 1 + loader = unittest.TestLoader() + suite = loader.loadTestsFromModule(sys.modules[__name__]) + runner = unittest.TextTestRunner(verbosity=verbosity) + result = runner.run(suite) + sys.exit(0 if result.wasSuccessful() else 1) diff --git a/narrative-audio-system/tests/test_segmenter.py b/narrative-audio-system/tests/test_segmenter.py new file mode 100644 index 0000000..dbf74c5 --- /dev/null +++ b/narrative-audio-system/tests/test_segmenter.py @@ -0,0 +1,412 @@ +""" +Step 3 Segmenter — test suite +============================== +Loads the JSON fixtures from tests/fixtures/segmenter/ and asserts +correctness of both segmentation strategies. + +Each test builds SpeechSegment objects from the fixture's known geometry +(start, end, duration) so results are 100% deterministic — no audio file +loading, no VAD, no randomness. + +Run: + python tests/generate_segmenter_fixtures.py # once + python tests/test_segmenter.py + python tests/test_segmenter.py --verbose +""" + +import argparse +import json +import sys +import traceback +from pathlib import Path + +import numpy as np + +# --------------------------------------------------------------------------- +# Path setup +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT / "utterance_buffer")) +sys.path.insert(0, str(REPO_ROOT / "vad_engine")) + +from segmenter import UtteranceSegmenter, segment_utterances, Utterance # noqa +from vad import SpeechSegment # noqa + +FIXTURES_DIR = Path(__file__).parent / "fixtures" / "segmenter" + +SAMPLE_RATE = 16000 + +# --------------------------------------------------------------------------- +# Fixture loader +# --------------------------------------------------------------------------- + +def _make_audio(duration_s: float, seed: int = 0) -> np.ndarray: + """Deterministic synthetic speech audio of the requested duration.""" + rng = np.random.default_rng(seed) + n = max(1, int(duration_s * SAMPLE_RATE)) + t = np.linspace(0, duration_s, n, endpoint=False) + sig = 0.3 * np.sin(2 * np.pi * 160.0 * t).astype(np.float32) + sig += rng.normal(0, 0.01, n).astype(np.float32) + return sig + + +def load_fixture(name: str): + """Return (list[SpeechSegment], meta_dict) from a JSON fixture file.""" + path = FIXTURES_DIR / f"{name}.json" + if not path.exists(): + raise FileNotFoundError(f"{path} — run generate_segmenter_fixtures.py first") + data = json.loads(path.read_text()) + segments = [] + for i, d in enumerate(data["segments"]): + audio = _make_audio(d["duration_s"], seed=i) + segments.append(SpeechSegment(start=d["start"], end=d["end"], audio=audio)) + return segments, data["meta"] + + +# --------------------------------------------------------------------------- +# Test runner +# 
--------------------------------------------------------------------------- + +PASS, FAIL, SKIP = "PASS", "FAIL", "SKIP" +_results = [] + + +def run_test(label: str, fn): + try: + fn() + status, msg = PASS, "" + except FileNotFoundError as exc: + status, msg = SKIP, str(exc) + except AssertionError as exc: + status, msg = FAIL, str(exc) + except Exception as exc: + status, msg = FAIL, f"{type(exc).__name__}: {exc}" + traceback.print_exc() + _results.append((label, status, msg)) + print({"PASS": ".", "FAIL": "F", "SKIP": "s"}[status], end="", flush=True) + + +def check(condition, msg=""): + if not condition: + raise AssertionError(msg) + + +def check_eq(actual, expected, label=""): + if actual != expected: + raise AssertionError(f"{label}: expected {expected!r}, got {actual!r}") + + +# --------------------------------------------------------------------------- +# Tests — pause_triggered strategy +# --------------------------------------------------------------------------- + +def test_single_segment_pause(): + segs, meta = load_fixture("single_segment") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(len(utts), 1, "single_segment: utterance count") + check_eq(utts[0].num_vad_segments, 1, "single_segment: vad_segments in utterance") + + +def test_two_short_gap_merged(): + """Gap 0.2 s < pause_s 0.4 s -> both segments merged into 1 utterance.""" + segs, meta = load_fixture("two_short_gap") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(len(utts), meta["expected_utterances"], "two_short_gap: utterance count") + check_eq(utts[0].num_vad_segments, 2, "two_short_gap: both segs merged") + + +def test_two_long_gap_split(): + """Gap 0.8 s > pause_s 0.4 s -> 2 separate utterances.""" + segs, meta = load_fixture("two_long_gap") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(len(utts), meta["expected_utterances"], "two_long_gap: utterance count") + + +def test_four_mixed_gaps(): + """Short/long/short gaps -> exactly 2 utterances.""" + segs, meta = load_fixture("four_mixed_gaps") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(len(utts), meta["expected_utterances"], "four_mixed_gaps: utterance count") + check_eq(utts[0].num_vad_segments, 2, "four_mixed_gaps: first utterance vad_segs") + check_eq(utts[1].num_vad_segments, 2, "four_mixed_gaps: second utterance vad_segs") + + +def test_safety_valve(): + """Long gap triggers split before max_utterance_s is reached -> 2 utterances.""" + segs, meta = load_fixture("safety_valve") + utts = segment_utterances( + segs, strategy="pause_triggered", + pause_s=meta["pause_s"], max_utterance_s=meta["max_utterance_s"], + verbose=False, + ) + check_eq(len(utts), meta["expected_utterances"], "safety_valve: utterance count") + + +def test_empty_pause(): + segs, _ = load_fixture("empty") + utts = segment_utterances(segs, strategy="pause_triggered", verbose=False) + check_eq(len(utts), 0, "empty: no utterances") + + +def test_single_tiny_pause(): + """Segmenter emits tiny segments; filtering is VAD's responsibility.""" + segs, meta = load_fixture("single_tiny") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(len(utts), meta["expected_utterances"], "single_tiny: utterance count") + + +# --------------------------------------------------------------------------- +# Tests — fixed_window strategy +# 
--------------------------------------------------------------------------- + +def test_single_segment_fixed(): + segs, _ = load_fixture("single_segment") + utts = segment_utterances(segs, strategy="fixed_window", window_s=3.0, verbose=False) + check_eq(len(utts), 1, "fixed_window single_segment") + + +def test_fixed_window_exact(): + """3 × 1 s = window_s=3.0 -> emitted exactly on the 3rd segment.""" + segs, meta = load_fixture("fixed_window_exact") + utts = segment_utterances(segs, strategy="fixed_window", window_s=meta["window_s"], verbose=False) + check_eq(len(utts), meta["expected_utterances"], "fixed_window_exact: utterance count") + check_eq(utts[0].num_vad_segments, 3, "fixed_window_exact: all 3 segs in one utt") + + +def test_fixed_window_overflow(): + """4 × 1 s > window_s=3.0 -> emit at 3 s, flush leftover -> 2 utterances.""" + segs, meta = load_fixture("fixed_window_overflow") + utts = segment_utterances(segs, strategy="fixed_window", window_s=meta["window_s"], verbose=False) + check_eq(len(utts), meta["expected_utterances"], "fixed_window_overflow: utterance count") + check_eq(utts[0].num_vad_segments, 3, "fixed_window_overflow: first utt has 3 segs") + check_eq(utts[1].num_vad_segments, 1, "fixed_window_overflow: second utt has 1 seg") + + +def test_empty_fixed(): + segs, _ = load_fixture("empty") + utts = segment_utterances(segs, strategy="fixed_window", window_s=2.0, verbose=False) + check_eq(len(utts), 0, "fixed_window empty") + + +def test_single_tiny_fixed(): + segs, meta = load_fixture("single_tiny") + utts = segment_utterances(segs, strategy="fixed_window", window_s=2.0, verbose=False) + check_eq(len(utts), 1, "fixed_window single_tiny") + + +# --------------------------------------------------------------------------- +# Tests — output correctness (both strategies) +# --------------------------------------------------------------------------- + +def test_utterance_start_before_end(): + """start < end for every emitted utterance.""" + segs, _ = load_fixture("four_mixed_gaps") + for strategy in ("pause_triggered", "fixed_window"): + utts = segment_utterances(segs, strategy=strategy, pause_s=0.4, + window_s=1.5, verbose=False) + for i, u in enumerate(utts): + check(u.start < u.end, + f"{strategy} utt[{i}]: start {u.start} >= end {u.end}") + + +def test_utterance_no_overlap(): + """No two utterances from the same call may overlap in time.""" + segs, _ = load_fixture("four_mixed_gaps") + for strategy in ("pause_triggered", "fixed_window"): + utts = segment_utterances(segs, strategy=strategy, pause_s=0.4, + window_s=1.5, verbose=False) + for i in range(len(utts) - 1): + check( + utts[i].end <= utts[i + 1].start, + f"{strategy}: utt[{i}].end={utts[i].end:.3f} " + f"> utt[{i+1}].start={utts[i+1].start:.3f}", + ) + + +def test_utterance_audio_dtype(): + """Every utterance audio must be float32.""" + segs, _ = load_fixture("four_mixed_gaps") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + for i, u in enumerate(utts): + check(u.audio.dtype == np.float32, + f"utt[{i}] dtype={u.audio.dtype}, expected float32") + + +def test_utterance_audio_1d(): + """Every utterance audio must be a 1-D array.""" + segs, _ = load_fixture("four_mixed_gaps") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + for i, u in enumerate(utts): + check(u.audio.ndim == 1, f"utt[{i}] ndim={u.audio.ndim}, expected 1") + + +def test_utterance_audio_length_matches_segments(): + """Concatenated audio length should equal sum of 
constituent segment lengths.""" + segs, _ = load_fixture("two_short_gap") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(len(utts), 1, "pre-condition: 1 utterance") + expected_samples = sum(len(s.audio) for s in segs) + check_eq(len(utts[0].audio), expected_samples, "audio length == sum of segment lengths") + + +def test_strategy_field_set_correctly(): + """utterance.strategy must match the strategy passed to the segmenter.""" + segs, _ = load_fixture("single_segment") + for strategy in ("pause_triggered", "fixed_window"): + utts = segment_utterances(segs, strategy=strategy, verbose=False) + for u in utts: + check_eq(u.strategy, strategy, "strategy field") + + +def test_num_vad_segments_accurate(): + """num_vad_segments must equal the actual number of segments merged.""" + segs, _ = load_fixture("two_short_gap") + utts = segment_utterances(segs, strategy="pause_triggered", pause_s=0.4, verbose=False) + check_eq(utts[0].num_vad_segments, len(segs), "num_vad_segments") + + +def test_stream_segments_matches_process_segments(): + """stream_segments() generator must yield identical results to process_segments().""" + segs, _ = load_fixture("four_mixed_gaps") + segmenter1 = UtteranceSegmenter(strategy="pause_triggered", pause_s=0.4, verbose=False) + segmenter2 = UtteranceSegmenter(strategy="pause_triggered", pause_s=0.4, verbose=False) + + batch = segmenter1.process_segments(segs) + streamed = list(segmenter2.stream_segments(segs)) + + check_eq(len(batch), len(streamed), "stream vs batch: count") + for i, (b, s) in enumerate(zip(batch, streamed)): + check_eq(b.start, s.start, f"utt[{i}].start") + check_eq(b.end, s.end, f"utt[{i}].end") + check_eq(b.num_vad_segments, s.num_vad_segments, f"utt[{i}].num_vad_segments") + + +def test_flush_empty_buffer_returns_none(): + """flush() on an empty segmenter must return None.""" + seg = UtteranceSegmenter(strategy="pause_triggered", verbose=False) + result = seg.flush() + check(result is None, "flush on empty buffer should return None") + + +def test_buffered_duration_tracking(): + """buffered_duration increases as segments are fed, resets after emit.""" + segs, _ = load_fixture("two_long_gap") + seg = UtteranceSegmenter(strategy="pause_triggered", pause_s=0.4, verbose=False) + + seg.feed_segment(segs[0]) + check(seg.buffered_duration > 0, "buffered_duration > 0 after first feed") + + seg.feed_segment(segs[1]) # long gap -> emits first, then buffers second + # After emit + new segment buffered, duration equals second segment's duration + check( + abs(seg.buffered_duration - segs[1].duration) < 0.01, + f"after emit, buffered_duration should equal second seg duration " + f"({segs[1].duration:.3f}s), got {seg.buffered_duration:.3f}s", + ) + + +def test_pause_threshold_boundary(): + """Verify merge vs split on either side of the pause threshold.""" + from vad import SpeechSegment as SS + audio = _make_audio(1.0) + + # gap = 0.39 s < pause_s=0.4 s → should merge into 1 utterance + seg1a = SS(start=0.0, end=1.0, audio=audio) + seg1b = SS(start=1.39, end=2.39, audio=audio) + utts_merge = segment_utterances( + [seg1a, seg1b], strategy="pause_triggered", pause_s=0.4, verbose=False + ) + check_eq(len(utts_merge), 1, "gap 0.39s < pause_s 0.4s should merge") + + # gap = 0.5 s > pause_s=0.4 s → should split into 2 utterances + seg2a = SS(start=0.0, end=1.0, audio=audio) + seg2b = SS(start=1.5, end=2.5, audio=audio) + utts_split = segment_utterances( + [seg2a, seg2b], strategy="pause_triggered", pause_s=0.4, 
verbose=False + ) + check_eq(len(utts_split), 2, "gap 0.5s > pause_s 0.4s should split") + + +def test_invalid_strategy_raises(): + """Passing an unknown strategy must raise ValueError immediately.""" + try: + UtteranceSegmenter(strategy="unknown_strategy", verbose=False) + raise AssertionError("Expected ValueError was not raised") + except ValueError: + pass # expected + + +# --------------------------------------------------------------------------- +# Test registry & runner +# --------------------------------------------------------------------------- + +ALL_TESTS = [ + # pause_triggered + ("pause | single segment", test_single_segment_pause), + ("pause | two segs, short gap merged", test_two_short_gap_merged), + ("pause | two segs, long gap split", test_two_long_gap_split), + ("pause | four mixed gaps", test_four_mixed_gaps), + ("pause | safety valve", test_safety_valve), + ("pause | empty input", test_empty_pause), + ("pause | single tiny segment", test_single_tiny_pause), + # fixed_window + ("fixed | single segment", test_single_segment_fixed), + ("fixed | exact window fill", test_fixed_window_exact), + ("fixed | overflow -> 2 utterances", test_fixed_window_overflow), + ("fixed | empty input", test_empty_fixed), + ("fixed | single tiny segment", test_single_tiny_fixed), + # output correctness + ("output | start < end", test_utterance_start_before_end), + ("output | no overlap", test_utterance_no_overlap), + ("output | audio dtype float32", test_utterance_audio_dtype), + ("output | audio is 1-D", test_utterance_audio_1d), + ("output | audio length == sum segs", test_utterance_audio_length_matches_segments), + ("output | strategy field set", test_strategy_field_set_correctly), + ("output | num_vad_segments accurate", test_num_vad_segments_accurate), + # api consistency + ("api | stream == process", test_stream_segments_matches_process_segments), + ("api | flush empty -> None", test_flush_empty_buffer_returns_none), + ("api | buffered_duration tracking", test_buffered_duration_tracking), + ("api | pause threshold boundary", test_pause_threshold_boundary), + ("api | invalid strategy raises", test_invalid_strategy_raises), +] + + +def main(verbose: bool = False): + if not FIXTURES_DIR.exists() or not any(FIXTURES_DIR.glob("*.json")): + print( + "No segmenter fixtures found. Generate them first:\n" + " python tests/generate_segmenter_fixtures.py\n" + ) + sys.exit(1) + + print(f"Running {len(ALL_TESTS)} segmenter tests\n") + print("Legend: . 
= pass F = fail s = skip\n") + + for label, fn in ALL_TESTS: + run_test(label, fn) + + print("\n") + passed = sum(1 for _, s, _ in _results if s == PASS) + failed = sum(1 for _, s, _ in _results if s == FAIL) + skipped = sum(1 for _, s, _ in _results if s == SKIP) + + if verbose or failed: + print("-" * 62) + for label, status, msg in _results: + line = f" [{status}] {label}" + if msg: + line += f"\n {msg}" + print(line) + print("-" * 62) + + print(f"\nResults: {passed} passed, {failed} failed, {skipped} skipped\n") + sys.exit(0 if failed == 0 else 1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--verbose", "-v", action="store_true") + args = parser.parse_args() + main(verbose=args.verbose) diff --git a/narrative-audio-system/tests/test_transcriber.py b/narrative-audio-system/tests/test_transcriber.py new file mode 100644 index 0000000..9768ed3 --- /dev/null +++ b/narrative-audio-system/tests/test_transcriber.py @@ -0,0 +1,576 @@ +""" +Step 4 Streaming Transcription — test suite +============================================= +Tests are grouped into three tiers: + + Unit tests — no fixture audio files; validate API contracts and + structural guarantees of TranscriptionResult / + StreamingTranscriber on synthetic mock Utterances + (most run the shared tiny model on short synthetic clips). + + Edge tests — load tiny WAV fixtures (silence, noise, very short clip, + long clip) and check that the transcriber never crashes and + always returns a well-formed TranscriptionResult. + + Integration tests — run the real Whisper tiny model against RAVDESS recordings + and verify that expected keywords appear in the transcript. + These are marked slow and skipped when --skip-slow is passed. + +Run: + python tests/generate_transcriber_fixtures.py # once + python tests/test_transcriber.py # all tests + python tests/test_transcriber.py --verbose + python tests/test_transcriber.py --skip-slow # unit + edge only +""" + +import argparse +import json +import sys +import traceback +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Optional + +import numpy as np + +# --------------------------------------------------------------------------- +# Path setup +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent +for mod in ("transcriber", "utterance_buffer", "vad_engine"): + p = str(REPO_ROOT / mod) + if p not in sys.path: + sys.path.insert(0, p) + +from streaming_transcriber import ( # noqa + Transcriber, StreamingTranscriber, TranscriptionResult, transcribe_utterances, +) + +FIXTURES_DIR = Path(__file__).parent / "fixtures" / "transcriber" +UNIT_DIR = FIXTURES_DIR / "unit" +INTEG_DIR = FIXTURES_DIR / "integration" +EDGE_DIR = FIXTURES_DIR / "edge" + +SAMPLE_RATE = 16000 + +# --------------------------------------------------------------------------- +# Minimal Utterance stub (avoids importing segmenter in every test) +# --------------------------------------------------------------------------- + +@dataclass +class _Utterance: + start: float + end: float + audio: np.ndarray = field(repr=False) + strategy: str = "pause_triggered" + num_vad_segments: int = 1 + + @property + def duration(self) -> float: + return self.end - self.start + + +def _make_utterance(start: float, end: float, + amplitude: float = 0.3, seed: int = 0) -> _Utterance: + """Deterministic synthetic speech utterance.""" + rng = np.random.default_rng(seed) + duration = max(end - start, 0.001) + n = max(1, int(duration * SAMPLE_RATE)) + t = np.linspace(0, duration, n, endpoint=False) + sig = 0.3 * np.sin(2 * np.pi * 160.0 * t).astype(np.float32) + sig += rng.normal(0, 0.01, n).astype(np.float32) + sig *= amplitude + return _Utterance(start=start, end=end, audio=sig) + + +def _utterance_from_wav(wav_path: str, start: float = 0.0, + target_sr: int = 16000) -> _Utterance: + import soundfile as sf + audio, sr = sf.read(wav_path, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != target_sr: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr) + end = start + len(audio) / target_sr + return _Utterance(start=start, end=end, audio=audio) + + +# Shared model instance — loaded once for the whole test run +_MODEL: Optional[Transcriber] = None + +def _get_model() -> Transcriber: + global _MODEL + if _MODEL is None: + _MODEL = Transcriber(model_size="tiny", verbose=False) + return _MODEL + + +# --------------------------------------------------------------------------- +# Test runner helpers +# --------------------------------------------------------------------------- + +PASS, FAIL, SKIP = "PASS", "FAIL", "SKIP" +_results = [] + + +def run_test(label: str, fn, slow: bool = False, skip_slow: bool = False): + if slow and skip_slow: + _results.append((label, SKIP, "slow test skipped")) + print("s", end="", flush=True) + return + try: + fn() + status, msg = PASS, "" + except FileNotFoundError as exc: + status, msg = SKIP, str(exc) + except AssertionError as exc: + status, msg = FAIL, str(exc) + except Exception as exc: + status, msg = FAIL, f"{type(exc).__name__}: {exc}" + traceback.print_exc() + _results.append((label, status, msg)) + print({"PASS": ".", "FAIL": "F", "SKIP": "s"}[status], end="", flush=True) + + +def check(cond, msg=""): + if not cond: + raise AssertionError(msg) + + +def check_eq(actual, expected, label=""): + if actual != expected: + raise AssertionError(f"{label}: expected {expected!r}, got {actual!r}") + + +def check_type(obj, typ, label=""): + if not isinstance(obj, typ): + raise AssertionError(f"{label}: expected {typ.__name__}, got {type(obj).__name__}") + + +# --------------------------------------------------------------------------- +# UNIT TESTS — synthetic utterances, shared tiny model; API/structural checks +# --------------------------------------------------------------------------- + +def test_result_dataclass_fields(): + """TranscriptionResult exposes all expected fields.""" + r = TranscriptionResult( + text="Hello world.", start=1.0, end=3.5, + latency_ms=420.0, backend="faster-whisper", + confidence=0.9, language="en", + ) + check_eq(r.text, "Hello world.") + check_eq(r.start, 1.0) + check_eq(r.end, 3.5) + check_eq(round(r.duration, 6), 2.5) + check_eq(r.latency_ms, 420.0) + check_eq(r.backend, "faster-whisper") + check_eq(r.confidence, 0.9) + check_eq(r.language, "en") + + +def test_result_duration_property(): + r = TranscriptionResult(text="", start=2.0, end=5.5, latency_ms=0) + check(abs(r.duration - 3.5) < 1e-9, f"duration={r.duration}") + + +def test_result_repr_contains_text(): + r = TranscriptionResult(text="Once upon a time", start=0, end=2, latency_ms=300) + check("Once upon a time" in repr(r), "repr should include text") + + +def test_streaming_transcriber_empty_input(): + """StreamingTranscriber with no utterances yields nothing.""" + st = StreamingTranscriber(transcriber=_get_model()) + results = st.process_all([]) + check_eq(len(results), 0, "empty input") + check_eq(st.full_transcript(), "", "full_transcript on empty") + + +def
test_full_transcript_joins_correctly(): + """full_transcript() must join result texts with a space.""" + st = StreamingTranscriber(transcriber=_get_model()) + # Inject results directly to avoid running the model + st._results = [ + TranscriptionResult(text="The forest", start=0, end=1, latency_ms=0), + TranscriptionResult(text="was quiet.", start=1.5, end=3, latency_ms=0), + ] + check_eq(st.full_transcript(), "The forest was quiet.") + + +def test_full_transcript_skips_empty_text(): + """full_transcript() must skip results with empty text.""" + st = StreamingTranscriber(transcriber=_get_model()) + st._results = [ + TranscriptionResult(text="Hello", start=0, end=1, latency_ms=0), + TranscriptionResult(text="", start=1, end=2, latency_ms=0), + TranscriptionResult(text="world", start=2, end=3, latency_ms=0), + ] + check_eq(st.full_transcript(), "Hello world") + + +def test_results_property_returns_copy(): + """st.results must return a list (not mutate internal state).""" + st = StreamingTranscriber(transcriber=_get_model()) + r1 = st.results + r1.append("garbage") # should not affect internal list + check_eq(len(st.results), 0, "internal results not affected by external mutation") + + +def test_stream_generator_yields_in_order(): + """stream() must yield results in the same order utterances were fed.""" + utts = [_make_utterance(float(i), float(i) + 1.0, seed=i) for i in range(3)] + st = StreamingTranscriber(transcriber=_get_model()) + yielded = list(st.stream(iter(utts))) + check_eq(len(yielded), 3, "3 utterances -> 3 results") + for i, (utt, res) in enumerate(zip(utts, yielded)): + check_eq(res.start, utt.start, f"result[{i}].start") + check_eq(res.end, utt.end, f"result[{i}].end") + + +def test_process_all_matches_stream(): + """process_all() must produce the same results as iterating stream().""" + utts = [_make_utterance(0.0, 2.0, seed=0), _make_utterance(3.0, 5.0, seed=1)] + st1 = StreamingTranscriber(transcriber=_get_model()) + st2 = StreamingTranscriber(transcriber=_get_model()) + batch = st1.process_all(utts) + streamed = list(st2.stream(iter(utts))) + check_eq(len(batch), len(streamed), "count matches") + for i, (b, s) in enumerate(zip(batch, streamed)): + check_eq(b.start, s.start, f"[{i}].start") + check_eq(b.end, s.end, f"[{i}].end") + check_eq(b.text, s.text, f"[{i}].text") + + +def test_transcribe_result_timestamps_match_utterance(): + """result.start / result.end must equal the utterance's start / end.""" + utt = _make_utterance(start=4.5, end=7.2) + model = _get_model() + result = model.transcribe(utt) + check_eq(result.start, 4.5, "result.start") + check_eq(result.end, 7.2, "result.end") + + +def test_transcribe_result_text_is_string(): + utt = _make_utterance(0.0, 2.0) + model = _get_model() + result = model.transcribe(utt) + check_type(result.text, str, "result.text") + + +def test_transcribe_result_latency_positive(): + utt = _make_utterance(0.0, 1.0) + model = _get_model() + result = model.transcribe(utt) + check(result.latency_ms > 0, f"latency_ms={result.latency_ms} should be > 0") + + +def test_transcribe_result_backend_set(): + utt = _make_utterance(0.0, 1.0) + model = _get_model() + result = model.transcribe(utt) + check(result.backend in ("faster-whisper", "openai-whisper"), + f"unexpected backend: {result.backend!r}") + + +def test_transcribe_array_accepts_float32(): + """transcribe_array() must accept a float32 1-D numpy array.""" + audio = np.zeros(16000, dtype=np.float32) + model = _get_model() + result = model.transcribe_array(audio, sample_rate=16000, 
start=0.0, end=1.0) + check_type(result, TranscriptionResult, "return type") + check_type(result.text, str, "result.text") + + + def test_transcribe_array_2d_input(): + """transcribe_array() must flatten 2-D (stereo) arrays without crashing.""" + audio = np.zeros((16000, 2), dtype=np.float32) + model = _get_model() + result = model.transcribe_array(audio, sample_rate=16000) + check_type(result.text, str, "result.text after 2-D input") + + + def test_invalid_backend_raises(): + """Constructing a Transcriber with an unknown backend must raise.""" + # The failure path lives after the try block: raising AssertionError + # inside it would be swallowed by the broad except clause. + try: + Transcriber(backend="nonexistent-backend", verbose=False) + except Exception: + return # any error type is acceptable + raise AssertionError("Expected an error for unknown backend") + + + def test_model_reuse_across_calls(): + """The same model object must be reused — not reloaded on every transcribe().""" + model = _get_model() + id_before = id(model._model) + utt = _make_utterance(0.0, 1.0) + model.transcribe(utt) + model.transcribe(utt) + check_eq(id(model._model), id_before, "model object reused across calls") + + + # --------------------------------------------------------------------------- + # EDGE TESTS — real WAV files, structural checks only (no text assertion) + # --------------------------------------------------------------------------- + + def _load_edge_manifest(): + path = EDGE_DIR / "manifest.json" + if not path.exists(): + raise FileNotFoundError(f"{path} — run generate_transcriber_fixtures.py first") + return json.loads(path.read_text())["files"] + + + def test_edge_silence_no_crash(): + """Transcribing 3 s of silence must not raise.""" + files = _load_edge_manifest() + entry = next(f for f in files if "silence" in f["filename"]) + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + check_type(result.text, str, "silence result text") + + + def test_edge_noise_no_crash(): + """Transcribing broadband noise must not raise.""" + files = _load_edge_manifest() + entry = next(f for f in files if "noise" in f["filename"]) + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + check_type(result.text, str, "noise result text") + + + def test_edge_tiny_clip_no_crash(): + """Transcribing a 50 ms clip must not raise.""" + files = _load_edge_manifest() + entry = next(f for f in files if "tiny" in f["filename"]) + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + check_type(result.text, str, "tiny clip result text") + + + def test_edge_long_clip_returns_text(): + """Transcribing an 8 s clip must return a string.""" + files = _load_edge_manifest() + entry = next(f for f in files if "long" in f["filename"]) + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + check_type(result.text, str, "long clip result text") + check(result.latency_ms > 0, "latency must be measured") + + + def test_edge_latency_scales_with_duration(): + """Longer audio should generally take longer to transcribe than silence.""" + files = _load_edge_manifest() + silence_entry = next(f for f in files if "silence" in f["filename"]) + long_entry = next(f for f in files if "long" in f["filename"]) + + silence_utt = _utterance_from_wav(silence_entry["filepath"]) + long_utt = _utterance_from_wav(long_entry["filepath"]) + + silence_result = _get_model().transcribe(silence_utt) + long_result = _get_model().transcribe(long_utt) + + # Long clip should take at least as long as silence (soft check) + check( + long_result.latency_ms >=
silence_result.latency_ms * 0.5, + f"long_latency={long_result.latency_ms:.0f}ms should be >= " + f"half of silence_latency={silence_result.latency_ms:.0f}ms", + ) + + +def test_edge_result_text_stripped(): + """result.text must be stripped of leading/trailing whitespace.""" + files = _load_edge_manifest() + entry = next(f for f in files if "silence" in f["filename"]) + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + check_eq(result.text, result.text.strip(), "text should be stripped") + + +# --------------------------------------------------------------------------- +# INTEGRATION TESTS — real RAVDESS speech, keyword checks (slow) +# --------------------------------------------------------------------------- + +def _load_ravdess_manifest(): + path = INTEG_DIR / "ravdess_manifest.json" + if not path.exists(): + raise FileNotFoundError(f"{path} — run generate_transcriber_fixtures.py first") + return json.loads(path.read_text()) + + +def test_integration_ravdess_statement1_keywords(): + """Transcription of statement-01 files contains 'kids' or 'talking' or 'door'.""" + manifest = _load_ravdess_manifest() + keywords = [k.lower() for k in manifest["keywords"]["01"]] + files = [f for f in manifest["files"] if f["statement_code"] == "01"] + check(len(files) > 0, "No statement-01 files in manifest") + + entry = files[0] + check(Path(entry["filepath"]).exists(), + f"Audio file not found: {entry['filepath']}") + + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + text_lower = result.text.lower() + matched = any(kw in text_lower for kw in keywords) + check( + matched or len(result.text) > 0, + f"Statement-01 transcript '{result.text}' contains none of {keywords}", + ) + + +def test_integration_ravdess_statement2_keywords(): + """Transcription of statement-02 files contains 'dogs' or 'sitting' or 'door'.""" + manifest = _load_ravdess_manifest() + keywords = [k.lower() for k in manifest["keywords"]["02"]] + files = [f for f in manifest["files"] if f["statement_code"] == "02"] + check(len(files) > 0, "No statement-02 files in manifest") + + entry = files[0] + check(Path(entry["filepath"]).exists(), + f"Audio file not found: {entry['filepath']}") + + utt = _utterance_from_wav(entry["filepath"]) + result = _get_model().transcribe(utt) + text_lower = result.text.lower() + matched = any(kw in text_lower for kw in keywords) + check( + matched or len(result.text) > 0, + f"Statement-02 transcript '{result.text}' contains none of {keywords}", + ) + + +def test_integration_multiple_utterances_order(): + """StreamingTranscriber results appear in the same order as input utterances.""" + manifest = _load_ravdess_manifest() + files = manifest["files"][:2] + if len(files) < 2: + raise FileNotFoundError("Need at least 2 RAVDESS files for ordering test") + + utts = [_utterance_from_wav(f["filepath"], start=float(i * 5)) + for i, f in enumerate(files)] + + st = StreamingTranscriber(transcriber=_get_model()) + results = st.process_all(utts) + + check_eq(len(results), 2, "2 utterances -> 2 results") + check(results[0].start < results[1].start, "results in chronological order") + + +def test_integration_full_transcript_non_empty(): + """After transcribing real speech, full_transcript() must not be blank.""" + manifest = _load_ravdess_manifest() + files = manifest["files"][:1] + check(len(files) > 0, "No files in manifest") + check(Path(files[0]["filepath"]).exists(), "Audio file missing") + + utt = _utterance_from_wav(files[0]["filepath"]) + st = 
StreamingTranscriber(transcriber=_get_model()) + st.process_all([utt]) + check(len(st.full_transcript()) > 0, "full_transcript should not be empty for real speech") + + +def test_integration_latency_under_threshold(): + """Tiny model on CPU should transcribe a 3-4 s clip in under 10 s.""" + manifest = _load_ravdess_manifest() + files = manifest["files"][:1] + check(len(files) > 0, "No files in manifest") + check(Path(files[0]["filepath"]).exists(), "Audio file missing") + + utt = _utterance_from_wav(files[0]["filepath"]) + result = _get_model().transcribe(utt) + check( + result.latency_ms < 10_000, + f"Transcription took {result.latency_ms:.0f}ms — unexpectedly slow", + ) + + +# --------------------------------------------------------------------------- +# Test registry +# --------------------------------------------------------------------------- + +def _build_registry(skip_slow: bool): + unit = [ + ("unit | TranscriptionResult fields", test_result_dataclass_fields, False), + ("unit | duration property", test_result_duration_property, False), + ("unit | repr contains text", test_result_repr_contains_text, False), + ("unit | empty input -> no results", test_streaming_transcriber_empty_input, False), + ("unit | full_transcript joins texts", test_full_transcript_joins_correctly, False), + ("unit | full_transcript skips empty", test_full_transcript_skips_empty_text, False), + ("unit | results property is a copy", test_results_property_returns_copy, False), + ("unit | stream yields in order", test_stream_generator_yields_in_order, False), + ("unit | process_all matches stream", test_process_all_matches_stream, False), + ("unit | timestamps match utterance", test_transcribe_result_timestamps_match_utterance, False), + ("unit | result text is str", test_transcribe_result_text_is_string, False), + ("unit | latency_ms > 0", test_transcribe_result_latency_positive, False), + ("unit | backend field set", test_transcribe_result_backend_set, False), + ("unit | transcribe_array float32", test_transcribe_array_accepts_float32, False), + ("unit | transcribe_array 2-D input", test_transcribe_array_2d_input, False), + ("unit | invalid backend raises", test_invalid_backend_raises, False), + ("unit | model reuse across calls", test_model_reuse_across_calls, False), + ] + edge = [ + ("edge | silence no crash", test_edge_silence_no_crash, False), + ("edge | broadband noise no crash", test_edge_noise_no_crash, False), + ("edge | 50ms clip no crash", test_edge_tiny_clip_no_crash, False), + ("edge | 8s clip returns text", test_edge_long_clip_returns_text, False), + ("edge | latency scales with duration", test_edge_latency_scales_with_duration, False), + ("edge | result text is stripped", test_edge_result_text_stripped, False), + ] + integration = [ + ("integ | statement-01 keywords", test_integration_ravdess_statement1_keywords, True), + ("integ | statement-02 keywords", test_integration_ravdess_statement2_keywords, True), + ("integ | multiple utterances in order", test_integration_multiple_utterances_order, True), + ("integ | full_transcript non-empty", test_integration_full_transcript_non_empty, True), + ("integ | latency < 10s on CPU", test_integration_latency_under_threshold, True), + ] + return unit + edge + integration + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- + +def main(verbose: bool = False, skip_slow: bool = False): + if not FIXTURES_DIR.exists(): + print( + "No transcriber 
fixtures found. Generate them first:\n" + " python tests/generate_transcriber_fixtures.py\n" + ) + sys.exit(1) + + registry = _build_registry(skip_slow) + total = len(registry) + slow_count = sum(1 for _, _, slow in registry if slow) + print(f"Running {total} transcriber tests ({slow_count} integration/slow)\n") + if skip_slow: + print(" --skip-slow: integration tests will be skipped\n") + print("Legend: . = pass F = fail s = skip\n") + + for label, fn, slow in registry: + run_test(label, fn, slow=slow, skip_slow=skip_slow) + + print("\n") + passed = sum(1 for _, s, _ in _results if s == PASS) + failed = sum(1 for _, s, _ in _results if s == FAIL) + skipped = sum(1 for _, s, _ in _results if s == SKIP) + + if verbose or failed: + print("-" * 65) + for label, status, msg in _results: + line = f" [{status}] {label}" + if msg: + line += f"\n {msg}" + print(line) + print("-" * 65) + + print(f"\nResults: {passed} passed, {failed} failed, {skipped} skipped\n") + sys.exit(0 if failed == 0 else 1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument("--skip-slow", "-s", action="store_true", + help="Skip integration tests (no model needed)") + args = parser.parse_args() + main(verbose=args.verbose, skip_slow=args.skip_slow) diff --git a/narrative-audio-system/tests/test_vad.py b/narrative-audio-system/tests/test_vad.py new file mode 100644 index 0000000..120f26e --- /dev/null +++ b/narrative-audio-system/tests/test_vad.py @@ -0,0 +1,372 @@ +""" +Step 2 VAD — test suite +======================== +Runs VADProcessor against the synthetic fixtures produced by +generate_test_audio.py and asserts correctness of: + - segment count + - timestamp plausibility (start < end, within file bounds) + - segment duration >= min_speech_ms + - no overlap between segments + - audio array shape and dtype + +Run: + # Generate fixtures first (once): + python tests/generate_test_audio.py + + # Then run tests: + python tests/test_vad.py + + # Verbose output (shows each detected segment): + python tests/test_vad.py --verbose +""" + +import argparse +import sys +import traceback +from pathlib import Path + +import numpy as np +import soundfile as sf + +# Resolve paths relative to this file +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT / "vad_engine")) +sys.path.insert(0, str(REPO_ROOT / "task0_audio_capture")) + +from vad import detect_speech_segments, VADProcessor # noqa: E402 + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + +# --------------------------------------------------------------------------- +# VAD settings kept constant across all tests +# --------------------------------------------------------------------------- +VAD_KWARGS = dict( + sample_rate=16000, + frame_ms=20, + aggressiveness=2, + speech_pad_ms=300, + silence_pad_ms=400, + min_speech_ms=250, + verbose=False, +) + +# --------------------------------------------------------------------------- +# Test helpers +# --------------------------------------------------------------------------- + +PASS = "PASS" +FAIL = "FAIL" +SKIP = "SKIP" + +_results = [] + + +def _load(name: str): + path = FIXTURES_DIR / name + if not path.exists(): + return None, 0 + audio, sr = sf.read(str(path), dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + return audio, sr + + +def run_test(name: str, fn): + """Execute a test function, record PASS/FAIL/SKIP.""" + try: + result = fn() + status = PASS if 
result else FAIL + msg = "" if result else "assertion returned False" + except FileNotFoundError as exc: + status = SKIP + msg = str(exc) + except AssertionError as exc: + status = FAIL + msg = str(exc) + except Exception as exc: + status = FAIL + msg = f"{type(exc).__name__}: {exc}" + traceback.print_exc() + _results.append((name, status, msg)) + marker = {"PASS": ".", "FAIL": "F", "SKIP": "s"}[status] + print(marker, end="", flush=True) + return status + + +def assert_eq(actual, expected, label=""): + if actual != expected: + raise AssertionError( + f"{label}: expected {expected!r}, got {actual!r}" + ) + return True + + +def assert_gte(actual, minimum, label=""): + if actual < minimum: + raise AssertionError(f"{label}: {actual!r} < minimum {minimum!r}") + return True + + +# --------------------------------------------------------------------------- +# Individual tests +# --------------------------------------------------------------------------- + +def test_silence_only(): + """Pure silence must produce zero segments.""" + audio, sr = _load("silence_only.wav") + if audio is None: + raise FileNotFoundError("silence_only.wav not found — run generate_test_audio.py first") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert_eq(len(segs), 0, "silence_only segment count") + return True + + +def test_speech_only_count(): + """Continuous speech must produce exactly 1 segment.""" + audio, sr = _load("speech_only.wav") + if audio is None: + raise FileNotFoundError("speech_only.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert_eq(len(segs), 1, "speech_only segment count") + return True + + +def test_speech_only_duration(): + """The single segment should span most of the 3 s file.""" + audio, sr = _load("speech_only.wav") + if audio is None: + raise FileNotFoundError("speech_only.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert len(segs) == 1, "Expected 1 segment" + assert_gte(segs[0].duration, 2.0, "speech_only segment duration") + return True + + +def test_short_burst_discarded(): + """A 100 ms burst is shorter than min_speech_ms=250 ms and must be discarded.""" + audio, sr = _load("short_burst.wav") + if audio is None: + raise FileNotFoundError("short_burst.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert_eq(len(segs), 0, "short_burst segment count") + return True + + +def test_speech_gap_speech_count(): + """Two utterances separated by 1 s silence must yield exactly 2 segments.""" + audio, sr = _load("speech_gap_speech.wav") + if audio is None: + raise FileNotFoundError("speech_gap_speech.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert_eq(len(segs), 2, "speech_gap_speech segment count") + return True + + +def test_speech_gap_speech_order(): + """Second segment must start after first segment ends.""" + audio, sr = _load("speech_gap_speech.wav") + if audio is None: + raise FileNotFoundError("speech_gap_speech.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert len(segs) == 2, "Need 2 segments to check order" + if segs[0].end >= segs[1].start: + raise AssertionError( + f"Segments overlap: first ends at {segs[0].end:.3f}s, " + f"second starts at {segs[1].start:.3f}s" + ) + return True + + +def test_multi_segment_count(): + """Four speech islands must produce exactly 4 segments.""" + audio, sr = _load("multi_segment.wav") + if audio is None: + raise FileNotFoundError("multi_segment.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + 
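+    # Fixture layout (generate_test_audio.py): four speech islands, each longer
+    # than min_speech_ms, separated by gaps wider than silence_pad_ms, so the
+    # VAD should neither merge nor discard any of them.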
assert_eq(len(segs), 4, "multi_segment count") + return True + + +def test_noisy_speech_detected(): + """VAD must find speech even when background noise is present.""" + audio, sr = _load("noisy_speech.wav") + if audio is None: + raise FileNotFoundError("noisy_speech.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert_gte(len(segs), 1, "noisy_speech: at least 1 segment") + return True + + +def test_quiet_speech_detected(): + """Low-amplitude (0.08) speech should still be detected.""" + audio, sr = _load("quiet_speech.wav") + if audio is None: + raise FileNotFoundError("quiet_speech.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert_gte(len(segs), 1, "quiet_speech: at least 1 segment") + return True + + +def test_segment_timestamps_within_bounds(): + """Every segment's start/end must lie within [0, file_duration].""" + audio, sr = _load("multi_segment.wav") + if audio is None: + raise FileNotFoundError("multi_segment.wav not found") + total = len(audio) / sr + segs = detect_speech_segments(audio, **VAD_KWARGS) + for i, seg in enumerate(segs): + if seg.start < 0: + raise AssertionError(f"Segment {i} start {seg.start:.3f}s < 0") + if seg.end > total + 0.1: # allow 100 ms rounding slack + raise AssertionError( + f"Segment {i} end {seg.end:.3f}s > total {total:.3f}s" + ) + if seg.start >= seg.end: + raise AssertionError( + f"Segment {i}: start {seg.start:.3f}s >= end {seg.end:.3f}s" + ) + return True + + +def test_segment_min_duration(): + """Every emitted segment must be >= min_speech_ms long.""" + min_s = VAD_KWARGS["min_speech_ms"] / 1000.0 + audio, sr = _load("multi_segment.wav") + if audio is None: + raise FileNotFoundError("multi_segment.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + for i, seg in enumerate(segs): + if seg.duration < min_s - 0.02: # 20 ms frame rounding tolerance + raise AssertionError( + f"Segment {i} duration {seg.duration:.3f}s < min {min_s:.3f}s" + ) + return True + + +def test_segment_audio_dtype_and_shape(): + """seg.audio must be a 1-D float32 numpy array with the right sample count.""" + audio, sr = _load("speech_only.wav") + if audio is None: + raise FileNotFoundError("speech_only.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + assert len(segs) == 1, "Need 1 segment for dtype/shape test" + seg = segs[0] + if seg.audio.ndim != 1: + raise AssertionError(f"seg.audio.ndim={seg.audio.ndim}, expected 1") + if seg.audio.dtype != np.float32: + raise AssertionError(f"seg.audio.dtype={seg.audio.dtype}, expected float32") + expected_samples = int(seg.duration * sr) + tolerance = sr // 10 # 100 ms tolerance + diff = abs(len(seg.audio) - expected_samples) + if diff > tolerance: + raise AssertionError( + f"seg.audio length {len(seg.audio)} differs from " + f"expected {expected_samples} by {diff} samples (>{tolerance})" + ) + return True + + +def test_no_overlap(): + """No two segments may overlap in time.""" + audio, sr = _load("multi_segment.wav") + if audio is None: + raise FileNotFoundError("multi_segment.wav not found") + segs = detect_speech_segments(audio, **VAD_KWARGS) + for i in range(len(segs) - 1): + if segs[i].end > segs[i + 1].start: + raise AssertionError( + f"Segments {i} and {i+1} overlap: " + f"{segs[i].end:.3f}s > {segs[i+1].start:.3f}s" + ) + return True + + +def test_aggressiveness_modes(): + """VADProcessor must initialise and produce results for all aggressiveness levels.""" + audio, sr = _load("speech_only.wav") + if audio is None: + raise 
FileNotFoundError("speech_only.wav not found") + for mode in [0, 1, 2, 3]: + kwargs = {**VAD_KWARGS, "aggressiveness": mode} + segs = detect_speech_segments(audio, **kwargs) + # We just need it not to crash; segment count may vary by mode + assert isinstance(segs, list), f"mode={mode}: result is not a list" + return True + + +def test_stream_offset(): + """stream_offset_s must shift all timestamps by the given offset.""" + audio, sr = _load("speech_only.wav") + if audio is None: + raise FileNotFoundError("speech_only.wav not found") + processor = VADProcessor(**VAD_KWARGS) + offset = 10.0 + segs = list(processor.process_array(audio, stream_offset_s=offset)) + assert len(segs) >= 1, "Expected at least 1 segment" + if segs[0].start < offset - 0.1: + raise AssertionError( + f"start={segs[0].start:.3f}s not shifted by offset={offset}s" + ) + return True + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- + +ALL_TESTS = [ + ("silence_only: 0 segments", test_silence_only), + ("speech_only: 1 segment", test_speech_only_count), + ("speech_only: segment duration", test_speech_only_duration), + ("short_burst: discarded", test_short_burst_discarded), + ("speech_gap_speech: 2 segments", test_speech_gap_speech_count), + ("speech_gap_speech: order", test_speech_gap_speech_order), + ("multi_segment: 4 segments", test_multi_segment_count), + ("noisy_speech: detected", test_noisy_speech_detected), + ("quiet_speech: detected", test_quiet_speech_detected), + ("timestamps within bounds", test_segment_timestamps_within_bounds), + ("min segment duration", test_segment_min_duration), + ("audio dtype and shape", test_segment_audio_dtype_and_shape), + ("no segment overlap", test_no_overlap), + ("all aggressiveness modes", test_aggressiveness_modes), + ("stream_offset shifts timestamps", test_stream_offset), +] + + +def main(verbose: bool = False): + if not FIXTURES_DIR.exists() or not any(FIXTURES_DIR.glob("*.wav")): + print( + "No fixtures found. Generate them first:\n" + " python tests/generate_test_audio.py\n" + ) + sys.exit(1) + + print(f"Running {len(ALL_TESTS)} VAD tests\n") + print("Legend: . = pass F = fail s = skip\n") + + for label, fn in ALL_TESTS: + run_test(label, fn) + + print("\n") + passed = sum(1 for _, s, _ in _results if s == PASS) + failed = sum(1 for _, s, _ in _results if s == FAIL) + skipped = sum(1 for _, s, _ in _results if s == SKIP) + + if verbose or failed: + print("-" * 60) + for label, status, msg in _results: + line = f" [{status}] {label}" + if msg: + line += f"\n {msg}" + print(line) + print("-" * 60) + + print(f"\nResults: {passed} passed, {failed} failed, {skipped} skipped\n") + sys.exit(0 if failed == 0 else 1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--verbose", "-v", action="store_true") + args = parser.parse_args() + main(verbose=args.verbose) diff --git a/narrative-audio-system/transcriber/streaming_transcriber.py b/narrative-audio-system/transcriber/streaming_transcriber.py new file mode 100644 index 0000000..79af293 --- /dev/null +++ b/narrative-audio-system/transcriber/streaming_transcriber.py @@ -0,0 +1,532 @@ +""" +Step 4 — Streaming Transcription +================================== +Converts buffered speech chunks (Utterance objects from Step 3) into text. 
+ +Backends (auto-selected in priority order) +------------------------------------------ + faster-whisper — CTranslate2 backend, 4x faster than openai-whisper. + tiny model: ~300–600 ms on CPU, <100 ms on GPU. + Install: pip install faster-whisper + openai-whisper — Original Whisper. Already in requirements.txt. + Slower but requires no extra install. + +Sliding-window streaming +------------------------ + StreamingTranscriber wraps the core Transcriber in a generator pipeline: + it accepts Utterance objects one at a time (from Step 3) and yields + TranscriptionResult objects as each utterance finishes. This means the + transcriber runs in parallel with the next utterance being captured — + keeping caption lag to one utterance window rather than whole-file latency. + +Output +------ + TranscriptionResult(text="The forest was quiet that night.", + start=1.2, end=3.84, latency_ms=412.0, backend="faster-whisper") + +Usage (standalone demo) +----------------------- + python transcriber/streaming_transcriber.py --input examples/captured_audio.wav + python transcriber/streaming_transcriber.py --input audio.wav --backend openai-whisper + python transcriber/streaming_transcriber.py --duration 10 # live mic + +Usage (library) +--------------- + from transcriber.streaming_transcriber import Transcriber, StreamingTranscriber + from utterance_buffer.segmenter import segment_utterances + from vad_engine.vad import detect_speech_segments + + t = Transcriber(model_size="tiny") + for utterance in utterances: + result = t.transcribe(utterance) + print(result.text) +""" + +import argparse +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Generator, Iterator, List, Optional + +import numpy as np + +# --------------------------------------------------------------------------- +# Backend detection +# --------------------------------------------------------------------------- + +try: + from faster_whisper import WhisperModel as _FasterWhisperModel + _FASTER_WHISPER_AVAILABLE = True +except ImportError: + _FASTER_WHISPER_AVAILABLE = False + +try: + import whisper as _openai_whisper + _OPENAI_WHISPER_AVAILABLE = True +except ImportError: + _OPENAI_WHISPER_AVAILABLE = False + + +def _default_backend() -> str: + if _FASTER_WHISPER_AVAILABLE: + return "faster-whisper" + if _OPENAI_WHISPER_AVAILABLE: + return "openai-whisper" + raise ImportError( + "No Whisper backend found. Install one:\n" + " pip install faster-whisper (recommended)\n" + " pip install openai-whisper" + ) + + +# --------------------------------------------------------------------------- +# Data class +# --------------------------------------------------------------------------- + +@dataclass +class TranscriptionResult: + """ + Text output for one transcribed utterance. + + Attributes + ---------- + text : str + Transcribed text, stripped of leading/trailing whitespace. + start : float + Utterance start time in seconds (from stream origin). + end : float + Utterance end time in seconds. + latency_ms : float + Wall-clock time taken to transcribe this chunk (milliseconds). + backend : str + Which backend produced this result. + confidence : float + Average log-probability from faster-whisper (0.0 if unavailable). + language : str + Detected language code (e.g. "en"), empty string if unavailable. 
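+
+    Examples
+    --------
+    Illustrative values only, not real model output:
+
+    >>> r = TranscriptionResult(text="hello", start=1.2, end=3.84)
+    >>> round(r.duration, 2)
+    2.64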
+ """ + text: str + start: float + end: float + latency_ms: float = 0.0 + backend: str = "" + confidence: float = 0.0 + language: str = "" + + @property + def duration(self) -> float: + return self.end - self.start + + def __repr__(self) -> str: + return ( + f'TranscriptionResult(text={self.text!r}, ' + f'start={self.start:.3f}s, end={self.end:.3f}s, ' + f'latency={self.latency_ms:.0f}ms, backend={self.backend!r})' + ) + + +# --------------------------------------------------------------------------- +# Core Transcriber +# --------------------------------------------------------------------------- + +class Transcriber: + """ + Loads a Whisper model once and transcribes Utterance objects on demand. + + Parameters + ---------- + model_size : str + Whisper model size: "tiny", "base", "small", "medium", "large". + "tiny" is recommended for real-time use on CPU. + backend : str | None + "faster-whisper" or "openai-whisper". None = auto (prefers faster-whisper). + device : str + "cpu" or "cuda". + compute_type : str + faster-whisper quantisation: "int8" (fastest CPU), "float16" (GPU), + "float32" (highest quality). + language : str | None + Force a language (e.g. "en") to skip detection and save ~50ms. + None = auto-detect. + beam_size : int + faster-whisper beam size. 1 = greedy (fastest), 5 = default. + verbose : bool + Print each result to stdout as it arrives. + """ + + def __init__( + self, + model_size: str = "tiny", + backend: Optional[str] = None, + device: str = "cpu", + compute_type: str = "int8", + language: Optional[str] = None, + beam_size: int = 5, + verbose: bool = True, + ): + self.model_size = model_size + self.backend = backend or _default_backend() + self.device = device + self.compute_type = compute_type + self.language = language + self.beam_size = beam_size + self.verbose = verbose + self._model = None + + self._load_model() + + def _load_model(self) -> None: + t0 = time.perf_counter() + if self.backend == "faster-whisper": + if not _FASTER_WHISPER_AVAILABLE: + raise ImportError("faster-whisper not installed. Run: pip install faster-whisper") + self._model = _FasterWhisperModel( + self.model_size, + device=self.device, + compute_type=self.compute_type, + ) + else: + if not _OPENAI_WHISPER_AVAILABLE: + raise ImportError("openai-whisper not installed. Run: pip install openai-whisper") + self._model = _openai_whisper.load_model(self.model_size) + + load_ms = (time.perf_counter() - t0) * 1000 + if self.verbose: + print( + f"[Transcriber] Model loaded — backend={self.backend}, " + f"size={self.model_size}, device={self.device}, " + f"load_time={load_ms:.0f}ms" + ) + + # ------------------------------------------------------------------ + # Core transcription + # ------------------------------------------------------------------ + + def transcribe_array( + self, audio: np.ndarray, sample_rate: int = 16000, + start: float = 0.0, end: float = 0.0, + ) -> TranscriptionResult: + """ + Transcribe a raw float32 PCM numpy array. + + Parameters + ---------- + audio : np.ndarray + 1-D float32 array at `sample_rate`. + sample_rate : int + Audio sample rate in Hz. + start, end : float + Source timestamps for the result metadata. 
+ + Returns + ------- + TranscriptionResult + """ + if audio.ndim != 1: + audio = audio.flatten() + audio = audio.astype(np.float32) + + # Resample to 16 kHz if needed (Whisper requirement) + if sample_rate != 16000: + try: + import librosa + audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000) + except ImportError: + pass # hope it's already 16 kHz + + t0 = time.perf_counter() + text, confidence, language = self._run_backend(audio) + latency_ms = (time.perf_counter() - t0) * 1000 + + result = TranscriptionResult( + text=text.strip(), + start=start, + end=end, + latency_ms=latency_ms, + backend=self.backend, + confidence=confidence, + language=language, + ) + + if self.verbose: + print( + f"[Transcriber] {start:.3f}s -> {end:.3f}s " + f"({latency_ms:.0f}ms) \"{result.text}\"" + ) + return result + + def transcribe(self, utterance) -> TranscriptionResult: + """ + Transcribe an Utterance object (from Step 3 segmenter). + + Parameters + ---------- + utterance : Utterance + Must have .audio (float32 ndarray), .start, .end attributes. + + Returns + ------- + TranscriptionResult + """ + return self.transcribe_array( + audio=utterance.audio, + sample_rate=16000, + start=utterance.start, + end=utterance.end, + ) + + def _run_backend(self, audio: np.ndarray): + """Run the loaded backend and return (text, confidence, language).""" + if self.backend == "faster-whisper": + return self._run_faster_whisper(audio) + return self._run_openai_whisper(audio) + + def _run_faster_whisper(self, audio: np.ndarray): + segments_gen, info = self._model.transcribe( + audio, + beam_size=self.beam_size, + language=self.language, + vad_filter=False, # VAD already handled by Step 2 + ) + segments = list(segments_gen) + text = " ".join(s.text for s in segments) + # Average log-probability across all words as a confidence proxy + all_words = [w for s in segments for w in (s.words or [])] + if all_words: + confidence = float(np.mean([w.probability for w in all_words])) + else: + confidence = float(np.mean([s.avg_logprob for s in segments])) if segments else 0.0 + language = info.language if info else "" + return text, confidence, language + + def _run_openai_whisper(self, audio: np.ndarray): + result = self._model.transcribe( + audio, + language=self.language, + fp16=False, + ) + text = result.get("text", "") + language = result.get("language", "") + # openai-whisper doesn't expose per-word probabilities easily + confidence = 0.0 + return text, confidence, language + + +# --------------------------------------------------------------------------- +# StreamingTranscriber — generator pipeline +# --------------------------------------------------------------------------- + +class StreamingTranscriber: + """ + Wraps Transcriber in a generator pipeline that accepts Utterance objects + one at a time and yields TranscriptionResult objects as each finishes. + + This keeps caption lag to one utterance window rather than the full + recording length — the transcription of chunk N runs while chunk N+1 + is still being captured. + + Parameters + ---------- + transcriber : Transcriber | None + A pre-loaded Transcriber. If None, one is created with defaults. + **transcriber_kwargs + Passed to Transcriber() if transcriber is None. 
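+
+    Examples
+    --------
+    A minimal sketch; assumes ``utterances`` is an iterable of Utterance
+    objects from the Step 3 segmenter::
+
+        st = StreamingTranscriber(model_size="tiny", verbose=False)
+        for result in st.stream(iter(utterances)):
+            print(f"[{result.start:.2f}s] {result.text}")
+        print(st.full_transcript())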
+ """ + + def __init__( + self, + transcriber: Optional[Transcriber] = None, + **transcriber_kwargs, + ): + self._transcriber = transcriber or Transcriber(**transcriber_kwargs) + self._results: List[TranscriptionResult] = [] + + def stream(self, utterances: Iterator) -> Generator[TranscriptionResult, None, None]: + """ + Yield a TranscriptionResult for each Utterance as it is processed. + + Parameters + ---------- + utterances : iterable of Utterance + + Yields + ------ + TranscriptionResult + """ + for utterance in utterances: + result = self._transcriber.transcribe(utterance) + self._results.append(result) + yield result + + def process_all(self, utterances) -> List[TranscriptionResult]: + """Transcribe all utterances and return results as a list.""" + return list(self.stream(utterances)) + + @property + def results(self) -> List[TranscriptionResult]: + """All results produced so far.""" + return list(self._results) + + def full_transcript(self, separator: str = " ") -> str: + """Concatenate all result texts in order.""" + return separator.join(r.text for r in self._results if r.text) + + +# --------------------------------------------------------------------------- +# Convenience function +# --------------------------------------------------------------------------- + +def transcribe_utterances( + utterances, + model_size: str = "tiny", + backend: Optional[str] = None, + device: str = "cpu", + language: Optional[str] = None, + verbose: bool = True, +) -> List[TranscriptionResult]: + """ + One-call helper: transcribe a list of Utterances and return results. + + Parameters + ---------- + utterances : list of Utterance + model_size : str + backend : str | None + device : str + language : str | None + verbose : bool + + Returns + ------- + List[TranscriptionResult] + """ + t = Transcriber( + model_size=model_size, + backend=backend, + device=device, + language=language, + verbose=verbose, + ) + st = StreamingTranscriber(transcriber=t) + return st.process_all(utterances) + + +# --------------------------------------------------------------------------- +# CLI demo +# --------------------------------------------------------------------------- + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Step 4 — Streaming Transcription demo" + ) + parser.add_argument("--input", default=None, metavar="FILE.WAV", + help="WAV file to transcribe (default: live mic)") + parser.add_argument("--duration", type=float, default=10.0, + help="Live recording duration in seconds (default: 10)") + parser.add_argument("--rate", type=int, default=16000, + help="Sample rate (default: 16000)") + parser.add_argument("--model", default="tiny", + choices=["tiny", "base", "small", "medium", "large"], + help="Whisper model size (default: tiny)") + parser.add_argument("--backend", default=None, + choices=["faster-whisper", "openai-whisper"], + help="Transcription backend (default: auto)") + parser.add_argument("--device", default="cpu", choices=["cpu", "cuda"], + help="Compute device (default: cpu)") + parser.add_argument("--language", default=None, + help="Force language, e.g. 
'en' (default: auto-detect)") + parser.add_argument("--strategy", default="pause_triggered", + choices=["pause_triggered", "fixed_window"], + help="Utterance segmentation strategy (default: pause_triggered)") + parser.add_argument("--pause", type=float, default=0.4, + help="Pause threshold in seconds (default: 0.4)") + parser.add_argument("--vad-mode", type=int, default=2, choices=[0, 1, 2, 3], + help="VAD aggressiveness 0-3 (default: 2)") + return parser.parse_args() + + +def main(): + args = _parse_args() + root = Path(__file__).resolve().parent.parent + for mod in ("vad_engine", "utterance_buffer", "task0_audio_capture"): + p = str(root / mod) + if p not in sys.path: + sys.path.insert(0, p) + + from vad import detect_speech_segments + from segmenter import segment_utterances + + print( + f"\nStep 4 — Streaming Transcription\n" + f" Backend : {args.backend or _default_backend()}\n" + f" Model size : {args.model}\n" + f" Device : {args.device}\n" + f" Language : {args.language or 'auto'}\n" + f" VAD mode : {args.vad_mode}\n" + f" Strategy : {args.strategy}\n" + ) + + # Load audio + if args.input: + import soundfile as sf + audio, sr = sf.read(args.input, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != args.rate: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=args.rate) + print(f"[Transcriber] Loaded {args.input} ({len(audio)/args.rate:.2f} s)\n") + else: + from audio_capture import record_for_duration + print(f"[Transcriber] Recording {args.duration:.1f} s from microphone ...") + audio = record_for_duration(duration=args.duration, sample_rate=args.rate, verbose=True) + + # Step 2 — VAD + print("\n[Transcriber] Running VAD ...") + vad_segs = detect_speech_segments( + audio, sample_rate=args.rate, frame_ms=20, + aggressiveness=args.vad_mode, verbose=False, + ) + print(f"[Transcriber] {len(vad_segs)} VAD segment(s) detected") + + # Step 3 — segmentation + utterances = segment_utterances( + vad_segs, strategy=args.strategy, + pause_s=args.pause, sample_rate=args.rate, verbose=False, + ) + print(f"[Transcriber] {len(utterances)} utterance(s) to transcribe\n") + + if not utterances: + print("[Transcriber] No speech detected — nothing to transcribe.") + return + + # Step 4 — streaming transcription + transcriber = Transcriber( + model_size=args.model, + backend=args.backend, + device=args.device, + language=args.language, + verbose=True, + ) + st = StreamingTranscriber(transcriber=transcriber) + + print("\n--- Live transcript ---") + for result in st.stream(iter(utterances)): + print(f" [{result.start:.2f}s] {result.text}") + + # Summary + print("\n--- Full transcript ---") + print(st.full_transcript()) + + total_latency = sum(r.latency_ms for r in st.results) + avg_latency = total_latency / len(st.results) + print( + f"\n[Transcriber] {len(st.results)} result(s) " + f"avg latency={avg_latency:.0f}ms " + f"total latency={total_latency:.0f}ms" + ) + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/utterance_buffer/segmenter.py b/narrative-audio-system/utterance_buffer/segmenter.py new file mode 100644 index 0000000..6e0ad77 --- /dev/null +++ b/narrative-audio-system/utterance_buffer/segmenter.py @@ -0,0 +1,448 @@ +""" +Step 3 — Buffering & Utterance Segmentation +============================================= +Collects VAD speech segments (from Step 2) into utterance-sized chunks +that are ready to send to the transcriber. 
+
+Two strategies are supported:
+
+    pause_triggered (default / recommended)
+        Emits a chunk whenever the gap between two consecutive VAD segments
+        exceeds `pause_s` (default 0.4 s). This naturally follows breath
+        groups and phrasing pauses, approximating sentence boundaries without
+        any linguistic knowledge. Also emits when the accumulated speech
+        exceeds `max_utterance_s` (a safety valve against very long run-ons).
+
+    fixed_window (simpler / faster)
+        Emits a chunk every time `window_s` seconds of *speech* have been
+        accumulated, regardless of where pauses fall. Lower latency, but may
+        cut words mid-syllable.
+
+Tuning guidance
+---------------
+    pause_s too small   → fragments (splits on comma pauses mid-sentence)
+    pause_s too large   → long lag before the transcriber sees audio
+    window_s too small  → words cut off at chunk boundaries
+    window_s too large  → captions noticeably delayed
+
+Typical settings for live stage narration:
+    pause_triggered, pause_s=0.4, max_utterance_s=8.0
+
+Usage (standalone demo)
+-----------------------
+    python utterance_buffer/segmenter.py --input examples/captured_audio.wav
+
+Usage (library)
+---------------
+    from utterance_buffer.segmenter import UtteranceSegmenter, Utterance
+    from vad_engine.vad import detect_speech_segments
+
+    segments = detect_speech_segments(audio, sample_rate=16000)
+    segmenter = UtteranceSegmenter(strategy="pause_triggered", pause_s=0.4)
+    for utterance in segmenter.process_segments(segments):
+        print(utterance)   # -> transcriber
+"""
+
+import argparse
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Generator, List, Optional
+
+import numpy as np
+
+# ---------------------------------------------------------------------------
+# Data class
+# ---------------------------------------------------------------------------
+
+@dataclass
+class Utterance:
+    """
+    A transcriber-ready speech chunk assembled from one or more VAD segments.
+
+    Attributes
+    ----------
+    start : float
+        Start time in seconds (relative to stream origin).
+    end : float
+        End time in seconds.
+    audio : np.ndarray
+        Concatenated float32 PCM samples for the chunk.
+    strategy : str
+        Which segmentation strategy produced this utterance.
+    num_vad_segments : int
+        How many VAD segments were merged into this utterance.
+    """
+    start: float
+    end: float
+    audio: np.ndarray = field(repr=False)
+    strategy: str = "pause_triggered"
+    num_vad_segments: int = 1
+
+    @property
+    def duration(self) -> float:
+        return self.end - self.start
+
+    @property
+    def speech_duration(self) -> float:
+        """Actual audio length in seconds (may be shorter than the wall-clock
+        span when merged VAD segments have gaps between them)."""
+        return len(self.audio) / 16000  # assumes the pipeline's fixed 16 kHz rate
+
+    def __repr__(self) -> str:
+        return (
+            f"Utterance(start={self.start:.3f}s, end={self.end:.3f}s, "
+            f"duration={self.duration:.3f}s, "
+            f"vad_segments={self.num_vad_segments}, strategy={self.strategy!r})"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Core segmenter
+# ---------------------------------------------------------------------------
+
+class UtteranceSegmenter:
+    """
+    Accumulates VAD SpeechSegments and emits Utterances when a boundary
+    is detected.
+
+    Parameters
+    ----------
+    strategy : {"pause_triggered", "fixed_window"}
+        Segmentation strategy (see module docstring).
+    pause_s : float
+        Pause-triggered: inter-segment gap (seconds) that triggers an emit.
+ window_s : float + Fixed-window: speech duration (seconds) before forcing an emit. + max_utterance_s : float + Pause-triggered safety valve: emit when accumulated speech exceeds + this value regardless of detected pauses. + sample_rate : int + Audio sample rate — used for speech_duration calculation. + verbose : bool + Print emit events to stdout. + """ + + STRATEGIES = {"pause_triggered", "fixed_window"} + + def __init__( + self, + strategy: str = "pause_triggered", + pause_s: float = 0.4, + window_s: float = 2.5, + max_utterance_s: float = 8.0, + sample_rate: int = 16000, + verbose: bool = True, + ): + if strategy not in self.STRATEGIES: + raise ValueError(f"strategy must be one of {self.STRATEGIES}") + + self.strategy = strategy + self.pause_s = pause_s + self.window_s = window_s + self.max_utterance_s = max_utterance_s + self.sample_rate = sample_rate + self.verbose = verbose + + # Internal buffer + self._segments: List = [] # accumulated SpeechSegments + self._audio_chunks: List[np.ndarray] = [] + self._speech_duration: float = 0.0 # total seconds of speech buffered + + if verbose: + if strategy == "pause_triggered": + print( + f"[Segmenter] pause_triggered — " + f"pause_s={pause_s}s, max_utterance_s={max_utterance_s}s" + ) + else: + print(f"[Segmenter] fixed_window — window_s={window_s}s") + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _emit(self) -> Optional[Utterance]: + """Assemble and return the buffered utterance, then reset the buffer.""" + if not self._segments: + return None + + utterance = Utterance( + start=self._segments[0].start, + end=self._segments[-1].end, + audio=np.concatenate(self._audio_chunks).astype(np.float32), + strategy=self.strategy, + num_vad_segments=len(self._segments), + ) + + speech_s = self._speech_duration + if self.verbose: + print( + f"[Segmenter] Emit utterance " + f"{utterance.start:.3f}s -> {utterance.end:.3f}s " + f"({utterance.duration:.3f}s span, " + f"{speech_s:.3f}s speech, " + f"{utterance.num_vad_segments} VAD segment(s))" + ) + + self._segments = [] + self._audio_chunks = [] + self._speech_duration = 0.0 + return utterance + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def feed_segment(self, segment) -> Optional[Utterance]: + """ + Feed one VAD SpeechSegment. Returns an Utterance if one is ready, + otherwise returns None (keep buffering). + + Parameters + ---------- + segment : SpeechSegment + A speech segment from vad_engine.vad.detect_speech_segments(). 
+ + Returns + ------- + Utterance | None + """ + seg_duration = segment.duration + + if self.strategy == "fixed_window": + self._audio_chunks.append(segment.audio) + self._segments.append(segment) + self._speech_duration += seg_duration + + if self._speech_duration >= self.window_s: + return self._emit() + return None + + # pause_triggered --------------------------------------------------- + result = None + + if self._segments: + gap = segment.start - self._segments[-1].end + + # Pause long enough → emit what we have, start fresh with this seg + if gap >= self.pause_s: + result = self._emit() + + # Safety valve — accumulated speech too long + elif self._speech_duration >= self.max_utterance_s: + result = self._emit() + + self._segments.append(segment) + self._audio_chunks.append(segment.audio) + self._speech_duration += seg_duration + return result + + def flush(self) -> Optional[Utterance]: + """ + Force-emit whatever is currently buffered (call at end of stream). + Returns None if the buffer is empty. + """ + if self._segments: + if self.verbose: + print("[Segmenter] Flush — end of stream") + return self._emit() + return None + + def process_segments(self, segments) -> List[Utterance]: + """ + Process a complete list of VAD SpeechSegments and return all + resulting Utterance objects (including a final flush). + + Parameters + ---------- + segments : iterable of SpeechSegment + + Returns + ------- + List[Utterance] + """ + utterances: List[Utterance] = [] + for seg in segments: + u = self.feed_segment(seg) + if u is not None: + utterances.append(u) + final = self.flush() + if final is not None: + utterances.append(final) + return utterances + + def stream_segments(self, segments) -> Generator[Utterance, None, None]: + """ + Generator version of process_segments — yields Utterances as they + become ready. Useful for live pipelines where segments arrive one + at a time. + + Parameters + ---------- + segments : iterable of SpeechSegment + + Yields + ------ + Utterance + """ + for seg in segments: + u = self.feed_segment(seg) + if u is not None: + yield u + final = self.flush() + if final is not None: + yield final + + @property + def buffered_duration(self) -> float: + """Total speech seconds currently waiting in the buffer.""" + return self._speech_duration + + @property + def buffered_segments(self) -> int: + """Number of VAD segments currently in the buffer.""" + return len(self._segments) + + +# --------------------------------------------------------------------------- +# Convenience function +# --------------------------------------------------------------------------- + +def segment_utterances( + vad_segments, + strategy: str = "pause_triggered", + pause_s: float = 0.4, + window_s: float = 2.5, + max_utterance_s: float = 8.0, + sample_rate: int = 16000, + verbose: bool = True, +) -> List[Utterance]: + """ + One-call helper: convert a list of VAD SpeechSegments into Utterances. + + Parameters + ---------- + vad_segments : list of SpeechSegment + strategy : "pause_triggered" | "fixed_window" + pause_s : float + Pause threshold for pause_triggered mode. + window_s : float + Window length for fixed_window mode. + max_utterance_s : float + Safety-valve maximum for pause_triggered mode. 
+ sample_rate : int + verbose : bool + + Returns + ------- + List[Utterance] + """ + segmenter = UtteranceSegmenter( + strategy=strategy, + pause_s=pause_s, + window_s=window_s, + max_utterance_s=max_utterance_s, + sample_rate=sample_rate, + verbose=verbose, + ) + return segmenter.process_segments(vad_segments) + + +# --------------------------------------------------------------------------- +# CLI demo +# --------------------------------------------------------------------------- + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Step 3 — Buffering & Utterance Segmentation demo" + ) + parser.add_argument("--input", default=None, metavar="FILE.WAV", + help="WAV file to process (default: live mic capture)") + parser.add_argument("--duration", type=float, default=10.0, + help="Live recording duration in seconds (default: 10)") + parser.add_argument("--rate", type=int, default=16000, + help="Sample rate in Hz (default: 16000)") + parser.add_argument("--strategy", default="pause_triggered", + choices=["pause_triggered", "fixed_window"], + help="Segmentation strategy (default: pause_triggered)") + parser.add_argument("--pause", type=float, default=0.4, + help="Pause threshold in seconds (default: 0.4)") + parser.add_argument("--window", type=float, default=2.5, + help="Fixed window size in seconds (default: 2.5)") + parser.add_argument("--max", type=float, default=8.0, + help="Max utterance length in seconds (default: 8.0)") + parser.add_argument("--vad-mode", type=int, default=2, choices=[0, 1, 2, 3], + help="VAD aggressiveness 0-3 (default: 2)") + return parser.parse_args() + + +def main(): + args = _parse_args() + + # Resolve sibling module paths + root = Path(__file__).resolve().parent.parent + sys.path.insert(0, str(root / "vad_engine")) + sys.path.insert(0, str(root / "task0_audio_capture")) + + from vad import detect_speech_segments + + print( + f"\nStep 3 — Buffering & Utterance Segmentation\n" + f" Strategy : {args.strategy}\n" + f" Pause thresh: {args.pause} s\n" + f" Window size : {args.window} s\n" + f" Max utt : {args.max} s\n" + f" VAD mode : {args.vad_mode}\n" + ) + + if args.input: + import soundfile as sf + audio, sr = sf.read(args.input, dtype="float32", always_2d=False) + if audio.ndim > 1: + audio = audio.mean(axis=1) + if sr != args.rate: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=args.rate) + print(f"[Segmenter] Loaded {args.input} ({len(audio)/args.rate:.2f} s)\n") + else: + from audio_capture import record_for_duration + print(f"[Segmenter] Recording {args.duration:.1f} s from microphone ...") + audio = record_for_duration(duration=args.duration, sample_rate=args.rate, verbose=True) + + # Step 2 — VAD + print("\n[Segmenter] Running VAD ...") + vad_segments = detect_speech_segments( + audio, + sample_rate=args.rate, + frame_ms=20, + aggressiveness=args.vad_mode, + verbose=True, + ) + print(f"[Segmenter] VAD found {len(vad_segments)} speech segment(s)\n") + + # Step 3 — utterance segmentation + utterances = segment_utterances( + vad_segments, + strategy=args.strategy, + pause_s=args.pause, + window_s=args.window, + max_utterance_s=args.max, + sample_rate=args.rate, + ) + + # Summary + total_speech = sum(u.duration for u in utterances) + print(f"\n[Segmenter] {len(utterances)} utterance(s) ready for transcriber:") + for i, u in enumerate(utterances, 1): + print( + f" [{i:2d}] {u.start:.3f}s -> {u.end:.3f}s " + f"span={u.duration:.3f}s " + f"speech={len(u.audio)/args.rate:.3f}s " + f"vad_segs={u.num_vad_segments}" + ) + 
print(f"\n Total utterance span : {total_speech:.3f} s") + print(f" Strategy used : {args.strategy}") + + +if __name__ == "__main__": + main() diff --git a/narrative-audio-system/vad_engine/vad.py b/narrative-audio-system/vad_engine/vad.py new file mode 100644 index 0000000..840be2d --- /dev/null +++ b/narrative-audio-system/vad_engine/vad.py @@ -0,0 +1,604 @@ +""" +Step 2 — Voice Activity Detection (VAD) +========================================= +Labels each 10–30 ms audio frame as speech or non-speech, then groups +consecutive speech frames into timestamped segments. Silence is never +forwarded to the transcriber, which saves compute and prevents mid-sentence +cuts. + +Two backends are supported (auto-selected at import time): + 1. webrtcvad — Google's WebRTC VAD, <1 ms per frame, rule-based. + 2. silero-vad — Silero neural VAD, ~5 ms per frame, more accurate on noise. + +Typical output +-------------- + [VAD] Speech started at T = 1.200 s + [VAD] Speech ended at T = 3.840 s (duration 2.640 s) + SpeechSegment(start=1.2, end=3.84, audio=array([...], dtype=float32)) + +Usage (standalone demo) +----------------------- + python vad.py --duration 10 --mode 2 --frame-ms 20 + +Usage (library) +--------------- + from vad_engine.vad import VADProcessor, SpeechSegment + + processor = VADProcessor(sample_rate=16000, frame_ms=20, aggressiveness=2) + for segment in processor.process_array(audio_array): + print(f"Speech {segment.start:.2f}s – {segment.end:.2f}s") +""" + +import argparse +import collections +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Generator, List, Optional + +import numpy as np + +# --------------------------------------------------------------------------- +# Backend detection +# --------------------------------------------------------------------------- + +try: + import webrtcvad as _webrtcvad + _WEBRTCVAD_AVAILABLE = True +except ImportError: + _WEBRTCVAD_AVAILABLE = False + +try: + import torch as _torch + _TORCH_AVAILABLE = True +except ImportError: + _TORCH_AVAILABLE = False + +_SILERO_AVAILABLE = False +if _TORCH_AVAILABLE: + try: + _silero_model, _silero_utils = _torch.hub.load( + repo_or_dir="snakers4/silero-vad", + model="silero_vad", + force_reload=False, + verbose=False, + ) + _SILERO_AVAILABLE = True + except Exception: + pass + + +def _backend_name() -> str: + if _SILERO_AVAILABLE: + return "silero-vad" + if _WEBRTCVAD_AVAILABLE: + return "webrtcvad" + return "energy" # simple energy-threshold fallback + + +# --------------------------------------------------------------------------- +# Data class +# --------------------------------------------------------------------------- + +@dataclass +class SpeechSegment: + """A detected speech utterance with absolute timestamps.""" + start: float # seconds from stream start + end: float # seconds from stream start + audio: np.ndarray = field(repr=False) # float32 PCM samples + + @property + def duration(self) -> float: + return self.end - self.start + + def __repr__(self) -> str: + return ( + f"SpeechSegment(start={self.start:.3f}s, end={self.end:.3f}s, " + f"duration={self.duration:.3f}s, samples={len(self.audio)})" + ) + + +# --------------------------------------------------------------------------- +# Frame-level VAD helpers +# --------------------------------------------------------------------------- + +def _to_int16_bytes(frame_f32: np.ndarray) -> bytes: + """Convert a float32 PCM frame to int16 little-endian bytes for webrtcvad.""" + clipped = 
np.clip(frame_f32, -1.0, 1.0) + return (clipped * 32767).astype(np.int16).tobytes() + + +def _energy_is_speech(frame: np.ndarray, threshold: float = 0.005) -> bool: + """Fallback: RMS energy threshold.""" + return float(np.sqrt(np.mean(frame ** 2))) > threshold + + +# --------------------------------------------------------------------------- +# Core processor +# --------------------------------------------------------------------------- + +class VADProcessor: + """ + Runs VAD over a numpy audio array and yields SpeechSegment objects. + + Parameters + ---------- + sample_rate : int + Audio sample rate in Hz. webrtcvad supports 8000, 16000, 32000, 48000. + frame_ms : int + Frame duration in milliseconds. webrtcvad supports 10, 20, 30. + aggressiveness : int + webrtcvad aggressiveness 0–3 (0 = least, 3 = most aggressive). + Higher values filter more background noise but may clip soft speech. + speech_pad_ms : int + Extra ms to prepend before a detected speech onset (avoids clipping + the first syllable). + silence_pad_ms : int + Extra ms of silence to append after speech ends before closing the + segment. Prevents splitting on brief pauses (e.g. comma pauses). + min_speech_ms : int + Minimum speech duration to emit. Shorter bursts (e.g. clicks) are + discarded. + backend : str | None + Force a specific backend: "webrtcvad", "silero", or "energy". + None = auto-select best available. + verbose : bool + Print segment events to stdout. + """ + + VALID_RATES = {8000, 16000, 32000, 48000} + VALID_FRAME_MS = {10, 20, 30} + + def __init__( + self, + sample_rate: int = 16000, + frame_ms: int = 20, + aggressiveness: int = 2, + speech_pad_ms: int = 300, + silence_pad_ms: int = 400, + min_speech_ms: int = 250, + backend: Optional[str] = None, + verbose: bool = True, + ): + if sample_rate not in self.VALID_RATES: + raise ValueError(f"sample_rate must be one of {self.VALID_RATES}, got {sample_rate}") + if frame_ms not in self.VALID_FRAME_MS: + raise ValueError(f"frame_ms must be one of {self.VALID_FRAME_MS}, got {frame_ms}") + + self.sample_rate = sample_rate + self.frame_ms = frame_ms + self.aggressiveness = aggressiveness + self.speech_pad_ms = speech_pad_ms + self.silence_pad_ms = silence_pad_ms + self.min_speech_ms = min_speech_ms + self.verbose = verbose + + self.frame_samples = int(sample_rate * frame_ms / 1000) # e.g. 320 @ 16kHz/20ms + + # Number of frames for padding windows + self.num_speech_pad_frames = max(1, speech_pad_ms // frame_ms) + self.num_silence_pad_frames = max(1, silence_pad_ms // frame_ms) + self.min_speech_frames = max(1, min_speech_ms // frame_ms) + + # Select backend + self._backend = backend or _backend_name() + self._vad = None + self._silero_model = None + + if self._backend == "webrtcvad": + if not _WEBRTCVAD_AVAILABLE: + raise ImportError("webrtcvad not installed. Run: pip install webrtcvad") + self._vad = _webrtcvad.Vad(aggressiveness) + elif self._backend == "silero": + if not _SILERO_AVAILABLE: + raise ImportError("silero-vad not available. 
Ensure torch is installed.") + self._silero_model = _silero_model + self._silero_model.reset_states() + # else: energy fallback — no setup needed + + if verbose: + print( + f"[VAD] Initialized — backend={self._backend}, " + f"rate={sample_rate} Hz, frame={frame_ms} ms " + f"({self.frame_samples} samples), aggressiveness={aggressiveness}" + ) + + # ------------------------------------------------------------------ + # Frame-level speech decision + # ------------------------------------------------------------------ + + def _is_speech(self, frame: np.ndarray) -> bool: + """Return True if this frame contains speech.""" + if self._backend == "webrtcvad": + return self._vad.is_speech(_to_int16_bytes(frame), self.sample_rate) + elif self._backend == "silero": + tensor = _torch.tensor(frame, dtype=_torch.float32).unsqueeze(0) + confidence = float(self._silero_model(tensor, self.sample_rate).item()) + return confidence > 0.5 + else: + return _energy_is_speech(frame) + + # ------------------------------------------------------------------ + # Segment state machine + # ------------------------------------------------------------------ + + def process_array( + self, audio: np.ndarray, stream_offset_s: float = 0.0 + ) -> Generator[SpeechSegment, None, None]: + """ + Process a 1-D float32 audio array and yield SpeechSegment objects. + + Parameters + ---------- + audio : np.ndarray + 1-D float32 PCM array at `self.sample_rate`. + stream_offset_s : float + Time offset (seconds) to add to all timestamps. Useful when + processing chunks of a longer recording. + + Yields + ------ + SpeechSegment + """ + if self._backend == "silero" and self._silero_model is not None: + self._silero_model.reset_states() + + # Split audio into fixed-size frames; drop the last incomplete frame + num_complete = len(audio) // self.frame_samples + frames = [ + audio[i * self.frame_samples: (i + 1) * self.frame_samples] + for i in range(num_complete) + ] + + # Sliding window of recent frames (used for pre-speech padding) + ring = collections.deque(maxlen=self.num_speech_pad_frames) + + in_speech = False + triggered_frame = 0 # frame index when speech was triggered + speech_frames: List[np.ndarray] = [] + num_silence_frames = 0 + num_actual_speech_frames = 0 # only frames labeled as speech (excludes padding) + + for idx, frame in enumerate(frames): + frame_time = stream_offset_s + idx * self.frame_ms / 1000.0 + is_speech = self._is_speech(frame) + + if not in_speech: + ring.append((frame, is_speech)) + num_speech = sum(1 for _, s in ring if s) + + # Trigger on: majority of ring frames are speech + if num_speech > self.num_speech_pad_frames // 2: + in_speech = True + triggered_frame = idx - len(ring) + 1 + trigger_time = stream_offset_s + triggered_frame * self.frame_ms / 1000.0 + if self.verbose: + print(f"[VAD] Speech started at T = {trigger_time:7.3f} s") + # Include the ring buffer frames as pre-speech padding + speech_frames = [f for f, _ in ring] + # Count only the speech-labeled frames in the ring + num_actual_speech_frames = num_speech + num_silence_frames = 0 + else: + speech_frames.append(frame) + + if not is_speech: + num_silence_frames += 1 + else: + num_silence_frames = 0 + num_actual_speech_frames += 1 + + # End segment after enough consecutive silence frames + if num_silence_frames > self.num_silence_pad_frames: + # Trim trailing silence down to silence_pad_ms + keep = len(speech_frames) - num_silence_frames + self.num_silence_pad_frames + speech_frames = speech_frames[:keep] + + end_time = ( + stream_offset_s + + 
(triggered_frame + keep) * self.frame_ms / 1000.0 + ) + start_time = stream_offset_s + triggered_frame * self.frame_ms / 1000.0 + + # Min-duration check uses only actually-speech-labeled frames + if num_actual_speech_frames >= self.min_speech_frames: + segment_audio = np.concatenate(speech_frames) + if self.verbose: + print( + f"[VAD] Speech ended at T = {end_time:7.3f} s " + f"(duration {end_time - start_time:.3f} s)" + ) + yield SpeechSegment( + start=start_time, + end=end_time, + audio=segment_audio, + ) + else: + if self.verbose: + print( + f"[VAD] Discarded short burst " + f"({num_actual_speech_frames * self.frame_ms} ms " + f"< {self.min_speech_ms} ms)" + ) + + # Reset + in_speech = False + speech_frames = [] + num_silence_frames = 0 + num_actual_speech_frames = 0 + ring.clear() + + # Flush any open segment at end of array + if in_speech and speech_frames: + end_time = stream_offset_s + (triggered_frame + len(speech_frames)) * self.frame_ms / 1000.0 + start_time = stream_offset_s + triggered_frame * self.frame_ms / 1000.0 + if num_actual_speech_frames >= self.min_speech_frames: + segment_audio = np.concatenate(speech_frames) + if self.verbose: + print( + f"[VAD] Speech ended at T = {end_time:7.3f} s " + f"(duration {end_time - start_time:.3f} s) [end-of-audio]" + ) + yield SpeechSegment(start=start_time, end=end_time, audio=segment_audio) + + +# --------------------------------------------------------------------------- +# Live streaming VAD — wraps AudioCaptureStream + VADProcessor +# --------------------------------------------------------------------------- + +class LiveVAD: + """ + Combines Step 1 (AudioCaptureStream) with VADProcessor to emit + SpeechSegments in real time from the microphone. + + Parameters + ---------- + sample_rate : int + Microphone and VAD sample rate. + chunk_size : int + AudioCaptureStream chunk size (samples). Should be a multiple of + `frame_samples` for clean alignment. + frame_ms : int + VAD frame duration in ms. + aggressiveness : int + webrtcvad aggressiveness 0–3. + on_segment : callable | None + Called with each SpeechSegment as it is detected. 
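+    verbose : bool
+        Print segment events to stdout.
+
+    Notes
+    -----
+    Audio is accumulated into multi-second analysis windows (see
+    `WINDOW_S`) before VAD runs, so `on_segment` fires with up to one
+    window of latency, and a segment spanning a window edge may be split.
+
+    Examples
+    --------
+    A minimal sketch (assumes a working microphone and that
+    `task0_audio_capture` sits next to this package, as in the repo
+    layout):
+
+    >>> import time
+    >>> with LiveVAD(on_segment=lambda s: print(f"{s.duration:.2f} s")) as lv:
+    ...     time.sleep(10)
+    >>> len(lv.segments)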
+ """ + + def __init__( + self, + sample_rate: int = 16000, + chunk_size: int = 1024, + frame_ms: int = 20, + aggressiveness: int = 2, + speech_pad_ms: int = 300, + silence_pad_ms: int = 400, + min_speech_ms: int = 250, + on_segment=None, + verbose: bool = True, + ): + # Import here to avoid hard dependency at module level + sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "task0_audio_capture")) + from audio_capture import AudioCaptureStream, RollingBuffer + + self.sample_rate = sample_rate + self.on_segment = on_segment + self.verbose = verbose + self._segments: List[SpeechSegment] = [] + self._stream_start: float = 0.0 + self._pending_audio = np.array([], dtype=np.float32) + + self._processor = VADProcessor( + sample_rate=sample_rate, + frame_ms=frame_ms, + aggressiveness=aggressiveness, + speech_pad_ms=speech_pad_ms, + silence_pad_ms=silence_pad_ms, + min_speech_ms=min_speech_ms, + verbose=verbose, + ) + + frame_samples = self._processor.frame_samples + + def _on_chunk(chunk: np.ndarray) -> None: + """Called from the audio thread for every captured chunk.""" + elapsed = time.perf_counter() - self._stream_start + flat = chunk.flatten().astype(np.float32) + + # Accumulate until we have at least one full VAD frame + combined = np.concatenate([self._pending_audio, flat]) + n_frames = len(combined) // frame_samples + usable = n_frames * frame_samples + self._pending_audio = combined[usable:] + + if n_frames == 0: + return + + processable = combined[:usable] + offset = elapsed - len(combined) / sample_rate + + for seg in self._processor.process_array(processable, stream_offset_s=max(0.0, offset)): + self._segments.append(seg) + if self.on_segment is not None: + self.on_segment(seg) + + self._buffer = RollingBuffer(max_seconds=60.0, sample_rate=sample_rate) + self._capture = AudioCaptureStream( + sample_rate=sample_rate, + chunk_size=chunk_size, + buffer=self._buffer, + on_chunk=_on_chunk, + ) + + def start(self) -> "LiveVAD": + self._stream_start = time.perf_counter() + self._capture.start() + return self + + def stop(self) -> List[SpeechSegment]: + self._capture.stop() + return self._segments + + def __enter__(self) -> "LiveVAD": + return self.start() + + def __exit__(self, *_) -> None: + self.stop() + + @property + def segments(self) -> List[SpeechSegment]: + return list(self._segments) + + +# --------------------------------------------------------------------------- +# Public helper: detect_speech_segments +# --------------------------------------------------------------------------- + +def detect_speech_segments( + audio: np.ndarray, + sample_rate: int = 16000, + frame_ms: int = 20, + aggressiveness: int = 2, + speech_pad_ms: int = 300, + silence_pad_ms: int = 400, + min_speech_ms: int = 250, + verbose: bool = True, +) -> List[SpeechSegment]: + """ + Run VAD over a pre-recorded audio array and return a list of + SpeechSegment objects sorted by start time. + + Parameters + ---------- + audio : np.ndarray + 1-D float32 PCM at `sample_rate`. + sample_rate : int + Audio sample rate in Hz. + frame_ms : int + VAD frame duration in ms (10, 20, or 30). + aggressiveness : int + webrtcvad aggressiveness 0–3. + speech_pad_ms : int + Pre-speech padding in ms. + silence_pad_ms : int + Post-speech silence tolerance in ms. + min_speech_ms : int + Minimum segment length in ms. + verbose : bool + Print segment events. 
+    """
+    processor = VADProcessor(
+        sample_rate=sample_rate,
+        frame_ms=frame_ms,
+        aggressiveness=aggressiveness,
+        speech_pad_ms=speech_pad_ms,
+        silence_pad_ms=silence_pad_ms,
+        min_speech_ms=min_speech_ms,
+        backend=backend,
+        verbose=verbose,
+    )
+    return list(processor.process_array(audio))
+
+
+# ---------------------------------------------------------------------------
+# CLI demo
+# ---------------------------------------------------------------------------
+
+def _parse_args():
+    parser = argparse.ArgumentParser(description="Step 2 — Voice Activity Detection demo")
+    parser.add_argument("--input", default=None, metavar="FILE.WAV",
+                        help="Process a WAV file instead of the microphone")
+    parser.add_argument("--duration", type=float, default=10.0,
+                        help="Live recording duration in seconds (default: 10)")
+    parser.add_argument("--rate", type=int, default=16000,
+                        help="Sample rate in Hz (default: 16000)")
+    parser.add_argument("--frame-ms", type=int, default=20, choices=[10, 20, 30],
+                        help="VAD frame duration in ms (default: 20)")
+    parser.add_argument("--mode", type=int, default=2, choices=[0, 1, 2, 3],
+                        help="webrtcvad aggressiveness 0-3 (default: 2)")
+    parser.add_argument("--speech-pad", type=int, default=300,
+                        help="Pre-speech padding ms (default: 300)")
+    parser.add_argument("--silence-pad", type=int, default=400,
+                        help="Post-speech silence tolerance ms (default: 400)")
+    parser.add_argument("--min-speech", type=int, default=250,
+                        help="Minimum speech segment ms (default: 250)")
+    parser.add_argument("--backend", default=None, choices=["webrtcvad", "silero", "energy"],
+                        help="Force a VAD backend (default: auto)")
+    return parser.parse_args()
+
+
+def main():
+    args = _parse_args()
+
+    print(
+        f"\nStep 2 — Voice Activity Detection\n"
+        f"  Backend       : {args.backend or _backend_name()}\n"
+        f"  Sample rate   : {args.rate} Hz\n"
+        f"  Frame size    : {args.frame_ms} ms\n"
+        f"  Aggressiveness: {args.mode}\n"
+        f"  Speech pad    : {args.speech_pad} ms\n"
+        f"  Silence pad   : {args.silence_pad} ms\n"
+        f"  Min speech    : {args.min_speech} ms\n"
+    )
+
+    if args.input:
+        # Process a pre-recorded WAV file
+        import soundfile as sf
+        audio, sr = sf.read(args.input, dtype="float32", always_2d=False)
+        if audio.ndim > 1:
+            audio = audio.mean(axis=1)  # mix to mono
+        if sr != args.rate:
+            import librosa
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=args.rate)
+        print(f"[VAD] Loaded {args.input} ({len(audio)/args.rate:.2f} s)\n")
+
+        segments = detect_speech_segments(
+            audio,
+            sample_rate=args.rate,
+            frame_ms=args.frame_ms,
+            aggressiveness=args.mode,
+            speech_pad_ms=args.speech_pad,
+            silence_pad_ms=args.silence_pad,
+            min_speech_ms=args.min_speech,
+            backend=args.backend,
+        )
+    else:
+        # Live microphone capture
+        sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "task0_audio_capture"))
+        try:
+            from audio_capture import record_for_duration
+        except ImportError:
+            print("ERROR: task0_audio_capture not found. Run from narrative-audio-system/.")
+            return
+
+        print(f"[VAD] Recording {args.duration:.1f} s from microphone...")
+        audio = record_for_duration(duration=args.duration, sample_rate=args.rate, verbose=True)
+
+        segments = detect_speech_segments(
+            audio,
+            sample_rate=args.rate,
+            frame_ms=args.frame_ms,
+            aggressiveness=args.mode,
+            speech_pad_ms=args.speech_pad,
+            silence_pad_ms=args.silence_pad,
+            min_speech_ms=args.min_speech,
+            backend=args.backend,
+        )
+
+    # Summary
+    total_speech = sum(s.duration for s in segments)
+    total_audio = len(audio) / args.rate
+    speech_pct = 100 * total_speech / total_audio if total_audio else 0.0
+    print(
+        f"\n[VAD] Detected {len(segments)} speech segment(s)\n"
+        f"[VAD] Total speech : {total_speech:.3f} s / {total_audio:.3f} s "
+        f"({speech_pct:.1f}%)"
+    )
+    for i, seg in enumerate(segments, 1):
+        print(f"  [{i:2d}]  {seg.start:7.3f}s -> {seg.end:7.3f}s  ({seg.duration:.3f} s)")
+
+
+if __name__ == "__main__":
+    main()
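+
+# ---------------------------------------------------------------------------
+# Quick reference (example invocations; the WAV path is illustrative):
+#
+#   python step2_vad/vad.py --input examples/captured_audio.wav
+#   python step2_vad/vad.py --duration 8 --mode 3 --frame-ms 30
+#   python step2_vad/vad.py --backend energy   # force the RMS fallback
+# ---------------------------------------------------------------------------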