"""Gradio demo for LuxTTS voice cloning.

Loads the LuxTTS model once at import time and exposes a small web UI that
takes a reference recording plus some text and returns synthesized speech.
"""

import os
import tempfile
import traceback

import gradio as gr
import numpy as np
import soundfile as sf
from zipvoice.luxvoice import LuxTTS

# Sample rate (Hz) reported to Gradio for the generated waveform.
OUTPUT_SAMPLE_RATE = 48000

# Initialize LuxTTS model (only once at startup)
print("Loading LuxTTS model...")
lux_tts = LuxTTS('YatharthS/LuxTTS', device='cuda', threads=2)  # Use device='cpu' for CPU
print("Model loading complete")


def generate(audio, text, rms, num_steps, t_shift):
    """Generate speech for ``text`` using ``audio`` as the reference voice.

    Args:
        audio: ``(sample_rate, audio_data)`` pair as delivered by the
            ``gr.Audio(type="numpy")`` input, or ``None`` when nothing was
            uploaded or recorded.
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        rms: Forwarded to ``lux_tts.encode_prompt`` as ``rms``.
        num_steps: Forwarded to ``lux_tts.generate_speech`` as ``num_steps``
            (converted to ``int`` first).
        t_shift: Forwarded to ``lux_tts.generate_speech`` as ``t_shift``.

    Returns:
        A ``(audio, status)`` pair for the two Gradio outputs. On success
        ``audio`` is ``(OUTPUT_SAMPLE_RATE, waveform)``; on any failure it is
        ``None`` and ``status`` carries an ``"Error: ..."`` message. Failures
        are reported through the status string rather than raised.
    """
    if audio is None:
        return None, "Error: Please upload an audio file"

    if not text or not text.strip():
        return None, "Error: Please enter text"

    tmp_path = None
    try:
        sample_rate, audio_data = audio

        print(f"Received text: {text}")
        print(f"Sample rate: {sample_rate}")
        print(f"Audio data shape: {audio_data.shape}")
        print(f"Parameters - RMS: {rms}, Num steps: {num_steps}, T-shift: {t_shift}")

        # Save audio to temporary file (LuxTTS requires file path).
        # Only reserve the name here and close the handle before writing, so
        # soundfile is the sole writer of the file.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            tmp_path = tmp_file.name
        sf.write(tmp_path, audio_data, sample_rate)

        # Encode audio (first run may take ~10 seconds)
        print("Encoding audio...")
        encoded_prompt = lux_tts.encode_prompt(tmp_path, rms=rms)

        # Generate speech
        print("Generating speech...")
        final_wav = lux_tts.generate_speech(
            text,
            encoded_prompt,
            # Gradio sliders may deliver the value as a float; a step count
            # has to be integral.
            num_steps=int(num_steps),
            t_shift=t_shift,
        )

        # Convert to numpy array
        # NOTE(review): assumes generate_speech returns a CPU tensor; a
        # device tensor would need to be moved to host first - confirm.
        final_wav = final_wav.numpy().squeeze()

        print("Speech generation complete")

        # Return with 48000Hz sample rate
        return (OUTPUT_SAMPLE_RATE, final_wav), "✓ Speech generation complete"

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box instead
        # of letting the exception escape the handler.
        print(f"Error occurred: {e}")
        traceback.print_exc()
        return None, f"Error: {e}"

    finally:
        # Delete temporary file on success AND on failure, so a failed
        # request does not leave a WAV behind in the temp directory.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                # Best effort: a cleanup problem must not mask the result.
                print(f"Warning: could not remove temporary file {tmp_path}: {cleanup_error}")


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LuxTTS Voice Cloning")
    gr.Markdown("Upload a reference audio and enter text to generate speech")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Reference Audio (WAV/MP3)",
                type="numpy",
                sources=["upload", "microphone"]
            )
            text_input = gr.Textbox(
                label="Text to Generate",
                placeholder="Enter the text you want to generate as speech",
                lines=3
            )

            # Parameter controls
            gr.Markdown("### Generation Parameters")
            rms_slider = gr.Slider(
                minimum=0.001,
                maximum=0.1,
                value=0.01,
                step=0.001,
                label="RMS (Volume normalization)",
                info="Root Mean Square for audio normalization (default: 0.01)"
            )
            num_steps_slider = gr.Slider(
                minimum=1,
                maximum=20,
                value=4,
                step=1,
                label="Number of Steps",
                info="Inference steps - higher values may improve quality but take longer (default: 4)"
            )
            t_shift_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="T-Shift",
                info="Temperature shift parameter (default: 0.9)"
            )

            submit_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech (WAV)",
                type="numpy"
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Handle button click
    submit_btn.click(
        fn=generate,
        inputs=[audio_input, text_input, rms_slider, num_steps_slider, t_shift_slider],
        outputs=[audio_output, status_output]
    )

    # Usage instructions
    gr.Markdown("""
    ## How to Use
    1. **Reference Audio**: Upload a WAV or MP3 file of the voice you want to clone, or record from microphone
    2. **Text Input**: Enter the text you want to generate as speech (English recommended)
    3. **Adjust Parameters** (optional):
       - **RMS**: Controls volume normalization (0.001-0.1, default: 0.01)
       - **Number of Steps**: Inference quality/speed tradeoff (1-20, default: 4)
       - **T-Shift**: Temperature parameter affecting generation (0.0-1.0, default: 0.9)
    4. Click the **Generate Speech** button
    5. Once processing is complete, the generated audio will appear on the right

    **Notes**:
    - First run may take ~10 seconds for audio encoding initialization
    - Clear reference audio with minimal background noise is recommended
    - Use `device='cuda'` for GPU or `device='cpu'` for CPU in the code
    - Higher num_steps values may improve quality but increase processing time
    """)

if __name__ == "__main__":
    demo.launch()