engine.py

import sys
import os
sys.path.insert(0, os.getcwd()+'/glados_tts')
import pathlib

import torch
from utils.tools import prepare_text
from scipy.io.wavfile import write
import time
import base64

print("\033[1;94mINFO:\033[;97m Initializing TTS Engine...")

# Select the device
if torch.is_vulkan_available():
    device = 'vulkan'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load models
if __name__ == "__main__":
    glados = torch.jit.load('models/glados.pt')
    vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device)
else:
    glados = torch.jit.load('glados_tts/models/glados.pt')
    vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device)

# Prepare models in RAM
for i in range(4):
    init = glados.generate_jit(prepare_text(str(i)))
    init_mel = init['mel_post'].to(device)
    init_vo = vocoder(init_mel)


def glados_tts(text, key=False):

    # Tokenize, clean and phonemize input text
    x = prepare_text(text).to('cpu')

    with torch.no_grad():

        # Generate generic TTS-output
        old_time = time.time()
        tts_output = glados.generate_jit(x)

        # Use HiFiGAN as vocoder to make output sound like GLaDOS
        mel = tts_output['mel_post'].to(device)
        audio = vocoder(mel)
        print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.")

        # Normalize audio to fit in wav-file
        # make audo directory if it doesnt exist
        audio = audio.squeeze()
        audio = audio * 32768.0
        audio = audio.cpu().numpy().astype('int16')
        if(key):
            output_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav')
        else:
            output_file = ('audio/GLaDOS-tts-temp-output.wav')
        pathlib.Path('audio/').mkdir(parents=True, exist_ok=True)
        # Write audio file to disk
        # 22,05 kHz sample rate 
        write(output_file, 22050, audio)

    return True


# If the script is run directly, assume remote engine
if __name__ == "__main__":
    
    # Remote Engine Veritables
    PORT = 8124
    CACHE = True

    from flask import Flask, request, send_file
    import urllib.parse
    import shutil
    
    print("\033[1;94mINFO:\033[;97m Initializing TTS Server...")
    
    app = Flask(__name__)

    @app.route('/synthesize/', defaults={'text': ''})
    @app.route('/synthesize/<path:text>')
    def synthesize(text):
        if(text == ''): return 'No input'
        
        line = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:])
        line = base64.b64decode(line).decode('utf8')
        filename = "GLaDOS-tts-"+line.replace(" ", "-")
        filename = filename.replace("!", "")
        filename = filename.replace("°c", "degrees celcius")
        filename = filename.replace(",", "")+".wav"
        file = os.getcwd()+'/audio/'+filename
        
        # Check for Local Cache
        if(os.path.isfile(file)):
        
            # Update access time. This will allow for routine cleanups
            os.utime(file, None)
            print("\033[1;94mINFO:\033[;97m The audio sample sent from cache.")
            return send_file(file)
            
        # Generate New Sample
        key = str(time.time())[7:]
        if(glados_tts(line, key)):
            tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav'
                        
            # If the line isn't too long, store in cache
            if(len(line) < 200 and CACHE):
                shutil.move(tempfile, file)
            else:
                return send_file(tempfile)
                os.remove(tempfile)
                
            return send_file(file)
                
        else:
            return 'TTS Engine Failed'
            
    cli = sys.modules['flask.cli']
    cli.show_server_banner = lambda *x: None
    app.run(host="0.0.0.0", port=PORT)