From 4de9b29c00c0cdd8b8c1f38ad69ca71c939be501 Mon Sep 17 00:00:00 2001
From: nateraw <nxr9266@g.rit.edu>
Date: Tue, 25 Oct 2022 20:44:25 +0000
Subject: [PATCH] :sparkles: add some example scripts

---
 .gitignore                      |   1 -
 examples/make_music_video.py    |  60 ++++++
 examples/run_app.py             |  21 ++
 examples/run_music_video_app.py | 365 ++++++++++++++++++++++++++++++++
 4 files changed, 446 insertions(+), 1 deletion(-)
 create mode 100644 examples/make_music_video.py
 create mode 100644 examples/run_app.py
 create mode 100644 examples/run_music_video_app.py

diff --git a/.gitignore b/.gitignore
index 38a8e6f..ae91fbf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,5 +132,4 @@ dmypy.json
 dreams
 images
 run.py
-examples
 test_outputs
\ No newline at end of file
diff --git a/examples/make_music_video.py b/examples/make_music_video.py
new file mode 100644
index 0000000..5019530
--- /dev/null
+++ b/examples/make_music_video.py
@@ -0,0 +1,60 @@
+from stable_diffusion_videos import StableDiffusionWalkPipeline
+
+from diffusers.models import AutoencoderKL
+from diffusers.schedulers import LMSDiscreteScheduler
+import torch
+
+
+pipe = StableDiffusionWalkPipeline.from_pretrained(
+    'runwayml/stable-diffusion-v1-5',
+    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
+    torch_dtype=torch.float16,
+    revision="fp16",
+    safety_checker=None,
+    scheduler=LMSDiscreteScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+    )
+).to("cuda")
+
+
+# I give you permission to scrape this song :)
+# youtube-dl -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 -o "music/thoughts.%(ext)s" https://soundcloud.com/nateraw/thoughts
+audio_filepath = 'music/thoughts.mp3'
+
+# Seconds in the song. Here we slice the audio from 0:11-0:14
+# Should be same length as prompts/seeds.
+audio_offsets = [7, 10, 13, 16]
+
+# Output video frames per second.
+# Use lower values for testing (5 or 10), higher values for better quality (30 or 60)
+fps = 25
+
+# Convert seconds to frames
+# This array should be `len(prompts) - 1` as its steps between prompts.
+num_interpolation_steps = [(b-a) * fps for a, b in zip(audio_offsets, audio_offsets[1:])]
+
+prompts = [
+    'Baroque oil painting anime key visual concept art of wanderer above the sea of fog 1 8 1 8 with anime maid, brutalist, dark fantasy, rule of thirds golden ratio, fake detail, trending pixiv fanbox, acrylic palette knife, style of makoto shinkai studio ghibli genshin impact jamie wyeth james gilleard greg rutkowski chiho aoshima',
+    'the conscious mind entering the dark wood window into the surreal subconscious dream mind, majestic, dreamlike, surrealist, trending on artstation, by gustavo dore ',
+    'Chinese :: by martine johanna and simon stålenhag and chie yoshii and casey weldon and wlop :: ornate, dynamic, particulate, rich colors, intricate, elegant, highly detailed, centered, artstation, smooth, sharp focus, octane render, 3d',
+    'Chinese :: by martine johanna and simon stålenhag and chie yoshii and casey weldon and wlop :: ornate, dynamic, particulate, rich colors, intricate, elegant, highly detailed, centered, artstation, smooth, sharp focus, octane render, 3d',
+]
+seeds = [
+    6954010,
+    8092009,
+    1326004,
+    5019608,
+]
+pipe.walk(
+    prompts=prompts,
+    seeds=seeds,
+    num_interpolation_steps=num_interpolation_steps,
+    fps=fps,
+    audio_filepath=audio_filepath,
+    audio_start_sec=audio_offsets[0],
+    batch_size=16,
+    num_inference_steps=50,
+    guidance_scale=15,
+    margin=1.0,
+    smooth=0.2,
+)
\ No newline at end of file
diff --git a/examples/run_app.py b/examples/run_app.py
new file mode 100644
index 0000000..a16c428
--- /dev/null
+++ b/examples/run_app.py
@@ -0,0 +1,21 @@
+from stable_diffusion_videos import StableDiffusionWalkPipeline, Interface
+
+from diffusers.models import AutoencoderKL
+from diffusers.schedulers import LMSDiscreteScheduler
+import torch
+
+pipe = StableDiffusionWalkPipeline.from_pretrained(
+    'runwayml/stable-diffusion-v1-5',
+    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
+    torch_dtype=torch.float16,
+    revision="fp16",
+    safety_checker=None,
+    scheduler=LMSDiscreteScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+    )
+).to("cuda")
+
+interface = Interface(pipe)
+
+if __name__ == '__main__':
+    interface.launch(debug=True)
diff --git a/examples/run_music_video_app.py b/examples/run_music_video_app.py
new file mode 100644
index 0000000..80d0cbe
--- /dev/null
+++ b/examples/run_music_video_app.py
@@ -0,0 +1,365 @@
+# Experimental app to help with the process of generating music videos
+# Requires youtube-dl to be installed
+# pip install youtube-dl
+
+import gradio as gr
+import librosa
+from pathlib import Path
+import numpy as np
+import random
+from io import BytesIO
+import soundfile as sf
+from matplotlib import pyplot as plt
+
+from stable_diffusion_videos import StableDiffusionWalkPipeline, generate_images, get_timesteps_arr
+
+from diffusers.models import AutoencoderKL
+from diffusers.schedulers import LMSDiscreteScheduler
+import torch
+import youtube_dl
+import os
+
+pipe = StableDiffusionWalkPipeline.from_pretrained(
+    'runwayml/stable-diffusion-v1-5',
+    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
+    torch_dtype=torch.float16,
+    revision="fp16",
+    safety_checker=None,
+    scheduler=LMSDiscreteScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
+    )
+).to("cuda")
+
+
+def download_example_clip(url, output_dir='./', output_filename='%(title)s.%(ext)s'):
+    if (Path(output_dir) / output_filename).exists():
+        return str(Path(output_dir) / output_filename)
+
+    files_before = os.listdir(output_dir) if os.path.exists(output_dir) else []
+    ydl_opts = {
+        'outtmpl': str(Path(output_dir) / output_filename),
+        'format': 'bestaudio',
+        'extract-audio': True,
+        'audio-format': 'mp3',
+        'audio-quality': 0,
+    }
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([url])
+    
+    files_after = os.listdir(output_dir)
+    return str(Path(output_dir) / list(set(files_after) - set(files_before))[0])
+    
+def audio_data_to_buffer(y, sr):
+    audio_filepath = BytesIO()
+    audio_filepath.name = 'audio.wav'
+    sf.write(audio_filepath, y, samplerate=sr, format='WAV')
+    audio_filepath.seek(0)
+    return audio_filepath
+
+
+def plot_array(y):
+    fig = plt.figure()
+    x = np.arange(y.shape[0]) 
+    plt.title("Line graph") 
+    plt.xlabel("X axis") 
+    plt.ylabel("Y axis") 
+    plt.plot(x, y, color ="red") 
+    plt.savefig('timesteps_chart.png')
+    return fig
+
+def on_slice_btn_click(audio, audio_start_sec, duration, fps, smooth, margin):
+    if audio is None:
+        return [
+            gr.update(visible=False),
+            gr.update(visible=False),
+        ]
+
+    y, sr = librosa.load(audio, offset=audio_start_sec, duration=duration)
+    T = get_timesteps_arr(
+        audio_data_to_buffer(y, sr),
+        0,
+        duration,
+        fps=fps,
+        margin=margin,
+        smooth=smooth,
+    )
+    return [gr.update(value=(sr, y), visible=True), gr.update(value=plot_array(T), visible=True)]
+
+def on_audio_change_or_clear(audio):
+    if audio is None:
+        return [
+            gr.update(visible=False),
+            gr.update(visible=False)
+        ]
+    
+    duration = librosa.get_duration(filename=audio)
+    return [
+        gr.update(maximum=int(duration), visible=True),
+        gr.update(maximum=int(min(10, duration)), visible=True)
+    ]
+
+def on_update_weight_settings_btn_click(sliced_audio, duration, fps, smooth, margin):
+    if sliced_audio is None:
+        return gr.update(visible=False)
+
+    T = get_timesteps_arr(
+        sliced_audio,
+        0,
+        duration,
+        fps=fps,
+        margin=margin,
+        smooth=smooth,
+    )
+    return gr.update(value=plot_array(T), visible=True)
+
+
+def on_generate_images_btn_click(
+    prompt_a,
+    prompt_b,
+    seed_a,
+    seed_b,
+    output_dir,
+    num_inference_steps,
+    guidance_scale,
+    height,
+    width,
+    upsample,
+):
+    output_dir = Path(output_dir) / 'images'
+
+    if seed_a == -1:
+        seed_a = random.randint(0, 9999999)
+    if seed_b == -1:
+        seed_b = random.randint(0, 9999999)
+
+    image_a_fpath = generate_images(
+        pipe,
+        prompt_a,
+        seeds=[seed_a],
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        height=height,
+        width=width,
+        upsample=upsample,
+        output_dir=output_dir
+    )[0]
+    image_b_fpath = generate_images(
+        pipe,
+        prompt_b,
+        seeds=[seed_b],
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        height=height,
+        width=width,
+        upsample=upsample,
+        output_dir=output_dir
+    )[0]
+
+    return [
+        gr.update(value=image_a_fpath, visible=True),
+        gr.update(value=image_b_fpath, visible=True),
+        gr.update(value=seed_a),
+        gr.update(value=seed_b),
+    ]
+
+def on_generate_music_video_btn_click(
+    audio_filepath,
+    audio_start_sec,
+    duration,
+    fps,
+    smooth,
+    margin,
+    prompt_a,
+    prompt_b,
+    seed_a,
+    seed_b,
+    batch_size,
+    output_dir,
+    num_inference_steps,
+    guidance_scale,
+    height,
+    width,
+    upsample,
+):
+
+    if audio_filepath is None:
+        return gr.update(visible=False)
+
+    video_filepath = pipe.walk(
+        prompts=[prompt_a, prompt_b],
+        seeds=[seed_a, seed_b],
+        num_interpolation_steps=int(duration * fps),
+        output_dir=output_dir,
+        fps=fps,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        height=height,
+        width=width,
+        upsample=upsample,
+        batch_size=batch_size,
+        audio_filepath=audio_filepath,
+        audio_start_sec=audio_start_sec,
+        margin=margin,
+        smooth=smooth,
+    )
+    return gr.update(value=video_filepath, visible=True)
+
+
+audio_start_sec = gr.Slider(0, 10, 0, step=1, label="Start (sec)", interactive=True)
+duration = gr.Slider(0, 10, 1, step=1, label="Duration (sec)", interactive=True)
+slice_btn = gr.Button("Slice Audio")
+
+sliced_audio = gr.Audio(type='filepath')
+wav_plot = gr.Plot(label="Interpolation Weights Per Frame")
+
+fps = gr.Slider(1, 60, 12, step=1, label="FPS", interactive=True)
+smooth = gr.Slider(0, 1, 0.0, label="Smoothing", interactive=True)
+margin = gr.Slider(1.0, 20.0, 1.0, step=0.5, label="Margin Max", interactive=True)
+update_weight_settings_btn = gr.Button("Update Interpolation Weights")
+
+prompt_a = gr.Textbox(value='blueberry spaghetti', label="Prompt A")
+prompt_b = gr.Textbox(value='strawberry spaghetti', label="Prompt B")
+seed_a = gr.Number(-1, label="Seed A", precision=0, interactive=True)
+seed_b = gr.Number(-1, label="Seed B", precision=0, interactive=True)
+generate_images_btn = gr.Button("Generate Images")
+image_a = gr.Image(visible=False, label="Image A")
+image_b = gr.Image(visible=False, label="Image B")
+
+batch_size = gr.Slider(1, 32, 1, step=1, label="Batch Size", interactive=True)
+generate_music_video_btn = gr.Button("Generate Music Video")
+video = gr.Video(visible=False, label="Video")
+
+STEP_1_MARKDOWN = """
+## 1. Upload Some Audio
+
+Upload an audio file to use as the source for the music video.
+"""
+
+STEP_2_MARKDOWN = """
+## 2. Slice Portion of Audio for Generated Clip
+
+Here you can slice a portion of the audio to use for the generated music video. The longer the audio, the more frames will be generated (which will take longer).
+
+I suggest you use this app to make music videos in segments of 5-10 seconds at a time. Then, you can stitch the videos together using a video editor or ffmpeg later.
+
+**Warning**: If your audio file is short, I do no check that the duration you chose is not longer than the audio. It may cause some issues, so just be mindful of that.
+"""
+
+STEP_3_MARKDOWN = """
+## 3. Set Interpolation Weight Settings
+
+This section lets you play with the settings used to configure how we move through the latent space given the audio you sliced.
+
+If you look at the graph on the right, you'll see in the X-axis how many frames. The Y-axis is the weight of Image A as we move through the latent space.
+
+If you listen to the audio slice and look at the graph, you should see bumps at points where the audio energy is high (in our case, percussive energy).
+"""
+
+STEP_4_MARKDOWN = """
+## 4. Select Prompts, Seeds, Settings, and Generate Images
+
+Here you can select the settings for image generation.
+
+Then, you can select prompts and seeds for generating images.
+
+  - Image A will be first frame of the generated video.
+  - Image B will be last frame of the generated video.
+  - The video will be generated by interpolating between the two images using the audio you provided.
+
+If you set the seeds to -1, a random seed will be used and saved for you, so you can explore different images given the same prompt.
+"""
+
+
+with gr.Blocks() as demo:
+    gr.Markdown(STEP_1_MARKDOWN)
+    audio = gr.Audio(type='filepath', interactive=True)
+    gr.Examples(
+        [
+            download_example_clip(
+                url='https://soundcloud.com/nateraw/thoughts',
+                output_dir='./music',
+                output_filename='thoughts.mp3'
+            )
+        ],
+        inputs=audio,
+        outputs=[audio_start_sec, duration],
+        fn=on_audio_change_or_clear,
+        cache_examples=True
+    )
+    audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
+    audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])
+
+    gr.Markdown(STEP_2_MARKDOWN)
+    audio_start_sec.render()
+    duration.render()
+    slice_btn.render()
+
+    slice_btn.click(on_slice_btn_click, [audio, audio_start_sec, duration, fps, smooth, margin], [sliced_audio, wav_plot])
+    sliced_audio.render()
+
+    gr.Markdown(STEP_3_MARKDOWN)
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            fps.render()
+            smooth.render()
+            margin.render()
+            update_weight_settings_btn.render()
+            update_weight_settings_btn.click(
+                on_update_weight_settings_btn_click,
+                [sliced_audio, duration, fps, smooth, margin],
+                wav_plot
+            )
+        with gr.Column(scale=3):
+            wav_plot.render()
+
+    gr.Markdown(STEP_4_MARKDOWN)
+    
+    with gr.Accordion("Additional Settings", open=False):
+        output_dir = gr.Textbox(value='./dreams', label="Output Directory")
+        num_inference_steps = gr.Slider(1, 200, 50, step=10, label="Diffusion Inference Steps", interactive=True)
+        guidance_scale = gr.Slider(1.0, 25.0, 7.5, step=0.5, label="Guidance Scale", interactive=True)
+        height = gr.Slider(512, 1024, 512, step=64, label="Height", interactive=True)
+        width = gr.Slider(512, 1024, 512, step=64, label="Width", interactive=True)
+        upsample = gr.Checkbox(value=False, label="Upsample with Real-ESRGAN")
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            prompt_a.render()
+        with gr.Column(scale=1):
+            seed_a.render()
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            prompt_b.render()
+        with gr.Column(scale=1):
+            seed_b.render()
+
+    generate_images_btn.render()
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_a.render()
+        with gr.Column(scale=1):
+            image_b.render()
+
+    generate_images_btn.click(
+        on_generate_images_btn_click,
+        [prompt_a, prompt_b, seed_a, seed_b, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
+        [image_a, image_b, seed_a, seed_b]
+    )
+
+    gr.Markdown("## 5. Generate Music Video")
+    # TODO - add equivalent code snippet to generate music video
+    batch_size.render()
+    generate_music_video_btn.render()
+    generate_music_video_btn.click(
+        on_generate_music_video_btn_click,
+        [audio, audio_start_sec, duration, fps, smooth, margin, prompt_a, prompt_b, seed_a, seed_b, batch_size, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
+        video
+    )
+    video.render()
+
+
+if __name__ == '__main__':
+    demo.launch(debug=True)