Merge pull request #65 from nateraw/pipeline-refactor

🎨 big restructure

nateraw authored Oct 7, 2022
2 parents d44b066 + ed95d7c commit 4e98524
Showing 8 changed files with 773 additions and 498 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -131,4 +131,5 @@ dmypy.json
# Extra stuff to ignore
dreams
images
run.py
examples
72 changes: 59 additions & 13 deletions README.md
@@ -45,21 +45,69 @@ huggingface-cli login
#### Programmatic Usage

```python
from stable_diffusion_videos import StableDiffusionWalkPipeline
from diffusers.schedulers import LMSDiscreteScheduler
import torch

pipeline = StableDiffusionWalkPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=True,
    torch_dtype=torch.float16,
    revision="fp16",
    scheduler=LMSDiscreteScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
    )
).to("cuda")

video_path = pipeline.walk(
    prompts=['a cat', 'a dog'],
    seeds=[42, 1337],
    num_interpolation_steps=3,   # Change to 60-200 for better results...3-5 for testing
    height=512,                  # use multiples of 64 if > 512. Multiples of 8 if < 512.
    width=512,                   # use multiples of 64 if > 512. Multiples of 8 if < 512.
    output_dir='dreams',         # Where images/videos will be saved
    name='animals_test',         # Subdirectory of output_dir where images/videos will be saved
    guidance_scale=8.5,          # Higher adheres to prompt more, lower lets model take the wheel
    num_inference_steps=50,      # Number of diffusion steps per image generated. 50 is good default
)
```
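`pipeline.walk` returns the filepath of the rendered video; the interpolated frames and the video itself are written under `output_dir/name` (here, `dreams/animals_test`).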

*New!* Music can be added to the video by providing a path to an audio file. The audio will inform the rate of interpolation so the videos move to the beat 🎶

```python
from stable_diffusion_videos import StableDiffusionWalkPipeline
from diffusers.schedulers import LMSDiscreteScheduler
import torch

pipeline = StableDiffusionWalkPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
use_auth_token=True,
torch_dtype=torch.float16,
revision="fp16",
scheduler=LMSDiscreteScheduler(
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
)
).to("cuda")


# Seconds in the song.
audio_offsets = [146, 148]
fps = 30 # Use lower values for testing (5 or 10), higher values for better quality (30 or 60)

# Convert seconds to frames
num_interpolation_steps = [(b-a) * fps for a, b in zip(audio_offsets, audio_offsets[1:])]

video_path = pipeline.walk(
prompts=['a cat', 'a dog'],
seeds=[42, 1337],
num_interpolation_steps=num_interpolation_steps,
audio_filepath='audio.mp3',
audio_start_sec=audio_offsets[0],
height=512, # use multiples of 64 if > 512. Multiples of 8 if < 512.
width=512, # use multiples of 64 if > 512. Multiples of 8 if < 512.
output_dir='dreams', # Where images/videos will be saved
guidance_scale=7.5, # Higher adheres to prompt more, lower lets model take the wheel
num_inference_steps=50, # Number of diffusion steps per image generated. 50 is good default
)
```
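Each entry of `num_interpolation_steps` here is the number of frames generated between consecutive audio offsets: (148 − 146) seconds × 30 fps = 60 frames, so the transition between the two prompts spans exactly that two-second slice of the song.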

@@ -97,9 +145,7 @@ pip install realesrgan
Then, you'll be able to use `upsample=True` in the `walk` method, like this:

```python
pipeline.walk(['a cat', 'a dog'], [234, 345], upsample=True)
```

The above may cause you to run out of VRAM. No problem, you can do upsampling separately.
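For example, you can render the frames without upsampling first and run Real-ESRGAN over them afterwards. A minimal sketch based on the `RealESRGANModel` usage shown in `app.py` below (the frame path is hypothetical, and the model's call signature may differ between versions):

```python
import torch
from PIL import Image

from stable_diffusion_videos.upsampling import RealESRGANModel

# Load the Real-ESRGAN upsampler on its own, using the same weights app.py uses.
upsampler = RealESRGANModel.from_pretrained("nateraw/real-esrgan")
upsampler.to("cuda" if torch.cuda.is_available() else "cpu")

# Upsample a frame saved by a previous walk() run (hypothetical path);
# app.py calls the model directly on an image in the same way.
frame = Image.open("dreams/animals_test/frame000000.png")
upscaled = upsampler(frame)
```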
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,5 +1,7 @@
transformers
diffusers==0.3.0
diffusers==0.4.0
scipy
fire
gradio
librosa
av
22 changes: 7 additions & 15 deletions stable_diffusion_videos/__init__.py
@@ -61,9 +61,7 @@ def _attach(package_name, submodules=None, submod_attrs=None):
else:
submodules = set(submodules)

attr_to_modules = {
attr: mod for mod, attrs in submod_attrs.items() for attr in attrs
}
attr_to_modules = {attr: mod for mod, attrs in submod_attrs.items() for attr in attrs}

__all__ = list(submodules | attr_to_modules.keys())

@@ -96,28 +94,22 @@ def __dir__():
return __getattr__, __dir__, list(__all__)



__getattr__, __dir__, __all__ = _attach(
__name__,
submodules=[],
submod_attrs={
"commands.user": ["notebook_login"],
"app": [
"interface",
"pipeline",
],
"stable_diffusion_pipeline": [
"StableDiffusionPipeline",
"StableDiffusionWalkPipeline",
"NoCheck",
"make_video_pyav",
"get_timesteps_arr",
],
"stable_diffusion_walk": [
"walk",
"SCHEDULERS",
"pipeline",
],
"upsampling": [
"PipelineRealESRGAN"
]
"upsampling": ["PipelineRealESRGAN"],
},
)

__version__ = "0.4.0"
__version__ = "0.5.0"
57 changes: 21 additions & 36 deletions stable_diffusion_videos/app.py
@@ -3,48 +3,48 @@
import gradio as gr
import torch

from .stable_diffusion_walk import SCHEDULERS, pipeline, walk
from .stable_diffusion_pipeline import StableDiffusionWalkPipeline
from .upsampling import RealESRGANModel

pipeline = StableDiffusionWalkPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
use_auth_token=True,
torch_dtype=torch.float16,
revision="fp16",
).to("cuda")


def fn_images(
prompt,
seed,
scheduler,
guidance_scale,
num_inference_steps,
disable_tqdm,
upsample,
):
if upsample:
from .upsampling import PipelineRealESRGAN

upsampling_pipeline = PipelineRealESRGAN.from_pretrained('nateraw/real-esrgan')
if getattr(pipeline, "upsampler", None) is None:
pipeline.upsampler = RealESRGANModel.from_pretrained("nateraw/real-esrgan")
pipeline.upsampler.to(pipeline.device)

pipeline.set_progress_bar_config(disable=disable_tqdm)
pipeline.scheduler = SCHEDULERS[scheduler] # klms, default, ddim
with torch.autocast("cuda"):
img = pipeline(
prompt,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
generator=torch.Generator(device=pipeline.device).manual_seed(seed),
output_type='pil' if not upsample else 'numpy',
output_type="pil" if not upsample else "numpy",
)["sample"][0]
return img if not upsample else upsampling_pipeline(img)
return img if not upsample else pipeline.upsampler(img)


def fn_videos(
prompt_1,
seed_1,
prompt_2,
seed_2,
scheduler,
guidance_scale,
num_inference_steps,
num_interpolation_steps,
do_loop,
disable_tqdm,
use_lerp_for_text,
output_dir,
upsample,
):
@@ -54,20 +54,15 @@ def fn_videos(
prompts = [x for x in prompts if x.strip()]
seeds = seeds[: len(prompts)]

video_path = walk(
do_loop=do_loop,
make_video=True,
video_path = pipeline.walk(
guidance_scale=guidance_scale,
prompts=prompts,
seeds=seeds,
num_interpolation_steps=num_interpolation_steps,
num_inference_steps=num_inference_steps,
use_lerp_for_text=use_lerp_for_text,
output_dir=output_dir,
name=time.strftime("%Y%m%d-%H%M%S"),
scheduler=scheduler,
disable_tqdm=disable_tqdm,
upsample=upsample
upsample=upsample,
)
return video_path

@@ -76,21 +71,15 @@ def fn_videos(
fn_videos,
inputs=[
gr.Textbox("blueberry spaghetti"),
gr.Number(42, label='Seed 1', precision=0),
gr.Number(42, label="Seed 1", precision=0),
gr.Textbox("strawberry spaghetti"),
gr.Number(42, label='Seed 2', precision=0),
gr.Dropdown(["klms", "ddim", "default"], value="klms"),
gr.Number(42, label="Seed 2", precision=0),
gr.Slider(0.0, 20.0, 8.5),
gr.Slider(1, 200, 50),
gr.Slider(3, 240, 10),
gr.Checkbox(False),
gr.Checkbox(False),
gr.Checkbox(True),
gr.Textbox(
"dreams",
placeholder=(
"Folder where outputs will be saved. Each output will be saved in a new folder."
),
placeholder=("Folder where outputs will be saved. Each output will be saved in a new folder."),
),
gr.Checkbox(False),
],
@@ -101,19 +90,15 @@ def fn_videos(
fn_images,
inputs=[
gr.Textbox("blueberry spaghetti"),
gr.Number(42, label='Seed', precision=0),
gr.Dropdown(["klms", "ddim", "default"], value="klms"),
gr.Number(42, label="Seed", precision=0),
gr.Slider(0.0, 20.0, 8.5),
gr.Slider(1, 200, 50),
gr.Checkbox(False),
gr.Checkbox(False),
],
outputs=gr.Image(type="pil"),
)

interface = gr.TabbedInterface(
[interface_images, interface_videos], ["Images!", "Videos!"]
)
interface = gr.TabbedInterface([interface_images, interface_videos], ["Images!", "Videos!"])

if __name__ == "__main__":
interface.launch(debug=True)
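With this layout, the Gradio app can be launched through the `interface` object that `__init__.py` re-exports; a minimal sketch (importing `interface` builds the pipeline at import time, so it assumes a CUDA GPU and a prior `huggingface-cli login`):

```python
# Launch the Gradio demo defined in app.py.
from stable_diffusion_videos import interface

if __name__ == "__main__":
    interface.launch()
```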