From b25eb57a2dbd1b6c6c855e9aa2b3e6e70fde504f Mon Sep 17 00:00:00 2001
From: philgzl <services@philgzl.com>
Date: Tue, 24 Jan 2023 15:50:05 +0100
Subject: [PATCH 1/6] Add make video script

---
 scripts/make_video.py | 152 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 scripts/make_video.py

diff --git a/scripts/make_video.py b/scripts/make_video.py
new file mode 100644
index 0000000..605378a
--- /dev/null
+++ b/scripts/make_video.py
@@ -0,0 +1,152 @@
+import argparse
+import random
+
+import torch
+import yaml
+
+from diffusers import DPMSolverMultistepScheduler
+from stable_diffusion_videos import StableDiffusionWalkPipeline
+
+
+def init_arg_parser():  # build the argparse parser for the make_video CLI
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    # options with no explicit default parse to None (False for --upsample)
+    parser.add_argument('--checkpoint_id',
+                        default="stabilityai/stable-diffusion-2-1",
+                        help="checkpoint id on huggingface")
+    parser.add_argument('--prompts', nargs='+',
+                        help='sequence of prompts')
+    parser.add_argument('--seeds', type=int, nargs='+',
+                        help='seed for each prompt')
+    parser.add_argument('--num_interpolation_steps', type=int, nargs='+',
+                        help='number of steps between each image')
+    parser.add_argument('--output_dir', default="dreams",
+                        help='output directory')
+    parser.add_argument('--name',
+                        help='output sub-directory')
+    parser.add_argument('--fps', type=int, default=10,
+                        help='frames per second')
+    parser.add_argument('--guidance_scale', type=float, default=7.5,
+                        help='diffusion guidance scale')
+    parser.add_argument('--num_inference_steps', type=int, default=50,
+                        help='number of diffusion inference steps')
+    parser.add_argument('--height', type=int, default=512,
+                        help='output image height')
+    parser.add_argument('--width', type=int, default=512,
+                        help='output image width')
+    parser.add_argument('--upsample', action='store_true',
+                        help='upscale x4 using Real-ESRGAN')
+    parser.add_argument('--batch_size', type=int, default=1,
+                        help='batch size')
+    parser.add_argument('--audio_filepath',
+                        help='path to audio file')
+    parser.add_argument('--audio_offsets', type=int, nargs='+',
+                        help='audio offset for each prompt')
+    parser.add_argument('--negative_prompt',
+                        help='negative prompt (one for all images)')
+    # keys of this yaml file replace the matching parsed options in parse_args
+    parser.add_argument('--cfg',
+                        help='yaml config file (overwrites other options)')
+
+    return parser
+
+
+def parse_args(parser):
+    """Parse CLI options, apply the yaml config, validate and fill defaults."""
+    args = parser.parse_args()
+
+    # config values replace the parsed CLI values; unknown keys are rejected
+    if args.cfg is not None:
+        with open(args.cfg) as f:
+            cfg = yaml.safe_load(f) or {}  # an empty file loads as None
+        for key, val in cfg.items():
+            if not hasattr(args, key):
+                raise ValueError(f'bad field in config file: {key}')
+            setattr(args, key, val)
+
+    # prompts are required; seeds default to one random 16-bit int per prompt
+    if args.prompts is None:
+        raise ValueError('no prompt provided')
+    if args.seeds is None:
+        args.seeds = [random.getrandbits(16) for _ in args.prompts]
+
+    # audio_filepath and audio_offsets must be given together
+    if args.audio_filepath is not None and args.audio_offsets is None:
+        raise ValueError('must provide audio_offsets when providing '
+                         'audio_filepath')
+    if args.audio_offsets is not None and args.audio_filepath is None:
+        raise ValueError('must provide audio_filepath when providing '
+                         'audio_offsets')
+
+    # one seed (and one audio offset, when given) per prompt
+    if args.audio_offsets is not None:
+        if not len(args.prompts) == len(args.seeds) == len(args.audio_offsets):
+            raise ValueError('prompts, seeds and audio_offsets must have same '
+                             f'length, got lengths {len(args.prompts)}, '
+                             f'{len(args.seeds)} and '
+                             f'{len(args.audio_offsets)} respectively')
+    else:
+        if not len(args.prompts) == len(args.seeds):
+            raise ValueError('prompts and seeds must have same length, got '
+                             f'lengths {len(args.prompts)} and '
+                             f'{len(args.seeds)} respectively')
+
+    # set num_interpolation_steps if audio_offsets is provided
+    if args.audio_offsets is not None \
+            and args.num_interpolation_steps is not None:
+        raise ValueError('cannot provide both audio_offsets and '
+                         'num_interpolation_steps')
+    elif args.audio_offsets is not None:
+        args.num_interpolation_steps = [
+            (b-a)*args.fps for a, b in zip(
+                args.audio_offsets, args.audio_offsets[1:]
+            )
+        ]
+    elif args.num_interpolation_steps is not None \
+            and not len(args.num_interpolation_steps) == len(args.prompts)-1:
+        raise ValueError('num_interpolation_steps must have length '
+                         f'len(prompts)-1, got '
+                         f'{len(args.num_interpolation_steps)} != '
+                         f'{len(args.prompts)-1}')
+
+    return args
+
+
+def main():  # entry point: parse the options, build the pipeline, call walk()
+    parser = init_arg_parser()
+    args = parse_args(parser)
+    # load fp16 weights on the GPU; no feature extractor or safety checker
+    pipe = StableDiffusionWalkPipeline.from_pretrained(
+        args.checkpoint_id,
+        torch_dtype=torch.float16,
+        revision="fp16",
+        feature_extractor=None,
+        safety_checker=None,
+    ).to("cuda")
+    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+        pipe.scheduler.config
+    )
+    # forward the parsed options to the pipeline's walk() call
+    pipe.walk(
+        prompts=args.prompts,
+        seeds=args.seeds,
+        num_interpolation_steps=args.num_interpolation_steps,
+        output_dir=args.output_dir,
+        name=args.name,
+        fps=args.fps,
+        num_inference_steps=args.num_inference_steps,
+        guidance_scale=args.guidance_scale,
+        height=args.height,
+        width=args.width,
+        upsample=args.upsample,
+        batch_size=args.batch_size,
+        audio_filepath=args.audio_filepath,
+        audio_start_sec=args.audio_offsets,
+        negative_prompt=args.negative_prompt,
+    )
+
+
+if __name__ == '__main__':
+    main()

From 3342302ec0d71d909885abcbafe47ca5cf2581a9 Mon Sep 17 00:00:00 2001
From: philgzl <services@philgzl.com>
Date: Tue, 24 Jan 2023 15:50:32 +0100
Subject: [PATCH 2/6] Remove deprecated music video script

---
 examples/make_music_video.py | 60 ------------------------------------
 1 file changed, 60 deletions(-)
 delete mode 100644 examples/make_music_video.py

diff --git a/examples/make_music_video.py b/examples/make_music_video.py
deleted file mode 100644
index 912b369..0000000
--- a/examples/make_music_video.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from stable_diffusion_videos import StableDiffusionWalkPipeline
-
-from diffusers.models import AutoencoderKL
-from diffusers.schedulers import LMSDiscreteScheduler
-import torch
-
-
-pipe = StableDiffusionWalkPipeline.from_pretrained(
-    'runwayml/stable-diffusion-v1-5',
-    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
-    torch_dtype=torch.float16,
-    revision="fp16",
-    safety_checker=None,
-    scheduler=LMSDiscreteScheduler(
-        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
-    )
-).to("cuda")
-
-
-# I give you permission to scrape this song :)
-# youtube-dl -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 -o "music/thoughts.%(ext)s" https://soundcloud.com/nateraw/thoughts
-audio_filepath = 'music/thoughts.mp3'
-
-# Seconds in the song. Here we slice the audio from 0:07-0:16
-# Should be same length as prompts/seeds.
-audio_offsets = [7, 10, 13, 16]
-
-# Output video frames per second.
-# Use lower values for testing (5 or 10), higher values for better quality (30 or 60)
-fps = 25
-
-# Convert seconds to frames
-# This array should be `len(prompts) - 1` as its steps between prompts.
-num_interpolation_steps = [(b-a) * fps for a, b in zip(audio_offsets, audio_offsets[1:])]
-
-prompts = [
-    'Baroque oil painting anime key visual concept art of wanderer above the sea of fog 1 8 1 8 with anime maid, brutalist, dark fantasy, rule of thirds golden ratio, fake detail, trending pixiv fanbox, acrylic palette knife, style of makoto shinkai studio ghibli genshin impact jamie wyeth james gilleard greg rutkowski chiho aoshima',
-    'the conscious mind entering the dark wood window into the surreal subconscious dream mind, majestic, dreamlike, surrealist, trending on artstation, by gustavo dore ',
-    'Chinese :: by martine johanna and simon stålenhag and chie yoshii and casey weldon and wlop :: ornate, dynamic, particulate, rich colors, intricate, elegant, highly detailed, centered, artstation, smooth, sharp focus, octane render, 3d',
-    'Chinese :: by martine johanna and simon stålenhag and chie yoshii and casey weldon and wlop :: ornate, dynamic, particulate, rich colors, intricate, elegant, highly detailed, centered, artstation, smooth, sharp focus, octane render, 3d',
-]
-seeds = [
-    6954010,
-    8092009,
-    1326004,
-    5019608,
-]
-pipe.walk(
-    prompts=prompts,
-    seeds=seeds,
-    num_interpolation_steps=num_interpolation_steps,
-    fps=fps,
-    audio_filepath=audio_filepath,
-    audio_start_sec=audio_offsets[0],
-    batch_size=16,
-    num_inference_steps=50,
-    guidance_scale=15,
-    margin=1.0,
-    smooth=0.2,
-)

From 961a8dbaa27b876b79e904e37b6a46f2c597cd65 Mon Sep 17 00:00:00 2001
From: philgzl <services@philgzl.com>
Date: Tue, 24 Jan 2023 16:15:59 +0100
Subject: [PATCH 3/6] Fix default num_interpolation_steps

---
 scripts/make_video.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/make_video.py b/scripts/make_video.py
index 605378a..f500f17 100644
--- a/scripts/make_video.py
+++ b/scripts/make_video.py
@@ -93,7 +93,7 @@ def parse_args(parser):
                              f'lengths {len(args.prompts)} and '
                              f'{len(args.seeds)} respectively')
 
-    # set num_interpolation_steps if audio_offsets is provided
+    # set num_interpolation_steps
     if args.audio_offsets is not None \
             and args.num_interpolation_steps is not None:
         raise ValueError('cannot provide both audio_offsets and '
@@ -110,6 +110,8 @@ def parse_args(parser):
                          f'len(prompts)-1, got '
                          f'{len(args.num_interpolation_steps)} != '
                          f'{len(args.prompts)-1}')
+    else:
+        args.num_interpolation_steps = 5
 
     return args
 

From 21542ae013c446a135932d50778e141ce8fc2756 Mon Sep 17 00:00:00 2001
From: philgzl <services@philgzl.com>
Date: Tue, 24 Jan 2023 16:26:42 +0100
Subject: [PATCH 4/6] Increase default num_interpolation_steps

---
 scripts/make_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/make_video.py b/scripts/make_video.py
index f500f17..518db67 100644
--- a/scripts/make_video.py
+++ b/scripts/make_video.py
@@ -111,7 +111,7 @@ def parse_args(parser):
                          f'{len(args.num_interpolation_steps)} != '
                          f'{len(args.prompts)-1}')
     else:
-        args.num_interpolation_steps = 5
+        args.num_interpolation_steps = args.fps*10  # 10 second video
 
     return args
 

From 6c261f74a373c465be70b4db2d43457341fe2a2c Mon Sep 17 00:00:00 2001
From: philgzl <services@philgzl.com>
Date: Tue, 24 Jan 2023 19:08:45 +0100
Subject: [PATCH 5/6] Fix audio_start_sec set to args.audio_offsets

---
 scripts/make_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/make_video.py b/scripts/make_video.py
index 518db67..38d62fc 100644
--- a/scripts/make_video.py
+++ b/scripts/make_video.py
@@ -145,7 +145,7 @@ def main():
         upsample=args.upsample,
         batch_size=args.batch_size,
         audio_filepath=args.audio_filepath,
-        audio_start_sec=args.audio_offsets,
+        audio_start_sec=None if args.audio_offsets is None else args.audio_offsets[0],
         negative_prompt=args.negative_prompt,
     )
 

From ce34e15ac8d98ddc3af5daaa58e0323d5be868d9 Mon Sep 17 00:00:00 2001
From: philgzl <services@philgzl.com>
Date: Tue, 24 Jan 2023 19:23:07 +0100
Subject: [PATCH 6/6] Update README.md

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6a7c990..ce1c450 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ video_path = pipeline.walk(
 )
 ```
 
-#### Run the App Locally
+### Run the App Locally
 
 ```python
 from stable_diffusion_videos import StableDiffusionWalkPipeline, Interface
@@ -123,6 +123,13 @@ interface = Interface(pipeline)
 interface.launch()
 ```
 
+### CLI
+
+Videos can also be generated from the command line with `scripts/make_video.py`; run it with `--help` to list all options. Example:
+```bash
+python scripts/make_video.py --prompts "a cat" "a dog" --fps 10
+```
+
 ## Credits
 
 This work built off of [a script](https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355