Add CLI script #153
base: main
Changes from all commits
b25eb57
3342302
961a8db
21542ae
6c261f7
ce34e15
@@ -0,0 +1,154 @@
import argparse
import random

import torch
import yaml

from diffusers import DPMSolverMultistepScheduler
from stable_diffusion_videos import StableDiffusionWalkPipeline


def init_arg_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--checkpoint_id',
                        default="stabilityai/stable-diffusion-2-1",
                        help="checkpoint id on huggingface")
    parser.add_argument('--prompts', nargs='+',
                        help='sequence of prompts')
    parser.add_argument('--seeds', type=int, nargs='+',
                        help='seed for each prompt')
    parser.add_argument('--num_interpolation_steps', type=int, nargs='+',
                        help='number of steps between each image')
    parser.add_argument('--output_dir', default="dreams",
                        help='output directory')
    parser.add_argument('--name',
                        help='output sub-directory')
    parser.add_argument('--fps', type=int, default=10,
                        help='frames per second')
    parser.add_argument('--guidance_scale', type=float, default=7.5,
                        help='diffusion guidance scale')
    parser.add_argument('--num_inference_steps', type=int, default=50,
                        help='number of diffusion inference steps')
    parser.add_argument('--height', type=int, default=512,
                        help='output image height')
    parser.add_argument('--width', type=int, default=512,
                        help='output image width')
    parser.add_argument('--upsample', action='store_true',
                        help='upscale x4 using Real-ESRGAN')
    parser.add_argument('--batch_size', type=int, default=1,
                        help='batch size')
    parser.add_argument('--audio_filepath',
                        help='path to audio file')
    parser.add_argument('--audio_offsets', type=int, nargs='+',
                        help='audio offset for each prompt')
    parser.add_argument('--negative_prompt',
                        help='negative prompt (one for all images)')

    parser.add_argument('--cfg',
                        help='yaml config file (overwrites other options)')

    return parser


def parse_args(parser):
    args = parser.parse_args()

    # read config file
    if args.cfg is not None:
        with open(args.cfg) as f:
            cfg = yaml.safe_load(f)
        for key, val in cfg.items():
            if hasattr(args, key):
                setattr(args, key, val)
            else:
                raise ValueError(f'bad field in config file: {key}')

    # check for prompts
    if args.prompts is None:
        raise ValueError('no prompt provided')
    if args.seeds is None:
        args.seeds = [random.getrandbits(16) for _ in args.prompts]
Comment: I've been using randint instead in this scenario, kinda like this though :)
Comment: I think using multiple methods for random numbers seems like a good idea the more I think about it.
Comment: wdym @Atomic-Germ ?
Comment: I rescind my comment, it was a little half-baked.
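A minimal sketch of the `randint` alternative floated above (the prompt list is illustrative, not from the PR); both approaches draw one independent seed per prompt:

```python
import random

prompts = ['a cat', 'a dog', 'a bird']  # illustrative prompts

# random.randint(0, 2**16 - 1) covers the same range as random.getrandbits(16)
seeds = [random.randint(0, 2**16 - 1) for _ in prompts]
```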
    # check audio arguments
    if args.audio_filepath is not None and args.audio_offsets is None:
        raise ValueError('must provide audio_offsets when providing '
                         'audio_filepath')
    if args.audio_offsets is not None and args.audio_filepath is None:
        raise ValueError('must provide audio_filepath when providing '
                         'audio_offsets')

Comment: Makes me wonder if this should just be raised in the pipeline code itself instead of the parser (if it's not already). The same goes for many of the other raised errors in this script.
Comment: Maybe. That's a design question IMO. Do we want to raise errors to the unadvised CLI user as early as possible, while trusting that developers who write their own scripts know what they are doing? Or do we want to raise errors as close to the problematic code / as late as possible, but such that they propagate?
Comment: Agreed. I'm fine with the way you did it here :)
Comment: Reason I say it though is that …
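One way to reconcile the two positions in the thread above would be to factor the check into a helper that both the parser (fail fast for CLI users) and the pipeline (guard library users) could call. A sketch, with a hypothetical helper name:

```python
def validate_audio_args(audio_filepath, audio_offsets):
    """Raise if exactly one of the two audio arguments is provided.

    Hypothetical helper, not part of the PR: the same check could be
    invoked from both the CLI parser and the pipeline entry point.
    """
    if audio_filepath is not None and audio_offsets is None:
        raise ValueError('must provide audio_offsets when providing audio_filepath')
    if audio_offsets is not None and audio_filepath is None:
        raise ValueError('must provide audio_filepath when providing audio_offsets')


# both present or both absent is fine
validate_audio_args('song.mp3', [0, 5, 10])
validate_audio_args(None, None)
```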
    # check lengths
    if args.audio_offsets is not None:
        if not len(args.prompts) == len(args.seeds) == len(args.audio_offsets):
            raise ValueError('prompts, seeds and audio_offsets must have same '
                             f'length, got lengths {len(args.prompts)}, '
                             f'{len(args.seeds)} and '
                             f'{len(args.audio_offsets)} respectively')
    else:
        if not len(args.prompts) == len(args.seeds):
            raise ValueError('prompts and seeds must have same length, got '
                             f'lengths {len(args.prompts)} and '
                             f'{len(args.seeds)} respectively')

    # set num_interpolation_steps
    if args.audio_offsets is not None \
            and args.num_interpolation_steps is not None:
        raise ValueError('cannot provide both audio_offsets and '
                         'num_interpolation_steps')
    elif args.audio_offsets is not None:
        args.num_interpolation_steps = [
            (b - a) * args.fps for a, b in zip(
                args.audio_offsets, args.audio_offsets[1:]
            )
        ]
    elif args.num_interpolation_steps is None:
        args.num_interpolation_steps = args.fps * 10  # 10 second video
    elif len(args.num_interpolation_steps) != len(args.prompts) - 1:
        # note: checking length here (rather than in an else-default branch)
        # avoids silently overwriting a valid user-provided list
        raise ValueError('num_interpolation_steps must have length '
                         f'len(prompts)-1, got '
                         f'{len(args.num_interpolation_steps)} != '
                         f'{len(args.prompts)-1}')

    return args


def main():
    parser = init_arg_parser()
    args = parse_args(parser)

    pipe = StableDiffusionWalkPipeline.from_pretrained(
        args.checkpoint_id,
        torch_dtype=torch.float16,
Comment: I think hardcoding dtype here is also a no-no, I'm afraid. Let's think of a nicer way to infer this.
Comment: Has to support MPS/GPU/TPU.
Comment: On second thought, no TPU, as you'd have to use the other pipeline.
Comment: device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.backends.cuda.is_available() else "cpu", then use to(device) in place of to("cuda") and torch_dtype=torch_dtype.
Comment: Yep, will change that.
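A runnable sketch of the device/dtype inference suggested in the thread above. Note the comment uses `torch.backends.cuda.is_available()`; the standard call is `torch.cuda.is_available()`, which this sketch uses instead. The helper name is hypothetical.

```python
import torch


def pick_device_and_dtype():
    """Infer device and dtype instead of hardcoding cuda/float16 (sketch)."""
    mps = getattr(torch.backends, "mps", None)  # absent on older torch builds
    if torch.cuda.is_available():
        device = "cuda"
    elif mps is not None and mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    # half-precision weights are only worthwhile on an accelerator
    dtype = torch.float16 if device != "cpu" else torch.float32
    return device, dtype


device, dtype = pick_device_and_dtype()
```

In `main()` this would replace the hardcoded values: `torch_dtype=dtype` in `from_pretrained` and `.to(device)` in place of `.to("cuda")`.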
        revision="fp16",
Comment: I think guidance in diffusers these days is erring towards not specifying a revision. Need to check if that only applies to the newest versions, etc. Definitely hardcoding here is a no-no though.
Comment: Sure, will add this as an option.
        feature_extractor=None,
        safety_checker=None,
    ).to("cuda")
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
        pipe.scheduler.config
    )

Comment: Hardcoding this is likely a bad idea too.
Comment: Oops, that slipped my mind, these should be options too.
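A sketch of exposing the scheduler as a CLI option instead of hardcoding `DPMSolverMultistepScheduler`. The flag name and mapping are hypothetical; the class names listed are real diffusers schedulers:

```python
import argparse

# hypothetical choice -> diffusers class-name mapping
SCHEDULER_CLASSES = {
    'dpm_multistep': 'DPMSolverMultistepScheduler',
    'ddim': 'DDIMScheduler',
    'pndm': 'PNDMScheduler',
}

parser = argparse.ArgumentParser()
parser.add_argument('--scheduler', default='dpm_multistep',
                    choices=sorted(SCHEDULER_CLASSES),
                    help='diffusion noise scheduler')

args = parser.parse_args(['--scheduler', 'ddim'])  # example invocation
```

`main()` could then resolve the class, e.g. `getattr(diffusers, SCHEDULER_CLASSES[args.scheduler]).from_config(pipe.scheduler.config)`.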
    pipe.walk(
        prompts=args.prompts,
        seeds=args.seeds,
        num_interpolation_steps=args.num_interpolation_steps,
        output_dir=args.output_dir,
        name=args.name,
        fps=args.fps,
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        height=args.height,
        width=args.width,
        upsample=args.upsample,
        batch_size=args.batch_size,
        audio_filepath=args.audio_filepath,
        audio_start_sec=None if args.audio_offsets is None else args.audio_offsets[0],
        negative_prompt=args.negative_prompt,
    )


if __name__ == '__main__':
    main()
Comment: Since we're already installing fire with the requirements of the package, maybe let's just use that instead? I can update to do this so it's not a hassle for you :)
Comment: I am not too familiar with fire, but I can give it a try. Though after quickly skimming the docs, while it would considerably reduce boilerplate, I think I prefer the flexibility of argparse. E.g. I prefer calling the script with explicit flags over fire's calling convention. Moreover, I can feel some dirty hacking would be required to keep support for argument provision through a config file using the --cfg option, which is an important feature IMO. Let me know what you think. If this is something you really require then I will give it a shot.
Comment: Interesting. I think I agree with you! Will have a look when I can.
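For reference, the explicit-flags invocation style the thread is weighing can be sketched like this (prompt and seed values are illustrative):

```python
import argparse

# the argparse version accepts repeated values via nargs='+' flags,
# which is the invocation style preferred in the comment above
parser = argparse.ArgumentParser()
parser.add_argument('--prompts', nargs='+')
parser.add_argument('--seeds', type=int, nargs='+')

# equivalent to: script.py --prompts "a cat" "a dog" --seeds 42 1337
args = parser.parse_args(['--prompts', 'a cat', 'a dog', '--seeds', '42', '1337'])
```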