diff --git a/.gitignore b/.gitignore index 506d5d35..73c66b60 100644 --- a/.gitignore +++ b/.gitignore @@ -168,5 +168,7 @@ Dockerfile start_docker.sh start.sh +checkpoints + # Mac .DS_Store diff --git a/LICENSE b/LICENSE index b2a615ac..15319625 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,209 @@ -MIT License - -Copyright (c) 2023 Tencent AI Lab - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Tencent is pleased to support the open source community by making SadTalker available. + +Copyright (C), a Tencent company. All rights reserved. + +SadTalker is licensed under the Apache 2.0 License, except for the third-party components listed below. + +Terms of the Apache License Version 2.0: +--------------------------------------------- + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md index c330bdce..332615dc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ -     [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)   [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker)   [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)   [![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker) +     [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)   [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker)   [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)  
[![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker) [![Discord](https://dcbadge.vercel.app/api/server/rrayYqZ4tf?style=flat)](https://discord.gg/rrayYqZ4tf)
Wenxuan Zhang *,1,2   @@ -37,57 +37,62 @@ -## 🔥 Highlight +## Highlights -- 🔥 The extension of the [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) is online. Checkout more details [here](docs/webui_extension.md). +- The license has been updated to Apache 2.0, and we've removed the non-commercial restriction +- **SadTalker has now officially been integrated into Discord, where you can use it for free by sending files. You can also generate high-quailty videos from text prompts. Join: [![Discord](https://dcbadge.vercel.app/api/server/rrayYqZ4tf?style=flat)](https://discord.gg/rrayYqZ4tf)** -https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4 +- We've published a [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) extension. Check out more details [here](docs/webui_extension.md). [Demo Video](https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4) -- 🔥 `full image mode` is online! checkout [here](https://github.com/Winfredy/SadTalker#full-bodyimage-generation) for more details. +- Full image mode is now available! [More details...](https://github.com/OpenTalker/SadTalker#full-bodyimage-generation) | still+enhancer in v0.0.1 | still + enhancer in v0.0.2 | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) | |:--------------------: |:--------------------: | :----: | | | | -- 🔥 Several new mode, eg, `still mode`, `reference mode`, `resize mode` are online for better and custom applications. +- Several new modes (Still, reference, and resize modes) are now available! -- 🔥 Happy to see more community demos at [bilibili](https://search.bilibili.com/all?keyword=sadtalker&from_source=webtop_search&spm_id_from=333.1007&search_source=3 -), [Youtube](https://www.youtube.com/results?search_query=sadtalker&sp=CAM%253D) and [twitter #sadtalker](https://twitter.com/search?q=%23sadtalker&src=typed_query). +- We're happy to see more community demos on [bilibili](https://search.bilibili.com/all?keyword=sadtalker), [YouTube](https://www.youtube.com/results?search_query=sadtalker) and [X (#sadtalker)](https://twitter.com/search?q=%23sadtalker&src). -## 📋 Changelog (Previous changelog can be founded [here](docs/changlelog.md)) +## Changelog -- __[2023.06.12]__: add more new features in WEBUI extension, see the discussion [here](https://github.com/OpenTalker/SadTalker/discussions/386). +The previous changelog can be found [here](docs/changlelog.md). -- __[2023.06.05]__: release a new 512 beta face model. Fixed some bugs and improve the performance. +- __[2023.06.12]__: Added more new features in WebUI extension, see the discussion [here](https://github.com/OpenTalker/SadTalker/discussions/386). -- __[2023.04.15]__: Adding automatic1111 colab by @camenduru, thanks for this awesome colab: [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb). +- __[2023.06.05]__: Released a new 512x512px (beta) face model. Fixed some bugs and improve the performance. -- __[2023.04.12]__: adding a more detailed sd-webui installation document, fixed reinstallation problem. 
+- __[2023.04.15]__: Added a WebUI Colab notebook by [@camenduru](https://github.com/camenduru/): [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) -- __[2023.04.12]__: Fixed the sd-webui safe issues becasue of the 3rd packages, optimize the output path in `sd-webui-extension`. +- __[2023.04.12]__: Added a more detailed WebUI installation document and fixed a problem when reinstalling. -- __[2023.04.08]__: ❗️❗️❗️ In v0.0.2, we add a logo watermark to the generated video to prevent abusing since it is very realistic. +- __[2023.04.12]__: Fixed the WebUI safe issues becasue of 3rd-party packages, and optimized the output path in `sd-webui-extension`. -- __[2023.04.08]__: v0.0.2, full image animation, adding baidu driver for download checkpoints. Optimizing the logic about enhancer. +- __[2023.04.08]__: In v0.0.2, we added a logo watermark to the generated video to prevent abuse. _This watermark has since been removed in a later release._ +- __[2023.04.08]__: In v0.0.2, we added features for full image animation and a link to download checkpoints from Baidu. We also optimized the enhancer logic. -## 🚧 TODO: See the Discussion https://github.com/OpenTalker/SadTalker/issues/280 +## To-Do -## If you have any problem, please view our [FAQ](docs/FAQ.md) before opening an issue. +We're tracking new updates in [issue #280](https://github.com/OpenTalker/SadTalker/issues/280). +## Troubleshooting +If you have any problems, please read our [FAQs](docs/FAQ.md) before opening an issue. -## ⚙️ 1. Installation. -Tutorials from communities: [中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/) | [日本語コース](https://br-d.fanbox.cc/posts/5685086?utm_campaign=manage_post_page&utm_medium=share&utm_source=twitter) -### Linux: +## 1. Installation. -1. Installing [anaconda](https://www.anaconda.com/), python and git. +Community tutorials: [中文Windows教程 (Chinese Windows tutorial)](https://www.bilibili.com/video/BV1Dc411W7V6/) | [日本語コース (Japanese tutorial)](https://br-d.fanbox.cc/posts/5685086). + +### Linux/Unix + +1. Install [Anaconda](https://www.anaconda.com/), Python and `git`. 2. Creating the env and install the requirements. ```bash - git clone https://github.com/Winfredy/SadTalker.git + git clone https://github.com/OpenTalker/SadTalker.git cd SadTalker @@ -101,40 +106,54 @@ Tutorials from communities: [中文windows教程](https://www.bilibili.com/video pip install -r requirements.txt - ### tts is optional for gradio demo. + ### Coqui TTS is optional for gradio demo. ### pip install TTS ``` -### Windows ([中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/)): +### Windows + +A video tutorial in chinese is available [here](https://www.bilibili.com/video/BV1Dc411W7V6/). You can also follow the following instructions: + +1. Install [Python 3.8](https://www.python.org/downloads/windows/) and check "Add Python to PATH". +2. Install [git](https://git-scm.com/download/win) manually or using [Scoop](https://scoop.sh/): `scoop install git`. +3. Install `ffmpeg`, following [this tutorial](https://www.wikihow.com/Install-FFmpeg-on-Windows) or using [scoop](https://scoop.sh/): `scoop install ffmpeg`. +4. Download the SadTalker repository by running `git clone https://github.com/Winfredy/SadTalker.git`. +5. Download the checkpoints and gfpgan models in the [downloads section](#2-download-models). +6. 
Run `start.bat` from Windows Explorer as normal, non-administrator, user, and a Gradio-powered WebUI demo will be started. -1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH". -2. Install [git](https://git-scm.com/download/win) manually (OR `scoop install git` via [scoop](https://scoop.sh/)). -3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows) (OR using `scoop install ffmpeg` via [scoop](https://scoop.sh/)). -4. Download our SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`. -5. Download the `checkpoint` and `gfpgan` [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models). -5. Run `start.bat` from Windows Explorer as normal, non-administrator, user, a gradio WebUI demo will be started. +### macOS -### Macbook: +A tutorial on installing SadTalker on macOS can be found [here](docs/install.md). -More tips about installnation on Macbook and the Docker file can be founded [here](docs/install.md) +### Docker, WSL, etc -## 📥 2. Download Trained Models. +Please check out additional tutorials [here](docs/install.md). -You can run the following script to put all the models in the right place. +## 2. Download Models + +You can run the following script on Linux/macOS to automatically download all the models: ```bash bash scripts/download_models.sh ``` -Other alternatives: -> we also provide an offline patch (`gfpgan/`), thus, no model will be downloaded when generating. +We also provide an offline patch (`gfpgan/`), so no model will be downloaded when generating. + +### Pre-Trained Models + +* [Google Drive](https://drive.google.com/file/d/1gwWh45pF7aelNP_P78uDJL8Sycep-K7j/view?usp=sharing) +* [GitHub Releases](https://github.com/OpenTalker/SadTalker/releases) +* [Baidu (百度云盘)](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt) (Password: `sadt`) -**Google Driver**: download our pre-trained model from [ this link (main checkpoints)](https://drive.google.com/file/d/1gwWh45pF7aelNP_P78uDJL8Sycep-K7j/view?usp=sharing) and [ gfpgan (offline patch)](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing) + -**Github Release Page**: download all the files from the [lastest github release page](https://github.com/Winfredy/SadTalker/releases), and then, put it in ./checkpoints. +### GFPGAN Offline Patch -**百度云盘**: we provided the downloaded model in [checkpoints, 提取码: sadt.](https://pan.baidu.com/s/1P4fRgk9gaSutZnn8YW034Q?pwd=sadt) And [gfpgan, 提取码: sadt.](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt) +* [Google Drive](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing) +* [GitHub Releases](https://github.com/OpenTalker/SadTalker/releases) +* [Baidu (百度云盘)](https://pan.baidu.com/s/1P4fRgk9gaSutZnn8YW034Q?pwd=sadt) (Password: `sadt`) +
Model Details @@ -174,28 +193,30 @@ The final folder will be shown as:
-## 🔮 3. Quick Start ([Best Practice](docs/best_practice.md)). +## 3. Quick Start -### WebUI Demos: +Please read our document on [best practices and configuration tips](docs/best_practice.md) -**Online**: [Huggingface](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) +### WebUI Demos -**Local Autiomatic1111 stable-diffusion webui extension**: please refer to [Autiomatic1111 stable-diffusion webui docs](docs/webui_extension.md). +**Online Demo**: [HuggingFace](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) -**Local gradio demo(highly recommanded!)**: Similar to our [hugging-face demo](https://huggingface.co/spaces/vinthony/SadTalker) can be run by: +**Local WebUI extension**: Please refer to [WebUI docs](docs/webui_extension.md). + +**Local gradio demo (recommanded)**: A Gradio instance similar to our [Hugging Face demo](https://huggingface.co/spaces/vinthony/SadTalker) can be run locally: ```bash ## you need manually install TTS(https://github.com/coqui-ai/TTS) via `pip install tts` in advanced. -python app.py +python app_sadtalker.py ``` -**Local gradio demo(highly recommanded!)**: +You can also start it more easily: - windows: just double click `webui.bat`, the requirements will be installed automatically. - Linux/Mac OS: run `bash webui.sh` to start the webui. -### Manually usages: +### CLI usage ##### Animating a portrait image from default config: ```bash @@ -220,7 +241,7 @@ python inference.py --driven_audio \ More examples and configuration and tips can be founded in the [ >>> best practice documents <<<](docs/best_practice.md). -## 🛎 Citation +## Citation If you find our work useful in your research, please consider citing: @@ -233,23 +254,21 @@ If you find our work useful in your research, please consider citing: } ``` +## Acknowledgements +Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In training process, we also used the model from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2lip](https://github.com/Rudrabha/Wav2Lip). We thank for their wonderful work. -## 💗 Acknowledgements - -Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In training process, We also use the model from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2lip](https://github.com/Rudrabha/Wav2Lip). We thank for their wonderful work. 
- -See also these wonderful 3rd libraries we use: +We also use the following 3rd-party libraries: - **Face Utils**: https://github.com/xinntao/facexlib - **Face Enhancement**: https://github.com/TencentARC/GFPGAN - **Image/Video Enhancement**:https://github.com/xinntao/Real-ESRGAN -## 🥂 Extensions: +## Extensions: - [SadTalker-Video-Lip-Sync](https://github.com/Zz-ww/SadTalker-Video-Lip-Sync) from [@Zz-ww](https://github.com/Zz-ww): SadTalker for Video Lip Editing -## 🥂 Related Works +## Related Works - [StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN (ECCV 2022)](https://github.com/FeiiYin/StyleHEAT) - [CodeTalker: Speech-Driven 3D Facial Animation with Discrete Motion Prior (CVPR 2023)](https://github.com/Doubiiu/CodeTalker) - [VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild (SIGGRAPH Asia 2022)](https://github.com/vinthony/video-retalking) @@ -257,12 +276,23 @@ See also these wonderful 3rd libraries we use: - [3D GAN Inversion with Facial Symmetry Prior (CVPR 2023)](https://github.com/FeiiYin/SPI/) - [T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations (CVPR 2023)](https://github.com/Mael-zys/T2M-GPT) -## 📢 Disclaimer +## Disclaimer -This is not an official product of Tencent. This repository can only be used for personal/research/non-commercial purposes. +This is not an official product of Tencent. -LOGO: color and font suggestion: [ChatGPT](ai.com), logo font:[Montserrat Alternates +``` +1. Please carefully read and comply with the open-source license applicable to this code before using it. +2. Please carefully read and comply with the intellectual property declaration applicable to this code before using it. +3. This open-source code runs completely offline and does not collect any personal information or other data. If you use this code to provide services to end-users and collect related data, please take necessary compliance measures according to applicable laws and regulations (such as publishing privacy policies, adopting necessary data security strategies, etc.). If the collected data involves personal information, user consent must be obtained (if applicable). Any legal liabilities arising from this are unrelated to Tencent. +4. Without Tencent's written permission, you are not authorized to use the names or logos legally owned by Tencent, such as "Tencent." Otherwise, you may be liable for legal responsibilities. +5. This open-source code does not have the ability to directly provide services to end-users. If you need to use this code for further model training or demos, as part of your product to provide services to end-users, or for similar use, please comply with applicable laws and regulations for your product or service. Any legal liabilities arising from this are unrelated to Tencent. +6. It is prohibited to use this open-source code for activities that harm the legitimate rights and interests of others (including but not limited to fraud, deception, infringement of others' portrait rights, reputation rights, etc.), or other behaviors that violate applicable laws and regulations or go against social ethics and good customs (including providing incorrect or false information, spreading pornographic, terrorist, and violent information, etc.). Otherwise, you may be liable for legal responsibilities. 
+``` + +LOGO: color and font suggestion: [ChatGPT](https://chat.openai.com), logo font: [Montserrat Alternates ](https://fonts.google.com/specimen/Montserrat+Alternates?preview.text=SadTalker&preview.text_type=custom&query=mont). -All the copyright of the demo images and audio are from communities users or the geneartion from stable diffusion. Free free to contact us if you feel uncomfortable. +All the copyrights of the demo images and audio are from community users or the generation from stable diffusion. Feel free to contact us if you would like use to remove them. + + diff --git a/app.py b/app.py deleted file mode 100644 index 11be0b48..00000000 --- a/app.py +++ /dev/null @@ -1,133 +0,0 @@ -import os, sys -import gradio as gr -from src.gradio_demo import SadTalker - - -try: - import webui # in webui - in_webui = True -except: - in_webui = False - - -def toggle_audio_file(choice): - if choice == False: - return gr.update(visible=True), gr.update(visible=False) - else: - return gr.update(visible=False), gr.update(visible=True) - -def ref_video_fn(path_of_ref_video): - if path_of_ref_video is not None: - return gr.update(value=True) - else: - return gr.update(value=False) - - -def sadtalker_demo(checkpoint_path='checkpoints', config_path='src/config', warpfn=None): - - sad_talker = SadTalker(checkpoint_path, config_path, lazy_load=True) - - with gr.Blocks(analytics_enabled=False) as sadtalker_interface: - - gr.Markdown("

😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023)

\ - Arxiv       \ - Homepage       \ - Github
") - - with gr.Row().style(equal_height=False): - with gr.Column(variant='panel'): - with gr.Tabs(elem_id="sadtalker_source_image"): - with gr.TabItem('Source image'): - with gr.Row(): - source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512) - - - with gr.Tabs(elem_id="sadtalker_driven_audio"): - with gr.TabItem('Driving Methods'): - gr.Markdown("Possible driving combinations:
1. Audio only 2. Audio/IDLE Mode + Ref Video(pose, blink, pose+blink) 3. IDLE Mode only 4. Ref Video only (all) ") - - with gr.Row(): - driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath") - driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False) - - with gr.Column(): - use_idle_mode = gr.Checkbox(label="Use Idle Animation") - length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.") - use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo - - if sys.platform != 'win32' and not in_webui: - with gr.Accordion('Generate Audio From TTS', open=False): - from src.utils.text2speech import TTSTalker - tts_talker = TTSTalker() - with gr.Column(variant='panel'): - input_text = gr.Textbox(label="Generating audio from text", lines=5, placeholder="please enter some text here, we genreate the audio from text using @Coqui.ai TTS.") - tts = gr.Button('Generate audio',elem_id="sadtalker_audio_generate", variant='primary') - tts.click(fn=tts_talker.test, inputs=[input_text], outputs=[driven_audio]) - - with gr.Row(): - ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref").style(width=512) - - with gr.Column(): - use_ref_video = gr.Checkbox(label="Use Reference Video") - ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))") - - ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo - - - with gr.Column(variant='panel'): - with gr.Tabs(elem_id="sadtalker_checkbox"): - with gr.TabItem('Settings'): - gr.Markdown("need help? 
please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials") - with gr.Column(variant='panel'): - # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width - # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width - with gr.Row(): - pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) # - exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) # - blink_every = gr.Checkbox(label="use eye blink", value=True) - - with gr.Row(): - size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") # - preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?") - - with gr.Row(): - is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)") - batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1) - enhancer = gr.Checkbox(label="GFPGAN as Face enhancer") - - submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') - - with gr.Tabs(elem_id="sadtalker_genearted"): - gen_video = gr.Video(label="Generated video", format="mp4").style(width=256) - - submit.click( - fn=sad_talker.test, - inputs=[source_image, - driven_audio, - preprocess_type, - is_still_mode, - enhancer, - batch_size, - size_of_image, - pose_style, - exp_weight, - use_ref_video, - ref_video, - ref_info, - use_idle_mode, - length_of_audio, - blink_every - ], - outputs=[gen_video] - ) - - return sadtalker_interface - - -if __name__ == "__main__": - - demo = sadtalker_demo() - demo.queue() - demo.launch() - - diff --git a/app_sadtalker.py b/app_sadtalker.py new file mode 100644 index 00000000..1401a600 --- /dev/null +++ b/app_sadtalker.py @@ -0,0 +1,111 @@ +import os, sys +import gradio as gr +from src.gradio_demo import SadTalker + + +try: + import webui # in webui + in_webui = True +except: + in_webui = False + + +def toggle_audio_file(choice): + if choice == False: + return gr.update(visible=True), gr.update(visible=False) + else: + return gr.update(visible=False), gr.update(visible=True) + +def ref_video_fn(path_of_ref_video): + if path_of_ref_video is not None: + return gr.update(value=True) + else: + return gr.update(value=False) + +def sadtalker_demo(checkpoint_path='checkpoints', config_path='src/config', warpfn=None): + + sad_talker = SadTalker(checkpoint_path, config_path, lazy_load=True) + + with gr.Blocks(analytics_enabled=False) as sadtalker_interface: + gr.Markdown("

😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023)

\ +
Arxiv       \ + Homepage       \ + Github
") + + with gr.Row().style(equal_height=False): + with gr.Column(variant='panel'): + with gr.Tabs(elem_id="sadtalker_source_image"): + with gr.TabItem('Upload image'): + with gr.Row(): + source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512) + + with gr.Tabs(elem_id="sadtalker_driven_audio"): + with gr.TabItem('Upload OR TTS'): + with gr.Column(variant='panel'): + driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath") + + if sys.platform != 'win32' and not in_webui: + from src.utils.text2speech import TTSTalker + tts_talker = TTSTalker() + with gr.Column(variant='panel'): + input_text = gr.Textbox(label="Generating audio from text", lines=5, placeholder="please enter some text here, we genreate the audio from text using @Coqui.ai TTS.") + tts = gr.Button('Generate audio',elem_id="sadtalker_audio_generate", variant='primary') + tts.click(fn=tts_talker.test, inputs=[input_text], outputs=[driven_audio]) + + with gr.Column(variant='panel'): + with gr.Tabs(elem_id="sadtalker_checkbox"): + with gr.TabItem('Settings'): + gr.Markdown("need help? please visit our [best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md) for more detials") + with gr.Column(variant='panel'): + # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width + # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width + pose_style = gr.Slider(minimum=0, maximum=46, step=1, label="Pose style", value=0) # + size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") # + preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?") + is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)") + batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=2) + enhancer = gr.Checkbox(label="GFPGAN as Face enhancer") + submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') + + with gr.Tabs(elem_id="sadtalker_genearted"): + gen_video = gr.Video(label="Generated video", format="mp4").style(width=256) + + if warpfn: + submit.click( + fn=warpfn(sad_talker.test), + inputs=[source_image, + driven_audio, + preprocess_type, + is_still_mode, + enhancer, + batch_size, + size_of_image, + pose_style + ], + outputs=[gen_video] + ) + else: + submit.click( + fn=sad_talker.test, + inputs=[source_image, + driven_audio, + preprocess_type, + is_still_mode, + enhancer, + batch_size, + size_of_image, + pose_style + ], + outputs=[gen_video] + ) + + return sadtalker_interface + + +if __name__ == "__main__": + + demo = sadtalker_demo() + demo.queue() + demo.launch() + + diff --git a/docs/best_practice.md b/docs/best_practice.md index 3a738b21..76700506 100644 --- a/docs/best_practice.md +++ b/docs/best_practice.md @@ -1,8 +1,8 @@ -# Best Practice and Tips for configuration +# Best Practices and Tips for configuration -> Our model only works on REAL person's photo or the portrait image similar to REAL person. The anime talking head genreation method will be released in future. +> Our model only works on REAL people or the portrait image similar to REAL person. The anime talking head genreation method will be released in future. 
-Advanced confiurations for `inference.py`: +Advanced confiuration options for `inference.py`: | Name | Configuration | default | Explaination | |:------------- |:------------- |:----- | :------------- | @@ -20,18 +20,18 @@ Advanced confiurations for `inference.py`: ### About `--preprocess` -Our method automatically handle the input images via `crop`, `resize` and `full`. +Our system automatically handles the input images via `crop`, `resize` and `full`. - In `crop` mode, we only generate the croped image via the facial keypoints and generated the facial anime avator. The animation of both expression and head pose are realistic. +In `crop` mode, we only generate the croped image via the facial keypoints and generated the facial anime avator. The animation of both expression and head pose are realistic. -> still mode will stop the eyeblink and head pose movement. +> Still mode will stop the eyeblink and head pose movement. | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) | crop | crop w/still | |:--------------------: |:--------------------: | :----: | | | ![full_body_2](example_crop.gif) | ![full_body_2](example_crop_still.gif) | - In `resize` mode, we resize the whole images to generate the fully talking head video. Thus, an image similar to the ID photo can be produced. ⚠️ It will produce bad results for full person images. +In `resize` mode, we resize the whole images to generate the fully talking head video. Thus, an image similar to the ID photo can be produced. ⚠️ It will produce bad results for full person images. @@ -50,7 +50,7 @@ In `full` mode, our model will automatically process the croped region and paste ### About `--enhancer` -For better facial quality, we intergate [gfpgan](https://github.com/TencentARC/GFPGAN) and [real-esrgan](https://github.com/xinntao/Real-ESRGAN) for different purpose. Just adding `--enhancer ` or `--background_enhancer ` for the enhancement of the face and the full image. +For higher resolution, we intergate [gfpgan](https://github.com/TencentARC/GFPGAN) and [real-esrgan](https://github.com/xinntao/Real-ESRGAN) for different purpose. Just adding `--enhancer ` or `--background_enhancer ` for the enhancement of the face and the full image. ```bash # make sure above packages are available: @@ -70,7 +70,7 @@ This flag indicate that we can generated the 3d-rendered face and it's 3d facial -#### reference eye-link mode. +#### Reference eye-link mode. | Input, w/ reference video , reference video | |:-------------: | diff --git a/docs/face3d.md b/docs/face3d.md index b5a3f00b..394f4fef 100644 --- a/docs/face3d.md +++ b/docs/face3d.md @@ -1,11 +1,11 @@ -## 3D Face visualization +## 3D Face Visualization -We use pytorch3d to visualize the produced 3d face from a single image. +We use `pytorch3d` to visualize the 3D faces from a single image. 
-Since it is not easy to install, we produce a new install guidence here: +The requirements for 3D visualization are difficult to install, so here's a tutorial: ```bash -git clone https://github.com/Winfredy/SadTalker.git +git clone https://github.com/OpenTalker/SadTalker.git cd SadTalker conda create -n sadtalker3d python=3.8 source activate sadtalker3d @@ -28,10 +28,9 @@ pip install git+https://github.com/TencentARC/GFPGAN ### when occurs gcc version problem `from pytorch import _C` from pytorch3d, add the anaconda path to LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/$YOUR_ANACONDA_PATH/lib/ -``` +``` - -Then, generating the result via: +Then, generate the result via: ```bash @@ -43,6 +42,6 @@ python inference.py --driven_audio \ ``` -Then, the results will be given in the folders with the file name of `face3d.mp4`. +The results will appear, named `face3d.mp4`. -More applications about 3d face will be released. \ No newline at end of file +More applications about 3D face rendering will be released soon. diff --git a/docs/install.md b/docs/install.md index 31d1f918..807c8eb9 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,41 +1,33 @@ +### macOS -### Mac (Tested on M1 Mac OS 13.3) - -``` -git clone https://github.com/Winfredy/SadTalker.git +This method has been tested on a M1 Mac (13.3) +```bash +git clone https://github.com/OpenTalker/SadTalker.git cd SadTalker - conda create -n sadtalker python=3.8 - conda activate sadtalker - # install pytorch 2.0 pip install torch torchvision torchaudio - conda install ffmpeg - pip install -r requirements.txt - -pip install dlib # mac need to install the original dlib. - +pip install dlib # macOS needs to install the original dlib. ``` - - ### Windows Native -- Make sure you have `ffmpeg` in the `%PATH%` as suggested in [#54](https://github.com/Winfredy/SadTalker/issues/54), following [this](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) installation to install `ffmpeg`. +- Make sure you have `ffmpeg` in the `%PATH%` as suggested in [#54](https://github.com/Winfredy/SadTalker/issues/54), following [this](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) tutorial to install `ffmpeg` or using scoop. ### Windows WSL -- Make sure the environment: `export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH` -### Docker installnation +- Make sure the environment: `export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH` + -A dockerfile are also provided by [@thegenerativegeneration](https://github.com/thegenerativegeneration) in [docker hub](https://hub.docker.com/repository/docker/wawa9000/sadtalker), which can be used directly as: +### Docker Installation +A community Docker image by [@thegenerativegeneration](https://github.com/thegenerativegeneration) is available on the [Docker hub](https://hub.docker.com/repository/docker/wawa9000/sadtalker), which can be used directly: ```bash docker run --gpus "all" --rm -v $(pwd):/host_dir wawa9000/sadtalker \ --driven_audio /host_dir/deyu.wav \ diff --git a/docs/webui_extension.md b/docs/webui_extension.md index 6e272e2d..5e64469f 100644 --- a/docs/webui_extension.md +++ b/docs/webui_extension.md @@ -1,7 +1,6 @@ - ## Run SadTalker as a Stable Diffusion WebUI Extension. -1. Installing the lastest version of [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) and install the sadtalker via `extension`. +1. 
Install the lastest version of [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) and install SadTalker via `extension`. image 2. Download the checkpoints manually, for Linux and Mac: @@ -10,30 +9,30 @@ cd SOMEWHERE_YOU_LIKE - bash <(wget -qO- https://raw.githubusercontent.com/Winfredy/SadTalker/main/scripts/download_models.sh) + bash <(wget -qO- https://raw.githubusercontent.com/Winfredy/OpenTalker/main/scripts/download_models.sh) ``` - For windows, you can download all the checkpoints from [google drive](https://drive.google.com/drive/folders/1Wd88VDoLhVzYsQ30_qDVluQr_Xm46yHT?usp=sharing) or [百度云盘](https://pan.baidu.com/s/1nXuVNd0exUl37ISwWqbFGA?pwd=sadt) 提取码: sadt. + For Windows, you can download all the checkpoints [here](https://github.com/OpenTalker/SadTalker/tree/main#2-download-models). -3.1. options 1: put the checkpoint in `stable-diffusion-webui/models/SadTalker` or `stable-diffusion-webui/extensions/SadTalker/checkpoints/`, the checkpoints will be detected automatically. +3.1. Option 1: put the checkpoint in `stable-diffusion-webui/models/SadTalker` or `stable-diffusion-webui/extensions/SadTalker/checkpoints/`, the checkpoints will be detected automatically. -3.2. Options 2: Set the path of `SADTALKTER_CHECKPOINTS` in `webui_user.sh`(linux) or `webui_user.bat`(windows) by: +3.2. Option 2: Set the path of `SADTALKTER_CHECKPOINTS` in `webui_user.sh`(linux) or `webui_user.bat`(windows) by: > only works if you are directly starting webui from `webui_user.sh` or `webui_user.bat`. ```bash - # windows (webui_user.bat) + # Windows (webui_user.bat) set SADTALKER_CHECKPOINTS=D:\SadTalker\checkpoints - # linux (webui_user.sh) + # Linux/macOS (webui_user.sh) export SADTALKER_CHECKPOINTS=/path/to/SadTalker/checkpoints ``` -4. Then, starting the webui via `webui.sh or webui_user.sh(linux)` or `webui_user.bat(windows)` or any other methods, the SadTalker can be used in stable-diffusion-webui directly. +4. Start the WebUI via `webui.sh or webui_user.sh(linux)` or `webui_user.bat(windows)` or any other method. SadTalker can also be used in stable-diffusion-webui directly. image -## Questsions +## Questions 1. if you are running on CPU, you need to specific `--disable-safe-unpickle` in `webui_user.sh` or `webui_user.bat`. @@ -47,4 +46,4 @@ -(Some [important discussion](https://github.com/Winfredy/SadTalker/issues/78) if you are unable to use `full` mode). +(If you're unable to use the `full` mode, please read this [discussion](https://github.com/Winfredy/SadTalker/issues/78).) 
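Note: the `inference.py` changes below import `record_process_log` and `get_file_size` from `src.utils.process_log`, a module that is not included in this diff. A minimal sketch of what such a helper might look like, assuming it only needs to log one labelled measurement (elapsed seconds or a file size) per pipeline step; every name and behavior here is an assumption, not the actual module:

```python
# Hypothetical sketch of src/utils/process_log.py -- the real module is not part of this diff.
# inference.py calls it as record_process_log(scope, step, value, note=None) and get_file_size(path).
import logging
import os

logger = logging.getLogger("sadtalker.process_log")


def record_process_log(scope, step, value, note=None):
    """Log one measurement (elapsed seconds or a size) for a named pipeline step."""
    suffix = f" ({note})" if note else ""
    logger.info("[%s] %s: %s%s", scope, step, value, suffix)


def get_file_size(path):
    """Return the file size in megabytes, or -1 if the file does not exist."""
    if not os.path.isfile(path):
        return -1.0
    return os.path.getsize(path) / (1024 * 1024)
```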
diff --git a/inference.py b/inference.py index a0b00790..345ed5a5 100644 --- a/inference.py +++ b/inference.py @@ -1,20 +1,21 @@ from glob import glob import shutil import torch -from time import strftime +from time import strftime import os, sys, time from argparse import ArgumentParser from src.utils.preprocess import CropAndExtract -from src.test_audio2coeff import Audio2Coeff +from src.test_audio2coeff import Audio2Coeff from src.facerender.animate import AnimateFromCoeff from src.generate_batch import get_data from src.generate_facerender_batch import get_facerender_data from src.utils.init_path import init_path +from src.utils.process_log import record_process_log, get_file_size -def main(args): - #torch.backends.cudnn.enabled = False +def main(args): + # torch.backends.cudnn.enabled = False pic_path = args.source_image audio_path = args.driven_audio save_dir = os.path.join(args.result_dir, strftime("%Y_%m_%d_%H.%M.%S")) @@ -29,22 +30,40 @@ def main(args): ref_pose = args.ref_pose current_root_path = os.path.split(sys.argv[0])[0] + t = time.time() + sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, + args.old_version, args.preprocess) + record_process_log("main", "init_path", time.time() - t) - sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess) - - #init model + # init model + t = time.time() preprocess_model = CropAndExtract(sadtalker_paths, device) + # 初始化 CropAndExtract, V100 消耗时间:3.572038889 + # TODO(qingyuan): this is loading check points, try to just load once + record_process_log("main", "CropAndExtract", time.time() - t) + + t = time.time() + audio_to_coeff = Audio2Coeff(sadtalker_paths, device) + # 初始化 Audio2Coeff, V100 消耗时间:0.297198772 + record_process_log("main", "Audio2Coeff", time.time() - t) - audio_to_coeff = Audio2Coeff(sadtalker_paths, device) - + t = time.time() animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device) + # 初始化 AnimateFromCoeff, V100 消耗时间:2.004800558 + # TODO(qingyuan): this is loading check points and models, try to just load once + record_process_log("main", "AnimateFromCoeff", time.time() - t) - #crop image and extract 3dmm from image + # crop image and extract 3dmm from image first_frame_dir = os.path.join(save_dir, 'first_frame_dir') os.makedirs(first_frame_dir, exist_ok=True) print('3DMM Extraction for source image') - first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\ - source_image_flag=True, pic_size=args.size) + t = time.time() + first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate( + pic_path, first_frame_dir, args.preprocess, + source_image_flag=True, pic_size=args.size) + # call preprocess_model.generate,消耗时间:1.483717203 + # TODO(qingyuan): add logging and check where the time is spent + record_process_log("main", "preprocess_model.generate", time.time() - t, "first_coeff_path") if first_coeff_path is None: print("Can't get the coeffs of the input") return @@ -54,76 +73,102 @@ def main(args): ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname) os.makedirs(ref_eyeblink_frame_dir, exist_ok=True) print('3DMM Extraction for the reference video providing eye blinking') - ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False) + t = time.time() + ref_eyeblink_coeff_path, _, _ = 
preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, + source_image_flag=False) + record_process_log("main", "preprocess_model.generate", time.time() - t, "ref_eyeblink") else: - ref_eyeblink_coeff_path=None + ref_eyeblink_coeff_path = None if ref_pose is not None: - if ref_pose == ref_eyeblink: + if ref_pose == ref_eyeblink: ref_pose_coeff_path = ref_eyeblink_coeff_path else: ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0] ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname) os.makedirs(ref_pose_frame_dir, exist_ok=True) print('3DMM Extraction for the reference video providing pose') - ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False) + t = time.time() + ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, + source_image_flag=False) + record_process_log("main", "preprocess_model.generate", time.time() - t, "ref_pose_coeff_path") else: - ref_pose_coeff_path=None + ref_pose_coeff_path = None - #audio2ceoff + t = time.time() + # audio2ceoff batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still) + # call get_data, 消耗时间:0.853573322 + record_process_log("main", "get_data", time.time() - t) coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path) + t = time.time() + record_process_log("main", "audio_to_coeff.generate", time.time() - t) # 3dface render if args.face3dvis: from src.face3d.visualize import gen_composed_video + t = time.time() gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4')) - - #coeff2video - data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, - batch_size, input_yaw_list, input_pitch_list, input_roll_list, - expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size) - + record_process_log("main", "gen_composed_video", time.time() - t) + + t = time.time() + # coeff2video + data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, + batch_size, input_yaw_list, input_pitch_list, input_roll_list, + expression_scale=args.expression_scale, still_mode=args.still, + preprocess=args.preprocess, size=args.size) + record_process_log("main", "get_facerender_data", time.time() - t) + + t = time.time() result = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \ - enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size) - - shutil.move(result, save_dir+'.mp4') - print('The generated video is named:', save_dir+'.mp4') + enhancer=args.enhancer, background_enhancer=args.background_enhancer, + preprocess=args.preprocess, img_size=args.size) + # AnimateFromCoeff generate 消耗的总时间:79.29866314 + record_process_log("main", "animate_from_coeff.generate", time.time() - t) + + shutil.move(result, save_dir + '.mp4') + print('The generated video is named:', save_dir + '.mp4') + + record_process_log("main", "result mp4 file", get_file_size(f"{save_dir}.mp4")) if not args.verbose: shutil.rmtree(save_dir) - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', help="path to driven audio") - parser.add_argument("--source_image", default='./examples/source_image/full_body_1.png', help="path to source image") +if __name__ == 
'__main__': + t = time.time() + parser = ArgumentParser() + parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', + help="path to driven audio") + parser.add_argument("--source_image", default='./examples/source_image/full_body_1.png', + help="path to source image") parser.add_argument("--ref_eyeblink", default=None, help="path to reference video providing eye blinking") parser.add_argument("--ref_pose", default=None, help="path to reference video providing pose") parser.add_argument("--checkpoint_dir", default='./checkpoints', help="path to output") parser.add_argument("--result_dir", default='./results', help="path to output") - parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)") - parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender") - parser.add_argument("--size", type=int, default=256, help="the image size of the facerender") - parser.add_argument("--expression_scale", type=float, default=1., help="the batch size of facerender") + parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)") + parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender") + parser.add_argument("--size", type=int, default=256, help="the image size of the facerender") + parser.add_argument("--expression_scale", type=float, default=1., help="the batch size of facerender") parser.add_argument('--input_yaw', nargs='+', type=int, default=None, help="the input yaw degree of the user ") parser.add_argument('--input_pitch', nargs='+', type=int, default=None, help="the input pitch degree of the user") parser.add_argument('--input_roll', nargs='+', type=int, default=None, help="the input roll degree of the user") - parser.add_argument('--enhancer', type=str, default=None, help="Face enhancer, [gfpgan, RestoreFormer]") - parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]") - parser.add_argument("--cpu", dest="cpu", action="store_true") - parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks") - parser.add_argument("--still", action="store_true", help="can crop back to the original videos for the full body aniamtion") - parser.add_argument("--preprocess", default='crop', choices=['crop', 'extcrop', 'resize', 'full', 'extfull'], help="how to preprocess the images" ) - parser.add_argument("--verbose",action="store_true", help="saving the intermedia output or not" ) - parser.add_argument("--old_version",action="store_true", help="use the pth other than safetensor version" ) - + parser.add_argument('--enhancer', type=str, default=None, help="Face enhancer, [gfpgan, RestoreFormer]") + parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]") + parser.add_argument("--cpu", dest="cpu", action="store_true") + parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks") + parser.add_argument("--still", action="store_true", + help="can crop back to the original videos for the full body aniamtion") + parser.add_argument("--preprocess", default='crop', choices=['crop', 'extcrop', 'resize', 'full', 'extfull'], + help="how to preprocess the images") + parser.add_argument("--verbose", action="store_true", help="saving the intermedia output or not") + parser.add_argument("--old_version", action="store_true", help="use the pth other than safetensor version") # net 
structure and parameters - parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='useless') + parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], + help='useless') parser.add_argument('--init_path', type=str, default=None, help='Useless') - parser.add_argument('--use_last_fc',default=False, help='zero initialize the last fc') + parser.add_argument('--use_last_fc', default=False, help='zero initialize the last fc') parser.add_argument('--bfm_folder', type=str, default='./checkpoints/BFM_Fitting/') parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model') @@ -135,11 +180,11 @@ def main(args): parser.add_argument('--z_far', type=float, default=15.) args = parser.parse_args() - + record_process_log(__name__, "parse_args", time.time() - t) if torch.cuda.is_available() and not args.cpu: args.device = "cuda" else: args.device = "cpu" main(args) - + record_process_log(__name__, "Total Time", time.time() - t) diff --git a/launcher.py b/launcher.py index ae1be9d4..17ce9f1a 100644 --- a/launcher.py +++ b/launcher.py @@ -194,7 +194,7 @@ def prepare_environment(): def start(): print(f"Launching SadTalker Web UI") - from app import sadtalker_demo + from app_sadtalker import sadtalker_demo demo = sadtalker_demo() demo.queue() demo.launch() diff --git a/quick_demo.ipynb b/quick_demo.ipynb index 8b9767d0..8606cfc8 100644 --- a/quick_demo.ipynb +++ b/quick_demo.ipynb @@ -60,21 +60,29 @@ }, "outputs": [], "source": [ - "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.8 2 \n", - "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.9 1 \n", - "!python --version \n", + "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.8 2\n", + "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.9 1\n", + "!sudo apt install python3.8\n", + "\n", + "!sudo apt-get install python3.8-distutils\n", + "\n", + "!python --version\n", + "\n", "!apt-get update\n", + "\n", "!apt install software-properties-common\n", + "\n", "!sudo dpkg --remove --force-remove-reinstreq python3-pip python3-setuptools python3-wheel\n", + "\n", "!apt-get install python3-pip\n", "\n", "print('Git clone project and install requirements...')\n", "!git clone https://github.com/Winfredy/SadTalker &> /dev/null\n", - "%cd SadTalker \n", - "!export PYTHONPATH=/content/SadTalker:$PYTHONPATH \n", + "%cd SadTalker\n", + "!export PYTHONPATH=/content/SadTalker:$PYTHONPATH\n", "!python3.8 -m pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113\n", "!apt update\n", - "!apt install ffmpeg &> /dev/null \n", + "!apt install ffmpeg &> /dev/null\n", "!python3.8 -m pip install -r requirements.txt" ] }, diff --git a/scripts/extension.py b/scripts/extension.py index cd3d1a21..e99fefee 100644 --- a/scripts/extension.py +++ b/scripts/extension.py @@ -169,7 +169,7 @@ def on_ui_tabs(): result_dir = opts.sadtalker_result_dir os.makedirs(result_dir, exist_ok=True) - from app import sadtalker_demo + from app_sadtalker import sadtalker_demo if os.getenv('SADTALKER_CHECKPOINTS'): checkpoint_path = os.getenv('SADTALKER_CHECKPOINTS') diff --git a/src/audio2pose_models/audio2pose.py b/src/audio2pose_models/audio2pose.py index 1a8410d6..2b8cd142 100644 --- a/src/audio2pose_models/audio2pose.py +++ b/src/audio2pose_models/audio2pose.py @@ -25,8 
+25,8 @@ def forward(self, x): batch = {} coeff_gt = x['gt'].cuda().squeeze(0) #bs frame_len+1 73 - batch['pose_motion_gt'] = coeff_gt[:, 1:, -9:-3] - coeff_gt[:, :1, -9:-3] #bs frame_len 6 - batch['ref'] = coeff_gt[:, 0, -9:-3] #bs 6 + batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70] #bs frame_len 6 + batch['ref'] = coeff_gt[:, 0, 64:70] #bs 6 batch['class'] = x['class'].squeeze(0).cuda() # bs indiv_mels= x['indiv_mels'].cuda().squeeze(0) # bs seq_len+1 80 16 @@ -37,8 +37,8 @@ def forward(self, x): batch = self.netG(batch) pose_motion_pred = batch['pose_motion_pred'] # bs frame_len 6 - pose_gt = coeff_gt[:, 1:, -9:-3].clone() # bs frame_len 6 - pose_pred = coeff_gt[:, :1, -9:-3] + pose_motion_pred # bs frame_len 6 + pose_gt = coeff_gt[:, 1:, 64:70].clone() # bs frame_len 6 + pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred # bs frame_len 6 batch['pose_pred'] = pose_pred batch['pose_gt'] = pose_gt diff --git a/src/face3d/extract_kp_videos_safe.py b/src/face3d/extract_kp_videos_safe.py index 262439b9..5141ba3a 100644 --- a/src/face3d/extract_kp_videos_safe.py +++ b/src/face3d/extract_kp_videos_safe.py @@ -8,10 +8,28 @@ import torch from tqdm import tqdm from itertools import cycle -from facexlib.alignment import init_alignment_model, landmark_98_to_68 -from facexlib.detection import init_detection_model from torch.multiprocessing import Pool, Process, set_start_method +from facexlib.alignment import landmark_98_to_68 +from facexlib.detection import init_detection_model + +from facexlib.utils import load_file_from_url +from src.face3d.util.my_awing_arch import FAN + +def init_alignment_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'awing_fan': + model = FAN(num_modules=4, num_landmarks=98, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/alignment_WFLW_4HG.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + model.load_state_dict(torch.load(model_path, map_location=device)['state_dict'], strict=True) + model.eval() + model = model.to(device) + return model + class KeypointExtractor(): def __init__(self, device='cuda'): diff --git a/src/face3d/util/my_awing_arch.py b/src/face3d/util/my_awing_arch.py new file mode 100644 index 00000000..cd565617 --- /dev/null +++ b/src/face3d/util/my_awing_arch.py @@ -0,0 +1,378 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def calculate_points(heatmaps): + # change heatmaps to landmarks + B, N, H, W = heatmaps.shape + HW = H * W + BN_range = np.arange(B * N) + + heatline = heatmaps.reshape(B, N, HW) + indexes = np.argmax(heatline, axis=2) + + preds = np.stack((indexes % W, indexes // W), axis=2) + preds = preds.astype(np.float, copy=False) + + inr = indexes.ravel() + + heatline = heatline.reshape(B * N, HW) + x_up = heatline[BN_range, inr + 1] + x_down = heatline[BN_range, inr - 1] + # y_up = heatline[BN_range, inr + W] + + if any((inr + W) >= 4096): + y_up = heatline[BN_range, 4095] + else: + y_up = heatline[BN_range, inr + W] + if any((inr - W) <= 0): + y_down = heatline[BN_range, 0] + else: + y_down = heatline[BN_range, inr - W] + + think_diff = np.sign(np.stack((x_up - x_down, y_up - y_down), axis=1)) + think_diff *= .25 + + preds += think_diff.reshape(B, N, 2) + preds += .5 + return preds + + +class 
AddCoordsTh(nn.Module): + + def __init__(self, x_dim=64, y_dim=64, with_r=False, with_boundary=False): + super(AddCoordsTh, self).__init__() + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + self.with_boundary = with_boundary + + def forward(self, input_tensor, heatmap=None): + """ + input_tensor: (batch, c, x_dim, y_dim) + """ + batch_size_tensor = input_tensor.shape[0] + + xx_ones = torch.ones([1, self.y_dim], dtype=torch.int32, device=input_tensor.device) + xx_ones = xx_ones.unsqueeze(-1) + + xx_range = torch.arange(self.x_dim, dtype=torch.int32, device=input_tensor.device).unsqueeze(0) + xx_range = xx_range.unsqueeze(1) + + xx_channel = torch.matmul(xx_ones.float(), xx_range.float()) + xx_channel = xx_channel.unsqueeze(-1) + + yy_ones = torch.ones([1, self.x_dim], dtype=torch.int32, device=input_tensor.device) + yy_ones = yy_ones.unsqueeze(1) + + yy_range = torch.arange(self.y_dim, dtype=torch.int32, device=input_tensor.device).unsqueeze(0) + yy_range = yy_range.unsqueeze(-1) + + yy_channel = torch.matmul(yy_range.float(), yy_ones.float()) + yy_channel = yy_channel.unsqueeze(-1) + + xx_channel = xx_channel.permute(0, 3, 2, 1) + yy_channel = yy_channel.permute(0, 3, 2, 1) + + xx_channel = xx_channel / (self.x_dim - 1) + yy_channel = yy_channel / (self.y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size_tensor, 1, 1, 1) + yy_channel = yy_channel.repeat(batch_size_tensor, 1, 1, 1) + + if self.with_boundary and heatmap is not None: + boundary_channel = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0) + + zero_tensor = torch.zeros_like(xx_channel) + xx_boundary_channel = torch.where(boundary_channel > 0.05, xx_channel, zero_tensor) + yy_boundary_channel = torch.where(boundary_channel > 0.05, yy_channel, zero_tensor) + if self.with_boundary and heatmap is not None: + xx_boundary_channel = xx_boundary_channel.to(input_tensor.device) + yy_boundary_channel = yy_boundary_channel.to(input_tensor.device) + ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1) + + if self.with_r: + rr = torch.sqrt(torch.pow(xx_channel, 2) + torch.pow(yy_channel, 2)) + rr = rr / torch.max(rr) + ret = torch.cat([ret, rr], dim=1) + + if self.with_boundary and heatmap is not None: + ret = torch.cat([ret, xx_boundary_channel, yy_boundary_channel], dim=1) + return ret + + +class CoordConvTh(nn.Module): + """CoordConv layer as in the paper.""" + + def __init__(self, x_dim, y_dim, with_r, with_boundary, in_channels, first_one=False, *args, **kwargs): + super(CoordConvTh, self).__init__() + self.addcoords = AddCoordsTh(x_dim=x_dim, y_dim=y_dim, with_r=with_r, with_boundary=with_boundary) + in_channels += 2 + if with_r: + in_channels += 1 + if with_boundary and not first_one: + in_channels += 2 + self.conv = nn.Conv2d(in_channels=in_channels, *args, **kwargs) + + def forward(self, input_tensor, heatmap=None): + ret = self.addcoords(input_tensor, heatmap) + last_channel = ret[:, -2:, :, :] + ret = self.conv(ret) + return ret, last_channel + + +def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False, dilation=1): + '3x3 convolution with padding' + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=strd, padding=padding, bias=bias, dilation=dilation) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + # self.bn1 = nn.BatchNorm2d(planes) + self.relu = 
nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + # self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + + out = self.conv2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ConvBlock(nn.Module): + + def __init__(self, in_planes, out_planes): + super(ConvBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = conv3x3(in_planes, int(out_planes / 2)) + self.bn2 = nn.BatchNorm2d(int(out_planes / 2)) + self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4), padding=1, dilation=1) + self.bn3 = nn.BatchNorm2d(int(out_planes / 4)) + self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4), padding=1, dilation=1) + + if in_planes != out_planes: + self.downsample = nn.Sequential( + nn.BatchNorm2d(in_planes), + nn.ReLU(True), + nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, bias=False), + ) + else: + self.downsample = None + + def forward(self, x): + residual = x + + out1 = self.bn1(x) + out1 = F.relu(out1, True) + out1 = self.conv1(out1) + + out2 = self.bn2(out1) + out2 = F.relu(out2, True) + out2 = self.conv2(out2) + + out3 = self.bn3(out2) + out3 = F.relu(out3, True) + out3 = self.conv3(out3) + + out3 = torch.cat((out1, out2, out3), 1) + + if self.downsample is not None: + residual = self.downsample(residual) + + out3 += residual + + return out3 + + +class HourGlass(nn.Module): + + def __init__(self, num_modules, depth, num_features, first_one=False): + super(HourGlass, self).__init__() + self.num_modules = num_modules + self.depth = depth + self.features = num_features + self.coordconv = CoordConvTh( + x_dim=64, + y_dim=64, + with_r=True, + with_boundary=True, + in_channels=256, + first_one=first_one, + out_channels=256, + kernel_size=1, + stride=1, + padding=0) + self._generate_network(self.depth) + + def _generate_network(self, level): + self.add_module('b1_' + str(level), ConvBlock(256, 256)) + + self.add_module('b2_' + str(level), ConvBlock(256, 256)) + + if level > 1: + self._generate_network(level - 1) + else: + self.add_module('b2_plus_' + str(level), ConvBlock(256, 256)) + + self.add_module('b3_' + str(level), ConvBlock(256, 256)) + + def _forward(self, level, inp): + # Upper branch + up1 = inp + up1 = self._modules['b1_' + str(level)](up1) + + # Lower branch + low1 = F.avg_pool2d(inp, 2, stride=2) + low1 = self._modules['b2_' + str(level)](low1) + + if level > 1: + low2 = self._forward(level - 1, low1) + else: + low2 = low1 + low2 = self._modules['b2_plus_' + str(level)](low2) + + low3 = low2 + low3 = self._modules['b3_' + str(level)](low3) + + up2 = F.interpolate(low3, scale_factor=2, mode='nearest') + + return up1 + up2 + + def forward(self, x, heatmap): + x, last_channel = self.coordconv(x, heatmap) + return self._forward(self.depth, x), last_channel + + +class FAN(nn.Module): + + def __init__(self, num_modules=1, end_relu=False, gray_scale=False, num_landmarks=68, device='cuda'): + super(FAN, self).__init__() + self.device = device + self.num_modules = num_modules + self.gray_scale = gray_scale + self.end_relu = end_relu + self.num_landmarks = num_landmarks + + # Base part + if self.gray_scale: + self.conv1 = CoordConvTh( + x_dim=256, + y_dim=256, + with_r=True, + with_boundary=False, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + else: + self.conv1 = CoordConvTh( + x_dim=256, + 
y_dim=256, + with_r=True, + with_boundary=False, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.conv2 = ConvBlock(64, 128) + self.conv3 = ConvBlock(128, 128) + self.conv4 = ConvBlock(128, 256) + + # Stacking part + for hg_module in range(self.num_modules): + if hg_module == 0: + first_one = True + else: + first_one = False + self.add_module('m' + str(hg_module), HourGlass(1, 4, 256, first_one)) + self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256)) + self.add_module('conv_last' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) + self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256)) + self.add_module('l' + str(hg_module), nn.Conv2d(256, num_landmarks + 1, kernel_size=1, stride=1, padding=0)) + + if hg_module < self.num_modules - 1: + self.add_module('bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) + self.add_module('al' + str(hg_module), + nn.Conv2d(num_landmarks + 1, 256, kernel_size=1, stride=1, padding=0)) + + def forward(self, x): + x, _ = self.conv1(x) + x = F.relu(self.bn1(x), True) + # x = F.relu(self.bn1(self.conv1(x)), True) + x = F.avg_pool2d(self.conv2(x), 2, stride=2) + x = self.conv3(x) + x = self.conv4(x) + + previous = x + + outputs = [] + boundary_channels = [] + tmp_out = None + for i in range(self.num_modules): + hg, boundary_channel = self._modules['m' + str(i)](previous, tmp_out) + + ll = hg + ll = self._modules['top_m_' + str(i)](ll) + + ll = F.relu(self._modules['bn_end' + str(i)](self._modules['conv_last' + str(i)](ll)), True) + + # Predict heatmaps + tmp_out = self._modules['l' + str(i)](ll) + if self.end_relu: + tmp_out = F.relu(tmp_out) # HACK: Added relu + outputs.append(tmp_out) + boundary_channels.append(boundary_channel) + + if i < self.num_modules - 1: + ll = self._modules['bl' + str(i)](ll) + tmp_out_ = self._modules['al' + str(i)](tmp_out) + previous = previous + ll + tmp_out_ + + return outputs, boundary_channels + + def get_landmarks(self, img): + H, W, _ = img.shape + offset = W / 64, H / 64, 0, 0 + + img = cv2.resize(img, (256, 256)) + inp = img[..., ::-1] + inp = torch.from_numpy(np.ascontiguousarray(inp.transpose((2, 0, 1)))).float() + inp = inp.to(self.device) + inp.div_(255.0).unsqueeze_(0) + + outputs, _ = self.forward(inp) + out = outputs[-1][:, :-1, :, :] + heatmaps = out.detach().cpu().numpy() + + pred = calculate_points(heatmaps).reshape(-1, 2) + + pred *= offset[:2] + pred += offset[-2:] + + return pred diff --git a/src/face3d/visualize.py b/src/face3d/visualize.py index 23a11108..d4652946 100644 --- a/src/face3d/visualize.py +++ b/src/face3d/visualize.py @@ -6,43 +6,48 @@ import torch import subprocess, platform import scipy.io as scio -from tqdm import tqdm +from tqdm import tqdm +import time +from src.utils.process_log import record_process_log + # draft def gen_composed_video(args, device, first_frame_coeff, coeff_path, audio_path, save_path, exp_dim=64): - coeff_first = scio.loadmat(first_frame_coeff)['full_3dmm'] coeff_pred = scio.loadmat(coeff_path)['coeff_3dmm'] - coeff_full = np.repeat(coeff_first, coeff_pred.shape[0], axis=0) # 257 + coeff_full = np.repeat(coeff_first, coeff_pred.shape[0], axis=0) # 257 coeff_full[:, 80:144] = coeff_pred[:, 0:64] - coeff_full[:, 224:227] = coeff_pred[:, 64:67] # 3 dim translation - coeff_full[:, 254:] = coeff_pred[:, 67:] # 3 dim translation + coeff_full[:, 224:227] = coeff_pred[:, 64:67] # 3 dim translation + coeff_full[:, 254:] = coeff_pred[:, 67:] # 3 
dim translation tmp_video_path = '/tmp/face3dtmp.mp4' facemodel = FaceReconModel(args) - + t = time.time() video = cv2.VideoWriter(tmp_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 25, (224, 224)) - + record_process_log("gen_composed_video", "cv2.VideoWriter", time.time()-t) + t = time.time() for k in tqdm(range(coeff_pred.shape[0]), 'face3d rendering:'): - cur_coeff_full = torch.tensor(coeff_full[k:k+1], device=device) + cur_coeff_full = torch.tensor(coeff_full[k:k + 1], device=device) facemodel.forward(cur_coeff_full, device) - predicted_landmark = facemodel.pred_lm # TODO. + predicted_landmark = facemodel.pred_lm # TODO. predicted_landmark = predicted_landmark.cpu().numpy().squeeze() rendered_img = facemodel.pred_face - rendered_img = 255. * rendered_img.cpu().numpy().squeeze().transpose(1,2,0) + rendered_img = 255. * rendered_img.cpu().numpy().squeeze().transpose(1, 2, 0) out_img = rendered_img[:, :, :3].astype(np.uint8) - video.write(np.uint8(out_img[:,:,::-1])) + video.write(np.uint8(out_img[:, :, ::-1])) video.release() + record_process_log("gen_composed_video", "face3d rendering", time.time() - t) + t = time.time() command = 'ffmpeg -v quiet -y -i {} -i {} -strict -2 -q:v 1 {}'.format(audio_path, tmp_video_path, save_path) subprocess.call(command, shell=platform.system() != 'Windows') - + record_process_log("gen_composed_video", "ffmpeg", time.time() - t) diff --git a/src/facerender/animate.py b/src/facerender/animate.py index 781f5a33..2ada81cc 100644 --- a/src/facerender/animate.py +++ b/src/facerender/animate.py @@ -5,45 +5,56 @@ import warnings from skimage import img_as_ubyte import safetensors -import safetensors.torch -warnings.filterwarnings('ignore') +import safetensors.torch +warnings.filterwarnings('ignore') import imageio import torch import torchvision - from src.facerender.modules.keypoint_detector import HEEstimator, KPDetector from src.facerender.modules.mapping import MappingNet from src.facerender.modules.generator import OcclusionAwareGenerator, OcclusionAwareSPADEGenerator -from src.facerender.modules.make_animation import make_animation +from src.facerender.modules.make_animation import make_animation -from pydub import AudioSegment +from pydub import AudioSegment from src.utils.face_enhancer import enhancer_generator_with_len, enhancer_list from src.utils.paste_pic import paste_pic from src.utils.videoio import save_video_with_watermark +import time +from src.utils.process_log import record_process_log, get_file_size try: import webui # in webui + in_webui = True except: in_webui = False + class AnimateFromCoeff(): def __init__(self, sadtalker_path, device): with open(sadtalker_path['facerender_yaml']) as f: config = yaml.safe_load(f) - + t = time.time() generator = OcclusionAwareSPADEGenerator(**config['model_params']['generator_params'], - **config['model_params']['common_params']) + **config['model_params']['common_params']) + record_process_log(self.__class__.__name__, "OcclusionAwareSPADEGenerator", time.time()-t) + t = time.time() kp_extractor = KPDetector(**config['model_params']['kp_detector_params'], - **config['model_params']['common_params']) + **config['model_params']['common_params']) + record_process_log(self.__class__.__name__, "KPDetector", time.time() - t) + + t = time.time() he_estimator = HEEstimator(**config['model_params']['he_estimator_params'], - **config['model_params']['common_params']) + **config['model_params']['common_params']) + record_process_log(self.__class__.__name__, "HEEstimator", time.time() - t) + t = time.time() mapping = 
MappingNet(**config['model_params']['mapping_params']) + record_process_log(self.__class__.__name__, "HEEstimator", time.time() - t) generator.to(device) kp_extractor.to(device) @@ -52,24 +63,26 @@ def __init__(self, sadtalker_path, device): for param in generator.parameters(): param.requires_grad = False for param in kp_extractor.parameters(): - param.requires_grad = False + param.requires_grad = False for param in he_estimator.parameters(): param.requires_grad = False for param in mapping.parameters(): param.requires_grad = False if sadtalker_path is not None: - if 'checkpoint' in sadtalker_path: # use safe tensor - self.load_cpk_facevid2vid_safetensor(sadtalker_path['checkpoint'], kp_detector=kp_extractor, generator=generator, he_estimator=None) + if 'checkpoint' in sadtalker_path: # use safe tensor + self.load_cpk_facevid2vid_safetensor(sadtalker_path['checkpoint'], kp_detector=kp_extractor, + generator=generator, he_estimator=None) else: - self.load_cpk_facevid2vid(sadtalker_path['free_view_checkpoint'], kp_detector=kp_extractor, generator=generator, he_estimator=he_estimator) + self.load_cpk_facevid2vid(sadtalker_path['free_view_checkpoint'], kp_detector=kp_extractor, + generator=generator, he_estimator=he_estimator) else: raise AttributeError("Checkpoint should be specified for video head pose estimator.") - if sadtalker_path['mappingnet_checkpoint'] is not None: + if sadtalker_path['mappingnet_checkpoint'] is not None: self.load_cpk_mapping(sadtalker_path['mappingnet_checkpoint'], mapping=mapping) else: - raise AttributeError("Checkpoint should be specified for video head pose estimator.") + raise AttributeError("Checkpoint should be specified for video head pose estimator.") self.kp_extractor = kp_extractor self.generator = generator @@ -80,40 +93,40 @@ def __init__(self, sadtalker_path, device): self.generator.eval() self.he_estimator.eval() self.mapping.eval() - + self.device = device - - def load_cpk_facevid2vid_safetensor(self, checkpoint_path, generator=None, - kp_detector=None, he_estimator=None, - device="cpu"): + + def load_cpk_facevid2vid_safetensor(self, checkpoint_path, generator=None, + kp_detector=None, he_estimator=None, + device="cpu"): checkpoint = safetensors.torch.load_file(checkpoint_path) if generator is not None: x_generator = {} - for k,v in checkpoint.items(): + for k, v in checkpoint.items(): if 'generator' in k: x_generator[k.replace('generator.', '')] = v generator.load_state_dict(x_generator) if kp_detector is not None: x_generator = {} - for k,v in checkpoint.items(): + for k, v in checkpoint.items(): if 'kp_extractor' in k: x_generator[k.replace('kp_extractor.', '')] = v kp_detector.load_state_dict(x_generator) if he_estimator is not None: x_generator = {} - for k,v in checkpoint.items(): + for k, v in checkpoint.items(): if 'he_estimator' in k: x_generator[k.replace('he_estimator.', '')] = v he_estimator.load_state_dict(x_generator) - + return None - def load_cpk_facevid2vid(self, checkpoint_path, generator=None, discriminator=None, - kp_detector=None, he_estimator=None, optimizer_generator=None, - optimizer_discriminator=None, optimizer_kp_detector=None, - optimizer_he_estimator=None, device="cpu"): + def load_cpk_facevid2vid(self, checkpoint_path, generator=None, discriminator=None, + kp_detector=None, he_estimator=None, optimizer_generator=None, + optimizer_discriminator=None, optimizer_kp_detector=None, + optimizer_he_estimator=None, device="cpu"): checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) if generator is not 
None: generator.load_state_dict(checkpoint['generator']) @@ -123,26 +136,26 @@ def load_cpk_facevid2vid(self, checkpoint_path, generator=None, discriminator=No he_estimator.load_state_dict(checkpoint['he_estimator']) if discriminator is not None: try: - discriminator.load_state_dict(checkpoint['discriminator']) + discriminator.load_state_dict(checkpoint['discriminator']) except: - print ('No discriminator in the state-dict. Dicriminator will be randomly initialized') + print('No discriminator in the state-dict. Dicriminator will be randomly initialized') if optimizer_generator is not None: optimizer_generator.load_state_dict(checkpoint['optimizer_generator']) if optimizer_discriminator is not None: try: optimizer_discriminator.load_state_dict(checkpoint['optimizer_discriminator']) except RuntimeError as e: - print ('No discriminator optimizer in the state-dict. Optimizer will be not initialized') + print('No discriminator optimizer in the state-dict. Optimizer will be not initialized') if optimizer_kp_detector is not None: optimizer_kp_detector.load_state_dict(checkpoint['optimizer_kp_detector']) if optimizer_he_estimator is not None: optimizer_he_estimator.load_state_dict(checkpoint['optimizer_he_estimator']) return checkpoint['epoch'] - + def load_cpk_mapping(self, checkpoint_path, mapping=None, discriminator=None, - optimizer_mapping=None, optimizer_discriminator=None, device='cpu'): - checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) + optimizer_mapping=None, optimizer_discriminator=None, device='cpu'): + checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) if mapping is not None: mapping.load_state_dict(checkpoint['mapping']) if discriminator is not None: @@ -154,14 +167,15 @@ def load_cpk_mapping(self, checkpoint_path, mapping=None, discriminator=None, return checkpoint['epoch'] - def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, background_enhancer=None, preprocess='crop', img_size=256): - - source_image=x['source_image'].type(torch.FloatTensor) - source_semantics=x['source_semantics'].type(torch.FloatTensor) - target_semantics=x['target_semantics_list'].type(torch.FloatTensor) - source_image=source_image.to(self.device) - source_semantics=source_semantics.to(self.device) - target_semantics=target_semantics.to(self.device) + def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, background_enhancer=None, + preprocess='crop', img_size=256): + t = time.time() + source_image = x['source_image'].type(torch.FloatTensor) + source_semantics = x['source_semantics'].type(torch.FloatTensor) + target_semantics = x['target_semantics_list'].type(torch.FloatTensor) + source_image = source_image.to(self.device) + source_semantics = source_semantics.to(self.device) + target_semantics = target_semantics.to(self.device) if 'yaw_c_seq' in x: yaw_c_seq = x['yaw_c_seq'].type(torch.FloatTensor) yaw_c_seq = x['yaw_c_seq'].to(self.device) @@ -173,85 +187,125 @@ def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, backgr else: pitch_c_seq = None if 'roll_c_seq' in x: - roll_c_seq = x['roll_c_seq'].type(torch.FloatTensor) + roll_c_seq = x['roll_c_seq'].type(torch.FloatTensor) roll_c_seq = x['roll_c_seq'].to(self.device) else: roll_c_seq = None - frame_num = x['frame_num'] + record_process_log(self.__class__.__name__, "generate", time.time()-t, "source_image") + frame_num = x['frame_num'] + t = time.time() predictions_video = make_animation(source_image, source_semantics, target_semantics, - 
self.generator, self.kp_extractor, self.he_estimator, self.mapping, - yaw_c_seq, pitch_c_seq, roll_c_seq, use_exp = True) + self.generator, self.kp_extractor, self.he_estimator, self.mapping, + yaw_c_seq, pitch_c_seq, roll_c_seq, use_exp=True) + # call make_animation v100 消耗时长:16.85565281 + record_process_log(self.__class__.__name__, "generate", time.time()-t, "make_animation") - predictions_video = predictions_video.reshape((-1,)+predictions_video.shape[2:]) + t = time.time() + predictions_video = predictions_video.reshape((-1,) + predictions_video.shape[2:]) predictions_video = predictions_video[:frame_num] + record_process_log(self.__class__.__name__, "generate", time.time()-t, "predictions_video") video = [] + t = time.time() for idx in range(predictions_video.shape[0]): image = predictions_video[idx] image = np.transpose(image.data.cpu().numpy(), [1, 2, 0]).astype(np.float32) video.append(image) - result = img_as_ubyte(video) + record_process_log(self.__class__.__name__, "generate", time.time()-t, "range(predictions_video.shape[0])") + t = time.time() + result = img_as_ubyte(video) ### the generated video is 256x256, so we keep the aspect ratio, original_size = crop_info[0] if original_size: - result = [ cv2.resize(result_i,(img_size, int(img_size * original_size[1]/original_size[0]) )) for result_i in result ] - - video_name = x['video_name'] + '.mp4' - path = os.path.join(video_save_dir, 'temp_'+video_name) - - imageio.mimsave(path, result, fps=float(25)) + result = [cv2.resize(result_i, (img_size, int(img_size * original_size[1] / original_size[0]))) for result_i + in result] + record_process_log(self.__class__.__name__, "generate", time.time()-t, "original_size") + + video_name = x['video_name'] + '.mp4' + path = os.path.join(video_save_dir, 'temp_' + video_name) + + t = time.time() + + imageio.mimsave(path, result, fps=float(25)) + record_process_log(self.__class__.__name__, "generate", time.time()-t, "imageio.mimsave") av_path = os.path.join(video_save_dir, video_name) - return_path = av_path - - audio_path = x['audio_path'] + return_path = av_path + + audio_path = x['audio_path'] audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0] - new_audio_path = os.path.join(video_save_dir, audio_name+'.wav') + new_audio_path = os.path.join(video_save_dir, audio_name + '.wav') start_time = 0 # cog will not keep the .mp3 filename sound = AudioSegment.from_file(audio_path) - frames = frame_num - end_time = start_time + frames*1/25*1000 - word1=sound.set_frame_rate(16000) + frames = frame_num + end_time = start_time + frames * 1 / 25 * 1000 + word1 = sound.set_frame_rate(16000) word = word1[start_time:end_time] word.export(new_audio_path, format="wav") - save_video_with_watermark(path, new_audio_path, av_path, watermark= False) - print(f'The generated video is named {video_save_dir}/{video_name}') + save_video_with_watermark(path, new_audio_path, av_path, watermark=False) + print(f'The generated video is named {video_save_dir}/{video_name}') + t = time.time() if 'full' in preprocess.lower(): # only add watermark to the full image. - video_name_full = x['video_name'] + '_full.mp4' + video_name_full = x['video_name'] + '_full.mp4' full_video_path = os.path.join(video_save_dir, video_name_full) return_path = full_video_path - paste_pic(path, pic_path, crop_info, new_audio_path, full_video_path, extended_crop= True if 'ext' in preprocess.lower() else False) - print(f'The generated video is named {video_save_dir}/{video_name_full}') + # TODO (qingyuan): paste_pic takes 9s. 
Check the comments inside the function on how to optimize it
+            paste_pic(path, pic_path, crop_info, new_audio_path, full_video_path,
+                      extended_crop=True if 'ext' in preprocess.lower() else False)
+
+            record_process_log(self.__class__.__name__, "generate", time.time() - t, "paste_pic")
+            print(f'The generated video is named {video_save_dir}/{video_name_full}')
         else:
-            full_video_path = av_path
+            full_video_path = av_path

-        #### paste back then enhancers
+        #### paste back then enhancers
         if enhancer:
-            video_name_enhancer = x['video_name'] + '_enhanced.mp4'
-            enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer)
-            av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
+            video_name_enhancer = x['video_name'] + '_enhanced.mp4'
+            enhanced_path = os.path.join(video_save_dir, 'temp_' + video_name_enhancer)
+            av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
             return_path = av_path_enhancer
+            t = time.time()
             try:
-                enhanced_images_gen_with_len = enhancer_generator_with_len(full_video_path, method=enhancer, bg_upsampler=background_enhancer)
+                et = time.time()
+                enhanced_images_gen_with_len = enhancer_generator_with_len(full_video_path, method=enhancer,
+                                                                           bg_upsampler=background_enhancer)
+                record_process_log(self.__class__.__name__, "generate", time.time() - et, "enhancer_generator_with_len")
+                it = time.time()
                 imageio.mimsave(enhanced_path, enhanced_images_gen_with_len, fps=float(25))
+                # enhancer_generator_with_len.imageio.mimsave elapsed time: 50.53366041 s
+                # TODO (qingyuan): mimsave is an alias for mimwrite. The source code is here:
+                # https://github.com/imageio/imageio/blob/dac86e36696d4afbd2b6588d8fd119107cdfaf3f/imageio/v2.py#L469
+                # All the time is spent on disk i/o. We may send the video back directly without writing to disk.
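One way to act on the TODO above ("send the video back directly without writing to disk") would be to pipe raw frames straight into ffmpeg and read the encoded MP4 from stdout. The helper below is only a sketch of that idea, not part of the patch; it assumes `ffmpeg` is on `PATH` and that the frames are same-sized uint8 RGB arrays with even dimensions:

```python
# Hypothetical helper illustrating the "no temp file" idea from the TODO above.
import subprocess
import numpy as np


def encode_frames_to_mp4_bytes(frames, fps=25):
    """Encode a sequence of HxWx3 uint8 RGB frames to an in-memory MP4 (bytes)."""
    h, w = frames[0].shape[:2]
    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-f", "rawvideo", "-pix_fmt", "rgb24", "-s", f"{w}x{h}", "-r", str(fps), "-i", "-",
        "-c:v", "libx264", "-pix_fmt", "yuv420p",
        "-movflags", "frag_keyframe+empty_moov",  # fragmented MP4 so a non-seekable pipe works
        "-f", "mp4", "pipe:1",
    ]
    raw = np.ascontiguousarray(frames, dtype=np.uint8).tobytes()
    proc = subprocess.run(cmd, input=raw, stdout=subprocess.PIPE, check=True)
    return proc.stdout  # MP4 container as bytes, ready to hand back to the caller
```

The audio mux done later by `save_video_with_watermark` would still need a pass over the result, so how much disk I/O this actually removes depends on how the rest of the pipeline consumes the file.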
+ record_process_log(self.__class__.__name__, "generate", time.time() - it, "enhancer_generator_with_len.imageio.mimsave") + except: - enhanced_images_gen_with_len = enhancer_list(full_video_path, method=enhancer, bg_upsampler=background_enhancer) + _t = time.time() + enhanced_images_gen_with_len = enhancer_list(full_video_path, method=enhancer, + bg_upsampler=background_enhancer) + record_process_log(self.__class__.__name__, "generate", time.time() - _t, "enhancer_list") + it = time.time() imageio.mimsave(enhanced_path, enhanced_images_gen_with_len, fps=float(25)) - - save_video_with_watermark(enhanced_path, new_audio_path, av_path_enhancer, watermark= False) + record_process_log(self.__class__.__name__, "generate", time.time() - it, "enhancer_list.imageio.mimsave") + + # enhanced_images_gen_with_len,支持try except 这部分block 整体时间:50.68383718,99%的时间都是消耗在 enhancer_generator_with_len.imageio.mimsave + record_process_log(self.__class__.__name__, "generate", time.time() - t, "enhanced_images_gen_with_len") + save_video_with_watermark(enhanced_path, new_audio_path, av_path_enhancer, watermark=False) print(f'The generated video is named {video_save_dir}/{video_name_enhancer}') + record_process_log(self.__class__.__name__, "generate", 0, f"av_path_enhancer:{get_file_size(av_path_enhancer)}") + + record_process_log(self.__class__.__name__, "generate", 0, f"enhanced_path:{get_file_size(enhanced_path)}") os.remove(enhanced_path) + record_process_log(self.__class__.__name__, "generate", 0, + f"new_audio_path:{get_file_size(new_audio_path)}") os.remove(path) os.remove(new_audio_path) - return return_path - diff --git a/src/facerender/modules/make_animation.py b/src/facerender/modules/make_animation.py index 3360c535..9ca93356 100644 --- a/src/facerender/modules/make_animation.py +++ b/src/facerender/modules/make_animation.py @@ -2,7 +2,10 @@ import torch import torch.nn.functional as F import numpy as np -from tqdm import tqdm +from tqdm import tqdm +import time +from src.utils.process_log import record_process_log + def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale=False, use_relative_movement=False, use_relative_jacobian=False): @@ -26,14 +29,16 @@ def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale return kp_new + def headpose_pred_to_degree(pred): device = pred.device idx_tensor = [idx for idx in range(66)] idx_tensor = torch.FloatTensor(idx_tensor).type_as(pred).to(device) pred = F.softmax(pred) - degree = torch.sum(pred*idx_tensor, 1) * 3 - 99 + degree = torch.sum(pred * idx_tensor, 1) * 3 - 99 return degree + def get_rotation_matrix(yaw, pitch, roll): yaw = yaw / 180 * 3.14 pitch = pitch / 180 * 3.14 @@ -43,29 +48,30 @@ def get_rotation_matrix(yaw, pitch, roll): pitch = pitch.unsqueeze(1) yaw = yaw.unsqueeze(1) - pitch_mat = torch.cat([torch.ones_like(pitch), torch.zeros_like(pitch), torch.zeros_like(pitch), - torch.zeros_like(pitch), torch.cos(pitch), -torch.sin(pitch), - torch.zeros_like(pitch), torch.sin(pitch), torch.cos(pitch)], dim=1) + pitch_mat = torch.cat([torch.ones_like(pitch), torch.zeros_like(pitch), torch.zeros_like(pitch), + torch.zeros_like(pitch), torch.cos(pitch), -torch.sin(pitch), + torch.zeros_like(pitch), torch.sin(pitch), torch.cos(pitch)], dim=1) pitch_mat = pitch_mat.view(pitch_mat.shape[0], 3, 3) - yaw_mat = torch.cat([torch.cos(yaw), torch.zeros_like(yaw), torch.sin(yaw), - torch.zeros_like(yaw), torch.ones_like(yaw), torch.zeros_like(yaw), - -torch.sin(yaw), torch.zeros_like(yaw), torch.cos(yaw)], dim=1) + 
yaw_mat = torch.cat([torch.cos(yaw), torch.zeros_like(yaw), torch.sin(yaw), + torch.zeros_like(yaw), torch.ones_like(yaw), torch.zeros_like(yaw), + -torch.sin(yaw), torch.zeros_like(yaw), torch.cos(yaw)], dim=1) yaw_mat = yaw_mat.view(yaw_mat.shape[0], 3, 3) - roll_mat = torch.cat([torch.cos(roll), -torch.sin(roll), torch.zeros_like(roll), - torch.sin(roll), torch.cos(roll), torch.zeros_like(roll), - torch.zeros_like(roll), torch.zeros_like(roll), torch.ones_like(roll)], dim=1) + roll_mat = torch.cat([torch.cos(roll), -torch.sin(roll), torch.zeros_like(roll), + torch.sin(roll), torch.cos(roll), torch.zeros_like(roll), + torch.zeros_like(roll), torch.zeros_like(roll), torch.ones_like(roll)], dim=1) roll_mat = roll_mat.view(roll_mat.shape[0], 3, 3) rot_mat = torch.einsum('bij,bjk,bkm->bim', pitch_mat, yaw_mat, roll_mat) return rot_mat + def keypoint_transformation(kp_canonical, he, wo_exp=False): - kp = kp_canonical['value'] # (bs, k, 3) - yaw, pitch, roll= he['yaw'], he['pitch'], he['roll'] - yaw = headpose_pred_to_degree(yaw) + kp = kp_canonical['value'] # (bs, k, 3) + yaw, pitch, roll = he['yaw'], he['pitch'], he['roll'] + yaw = headpose_pred_to_degree(yaw) pitch = headpose_pred_to_degree(pitch) roll = headpose_pred_to_degree(roll) @@ -76,18 +82,18 @@ def keypoint_transformation(kp_canonical, he, wo_exp=False): if 'roll_in' in he: roll = he['roll_in'] - rot_mat = get_rotation_matrix(yaw, pitch, roll) # (bs, 3, 3) + rot_mat = get_rotation_matrix(yaw, pitch, roll) # (bs, 3, 3) t, exp = he['t'], he['exp'] if wo_exp: - exp = exp*0 - - # keypoint rotation + exp = exp * 0 + + # keypoint rotation kp_rotated = torch.einsum('bmp,bkp->bkm', rot_mat, kp) # keypoint translation - t[:, 0] = t[:, 0]*0 - t[:, 2] = t[:, 2]*0 + t[:, 0] = t[:, 0] * 0 + t[:, 2] = t[:, 2] * 0 t = t.unsqueeze(1).repeat(1, kp.shape[1], 1) kp_t = kp_rotated + t @@ -98,34 +104,40 @@ def keypoint_transformation(kp_canonical, he, wo_exp=False): return {'value': kp_transformed} - def make_animation(source_image, source_semantics, target_semantics, - generator, kp_detector, he_estimator, mapping, - yaw_c_seq=None, pitch_c_seq=None, roll_c_seq=None, - use_exp=True, use_half=False): + generator, kp_detector, he_estimator, mapping, + yaw_c_seq=None, pitch_c_seq=None, roll_c_seq=None, + use_exp=True, use_half=False): with torch.no_grad(): predictions = [] kp_canonical = kp_detector(source_image) he_source = mapping(source_semantics) kp_source = keypoint_transformation(kp_canonical, he_source) - + t = time.time() for frame_idx in tqdm(range(target_semantics.shape[1]), 'Face Renderer:'): # still check the dimension # print(target_semantics.shape, source_semantics.shape) + for_loop_t = time.time() target_semantics_frame = target_semantics[:, frame_idx] he_driving = mapping(target_semantics_frame) if yaw_c_seq is not None: he_driving['yaw_in'] = yaw_c_seq[:, frame_idx] if pitch_c_seq is not None: - he_driving['pitch_in'] = pitch_c_seq[:, frame_idx] + he_driving['pitch_in'] = pitch_c_seq[:, frame_idx] if roll_c_seq is not None: - he_driving['roll_in'] = roll_c_seq[:, frame_idx] - + he_driving['roll_in'] = roll_c_seq[:, frame_idx] + kp_driving = keypoint_transformation(kp_canonical, he_driving) - + # record_process_log("make_animation", "Face Renderer for loop: keypoint_transformation", + # time.time() - for_loop_t, + # f"frame_idx:{frame_idx}") kp_norm = kp_driving + gt = time.time() out = generator(source_image, kp_source=kp_source, kp_driving=kp_norm) + # record_process_log("make_animation", "Face Renderer for loop: generator", + # 
time.time() - gt, + # f"frame_idx:{frame_idx}") ''' source_image_new = out['prediction'].squeeze(1) kp_canonical_new = kp_detector(source_image_new) @@ -135,9 +147,16 @@ def make_animation(source_image, source_semantics, target_semantics, out = generator(source_image_new, kp_source=kp_source_new, kp_driving=kp_driving_new) ''' predictions.append(out['prediction']) + + # record_process_log("make_animation", "Face Renderer for loop", time.time() - for_loop_t, f"frame_idx:{frame_idx}") + predictions_ts = torch.stack(predictions, dim=1) + # Face Renderer 总消耗时长:16.83923388,可以使用并行处理for loop + # TODO(qingyuan): change forloop to parallel processing + record_process_log("make_animation", "Face Renderer", time.time()-t) return predictions_ts + class AnimateModel(torch.nn.Module): """ Merge all generator related updates into single model for better multi-gpu usage @@ -154,7 +173,6 @@ def __init__(self, generator, kp_extractor, mapping): self.mapping.eval() def forward(self, x): - source_image = x['source_image'] source_semantics = x['source_semantics'] target_semantics = x['target_semantics'] @@ -163,8 +181,8 @@ def forward(self, x): roll_c_seq = x['roll_c_seq'] predictions_video = make_animation(source_image, source_semantics, target_semantics, - self.generator, self.kp_extractor, - self.mapping, use_exp = True, - yaw_c_seq=yaw_c_seq, pitch_c_seq=pitch_c_seq, roll_c_seq=roll_c_seq) - - return predictions_video \ No newline at end of file + self.generator, self.kp_extractor, + self.mapping, use_exp=True, + yaw_c_seq=yaw_c_seq, pitch_c_seq=pitch_c_seq, roll_c_seq=roll_c_seq) + + return predictions_video diff --git a/src/generate_facerender_batch.py b/src/generate_facerender_batch.py index a62b6edf..03db82ee 100644 --- a/src/generate_facerender_batch.py +++ b/src/generate_facerender_batch.py @@ -5,15 +5,15 @@ import torch import scipy.io as scio -def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, - batch_size, input_yaw_list=None, input_pitch_list=None, input_roll_list=None, - expression_scale=1.0, still_mode = False, preprocess='crop', size = 256): +def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, + batch_size, input_yaw_list=None, input_pitch_list=None, input_roll_list=None, + expression_scale=1.0, still_mode=False, preprocess='crop', size=256): semantic_radius = 13 video_name = os.path.splitext(os.path.split(coeff_path)[-1])[0] txt_path = os.path.splitext(coeff_path)[0] - data={} + data = {} img1 = Image.open(pic_path) source_image = np.array(img1) @@ -23,17 +23,17 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, source_image_ts = torch.FloatTensor(source_image).unsqueeze(0) source_image_ts = source_image_ts.repeat(batch_size, 1, 1, 1) data['source_image'] = source_image_ts - + source_semantics_dict = scio.loadmat(first_coeff_path) generated_dict = scio.loadmat(coeff_path) if 'full' not in preprocess.lower(): - source_semantics = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70 - generated_3dmm = generated_dict['coeff_3dmm'][:,:70] + source_semantics = source_semantics_dict['coeff_3dmm'][:1, :70] # 1 70 + generated_3dmm = generated_dict['coeff_3dmm'][:, :70] else: - source_semantics = source_semantics_dict['coeff_3dmm'][:1,:73] #1 70 - generated_3dmm = generated_dict['coeff_3dmm'][:,:70] + source_semantics = source_semantics_dict['coeff_3dmm'][:1, :73] # 1 70 + generated_3dmm = generated_dict['coeff_3dmm'][:, :70] source_semantics_new = transform_semantic_1(source_semantics, semantic_radius) source_semantics_ts = 
torch.FloatTensor(source_semantics_new).unsqueeze(0) @@ -44,35 +44,37 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, generated_3dmm[:, :64] = generated_3dmm[:, :64] * expression_scale if 'full' in preprocess.lower(): - generated_3dmm = np.concatenate([generated_3dmm, np.repeat(source_semantics[:,70:], generated_3dmm.shape[0], axis=0)], axis=1) + generated_3dmm = np.concatenate( + [generated_3dmm, np.repeat(source_semantics[:, 70:], generated_3dmm.shape[0], axis=0)], axis=1) if still_mode: generated_3dmm[:, 64:] = np.repeat(source_semantics[:, 64:], generated_3dmm.shape[0], axis=0) - with open(txt_path+'.txt', 'w') as f: + with open(txt_path + '.txt', 'w') as f: for coeff in generated_3dmm: for i in coeff: - f.write(str(i)[:7] + ' '+'\t') + f.write(str(i)[:7] + ' ' + '\t') f.write('\n') - target_semantics_list = [] + target_semantics_list = [] frame_num = generated_3dmm.shape[0] data['frame_num'] = frame_num for frame_idx in range(frame_num): target_semantics = transform_semantic_target(generated_3dmm, frame_idx, semantic_radius) target_semantics_list.append(target_semantics) - remainder = frame_num%batch_size - if remainder!=0: - for _ in range(batch_size-remainder): + remainder = frame_num % batch_size + if remainder != 0: + for _ in range(batch_size - remainder): target_semantics_list.append(target_semantics) - target_semantics_np = np.array(target_semantics_list) #frame_num 70 semantic_radius*2+1 - target_semantics_np = target_semantics_np.reshape(batch_size, -1, target_semantics_np.shape[-2], target_semantics_np.shape[-1]) + target_semantics_np = np.array(target_semantics_list) # frame_num 70 semantic_radius*2+1 + target_semantics_np = target_semantics_np.reshape(batch_size, -1, target_semantics_np.shape[-2], + target_semantics_np.shape[-1]) data['target_semantics_list'] = torch.FloatTensor(target_semantics_np) data['video_name'] = video_name data['audio_path'] = audio_path - + if input_yaw_list is not None: yaw_c_seq = gen_camera_pose(input_yaw_list, frame_num, batch_size) data['yaw_c_seq'] = torch.FloatTensor(yaw_c_seq) @@ -80,57 +82,58 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, pitch_c_seq = gen_camera_pose(input_pitch_list, frame_num, batch_size) data['pitch_c_seq'] = torch.FloatTensor(pitch_c_seq) if input_roll_list is not None: - roll_c_seq = gen_camera_pose(input_roll_list, frame_num, batch_size) + roll_c_seq = gen_camera_pose(input_roll_list, frame_num, batch_size) data['roll_c_seq'] = torch.FloatTensor(roll_c_seq) - + return data + def transform_semantic_1(semantic, semantic_radius): - semantic_list = [semantic for i in range(0, semantic_radius*2+1)] + semantic_list = [semantic for i in range(0, semantic_radius * 2 + 1)] coeff_3dmm = np.concatenate(semantic_list, 0) - return coeff_3dmm.transpose(1,0) + return coeff_3dmm.transpose(1, 0) + def transform_semantic_target(coeff_3dmm, frame_index, semantic_radius): num_frames = coeff_3dmm.shape[0] - seq = list(range(frame_index- semantic_radius, frame_index + semantic_radius+1)) - index = [ min(max(item, 0), num_frames-1) for item in seq ] + seq = list(range(frame_index - semantic_radius, frame_index + semantic_radius + 1)) + index = [min(max(item, 0), num_frames - 1) for item in seq] coeff_3dmm_g = coeff_3dmm[index, :] - return coeff_3dmm_g.transpose(1,0) + return coeff_3dmm_g.transpose(1, 0) -def gen_camera_pose(camera_degree_list, frame_num, batch_size): - new_degree_list = [] +def gen_camera_pose(camera_degree_list, frame_num, batch_size): + new_degree_list = [] 
if len(camera_degree_list) == 1: for _ in range(frame_num): - new_degree_list.append(camera_degree_list[0]) - remainder = frame_num%batch_size - if remainder!=0: - for _ in range(batch_size-remainder): + new_degree_list.append(camera_degree_list[0]) + remainder = frame_num % batch_size + if remainder != 0: + for _ in range(batch_size - remainder): new_degree_list.append(new_degree_list[-1]) - new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) + new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) return new_degree_np degree_sum = 0. for i, degree in enumerate(camera_degree_list[1:]): - degree_sum += abs(degree-camera_degree_list[i]) - - degree_per_frame = degree_sum/(frame_num-1) + degree_sum += abs(degree - camera_degree_list[i]) + + degree_per_frame = degree_sum / (frame_num - 1) for i, degree in enumerate(camera_degree_list[1:]): degree_last = camera_degree_list[i] - degree_step = degree_per_frame * abs(degree-degree_last)/(degree-degree_last) - new_degree_list = new_degree_list + list(np.arange(degree_last, degree, degree_step)) + degree_step = degree_per_frame * abs(degree - degree_last) / (degree - degree_last) + new_degree_list = new_degree_list + list(np.arange(degree_last, degree, degree_step)) if len(new_degree_list) > frame_num: new_degree_list = new_degree_list[:frame_num] elif len(new_degree_list) < frame_num: - for _ in range(frame_num-len(new_degree_list)): + for _ in range(frame_num - len(new_degree_list)): new_degree_list.append(new_degree_list[-1]) print(len(new_degree_list)) print(frame_num) - remainder = frame_num%batch_size - if remainder!=0: - for _ in range(batch_size-remainder): + remainder = frame_num % batch_size + if remainder != 0: + for _ in range(batch_size - remainder): new_degree_list.append(new_degree_list[-1]) - new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) + new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) return new_degree_np - diff --git a/src/test_audio2coeff.py b/src/test_audio2coeff.py index be66bffd..0cd1d352 100644 --- a/src/test_audio2coeff.py +++ b/src/test_audio2coeff.py @@ -1,4 +1,4 @@ -import os +import os import torch import numpy as np from scipy.io import savemat, loadmat @@ -6,12 +6,15 @@ from scipy.signal import savgol_filter import safetensors -import safetensors.torch +import safetensors.torch from src.audio2pose_models.audio2pose import Audio2Pose -from src.audio2exp_models.networks import SimpleWrapperV2 +from src.audio2exp_models.networks import SimpleWrapperV2 from src.audio2exp_models.audio2exp import Audio2Exp -from src.utils.safetensor_helper import load_x_from_safetensor +from src.utils.safetensor_helper import load_x_from_safetensor +from src.utils.process_log import record_process_log +import time + def load_cpk(checkpoint_path, model=None, optimizer=None, device="cpu"): checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) @@ -22,10 +25,11 @@ def load_cpk(checkpoint_path, model=None, optimizer=None, device="cpu"): return checkpoint['epoch'] + class Audio2Coeff(): def __init__(self, sadtalker_path, device): - #load config + # load config fcfg_pose = open(sadtalker_path['audio2pose_yaml_path']) cfg_pose = CN.load_cfg(fcfg_pose) cfg_pose.freeze() @@ -33,22 +37,31 @@ def __init__(self, sadtalker_path, device): cfg_exp = CN.load_cfg(fcfg_exp) cfg_exp.freeze() + t = time.time() + # load audio2pose_model self.audio2pose_model = Audio2Pose(cfg_pose, None, device=device) + + record_process_log(self.__class__.__name__, "Audio2Pose", time.time()-t, 
"self.audio2pose_model") + self.audio2pose_model = self.audio2pose_model.to(device) self.audio2pose_model.eval() for param in self.audio2pose_model.parameters(): - param.requires_grad = False - + param.requires_grad = False + + t = time.time() try: if sadtalker_path['use_safetensor']: checkpoints = safetensors.torch.load_file(sadtalker_path['checkpoint']) self.audio2pose_model.load_state_dict(load_x_from_safetensor(checkpoints, 'audio2pose')) + record_process_log(self.__class__.__name__, "self.audio2pose_model.load_state_dict", time.time()-t) else: load_cpk(sadtalker_path['audio2pose_checkpoint'], model=self.audio2pose_model, device=device) + record_process_log(self.__class__.__name__, "load_cpk", time.time() - t) except: raise Exception("Failed in loading audio2pose_checkpoint") + t = time.time() # load audio2exp_model netG = SimpleWrapperV2() netG = netG.to(device) @@ -63,62 +76,65 @@ def __init__(self, sadtalker_path, device): load_cpk(sadtalker_path['audio2exp_checkpoint'], model=netG, device=device) except: raise Exception("Failed in loading audio2exp_checkpoint") + + record_process_log(self.__class__.__name__, "load audio2exp_model", time.time() - t, "checkpoints") + + t = time.time() self.audio2exp_model = Audio2Exp(netG, cfg_exp, device=device, prepare_training_loss=False) self.audio2exp_model = self.audio2exp_model.to(device) for param in self.audio2exp_model.parameters(): param.requires_grad = False self.audio2exp_model.eval() - + record_process_log(self.__class__.__name__, "Audio2Exp", time.time() - t, "self.audio2exp_model") + self.device = device def generate(self, batch, coeff_save_dir, pose_style, ref_pose_coeff_path=None): with torch.no_grad(): - #test - results_dict_exp= self.audio2exp_model.test(batch) - exp_pred = results_dict_exp['exp_coeff_pred'] #bs T 64 + # test + results_dict_exp = self.audio2exp_model.test(batch) + exp_pred = results_dict_exp['exp_coeff_pred'] # bs T 64 - #for class_id in range(1): - #class_id = 0#(i+10)%45 - #class_id = random.randint(0,46) #46 styles can be selected + # for class_id in range(1): + # class_id = 0#(i+10)%45 + # class_id = random.randint(0,46) #46 styles can be selected batch['class'] = torch.LongTensor([pose_style]).to(self.device) - results_dict_pose = self.audio2pose_model.test(batch) - pose_pred = results_dict_pose['pose_pred'] #bs T 6 + results_dict_pose = self.audio2pose_model.test(batch) + pose_pred = results_dict_pose['pose_pred'] # bs T 6 pose_len = pose_pred.shape[1] - if pose_len<13: - pose_len = int((pose_len-1)/2)*2+1 + if pose_len < 13: + pose_len = int((pose_len - 1) / 2) * 2 + 1 pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), pose_len, 2, axis=1)).to(self.device) else: - pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), 13, 2, axis=1)).to(self.device) - - coeffs_pred = torch.cat((exp_pred, pose_pred), dim=-1) #bs T 70 + pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), 13, 2, axis=1)).to(self.device) + + coeffs_pred = torch.cat((exp_pred, pose_pred), dim=-1) # bs T 70 + + coeffs_pred_numpy = coeffs_pred[0].clone().detach().cpu().numpy() - coeffs_pred_numpy = coeffs_pred[0].clone().detach().cpu().numpy() + if ref_pose_coeff_path is not None: + coeffs_pred_numpy = self.using_refpose(coeffs_pred_numpy, ref_pose_coeff_path) - - if ref_pose_coeff_path is not None: - coeffs_pred_numpy = self.using_refpose(coeffs_pred_numpy, ref_pose_coeff_path) - - savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])), + 
savemat(os.path.join(coeff_save_dir, '%s##%s.mat' % (batch['pic_name'], batch['audio_name'])), {'coeff_3dmm': coeffs_pred_numpy}) - return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])) - + return os.path.join(coeff_save_dir, '%s##%s.mat' % (batch['pic_name'], batch['audio_name'])) + def using_refpose(self, coeffs_pred_numpy, ref_pose_coeff_path): num_frames = coeffs_pred_numpy.shape[0] refpose_coeff_dict = loadmat(ref_pose_coeff_path) - refpose_coeff = refpose_coeff_dict['coeff_3dmm'][:,64:70] + refpose_coeff = refpose_coeff_dict['coeff_3dmm'][:, 64:70] refpose_num_frames = refpose_coeff.shape[0] - if refpose_num_frames None: - model_name = TTS.list_models()[0] + model_name = TTS().list_models()[0] self.tts = TTS(model_name) def test(self, text, language='en'): @@ -17,4 +17,4 @@ def test(self, text, language='en'): self.tts.tts_to_file(text, speaker=self.tts.speakers[0], language=language, file_path=tempf.name) - return tempf.name \ No newline at end of file + return tempf.name diff --git a/src/utils/videoio.py b/src/utils/videoio.py index 08bfbdd7..98cf746a 100644 --- a/src/utils/videoio.py +++ b/src/utils/videoio.py @@ -38,4 +38,4 @@ def save_video_with_watermark(video, audio, save_path, watermark=False): cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -filter_complex "[1]scale=100:-1[wm];[0][wm]overlay=(main_w-overlay_w)-10:10" "%s"' % (temp_file, watarmark_path, save_path) os.system(cmd) - os.remove(temp_file) \ No newline at end of file + os.remove(temp_file)
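Finally, `make_animation` carries a `TODO(qingyuan): change forloop to parallel processing` and a measurement of roughly 16.8 s for the per-frame Face Renderer loop. One possible direction is to fold several frames into the generator's batch dimension instead of calling it once per frame. The sketch below is an assumption-laden illustration, not the patch's implementation: it presumes that `mapping` and `generator` accept an arbitrary leading batch size and that GPU memory allows `chunk` frames at once, and it drops the `he_estimator` argument and the yaw/pitch/roll override sequences for brevity.

```python
# Hypothetical batched variant of the Face Renderer loop in make_animation (sketch only).
import torch
from src.facerender.modules.make_animation import keypoint_transformation


@torch.no_grad()
def make_animation_batched(source_image, source_semantics, target_semantics,
                           generator, kp_detector, mapping, chunk=8):
    bs, num_frames = target_semantics.shape[:2]
    kp_canonical = kp_detector(source_image)
    kp_source = keypoint_transformation(kp_canonical, mapping(source_semantics))

    predictions = []
    for start in range(0, num_frames, chunk):
        sem = target_semantics[:, start:start + chunk]      # (bs, c, 70, 27)
        c = sem.shape[1]
        sem = sem.reshape(bs * c, *sem.shape[2:])           # fold frames into the batch dim
        he_driving = mapping(sem)

        # repeat the static inputs so they line up with the folded frames
        kp_canonical_rep = {'value': kp_canonical['value'].repeat_interleave(c, dim=0)}
        kp_driving = keypoint_transformation(kp_canonical_rep, he_driving)
        src = source_image.repeat_interleave(c, dim=0)
        kp_src = {'value': kp_source['value'].repeat_interleave(c, dim=0)}

        out = generator(src, kp_source=kp_src, kp_driving=kp_driving)
        predictions.append(out['prediction'].reshape(bs, c, *out['prediction'].shape[1:]))

    return torch.cat(predictions, dim=1)  # (bs, num_frames, 3, H, W), same layout as the original
```

Whether this helps in practice depends on how close a single-frame generator call already is to saturating the GPU; the chunk size would need to be tuned against memory.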