diff --git a/.gitignore b/.gitignore index 506d5d35..73c66b60 100644 --- a/.gitignore +++ b/.gitignore @@ -168,5 +168,7 @@ Dockerfile start_docker.sh start.sh +checkpoints + # Mac .DS_Store diff --git a/LICENSE b/LICENSE index b2a615ac..15319625 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,209 @@ -MIT License - -Copyright (c) 2023 Tencent AI Lab - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Tencent is pleased to support the open source community by making SadTalker available. + +Copyright (C), a Tencent company. All rights reserved. + +SadTalker is licensed under the Apache 2.0 License, except for the third-party components listed below. + +Terms of the Apache License Version 2.0: +--------------------------------------------- + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md index c330bdce..332615dc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ -     [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)   [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker)   [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)   [![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker) +     [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)   [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker)   [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)  
[![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker) [![Discord](https://dcbadge.vercel.app/api/server/rrayYqZ4tf?style=flat)](https://discord.gg/rrayYqZ4tf)
Wenxuan Zhang *,1,2   @@ -37,57 +37,62 @@ -## 🔥 Highlight +## Highlights -- 🔥 The extension of the [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) is online. Checkout more details [here](docs/webui_extension.md). +- The license has been updated to Apache 2.0, and we've removed the non-commercial restriction +- **SadTalker has now officially been integrated into Discord, where you can use it for free by sending files. You can also generate high-quailty videos from text prompts. Join: [![Discord](https://dcbadge.vercel.app/api/server/rrayYqZ4tf?style=flat)](https://discord.gg/rrayYqZ4tf)** -https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4 +- We've published a [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) extension. Check out more details [here](docs/webui_extension.md). [Demo Video](https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4) -- 🔥 `full image mode` is online! checkout [here](https://github.com/Winfredy/SadTalker#full-bodyimage-generation) for more details. +- Full image mode is now available! [More details...](https://github.com/OpenTalker/SadTalker#full-bodyimage-generation) | still+enhancer in v0.0.1 | still + enhancer in v0.0.2 | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) | |:--------------------: |:--------------------: | :----: | | | | -- 🔥 Several new mode, eg, `still mode`, `reference mode`, `resize mode` are online for better and custom applications. +- Several new modes (Still, reference, and resize modes) are now available! -- 🔥 Happy to see more community demos at [bilibili](https://search.bilibili.com/all?keyword=sadtalker&from_source=webtop_search&spm_id_from=333.1007&search_source=3 -), [Youtube](https://www.youtube.com/results?search_query=sadtalker&sp=CAM%253D) and [twitter #sadtalker](https://twitter.com/search?q=%23sadtalker&src=typed_query). +- We're happy to see more community demos on [bilibili](https://search.bilibili.com/all?keyword=sadtalker), [YouTube](https://www.youtube.com/results?search_query=sadtalker) and [X (#sadtalker)](https://twitter.com/search?q=%23sadtalker&src). -## 📋 Changelog (Previous changelog can be founded [here](docs/changlelog.md)) +## Changelog -- __[2023.06.12]__: add more new features in WEBUI extension, see the discussion [here](https://github.com/OpenTalker/SadTalker/discussions/386). +The previous changelog can be found [here](docs/changlelog.md). -- __[2023.06.05]__: release a new 512 beta face model. Fixed some bugs and improve the performance. +- __[2023.06.12]__: Added more new features in WebUI extension, see the discussion [here](https://github.com/OpenTalker/SadTalker/discussions/386). -- __[2023.04.15]__: Adding automatic1111 colab by @camenduru, thanks for this awesome colab: [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb). +- __[2023.06.05]__: Released a new 512x512px (beta) face model. Fixed some bugs and improve the performance. -- __[2023.04.12]__: adding a more detailed sd-webui installation document, fixed reinstallation problem. 
+- __[2023.04.15]__: Added a WebUI Colab notebook by [@camenduru](https://github.com/camenduru/): [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) -- __[2023.04.12]__: Fixed the sd-webui safe issues becasue of the 3rd packages, optimize the output path in `sd-webui-extension`. +- __[2023.04.12]__: Added a more detailed WebUI installation document and fixed a problem when reinstalling. -- __[2023.04.08]__: ❗️❗️❗️ In v0.0.2, we add a logo watermark to the generated video to prevent abusing since it is very realistic. +- __[2023.04.12]__: Fixed the WebUI safe issues becasue of 3rd-party packages, and optimized the output path in `sd-webui-extension`. -- __[2023.04.08]__: v0.0.2, full image animation, adding baidu driver for download checkpoints. Optimizing the logic about enhancer. +- __[2023.04.08]__: In v0.0.2, we added a logo watermark to the generated video to prevent abuse. _This watermark has since been removed in a later release._ +- __[2023.04.08]__: In v0.0.2, we added features for full image animation and a link to download checkpoints from Baidu. We also optimized the enhancer logic. -## 🚧 TODO: See the Discussion https://github.com/OpenTalker/SadTalker/issues/280 +## To-Do -## If you have any problem, please view our [FAQ](docs/FAQ.md) before opening an issue. +We're tracking new updates in [issue #280](https://github.com/OpenTalker/SadTalker/issues/280). +## Troubleshooting +If you have any problems, please read our [FAQs](docs/FAQ.md) before opening an issue. -## ⚙️ 1. Installation. -Tutorials from communities: [中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/) | [日本語コース](https://br-d.fanbox.cc/posts/5685086?utm_campaign=manage_post_page&utm_medium=share&utm_source=twitter) -### Linux: +## 1. Installation. -1. Installing [anaconda](https://www.anaconda.com/), python and git. +Community tutorials: [中文Windows教程 (Chinese Windows tutorial)](https://www.bilibili.com/video/BV1Dc411W7V6/) | [日本語コース (Japanese tutorial)](https://br-d.fanbox.cc/posts/5685086). + +### Linux/Unix + +1. Install [Anaconda](https://www.anaconda.com/), Python and `git`. 2. Creating the env and install the requirements. ```bash - git clone https://github.com/Winfredy/SadTalker.git + git clone https://github.com/OpenTalker/SadTalker.git cd SadTalker @@ -101,40 +106,54 @@ Tutorials from communities: [中文windows教程](https://www.bilibili.com/video pip install -r requirements.txt - ### tts is optional for gradio demo. + ### Coqui TTS is optional for gradio demo. ### pip install TTS ``` -### Windows ([中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/)): +### Windows + +A video tutorial in chinese is available [here](https://www.bilibili.com/video/BV1Dc411W7V6/). You can also follow the following instructions: + +1. Install [Python 3.8](https://www.python.org/downloads/windows/) and check "Add Python to PATH". +2. Install [git](https://git-scm.com/download/win) manually or using [Scoop](https://scoop.sh/): `scoop install git`. +3. Install `ffmpeg`, following [this tutorial](https://www.wikihow.com/Install-FFmpeg-on-Windows) or using [scoop](https://scoop.sh/): `scoop install ffmpeg`. +4. Download the SadTalker repository by running `git clone https://github.com/Winfredy/SadTalker.git`. +5. Download the checkpoints and gfpgan models in the [downloads section](#2-download-models). +6. 
Run `start.bat` from Windows Explorer as normal, non-administrator, user, and a Gradio-powered WebUI demo will be started. -1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH". -2. Install [git](https://git-scm.com/download/win) manually (OR `scoop install git` via [scoop](https://scoop.sh/)). -3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows) (OR using `scoop install ffmpeg` via [scoop](https://scoop.sh/)). -4. Download our SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`. -5. Download the `checkpoint` and `gfpgan` [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models). -5. Run `start.bat` from Windows Explorer as normal, non-administrator, user, a gradio WebUI demo will be started. +### macOS -### Macbook: +A tutorial on installing SadTalker on macOS can be found [here](docs/install.md). -More tips about installnation on Macbook and the Docker file can be founded [here](docs/install.md) +### Docker, WSL, etc -## 📥 2. Download Trained Models. +Please check out additional tutorials [here](docs/install.md). -You can run the following script to put all the models in the right place. +## 2. Download Models + +You can run the following script on Linux/macOS to automatically download all the models: ```bash bash scripts/download_models.sh ``` -Other alternatives: -> we also provide an offline patch (`gfpgan/`), thus, no model will be downloaded when generating. +We also provide an offline patch (`gfpgan/`), so no model will be downloaded when generating. + +### Pre-Trained Models + +* [Google Drive](https://drive.google.com/file/d/1gwWh45pF7aelNP_P78uDJL8Sycep-K7j/view?usp=sharing) +* [GitHub Releases](https://github.com/OpenTalker/SadTalker/releases) +* [Baidu (百度云盘)](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt) (Password: `sadt`) -**Google Driver**: download our pre-trained model from [ this link (main checkpoints)](https://drive.google.com/file/d/1gwWh45pF7aelNP_P78uDJL8Sycep-K7j/view?usp=sharing) and [ gfpgan (offline patch)](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing) + -**Github Release Page**: download all the files from the [lastest github release page](https://github.com/Winfredy/SadTalker/releases), and then, put it in ./checkpoints. +### GFPGAN Offline Patch -**百度云盘**: we provided the downloaded model in [checkpoints, 提取码: sadt.](https://pan.baidu.com/s/1P4fRgk9gaSutZnn8YW034Q?pwd=sadt) And [gfpgan, 提取码: sadt.](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt) +* [Google Drive](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing) +* [GitHub Releases](https://github.com/OpenTalker/SadTalker/releases) +* [Baidu (百度云盘)](https://pan.baidu.com/s/1P4fRgk9gaSutZnn8YW034Q?pwd=sadt) (Password: `sadt`) +
Model Details @@ -174,28 +193,30 @@ The final folder will be shown as:
-## 🔮 3. Quick Start ([Best Practice](docs/best_practice.md)). +## 3. Quick Start -### WebUI Demos: +Please read our document on [best practices and configuration tips](docs/best_practice.md) -**Online**: [Huggingface](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) +### WebUI Demos -**Local Autiomatic1111 stable-diffusion webui extension**: please refer to [Autiomatic1111 stable-diffusion webui docs](docs/webui_extension.md). +**Online Demo**: [HuggingFace](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) -**Local gradio demo(highly recommanded!)**: Similar to our [hugging-face demo](https://huggingface.co/spaces/vinthony/SadTalker) can be run by: +**Local WebUI extension**: Please refer to [WebUI docs](docs/webui_extension.md). + +**Local gradio demo (recommanded)**: A Gradio instance similar to our [Hugging Face demo](https://huggingface.co/spaces/vinthony/SadTalker) can be run locally: ```bash ## you need manually install TTS(https://github.com/coqui-ai/TTS) via `pip install tts` in advanced. -python app.py +python app_sadtalker.py ``` -**Local gradio demo(highly recommanded!)**: +You can also start it more easily: - windows: just double click `webui.bat`, the requirements will be installed automatically. - Linux/Mac OS: run `bash webui.sh` to start the webui. -### Manually usages: +### CLI usage ##### Animating a portrait image from default config: ```bash @@ -220,7 +241,7 @@ python inference.py --driven_audio \ More examples and configuration and tips can be founded in the [ >>> best practice documents <<<](docs/best_practice.md). -## 🛎 Citation +## Citation If you find our work useful in your research, please consider citing: @@ -233,23 +254,21 @@ If you find our work useful in your research, please consider citing: } ``` +## Acknowledgements +Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In training process, we also used the model from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2lip](https://github.com/Rudrabha/Wav2Lip). We thank for their wonderful work. -## 💗 Acknowledgements - -Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In training process, We also use the model from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2lip](https://github.com/Rudrabha/Wav2Lip). We thank for their wonderful work. 
- -See also these wonderful 3rd libraries we use: +We also use the following 3rd-party libraries: - **Face Utils**: https://github.com/xinntao/facexlib - **Face Enhancement**: https://github.com/TencentARC/GFPGAN - **Image/Video Enhancement**:https://github.com/xinntao/Real-ESRGAN -## 🥂 Extensions: +## Extensions: - [SadTalker-Video-Lip-Sync](https://github.com/Zz-ww/SadTalker-Video-Lip-Sync) from [@Zz-ww](https://github.com/Zz-ww): SadTalker for Video Lip Editing -## 🥂 Related Works +## Related Works - [StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN (ECCV 2022)](https://github.com/FeiiYin/StyleHEAT) - [CodeTalker: Speech-Driven 3D Facial Animation with Discrete Motion Prior (CVPR 2023)](https://github.com/Doubiiu/CodeTalker) - [VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild (SIGGRAPH Asia 2022)](https://github.com/vinthony/video-retalking) @@ -257,12 +276,23 @@ See also these wonderful 3rd libraries we use: - [3D GAN Inversion with Facial Symmetry Prior (CVPR 2023)](https://github.com/FeiiYin/SPI/) - [T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations (CVPR 2023)](https://github.com/Mael-zys/T2M-GPT) -## 📢 Disclaimer +## Disclaimer -This is not an official product of Tencent. This repository can only be used for personal/research/non-commercial purposes. +This is not an official product of Tencent. -LOGO: color and font suggestion: [ChatGPT](ai.com), logo font:[Montserrat Alternates +``` +1. Please carefully read and comply with the open-source license applicable to this code before using it. +2. Please carefully read and comply with the intellectual property declaration applicable to this code before using it. +3. This open-source code runs completely offline and does not collect any personal information or other data. If you use this code to provide services to end-users and collect related data, please take necessary compliance measures according to applicable laws and regulations (such as publishing privacy policies, adopting necessary data security strategies, etc.). If the collected data involves personal information, user consent must be obtained (if applicable). Any legal liabilities arising from this are unrelated to Tencent. +4. Without Tencent's written permission, you are not authorized to use the names or logos legally owned by Tencent, such as "Tencent." Otherwise, you may be liable for legal responsibilities. +5. This open-source code does not have the ability to directly provide services to end-users. If you need to use this code for further model training or demos, as part of your product to provide services to end-users, or for similar use, please comply with applicable laws and regulations for your product or service. Any legal liabilities arising from this are unrelated to Tencent. +6. It is prohibited to use this open-source code for activities that harm the legitimate rights and interests of others (including but not limited to fraud, deception, infringement of others' portrait rights, reputation rights, etc.), or other behaviors that violate applicable laws and regulations or go against social ethics and good customs (including providing incorrect or false information, spreading pornographic, terrorist, and violent information, etc.). Otherwise, you may be liable for legal responsibilities. 
+``` + +LOGO: color and font suggestion: [ChatGPT](https://chat.openai.com), logo font: [Montserrat Alternates ](https://fonts.google.com/specimen/Montserrat+Alternates?preview.text=SadTalker&preview.text_type=custom&query=mont). -All the copyright of the demo images and audio are from communities users or the geneartion from stable diffusion. Free free to contact us if you feel uncomfortable. +All the copyrights of the demo images and audio are from community users or the generation from stable diffusion. Feel free to contact us if you would like use to remove them. + + diff --git a/app.py b/app.py deleted file mode 100644 index 11be0b48..00000000 --- a/app.py +++ /dev/null @@ -1,133 +0,0 @@ -import os, sys -import gradio as gr -from src.gradio_demo import SadTalker - - -try: - import webui # in webui - in_webui = True -except: - in_webui = False - - -def toggle_audio_file(choice): - if choice == False: - return gr.update(visible=True), gr.update(visible=False) - else: - return gr.update(visible=False), gr.update(visible=True) - -def ref_video_fn(path_of_ref_video): - if path_of_ref_video is not None: - return gr.update(value=True) - else: - return gr.update(value=False) - - -def sadtalker_demo(checkpoint_path='checkpoints', config_path='src/config', warpfn=None): - - sad_talker = SadTalker(checkpoint_path, config_path, lazy_load=True) - - with gr.Blocks(analytics_enabled=False) as sadtalker_interface: - - gr.Markdown("

😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023)

\ - Arxiv       \ - Homepage       \ - Github
") - - with gr.Row().style(equal_height=False): - with gr.Column(variant='panel'): - with gr.Tabs(elem_id="sadtalker_source_image"): - with gr.TabItem('Source image'): - with gr.Row(): - source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512) - - - with gr.Tabs(elem_id="sadtalker_driven_audio"): - with gr.TabItem('Driving Methods'): - gr.Markdown("Possible driving combinations:
1. Audio only 2. Audio/IDLE Mode + Ref Video(pose, blink, pose+blink) 3. IDLE Mode only 4. Ref Video only (all) ") - - with gr.Row(): - driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath") - driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False) - - with gr.Column(): - use_idle_mode = gr.Checkbox(label="Use Idle Animation") - length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.") - use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo - - if sys.platform != 'win32' and not in_webui: - with gr.Accordion('Generate Audio From TTS', open=False): - from src.utils.text2speech import TTSTalker - tts_talker = TTSTalker() - with gr.Column(variant='panel'): - input_text = gr.Textbox(label="Generating audio from text", lines=5, placeholder="please enter some text here, we genreate the audio from text using @Coqui.ai TTS.") - tts = gr.Button('Generate audio',elem_id="sadtalker_audio_generate", variant='primary') - tts.click(fn=tts_talker.test, inputs=[input_text], outputs=[driven_audio]) - - with gr.Row(): - ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref").style(width=512) - - with gr.Column(): - use_ref_video = gr.Checkbox(label="Use Reference Video") - ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))") - - ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo - - - with gr.Column(variant='panel'): - with gr.Tabs(elem_id="sadtalker_checkbox"): - with gr.TabItem('Settings'): - gr.Markdown("need help? 
please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials") - with gr.Column(variant='panel'): - # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width - # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width - with gr.Row(): - pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) # - exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) # - blink_every = gr.Checkbox(label="use eye blink", value=True) - - with gr.Row(): - size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") # - preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?") - - with gr.Row(): - is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)") - batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1) - enhancer = gr.Checkbox(label="GFPGAN as Face enhancer") - - submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') - - with gr.Tabs(elem_id="sadtalker_genearted"): - gen_video = gr.Video(label="Generated video", format="mp4").style(width=256) - - submit.click( - fn=sad_talker.test, - inputs=[source_image, - driven_audio, - preprocess_type, - is_still_mode, - enhancer, - batch_size, - size_of_image, - pose_style, - exp_weight, - use_ref_video, - ref_video, - ref_info, - use_idle_mode, - length_of_audio, - blink_every - ], - outputs=[gen_video] - ) - - return sadtalker_interface - - -if __name__ == "__main__": - - demo = sadtalker_demo() - demo.queue() - demo.launch() - - diff --git a/app_sadtalker.py b/app_sadtalker.py new file mode 100644 index 00000000..1401a600 --- /dev/null +++ b/app_sadtalker.py @@ -0,0 +1,111 @@ +import os, sys +import gradio as gr +from src.gradio_demo import SadTalker + + +try: + import webui # in webui + in_webui = True +except: + in_webui = False + + +def toggle_audio_file(choice): + if choice == False: + return gr.update(visible=True), gr.update(visible=False) + else: + return gr.update(visible=False), gr.update(visible=True) + +def ref_video_fn(path_of_ref_video): + if path_of_ref_video is not None: + return gr.update(value=True) + else: + return gr.update(value=False) + +def sadtalker_demo(checkpoint_path='checkpoints', config_path='src/config', warpfn=None): + + sad_talker = SadTalker(checkpoint_path, config_path, lazy_load=True) + + with gr.Blocks(analytics_enabled=False) as sadtalker_interface: + gr.Markdown("

😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023)

\ +
Arxiv       \ + Homepage       \ + Github
") + + with gr.Row().style(equal_height=False): + with gr.Column(variant='panel'): + with gr.Tabs(elem_id="sadtalker_source_image"): + with gr.TabItem('Upload image'): + with gr.Row(): + source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512) + + with gr.Tabs(elem_id="sadtalker_driven_audio"): + with gr.TabItem('Upload OR TTS'): + with gr.Column(variant='panel'): + driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath") + + if sys.platform != 'win32' and not in_webui: + from src.utils.text2speech import TTSTalker + tts_talker = TTSTalker() + with gr.Column(variant='panel'): + input_text = gr.Textbox(label="Generating audio from text", lines=5, placeholder="please enter some text here, we genreate the audio from text using @Coqui.ai TTS.") + tts = gr.Button('Generate audio',elem_id="sadtalker_audio_generate", variant='primary') + tts.click(fn=tts_talker.test, inputs=[input_text], outputs=[driven_audio]) + + with gr.Column(variant='panel'): + with gr.Tabs(elem_id="sadtalker_checkbox"): + with gr.TabItem('Settings'): + gr.Markdown("need help? please visit our [best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md) for more detials") + with gr.Column(variant='panel'): + # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width + # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width + pose_style = gr.Slider(minimum=0, maximum=46, step=1, label="Pose style", value=0) # + size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") # + preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?") + is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)") + batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=2) + enhancer = gr.Checkbox(label="GFPGAN as Face enhancer") + submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') + + with gr.Tabs(elem_id="sadtalker_genearted"): + gen_video = gr.Video(label="Generated video", format="mp4").style(width=256) + + if warpfn: + submit.click( + fn=warpfn(sad_talker.test), + inputs=[source_image, + driven_audio, + preprocess_type, + is_still_mode, + enhancer, + batch_size, + size_of_image, + pose_style + ], + outputs=[gen_video] + ) + else: + submit.click( + fn=sad_talker.test, + inputs=[source_image, + driven_audio, + preprocess_type, + is_still_mode, + enhancer, + batch_size, + size_of_image, + pose_style + ], + outputs=[gen_video] + ) + + return sadtalker_interface + + +if __name__ == "__main__": + + demo = sadtalker_demo() + demo.queue() + demo.launch() + + diff --git a/docs/best_practice.md b/docs/best_practice.md index 3a738b21..76700506 100644 --- a/docs/best_practice.md +++ b/docs/best_practice.md @@ -1,8 +1,8 @@ -# Best Practice and Tips for configuration +# Best Practices and Tips for configuration -> Our model only works on REAL person's photo or the portrait image similar to REAL person. The anime talking head genreation method will be released in future. +> Our model only works on REAL people or the portrait image similar to REAL person. The anime talking head genreation method will be released in future. 
-Advanced confiurations for `inference.py`: +Advanced confiuration options for `inference.py`: | Name | Configuration | default | Explaination | |:------------- |:------------- |:----- | :------------- | @@ -20,18 +20,18 @@ Advanced confiurations for `inference.py`: ### About `--preprocess` -Our method automatically handle the input images via `crop`, `resize` and `full`. +Our system automatically handles the input images via `crop`, `resize` and `full`. - In `crop` mode, we only generate the croped image via the facial keypoints and generated the facial anime avator. The animation of both expression and head pose are realistic. +In `crop` mode, we only generate the croped image via the facial keypoints and generated the facial anime avator. The animation of both expression and head pose are realistic. -> still mode will stop the eyeblink and head pose movement. +> Still mode will stop the eyeblink and head pose movement. | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) | crop | crop w/still | |:--------------------: |:--------------------: | :----: | | | ![full_body_2](example_crop.gif) | ![full_body_2](example_crop_still.gif) | - In `resize` mode, we resize the whole images to generate the fully talking head video. Thus, an image similar to the ID photo can be produced. ⚠️ It will produce bad results for full person images. +In `resize` mode, we resize the whole images to generate the fully talking head video. Thus, an image similar to the ID photo can be produced. ⚠️ It will produce bad results for full person images. @@ -50,7 +50,7 @@ In `full` mode, our model will automatically process the croped region and paste ### About `--enhancer` -For better facial quality, we intergate [gfpgan](https://github.com/TencentARC/GFPGAN) and [real-esrgan](https://github.com/xinntao/Real-ESRGAN) for different purpose. Just adding `--enhancer ` or `--background_enhancer ` for the enhancement of the face and the full image. +For higher resolution, we intergate [gfpgan](https://github.com/TencentARC/GFPGAN) and [real-esrgan](https://github.com/xinntao/Real-ESRGAN) for different purpose. Just adding `--enhancer ` or `--background_enhancer ` for the enhancement of the face and the full image. ```bash # make sure above packages are available: @@ -70,7 +70,7 @@ This flag indicate that we can generated the 3d-rendered face and it's 3d facial -#### reference eye-link mode. +#### Reference eye-link mode. | Input, w/ reference video , reference video | |:-------------: | diff --git a/docs/face3d.md b/docs/face3d.md index b5a3f00b..394f4fef 100644 --- a/docs/face3d.md +++ b/docs/face3d.md @@ -1,11 +1,11 @@ -## 3D Face visualization +## 3D Face Visualization -We use pytorch3d to visualize the produced 3d face from a single image. +We use `pytorch3d` to visualize the 3D faces from a single image. 
-Since it is not easy to install, we produce a new install guidence here: +The requirements for 3D visualization are difficult to install, so here's a tutorial: ```bash -git clone https://github.com/Winfredy/SadTalker.git +git clone https://github.com/OpenTalker/SadTalker.git cd SadTalker conda create -n sadtalker3d python=3.8 source activate sadtalker3d @@ -28,10 +28,9 @@ pip install git+https://github.com/TencentARC/GFPGAN ### when occurs gcc version problem `from pytorch import _C` from pytorch3d, add the anaconda path to LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/$YOUR_ANACONDA_PATH/lib/ -``` +``` - -Then, generating the result via: +Then, generate the result via: ```bash @@ -43,6 +42,6 @@ python inference.py --driven_audio \ ``` -Then, the results will be given in the folders with the file name of `face3d.mp4`. +The results will appear, named `face3d.mp4`. -More applications about 3d face will be released. \ No newline at end of file +More applications about 3D face rendering will be released soon. diff --git a/docs/install.md b/docs/install.md index 31d1f918..807c8eb9 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,41 +1,33 @@ +### macOS -### Mac (Tested on M1 Mac OS 13.3) - -``` -git clone https://github.com/Winfredy/SadTalker.git +This method has been tested on a M1 Mac (13.3) +```bash +git clone https://github.com/OpenTalker/SadTalker.git cd SadTalker - conda create -n sadtalker python=3.8 - conda activate sadtalker - # install pytorch 2.0 pip install torch torchvision torchaudio - conda install ffmpeg - pip install -r requirements.txt - -pip install dlib # mac need to install the original dlib. - +pip install dlib # macOS needs to install the original dlib. ``` - - ### Windows Native -- Make sure you have `ffmpeg` in the `%PATH%` as suggested in [#54](https://github.com/Winfredy/SadTalker/issues/54), following [this](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) installation to install `ffmpeg`. +- Make sure you have `ffmpeg` in the `%PATH%` as suggested in [#54](https://github.com/Winfredy/SadTalker/issues/54), following [this](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) tutorial to install `ffmpeg` or using scoop. ### Windows WSL -- Make sure the environment: `export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH` -### Docker installnation +- Make sure the environment: `export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH` + -A dockerfile are also provided by [@thegenerativegeneration](https://github.com/thegenerativegeneration) in [docker hub](https://hub.docker.com/repository/docker/wawa9000/sadtalker), which can be used directly as: +### Docker Installation +A community Docker image by [@thegenerativegeneration](https://github.com/thegenerativegeneration) is available on the [Docker hub](https://hub.docker.com/repository/docker/wawa9000/sadtalker), which can be used directly: ```bash docker run --gpus "all" --rm -v $(pwd):/host_dir wawa9000/sadtalker \ --driven_audio /host_dir/deyu.wav \ diff --git a/docs/webui_extension.md b/docs/webui_extension.md index 6e272e2d..5e64469f 100644 --- a/docs/webui_extension.md +++ b/docs/webui_extension.md @@ -1,7 +1,6 @@ - ## Run SadTalker as a Stable Diffusion WebUI Extension. -1. Installing the lastest version of [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) and install the sadtalker via `extension`. +1. 
Install the lastest version of [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) and install SadTalker via `extension`. image 2. Download the checkpoints manually, for Linux and Mac: @@ -10,30 +9,30 @@ cd SOMEWHERE_YOU_LIKE - bash <(wget -qO- https://raw.githubusercontent.com/Winfredy/SadTalker/main/scripts/download_models.sh) + bash <(wget -qO- https://raw.githubusercontent.com/Winfredy/OpenTalker/main/scripts/download_models.sh) ``` - For windows, you can download all the checkpoints from [google drive](https://drive.google.com/drive/folders/1Wd88VDoLhVzYsQ30_qDVluQr_Xm46yHT?usp=sharing) or [百度云盘](https://pan.baidu.com/s/1nXuVNd0exUl37ISwWqbFGA?pwd=sadt) 提取码: sadt. + For Windows, you can download all the checkpoints [here](https://github.com/OpenTalker/SadTalker/tree/main#2-download-models). -3.1. options 1: put the checkpoint in `stable-diffusion-webui/models/SadTalker` or `stable-diffusion-webui/extensions/SadTalker/checkpoints/`, the checkpoints will be detected automatically. +3.1. Option 1: put the checkpoint in `stable-diffusion-webui/models/SadTalker` or `stable-diffusion-webui/extensions/SadTalker/checkpoints/`, the checkpoints will be detected automatically. -3.2. Options 2: Set the path of `SADTALKTER_CHECKPOINTS` in `webui_user.sh`(linux) or `webui_user.bat`(windows) by: +3.2. Option 2: Set the path of `SADTALKTER_CHECKPOINTS` in `webui_user.sh`(linux) or `webui_user.bat`(windows) by: > only works if you are directly starting webui from `webui_user.sh` or `webui_user.bat`. ```bash - # windows (webui_user.bat) + # Windows (webui_user.bat) set SADTALKER_CHECKPOINTS=D:\SadTalker\checkpoints - # linux (webui_user.sh) + # Linux/macOS (webui_user.sh) export SADTALKER_CHECKPOINTS=/path/to/SadTalker/checkpoints ``` -4. Then, starting the webui via `webui.sh or webui_user.sh(linux)` or `webui_user.bat(windows)` or any other methods, the SadTalker can be used in stable-diffusion-webui directly. +4. Start the WebUI via `webui.sh or webui_user.sh(linux)` or `webui_user.bat(windows)` or any other method. SadTalker can also be used in stable-diffusion-webui directly. image -## Questsions +## Questions 1. if you are running on CPU, you need to specific `--disable-safe-unpickle` in `webui_user.sh` or `webui_user.bat`. @@ -47,4 +46,4 @@ -(Some [important discussion](https://github.com/Winfredy/SadTalker/issues/78) if you are unable to use `full` mode). +(If you're unable to use the `full` mode, please read this [discussion](https://github.com/Winfredy/SadTalker/issues/78).) 
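Note: the `inference.py` changes below import `record_process_log` and `get_file_size` from `src.utils.process_log`, a module that is not included in this diff. A minimal sketch of what such a helper might look like, assuming it only needs to log one labelled measurement (elapsed seconds or a file size) per pipeline step; every name and behavior here is an assumption, not the actual module:

```python
# Hypothetical sketch of src/utils/process_log.py -- the real module is not part of this diff.
# inference.py calls it as record_process_log(scope, step, value, note=None) and get_file_size(path).
import logging
import os

logger = logging.getLogger("sadtalker.process_log")


def record_process_log(scope, step, value, note=None):
    """Log one measurement (elapsed seconds or a size) for a named pipeline step."""
    suffix = f" ({note})" if note else ""
    logger.info("[%s] %s: %s%s", scope, step, value, suffix)


def get_file_size(path):
    """Return the file size in megabytes, or -1 if the file does not exist."""
    if not os.path.isfile(path):
        return -1.0
    return os.path.getsize(path) / (1024 * 1024)
```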
diff --git a/inference.py b/inference.py index a0b00790..345ed5a5 100644 --- a/inference.py +++ b/inference.py @@ -1,20 +1,21 @@ from glob import glob import shutil import torch -from time import strftime +from time import strftime import os, sys, time from argparse import ArgumentParser from src.utils.preprocess import CropAndExtract -from src.test_audio2coeff import Audio2Coeff +from src.test_audio2coeff import Audio2Coeff from src.facerender.animate import AnimateFromCoeff from src.generate_batch import get_data from src.generate_facerender_batch import get_facerender_data from src.utils.init_path import init_path +from src.utils.process_log import record_process_log, get_file_size -def main(args): - #torch.backends.cudnn.enabled = False +def main(args): + # torch.backends.cudnn.enabled = False pic_path = args.source_image audio_path = args.driven_audio save_dir = os.path.join(args.result_dir, strftime("%Y_%m_%d_%H.%M.%S")) @@ -29,22 +30,40 @@ def main(args): ref_pose = args.ref_pose current_root_path = os.path.split(sys.argv[0])[0] + t = time.time() + sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, + args.old_version, args.preprocess) + record_process_log("main", "init_path", time.time() - t) - sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess) - - #init model + # init model + t = time.time() preprocess_model = CropAndExtract(sadtalker_paths, device) + # 初始化 CropAndExtract, V100 消耗时间:3.572038889 + # TODO(qingyuan): this is loading check points, try to just load once + record_process_log("main", "CropAndExtract", time.time() - t) + + t = time.time() + audio_to_coeff = Audio2Coeff(sadtalker_paths, device) + # 初始化 Audio2Coeff, V100 消耗时间:0.297198772 + record_process_log("main", "Audio2Coeff", time.time() - t) - audio_to_coeff = Audio2Coeff(sadtalker_paths, device) - + t = time.time() animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device) + # 初始化 AnimateFromCoeff, V100 消耗时间:2.004800558 + # TODO(qingyuan): this is loading check points and models, try to just load once + record_process_log("main", "AnimateFromCoeff", time.time() - t) - #crop image and extract 3dmm from image + # crop image and extract 3dmm from image first_frame_dir = os.path.join(save_dir, 'first_frame_dir') os.makedirs(first_frame_dir, exist_ok=True) print('3DMM Extraction for source image') - first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\ - source_image_flag=True, pic_size=args.size) + t = time.time() + first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate( + pic_path, first_frame_dir, args.preprocess, + source_image_flag=True, pic_size=args.size) + # call preprocess_model.generate,消耗时间:1.483717203 + # TODO(qingyuan): add logging and check where the time is spent + record_process_log("main", "preprocess_model.generate", time.time() - t, "first_coeff_path") if first_coeff_path is None: print("Can't get the coeffs of the input") return @@ -54,76 +73,102 @@ def main(args): ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname) os.makedirs(ref_eyeblink_frame_dir, exist_ok=True) print('3DMM Extraction for the reference video providing eye blinking') - ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False) + t = time.time() + ref_eyeblink_coeff_path, _, _ = 
preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, + source_image_flag=False) + record_process_log("main", "preprocess_model.generate", time.time() - t, "ref_eyeblink") else: - ref_eyeblink_coeff_path=None + ref_eyeblink_coeff_path = None if ref_pose is not None: - if ref_pose == ref_eyeblink: + if ref_pose == ref_eyeblink: ref_pose_coeff_path = ref_eyeblink_coeff_path else: ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0] ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname) os.makedirs(ref_pose_frame_dir, exist_ok=True) print('3DMM Extraction for the reference video providing pose') - ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False) + t = time.time() + ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, + source_image_flag=False) + record_process_log("main", "preprocess_model.generate", time.time() - t, "ref_pose_coeff_path") else: - ref_pose_coeff_path=None + ref_pose_coeff_path = None - #audio2ceoff + t = time.time() + # audio2ceoff batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still) + # call get_data, 消耗时间:0.853573322 + record_process_log("main", "get_data", time.time() - t) coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path) + t = time.time() + record_process_log("main", "audio_to_coeff.generate", time.time() - t) # 3dface render if args.face3dvis: from src.face3d.visualize import gen_composed_video + t = time.time() gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4')) - - #coeff2video - data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, - batch_size, input_yaw_list, input_pitch_list, input_roll_list, - expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size) - + record_process_log("main", "gen_composed_video", time.time() - t) + + t = time.time() + # coeff2video + data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, + batch_size, input_yaw_list, input_pitch_list, input_roll_list, + expression_scale=args.expression_scale, still_mode=args.still, + preprocess=args.preprocess, size=args.size) + record_process_log("main", "get_facerender_data", time.time() - t) + + t = time.time() result = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \ - enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size) - - shutil.move(result, save_dir+'.mp4') - print('The generated video is named:', save_dir+'.mp4') + enhancer=args.enhancer, background_enhancer=args.background_enhancer, + preprocess=args.preprocess, img_size=args.size) + # AnimateFromCoeff generate 消耗的总时间:79.29866314 + record_process_log("main", "animate_from_coeff.generate", time.time() - t) + + shutil.move(result, save_dir + '.mp4') + print('The generated video is named:', save_dir + '.mp4') + + record_process_log("main", "result mp4 file", get_file_size(f"{save_dir}.mp4")) if not args.verbose: shutil.rmtree(save_dir) - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', help="path to driven audio") - parser.add_argument("--source_image", default='./examples/source_image/full_body_1.png', help="path to source image") +if __name__ == 
'__main__': + t = time.time() + parser = ArgumentParser() + parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', + help="path to driven audio") + parser.add_argument("--source_image", default='./examples/source_image/full_body_1.png', + help="path to source image") parser.add_argument("--ref_eyeblink", default=None, help="path to reference video providing eye blinking") parser.add_argument("--ref_pose", default=None, help="path to reference video providing pose") parser.add_argument("--checkpoint_dir", default='./checkpoints', help="path to output") parser.add_argument("--result_dir", default='./results', help="path to output") - parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)") - parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender") - parser.add_argument("--size", type=int, default=256, help="the image size of the facerender") - parser.add_argument("--expression_scale", type=float, default=1., help="the batch size of facerender") + parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)") + parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender") + parser.add_argument("--size", type=int, default=256, help="the image size of the facerender") + parser.add_argument("--expression_scale", type=float, default=1., help="the batch size of facerender") parser.add_argument('--input_yaw', nargs='+', type=int, default=None, help="the input yaw degree of the user ") parser.add_argument('--input_pitch', nargs='+', type=int, default=None, help="the input pitch degree of the user") parser.add_argument('--input_roll', nargs='+', type=int, default=None, help="the input roll degree of the user") - parser.add_argument('--enhancer', type=str, default=None, help="Face enhancer, [gfpgan, RestoreFormer]") - parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]") - parser.add_argument("--cpu", dest="cpu", action="store_true") - parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks") - parser.add_argument("--still", action="store_true", help="can crop back to the original videos for the full body aniamtion") - parser.add_argument("--preprocess", default='crop', choices=['crop', 'extcrop', 'resize', 'full', 'extfull'], help="how to preprocess the images" ) - parser.add_argument("--verbose",action="store_true", help="saving the intermedia output or not" ) - parser.add_argument("--old_version",action="store_true", help="use the pth other than safetensor version" ) - + parser.add_argument('--enhancer', type=str, default=None, help="Face enhancer, [gfpgan, RestoreFormer]") + parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]") + parser.add_argument("--cpu", dest="cpu", action="store_true") + parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks") + parser.add_argument("--still", action="store_true", + help="can crop back to the original videos for the full body aniamtion") + parser.add_argument("--preprocess", default='crop', choices=['crop', 'extcrop', 'resize', 'full', 'extfull'], + help="how to preprocess the images") + parser.add_argument("--verbose", action="store_true", help="saving the intermedia output or not") + parser.add_argument("--old_version", action="store_true", help="use the pth other than safetensor version") # net 
structure and parameters - parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='useless') + parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], + help='useless') parser.add_argument('--init_path', type=str, default=None, help='Useless') - parser.add_argument('--use_last_fc',default=False, help='zero initialize the last fc') + parser.add_argument('--use_last_fc', default=False, help='zero initialize the last fc') parser.add_argument('--bfm_folder', type=str, default='./checkpoints/BFM_Fitting/') parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model') @@ -135,11 +180,11 @@ def main(args): parser.add_argument('--z_far', type=float, default=15.) args = parser.parse_args() - + record_process_log(__name__, "parse_args", time.time() - t) if torch.cuda.is_available() and not args.cpu: args.device = "cuda" else: args.device = "cpu" main(args) - + record_process_log(__name__, "Total Time", time.time() - t) diff --git a/launcher.py b/launcher.py index ae1be9d4..17ce9f1a 100644 --- a/launcher.py +++ b/launcher.py @@ -194,7 +194,7 @@ def prepare_environment(): def start(): print(f"Launching SadTalker Web UI") - from app import sadtalker_demo + from app_sadtalker import sadtalker_demo demo = sadtalker_demo() demo.queue() demo.launch() diff --git a/quick_demo.ipynb b/quick_demo.ipynb index 8b9767d0..8606cfc8 100644 --- a/quick_demo.ipynb +++ b/quick_demo.ipynb @@ -60,21 +60,29 @@ }, "outputs": [], "source": [ - "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.8 2 \n", - "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.9 1 \n", - "!python --version \n", + "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.8 2\n", + "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.9 1\n", + "!sudo apt install python3.8\n", + "\n", + "!sudo apt-get install python3.8-distutils\n", + "\n", + "!python --version\n", + "\n", "!apt-get update\n", + "\n", "!apt install software-properties-common\n", + "\n", "!sudo dpkg --remove --force-remove-reinstreq python3-pip python3-setuptools python3-wheel\n", + "\n", "!apt-get install python3-pip\n", "\n", "print('Git clone project and install requirements...')\n", "!git clone https://github.com/Winfredy/SadTalker &> /dev/null\n", - "%cd SadTalker \n", - "!export PYTHONPATH=/content/SadTalker:$PYTHONPATH \n", + "%cd SadTalker\n", + "!export PYTHONPATH=/content/SadTalker:$PYTHONPATH\n", "!python3.8 -m pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113\n", "!apt update\n", - "!apt install ffmpeg &> /dev/null \n", + "!apt install ffmpeg &> /dev/null\n", "!python3.8 -m pip install -r requirements.txt" ] }, diff --git a/scripts/extension.py b/scripts/extension.py index cd3d1a21..e99fefee 100644 --- a/scripts/extension.py +++ b/scripts/extension.py @@ -169,7 +169,7 @@ def on_ui_tabs(): result_dir = opts.sadtalker_result_dir os.makedirs(result_dir, exist_ok=True) - from app import sadtalker_demo + from app_sadtalker import sadtalker_demo if os.getenv('SADTALKER_CHECKPOINTS'): checkpoint_path = os.getenv('SADTALKER_CHECKPOINTS') diff --git a/src/audio2pose_models/audio2pose.py b/src/audio2pose_models/audio2pose.py index 1a8410d6..2b8cd142 100644 --- a/src/audio2pose_models/audio2pose.py +++ b/src/audio2pose_models/audio2pose.py @@ -25,8 
+25,8 @@ def forward(self, x): batch = {} coeff_gt = x['gt'].cuda().squeeze(0) #bs frame_len+1 73 - batch['pose_motion_gt'] = coeff_gt[:, 1:, -9:-3] - coeff_gt[:, :1, -9:-3] #bs frame_len 6 - batch['ref'] = coeff_gt[:, 0, -9:-3] #bs 6 + batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70] #bs frame_len 6 + batch['ref'] = coeff_gt[:, 0, 64:70] #bs 6 batch['class'] = x['class'].squeeze(0).cuda() # bs indiv_mels= x['indiv_mels'].cuda().squeeze(0) # bs seq_len+1 80 16 @@ -37,8 +37,8 @@ def forward(self, x): batch = self.netG(batch) pose_motion_pred = batch['pose_motion_pred'] # bs frame_len 6 - pose_gt = coeff_gt[:, 1:, -9:-3].clone() # bs frame_len 6 - pose_pred = coeff_gt[:, :1, -9:-3] + pose_motion_pred # bs frame_len 6 + pose_gt = coeff_gt[:, 1:, 64:70].clone() # bs frame_len 6 + pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred # bs frame_len 6 batch['pose_pred'] = pose_pred batch['pose_gt'] = pose_gt diff --git a/src/face3d/extract_kp_videos_safe.py b/src/face3d/extract_kp_videos_safe.py index 262439b9..5141ba3a 100644 --- a/src/face3d/extract_kp_videos_safe.py +++ b/src/face3d/extract_kp_videos_safe.py @@ -8,10 +8,28 @@ import torch from tqdm import tqdm from itertools import cycle -from facexlib.alignment import init_alignment_model, landmark_98_to_68 -from facexlib.detection import init_detection_model from torch.multiprocessing import Pool, Process, set_start_method +from facexlib.alignment import landmark_98_to_68 +from facexlib.detection import init_detection_model + +from facexlib.utils import load_file_from_url +from src.face3d.util.my_awing_arch import FAN + +def init_alignment_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'awing_fan': + model = FAN(num_modules=4, num_landmarks=98, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/alignment_WFLW_4HG.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + model.load_state_dict(torch.load(model_path, map_location=device)['state_dict'], strict=True) + model.eval() + model = model.to(device) + return model + class KeypointExtractor(): def __init__(self, device='cuda'): diff --git a/src/face3d/util/my_awing_arch.py b/src/face3d/util/my_awing_arch.py new file mode 100644 index 00000000..cd565617 --- /dev/null +++ b/src/face3d/util/my_awing_arch.py @@ -0,0 +1,378 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def calculate_points(heatmaps): + # change heatmaps to landmarks + B, N, H, W = heatmaps.shape + HW = H * W + BN_range = np.arange(B * N) + + heatline = heatmaps.reshape(B, N, HW) + indexes = np.argmax(heatline, axis=2) + + preds = np.stack((indexes % W, indexes // W), axis=2) + preds = preds.astype(np.float, copy=False) + + inr = indexes.ravel() + + heatline = heatline.reshape(B * N, HW) + x_up = heatline[BN_range, inr + 1] + x_down = heatline[BN_range, inr - 1] + # y_up = heatline[BN_range, inr + W] + + if any((inr + W) >= 4096): + y_up = heatline[BN_range, 4095] + else: + y_up = heatline[BN_range, inr + W] + if any((inr - W) <= 0): + y_down = heatline[BN_range, 0] + else: + y_down = heatline[BN_range, inr - W] + + think_diff = np.sign(np.stack((x_up - x_down, y_up - y_down), axis=1)) + think_diff *= .25 + + preds += think_diff.reshape(B, N, 2) + preds += .5 + return preds + + +class 
AddCoordsTh(nn.Module): + + def __init__(self, x_dim=64, y_dim=64, with_r=False, with_boundary=False): + super(AddCoordsTh, self).__init__() + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + self.with_boundary = with_boundary + + def forward(self, input_tensor, heatmap=None): + """ + input_tensor: (batch, c, x_dim, y_dim) + """ + batch_size_tensor = input_tensor.shape[0] + + xx_ones = torch.ones([1, self.y_dim], dtype=torch.int32, device=input_tensor.device) + xx_ones = xx_ones.unsqueeze(-1) + + xx_range = torch.arange(self.x_dim, dtype=torch.int32, device=input_tensor.device).unsqueeze(0) + xx_range = xx_range.unsqueeze(1) + + xx_channel = torch.matmul(xx_ones.float(), xx_range.float()) + xx_channel = xx_channel.unsqueeze(-1) + + yy_ones = torch.ones([1, self.x_dim], dtype=torch.int32, device=input_tensor.device) + yy_ones = yy_ones.unsqueeze(1) + + yy_range = torch.arange(self.y_dim, dtype=torch.int32, device=input_tensor.device).unsqueeze(0) + yy_range = yy_range.unsqueeze(-1) + + yy_channel = torch.matmul(yy_range.float(), yy_ones.float()) + yy_channel = yy_channel.unsqueeze(-1) + + xx_channel = xx_channel.permute(0, 3, 2, 1) + yy_channel = yy_channel.permute(0, 3, 2, 1) + + xx_channel = xx_channel / (self.x_dim - 1) + yy_channel = yy_channel / (self.y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size_tensor, 1, 1, 1) + yy_channel = yy_channel.repeat(batch_size_tensor, 1, 1, 1) + + if self.with_boundary and heatmap is not None: + boundary_channel = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0) + + zero_tensor = torch.zeros_like(xx_channel) + xx_boundary_channel = torch.where(boundary_channel > 0.05, xx_channel, zero_tensor) + yy_boundary_channel = torch.where(boundary_channel > 0.05, yy_channel, zero_tensor) + if self.with_boundary and heatmap is not None: + xx_boundary_channel = xx_boundary_channel.to(input_tensor.device) + yy_boundary_channel = yy_boundary_channel.to(input_tensor.device) + ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1) + + if self.with_r: + rr = torch.sqrt(torch.pow(xx_channel, 2) + torch.pow(yy_channel, 2)) + rr = rr / torch.max(rr) + ret = torch.cat([ret, rr], dim=1) + + if self.with_boundary and heatmap is not None: + ret = torch.cat([ret, xx_boundary_channel, yy_boundary_channel], dim=1) + return ret + + +class CoordConvTh(nn.Module): + """CoordConv layer as in the paper.""" + + def __init__(self, x_dim, y_dim, with_r, with_boundary, in_channels, first_one=False, *args, **kwargs): + super(CoordConvTh, self).__init__() + self.addcoords = AddCoordsTh(x_dim=x_dim, y_dim=y_dim, with_r=with_r, with_boundary=with_boundary) + in_channels += 2 + if with_r: + in_channels += 1 + if with_boundary and not first_one: + in_channels += 2 + self.conv = nn.Conv2d(in_channels=in_channels, *args, **kwargs) + + def forward(self, input_tensor, heatmap=None): + ret = self.addcoords(input_tensor, heatmap) + last_channel = ret[:, -2:, :, :] + ret = self.conv(ret) + return ret, last_channel + + +def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False, dilation=1): + '3x3 convolution with padding' + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=strd, padding=padding, bias=bias, dilation=dilation) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + # self.bn1 = nn.BatchNorm2d(planes) + self.relu = 
nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + # self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + + out = self.conv2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ConvBlock(nn.Module): + + def __init__(self, in_planes, out_planes): + super(ConvBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = conv3x3(in_planes, int(out_planes / 2)) + self.bn2 = nn.BatchNorm2d(int(out_planes / 2)) + self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4), padding=1, dilation=1) + self.bn3 = nn.BatchNorm2d(int(out_planes / 4)) + self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4), padding=1, dilation=1) + + if in_planes != out_planes: + self.downsample = nn.Sequential( + nn.BatchNorm2d(in_planes), + nn.ReLU(True), + nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, bias=False), + ) + else: + self.downsample = None + + def forward(self, x): + residual = x + + out1 = self.bn1(x) + out1 = F.relu(out1, True) + out1 = self.conv1(out1) + + out2 = self.bn2(out1) + out2 = F.relu(out2, True) + out2 = self.conv2(out2) + + out3 = self.bn3(out2) + out3 = F.relu(out3, True) + out3 = self.conv3(out3) + + out3 = torch.cat((out1, out2, out3), 1) + + if self.downsample is not None: + residual = self.downsample(residual) + + out3 += residual + + return out3 + + +class HourGlass(nn.Module): + + def __init__(self, num_modules, depth, num_features, first_one=False): + super(HourGlass, self).__init__() + self.num_modules = num_modules + self.depth = depth + self.features = num_features + self.coordconv = CoordConvTh( + x_dim=64, + y_dim=64, + with_r=True, + with_boundary=True, + in_channels=256, + first_one=first_one, + out_channels=256, + kernel_size=1, + stride=1, + padding=0) + self._generate_network(self.depth) + + def _generate_network(self, level): + self.add_module('b1_' + str(level), ConvBlock(256, 256)) + + self.add_module('b2_' + str(level), ConvBlock(256, 256)) + + if level > 1: + self._generate_network(level - 1) + else: + self.add_module('b2_plus_' + str(level), ConvBlock(256, 256)) + + self.add_module('b3_' + str(level), ConvBlock(256, 256)) + + def _forward(self, level, inp): + # Upper branch + up1 = inp + up1 = self._modules['b1_' + str(level)](up1) + + # Lower branch + low1 = F.avg_pool2d(inp, 2, stride=2) + low1 = self._modules['b2_' + str(level)](low1) + + if level > 1: + low2 = self._forward(level - 1, low1) + else: + low2 = low1 + low2 = self._modules['b2_plus_' + str(level)](low2) + + low3 = low2 + low3 = self._modules['b3_' + str(level)](low3) + + up2 = F.interpolate(low3, scale_factor=2, mode='nearest') + + return up1 + up2 + + def forward(self, x, heatmap): + x, last_channel = self.coordconv(x, heatmap) + return self._forward(self.depth, x), last_channel + + +class FAN(nn.Module): + + def __init__(self, num_modules=1, end_relu=False, gray_scale=False, num_landmarks=68, device='cuda'): + super(FAN, self).__init__() + self.device = device + self.num_modules = num_modules + self.gray_scale = gray_scale + self.end_relu = end_relu + self.num_landmarks = num_landmarks + + # Base part + if self.gray_scale: + self.conv1 = CoordConvTh( + x_dim=256, + y_dim=256, + with_r=True, + with_boundary=False, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + else: + self.conv1 = CoordConvTh( + x_dim=256, + 
y_dim=256, + with_r=True, + with_boundary=False, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.conv2 = ConvBlock(64, 128) + self.conv3 = ConvBlock(128, 128) + self.conv4 = ConvBlock(128, 256) + + # Stacking part + for hg_module in range(self.num_modules): + if hg_module == 0: + first_one = True + else: + first_one = False + self.add_module('m' + str(hg_module), HourGlass(1, 4, 256, first_one)) + self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256)) + self.add_module('conv_last' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) + self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256)) + self.add_module('l' + str(hg_module), nn.Conv2d(256, num_landmarks + 1, kernel_size=1, stride=1, padding=0)) + + if hg_module < self.num_modules - 1: + self.add_module('bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) + self.add_module('al' + str(hg_module), + nn.Conv2d(num_landmarks + 1, 256, kernel_size=1, stride=1, padding=0)) + + def forward(self, x): + x, _ = self.conv1(x) + x = F.relu(self.bn1(x), True) + # x = F.relu(self.bn1(self.conv1(x)), True) + x = F.avg_pool2d(self.conv2(x), 2, stride=2) + x = self.conv3(x) + x = self.conv4(x) + + previous = x + + outputs = [] + boundary_channels = [] + tmp_out = None + for i in range(self.num_modules): + hg, boundary_channel = self._modules['m' + str(i)](previous, tmp_out) + + ll = hg + ll = self._modules['top_m_' + str(i)](ll) + + ll = F.relu(self._modules['bn_end' + str(i)](self._modules['conv_last' + str(i)](ll)), True) + + # Predict heatmaps + tmp_out = self._modules['l' + str(i)](ll) + if self.end_relu: + tmp_out = F.relu(tmp_out) # HACK: Added relu + outputs.append(tmp_out) + boundary_channels.append(boundary_channel) + + if i < self.num_modules - 1: + ll = self._modules['bl' + str(i)](ll) + tmp_out_ = self._modules['al' + str(i)](tmp_out) + previous = previous + ll + tmp_out_ + + return outputs, boundary_channels + + def get_landmarks(self, img): + H, W, _ = img.shape + offset = W / 64, H / 64, 0, 0 + + img = cv2.resize(img, (256, 256)) + inp = img[..., ::-1] + inp = torch.from_numpy(np.ascontiguousarray(inp.transpose((2, 0, 1)))).float() + inp = inp.to(self.device) + inp.div_(255.0).unsqueeze_(0) + + outputs, _ = self.forward(inp) + out = outputs[-1][:, :-1, :, :] + heatmaps = out.detach().cpu().numpy() + + pred = calculate_points(heatmaps).reshape(-1, 2) + + pred *= offset[:2] + pred += offset[-2:] + + return pred diff --git a/src/face3d/visualize.py b/src/face3d/visualize.py index 23a11108..d4652946 100644 --- a/src/face3d/visualize.py +++ b/src/face3d/visualize.py @@ -6,43 +6,48 @@ import torch import subprocess, platform import scipy.io as scio -from tqdm import tqdm +from tqdm import tqdm +import time +from src.utils.process_log import record_process_log + # draft def gen_composed_video(args, device, first_frame_coeff, coeff_path, audio_path, save_path, exp_dim=64): - coeff_first = scio.loadmat(first_frame_coeff)['full_3dmm'] coeff_pred = scio.loadmat(coeff_path)['coeff_3dmm'] - coeff_full = np.repeat(coeff_first, coeff_pred.shape[0], axis=0) # 257 + coeff_full = np.repeat(coeff_first, coeff_pred.shape[0], axis=0) # 257 coeff_full[:, 80:144] = coeff_pred[:, 0:64] - coeff_full[:, 224:227] = coeff_pred[:, 64:67] # 3 dim translation - coeff_full[:, 254:] = coeff_pred[:, 67:] # 3 dim translation + coeff_full[:, 224:227] = coeff_pred[:, 64:67] # 3 dim translation + coeff_full[:, 254:] = coeff_pred[:, 67:] # 3 
dim translation tmp_video_path = '/tmp/face3dtmp.mp4' facemodel = FaceReconModel(args) - + t = time.time() video = cv2.VideoWriter(tmp_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 25, (224, 224)) - + record_process_log("gen_composed_video", "cv2.VideoWriter", time.time()-t) + t = time.time() for k in tqdm(range(coeff_pred.shape[0]), 'face3d rendering:'): - cur_coeff_full = torch.tensor(coeff_full[k:k+1], device=device) + cur_coeff_full = torch.tensor(coeff_full[k:k + 1], device=device) facemodel.forward(cur_coeff_full, device) - predicted_landmark = facemodel.pred_lm # TODO. + predicted_landmark = facemodel.pred_lm # TODO. predicted_landmark = predicted_landmark.cpu().numpy().squeeze() rendered_img = facemodel.pred_face - rendered_img = 255. * rendered_img.cpu().numpy().squeeze().transpose(1,2,0) + rendered_img = 255. * rendered_img.cpu().numpy().squeeze().transpose(1, 2, 0) out_img = rendered_img[:, :, :3].astype(np.uint8) - video.write(np.uint8(out_img[:,:,::-1])) + video.write(np.uint8(out_img[:, :, ::-1])) video.release() + record_process_log("gen_composed_video", "face3d rendering", time.time() - t) + t = time.time() command = 'ffmpeg -v quiet -y -i {} -i {} -strict -2 -q:v 1 {}'.format(audio_path, tmp_video_path, save_path) subprocess.call(command, shell=platform.system() != 'Windows') - + record_process_log("gen_composed_video", "ffmpeg", time.time() - t) diff --git a/src/facerender/animate.py b/src/facerender/animate.py index 781f5a33..2ada81cc 100644 --- a/src/facerender/animate.py +++ b/src/facerender/animate.py @@ -5,45 +5,56 @@ import warnings from skimage import img_as_ubyte import safetensors -import safetensors.torch -warnings.filterwarnings('ignore') +import safetensors.torch +warnings.filterwarnings('ignore') import imageio import torch import torchvision - from src.facerender.modules.keypoint_detector import HEEstimator, KPDetector from src.facerender.modules.mapping import MappingNet from src.facerender.modules.generator import OcclusionAwareGenerator, OcclusionAwareSPADEGenerator -from src.facerender.modules.make_animation import make_animation +from src.facerender.modules.make_animation import make_animation -from pydub import AudioSegment +from pydub import AudioSegment from src.utils.face_enhancer import enhancer_generator_with_len, enhancer_list from src.utils.paste_pic import paste_pic from src.utils.videoio import save_video_with_watermark +import time +from src.utils.process_log import record_process_log, get_file_size try: import webui # in webui + in_webui = True except: in_webui = False + class AnimateFromCoeff(): def __init__(self, sadtalker_path, device): with open(sadtalker_path['facerender_yaml']) as f: config = yaml.safe_load(f) - + t = time.time() generator = OcclusionAwareSPADEGenerator(**config['model_params']['generator_params'], - **config['model_params']['common_params']) + **config['model_params']['common_params']) + record_process_log(self.__class__.__name__, "OcclusionAwareSPADEGenerator", time.time()-t) + t = time.time() kp_extractor = KPDetector(**config['model_params']['kp_detector_params'], - **config['model_params']['common_params']) + **config['model_params']['common_params']) + record_process_log(self.__class__.__name__, "KPDetector", time.time() - t) + + t = time.time() he_estimator = HEEstimator(**config['model_params']['he_estimator_params'], - **config['model_params']['common_params']) + **config['model_params']['common_params']) + record_process_log(self.__class__.__name__, "HEEstimator", time.time() - t) + t = time.time() mapping = 
MappingNet(**config['model_params']['mapping_params']) + record_process_log(self.__class__.__name__, "HEEstimator", time.time() - t) generator.to(device) kp_extractor.to(device) @@ -52,24 +63,26 @@ def __init__(self, sadtalker_path, device): for param in generator.parameters(): param.requires_grad = False for param in kp_extractor.parameters(): - param.requires_grad = False + param.requires_grad = False for param in he_estimator.parameters(): param.requires_grad = False for param in mapping.parameters(): param.requires_grad = False if sadtalker_path is not None: - if 'checkpoint' in sadtalker_path: # use safe tensor - self.load_cpk_facevid2vid_safetensor(sadtalker_path['checkpoint'], kp_detector=kp_extractor, generator=generator, he_estimator=None) + if 'checkpoint' in sadtalker_path: # use safe tensor + self.load_cpk_facevid2vid_safetensor(sadtalker_path['checkpoint'], kp_detector=kp_extractor, + generator=generator, he_estimator=None) else: - self.load_cpk_facevid2vid(sadtalker_path['free_view_checkpoint'], kp_detector=kp_extractor, generator=generator, he_estimator=he_estimator) + self.load_cpk_facevid2vid(sadtalker_path['free_view_checkpoint'], kp_detector=kp_extractor, + generator=generator, he_estimator=he_estimator) else: raise AttributeError("Checkpoint should be specified for video head pose estimator.") - if sadtalker_path['mappingnet_checkpoint'] is not None: + if sadtalker_path['mappingnet_checkpoint'] is not None: self.load_cpk_mapping(sadtalker_path['mappingnet_checkpoint'], mapping=mapping) else: - raise AttributeError("Checkpoint should be specified for video head pose estimator.") + raise AttributeError("Checkpoint should be specified for video head pose estimator.") self.kp_extractor = kp_extractor self.generator = generator @@ -80,40 +93,40 @@ def __init__(self, sadtalker_path, device): self.generator.eval() self.he_estimator.eval() self.mapping.eval() - + self.device = device - - def load_cpk_facevid2vid_safetensor(self, checkpoint_path, generator=None, - kp_detector=None, he_estimator=None, - device="cpu"): + + def load_cpk_facevid2vid_safetensor(self, checkpoint_path, generator=None, + kp_detector=None, he_estimator=None, + device="cpu"): checkpoint = safetensors.torch.load_file(checkpoint_path) if generator is not None: x_generator = {} - for k,v in checkpoint.items(): + for k, v in checkpoint.items(): if 'generator' in k: x_generator[k.replace('generator.', '')] = v generator.load_state_dict(x_generator) if kp_detector is not None: x_generator = {} - for k,v in checkpoint.items(): + for k, v in checkpoint.items(): if 'kp_extractor' in k: x_generator[k.replace('kp_extractor.', '')] = v kp_detector.load_state_dict(x_generator) if he_estimator is not None: x_generator = {} - for k,v in checkpoint.items(): + for k, v in checkpoint.items(): if 'he_estimator' in k: x_generator[k.replace('he_estimator.', '')] = v he_estimator.load_state_dict(x_generator) - + return None - def load_cpk_facevid2vid(self, checkpoint_path, generator=None, discriminator=None, - kp_detector=None, he_estimator=None, optimizer_generator=None, - optimizer_discriminator=None, optimizer_kp_detector=None, - optimizer_he_estimator=None, device="cpu"): + def load_cpk_facevid2vid(self, checkpoint_path, generator=None, discriminator=None, + kp_detector=None, he_estimator=None, optimizer_generator=None, + optimizer_discriminator=None, optimizer_kp_detector=None, + optimizer_he_estimator=None, device="cpu"): checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) if generator is not 
None: generator.load_state_dict(checkpoint['generator']) @@ -123,26 +136,26 @@ def load_cpk_facevid2vid(self, checkpoint_path, generator=None, discriminator=No he_estimator.load_state_dict(checkpoint['he_estimator']) if discriminator is not None: try: - discriminator.load_state_dict(checkpoint['discriminator']) + discriminator.load_state_dict(checkpoint['discriminator']) except: - print ('No discriminator in the state-dict. Dicriminator will be randomly initialized') + print('No discriminator in the state-dict. Dicriminator will be randomly initialized') if optimizer_generator is not None: optimizer_generator.load_state_dict(checkpoint['optimizer_generator']) if optimizer_discriminator is not None: try: optimizer_discriminator.load_state_dict(checkpoint['optimizer_discriminator']) except RuntimeError as e: - print ('No discriminator optimizer in the state-dict. Optimizer will be not initialized') + print('No discriminator optimizer in the state-dict. Optimizer will be not initialized') if optimizer_kp_detector is not None: optimizer_kp_detector.load_state_dict(checkpoint['optimizer_kp_detector']) if optimizer_he_estimator is not None: optimizer_he_estimator.load_state_dict(checkpoint['optimizer_he_estimator']) return checkpoint['epoch'] - + def load_cpk_mapping(self, checkpoint_path, mapping=None, discriminator=None, - optimizer_mapping=None, optimizer_discriminator=None, device='cpu'): - checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) + optimizer_mapping=None, optimizer_discriminator=None, device='cpu'): + checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) if mapping is not None: mapping.load_state_dict(checkpoint['mapping']) if discriminator is not None: @@ -154,14 +167,15 @@ def load_cpk_mapping(self, checkpoint_path, mapping=None, discriminator=None, return checkpoint['epoch'] - def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, background_enhancer=None, preprocess='crop', img_size=256): - - source_image=x['source_image'].type(torch.FloatTensor) - source_semantics=x['source_semantics'].type(torch.FloatTensor) - target_semantics=x['target_semantics_list'].type(torch.FloatTensor) - source_image=source_image.to(self.device) - source_semantics=source_semantics.to(self.device) - target_semantics=target_semantics.to(self.device) + def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, background_enhancer=None, + preprocess='crop', img_size=256): + t = time.time() + source_image = x['source_image'].type(torch.FloatTensor) + source_semantics = x['source_semantics'].type(torch.FloatTensor) + target_semantics = x['target_semantics_list'].type(torch.FloatTensor) + source_image = source_image.to(self.device) + source_semantics = source_semantics.to(self.device) + target_semantics = target_semantics.to(self.device) if 'yaw_c_seq' in x: yaw_c_seq = x['yaw_c_seq'].type(torch.FloatTensor) yaw_c_seq = x['yaw_c_seq'].to(self.device) @@ -173,85 +187,125 @@ def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, backgr else: pitch_c_seq = None if 'roll_c_seq' in x: - roll_c_seq = x['roll_c_seq'].type(torch.FloatTensor) + roll_c_seq = x['roll_c_seq'].type(torch.FloatTensor) roll_c_seq = x['roll_c_seq'].to(self.device) else: roll_c_seq = None - frame_num = x['frame_num'] + record_process_log(self.__class__.__name__, "generate", time.time()-t, "source_image") + frame_num = x['frame_num'] + t = time.time() predictions_video = make_animation(source_image, source_semantics, target_semantics, - 
self.generator, self.kp_extractor, self.he_estimator, self.mapping, - yaw_c_seq, pitch_c_seq, roll_c_seq, use_exp = True) + self.generator, self.kp_extractor, self.he_estimator, self.mapping, + yaw_c_seq, pitch_c_seq, roll_c_seq, use_exp=True) + # call make_animation v100 消耗时长:16.85565281 + record_process_log(self.__class__.__name__, "generate", time.time()-t, "make_animation") - predictions_video = predictions_video.reshape((-1,)+predictions_video.shape[2:]) + t = time.time() + predictions_video = predictions_video.reshape((-1,) + predictions_video.shape[2:]) predictions_video = predictions_video[:frame_num] + record_process_log(self.__class__.__name__, "generate", time.time()-t, "predictions_video") video = [] + t = time.time() for idx in range(predictions_video.shape[0]): image = predictions_video[idx] image = np.transpose(image.data.cpu().numpy(), [1, 2, 0]).astype(np.float32) video.append(image) - result = img_as_ubyte(video) + record_process_log(self.__class__.__name__, "generate", time.time()-t, "range(predictions_video.shape[0])") + t = time.time() + result = img_as_ubyte(video) ### the generated video is 256x256, so we keep the aspect ratio, original_size = crop_info[0] if original_size: - result = [ cv2.resize(result_i,(img_size, int(img_size * original_size[1]/original_size[0]) )) for result_i in result ] - - video_name = x['video_name'] + '.mp4' - path = os.path.join(video_save_dir, 'temp_'+video_name) - - imageio.mimsave(path, result, fps=float(25)) + result = [cv2.resize(result_i, (img_size, int(img_size * original_size[1] / original_size[0]))) for result_i + in result] + record_process_log(self.__class__.__name__, "generate", time.time()-t, "original_size") + + video_name = x['video_name'] + '.mp4' + path = os.path.join(video_save_dir, 'temp_' + video_name) + + t = time.time() + + imageio.mimsave(path, result, fps=float(25)) + record_process_log(self.__class__.__name__, "generate", time.time()-t, "imageio.mimsave") av_path = os.path.join(video_save_dir, video_name) - return_path = av_path - - audio_path = x['audio_path'] + return_path = av_path + + audio_path = x['audio_path'] audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0] - new_audio_path = os.path.join(video_save_dir, audio_name+'.wav') + new_audio_path = os.path.join(video_save_dir, audio_name + '.wav') start_time = 0 # cog will not keep the .mp3 filename sound = AudioSegment.from_file(audio_path) - frames = frame_num - end_time = start_time + frames*1/25*1000 - word1=sound.set_frame_rate(16000) + frames = frame_num + end_time = start_time + frames * 1 / 25 * 1000 + word1 = sound.set_frame_rate(16000) word = word1[start_time:end_time] word.export(new_audio_path, format="wav") - save_video_with_watermark(path, new_audio_path, av_path, watermark= False) - print(f'The generated video is named {video_save_dir}/{video_name}') + save_video_with_watermark(path, new_audio_path, av_path, watermark=False) + print(f'The generated video is named {video_save_dir}/{video_name}') + t = time.time() if 'full' in preprocess.lower(): # only add watermark to the full image. - video_name_full = x['video_name'] + '_full.mp4' + video_name_full = x['video_name'] + '_full.mp4' full_video_path = os.path.join(video_save_dir, video_name_full) return_path = full_video_path - paste_pic(path, pic_path, crop_info, new_audio_path, full_video_path, extended_crop= True if 'ext' in preprocess.lower() else False) - print(f'The generated video is named {video_save_dir}/{video_name_full}') + # TODO (qingyuan): paste_pic takes 9s. 
Check the comments inside the function on how to optimize it
+            paste_pic(path, pic_path, crop_info, new_audio_path, full_video_path,
+                      extended_crop=True if 'ext' in preprocess.lower() else False)
+
+            record_process_log(self.__class__.__name__, "generate", time.time() - t, "paste_pic")
+            print(f'The generated video is named {video_save_dir}/{video_name_full}')
         else:
-            full_video_path = av_path
+            full_video_path = av_path

-        #### paste back then enhancers
+        #### paste back then enhancers
         if enhancer:
-            video_name_enhancer = x['video_name'] + '_enhanced.mp4'
-            enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer)
-            av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
+            video_name_enhancer = x['video_name'] + '_enhanced.mp4'
+            enhanced_path = os.path.join(video_save_dir, 'temp_' + video_name_enhancer)
+            av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
             return_path = av_path_enhancer
+            t = time.time()
             try:
-                enhanced_images_gen_with_len = enhancer_generator_with_len(full_video_path, method=enhancer, bg_upsampler=background_enhancer)
+                et = time.time()
+                enhanced_images_gen_with_len = enhancer_generator_with_len(full_video_path, method=enhancer,
+                                                                           bg_upsampler=background_enhancer)
+                record_process_log(self.__class__.__name__, "generate", time.time() - et, "enhancer_generator_with_len")
+                it = time.time()
                 imageio.mimsave(enhanced_path, enhanced_images_gen_with_len, fps=float(25))
+                # enhancer_generator_with_len.imageio.mimsave elapsed time: 50.53366041 s
+                # TODO (qingyuan): mimsave is an alias for mimwrite. The source code is here:
+                # https://github.com/imageio/imageio/blob/dac86e36696d4afbd2b6588d8fd119107cdfaf3f/imageio/v2.py#L469
+                # All the time is spent on disk i/o. We may send the video back directly without writing to disk.
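One way to act on the TODO above ("send the video back directly without writing to disk") would be to pipe raw frames straight into ffmpeg and read the encoded MP4 from stdout. The helper below is only a sketch of that idea, not part of the patch; it assumes `ffmpeg` is on `PATH` and that the frames are same-sized uint8 RGB arrays with even dimensions:

```python
# Hypothetical helper illustrating the "no temp file" idea from the TODO above.
import subprocess
import numpy as np


def encode_frames_to_mp4_bytes(frames, fps=25):
    """Encode a sequence of HxWx3 uint8 RGB frames to an in-memory MP4 (bytes)."""
    h, w = frames[0].shape[:2]
    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-f", "rawvideo", "-pix_fmt", "rgb24", "-s", f"{w}x{h}", "-r", str(fps), "-i", "-",
        "-c:v", "libx264", "-pix_fmt", "yuv420p",
        "-movflags", "frag_keyframe+empty_moov",  # fragmented MP4 so a non-seekable pipe works
        "-f", "mp4", "pipe:1",
    ]
    raw = np.ascontiguousarray(frames, dtype=np.uint8).tobytes()
    proc = subprocess.run(cmd, input=raw, stdout=subprocess.PIPE, check=True)
    return proc.stdout  # MP4 container as bytes, ready to hand back to the caller
```

The audio mux done later by `save_video_with_watermark` would still need a pass over the result, so how much disk I/O this actually removes depends on how the rest of the pipeline consumes the file.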
+ record_process_log(self.__class__.__name__, "generate", time.time() - it, "enhancer_generator_with_len.imageio.mimsave") + except: - enhanced_images_gen_with_len = enhancer_list(full_video_path, method=enhancer, bg_upsampler=background_enhancer) + _t = time.time() + enhanced_images_gen_with_len = enhancer_list(full_video_path, method=enhancer, + bg_upsampler=background_enhancer) + record_process_log(self.__class__.__name__, "generate", time.time() - _t, "enhancer_list") + it = time.time() imageio.mimsave(enhanced_path, enhanced_images_gen_with_len, fps=float(25)) - - save_video_with_watermark(enhanced_path, new_audio_path, av_path_enhancer, watermark= False) + record_process_log(self.__class__.__name__, "generate", time.time() - it, "enhancer_list.imageio.mimsave") + + # enhanced_images_gen_with_len,支持try except 这部分block 整体时间:50.68383718,99%的时间都是消耗在 enhancer_generator_with_len.imageio.mimsave + record_process_log(self.__class__.__name__, "generate", time.time() - t, "enhanced_images_gen_with_len") + save_video_with_watermark(enhanced_path, new_audio_path, av_path_enhancer, watermark=False) print(f'The generated video is named {video_save_dir}/{video_name_enhancer}') + record_process_log(self.__class__.__name__, "generate", 0, f"av_path_enhancer:{get_file_size(av_path_enhancer)}") + + record_process_log(self.__class__.__name__, "generate", 0, f"enhanced_path:{get_file_size(enhanced_path)}") os.remove(enhanced_path) + record_process_log(self.__class__.__name__, "generate", 0, + f"new_audio_path:{get_file_size(new_audio_path)}") os.remove(path) os.remove(new_audio_path) - return return_path - diff --git a/src/facerender/modules/make_animation.py b/src/facerender/modules/make_animation.py index 3360c535..9ca93356 100644 --- a/src/facerender/modules/make_animation.py +++ b/src/facerender/modules/make_animation.py @@ -2,7 +2,10 @@ import torch import torch.nn.functional as F import numpy as np -from tqdm import tqdm +from tqdm import tqdm +import time +from src.utils.process_log import record_process_log + def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale=False, use_relative_movement=False, use_relative_jacobian=False): @@ -26,14 +29,16 @@ def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale return kp_new + def headpose_pred_to_degree(pred): device = pred.device idx_tensor = [idx for idx in range(66)] idx_tensor = torch.FloatTensor(idx_tensor).type_as(pred).to(device) pred = F.softmax(pred) - degree = torch.sum(pred*idx_tensor, 1) * 3 - 99 + degree = torch.sum(pred * idx_tensor, 1) * 3 - 99 return degree + def get_rotation_matrix(yaw, pitch, roll): yaw = yaw / 180 * 3.14 pitch = pitch / 180 * 3.14 @@ -43,29 +48,30 @@ def get_rotation_matrix(yaw, pitch, roll): pitch = pitch.unsqueeze(1) yaw = yaw.unsqueeze(1) - pitch_mat = torch.cat([torch.ones_like(pitch), torch.zeros_like(pitch), torch.zeros_like(pitch), - torch.zeros_like(pitch), torch.cos(pitch), -torch.sin(pitch), - torch.zeros_like(pitch), torch.sin(pitch), torch.cos(pitch)], dim=1) + pitch_mat = torch.cat([torch.ones_like(pitch), torch.zeros_like(pitch), torch.zeros_like(pitch), + torch.zeros_like(pitch), torch.cos(pitch), -torch.sin(pitch), + torch.zeros_like(pitch), torch.sin(pitch), torch.cos(pitch)], dim=1) pitch_mat = pitch_mat.view(pitch_mat.shape[0], 3, 3) - yaw_mat = torch.cat([torch.cos(yaw), torch.zeros_like(yaw), torch.sin(yaw), - torch.zeros_like(yaw), torch.ones_like(yaw), torch.zeros_like(yaw), - -torch.sin(yaw), torch.zeros_like(yaw), torch.cos(yaw)], dim=1) + 
yaw_mat = torch.cat([torch.cos(yaw), torch.zeros_like(yaw), torch.sin(yaw), + torch.zeros_like(yaw), torch.ones_like(yaw), torch.zeros_like(yaw), + -torch.sin(yaw), torch.zeros_like(yaw), torch.cos(yaw)], dim=1) yaw_mat = yaw_mat.view(yaw_mat.shape[0], 3, 3) - roll_mat = torch.cat([torch.cos(roll), -torch.sin(roll), torch.zeros_like(roll), - torch.sin(roll), torch.cos(roll), torch.zeros_like(roll), - torch.zeros_like(roll), torch.zeros_like(roll), torch.ones_like(roll)], dim=1) + roll_mat = torch.cat([torch.cos(roll), -torch.sin(roll), torch.zeros_like(roll), + torch.sin(roll), torch.cos(roll), torch.zeros_like(roll), + torch.zeros_like(roll), torch.zeros_like(roll), torch.ones_like(roll)], dim=1) roll_mat = roll_mat.view(roll_mat.shape[0], 3, 3) rot_mat = torch.einsum('bij,bjk,bkm->bim', pitch_mat, yaw_mat, roll_mat) return rot_mat + def keypoint_transformation(kp_canonical, he, wo_exp=False): - kp = kp_canonical['value'] # (bs, k, 3) - yaw, pitch, roll= he['yaw'], he['pitch'], he['roll'] - yaw = headpose_pred_to_degree(yaw) + kp = kp_canonical['value'] # (bs, k, 3) + yaw, pitch, roll = he['yaw'], he['pitch'], he['roll'] + yaw = headpose_pred_to_degree(yaw) pitch = headpose_pred_to_degree(pitch) roll = headpose_pred_to_degree(roll) @@ -76,18 +82,18 @@ def keypoint_transformation(kp_canonical, he, wo_exp=False): if 'roll_in' in he: roll = he['roll_in'] - rot_mat = get_rotation_matrix(yaw, pitch, roll) # (bs, 3, 3) + rot_mat = get_rotation_matrix(yaw, pitch, roll) # (bs, 3, 3) t, exp = he['t'], he['exp'] if wo_exp: - exp = exp*0 - - # keypoint rotation + exp = exp * 0 + + # keypoint rotation kp_rotated = torch.einsum('bmp,bkp->bkm', rot_mat, kp) # keypoint translation - t[:, 0] = t[:, 0]*0 - t[:, 2] = t[:, 2]*0 + t[:, 0] = t[:, 0] * 0 + t[:, 2] = t[:, 2] * 0 t = t.unsqueeze(1).repeat(1, kp.shape[1], 1) kp_t = kp_rotated + t @@ -98,34 +104,40 @@ def keypoint_transformation(kp_canonical, he, wo_exp=False): return {'value': kp_transformed} - def make_animation(source_image, source_semantics, target_semantics, - generator, kp_detector, he_estimator, mapping, - yaw_c_seq=None, pitch_c_seq=None, roll_c_seq=None, - use_exp=True, use_half=False): + generator, kp_detector, he_estimator, mapping, + yaw_c_seq=None, pitch_c_seq=None, roll_c_seq=None, + use_exp=True, use_half=False): with torch.no_grad(): predictions = [] kp_canonical = kp_detector(source_image) he_source = mapping(source_semantics) kp_source = keypoint_transformation(kp_canonical, he_source) - + t = time.time() for frame_idx in tqdm(range(target_semantics.shape[1]), 'Face Renderer:'): # still check the dimension # print(target_semantics.shape, source_semantics.shape) + for_loop_t = time.time() target_semantics_frame = target_semantics[:, frame_idx] he_driving = mapping(target_semantics_frame) if yaw_c_seq is not None: he_driving['yaw_in'] = yaw_c_seq[:, frame_idx] if pitch_c_seq is not None: - he_driving['pitch_in'] = pitch_c_seq[:, frame_idx] + he_driving['pitch_in'] = pitch_c_seq[:, frame_idx] if roll_c_seq is not None: - he_driving['roll_in'] = roll_c_seq[:, frame_idx] - + he_driving['roll_in'] = roll_c_seq[:, frame_idx] + kp_driving = keypoint_transformation(kp_canonical, he_driving) - + # record_process_log("make_animation", "Face Renderer for loop: keypoint_transformation", + # time.time() - for_loop_t, + # f"frame_idx:{frame_idx}") kp_norm = kp_driving + gt = time.time() out = generator(source_image, kp_source=kp_source, kp_driving=kp_norm) + # record_process_log("make_animation", "Face Renderer for loop: generator", + # 
time.time() - gt, + # f"frame_idx:{frame_idx}") ''' source_image_new = out['prediction'].squeeze(1) kp_canonical_new = kp_detector(source_image_new) @@ -135,9 +147,16 @@ def make_animation(source_image, source_semantics, target_semantics, out = generator(source_image_new, kp_source=kp_source_new, kp_driving=kp_driving_new) ''' predictions.append(out['prediction']) + + # record_process_log("make_animation", "Face Renderer for loop", time.time() - for_loop_t, f"frame_idx:{frame_idx}") + predictions_ts = torch.stack(predictions, dim=1) + # Face Renderer 总消耗时长:16.83923388,可以使用并行处理for loop + # TODO(qingyuan): change forloop to parallel processing + record_process_log("make_animation", "Face Renderer", time.time()-t) return predictions_ts + class AnimateModel(torch.nn.Module): """ Merge all generator related updates into single model for better multi-gpu usage @@ -154,7 +173,6 @@ def __init__(self, generator, kp_extractor, mapping): self.mapping.eval() def forward(self, x): - source_image = x['source_image'] source_semantics = x['source_semantics'] target_semantics = x['target_semantics'] @@ -163,8 +181,8 @@ def forward(self, x): roll_c_seq = x['roll_c_seq'] predictions_video = make_animation(source_image, source_semantics, target_semantics, - self.generator, self.kp_extractor, - self.mapping, use_exp = True, - yaw_c_seq=yaw_c_seq, pitch_c_seq=pitch_c_seq, roll_c_seq=roll_c_seq) - - return predictions_video \ No newline at end of file + self.generator, self.kp_extractor, + self.mapping, use_exp=True, + yaw_c_seq=yaw_c_seq, pitch_c_seq=pitch_c_seq, roll_c_seq=roll_c_seq) + + return predictions_video diff --git a/src/generate_facerender_batch.py b/src/generate_facerender_batch.py index a62b6edf..03db82ee 100644 --- a/src/generate_facerender_batch.py +++ b/src/generate_facerender_batch.py @@ -5,15 +5,15 @@ import torch import scipy.io as scio -def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, - batch_size, input_yaw_list=None, input_pitch_list=None, input_roll_list=None, - expression_scale=1.0, still_mode = False, preprocess='crop', size = 256): +def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, + batch_size, input_yaw_list=None, input_pitch_list=None, input_roll_list=None, + expression_scale=1.0, still_mode=False, preprocess='crop', size=256): semantic_radius = 13 video_name = os.path.splitext(os.path.split(coeff_path)[-1])[0] txt_path = os.path.splitext(coeff_path)[0] - data={} + data = {} img1 = Image.open(pic_path) source_image = np.array(img1) @@ -23,17 +23,17 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, source_image_ts = torch.FloatTensor(source_image).unsqueeze(0) source_image_ts = source_image_ts.repeat(batch_size, 1, 1, 1) data['source_image'] = source_image_ts - + source_semantics_dict = scio.loadmat(first_coeff_path) generated_dict = scio.loadmat(coeff_path) if 'full' not in preprocess.lower(): - source_semantics = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70 - generated_3dmm = generated_dict['coeff_3dmm'][:,:70] + source_semantics = source_semantics_dict['coeff_3dmm'][:1, :70] # 1 70 + generated_3dmm = generated_dict['coeff_3dmm'][:, :70] else: - source_semantics = source_semantics_dict['coeff_3dmm'][:1,:73] #1 70 - generated_3dmm = generated_dict['coeff_3dmm'][:,:70] + source_semantics = source_semantics_dict['coeff_3dmm'][:1, :73] # 1 70 + generated_3dmm = generated_dict['coeff_3dmm'][:, :70] source_semantics_new = transform_semantic_1(source_semantics, semantic_radius) source_semantics_ts = 
torch.FloatTensor(source_semantics_new).unsqueeze(0) @@ -44,35 +44,37 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, generated_3dmm[:, :64] = generated_3dmm[:, :64] * expression_scale if 'full' in preprocess.lower(): - generated_3dmm = np.concatenate([generated_3dmm, np.repeat(source_semantics[:,70:], generated_3dmm.shape[0], axis=0)], axis=1) + generated_3dmm = np.concatenate( + [generated_3dmm, np.repeat(source_semantics[:, 70:], generated_3dmm.shape[0], axis=0)], axis=1) if still_mode: generated_3dmm[:, 64:] = np.repeat(source_semantics[:, 64:], generated_3dmm.shape[0], axis=0) - with open(txt_path+'.txt', 'w') as f: + with open(txt_path + '.txt', 'w') as f: for coeff in generated_3dmm: for i in coeff: - f.write(str(i)[:7] + ' '+'\t') + f.write(str(i)[:7] + ' ' + '\t') f.write('\n') - target_semantics_list = [] + target_semantics_list = [] frame_num = generated_3dmm.shape[0] data['frame_num'] = frame_num for frame_idx in range(frame_num): target_semantics = transform_semantic_target(generated_3dmm, frame_idx, semantic_radius) target_semantics_list.append(target_semantics) - remainder = frame_num%batch_size - if remainder!=0: - for _ in range(batch_size-remainder): + remainder = frame_num % batch_size + if remainder != 0: + for _ in range(batch_size - remainder): target_semantics_list.append(target_semantics) - target_semantics_np = np.array(target_semantics_list) #frame_num 70 semantic_radius*2+1 - target_semantics_np = target_semantics_np.reshape(batch_size, -1, target_semantics_np.shape[-2], target_semantics_np.shape[-1]) + target_semantics_np = np.array(target_semantics_list) # frame_num 70 semantic_radius*2+1 + target_semantics_np = target_semantics_np.reshape(batch_size, -1, target_semantics_np.shape[-2], + target_semantics_np.shape[-1]) data['target_semantics_list'] = torch.FloatTensor(target_semantics_np) data['video_name'] = video_name data['audio_path'] = audio_path - + if input_yaw_list is not None: yaw_c_seq = gen_camera_pose(input_yaw_list, frame_num, batch_size) data['yaw_c_seq'] = torch.FloatTensor(yaw_c_seq) @@ -80,57 +82,58 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, pitch_c_seq = gen_camera_pose(input_pitch_list, frame_num, batch_size) data['pitch_c_seq'] = torch.FloatTensor(pitch_c_seq) if input_roll_list is not None: - roll_c_seq = gen_camera_pose(input_roll_list, frame_num, batch_size) + roll_c_seq = gen_camera_pose(input_roll_list, frame_num, batch_size) data['roll_c_seq'] = torch.FloatTensor(roll_c_seq) - + return data + def transform_semantic_1(semantic, semantic_radius): - semantic_list = [semantic for i in range(0, semantic_radius*2+1)] + semantic_list = [semantic for i in range(0, semantic_radius * 2 + 1)] coeff_3dmm = np.concatenate(semantic_list, 0) - return coeff_3dmm.transpose(1,0) + return coeff_3dmm.transpose(1, 0) + def transform_semantic_target(coeff_3dmm, frame_index, semantic_radius): num_frames = coeff_3dmm.shape[0] - seq = list(range(frame_index- semantic_radius, frame_index + semantic_radius+1)) - index = [ min(max(item, 0), num_frames-1) for item in seq ] + seq = list(range(frame_index - semantic_radius, frame_index + semantic_radius + 1)) + index = [min(max(item, 0), num_frames - 1) for item in seq] coeff_3dmm_g = coeff_3dmm[index, :] - return coeff_3dmm_g.transpose(1,0) + return coeff_3dmm_g.transpose(1, 0) -def gen_camera_pose(camera_degree_list, frame_num, batch_size): - new_degree_list = [] +def gen_camera_pose(camera_degree_list, frame_num, batch_size): + new_degree_list = [] 
if len(camera_degree_list) == 1: for _ in range(frame_num): - new_degree_list.append(camera_degree_list[0]) - remainder = frame_num%batch_size - if remainder!=0: - for _ in range(batch_size-remainder): + new_degree_list.append(camera_degree_list[0]) + remainder = frame_num % batch_size + if remainder != 0: + for _ in range(batch_size - remainder): new_degree_list.append(new_degree_list[-1]) - new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) + new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) return new_degree_np degree_sum = 0. for i, degree in enumerate(camera_degree_list[1:]): - degree_sum += abs(degree-camera_degree_list[i]) - - degree_per_frame = degree_sum/(frame_num-1) + degree_sum += abs(degree - camera_degree_list[i]) + + degree_per_frame = degree_sum / (frame_num - 1) for i, degree in enumerate(camera_degree_list[1:]): degree_last = camera_degree_list[i] - degree_step = degree_per_frame * abs(degree-degree_last)/(degree-degree_last) - new_degree_list = new_degree_list + list(np.arange(degree_last, degree, degree_step)) + degree_step = degree_per_frame * abs(degree - degree_last) / (degree - degree_last) + new_degree_list = new_degree_list + list(np.arange(degree_last, degree, degree_step)) if len(new_degree_list) > frame_num: new_degree_list = new_degree_list[:frame_num] elif len(new_degree_list) < frame_num: - for _ in range(frame_num-len(new_degree_list)): + for _ in range(frame_num - len(new_degree_list)): new_degree_list.append(new_degree_list[-1]) print(len(new_degree_list)) print(frame_num) - remainder = frame_num%batch_size - if remainder!=0: - for _ in range(batch_size-remainder): + remainder = frame_num % batch_size + if remainder != 0: + for _ in range(batch_size - remainder): new_degree_list.append(new_degree_list[-1]) - new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) + new_degree_np = np.array(new_degree_list).reshape(batch_size, -1) return new_degree_np - diff --git a/src/test_audio2coeff.py b/src/test_audio2coeff.py index be66bffd..0cd1d352 100644 --- a/src/test_audio2coeff.py +++ b/src/test_audio2coeff.py @@ -1,4 +1,4 @@ -import os +import os import torch import numpy as np from scipy.io import savemat, loadmat @@ -6,12 +6,15 @@ from scipy.signal import savgol_filter import safetensors -import safetensors.torch +import safetensors.torch from src.audio2pose_models.audio2pose import Audio2Pose -from src.audio2exp_models.networks import SimpleWrapperV2 +from src.audio2exp_models.networks import SimpleWrapperV2 from src.audio2exp_models.audio2exp import Audio2Exp -from src.utils.safetensor_helper import load_x_from_safetensor +from src.utils.safetensor_helper import load_x_from_safetensor +from src.utils.process_log import record_process_log +import time + def load_cpk(checkpoint_path, model=None, optimizer=None, device="cpu"): checkpoint = torch.load(checkpoint_path, map_location=torch.device(device)) @@ -22,10 +25,11 @@ def load_cpk(checkpoint_path, model=None, optimizer=None, device="cpu"): return checkpoint['epoch'] + class Audio2Coeff(): def __init__(self, sadtalker_path, device): - #load config + # load config fcfg_pose = open(sadtalker_path['audio2pose_yaml_path']) cfg_pose = CN.load_cfg(fcfg_pose) cfg_pose.freeze() @@ -33,22 +37,31 @@ def __init__(self, sadtalker_path, device): cfg_exp = CN.load_cfg(fcfg_exp) cfg_exp.freeze() + t = time.time() + # load audio2pose_model self.audio2pose_model = Audio2Pose(cfg_pose, None, device=device) + + record_process_log(self.__class__.__name__, "Audio2Pose", time.time()-t, 
"self.audio2pose_model") + self.audio2pose_model = self.audio2pose_model.to(device) self.audio2pose_model.eval() for param in self.audio2pose_model.parameters(): - param.requires_grad = False - + param.requires_grad = False + + t = time.time() try: if sadtalker_path['use_safetensor']: checkpoints = safetensors.torch.load_file(sadtalker_path['checkpoint']) self.audio2pose_model.load_state_dict(load_x_from_safetensor(checkpoints, 'audio2pose')) + record_process_log(self.__class__.__name__, "self.audio2pose_model.load_state_dict", time.time()-t) else: load_cpk(sadtalker_path['audio2pose_checkpoint'], model=self.audio2pose_model, device=device) + record_process_log(self.__class__.__name__, "load_cpk", time.time() - t) except: raise Exception("Failed in loading audio2pose_checkpoint") + t = time.time() # load audio2exp_model netG = SimpleWrapperV2() netG = netG.to(device) @@ -63,62 +76,65 @@ def __init__(self, sadtalker_path, device): load_cpk(sadtalker_path['audio2exp_checkpoint'], model=netG, device=device) except: raise Exception("Failed in loading audio2exp_checkpoint") + + record_process_log(self.__class__.__name__, "load audio2exp_model", time.time() - t, "checkpoints") + + t = time.time() self.audio2exp_model = Audio2Exp(netG, cfg_exp, device=device, prepare_training_loss=False) self.audio2exp_model = self.audio2exp_model.to(device) for param in self.audio2exp_model.parameters(): param.requires_grad = False self.audio2exp_model.eval() - + record_process_log(self.__class__.__name__, "Audio2Exp", time.time() - t, "self.audio2exp_model") + self.device = device def generate(self, batch, coeff_save_dir, pose_style, ref_pose_coeff_path=None): with torch.no_grad(): - #test - results_dict_exp= self.audio2exp_model.test(batch) - exp_pred = results_dict_exp['exp_coeff_pred'] #bs T 64 + # test + results_dict_exp = self.audio2exp_model.test(batch) + exp_pred = results_dict_exp['exp_coeff_pred'] # bs T 64 - #for class_id in range(1): - #class_id = 0#(i+10)%45 - #class_id = random.randint(0,46) #46 styles can be selected + # for class_id in range(1): + # class_id = 0#(i+10)%45 + # class_id = random.randint(0,46) #46 styles can be selected batch['class'] = torch.LongTensor([pose_style]).to(self.device) - results_dict_pose = self.audio2pose_model.test(batch) - pose_pred = results_dict_pose['pose_pred'] #bs T 6 + results_dict_pose = self.audio2pose_model.test(batch) + pose_pred = results_dict_pose['pose_pred'] # bs T 6 pose_len = pose_pred.shape[1] - if pose_len<13: - pose_len = int((pose_len-1)/2)*2+1 + if pose_len < 13: + pose_len = int((pose_len - 1) / 2) * 2 + 1 pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), pose_len, 2, axis=1)).to(self.device) else: - pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), 13, 2, axis=1)).to(self.device) - - coeffs_pred = torch.cat((exp_pred, pose_pred), dim=-1) #bs T 70 + pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), 13, 2, axis=1)).to(self.device) + + coeffs_pred = torch.cat((exp_pred, pose_pred), dim=-1) # bs T 70 + + coeffs_pred_numpy = coeffs_pred[0].clone().detach().cpu().numpy() - coeffs_pred_numpy = coeffs_pred[0].clone().detach().cpu().numpy() + if ref_pose_coeff_path is not None: + coeffs_pred_numpy = self.using_refpose(coeffs_pred_numpy, ref_pose_coeff_path) - - if ref_pose_coeff_path is not None: - coeffs_pred_numpy = self.using_refpose(coeffs_pred_numpy, ref_pose_coeff_path) - - savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])), + 
savemat(os.path.join(coeff_save_dir, '%s##%s.mat' % (batch['pic_name'], batch['audio_name'])), {'coeff_3dmm': coeffs_pred_numpy}) - return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])) - + return os.path.join(coeff_save_dir, '%s##%s.mat' % (batch['pic_name'], batch['audio_name'])) + def using_refpose(self, coeffs_pred_numpy, ref_pose_coeff_path): num_frames = coeffs_pred_numpy.shape[0] refpose_coeff_dict = loadmat(ref_pose_coeff_path) - refpose_coeff = refpose_coeff_dict['coeff_3dmm'][:,64:70] + refpose_coeff = refpose_coeff_dict['coeff_3dmm'][:, 64:70] refpose_num_frames = refpose_coeff.shape[0] - if refpose_num_frames None: - model_name = TTS.list_models()[0] + model_name = TTS().list_models()[0] self.tts = TTS(model_name) def test(self, text, language='en'): @@ -17,4 +17,4 @@ def test(self, text, language='en'): self.tts.tts_to_file(text, speaker=self.tts.speakers[0], language=language, file_path=tempf.name) - return tempf.name \ No newline at end of file + return tempf.name diff --git a/src/utils/videoio.py b/src/utils/videoio.py index 08bfbdd7..98cf746a 100644 --- a/src/utils/videoio.py +++ b/src/utils/videoio.py @@ -38,4 +38,4 @@ def save_video_with_watermark(video, audio, save_path, watermark=False): cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -filter_complex "[1]scale=100:-1[wm];[0][wm]overlay=(main_w-overlay_w)-10:10" "%s"' % (temp_file, watarmark_path, save_path) os.system(cmd) - os.remove(temp_file) \ No newline at end of file + os.remove(temp_file)
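Finally, `make_animation` carries a `TODO(qingyuan): change forloop to parallel processing` and a measurement of roughly 16.8 s for the per-frame Face Renderer loop. One possible direction is to fold several frames into the generator's batch dimension instead of calling it once per frame. The sketch below is an assumption-laden illustration, not the patch's implementation: it presumes that `mapping` and `generator` accept an arbitrary leading batch size and that GPU memory allows `chunk` frames at once, and it drops the `he_estimator` argument and the yaw/pitch/roll override sequences for brevity.

```python
# Hypothetical batched variant of the Face Renderer loop in make_animation (sketch only).
import torch
from src.facerender.modules.make_animation import keypoint_transformation


@torch.no_grad()
def make_animation_batched(source_image, source_semantics, target_semantics,
                           generator, kp_detector, mapping, chunk=8):
    bs, num_frames = target_semantics.shape[:2]
    kp_canonical = kp_detector(source_image)
    kp_source = keypoint_transformation(kp_canonical, mapping(source_semantics))

    predictions = []
    for start in range(0, num_frames, chunk):
        sem = target_semantics[:, start:start + chunk]      # (bs, c, 70, 27)
        c = sem.shape[1]
        sem = sem.reshape(bs * c, *sem.shape[2:])           # fold frames into the batch dim
        he_driving = mapping(sem)

        # repeat the static inputs so they line up with the folded frames
        kp_canonical_rep = {'value': kp_canonical['value'].repeat_interleave(c, dim=0)}
        kp_driving = keypoint_transformation(kp_canonical_rep, he_driving)
        src = source_image.repeat_interleave(c, dim=0)
        kp_src = {'value': kp_source['value'].repeat_interleave(c, dim=0)}

        out = generator(src, kp_source=kp_src, kp_driving=kp_driving)
        predictions.append(out['prediction'].reshape(bs, c, *out['prediction'].shape[1:]))

    return torch.cat(predictions, dim=1)  # (bs, num_frames, 3, H, W), same layout as the original
```

Whether this helps in practice depends on how close a single-frame generator call already is to saturating the GPU; the chunk size would need to be tuned against memory.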