Edit colab notebooks #24

Open · wants to merge 2 commits into main
107 changes: 51 additions & 56 deletions quick_demo.ipynb
@@ -191,8 +191,8 @@
"source": [
"%cd MakeItTalk/\n",
"!export PYTHONPATH=/content/MakeItTalk:$PYTHONPATH\n",
"!pip install -r requirements.txt\n",
"!pip install tensorboardX"
"!pip install -q -r requirements.txt\n",
"!pip install -q tensorboardX"
],
"execution_count": null,
"outputs": [
@@ -273,7 +273,6 @@
"source": [
"!mkdir examples/dump\n",
"!mkdir examples/ckpt\n",
"!pip install gdown\n",
"!gdown -O examples/ckpt/ckpt_autovc.pth https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x\n",
"!gdown -O examples/ckpt/ckpt_content_branch.pth https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp\n",
"!gdown -O examples/ckpt/ckpt_speaker_branch.pth https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu\n",
@@ -341,20 +340,21 @@
"source": [
"import sys\n",
"sys.path.append(\"thirdparty/AdaptiveWingLoss\")\n",
"import os, glob\n",
"import os\n",
"import glob\n",
"import numpy as np\n",
"import cv2\n",
"import argparse\n",
"from src.approaches.train_image_translation import Image_translation_block\n",
"import torch\n",
"import pickle\n",
"import face_alignment\n",
"from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n",
"import shutil\n",
"import time\n",
"import util.utils as util\n",
"from scipy.signal import savgol_filter\n",
"from src.approaches.train_audio2landmark import Audio2landmark_model"
"from src.approaches.train_image_translation import Image_translation_block\n",
"from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n",
"from src.approaches.train_audio2landmark import Audio2landmark_model\n",
"from thirdparty.resemblyer_util.speaker_emb import get_spk_emb"
],
"execution_count": null,
"outputs": []
@@ -374,12 +374,12 @@
"id": "58s-c9H8dWPW"
},
"source": [
"default_head_name = 'paint_boy' # the image name (with no .jpg) to animate\n",
"ADD_NAIVE_EYE = True # whether add naive eye blink\n",
"CLOSE_INPUT_FACE_MOUTH = False # if your image has an opened mouth, put this as True, else False\n",
"AMP_LIP_SHAPE_X = 2. # amplify the lip motion in horizontal direction\n",
"AMP_LIP_SHAPE_Y = 2. # amplify the lip motion in vertical direction\n",
"AMP_HEAD_POSE_MOTION = 0.7 # amplify the head pose motion (usually smaller than 1.0, put it to 0. for a static head pose)"
"default_head_name = 'paint_boy' # the image name (with no .jpg) to animate\n",
"ADD_NAIVE_EYE = True # whether add naive eye blink\n",
"CLOSE_INPUT_FACE_MOUTH = False # if your image has an opened mouth, put this as True, else False\n",
"AMP_LIP_SHAPE_X = 2. # amplify the lip motion in horizontal direction\n",
"AMP_LIP_SHAPE_Y = 2. # amplify the lip motion in vertical direction\n",
"AMP_HEAD_POSE_MOTION = 0.7 # amplify the head pose motion (usually smaller than 1.0, put it to 0. for a static head pose)"
],
"execution_count": null,
"outputs": []
@@ -400,18 +400,18 @@
},
"source": [
"parser = argparse.ArgumentParser()\n",
"parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name))\n",
"parser.add_argument('--jpg', type=str, default=f'{default_head_name}.jpg')\n",
"parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')\n",
"\n",
"parser.add_argument('--load_AUTOVC_name', type=str, default='examples/ckpt/ckpt_autovc.pth')\n",
"parser.add_argument('--load_a2l_G_name', type=str, default='examples/ckpt/ckpt_speaker_branch.pth')\n",
"parser.add_argument('--load_a2l_C_name', type=str, default='examples/ckpt/ckpt_content_branch.pth') #ckpt_audio2landmark_c.pth')\n",
"parser.add_argument('--load_G_name', type=str, default='examples/ckpt/ckpt_116_i2i_comb.pth') #ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c\n",
"parser.add_argument('--load_a2l_C_name', type=str, default='examples/ckpt/ckpt_content_branch.pth') # ckpt_audio2landmark_c.pth')\n",
"parser.add_argument('--load_G_name', type=str, default='examples/ckpt/ckpt_116_i2i_comb.pth') # ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c\n",
"\n",
"parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)\n",
"parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)\n",
"parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)\n",
"parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',\n",
"parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',\n",
"parser.add_argument('--add_audio_in', default=False, action='store_true')\n",
"parser.add_argument('--comb_fan_awing', default=False, action='store_true')\n",
"parser.add_argument('--output_folder', type=str, default='examples')\n",
@@ -454,15 +454,17 @@
"id": "SmYcSmrugxQK"
},
"source": [
"img =cv2.imread('examples/' + opt_parser.jpg)\n",
"predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='cpu', flip_input=True)\n",
"img = cv2.imread('examples/' + opt_parser.jpg)\n",
"predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D,\n",
" device='cpu',\n",
" flip_input=True)\n",
"shapes = predictor.get_landmarks(img)\n",
"if (not shapes or len(shapes) != 1):\n",
"if not shapes or len(shapes) != 1:\n",
" print('Cannot detect face landmarks. Exit.')\n",
" exit(-1)\n",
"shape_3d = shapes[0]\n",
"\n",
"if(opt_parser.close_input_face_mouth):\n",
"if opt_parser.close_input_face_mouth:\n",
" util.close_input_face_mouth(shape_3d)"
],
"execution_count": null,
@@ -487,10 +489,10 @@
},
"source": [
"shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * 1.05 + np.mean(shape_3d[48:, 0]) # wider lips\n",
"shape_3d[49:54, 1] += 0. # thinner upper lip\n",
"shape_3d[55:60, 1] -= 1. # thinner lower lip\n",
"shape_3d[[37,38,43,44], 1] -=2. # larger eyes\n",
"shape_3d[[40,41,46,47], 1] +=2. # larger eyes"
"shape_3d[49:54, 1] += 0. # thinner upper lip\n",
"shape_3d[55:60, 1] -= 1. # thinner lower lip\n",
"shape_3d[[37, 38, 43, 44], 1] -= 2. # larger eyes\n",
"shape_3d[[40, 41, 46, 47], 1] += 2. # larger eyes"
],
"execution_count": null,
"outputs": []
@@ -537,24 +539,23 @@
"au_data = []\n",
"au_emb = []\n",
"ains = glob.glob1('examples', '*.wav')\n",
"ains = [item for item in ains if item is not 'tmp.wav']\n",
"ains = [item for item in ains if item != 'tmp.wav']\n",
"ains.sort()\n",
"for ain in ains:\n",
" os.system('ffmpeg -y -loglevel error -i examples/{} -ar 16000 examples/tmp.wav'.format(ain))\n",
" shutil.copyfile('examples/tmp.wav', 'examples/{}'.format(ain))\n",
" os.system(f'ffmpeg -y -loglevel error -i examples/{ain} -ar 16000 examples/tmp.wav')\n",
" shutil.copyfile('examples/tmp.wav', f'examples/{ain}')\n",
"\n",
" # au embedding\n",
" from thirdparty.resemblyer_util.speaker_emb import get_spk_emb\n",
" me, ae = get_spk_emb('examples/{}'.format(ain))\n",
" au_emb.append(me.reshape(-1))\n",
"\n",
" print('Processing audio file', ain)\n",
" c = AutoVC_mel_Convertor('examples')\n",
"\n",
" au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('examples', ain),\n",
" autovc_model_path=opt_parser.load_AUTOVC_name)\n",
" autovc_model_path=opt_parser.load_AUTOVC_name)\n",
" au_data += au_data_i\n",
"if(os.path.isfile('examples/tmp.wav')):\n",
"if os.path.isfile('examples/tmp.wav'):\n",
" os.remove('examples/tmp.wav')\n",
"\n",
"# landmark fake placeholder\n",
@@ -568,21 +569,23 @@
" rot_quat.append(np.zeros(shape=(au_length, 4)))\n",
" anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))\n",
"\n",
"if(os.path.exists(os.path.join('examples', 'dump', 'random_val_fl.pickle'))):\n",
"if os.path.exists(os.path.join('examples', 'dump', 'random_val_fl.pickle')):\n",
" os.remove(os.path.join('examples', 'dump', 'random_val_fl.pickle'))\n",
"if(os.path.exists(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle'))):\n",
"if os.path.exists(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle')):\n",
" os.remove(os.path.join('examples', 'dump', 'random_val_fl_interp.pickle'))\n",
"if(os.path.exists(os.path.join('examples', 'dump', 'random_val_au.pickle'))):\n",
"if os.path.exists(os.path.join('examples', 'dump', 'random_val_au.pickle')):\n",
" os.remove(os.path.join('examples', 'dump', 'random_val_au.pickle'))\n",
"if (os.path.exists(os.path.join('examples', 'dump', 'random_val_gaze.pickle'))):\n",
"if os.path.exists(os.path.join('examples', 'dump', 'random_val_gaze.pickle')):\n",
" os.remove(os.path.join('examples', 'dump', 'random_val_gaze.pickle'))\n",
"\n",
"with open(os.path.join('examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:\n",
" pickle.dump(fl_data, fp)\n",
"with open(os.path.join('examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:\n",
" pickle.dump(au_data, fp)\n",
"with open(os.path.join('examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:\n",
" gaze = {'rot_trans':rot_tran, 'rot_quat':rot_quat, 'anchor_t_shape':anchor_t_shape}\n",
" gaze = {'rot_trans': rot_tran,\n",
" 'rot_quat': rot_quat,\n",
" 'anchor_t_shape': anchor_t_shape}\n",
" pickle.dump(gaze, fp)"
],
"execution_count": null,
@@ -622,7 +625,7 @@
"source": [
"!pwd\n",
"model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)\n",
"if(len(opt_parser.reuse_train_emb_list) == 0):\n",
"if len(opt_parser.reuse_train_emb_list) == 0:\n",
" model.test(au_emb=au_emb)\n",
"else:\n",
" model.test(au_emb=None)"
@@ -750,24 +753,27 @@
"fls = glob.glob1('examples', 'pred_fls_*.txt')\n",
"fls.sort()\n",
"\n",
"for i in range(0,len(fls)):\n",
" fl = np.loadtxt(os.path.join('examples', fls[i])).reshape((-1, 68,3))\n",
"for i in range(0, len(fls)):\n",
" fl = np.loadtxt(os.path.join('examples', fls[i])).reshape((-1, 68, 3))\n",
" fl[:, :, 0:2] = -fl[:, :, 0:2]\n",
" fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift\n",
"\n",
" if (ADD_NAIVE_EYE):\n",
" if ADD_NAIVE_EYE:\n",
" fl = util.add_naive_eye(fl)\n",
"\n",
" # additional smooth\n",
" fl = fl.reshape((-1, 204))\n",
" fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)\n",
" fl[:, 48*3:] = savgol_filter(fl[:, 48*3:], 5, 3, axis=0)\n",
" fl[:, 48 * 3:] = savgol_filter(fl[:, 48 * 3:], 5, 3, axis=0)\n",
" fl = fl.reshape((-1, 68, 3))\n",
"\n",
" ''' STEP 6: Imag2image translation '''\n",
" model = Image_translation_block(opt_parser, single_test=True)\n",
" with torch.no_grad():\n",
" model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])\n",
" model.single_test(jpg=img,\n",
" fls=fl,\n",
" filename=fls[i],\n",
" prefix=opt_parser.jpg.split('.')[0])\n",
" print('finish image2image gen')\n",
" os.remove(os.path.join('examples', fls[i]))"
],
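
> The Savitzky-Golay call above smooths the first 48 landmarks (face contour, brows, nose) with a wider window than the mouth points, so lip sync stays sharp while the rest of the face is damped. A standalone sketch of the same filtering, assuming a landmark track `fl` of shape `(frames, 204)`:
>
> ```python
> import numpy as np
> from scipy.signal import savgol_filter
>
> fl = np.random.rand(100, 204)  # hypothetical track: 100 frames x (68 landmarks * 3)
>
> # Cubic polynomial fit along the time axis; window 15 for the 48 non-mouth
> # landmarks, window 5 for the mouth so lip motion is barely smoothed.
> fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], window_length=15, polyorder=3, axis=0)
> fl[:, 48 * 3:] = savgol_filter(fl[:, 48 * 3:], window_length=5, polyorder=3, axis=0)
> ```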
@@ -813,10 +819,10 @@
" opt_parser.jpg.split('.')[0],\n",
" ain.split('.')[0]\n",
" )\n",
" mp4 = open('examples/{}'.format(OUTPUT_MP4_NAME),'rb').read()\n",
" mp4 = open(f'examples/{OUTPUT_MP4_NAME}', 'rb').read()\n",
" data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
"\n",
" print('Display animation: examples/{}'.format(OUTPUT_MP4_NAME))\n",
" print(f'Display animation: examples/{OUTPUT_MP4_NAME}')\n",
" display(HTML(\"\"\"\n",
" <video width=600 controls>\n",
" <source src=\"%s\" type=\"video/mp4\">\n",
@@ -851,17 +857,6 @@
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hxWMuEEbpywq"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
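
> For reference, the display pattern at the end of the notebook (read the mp4, base64-encode it, inline it as a data URL in an HTML5 video tag) also works standalone. A minimal sketch, assuming some `examples/out.mp4` exists:
>
> ```python
> from base64 import b64encode
> from IPython.display import HTML, display
>
> mp4 = open('examples/out.mp4', 'rb').read()  # raw video bytes
> data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()
>
> # Inlining the video as a data URL lets it play inside the notebook output.
> display(HTML(f'<video width=600 controls><source src="{data_url}" type="video/mp4"></video>'))
> ```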