diff --git a/batch_process.py b/batch_process.py new file mode 100644 index 0000000..e2754e6 --- /dev/null +++ b/batch_process.py @@ -0,0 +1,67 @@ +import os +import subprocess +import argparse + +def main(args): + # Ensure the output directory exists + if not os.path.exists(args.videoFolderOutput): + os.makedirs(args.videoFolderOutput) + + # List all videos in the input directory + video_files = [f for f in os.listdir(args.videoFolderInput) if f.endswith(('.mp4', '.avi', '.mov'))] + if not video_files: + print(f"No video files found in the directory: {args.videoFolderInput}") + return + + # Process each video + for video_file in video_files: + video_name = os.path.splitext(video_file)[0] + + output_video_path = args.videoFolderOutput + + # Ensure output directory for the video exists + if not os.path.exists(output_video_path): + os.makedirs(output_video_path) + + # Build the command to call demoTalkNet.py + command = [ + "python", "demoTalkNet.py", + "--videoName", video_name, + "--videoFolderInput", args.videoFolderInput, + "--videoFolderOutput", args.videoFolderOutput, + "--channelName", args.channelName, + ] + + # Print and execute the command + print(f"Processing video: {video_file}") + print("Command:", " ".join(command)) + subprocess.run(command) + + print("Batch processing completed.") + +if __name__ == "__main__": + # Parse arguments for the batch process + parser = argparse.ArgumentParser(description="Batch Process Videos with demoTalkNet") + parser.add_argument('--videoFolderInput', type=str, required=True, help='Path to the folder containing input videos.') + parser.add_argument('--videoFolderOutput', default="output_dir", type=str, help='Path to the folder for storing outputs and temporary files.') + parser.add_argument('--bucketName', type=str, help='Name of the S3 bucket used for uploads.') + parser.add_argument('--channelName', type=str, required=True, help='Channel name, used as the prefix for S3 object keys.') + parser.add_argument('--pretrainModel', type=str,default="pretrain_TalkSet.model", help='Path to the pretrained TalkNet model.') + parser.add_argument('--fps', type=float, default=25, help='Desired FPS.') + parser.add_argument('--frame_size', type=int, default=512, help='Desired frame size.') + parser.add_argument('--angleThreshold', type=int, default=10, help='Yaw threshold.') + parser.add_argument('--contentDetectorThreshold', type=float, default=27.0, help='Content detector threshold.') + parser.add_argument('--thresholdDetectorThreshold', type=float, default=30.0, help='Threshold detector threshold.') + parser.add_argument('--nDataLoaderThread', type=int, default=10, help='Number of data loader threads.') + parser.add_argument('--facedetScale', type=float, default=0.25, help='Face detection scale factor.') + parser.add_argument('--minTrack', type=int, default=40, help='Minimum frames for each shot.') + parser.add_argument('--numFailedDet', type=int, default=5, help='Missed detections allowed before stopping tracking.') + parser.add_argument('--minFaceSize', type=int, default=100, help='Minimum face size in pixels.') + parser.add_argument('--cropScale', type=float, default=0.40, help='Scale bounding box.') + parser.add_argument('--start', type=int, default=0, help='Start time of the video.') + parser.add_argument('--duration', type=int, default=0, help='Duration of the video (0 for full video).') + parser.add_argument('--evalCol', action='store_true', help='Evaluate on Columbia dataset.') + parser.add_argument('--colSavePath',
type=str, default="/data08/col", help='Path for inputs, temps, and outputs for Columbia evaluation.') + + args = parser.parse_args() + main(args) diff --git a/demoTalkNet.py b/demoTalkNet.py index 0e496c2..d686f14 100755 --- a/demoTalkNet.py +++ b/demoTalkNet.py @@ -1,5 +1,29 @@ -import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features - +import subprocess +import sys +import time +import os +# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Suppress TensorFlow logging +import tqdm +import torch +import argparse +import glob +import subprocess +import warnings +import cv2 +import pickle +import numpy +import pdb +import math +import python_speech_features +import mediapipe as mp +import matplotlib.pyplot as plt + +import json +import insightface +from insightface.app import FaceAnalysis + +import cProfile +import pstats from scipy import signal from shutil import rmtree from scipy.io import wavfile @@ -8,451 +32,922 @@ from scenedetect.video_manager import VideoManager from scenedetect.scene_manager import SceneManager +from scenedetect import SceneManager, open_video, ContentDetector, ThresholdDetector from scenedetect.frame_timecode import FrameTimecode from scenedetect.stats_manager import StatsManager from scenedetect.detectors import ContentDetector from model.faceDetector.s3fd import S3FD from talkNet import talkNet +from s3_uploader import upload_file_to_s3 warnings.filterwarnings("ignore") -parser = argparse.ArgumentParser(description = "TalkNet Demo or Columnbia ASD Evaluation") - -parser.add_argument('--videoName', type=str, default="001", help='Demo video name') -parser.add_argument('--videoFolder', type=str, default="demo", help='Path for inputs, tmps and outputs') -parser.add_argument('--pretrainModel', type=str, default="pretrain_TalkSet.model", help='Path for the pretrained TalkNet model') - -parser.add_argument('--nDataLoaderThread', type=int, default=10, help='Number of workers') -parser.add_argument('--facedetScale', type=float, default=0.25, help='Scale factor for face detection, the frames will be scale to 0.25 orig') -parser.add_argument('--minTrack', type=int, default=10, help='Number of min frames for each shot') -parser.add_argument('--numFailedDet', type=int, default=10, help='Number of missed detections allowed before tracking is stopped') -parser.add_argument('--minFaceSize', type=int, default=1, help='Minimum face size in pixels') -parser.add_argument('--cropScale', type=float, default=0.40, help='Scale bounding box') - -parser.add_argument('--start', type=int, default=0, help='The start time of the video') -parser.add_argument('--duration', type=int, default=0, help='The duration of the video, when set as 0, will extract the whole video') - -parser.add_argument('--evalCol', dest='evalCol', action='store_true', help='Evaluate on Columnbia dataset') -parser.add_argument('--colSavePath', type=str, default="/data08/col", help='Path for inputs, tmps and outputs') +mp_face_mesh = mp.solutions.face_mesh +face_mesh = mp_face_mesh.FaceMesh( + min_detection_confidence=0.5, min_tracking_confidence=0.5) + +face_app = FaceAnalysis(name='buffalo_s') # This model supports age & gender +face_app.prepare(ctx_id=0) + +parser = argparse.ArgumentParser( + description="TalkNet Demo or Columnbia ASD Evaluation") + +parser.add_argument('--videoName', type=str, + default="001", help='Demo video name') +parser.add_argument('--videoFolderInput', type=str, + required=True, help='Path for inputs') 
+parser.add_argument('--videoFolderOutput', type=str, + default="output_dir", help='Path for tmps and outputs') +parser.add_argument('--pretrainModel', type=str, + default="pretrain_TalkSet.model", help='Path for the pretrained TalkNet model') +parser.add_argument('--fps', type=float, + default=25, help='Desired FPS') +parser.add_argument('--frame_size', type=int, + default=512, help='Desired frame size') + +parser.add_argument('--angleThreshold', type=int, + default=25, help='Desired threshold for yaw') +parser.add_argument('--contentDetectorThreshold', type=float, + default=27.0, help='Content detector threshold') +parser.add_argument('--thresholdDetectorThreshold', type=float, + default=30.0, help='Threshold detector threshold') +parser.add_argument('--bucketName', type=str, + default='hdindiandataset', help='Bucket Name in AWS') +parser.add_argument('--channelName', type=str, + required=True, help='Channel name, used as the prefix for S3 object keys') + +parser.add_argument('--nDataLoaderThread', type=int, + default=10, help='Number of workers') +parser.add_argument('--facedetScale', type=float, default=0.25, + help='Scale factor for face detection, the frames will be scaled to 0.25 of the original size') +parser.add_argument('--minTrack', type=int, + default=40, help='Number of min frames for each shot') +parser.add_argument('--numFailedDet', type=int, default=5, + help='Number of missed detections allowed before tracking is stopped') +parser.add_argument('--minFaceSize', type=int, + default=100, help='Minimum face size in pixels') +parser.add_argument('--cropScale', type=float, + default=0.40, help='Scale bounding box') + +parser.add_argument('--start', type=int, + default=0, help='The start time of the video') +parser.add_argument('--duration', type=int, default=0, + help='The duration of the video, when set as 0, will extract the whole video') + +parser.add_argument('--evalCol', dest='evalCol', + action='store_true', help='Evaluate on Columbia dataset') +parser.add_argument('--colSavePath', type=str, + default="/data08/col", help='Path for inputs, tmps and outputs') args = parser.parse_args() -if os.path.isfile(args.pretrainModel) == False: # Download the pretrained model +if os.path.isfile(args.pretrainModel) == False: # Download the pretrained model Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea" - cmd = "gdown --id %s -O %s"%(Link, args.pretrainModel) + cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel) subprocess.call(cmd, shell=True, stdout=None) if args.evalCol == True: - # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using) - # 2. extract audio, extract video frames - # 3. scend detection, face detection and face tracking - # 4. active speaker detection for the detected face clips - # 5. use iou to find the identity of each face clips, compute the F1 results - # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed.
For reference, I used 1.5 hour - # The step 4 and 5 need less than 10 minutes - # Need about 20G space finally - # ``` - args.videoName = 'col' - args.videoFolder = args.colSavePath - args.savePath = os.path.join(args.videoFolder, args.videoName) - args.videoPath = os.path.join(args.videoFolder, args.videoName + '.mp4') - args.duration = 0 - if os.path.isfile(args.videoPath) == False: # Download video - link = 'https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s' - cmd = "youtube-dl -f best -o %s '%s'"%(args.videoPath, link) - output = subprocess.call(cmd, shell=True, stdout=None) - if os.path.isdir(args.videoFolder + '/col_labels') == False: # Download label - link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv" - cmd = "gdown --id %s -O %s"%(link, args.videoFolder + '/col_labels.tar.gz') - subprocess.call(cmd, shell=True, stdout=None) - cmd = "tar -xzvf %s -C %s"%(args.videoFolder + '/col_labels.tar.gz', args.videoFolder) - subprocess.call(cmd, shell=True, stdout=None) - os.remove(args.videoFolder + '/col_labels.tar.gz') + # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using) + # 2. extract audio, extract video frames + # 3. scend detection, face detection and face tracking + # 4. active speaker detection for the detected face clips + # 5. use iou to find the identity of each face clips, compute the F1 results + # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed. For reference, I used 1.5 hour + # The step 4 and 5 need less than 10 minutes + # Need about 20G space finally + # ``` + args.videoName = 'col' + args.videoFolder = args.colSavePath + args.savePath = os.path.join(args.videoFolder, args.videoName) + args.videoPath = os.path.join(args.videoFolder, args.videoName + '.mp4') + args.duration = 0 + if os.path.isfile(args.videoPath) == False: # Download video + link = 'https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s' + cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link) + output = subprocess.call(cmd, shell=True, stdout=None) + if os.path.isdir(args.videoFolder + '/col_labels') == False: # Download label + link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv" + cmd = "gdown --id %s -O %s" % (link, + args.videoFolder + '/col_labels.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s" % (args.videoFolder + + '/col_labels.tar.gz', args.videoFolder) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.videoFolder + '/col_labels.tar.gz') else: - args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + '.*'))[0] - args.savePath = os.path.join(args.videoFolder, args.videoName) + args.videoPath = glob.glob(os.path.join( + args.videoFolderInput, args.videoName + '.*'))[0] + + args.video_id = args.videoName.split('_cluster_')[0] + args.savePath = os.path.join(args.videoFolderOutput, args.video_id) + # args.savePath = args.videoFolderOutput + + +from collections import namedtuple +Scene = namedtuple('Scene', ['frame_num']) +class Scene: + def __init__(self, frame_num): + self.frame_num = frame_num def scene_detect(args): - # CPU: Scene detection, output is the list of each shot's time duration - videoManager = VideoManager([args.videoFilePath]) - statsManager = StatsManager() - sceneManager = SceneManager(statsManager) - sceneManager.add_detector(ContentDetector()) - baseTimecode = videoManager.get_base_timecode() - videoManager.set_downscale_factor() - videoManager.start() - sceneManager.detect_scenes(frame_source = videoManager) - 
sceneList = sceneManager.get_scene_list(baseTimecode) - savePath = os.path.join(args.pyworkPath, 'scene.pckl') - if sceneList == []: - sceneList = [(videoManager.get_base_timecode(),videoManager.get_current_timecode())] - with open(savePath, 'wb') as fil: - pickle.dump(sceneList, fil) - sys.stderr.write('%s - scenes detected %d\n'%(args.videoFilePath, len(sceneList))) - return sceneList + # CPU: Scene detection, output is the list of each shot's time duration + video = open_video(args.videoFilePath) + + sceneManager = SceneManager() + + # sceneManager.add_detector(ContentDetector(threshold=args.contentDetectorThreshold, min_scene_len=30)) + # sceneManager.add_detector(ThresholdDetector(threshold=args.thresholdDetectorThreshold)) + + sceneManager.add_detector(ContentDetector()) + # sceneManager.add_detector(ThresholdDetector()) + + sceneManager.detect_scenes(video) + sceneList = sceneManager.get_scene_list() + + savePath = os.path.join(args.pyworkPath, 'scene.pckl') + if not sceneList: + cap = cv2.VideoCapture(args.videoFilePath) + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + # Fallback: If no scenes detected, create a single "scene" from start to end + # sceneList = [(0, frame_count)] + sceneList = [(Scene(frame_num=0), Scene(frame_num=frame_count))] + cap.release() + with open(savePath, 'wb') as file: + pickle.dump(sceneList, file) + sys.stderr.write(f"{args.videoFilePath} - scenes detected: {len(sceneList)}\n") + + return sceneList def inference_video(args): - # GPU: Face detection, output is the list contains the face location and score in this frame - DET = S3FD(device='cuda') - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) - flist.sort() - dets = [] - for fidx, fname in enumerate(flist): - image = cv2.imread(fname) - imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale]) - dets.append([]) - for bbox in bboxes: - dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) # dets has the frames info, bbox info, conf info - sys.stderr.write('%s-%05d; %d dets\r' % (args.videoFilePath, fidx, len(dets[-1]))) - savePath = os.path.join(args.pyworkPath,'faces.pckl') - with open(savePath, 'wb') as fil: - pickle.dump(dets, fil) - return dets - -def bb_intersection_over_union(boxA, boxB, evalCol = False): - # CPU: IOU Function to calculate overlap between two image - xA = max(boxA[0], boxB[0]) - yA = max(boxA[1], boxB[1]) - xB = min(boxA[2], boxB[2]) - yB = min(boxA[3], boxB[3]) - interArea = max(0, xB - xA) * max(0, yB - yA) - boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) - boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) - if evalCol == True: - iou = interArea / float(boxAArea) - else: - iou = interArea / float(boxAArea + boxBArea - interArea) - return iou + # GPU: Face detection, output is the list contains the face location and score in this frame + DET = S3FD(device='cuda') + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) + flist.sort() + dets = [] + for fidx, fname in enumerate(flist): + image = cv2.imread(fname) + imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces( + imageNumpy, conf_th=0.9, scales=[args.facedetScale]) + dets.append([]) + for bbox in bboxes: + # dets has the frames info, bbox info, conf info + dets[-1].append({'frame': fidx, 'bbox': (bbox[:-1] + ).tolist(), 'conf': bbox[-1]}) + sys.stderr.write('%s-%05d; %d dets\r' % + (args.videoFilePath, fidx, len(dets[-1]))) + savePath = os.path.join(args.pyworkPath, 
'faces.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(dets, fil) + return dets + + +# import os +# import glob +# import cv2 +# import torch +# from ultralytics import YOLO # YOLOv8 library +# import pickle +# import sys + +# def inference_video(args): +# # Load the YOLOv11n-face model +# model = YOLO('./model/faceDetector/yolov11n-face.pt') # Path to the YOLOv11n-face model + +# flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) +# flist.sort() +# dets = [] + +# for fidx, fname in enumerate(flist): +# # Read the frame +# image = cv2.imread(fname) +# results = model(image, verbose=False) +# detections = results[0].boxes.data.cpu().numpy() + +# dets.append([]) +# for det in detections: +# x1, y1, x2, y2, conf, class_id = det +# if conf>0.8: +# dets[-1].append({ +# 'frame': fidx, +# 'bbox': [x1, y1, x2, y2], +# 'conf': conf +# }) + +# # Log progress +# sys.stderr.write('%s-%05d; %d dets\r' % +# (args.videoFilePath, fidx, len(dets[-1]))) + +# # Save detections +# savePath = os.path.join(args.pyworkPath, 'faces.pckl') +# with open(savePath, 'wb') as fil: +# pickle.dump(dets, fil) + +# return dets + +# def inference_video(args): +# # Load the YOLOv11n-face model +# model = YOLO('./model/faceDetector/yolov11n-face.pt') # Path to the YOLOv11n-face model + +# # Get list of all frame image files +# flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) +# flist.sort() +# dets = [] + +# for fidx, fname in enumerate(flist): +# # Read the frame +# image = cv2.imread(fname) + +# # Perform face detection +# results = model.predict(image, conf=0.8, verbose=False) # Use .predict() + +# # Extract detections +# dets.append([]) +# if results[0].boxes: +# detections = results[0].boxes.data.cpu().numpy() # Bounding box data +# for det in detections: +# x1, y1, x2, y2, conf = det[:5] # Parse bounding box and confidence +# if conf > 0.8: # Check confidence threshold +# dets[-1].append({ +# 'frame': fidx, +# 'bbox': [x1, y1, x2, y2], +# 'conf': conf +# }) + +# # Log progress +# sys.stderr.write('%s-%05d; %d dets\r' % +# (args.videoFilePath, fidx, len(dets[-1]))) + +# # Save detections to a file +# savePath = os.path.join(args.pyworkPath, 'faces.pckl') +# with open(savePath, 'wb') as fil: +# pickle.dump(dets, fil) + +# return dets + +def bb_intersection_over_union(boxA, boxB, evalCol=False): + # CPU: IOU Function to calculate overlap between two image + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + if evalCol == True: + iou = interArea / float(boxAArea) + else: + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + def track_shot(args, sceneFaces): - # CPU: Face tracking - iouThres = 0.5 # Minimum IOU between consecutive face detections - tracks = [] - while True: - track = [] - for frameFaces in sceneFaces: - for face in frameFaces: - if track == []: - track.append(face) - frameFaces.remove(face) - elif face['frame'] - track[-1]['frame'] <= args.numFailedDet: - iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) - if iou > iouThres: - track.append(face) - frameFaces.remove(face) - continue - else: - break - if track == []: - break - elif len(track) > args.minTrack: - frameNum = numpy.array([ f['frame'] for f in track ]) - bboxes = numpy.array([numpy.array(f['bbox']) for f in track]) - frameI = numpy.arange(frameNum[0],frameNum[-1]+1) 
- bboxesI = [] - for ij in range(0,4): - interpfn = interp1d(frameNum, bboxes[:,ij]) - bboxesI.append(interpfn(frameI)) - bboxesI = numpy.stack(bboxesI, axis=1) - if max(numpy.mean(bboxesI[:,2]-bboxesI[:,0]), numpy.mean(bboxesI[:,3]-bboxesI[:,1])) > args.minFaceSize: - tracks.append({'frame':frameI,'bbox':bboxesI}) - return tracks + # CPU: Face tracking + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + while True: + track = [] + for frameFaces in sceneFaces: + for face in frameFaces: + if track == []: + track.append(face) + frameFaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= args.numFailedDet: + iou = bb_intersection_over_union( + face['bbox'], track[-1]['bbox']) + if iou > iouThres: + track.append(face) + frameFaces.remove(face) + continue + else: + break + if track == []: + break + elif len(track) > args.minTrack: + frameNum = numpy.array([f['frame'] for f in track]) + bboxes = numpy.array([numpy.array(f['bbox']) for f in track]) + frameI = numpy.arange(frameNum[0], frameNum[-1]+1) + bboxesI = [] + for ij in range(0, 4): + interpfn = interp1d(frameNum, bboxes[:, ij]) + bboxesI.append(interpfn(frameI)) + bboxesI = numpy.stack(bboxesI, axis=1) + if max(numpy.mean(bboxesI[:, 2]-bboxesI[:, 0]), numpy.mean(bboxesI[:, 3]-bboxesI[:, 1])) > args.minFaceSize: + tracks.append({'frame': frameI, 'bbox': bboxesI}) + return tracks + def crop_video(args, track, cropFile): - # CPU: crop the face clips - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Read the frames - flist.sort() - vOut = cv2.VideoWriter(cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video - dets = {'x':[], 'y':[], 's':[]} - for det in track['bbox']: # Read the tracks - dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) - dets['y'].append((det[1]+det[3])/2) # crop center x - dets['x'].append((det[0]+det[2])/2) # crop center y - dets['s'] = signal.medfilt(dets['s'], kernel_size=13) # Smooth detections - dets['x'] = signal.medfilt(dets['x'], kernel_size=13) - dets['y'] = signal.medfilt(dets['y'], kernel_size=13) - for fidx, frame in enumerate(track['frame']): - cs = args.cropScale - bs = dets['s'][fidx] # Detection box size - bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount - image = cv2.imread(flist[frame]) - frame = numpy.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110)) - my = dets['y'][fidx] + bsi # BBox center Y - mx = dets['x'][fidx] + bsi # BBox center X - face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] - vOut.write(cv2.resize(face, (224, 224))) - audioTmp = cropFile + '.wav' - audioStart = (track['frame'][0]) / 25 - audioEnd = (track['frame'][-1]+1) / 25 - vOut.release() - command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % \ - (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)) - output = subprocess.call(command, shell=True, stdout=None) # Crop audio file - _, audio = wavfile.read(audioTmp) - command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % \ - (cropFile, audioTmp, args.nDataLoaderThread, cropFile)) # Combine audio and video file - output = subprocess.call(command, shell=True, stdout=None) - os.remove(cropFile + 't.avi') - return {'track':track, 'proc_track':dets} + # CPU: crop the face clips + flist = glob.glob(os.path.join( + args.pyframesPath, '*.jpg')) # Read the frames + flist.sort() + vOut = 
cv2.VideoWriter( + cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), args.fps, (args.frame_size, args.frame_size)) # Write video + dets = {'x': [], 'y': [], 's': []} + for det in track['bbox']: # Read the tracks + dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + dets['s'] = signal.medfilt(dets['s'], kernel_size=13) # Smooth detections + dets['x'] = signal.medfilt(dets['x'], kernel_size=13) + dets['y'] = signal.medfilt(dets['y'], kernel_size=13) + for fidx, frame in enumerate(track['frame']): + cs = args.cropScale + bs = dets['s'][fidx] # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount + image = cv2.imread(flist[frame]) + frame = numpy.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), + 'constant', constant_values=(110, 110)) + my = dets['y'][fidx] + bsi # BBox center Y + mx = dets['x'][fidx] + bsi # BBox center X + face = frame[int(my-bs):int(my+bs*(1+2*cs)), + int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + vOut.write(cv2.resize(face, (args.frame_size, args.frame_size))) + audioTmp = cropFile + '.wav' + audioStart = (track['frame'][0]) / args.fps + audioEnd = (track['frame'][-1]+1) / args.fps + vOut.release() + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % + (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)) + output = subprocess.call( + command, shell=True, stdout=None) # Crop audio file + _, audio = wavfile.read(audioTmp) + command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % + (cropFile, audioTmp, args.nDataLoaderThread, cropFile)) # Combine audio and video file + output = subprocess.call(command, shell=True, stdout=None) + os.remove(cropFile + 't.avi') + return {'track': track, 'proc_track': dets} + def extract_MFCC(file, outPath): - # CPU: extract mfcc - sr, audio = wavfile.read(file) - mfcc = python_speech_features.mfcc(audio,sr) # (N_frames, 13) [1s = 100 frames] - featuresPath = os.path.join(outPath, file.split('/')[-1].replace('.wav', '.npy')) - numpy.save(featuresPath, mfcc) + # CPU: extract mfcc + sr, audio = wavfile.read(file) + # (N_frames, 13) [1s = 100 frames] + mfcc = python_speech_features.mfcc(audio, sr) + featuresPath = os.path.join( + outPath, file.split('/')[-1].replace('.wav', '.npy')) + numpy.save(featuresPath, mfcc) + def evaluate_network(files, args): - # GPU: active speaker detection by pretrained TalkNet - s = talkNet() - s.loadParameters(args.pretrainModel) - sys.stderr.write("Model %s loaded from previous state! 
\r\n"%args.pretrainModel) - s.eval() - allScores = [] - # durationSet = {1,2,4,6} # To make the result more reliable - durationSet = {1,1,1,2,2,2,3,3,4,5,6} # Use this line can get more reliable result - for file in tqdm.tqdm(files, total = len(files)): - fileName = os.path.splitext(file.split('/')[-1])[0] # Load audio and video - _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + '.wav')) - audioFeature = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025, winstep = 0.010) - video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + '.avi')) - videoFeature = [] - while video.isOpened(): - ret, frames = video.read() - if ret == True: - face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) - face = cv2.resize(face, (224,224)) - face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] - videoFeature.append(face) - else: - break - video.release() - videoFeature = numpy.array(videoFeature) - length = min((audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0] / 25) - audioFeature = audioFeature[:int(round(length * 100)),:] - videoFeature = videoFeature[:int(round(length * 25)),:,:] - allScore = [] # Evaluation use TalkNet - for duration in durationSet: - batchSize = int(math.ceil(length / duration)) - scores = [] - with torch.no_grad(): - for i in range(batchSize): - inputA = torch.FloatTensor(audioFeature[i * duration * 100:(i+1) * duration * 100,:]).unsqueeze(0).cuda() - inputV = torch.FloatTensor(videoFeature[i * duration * 25: (i+1) * duration * 25,:,:]).unsqueeze(0).cuda() - embedA = s.model.forward_audio_frontend(inputA) - embedV = s.model.forward_visual_frontend(inputV) - embedA, embedV = s.model.forward_cross_attention(embedA, embedV) - out = s.model.forward_audio_visual_backend(embedA, embedV) - score = s.lossAV.forward(out, labels = None) - scores.extend(score) - allScore.append(scores) - allScore = numpy.round((numpy.mean(numpy.array(allScore), axis = 0)), 1).astype(float) - allScores.append(allScore) - return allScores + # GPU: active speaker detection by pretrained TalkNet + s = talkNet() + s.loadParameters(args.pretrainModel) + sys.stderr.write("Model %s loaded from previous state! 
\r\n" % + args.pretrainModel) + s.eval() + allScores = [] + # durationSet = {1,2,4,6} # To make the result more reliable + # Use this line can get more reliable result + durationSet = {1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6} + for file in tqdm.tqdm(files, total=len(files)): + fileName = os.path.splitext(file.split( + '/')[-1])[0] # Load audio and video + _, audio = wavfile.read(os.path.join( + args.pycropPath, fileName + '.wav')) + audioFeature = python_speech_features.mfcc( + audio, 16000, numcep=13, winlen=0.025, winstep=0.010) + video = cv2.VideoCapture(os.path.join( + args.pycropPath, fileName + '.avi')) + videoFeature = [] + while video.isOpened(): + ret, frames = video.read() + if ret == True: + face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) + # face = cv2.resize(face, (args.frame_size, args.frame_size)) + face = cv2.resize(face, (224, 224)) + face = face[int(112-(112/2)):int(112+(112/2)), + int(112-(112/2)):int(112+(112/2))] + videoFeature.append(face) + else: + break + video.release() + videoFeature = numpy.array(videoFeature) + length = min((audioFeature.shape[0] - audioFeature.shape[0] % + 4) / 100, videoFeature.shape[0] / args.fps) + audioFeature = audioFeature[:int(round(length * 100)), :] + videoFeature = videoFeature[:int(round(length * args.fps)), :, :] + allScore = [] # Evaluation use TalkNet + for duration in durationSet: + batchSize = int(math.ceil(length / duration)) + scores = [] + with torch.no_grad(): + for i in range(batchSize): + inputA = torch.FloatTensor( + audioFeature[i * duration * 100:(i+1) * duration * 100, :]).unsqueeze(0).cuda() + inputV = torch.FloatTensor( + videoFeature[int(i * duration * args.fps): int((i+1) * duration * args.fps), :, :]).unsqueeze(0).cuda() + embedA = s.model.forward_audio_frontend(inputA) + embedV = s.model.forward_visual_frontend(inputV) + embedA, embedV = s.model.forward_cross_attention( + embedA, embedV) + out = s.model.forward_audio_visual_backend(embedA, embedV) + score = s.lossAV.forward(out, labels=None) + scores.extend(score) + allScore.append(scores) + allScore = numpy.round( + (numpy.mean(numpy.array(allScore), axis=0)), 1).astype(float) + allScores.append(allScore) + return allScores + + def visualization(tracks, scores, args): - # CPU: visulize the result for video format - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) - flist.sort() - faces = [[] for i in range(len(flist))] - for tidx, track in enumerate(tracks): - score = scores[tidx] - for fidx, frame in enumerate(track['track']['frame'].tolist()): - s = score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)] # average smoothing - s = numpy.mean(s) - faces[frame].append({'track':tidx, 'score':float(s),'s':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) - firstImage = cv2.imread(flist[0]) - fw = firstImage.shape[1] - fh = firstImage.shape[0] - vOut = cv2.VideoWriter(os.path.join(args.pyaviPath, 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw,fh)) - colorDict = {0: 0, 1: 255} - for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)): - image = cv2.imread(fname) - for face in faces[fidx]: - clr = colorDict[int((face['score'] >= 0))] - txt = round(face['score'], 1) - cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int(face['x']+face['s']), int(face['y']+face['s'])),(0,clr,255-clr),10) - cv2.putText(image,'%s'%(txt), (int(face['x']-face['s']), int(face['y']-face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,clr,255-clr),5) - vOut.write(image) - vOut.release() - command =
("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % \ - (os.path.join(args.pyaviPath, 'video_only.avi'), os.path.join(args.pyaviPath, 'audio.wav'), \ - args.nDataLoaderThread, os.path.join(args.pyaviPath,'video_out.avi'))) - output = subprocess.call(command, shell=True, stdout=None) + # CPU: visulize the result for video format + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track['track']['frame'].tolist()): + # average smoothing + s = score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)] + s = numpy.mean(s) + faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s'] + [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]}) + firstImage = cv2.imread(flist[0]) + fw = firstImage.shape[1] + fh = firstImage.shape[0] + vOut = cv2.VideoWriter(os.path.join(args.pyaviPath, 'video_only.avi'), + cv2.VideoWriter_fourcc(*'XVID'), args.fps, (fw, fh)) + colorDict = {0: 0, 1: 255} + for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)): + image = cv2.imread(fname) + # image = cv2.resize(image, ) + for face in faces[fidx]: + clr = colorDict[int((face['score'] >= 0))] + txt = round(face['score'], 1) + cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int( + face['x']+face['s']), int(face['y']+face['s'])), (0, clr, 255-clr), 10) + cv2.putText(image, '%s' % (txt), (int(face['x']-face['s']), int( + face['y']-face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, clr, 255-clr), 5) + vOut.write(image) + vOut.release() + command = ("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % + (os.path.join(args.pyaviPath, 'video_only.avi'), os.path.join(args.pyaviPath, 'audio.wav'), + args.nDataLoaderThread, os.path.join(args.pyaviPath, 'video_out.avi'))) + output = subprocess.call(command, shell=True, stdout=None) + def evaluate_col_ASD(tracks, scores, args): - txtPath = args.videoFolder + '/col_labels/fusion/*.txt' # Load labels - predictionSet = {} - for name in {'long', 'bell', 'boll', 'lieb', 'sick', 'abbas'}: - predictionSet[name] = [[],[]] - dictGT = {} - txtFiles = glob.glob("%s"%txtPath) - for file in txtFiles: - lines = open(file).read().splitlines() - idName = file.split('/')[-1][:-4] - for line in lines: - data = line.split('\t') - frame = int(int(data[0]) / 29.97 * 25) - x1 = int(data[1]) - y1 = int(data[2]) - x2 = int(data[1]) + int(data[3]) - y2 = int(data[2]) + int(data[3]) - gt = int(data[4]) - if frame in dictGT: - dictGT[frame].append([x1,y1,x2,y2,gt,idName]) - else: - dictGT[frame] = [[x1,y1,x2,y2,gt,idName]] - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Load files - flist.sort() - faces = [[] for i in range(len(flist))] - for tidx, track in enumerate(tracks): - score = scores[tidx] - for fidx, frame in enumerate(track['track']['frame'].tolist()): - s = numpy.mean(score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)]) # average smoothing - faces[frame].append({'track':tidx, 'score':float(s),'s':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) - for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)): - if fidx in dictGT: # This frame has label - for gtThisFrame in dictGT[fidx]: # What this label is ? 
- faceGT = gtThisFrame[0:4] - labelGT = gtThisFrame[4] - idGT = gtThisFrame[5] - ious = [] - for face in faces[fidx]: # Find the right face in my result - faceLocation = [int(face['x']-face['s']), int(face['y']-face['s']), int(face['x']+face['s']), int(face['y']+face['s'])] - faceLocation_new = [int(face['x']-face['s']) // 2, int(face['y']-face['s']) // 2, int(face['x']+face['s']) // 2, int(face['y']+face['s']) // 2] - iou = bb_intersection_over_union(faceLocation_new, faceGT, evalCol = True) - if iou > 0.5: - ious.append([iou, round(face['score'],2)]) - if len(ious) > 0: # Find my result - ious.sort() - labelPredict = ious[-1][1] - else: - labelPredict = 0 - x1 = faceGT[0] - y1 = faceGT[1] - width = faceGT[2] - faceGT[0] - predictionSet[idGT][0].append(labelPredict) - predictionSet[idGT][1].append(labelGT) - names = ['long', 'bell', 'boll', 'lieb', 'sick', 'abbas'] # Evaluate - names.sort() - F1s = 0 - for i in names: - scores = numpy.array(predictionSet[i][0]) - labels = numpy.array(predictionSet[i][1]) - scores = numpy.int64(scores > 0) - F1 = f1_score(labels, scores) - ACC = accuracy_score(labels, scores) - if i != 'abbas': - F1s += F1 - print("%s, ACC:%.2f, F1:%.2f"%(i, 100 * ACC, 100 * F1)) - print("Average F1:%.2f"%(100 * (F1s / 5))) + txtPath = args.videoFolder + '/col_labels/fusion/*.txt' # Load labels + predictionSet = {} + for name in {'long', 'bell', 'boll', 'lieb', 'sick', 'abbas'}: + predictionSet[name] = [[], []] + dictGT = {} + txtFiles = glob.glob("%s" % txtPath) + for file in txtFiles: + lines = open(file).read().splitlines() + idName = file.split('/')[-1][:-4] + for line in lines: + data = line.split('\t') + frame = int(int(data[0]) / 29.97 * args.fps) + x1 = int(data[1]) + y1 = int(data[2]) + x2 = int(data[1]) + int(data[3]) + y2 = int(data[2]) + int(data[3]) + gt = int(data[4]) + if frame in dictGT: + dictGT[frame].append([x1, y1, x2, y2, gt, idName]) + else: + dictGT[frame] = [[x1, y1, x2, y2, gt, idName]] + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Load files + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track['track']['frame'].tolist()): + # average smoothing + s = numpy.mean( + score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)]) + faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s'] + [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]}) + for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)): + if fidx in dictGT: # This frame has label + for gtThisFrame in dictGT[fidx]: # What this label is ? 
+ faceGT = gtThisFrame[0:4] + labelGT = gtThisFrame[4] + idGT = gtThisFrame[5] + ious = [] + for face in faces[fidx]: # Find the right face in my result + faceLocation = [int(face['x']-face['s']), int(face['y']-face['s']), + int(face['x']+face['s']), int(face['y']+face['s'])] + faceLocation_new = [int(face['x']-face['s']) // 2, int(face['y']-face['s']) // 2, int( + face['x']+face['s']) // 2, int(face['y']+face['s']) // 2] + iou = bb_intersection_over_union( + faceLocation_new, faceGT, evalCol=True) + if iou > 0.5: + ious.append([iou, round(face['score'], 2)]) + if len(ious) > 0: # Find my result + ious.sort() + labelPredict = ious[-1][1] + else: + labelPredict = 0 + x1 = faceGT[0] + y1 = faceGT[1] + width = faceGT[2] - faceGT[0] + predictionSet[idGT][0].append(labelPredict) + predictionSet[idGT][1].append(labelGT) + names = ['long', 'bell', 'boll', 'lieb', 'sick', 'abbas'] # Evaluate + names.sort() + F1s = 0 + for i in names: + scores = numpy.array(predictionSet[i][0]) + labels = numpy.array(predictionSet[i][1]) + scores = numpy.int64(scores > 0) + F1 = f1_score(labels, scores) + ACC = accuracy_score(labels, scores) + if i != 'abbas': + F1s += F1 + print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1)) + print("Average F1:%.2f" % (100 * (F1s / 5))) + + +# def extract_segment(track_path, start_frame, end_frame, output_path_video, output_path_audio): +# # Convert start_frame and end_frame to time (in seconds) +# start_time = start_frame / args.fps +# end_time = end_frame / args.fps + +# # FFmpeg command to extract video with audio trimming +# command_video = f'ffmpeg -accurate_seek -i "{track_path}.avi" -ss {start_time} -to {end_time} -c:v libx264 -c:a aac "{output_path_video}" -loglevel panic' + +# # FFmpeg command to extract audio separately +# command_audio = f'ffmpeg -accurate_seek -i "{track_path}.avi" -ss {start_time} -to {end_time} -vn -c:a aac "{output_path_audio}" -loglevel panic' + +# # Execute command for video and audio extraction +# try: +# # Extract video +# subprocess.run(command_video, shell=True, check=True, +# stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# # print(f"Video segment extracted successfully: {output_path_video}") + +# # Extract audio +# subprocess.run(command_audio, shell=True, check=True, +# stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# # print(f"Audio segment extracted successfully: {output_path_audio}") + +# except subprocess.CalledProcessError as e: +# print(f"Error extracting segment: {e}") + +import subprocess + +def extract_segment(track_path, start_frame, end_frame, output_path_video, file_name): + # Convert start_frame and end_frame to time (in seconds) + start_time = start_frame / args.fps + end_time = end_frame / args.fps + + # FFmpeg command to extract video with audio trimming + command_video = [ + "ffmpeg", + "-accurate_seek", + "-i", f"{track_path}.avi", + "-ss", str(start_time), + "-to", str(end_time), + "-c:v", "libx264", + "-c:a", "aac", + output_path_video, + ] + + try: + # Run the command and capture output + result = subprocess.run( + command_video, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + if result.returncode != 0: + print("Error during FFmpeg execution:") + print(result.stderr) # Print the error message from FFmpeg + else: + print("Segment extracted successfully, Copying to S3 and deleting local file") + + s3_object_key = f"{args.channelName}/{args.video_id}/{file_name}" + upload_file_to_s3(output_path_video, args.bucketName, s3_object_key) + + # Check if the file exists + if 
os.path.exists(output_path_video): + os.remove(output_path_video) + print(f"File '{output_path_video}' has been deleted successfully.") + else: + print(f"File '{output_path_video}' does not exist.") + + + except Exception as e: + print(f"Exception occurred while extracting segment: {e}") + # Main function def main(): - # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python). - # ``` - # . - # ├── pyavi - # │   ├── audio.wav (Audio from input video) - # │   ├── video.avi (Copy of the input video) - # │   ├── video_only.avi (Output video without audio) - # │   └── video_out.avi (Output video with audio) - # ├── pycrop (The detected face videos and audios) - # │ ├── 000000.avi - # │ ├── 000000.wav - # │ ├── 000001.avi - # │ ├── 000001.wav - # │ └── ... - # ├── pyframes (All the video frames in this video) - # │ ├── 000001.jpg - # │ ├── 000002.jpg - # │ └── ... - # └── pywork - # ├── faces.pckl (face detection result) - # ├── scene.pckl (scene detection result) - # ├── scores.pckl (ASD result) - # └── tracks.pckl (face tracking result) - # ``` - - # Initialization - args.pyaviPath = os.path.join(args.savePath, 'pyavi') - args.pyframesPath = os.path.join(args.savePath, 'pyframes') - args.pyworkPath = os.path.join(args.savePath, 'pywork') - args.pycropPath = os.path.join(args.savePath, 'pycrop') - if os.path.exists(args.savePath): - rmtree(args.savePath) - os.makedirs(args.pyaviPath, exist_ok = True) # The path for the input video, input audio, output video - os.makedirs(args.pyframesPath, exist_ok = True) # Save all the video frames - os.makedirs(args.pyworkPath, exist_ok = True) # Save the results in this process by the pckl method - os.makedirs(args.pycropPath, exist_ok = True) # Save the detected face clips (audio+video) in this process - - # Extract video - args.videoFilePath = os.path.join(args.pyaviPath, 'video.avi') - # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration' - if args.duration == 0: - command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic" % \ - (args.videoPath, args.nDataLoaderThread, args.videoFilePath)) - else: - command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic" % \ - (args.videoPath, args.nDataLoaderThread, args.start, args.start + args.duration, args.videoFilePath)) - subprocess.call(command, shell=True, stdout=None) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the video and save in %s \r\n" %(args.videoFilePath)) - - # Extract audio - args.audioFilePath = os.path.join(args.pyaviPath, 'audio.wav') - command = ("ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % \ - (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)) - subprocess.call(command, shell=True, stdout=None) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the audio and save in %s \r\n" %(args.audioFilePath)) - - # Extract the video frames - command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % \ - (args.videoFilePath, args.nDataLoaderThread, os.path.join(args.pyframesPath, '%06d.jpg'))) - subprocess.call(command, shell=True, stdout=None) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the frames and save in %s \r\n" %(args.pyframesPath)) - - # Scene detection for the video frames - scene = scene_detect(args) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scene 
detection and save in %s \r\n" %(args.pyworkPath)) - - # Face detection for the video frames - faces = inference_video(args) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face detection and save in %s \r\n" %(args.pyworkPath)) - - # Face tracking - allTracks, vidTracks = [], [] - for shot in scene: - if shot[1].frame_num - shot[0].frame_num >= args.minTrack: # Discard the shot frames less than minTrack frames - allTracks.extend(track_shot(args, faces[shot[0].frame_num:shot[1].frame_num])) # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face track and detected %d tracks \r\n" %len(allTracks)) - - # Face clips cropping - for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)): - vidTracks.append(crop_video(args, track, os.path.join(args.pycropPath, '%05d'%ii))) - savePath = os.path.join(args.pyworkPath, 'tracks.pckl') - with open(savePath, 'wb') as fil: - pickle.dump(vidTracks, fil) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face Crop and saved in %s tracks \r\n" %args.pycropPath) - fil = open(savePath, 'rb') - vidTracks = pickle.load(fil) - - # Active Speaker Detection by TalkNet - files = glob.glob("%s/*.avi"%args.pycropPath) - files.sort() - scores = evaluate_network(files, args) - savePath = os.path.join(args.pyworkPath, 'scores.pckl') - with open(savePath, 'wb') as fil: - pickle.dump(scores, fil) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scores extracted and saved in %s \r\n" %args.pyworkPath) - - if args.evalCol == True: - evaluate_col_ASD(vidTracks, scores, args) # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want - quit() - else: - # Visualization, save the result as the new video - visualization(vidTracks, scores, args) + # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python). + # ``` + # . + # ├── pyavi + # │   ├── audio.wav (Audio from input video) + # │   ├── video.avi (Copy of the input video) + # │   ├── video_only.avi (Output video without audio) + # │   └── video_out.avi (Output video with audio) + # ├── pycrop (The detected face videos and audios) + # │ ├── 000000.avi + # │ ├── 000000.wav + # │ ├── 000001.avi + # │ ├── 000001.wav + # │ └── ... + # ├── pyframes (All the video frames in this video) + # │ ├── 000001.jpg + # │ ├── 000002.jpg + # │ └── ... 
+ # |── pyfilter (Output clipped videos) + # └── pywork + # ├── faces.pckl (face detection result) + # ├── scene.pckl (scene detection result) + # ├── scores.pckl (ASD result) + # └── tracks.pckl (face tracking result) + # ``` + + # Initialization + args.pyaviPath = os.path.join(args.savePath, 'pyavi') + args.pyframesPath = os.path.join(args.savePath, 'pyframes') + args.pyworkPath = os.path.join(args.savePath, 'pywork') + args.pycropPath = os.path.join(args.savePath, 'pycrop') + args.pyfilteredVideo = os.path.join(args.savePath) + # args.pyfilteredAudio = os.path.join(args.savePath, 'pyfilter', 'audio') + + # if os.path.exists(args.savePath): + # rmtree(args.savePath) + + # The path for the input video, input audio, output video + os.makedirs(args.pyaviPath, exist_ok=True) + os.makedirs(args.pyframesPath, exist_ok=True) # Save all the video frames + # Save the results in this process by the pckl method + os.makedirs(args.pyworkPath, exist_ok=True) + # Save the detected face clips (audio+video) in this process + os.makedirs(args.pycropPath, exist_ok=True) + # Save the detected face clips (audio+video) in this process + os.makedirs(args.pyfilteredVideo, exist_ok=True) + # Save the detected face clips (audio+video) in this process + # os.makedirs(args.pyfilteredAudio, exist_ok=True) + + # Extract video + args.videoFilePath = os.path.join(args.pyaviPath, 'video.avi') + # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration' + if args.duration == 0: + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r %.3f %s -loglevel panic" % + (args.videoPath, args.nDataLoaderThread, args.fps, args.videoFilePath)) + else: + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r %.3f %s -loglevel panic" % + (args.videoPath, args.nDataLoaderThread, args.start, args.start + args.duration, args.fps, args.videoFilePath)) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the video and save in %s \r\n" % (args.videoFilePath)) + + # Extract audio + args.audioFilePath = os.path.join(args.pyaviPath, 'audio.wav') + command = ("ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % + (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the audio and save in %s \r\n" % (args.audioFilePath)) + + # Extract the video frames + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % + (args.videoFilePath, args.nDataLoaderThread, os.path.join(args.pyframesPath, '%06d.jpg'))) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the frames and save in %s \r\n" % (args.pyframesPath)) + + # Scene detection for the video frames + scene = scene_detect(args) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Scene detection and save in %s \r\n" % (args.pyworkPath)) + + # Face detection for the video frames + faces = inference_video(args) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Face detection and save in %s \r\n" % (args.pyworkPath)) + + # Face tracking + allTracks, vidTracks = [], [] + for shot in scene: + # Discard the shot frames less than minTrack frames + if shot[1].frame_num - shot[0].frame_num >= args.minTrack: + # 'frames' to present this tracks' timestep, 'bbox' 
presents the location of the faces + allTracks.extend(track_shot( + args, faces[shot[0].frame_num:shot[1].frame_num])) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Face track and detected %d tracks \r\n" % len(allTracks)) + + # Face clips cropping + for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)): + vidTracks.append(crop_video( + args, track, os.path.join(args.pycropPath, '%05d' % ii))) + savePath = os.path.join(args.pyworkPath, 'tracks.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(vidTracks, fil) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Face Crop and saved in %s tracks \r\n" % args.pycropPath) + fil = open(savePath, 'rb') + vidTracks = pickle.load(fil) + + # Active Speaker Detection by TalkNet + files = glob.glob("%s/*.avi" % args.pycropPath) + files_audio = glob.glob("%s/*.wav" % args.pycropPath) + + files.sort() + scores = evaluate_network(files, args) + savePath = os.path.join(args.pyworkPath, 'scores.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(scores, fil) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Scores extracted and saved in %s \r\n" % args.pyworkPath) + + # Frame rate of the video (assumed 25 FPS) + MIN_SEGMENT_FRAMES = 3 * args.fps # Minimum segment length in frames + MAX_SEGMENT_FRAMES = 10 * args.fps # Maximum segment length in frames + + filtered_segments = [] + count_segments = 0 + # Process each track and its corresponding score + for ii, (track, score_array) in tqdm.tqdm(enumerate(zip(allTracks, scores)), total=len(allTracks)): + start_frame = None + end_frame = None + segment_frames = [] + + for frame_idx, score in enumerate(score_array): + + if score > 0: + frame_number = track['frame'][frame_idx] + frame_path = os.path.join(args.pyframesPath, f"{(frame_number+1):06d}.jpg") + image = cv2.imread(frame_path) + + # Check if the image was loaded successfully + if image is None: + print(f"Warning: Frame {frame_number} could not be loaded! 
Skipping...") + continue + + # # Display the frame using Matplotlib + # plt.imshow(image) + # plt.axis("off") # Turn off axes for a cleaner display + # plt.show() + + results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + img_h, img_w, _ = image.shape + face_2d = [] + face_3d = [] + y=0 + if results.multi_face_landmarks: + for face_landmarks in results.multi_face_landmarks: + for idx, lm in enumerate(face_landmarks.landmark): + if idx == 33 or idx == 263 or idx ==1 or idx == 61 or idx == 291 or idx==199: + if idx ==1: + nose_2d = (lm.x * img_w,lm.y * img_h) + nose_3d = (lm.x * img_w,lm.y * img_h,lm.z * 3000) + x,y = int(lm.x * img_w),int(lm.y * img_h) + + face_2d.append([x,y]) + face_3d.append(([x,y,lm.z])) + + #Get 2d, 3d Coord + face_2d = numpy.array(face_2d,dtype=numpy.float64) + face_3d = numpy.array(face_3d,dtype=numpy.float64) + + # Camera matrix (intrinsic parameters) + focal_length = 1 * img_w + cam_matrix = numpy.array([[focal_length,0,img_h/2], + [0,focal_length,img_w/2], + [0,0,1]]) + # No lens distortion + distortion_matrix = numpy.zeros((4,1),dtype=numpy.float64) + + # SolvePnP to calculate rotation and translation vectors + _,rotation_vec,_ = cv2.solvePnP(face_3d,face_2d,cam_matrix,distortion_matrix) + + #getting rotational of face + rmat,_ = cv2.Rodrigues(rotation_vec) + + angles,_,_,_,_,_ = cv2.RQDecomp3x3(rmat) + y = angles[1] * 360 + + + if abs(y) < args.angleThreshold: + # Start a new segment if not already started + if start_frame is None: + start_frame = frame_idx + end_frame = frame_idx + + # Check if segment length exceeds the maximum allowed duration + if (end_frame - start_frame + 1) > MAX_SEGMENT_FRAMES: + # Save the current valid segment + segment_frames.append((start_frame, end_frame)) + start_frame = None # Reset for next segment + else: + # End the current segment if the score is not positive + if start_frame is not None: + # Save only if segment is long enough + if (end_frame - start_frame + 1) >= MIN_SEGMENT_FRAMES: + segment_frames.append((start_frame, end_frame)) + start_frame = None + + else: + # End the current segment if the score is not positive + if start_frame is not None: + # Save only if segment is long enough + if (end_frame - start_frame + 1) >= MIN_SEGMENT_FRAMES: + segment_frames.append((start_frame, end_frame)) + start_frame = None + + # Handle last segment if it ends positively + if start_frame is not None and (end_frame - start_frame + 1) >= MIN_SEGMENT_FRAMES: + segment_frames.append((start_frame, end_frame)) + + count_segments += len(segment_frames) + if segment_frames: + # Extract and save each valid segment + # for seg_idx, (seg_start, seg_end) in enumerate(segment_frames): + seg_idx = 0 + seg_start, seg_end = segment_frames[0] + # segment_video_path = os.path.join( + # args.pyfilteredVideo, f"{args.videoName}_track_{ii:05d}_segment_{seg_idx:02d}.avi") + segment_video_path = os.path.join( + args.pyfilteredVideo, f"{ii:05d}.avi") + # segment_audio_path = os.path.join( + # args.pyfilteredAudio, f"{args.videoName}_track_{ii:05d}_segment_{seg_idx:02d}.wav") + track_path = os.path.join(args.pycropPath, '%05d' % ii) + extract_segment(track_path, seg_start+10, seg_end-10, segment_video_path, f"{ii:05d}.avi") + + # Extract middle frame for age and gender prediction + middle_frame = (seg_start + seg_end) // 2 + frame_path = os.path.join(args.pyframesPath, f"{(middle_frame + 1):06d}.jpg") + image = cv2.imread(frame_path) + + if image is not None: + faces = face_app.get(image) + if faces: + face = faces[0] # Assume single face + 
age = face.age + gender = 'male' if face.gender == 1 else 'female' + + # Save age and gender in JSON + metadata = {"age": age, "gender": gender} + json_path = os.path.join(args.pyfilteredVideo, f"{ii:05d}.json") + with open(json_path, 'w') as json_file: + json.dump(metadata, json_file) + + s3_object_key = f"{args.channelName}/{args.video_id}/{ii:05d}.json" + upload_file_to_s3(json_path, args.bucketName, s3_object_key) + + # Check if the file exists + if os.path.exists(json_path): + os.remove(json_path) + print(f"File '{json_path}' has been deleted successfully.") + else: + print(f"File '{json_path}' does not exist.") + + filtered_segments.append(segment_video_path) + + print("Found ", count_segments, " Segments") + # Save filtered segments metadata + savePath = os.path.join(args.pyworkPath, 'filtered_segments.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(filtered_segments, fil) + sys.stderr.write( + f"{time.strftime('%Y-%m-%d %H:%M:%S')} Filtered segments saved in {savePath}\n") + + # if args.evalCol == True: + # # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want + # evaluate_col_ASD(vidTracks, scores, args) + # quit() + # else: + # # Visualization, save the result as the new video + # visualization(vidTracks, scores, args) + + # At the end of the main function + folders_to_keep = [] + folders_to_delete = [args.pyfilteredVideo, args.pyaviPath ,args.pyframesPath, args.pyworkPath, args.pycropPath] + # folders_to_keep = [args.pyfilteredVideo, args.pyaviPath ,args.pyframesPath, args.pyworkPath, args.pycropPath] + # folders_to_delete = [] + + for folder in folders_to_delete: + if folder not in folders_to_keep and os.path.exists(folder): + rmtree(folder) + sys.stderr.write( + f"{time.strftime('%Y-%m-%d %H:%M:%S')} Removed unnecessary folders after processing.\n") + if __name__ == '__main__': - main() + # profiler = cProfile.Profile() + # profiler.enable() + main() # Run your script here + # profiler.disable() + + # Save the profiling data to a file + # profiler.dump_stats("profiling_results.prof") + + # # Optional: Print profiling stats to the console + # stats = pstats.Stats(profiler) + # stats.strip_dirs() + # stats.sort_stats("cumulative") # Sort by cumulative time + # stats.print_stats(20) # Print the top 20 time-consuming functions diff --git a/fileTransfer.py b/fileTransfer.py new file mode 100644 index 0000000..7988a69 --- /dev/null +++ b/fileTransfer.py @@ -0,0 +1,29 @@ +import boto3 +import os + +# Define the bucket name and local folder path +bucket_name = "scenespart1" +local_folder_path = "/home/ubuntu/ZeeNews" # Folder to upload +s3_object_key_prefix = "croppedData/ZeeNews_output/" # S3 destination prefix + +# Create an S3 client +s3_client = boto3.client("s3") + +def upload_folder_to_s3(local_folder, bucket, s3_prefix): + try: + for root, dirs, files in os.walk(local_folder): + for file in files: + local_file_path = os.path.join(root, file) + # Create the relative path for S3 object key + relative_path = os.path.relpath(local_file_path, local_folder) + s3_object_key = os.path.join(s3_prefix, relative_path) + + # Upload the file + s3_client.upload_file(local_file_path, bucket, s3_object_key) + print(f"Uploaded {local_file_path} to s3://{bucket}/{s3_object_key}") + + except Exception as e: + print(f"Error uploading folder to S3: {e}") + +# Call the function to upload the folder +upload_folder_to_s3(local_folder_path, bucket_name, s3_object_key_prefix) diff --git a/fileTransferFolder.py 
b/fileTransferFolder.py new file mode 100644 index 0000000..7988a69 --- /dev/null +++ b/fileTransferFolder.py @@ -0,0 +1,29 @@ +import boto3 +import os + +# Define the bucket name and local folder path +bucket_name = "scenespart1" +local_folder_path = "/home/ubuntu/ZeeNews" # Folder to upload +s3_object_key_prefix = "croppedData/ZeeNews_output/" # S3 destination prefix + +# Create an S3 client +s3_client = boto3.client("s3") + +def upload_folder_to_s3(local_folder, bucket, s3_prefix): + try: + for root, dirs, files in os.walk(local_folder): + for file in files: + local_file_path = os.path.join(root, file) + # Create the relative path for S3 object key + relative_path = os.path.relpath(local_file_path, local_folder) + s3_object_key = os.path.join(s3_prefix, relative_path) + + # Upload the file + s3_client.upload_file(local_file_path, bucket, s3_object_key) + print(f"Uploaded {local_file_path} to s3://{bucket}/{s3_object_key}") + + except Exception as e: + print(f"Error uploading folder to S3: {e}") + +# Call the function to upload the folder +upload_folder_to_s3(local_folder_path, bucket_name, s3_object_key_prefix) diff --git a/model/faceDetector/s3fd/box_utils.py b/model/faceDetector/s3fd/box_utils.py index 0779bcd..1bf4be2 100755 --- a/model/faceDetector/s3fd/box_utils.py +++ b/model/faceDetector/s3fd/box_utils.py @@ -35,7 +35,7 @@ def nms_(dets, thresh): inds = np.where(ovr <= thresh)[0] order = order[inds + 1] - return np.array(keep).astype(np.int) + return np.array(keep).astype(int) def decode(loc, priors, variances): diff --git a/model/faceDetector/yolov11n-face.pt b/model/faceDetector/yolov11n-face.pt new file mode 100644 index 0000000..2059cd2 Binary files /dev/null and b/model/faceDetector/yolov11n-face.pt differ diff --git a/requirement.txt b/requirement.txt index ccbc4a9..6a55e36 100755 --- a/requirement.txt +++ b/requirement.txt @@ -10,4 +10,5 @@ python_speech_features torchvision ffmpeg gdown -youtube-dl \ No newline at end of file +youtube-dl +mediapipe==0.10.9 \ No newline at end of file diff --git a/s3_uploader.py b/s3_uploader.py new file mode 100644 index 0000000..e0d5985 --- /dev/null +++ b/s3_uploader.py @@ -0,0 +1,19 @@ +import boto3 + +def upload_file_to_s3(local_file, bucket, s3_key): + """ + Upload a single file to an S3 bucket. + + :param local_file: Path to the local file. + :param bucket: Name of the S3 bucket. + :param s3_key: Path and name of the file in the S3 bucket. + """ + # Create an S3 client + s3_client = boto3.client("s3") + + try: + # Upload the file + s3_client.upload_file(local_file, bucket, s3_key) + print(f"Uploaded {local_file} to s3://{bucket}/{s3_key}") + except Exception as e: + print(f"Error uploading file: {e}")
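
Note on the upload helper: upload_file_to_s3 only prints on failure, while extract_segment in demoTalkNet.py deletes the local clip right after calling it, so a failed upload would silently lose the segment. Below is a minimal sketch, not part of this patch, of a variant that reports success so callers can keep the local file when the upload fails; it assumes only boto3/botocore, which s3_uploader.py already depends on.

import boto3
from botocore.exceptions import BotoCoreError, ClientError

def upload_file_to_s3(local_file, bucket, s3_key):
    """Upload a single file to S3 and return True on success, False on failure."""
    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(local_file, bucket, s3_key)
        print(f"Uploaded {local_file} to s3://{bucket}/{s3_key}")
        return True
    except (BotoCoreError, ClientError) as e:
        print(f"Error uploading file: {e}")
        return False

# In extract_segment, the local clip would then only be removed after a successful upload, e.g.:
# if upload_file_to_s3(output_path_video, args.bucketName, s3_object_key):
#     os.remove(output_path_video)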