diff --git a/batch_process.py b/batch_process.py new file mode 100644 index 0000000..e2754e6 --- /dev/null +++ b/batch_process.py @@ -0,0 +1,67 @@ +import os +import subprocess +import argparse + +def main(args): + # Ensure the output directory exists + if not os.path.exists(args.videoFolderOutput): + os.makedirs(args.videoFolderOutput) + + # List all videos in the input directory + video_files = [f for f in os.listdir(args.videoFolderInput) if f.endswith(('.mp4', '.avi', '.mov'))] + if not video_files: + print(f"No video files found in the directory: {args.videoFolderInput}") + return + + # Process each video + for video_file in video_files: + video_name = os.path.splitext(video_file)[0] + + output_video_path = args.videoFolderOutput + + # Ensure output directory for the video exists + if not os.path.exists(output_video_path): + os.makedirs(output_video_path) + + # Build the command to call demoTalkNet.py + command = [ + "python", "demoTalkNet.py", + "--videoName", video_name, + "--videoFolderInput", args.videoFolderInput, + "--videoFolderOutput", args.videoFolderOutput, + "--channelName", args.channelName, + ] + + # Print and execute the command + print(f"Processing video: {video_file}") + print("Command:", " ".join(command)) + subprocess.run(command) + + print("Batch processing completed.") + +if __name__ == "__main__": + # Parse arguments for the batch process + parser = argparse.ArgumentParser(description="Batch Process Videos with demoTalkNet") + parser.add_argument('--videoFolderInput', type=str, required=True, help='Path to the folder containing input videos.') + parser.add_argument('--videoFolderOutput', default="output_dir", type=str, help='Path to the folder for storing outputs and temporary files.') + parser.add_argument('--bucketName', type=str, help='Name of the S3 bucket used for uploads.') + parser.add_argument('--channelName', type=str, required=True, help='Channel name, used as the prefix for S3 object keys.') + parser.add_argument('--pretrainModel', type=str,default="pretrain_TalkSet.model", help='Path to the pretrained TalkNet model.') + parser.add_argument('--fps', type=float, default=25, help='Desired FPS.') + parser.add_argument('--frame_size', type=int, default=512, help='Desired frame size.') + parser.add_argument('--angleThreshold', type=int, default=10, help='Yaw threshold.') + parser.add_argument('--contentDetectorThreshold', type=float, default=27.0, help='Content detector threshold.') + parser.add_argument('--thresholdDetectorThreshold', type=float, default=30.0, help='Threshold detector threshold.') + parser.add_argument('--nDataLoaderThread', type=int, default=10, help='Number of data loader threads.') + parser.add_argument('--facedetScale', type=float, default=0.25, help='Face detection scale factor.') + parser.add_argument('--minTrack', type=int, default=40, help='Minimum frames for each shot.') + parser.add_argument('--numFailedDet', type=int, default=5, help='Missed detections allowed before stopping tracking.') + parser.add_argument('--minFaceSize', type=int, default=100, help='Minimum face size in pixels.') + parser.add_argument('--cropScale', type=float, default=0.40, help='Scale bounding box.') + parser.add_argument('--start', type=int, default=0, help='Start time of the video.') + parser.add_argument('--duration', type=int, default=0, help='Duration of the video (0 for full video).') + parser.add_argument('--evalCol', action='store_true', help='Evaluate on Columbia dataset.') + parser.add_argument('--colSavePath',
type=str, default="/data08/col", help='Path for inputs, temps, and outputs for Columbia evaluation.') + + args = parser.parse_args() + main(args) diff --git a/demoTalkNet.py b/demoTalkNet.py index 0e496c2..d686f14 100755 --- a/demoTalkNet.py +++ b/demoTalkNet.py @@ -1,5 +1,29 @@ -import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features - +import subprocess +import sys +import time +import os +# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Suppress TensorFlow logging +import tqdm +import torch +import argparse +import glob +import subprocess +import warnings +import cv2 +import pickle +import numpy +import pdb +import math +import python_speech_features +import mediapipe as mp +import matplotlib.pyplot as plt + +import json +import insightface +from insightface.app import FaceAnalysis + +import cProfile +import pstats from scipy import signal from shutil import rmtree from scipy.io import wavfile @@ -8,451 +32,922 @@ from scenedetect.video_manager import VideoManager from scenedetect.scene_manager import SceneManager +from scenedetect import SceneManager, open_video, ContentDetector, ThresholdDetector from scenedetect.frame_timecode import FrameTimecode from scenedetect.stats_manager import StatsManager from scenedetect.detectors import ContentDetector from model.faceDetector.s3fd import S3FD from talkNet import talkNet +from s3_uploader import upload_file_to_s3 warnings.filterwarnings("ignore") -parser = argparse.ArgumentParser(description = "TalkNet Demo or Columnbia ASD Evaluation") - -parser.add_argument('--videoName', type=str, default="001", help='Demo video name') -parser.add_argument('--videoFolder', type=str, default="demo", help='Path for inputs, tmps and outputs') -parser.add_argument('--pretrainModel', type=str, default="pretrain_TalkSet.model", help='Path for the pretrained TalkNet model') - -parser.add_argument('--nDataLoaderThread', type=int, default=10, help='Number of workers') -parser.add_argument('--facedetScale', type=float, default=0.25, help='Scale factor for face detection, the frames will be scale to 0.25 orig') -parser.add_argument('--minTrack', type=int, default=10, help='Number of min frames for each shot') -parser.add_argument('--numFailedDet', type=int, default=10, help='Number of missed detections allowed before tracking is stopped') -parser.add_argument('--minFaceSize', type=int, default=1, help='Minimum face size in pixels') -parser.add_argument('--cropScale', type=float, default=0.40, help='Scale bounding box') - -parser.add_argument('--start', type=int, default=0, help='The start time of the video') -parser.add_argument('--duration', type=int, default=0, help='The duration of the video, when set as 0, will extract the whole video') - -parser.add_argument('--evalCol', dest='evalCol', action='store_true', help='Evaluate on Columnbia dataset') -parser.add_argument('--colSavePath', type=str, default="/data08/col", help='Path for inputs, tmps and outputs') +mp_face_mesh = mp.solutions.face_mesh +face_mesh = mp_face_mesh.FaceMesh( + min_detection_confidence=0.5, min_tracking_confidence=0.5) + +face_app = FaceAnalysis(name='buffalo_s') # This model supports age & gender +face_app.prepare(ctx_id=0) + +parser = argparse.ArgumentParser( + description="TalkNet Demo or Columnbia ASD Evaluation") + +parser.add_argument('--videoName', type=str, + default="001", help='Demo video name') +parser.add_argument('--videoFolderInput', type=str, + required=True, help='Path for inputs') 
+parser.add_argument('--videoFolderOutput', type=str, + default="output_dir", help='Path for tmps and outputs') +parser.add_argument('--pretrainModel', type=str, + default="pretrain_TalkSet.model", help='Path for the pretrained TalkNet model') +parser.add_argument('--fps', type=float, + default=25, help='Desired FPS') +parser.add_argument('--frame_size', type=int, + default=512, help='Desired frame size') + +parser.add_argument('--angleThreshold', type=int, + default=25, help='Desired threshold for yaw') +parser.add_argument('--contentDetectorThreshold', type=float, + default=27.0, help='Content detector threshold') +parser.add_argument('--thresholdDetectorThreshold', type=float, + default=30.0, help='Threshold detector threshold') +parser.add_argument('--bucketName', type=str, + default='hdindiandataset', help='Bucket Name in AWS') +parser.add_argument('--channelName', type=str, + required=True, help='Channel name, used as the prefix for S3 object keys') + +parser.add_argument('--nDataLoaderThread', type=int, + default=10, help='Number of workers') +parser.add_argument('--facedetScale', type=float, default=0.25, + help='Scale factor for face detection, the frames will be scaled to 0.25 of the original size') +parser.add_argument('--minTrack', type=int, + default=40, help='Number of min frames for each shot') +parser.add_argument('--numFailedDet', type=int, default=5, + help='Number of missed detections allowed before tracking is stopped') +parser.add_argument('--minFaceSize', type=int, + default=100, help='Minimum face size in pixels') +parser.add_argument('--cropScale', type=float, + default=0.40, help='Scale bounding box') + +parser.add_argument('--start', type=int, + default=0, help='The start time of the video') +parser.add_argument('--duration', type=int, default=0, + help='The duration of the video, when set as 0, will extract the whole video') + +parser.add_argument('--evalCol', dest='evalCol', + action='store_true', help='Evaluate on Columbia dataset') +parser.add_argument('--colSavePath', type=str, + default="/data08/col", help='Path for inputs, tmps and outputs') args = parser.parse_args() -if os.path.isfile(args.pretrainModel) == False: # Download the pretrained model +if os.path.isfile(args.pretrainModel) == False: # Download the pretrained model Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea" - cmd = "gdown --id %s -O %s"%(Link, args.pretrainModel) + cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel) subprocess.call(cmd, shell=True, stdout=None) if args.evalCol == True: - # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using) - # 2. extract audio, extract video frames - # 3. scend detection, face detection and face tracking - # 4. active speaker detection for the detected face clips - # 5. use iou to find the identity of each face clips, compute the F1 results - # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed.
For reference, I used 1.5 hour - # The step 4 and 5 need less than 10 minutes - # Need about 20G space finally - # ``` - args.videoName = 'col' - args.videoFolder = args.colSavePath - args.savePath = os.path.join(args.videoFolder, args.videoName) - args.videoPath = os.path.join(args.videoFolder, args.videoName + '.mp4') - args.duration = 0 - if os.path.isfile(args.videoPath) == False: # Download video - link = 'https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s' - cmd = "youtube-dl -f best -o %s '%s'"%(args.videoPath, link) - output = subprocess.call(cmd, shell=True, stdout=None) - if os.path.isdir(args.videoFolder + '/col_labels') == False: # Download label - link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv" - cmd = "gdown --id %s -O %s"%(link, args.videoFolder + '/col_labels.tar.gz') - subprocess.call(cmd, shell=True, stdout=None) - cmd = "tar -xzvf %s -C %s"%(args.videoFolder + '/col_labels.tar.gz', args.videoFolder) - subprocess.call(cmd, shell=True, stdout=None) - os.remove(args.videoFolder + '/col_labels.tar.gz') + # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using) + # 2. extract audio, extract video frames + # 3. scend detection, face detection and face tracking + # 4. active speaker detection for the detected face clips + # 5. use iou to find the identity of each face clips, compute the F1 results + # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed. For reference, I used 1.5 hour + # The step 4 and 5 need less than 10 minutes + # Need about 20G space finally + # ``` + args.videoName = 'col' + args.videoFolder = args.colSavePath + args.savePath = os.path.join(args.videoFolder, args.videoName) + args.videoPath = os.path.join(args.videoFolder, args.videoName + '.mp4') + args.duration = 0 + if os.path.isfile(args.videoPath) == False: # Download video + link = 'https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s' + cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link) + output = subprocess.call(cmd, shell=True, stdout=None) + if os.path.isdir(args.videoFolder + '/col_labels') == False: # Download label + link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv" + cmd = "gdown --id %s -O %s" % (link, + args.videoFolder + '/col_labels.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s" % (args.videoFolder + + '/col_labels.tar.gz', args.videoFolder) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.videoFolder + '/col_labels.tar.gz') else: - args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + '.*'))[0] - args.savePath = os.path.join(args.videoFolder, args.videoName) + args.videoPath = glob.glob(os.path.join( + args.videoFolderInput, args.videoName + '.*'))[0] + + args.video_id = args.videoName.split('_cluster_')[0] + args.savePath = os.path.join(args.videoFolderOutput, args.video_id) + # args.savePath = args.videoFolderOutput + + +from collections import namedtuple +Scene = namedtuple('Scene', ['frame_num']) +class Scene: + def __init__(self, frame_num): + self.frame_num = frame_num def scene_detect(args): - # CPU: Scene detection, output is the list of each shot's time duration - videoManager = VideoManager([args.videoFilePath]) - statsManager = StatsManager() - sceneManager = SceneManager(statsManager) - sceneManager.add_detector(ContentDetector()) - baseTimecode = videoManager.get_base_timecode() - videoManager.set_downscale_factor() - videoManager.start() - sceneManager.detect_scenes(frame_source = videoManager) - 
sceneList = sceneManager.get_scene_list(baseTimecode) - savePath = os.path.join(args.pyworkPath, 'scene.pckl') - if sceneList == []: - sceneList = [(videoManager.get_base_timecode(),videoManager.get_current_timecode())] - with open(savePath, 'wb') as fil: - pickle.dump(sceneList, fil) - sys.stderr.write('%s - scenes detected %d\n'%(args.videoFilePath, len(sceneList))) - return sceneList + # CPU: Scene detection, output is the list of each shot's time duration + video = open_video(args.videoFilePath) + + sceneManager = SceneManager() + + # sceneManager.add_detector(ContentDetector(threshold=args.contentDetectorThreshold, min_scene_len=30)) + # sceneManager.add_detector(ThresholdDetector(threshold=args.thresholdDetectorThreshold)) + + sceneManager.add_detector(ContentDetector()) + # sceneManager.add_detector(ThresholdDetector()) + + sceneManager.detect_scenes(video) + sceneList = sceneManager.get_scene_list() + + savePath = os.path.join(args.pyworkPath, 'scene.pckl') + if not sceneList: + cap = cv2.VideoCapture(args.videoFilePath) + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + # Fallback: If no scenes detected, create a single "scene" from start to end + # sceneList = [(0, frame_count)] + sceneList = [(Scene(frame_num=0), Scene(frame_num=frame_count))] + cap.release() + with open(savePath, 'wb') as file: + pickle.dump(sceneList, file) + sys.stderr.write(f"{args.videoFilePath} - scenes detected: {len(sceneList)}\n") + + return sceneList def inference_video(args): - # GPU: Face detection, output is the list contains the face location and score in this frame - DET = S3FD(device='cuda') - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) - flist.sort() - dets = [] - for fidx, fname in enumerate(flist): - image = cv2.imread(fname) - imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale]) - dets.append([]) - for bbox in bboxes: - dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) # dets has the frames info, bbox info, conf info - sys.stderr.write('%s-%05d; %d dets\r' % (args.videoFilePath, fidx, len(dets[-1]))) - savePath = os.path.join(args.pyworkPath,'faces.pckl') - with open(savePath, 'wb') as fil: - pickle.dump(dets, fil) - return dets - -def bb_intersection_over_union(boxA, boxB, evalCol = False): - # CPU: IOU Function to calculate overlap between two image - xA = max(boxA[0], boxB[0]) - yA = max(boxA[1], boxB[1]) - xB = min(boxA[2], boxB[2]) - yB = min(boxA[3], boxB[3]) - interArea = max(0, xB - xA) * max(0, yB - yA) - boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) - boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) - if evalCol == True: - iou = interArea / float(boxAArea) - else: - iou = interArea / float(boxAArea + boxBArea - interArea) - return iou + # GPU: Face detection, output is the list contains the face location and score in this frame + DET = S3FD(device='cuda') + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) + flist.sort() + dets = [] + for fidx, fname in enumerate(flist): + image = cv2.imread(fname) + imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces( + imageNumpy, conf_th=0.9, scales=[args.facedetScale]) + dets.append([]) + for bbox in bboxes: + # dets has the frames info, bbox info, conf info + dets[-1].append({'frame': fidx, 'bbox': (bbox[:-1] + ).tolist(), 'conf': bbox[-1]}) + sys.stderr.write('%s-%05d; %d dets\r' % + (args.videoFilePath, fidx, len(dets[-1]))) + savePath = os.path.join(args.pyworkPath, 
'faces.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(dets, fil) + return dets + + +# import os +# import glob +# import cv2 +# import torch +# from ultralytics import YOLO # YOLOv8 library +# import pickle +# import sys + +# def inference_video(args): +# # Load the YOLOv11n-face model +# model = YOLO('./model/faceDetector/yolov11n-face.pt') # Path to the YOLOv11n-face model + +# flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) +# flist.sort() +# dets = [] + +# for fidx, fname in enumerate(flist): +# # Read the frame +# image = cv2.imread(fname) +# results = model(image, verbose=False) +# detections = results[0].boxes.data.cpu().numpy() + +# dets.append([]) +# for det in detections: +# x1, y1, x2, y2, conf, class_id = det +# if conf>0.8: +# dets[-1].append({ +# 'frame': fidx, +# 'bbox': [x1, y1, x2, y2], +# 'conf': conf +# }) + +# # Log progress +# sys.stderr.write('%s-%05d; %d dets\r' % +# (args.videoFilePath, fidx, len(dets[-1]))) + +# # Save detections +# savePath = os.path.join(args.pyworkPath, 'faces.pckl') +# with open(savePath, 'wb') as fil: +# pickle.dump(dets, fil) + +# return dets + +# def inference_video(args): +# # Load the YOLOv11n-face model +# model = YOLO('./model/faceDetector/yolov11n-face.pt') # Path to the YOLOv11n-face model + +# # Get list of all frame image files +# flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) +# flist.sort() +# dets = [] + +# for fidx, fname in enumerate(flist): +# # Read the frame +# image = cv2.imread(fname) + +# # Perform face detection +# results = model.predict(image, conf=0.8, verbose=False) # Use .predict() + +# # Extract detections +# dets.append([]) +# if results[0].boxes: +# detections = results[0].boxes.data.cpu().numpy() # Bounding box data +# for det in detections: +# x1, y1, x2, y2, conf = det[:5] # Parse bounding box and confidence +# if conf > 0.8: # Check confidence threshold +# dets[-1].append({ +# 'frame': fidx, +# 'bbox': [x1, y1, x2, y2], +# 'conf': conf +# }) + +# # Log progress +# sys.stderr.write('%s-%05d; %d dets\r' % +# (args.videoFilePath, fidx, len(dets[-1]))) + +# # Save detections to a file +# savePath = os.path.join(args.pyworkPath, 'faces.pckl') +# with open(savePath, 'wb') as fil: +# pickle.dump(dets, fil) + +# return dets + +def bb_intersection_over_union(boxA, boxB, evalCol=False): + # CPU: IOU Function to calculate overlap between two image + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + if evalCol == True: + iou = interArea / float(boxAArea) + else: + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + def track_shot(args, sceneFaces): - # CPU: Face tracking - iouThres = 0.5 # Minimum IOU between consecutive face detections - tracks = [] - while True: - track = [] - for frameFaces in sceneFaces: - for face in frameFaces: - if track == []: - track.append(face) - frameFaces.remove(face) - elif face['frame'] - track[-1]['frame'] <= args.numFailedDet: - iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) - if iou > iouThres: - track.append(face) - frameFaces.remove(face) - continue - else: - break - if track == []: - break - elif len(track) > args.minTrack: - frameNum = numpy.array([ f['frame'] for f in track ]) - bboxes = numpy.array([numpy.array(f['bbox']) for f in track]) - frameI = numpy.arange(frameNum[0],frameNum[-1]+1) 
- bboxesI = [] - for ij in range(0,4): - interpfn = interp1d(frameNum, bboxes[:,ij]) - bboxesI.append(interpfn(frameI)) - bboxesI = numpy.stack(bboxesI, axis=1) - if max(numpy.mean(bboxesI[:,2]-bboxesI[:,0]), numpy.mean(bboxesI[:,3]-bboxesI[:,1])) > args.minFaceSize: - tracks.append({'frame':frameI,'bbox':bboxesI}) - return tracks + # CPU: Face tracking + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + while True: + track = [] + for frameFaces in sceneFaces: + for face in frameFaces: + if track == []: + track.append(face) + frameFaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= args.numFailedDet: + iou = bb_intersection_over_union( + face['bbox'], track[-1]['bbox']) + if iou > iouThres: + track.append(face) + frameFaces.remove(face) + continue + else: + break + if track == []: + break + elif len(track) > args.minTrack: + frameNum = numpy.array([f['frame'] for f in track]) + bboxes = numpy.array([numpy.array(f['bbox']) for f in track]) + frameI = numpy.arange(frameNum[0], frameNum[-1]+1) + bboxesI = [] + for ij in range(0, 4): + interpfn = interp1d(frameNum, bboxes[:, ij]) + bboxesI.append(interpfn(frameI)) + bboxesI = numpy.stack(bboxesI, axis=1) + if max(numpy.mean(bboxesI[:, 2]-bboxesI[:, 0]), numpy.mean(bboxesI[:, 3]-bboxesI[:, 1])) > args.minFaceSize: + tracks.append({'frame': frameI, 'bbox': bboxesI}) + return tracks + def crop_video(args, track, cropFile): - # CPU: crop the face clips - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Read the frames - flist.sort() - vOut = cv2.VideoWriter(cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video - dets = {'x':[], 'y':[], 's':[]} - for det in track['bbox']: # Read the tracks - dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) - dets['y'].append((det[1]+det[3])/2) # crop center x - dets['x'].append((det[0]+det[2])/2) # crop center y - dets['s'] = signal.medfilt(dets['s'], kernel_size=13) # Smooth detections - dets['x'] = signal.medfilt(dets['x'], kernel_size=13) - dets['y'] = signal.medfilt(dets['y'], kernel_size=13) - for fidx, frame in enumerate(track['frame']): - cs = args.cropScale - bs = dets['s'][fidx] # Detection box size - bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount - image = cv2.imread(flist[frame]) - frame = numpy.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110)) - my = dets['y'][fidx] + bsi # BBox center Y - mx = dets['x'][fidx] + bsi # BBox center X - face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] - vOut.write(cv2.resize(face, (224, 224))) - audioTmp = cropFile + '.wav' - audioStart = (track['frame'][0]) / 25 - audioEnd = (track['frame'][-1]+1) / 25 - vOut.release() - command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % \ - (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)) - output = subprocess.call(command, shell=True, stdout=None) # Crop audio file - _, audio = wavfile.read(audioTmp) - command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % \ - (cropFile, audioTmp, args.nDataLoaderThread, cropFile)) # Combine audio and video file - output = subprocess.call(command, shell=True, stdout=None) - os.remove(cropFile + 't.avi') - return {'track':track, 'proc_track':dets} + # CPU: crop the face clips + flist = glob.glob(os.path.join( + args.pyframesPath, '*.jpg')) # Read the frames + flist.sort() + vOut = 
cv2.VideoWriter( + cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), args.fps, (args.frame_size, args.frame_size)) # Write video + dets = {'x': [], 'y': [], 's': []} + for det in track['bbox']: # Read the tracks + dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + dets['s'] = signal.medfilt(dets['s'], kernel_size=13) # Smooth detections + dets['x'] = signal.medfilt(dets['x'], kernel_size=13) + dets['y'] = signal.medfilt(dets['y'], kernel_size=13) + for fidx, frame in enumerate(track['frame']): + cs = args.cropScale + bs = dets['s'][fidx] # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount + image = cv2.imread(flist[frame]) + frame = numpy.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), + 'constant', constant_values=(110, 110)) + my = dets['y'][fidx] + bsi # BBox center Y + mx = dets['x'][fidx] + bsi # BBox center X + face = frame[int(my-bs):int(my+bs*(1+2*cs)), + int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + vOut.write(cv2.resize(face, (args.frame_size, args.frame_size))) + audioTmp = cropFile + '.wav' + audioStart = (track['frame'][0]) / args.fps + audioEnd = (track['frame'][-1]+1) / args.fps + vOut.release() + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % + (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)) + output = subprocess.call( + command, shell=True, stdout=None) # Crop audio file + _, audio = wavfile.read(audioTmp) + command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % + (cropFile, audioTmp, args.nDataLoaderThread, cropFile)) # Combine audio and video file + output = subprocess.call(command, shell=True, stdout=None) + os.remove(cropFile + 't.avi') + return {'track': track, 'proc_track': dets} + def extract_MFCC(file, outPath): - # CPU: extract mfcc - sr, audio = wavfile.read(file) - mfcc = python_speech_features.mfcc(audio,sr) # (N_frames, 13) [1s = 100 frames] - featuresPath = os.path.join(outPath, file.split('/')[-1].replace('.wav', '.npy')) - numpy.save(featuresPath, mfcc) + # CPU: extract mfcc + sr, audio = wavfile.read(file) + # (N_frames, 13) [1s = 100 frames] + mfcc = python_speech_features.mfcc(audio, sr) + featuresPath = os.path.join( + outPath, file.split('/')[-1].replace('.wav', '.npy')) + numpy.save(featuresPath, mfcc) + def evaluate_network(files, args): - # GPU: active speaker detection by pretrained TalkNet - s = talkNet() - s.loadParameters(args.pretrainModel) - sys.stderr.write("Model %s loaded from previous state! 
\r\n"%args.pretrainModel) - s.eval() - allScores = [] - # durationSet = {1,2,4,6} # To make the result more reliable - durationSet = {1,1,1,2,2,2,3,3,4,5,6} # Use this line can get more reliable result - for file in tqdm.tqdm(files, total = len(files)): - fileName = os.path.splitext(file.split('/')[-1])[0] # Load audio and video - _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + '.wav')) - audioFeature = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025, winstep = 0.010) - video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + '.avi')) - videoFeature = [] - while video.isOpened(): - ret, frames = video.read() - if ret == True: - face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) - face = cv2.resize(face, (224,224)) - face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] - videoFeature.append(face) - else: - break - video.release() - videoFeature = numpy.array(videoFeature) - length = min((audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0] / 25) - audioFeature = audioFeature[:int(round(length * 100)),:] - videoFeature = videoFeature[:int(round(length * 25)),:,:] - allScore = [] # Evaluation use TalkNet - for duration in durationSet: - batchSize = int(math.ceil(length / duration)) - scores = [] - with torch.no_grad(): - for i in range(batchSize): - inputA = torch.FloatTensor(audioFeature[i * duration * 100:(i+1) * duration * 100,:]).unsqueeze(0).cuda() - inputV = torch.FloatTensor(videoFeature[i * duration * 25: (i+1) * duration * 25,:,:]).unsqueeze(0).cuda() - embedA = s.model.forward_audio_frontend(inputA) - embedV = s.model.forward_visual_frontend(inputV) - embedA, embedV = s.model.forward_cross_attention(embedA, embedV) - out = s.model.forward_audio_visual_backend(embedA, embedV) - score = s.lossAV.forward(out, labels = None) - scores.extend(score) - allScore.append(scores) - allScore = numpy.round((numpy.mean(numpy.array(allScore), axis = 0)), 1).astype(float) - allScores.append(allScore) - return allScores + # GPU: active speaker detection by pretrained TalkNet + s = talkNet() + s.loadParameters(args.pretrainModel) + sys.stderr.write("Model %s loaded from previous state! 
\r\n" % + args.pretrainModel) + s.eval() + allScores = [] + # durationSet = {1,2,4,6} # To make the result more reliable + # Use this line can get more reliable result + durationSet = {1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6} + for file in tqdm.tqdm(files, total=len(files)): + fileName = os.path.splitext(file.split( + '/')[-1])[0] # Load audio and video + _, audio = wavfile.read(os.path.join( + args.pycropPath, fileName + '.wav')) + audioFeature = python_speech_features.mfcc( + audio, 16000, numcep=13, winlen=0.025, winstep=0.010) + video = cv2.VideoCapture(os.path.join( + args.pycropPath, fileName + '.avi')) + videoFeature = [] + while video.isOpened(): + ret, frames = video.read() + if ret == True: + face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) + # face = cv2.resize(face, (args.frame_size, args.frame_size)) + face = cv2.resize(face, (224, 224)) + face = face[int(112-(112/2)):int(112+(112/2)), + int(112-(112/2)):int(112+(112/2))] + videoFeature.append(face) + else: + break + video.release() + videoFeature = numpy.array(videoFeature) + length = min((audioFeature.shape[0] - audioFeature.shape[0] % + 4) / 100, videoFeature.shape[0] / args.fps) + audioFeature = audioFeature[:int(round(length * 100)), :] + videoFeature = videoFeature[:int(round(length * args.fps)), :, :] + allScore = [] # Evaluation use TalkNet + for duration in durationSet: + batchSize = int(math.ceil(length / duration)) + scores = [] + with torch.no_grad(): + for i in range(batchSize): + inputA = torch.FloatTensor( + audioFeature[i * duration * 100:(i+1) * duration * 100, :]).unsqueeze(0).cuda() + inputV = torch.FloatTensor( + videoFeature[int(i * duration * args.fps): int((i+1) * duration * args.fps), :, :]).unsqueeze(0).cuda() + embedA = s.model.forward_audio_frontend(inputA) + embedV = s.model.forward_visual_frontend(inputV) + embedA, embedV = s.model.forward_cross_attention( + embedA, embedV) + out = s.model.forward_audio_visual_backend(embedA, embedV) + score = s.lossAV.forward(out, labels=None) + scores.extend(score) + allScore.append(scores) + allScore = numpy.round( + (numpy.mean(numpy.array(allScore), axis=0)), 1).astype(float) + allScores.append(allScore) + return allScores + + def visualization(tracks, scores, args): - # CPU: visulize the result for video format - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) - flist.sort() - faces = [[] for i in range(len(flist))] - for tidx, track in enumerate(tracks): - score = scores[tidx] - for fidx, frame in enumerate(track['track']['frame'].tolist()): - s = score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)] # average smoothing - s = numpy.mean(s) - faces[frame].append({'track':tidx, 'score':float(s),'s':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) - firstImage = cv2.imread(flist[0]) - fw = firstImage.shape[1] - fh = firstImage.shape[0] - vOut = cv2.VideoWriter(os.path.join(args.pyaviPath, 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw,fh)) - colorDict = {0: 0, 1: 255} - for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)): - image = cv2.imread(fname) - for face in faces[fidx]: - clr = colorDict[int((face['score'] >= 0))] - txt = round(face['score'], 1) - cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int(face['x']+face['s']), int(face['y']+face['s'])),(0,clr,255-clr),10) - cv2.putText(image,'%s'%(txt), (int(face['x']-face['s']), int(face['y']-face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,clr,255-clr),5) - vOut.write(image) - vOut.release() - command =
("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % \ - (os.path.join(args.pyaviPath, 'video_only.avi'), os.path.join(args.pyaviPath, 'audio.wav'), \ - args.nDataLoaderThread, os.path.join(args.pyaviPath,'video_out.avi'))) - output = subprocess.call(command, shell=True, stdout=None) + # CPU: visulize the result for video format + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track['track']['frame'].tolist()): + # average smoothing + s = score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)] + s = numpy.mean(s) + faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s'] + [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]}) + firstImage = cv2.imread(flist[0]) + fw = firstImage.shape[1] + fh = firstImage.shape[0] + vOut = cv2.VideoWriter(os.path.join(args.pyaviPath, 'video_only.avi'), + cv2.VideoWriter_fourcc(*'XVID'), args.fps, (fw, fh)) + colorDict = {0: 0, 1: 255} + for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)): + image = cv2.imread(fname) + # image = cv2.resize(image, ) + for face in faces[fidx]: + clr = colorDict[int((face['score'] >= 0))] + txt = round(face['score'], 1) + cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int( + face['x']+face['s']), int(face['y']+face['s'])), (0, clr, 255-clr), 10) + cv2.putText(image, '%s' % (txt), (int(face['x']-face['s']), int( + face['y']-face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, clr, 255-clr), 5) + vOut.write(image) + vOut.release() + command = ("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % + (os.path.join(args.pyaviPath, 'video_only.avi'), os.path.join(args.pyaviPath, 'audio.wav'), + args.nDataLoaderThread, os.path.join(args.pyaviPath, 'video_out.avi'))) + output = subprocess.call(command, shell=True, stdout=None) + def evaluate_col_ASD(tracks, scores, args): - txtPath = args.videoFolder + '/col_labels/fusion/*.txt' # Load labels - predictionSet = {} - for name in {'long', 'bell', 'boll', 'lieb', 'sick', 'abbas'}: - predictionSet[name] = [[],[]] - dictGT = {} - txtFiles = glob.glob("%s"%txtPath) - for file in txtFiles: - lines = open(file).read().splitlines() - idName = file.split('/')[-1][:-4] - for line in lines: - data = line.split('\t') - frame = int(int(data[0]) / 29.97 * 25) - x1 = int(data[1]) - y1 = int(data[2]) - x2 = int(data[1]) + int(data[3]) - y2 = int(data[2]) + int(data[3]) - gt = int(data[4]) - if frame in dictGT: - dictGT[frame].append([x1,y1,x2,y2,gt,idName]) - else: - dictGT[frame] = [[x1,y1,x2,y2,gt,idName]] - flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Load files - flist.sort() - faces = [[] for i in range(len(flist))] - for tidx, track in enumerate(tracks): - score = scores[tidx] - for fidx, frame in enumerate(track['track']['frame'].tolist()): - s = numpy.mean(score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)]) # average smoothing - faces[frame].append({'track':tidx, 'score':float(s),'s':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) - for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)): - if fidx in dictGT: # This frame has label - for gtThisFrame in dictGT[fidx]: # What this label is ? 
- faceGT = gtThisFrame[0:4] - labelGT = gtThisFrame[4] - idGT = gtThisFrame[5] - ious = [] - for face in faces[fidx]: # Find the right face in my result - faceLocation = [int(face['x']-face['s']), int(face['y']-face['s']), int(face['x']+face['s']), int(face['y']+face['s'])] - faceLocation_new = [int(face['x']-face['s']) // 2, int(face['y']-face['s']) // 2, int(face['x']+face['s']) // 2, int(face['y']+face['s']) // 2] - iou = bb_intersection_over_union(faceLocation_new, faceGT, evalCol = True) - if iou > 0.5: - ious.append([iou, round(face['score'],2)]) - if len(ious) > 0: # Find my result - ious.sort() - labelPredict = ious[-1][1] - else: - labelPredict = 0 - x1 = faceGT[0] - y1 = faceGT[1] - width = faceGT[2] - faceGT[0] - predictionSet[idGT][0].append(labelPredict) - predictionSet[idGT][1].append(labelGT) - names = ['long', 'bell', 'boll', 'lieb', 'sick', 'abbas'] # Evaluate - names.sort() - F1s = 0 - for i in names: - scores = numpy.array(predictionSet[i][0]) - labels = numpy.array(predictionSet[i][1]) - scores = numpy.int64(scores > 0) - F1 = f1_score(labels, scores) - ACC = accuracy_score(labels, scores) - if i != 'abbas': - F1s += F1 - print("%s, ACC:%.2f, F1:%.2f"%(i, 100 * ACC, 100 * F1)) - print("Average F1:%.2f"%(100 * (F1s / 5))) + txtPath = args.videoFolder + '/col_labels/fusion/*.txt' # Load labels + predictionSet = {} + for name in {'long', 'bell', 'boll', 'lieb', 'sick', 'abbas'}: + predictionSet[name] = [[], []] + dictGT = {} + txtFiles = glob.glob("%s" % txtPath) + for file in txtFiles: + lines = open(file).read().splitlines() + idName = file.split('/')[-1][:-4] + for line in lines: + data = line.split('\t') + frame = int(int(data[0]) / 29.97 * args.fps) + x1 = int(data[1]) + y1 = int(data[2]) + x2 = int(data[1]) + int(data[3]) + y2 = int(data[2]) + int(data[3]) + gt = int(data[4]) + if frame in dictGT: + dictGT[frame].append([x1, y1, x2, y2, gt, idName]) + else: + dictGT[frame] = [[x1, y1, x2, y2, gt, idName]] + flist = glob.glob(os.path.join(args.pyframesPath, '*.jpg')) # Load files + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track['track']['frame'].tolist()): + # average smoothing + s = numpy.mean( + score[max(fidx - 2, 0): min(fidx + 3, len(score) - 1)]) + faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s'] + [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]}) + for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)): + if fidx in dictGT: # This frame has label + for gtThisFrame in dictGT[fidx]: # What this label is ? 
+ faceGT = gtThisFrame[0:4] + labelGT = gtThisFrame[4] + idGT = gtThisFrame[5] + ious = [] + for face in faces[fidx]: # Find the right face in my result + faceLocation = [int(face['x']-face['s']), int(face['y']-face['s']), + int(face['x']+face['s']), int(face['y']+face['s'])] + faceLocation_new = [int(face['x']-face['s']) // 2, int(face['y']-face['s']) // 2, int( + face['x']+face['s']) // 2, int(face['y']+face['s']) // 2] + iou = bb_intersection_over_union( + faceLocation_new, faceGT, evalCol=True) + if iou > 0.5: + ious.append([iou, round(face['score'], 2)]) + if len(ious) > 0: # Find my result + ious.sort() + labelPredict = ious[-1][1] + else: + labelPredict = 0 + x1 = faceGT[0] + y1 = faceGT[1] + width = faceGT[2] - faceGT[0] + predictionSet[idGT][0].append(labelPredict) + predictionSet[idGT][1].append(labelGT) + names = ['long', 'bell', 'boll', 'lieb', 'sick', 'abbas'] # Evaluate + names.sort() + F1s = 0 + for i in names: + scores = numpy.array(predictionSet[i][0]) + labels = numpy.array(predictionSet[i][1]) + scores = numpy.int64(scores > 0) + F1 = f1_score(labels, scores) + ACC = accuracy_score(labels, scores) + if i != 'abbas': + F1s += F1 + print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1)) + print("Average F1:%.2f" % (100 * (F1s / 5))) + + +# def extract_segment(track_path, start_frame, end_frame, output_path_video, output_path_audio): +# # Convert start_frame and end_frame to time (in seconds) +# start_time = start_frame / args.fps +# end_time = end_frame / args.fps + +# # FFmpeg command to extract video with audio trimming +# command_video = f'ffmpeg -accurate_seek -i "{track_path}.avi" -ss {start_time} -to {end_time} -c:v libx264 -c:a aac "{output_path_video}" -loglevel panic' + +# # FFmpeg command to extract audio separately +# command_audio = f'ffmpeg -accurate_seek -i "{track_path}.avi" -ss {start_time} -to {end_time} -vn -c:a aac "{output_path_audio}" -loglevel panic' + +# # Execute command for video and audio extraction +# try: +# # Extract video +# subprocess.run(command_video, shell=True, check=True, +# stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# # print(f"Video segment extracted successfully: {output_path_video}") + +# # Extract audio +# subprocess.run(command_audio, shell=True, check=True, +# stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# # print(f"Audio segment extracted successfully: {output_path_audio}") + +# except subprocess.CalledProcessError as e: +# print(f"Error extracting segment: {e}") + +import subprocess + +def extract_segment(track_path, start_frame, end_frame, output_path_video, file_name): + # Convert start_frame and end_frame to time (in seconds) + start_time = start_frame / args.fps + end_time = end_frame / args.fps + + # FFmpeg command to extract video with audio trimming + command_video = [ + "ffmpeg", + "-accurate_seek", + "-i", f"{track_path}.avi", + "-ss", str(start_time), + "-to", str(end_time), + "-c:v", "libx264", + "-c:a", "aac", + output_path_video, + ] + + try: + # Run the command and capture output + result = subprocess.run( + command_video, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + if result.returncode != 0: + print("Error during FFmpeg execution:") + print(result.stderr) # Print the error message from FFmpeg + else: + print("Segment extracted successfully, Copying to S3 and deleting local file") + + s3_object_key = f"{args.channelName}/{args.video_id}/{file_name}" + upload_file_to_s3(output_path_video, args.bucketName, s3_object_key) + + # Check if the file exists + if 
os.path.exists(output_path_video): + os.remove(output_path_video) + print(f"File '{output_path_video}' has been deleted successfully.") + else: + print(f"File '{output_path_video}' does not exist.") + + + except Exception as e: + print(f"Exception occurred while extracting segment: {e}") + # Main function def main(): - # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python). - # ``` - # . - # ├── pyavi - # │   ├── audio.wav (Audio from input video) - # │   ├── video.avi (Copy of the input video) - # │   ├── video_only.avi (Output video without audio) - # │   └── video_out.avi (Output video with audio) - # ├── pycrop (The detected face videos and audios) - # │ ├── 000000.avi - # │ ├── 000000.wav - # │ ├── 000001.avi - # │ ├── 000001.wav - # │ └── ... - # ├── pyframes (All the video frames in this video) - # │ ├── 000001.jpg - # │ ├── 000002.jpg - # │ └── ... - # └── pywork - # ├── faces.pckl (face detection result) - # ├── scene.pckl (scene detection result) - # ├── scores.pckl (ASD result) - # └── tracks.pckl (face tracking result) - # ``` - - # Initialization - args.pyaviPath = os.path.join(args.savePath, 'pyavi') - args.pyframesPath = os.path.join(args.savePath, 'pyframes') - args.pyworkPath = os.path.join(args.savePath, 'pywork') - args.pycropPath = os.path.join(args.savePath, 'pycrop') - if os.path.exists(args.savePath): - rmtree(args.savePath) - os.makedirs(args.pyaviPath, exist_ok = True) # The path for the input video, input audio, output video - os.makedirs(args.pyframesPath, exist_ok = True) # Save all the video frames - os.makedirs(args.pyworkPath, exist_ok = True) # Save the results in this process by the pckl method - os.makedirs(args.pycropPath, exist_ok = True) # Save the detected face clips (audio+video) in this process - - # Extract video - args.videoFilePath = os.path.join(args.pyaviPath, 'video.avi') - # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration' - if args.duration == 0: - command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic" % \ - (args.videoPath, args.nDataLoaderThread, args.videoFilePath)) - else: - command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic" % \ - (args.videoPath, args.nDataLoaderThread, args.start, args.start + args.duration, args.videoFilePath)) - subprocess.call(command, shell=True, stdout=None) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the video and save in %s \r\n" %(args.videoFilePath)) - - # Extract audio - args.audioFilePath = os.path.join(args.pyaviPath, 'audio.wav') - command = ("ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % \ - (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)) - subprocess.call(command, shell=True, stdout=None) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the audio and save in %s \r\n" %(args.audioFilePath)) - - # Extract the video frames - command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % \ - (args.videoFilePath, args.nDataLoaderThread, os.path.join(args.pyframesPath, '%06d.jpg'))) - subprocess.call(command, shell=True, stdout=None) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the frames and save in %s \r\n" %(args.pyframesPath)) - - # Scene detection for the video frames - scene = scene_detect(args) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scene 
detection and save in %s \r\n" %(args.pyworkPath)) - - # Face detection for the video frames - faces = inference_video(args) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face detection and save in %s \r\n" %(args.pyworkPath)) - - # Face tracking - allTracks, vidTracks = [], [] - for shot in scene: - if shot[1].frame_num - shot[0].frame_num >= args.minTrack: # Discard the shot frames less than minTrack frames - allTracks.extend(track_shot(args, faces[shot[0].frame_num:shot[1].frame_num])) # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face track and detected %d tracks \r\n" %len(allTracks)) - - # Face clips cropping - for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)): - vidTracks.append(crop_video(args, track, os.path.join(args.pycropPath, '%05d'%ii))) - savePath = os.path.join(args.pyworkPath, 'tracks.pckl') - with open(savePath, 'wb') as fil: - pickle.dump(vidTracks, fil) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face Crop and saved in %s tracks \r\n" %args.pycropPath) - fil = open(savePath, 'rb') - vidTracks = pickle.load(fil) - - # Active Speaker Detection by TalkNet - files = glob.glob("%s/*.avi"%args.pycropPath) - files.sort() - scores = evaluate_network(files, args) - savePath = os.path.join(args.pyworkPath, 'scores.pckl') - with open(savePath, 'wb') as fil: - pickle.dump(scores, fil) - sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scores extracted and saved in %s \r\n" %args.pyworkPath) - - if args.evalCol == True: - evaluate_col_ASD(vidTracks, scores, args) # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want - quit() - else: - # Visualization, save the result as the new video - visualization(vidTracks, scores, args) + # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python). + # ``` + # . + # ├── pyavi + # │   ├── audio.wav (Audio from input video) + # │   ├── video.avi (Copy of the input video) + # │   ├── video_only.avi (Output video without audio) + # │   └── video_out.avi (Output video with audio) + # ├── pycrop (The detected face videos and audios) + # │ ├── 000000.avi + # │ ├── 000000.wav + # │ ├── 000001.avi + # │ ├── 000001.wav + # │ └── ... + # ├── pyframes (All the video frames in this video) + # │ ├── 000001.jpg + # │ ├── 000002.jpg + # │ └── ... 
+ # |── pyfilter (Output clipped videos) + # └── pywork + # ├── faces.pckl (face detection result) + # ├── scene.pckl (scene detection result) + # ├── scores.pckl (ASD result) + # └── tracks.pckl (face tracking result) + # ``` + + # Initialization + args.pyaviPath = os.path.join(args.savePath, 'pyavi') + args.pyframesPath = os.path.join(args.savePath, 'pyframes') + args.pyworkPath = os.path.join(args.savePath, 'pywork') + args.pycropPath = os.path.join(args.savePath, 'pycrop') + args.pyfilteredVideo = os.path.join(args.savePath) + # args.pyfilteredAudio = os.path.join(args.savePath, 'pyfilter', 'audio') + + # if os.path.exists(args.savePath): + # rmtree(args.savePath) + + # The path for the input video, input audio, output video + os.makedirs(args.pyaviPath, exist_ok=True) + os.makedirs(args.pyframesPath, exist_ok=True) # Save all the video frames + # Save the results in this process by the pckl method + os.makedirs(args.pyworkPath, exist_ok=True) + # Save the detected face clips (audio+video) in this process + os.makedirs(args.pycropPath, exist_ok=True) + # Save the detected face clips (audio+video) in this process + os.makedirs(args.pyfilteredVideo, exist_ok=True) + # Save the detected face clips (audio+video) in this process + # os.makedirs(args.pyfilteredAudio, exist_ok=True) + + # Extract video + args.videoFilePath = os.path.join(args.pyaviPath, 'video.avi') + # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration' + if args.duration == 0: + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r %.3f %s -loglevel panic" % + (args.videoPath, args.nDataLoaderThread, args.fps, args.videoFilePath)) + else: + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r %.3f %s -loglevel panic" % + (args.videoPath, args.nDataLoaderThread, args.start, args.start + args.duration, args.fps, args.videoFilePath)) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the video and save in %s \r\n" % (args.videoFilePath)) + + # Extract audio + args.audioFilePath = os.path.join(args.pyaviPath, 'audio.wav') + command = ("ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % + (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the audio and save in %s \r\n" % (args.audioFilePath)) + + # Extract the video frames + command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % + (args.videoFilePath, args.nDataLoaderThread, os.path.join(args.pyframesPath, '%06d.jpg'))) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the frames and save in %s \r\n" % (args.pyframesPath)) + + # Scene detection for the video frames + scene = scene_detect(args) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Scene detection and save in %s \r\n" % (args.pyworkPath)) + + # Face detection for the video frames + faces = inference_video(args) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Face detection and save in %s \r\n" % (args.pyworkPath)) + + # Face tracking + allTracks, vidTracks = [], [] + for shot in scene: + # Discard the shot frames less than minTrack frames + if shot[1].frame_num - shot[0].frame_num >= args.minTrack: + # 'frames' to present this tracks' timestep, 'bbox' 
presents the location of the faces + allTracks.extend(track_shot( + args, faces[shot[0].frame_num:shot[1].frame_num])) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Face track and detected %d tracks \r\n" % len(allTracks)) + + # Face clips cropping + for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)): + vidTracks.append(crop_video( + args, track, os.path.join(args.pycropPath, '%05d' % ii))) + savePath = os.path.join(args.pyworkPath, 'tracks.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(vidTracks, fil) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Face Crop and saved in %s tracks \r\n" % args.pycropPath) + fil = open(savePath, 'rb') + vidTracks = pickle.load(fil) + + # Active Speaker Detection by TalkNet + files = glob.glob("%s/*.avi" % args.pycropPath) + files_audio = glob.glob("%s/*.wav" % args.pycropPath) + + files.sort() + scores = evaluate_network(files, args) + savePath = os.path.join(args.pyworkPath, 'scores.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(scores, fil) + sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + + " Scores extracted and saved in %s \r\n" % args.pyworkPath) + + # Frame rate of the video (assumed 25 FPS) + MIN_SEGMENT_FRAMES = 3 * args.fps # Minimum segment length in frames + MAX_SEGMENT_FRAMES = 10 * args.fps # Maximum segment length in frames + + filtered_segments = [] + count_segments = 0 + # Process each track and its corresponding score + for ii, (track, score_array) in tqdm.tqdm(enumerate(zip(allTracks, scores)), total=len(allTracks)): + start_frame = None + end_frame = None + segment_frames = [] + + for frame_idx, score in enumerate(score_array): + + if score > 0: + frame_number = track['frame'][frame_idx] + frame_path = os.path.join(args.pyframesPath, f"{(frame_number+1):06d}.jpg") + image = cv2.imread(frame_path) + + # Check if the image was loaded successfully + if image is None: + print(f"Warning: Frame {frame_number} could not be loaded! 
Skipping...") + continue + + # # Display the frame using Matplotlib + # plt.imshow(image) + # plt.axis("off") # Turn off axes for a cleaner display + # plt.show() + + results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + img_h, img_w, _ = image.shape + face_2d = [] + face_3d = [] + y=0 + if results.multi_face_landmarks: + for face_landmarks in results.multi_face_landmarks: + for idx, lm in enumerate(face_landmarks.landmark): + if idx == 33 or idx == 263 or idx ==1 or idx == 61 or idx == 291 or idx==199: + if idx ==1: + nose_2d = (lm.x * img_w,lm.y * img_h) + nose_3d = (lm.x * img_w,lm.y * img_h,lm.z * 3000) + x,y = int(lm.x * img_w),int(lm.y * img_h) + + face_2d.append([x,y]) + face_3d.append(([x,y,lm.z])) + + #Get 2d, 3d Coord + face_2d = numpy.array(face_2d,dtype=numpy.float64) + face_3d = numpy.array(face_3d,dtype=numpy.float64) + + # Camera matrix (intrinsic parameters) + focal_length = 1 * img_w + cam_matrix = numpy.array([[focal_length,0,img_h/2], + [0,focal_length,img_w/2], + [0,0,1]]) + # No lens distortion + distortion_matrix = numpy.zeros((4,1),dtype=numpy.float64) + + # SolvePnP to calculate rotation and translation vectors + _,rotation_vec,_ = cv2.solvePnP(face_3d,face_2d,cam_matrix,distortion_matrix) + + #getting rotational of face + rmat,_ = cv2.Rodrigues(rotation_vec) + + angles,_,_,_,_,_ = cv2.RQDecomp3x3(rmat) + y = angles[1] * 360 + + + if abs(y) < args.angleThreshold: + # Start a new segment if not already started + if start_frame is None: + start_frame = frame_idx + end_frame = frame_idx + + # Check if segment length exceeds the maximum allowed duration + if (end_frame - start_frame + 1) > MAX_SEGMENT_FRAMES: + # Save the current valid segment + segment_frames.append((start_frame, end_frame)) + start_frame = None # Reset for next segment + else: + # End the current segment if the score is not positive + if start_frame is not None: + # Save only if segment is long enough + if (end_frame - start_frame + 1) >= MIN_SEGMENT_FRAMES: + segment_frames.append((start_frame, end_frame)) + start_frame = None + + else: + # End the current segment if the score is not positive + if start_frame is not None: + # Save only if segment is long enough + if (end_frame - start_frame + 1) >= MIN_SEGMENT_FRAMES: + segment_frames.append((start_frame, end_frame)) + start_frame = None + + # Handle last segment if it ends positively + if start_frame is not None and (end_frame - start_frame + 1) >= MIN_SEGMENT_FRAMES: + segment_frames.append((start_frame, end_frame)) + + count_segments += len(segment_frames) + if segment_frames: + # Extract and save each valid segment + # for seg_idx, (seg_start, seg_end) in enumerate(segment_frames): + seg_idx = 0 + seg_start, seg_end = segment_frames[0] + # segment_video_path = os.path.join( + # args.pyfilteredVideo, f"{args.videoName}_track_{ii:05d}_segment_{seg_idx:02d}.avi") + segment_video_path = os.path.join( + args.pyfilteredVideo, f"{ii:05d}.avi") + # segment_audio_path = os.path.join( + # args.pyfilteredAudio, f"{args.videoName}_track_{ii:05d}_segment_{seg_idx:02d}.wav") + track_path = os.path.join(args.pycropPath, '%05d' % ii) + extract_segment(track_path, seg_start+10, seg_end-10, segment_video_path, f"{ii:05d}.avi") + + # Extract middle frame for age and gender prediction + middle_frame = (seg_start + seg_end) // 2 + frame_path = os.path.join(args.pyframesPath, f"{(middle_frame + 1):06d}.jpg") + image = cv2.imread(frame_path) + + if image is not None: + faces = face_app.get(image) + if faces: + face = faces[0] # Assume single face + 
age = face.age + gender = 'male' if face.gender == 1 else 'female' + + # Save age and gender in JSON + metadata = {"age": age, "gender": gender} + json_path = os.path.join(args.pyfilteredVideo, f"{ii:05d}.json") + with open(json_path, 'w') as json_file: + json.dump(metadata, json_file) + + s3_object_key = f"{args.channelName}/{args.video_id}/{ii:05d}.json" + upload_file_to_s3(json_path, args.bucketName, s3_object_key) + + # Check if the file exists + if os.path.exists(json_path): + os.remove(json_path) + print(f"File '{json_path}' has been deleted successfully.") + else: + print(f"File '{json_path}' does not exist.") + + filtered_segments.append(segment_video_path) + + print("Found ", count_segments, " Segments") + # Save filtered segments metadata + savePath = os.path.join(args.pyworkPath, 'filtered_segments.pckl') + with open(savePath, 'wb') as fil: + pickle.dump(filtered_segments, fil) + sys.stderr.write( + f"{time.strftime('%Y-%m-%d %H:%M:%S')} Filtered segments saved in {savePath}\n") + + # if args.evalCol == True: + # # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want + # evaluate_col_ASD(vidTracks, scores, args) + # quit() + # else: + # # Visualization, save the result as the new video + # visualization(vidTracks, scores, args) + + # At the end of the main function + folders_to_keep = [] + folders_to_delete = [args.pyfilteredVideo, args.pyaviPath ,args.pyframesPath, args.pyworkPath, args.pycropPath] + # folders_to_keep = [args.pyfilteredVideo, args.pyaviPath ,args.pyframesPath, args.pyworkPath, args.pycropPath] + # folders_to_delete = [] + + for folder in folders_to_delete: + if folder not in folders_to_keep and os.path.exists(folder): + rmtree(folder) + sys.stderr.write( + f"{time.strftime('%Y-%m-%d %H:%M:%S')} Removed unnecessary folders after processing.\n") + if __name__ == '__main__': - main() + # profiler = cProfile.Profile() + # profiler.enable() + main() # Run your script here + # profiler.disable() + + # Save the profiling data to a file + # profiler.dump_stats("profiling_results.prof") + + # # Optional: Print profiling stats to the console + # stats = pstats.Stats(profiler) + # stats.strip_dirs() + # stats.sort_stats("cumulative") # Sort by cumulative time + # stats.print_stats(20) # Print the top 20 time-consuming functions diff --git a/fileTransfer.py b/fileTransfer.py new file mode 100644 index 0000000..7988a69 --- /dev/null +++ b/fileTransfer.py @@ -0,0 +1,29 @@ +import boto3 +import os + +# Define the bucket name and local folder path +bucket_name = "scenespart1" +local_folder_path = "/home/ubuntu/ZeeNews" # Folder to upload +s3_object_key_prefix = "croppedData/ZeeNews_output/" # S3 destination prefix + +# Create an S3 client +s3_client = boto3.client("s3") + +def upload_folder_to_s3(local_folder, bucket, s3_prefix): + try: + for root, dirs, files in os.walk(local_folder): + for file in files: + local_file_path = os.path.join(root, file) + # Create the relative path for S3 object key + relative_path = os.path.relpath(local_file_path, local_folder) + s3_object_key = os.path.join(s3_prefix, relative_path) + + # Upload the file + s3_client.upload_file(local_file_path, bucket, s3_object_key) + print(f"Uploaded {local_file_path} to s3://{bucket}/{s3_object_key}") + + except Exception as e: + print(f"Error uploading folder to S3: {e}") + +# Call the function to upload the folder +upload_folder_to_s3(local_folder_path, bucket_name, s3_object_key_prefix) diff --git a/fileTransferFolder.py 
b/fileTransferFolder.py new file mode 100644 index 0000000..7988a69 --- /dev/null +++ b/fileTransferFolder.py @@ -0,0 +1,29 @@ +import boto3 +import os + +# Define the bucket name and local folder path +bucket_name = "scenespart1" +local_folder_path = "/home/ubuntu/ZeeNews" # Folder to upload +s3_object_key_prefix = "croppedData/ZeeNews_output/" # S3 destination prefix + +# Create an S3 client +s3_client = boto3.client("s3") + +def upload_folder_to_s3(local_folder, bucket, s3_prefix): + try: + for root, dirs, files in os.walk(local_folder): + for file in files: + local_file_path = os.path.join(root, file) + # Create the relative path for S3 object key + relative_path = os.path.relpath(local_file_path, local_folder) + s3_object_key = os.path.join(s3_prefix, relative_path) + + # Upload the file + s3_client.upload_file(local_file_path, bucket, s3_object_key) + print(f"Uploaded {local_file_path} to s3://{bucket}/{s3_object_key}") + + except Exception as e: + print(f"Error uploading folder to S3: {e}") + +# Call the function to upload the folder +upload_folder_to_s3(local_folder_path, bucket_name, s3_object_key_prefix) diff --git a/model/faceDetector/s3fd/box_utils.py b/model/faceDetector/s3fd/box_utils.py index 0779bcd..1bf4be2 100755 --- a/model/faceDetector/s3fd/box_utils.py +++ b/model/faceDetector/s3fd/box_utils.py @@ -35,7 +35,7 @@ def nms_(dets, thresh): inds = np.where(ovr <= thresh)[0] order = order[inds + 1] - return np.array(keep).astype(np.int) + return np.array(keep).astype(int) def decode(loc, priors, variances): diff --git a/model/faceDetector/yolov11n-face.pt b/model/faceDetector/yolov11n-face.pt new file mode 100644 index 0000000..2059cd2 Binary files /dev/null and b/model/faceDetector/yolov11n-face.pt differ diff --git a/requirement.txt b/requirement.txt index ccbc4a9..6a55e36 100755 --- a/requirement.txt +++ b/requirement.txt @@ -10,4 +10,5 @@ python_speech_features torchvision ffmpeg gdown -youtube-dl \ No newline at end of file +youtube-dl +mediapipe==0.10.9 \ No newline at end of file diff --git a/s3_uploader.py b/s3_uploader.py new file mode 100644 index 0000000..e0d5985 --- /dev/null +++ b/s3_uploader.py @@ -0,0 +1,19 @@ +import boto3 + +def upload_file_to_s3(local_file, bucket, s3_key): + """ + Upload a single file to an S3 bucket. + + :param local_file: Path to the local file. + :param bucket: Name of the S3 bucket. + :param s3_key: Path and name of the file in the S3 bucket. + """ + # Create an S3 client + s3_client = boto3.client("s3") + + try: + # Upload the file + s3_client.upload_file(local_file, bucket, s3_key) + print(f"Uploaded {local_file} to s3://{bucket}/{s3_key}") + except Exception as e: + print(f"Error uploading file: {e}")
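
Note on the upload helper: upload_file_to_s3 only prints on failure, while extract_segment in demoTalkNet.py deletes the local clip right after calling it, so a failed upload would silently lose the segment. Below is a minimal sketch, not part of this patch, of a variant that reports success so callers can keep the local file when the upload fails; it assumes only boto3/botocore, which s3_uploader.py already depends on.

import boto3
from botocore.exceptions import BotoCoreError, ClientError

def upload_file_to_s3(local_file, bucket, s3_key):
    """Upload a single file to S3 and return True on success, False on failure."""
    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(local_file, bucket, s3_key)
        print(f"Uploaded {local_file} to s3://{bucket}/{s3_key}")
        return True
    except (BotoCoreError, ClientError) as e:
        print(f"Error uploading file: {e}")
        return False

# In extract_segment, the local clip would then only be removed after a successful upload, e.g.:
# if upload_file_to_s3(output_path_video, args.bucketName, s3_object_key):
#     os.remove(output_path_video)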