diff --git a/app/api/v1/search/controller.py b/app/api/v1/search/controller.py index d70a431..574afca 100644 --- a/app/api/v1/search/controller.py +++ b/app/api/v1/search/controller.py @@ -36,8 +36,71 @@ def search_transcriptions_by_video(self, search: SearchDTO): } }, ] + + # Aggregation for expanded transcription within each transcription + expanded_transcription_stage = { + "$lookup": { + "from": "videoTranscriptions", # Assuming collection name is videoTranscriptions + "let": {"videoId": "$video.videoId", "startTime": "$start"}, + "pipeline": [ + { + "$match": { + "$expr": { + "$and": [ + {"$eq": ["$video.videoId", "$$videoId"]}, + { + "$gt": [ + "$start", + {"$subtract": ["$$startTime", 10]}, + ] + }, + { + "$lt": [ + "$start", + {"$add": ["$$startTime", 10]}, + ] + }, + ] + } + } + }, + {"$sort": {"start": 1}}, + { + "$group": { + "_id": "$video.videoId", + "merged_start": {"$first": "$start"}, + "merged_end": { + "$last": {"$add": ["$start", "$duration"]} + }, + "merged_text": {"$push": "$text"}, + "merged_duration": {"$sum": "$duration"}, + } + }, + { + "$project": { + "_id": 0, + "start": "$merged_start", + "end": "$merged_end", + "text": { + "$reduce": { + "input": "$merged_text", + "initialValue": "", + "in": {"$concat": ["$$value", " ", "$$this"]}, + } + }, + "duration": "$merged_duration", + } + }, + ], + "as": "expanded_transcription", + } + } + # Create the text search query results_pipeline = common_pipeline + [ + {"$unwind": "$transcriptions"}, + {"$addFields": {"start": "$transcriptions.start"}}, + expanded_transcription_stage, {"$sort": search.order_by}, {"$skip": skip_count}, {"$limit": search.per_page}, @@ -79,48 +142,3 @@ def search_transcriptions_by_video(self, search: SearchDTO): status=500, mimetype="application/json", ) - - def expanded_transcription(self, video_id, start_time): - - # Aggregation pipeline to find 2 before and 2 after the target transcription - pipeline = [ - { - "$match": { - "video._id": video_id, - "start": {"$gt": start_time - 10, "$lt": start_time + 10}, - } - }, - {"$sort": {"start": 1}}, # Ensure documents are sorted by start time - { - "$group": { - "_id": "$video._id", # Grouping by videoId, adjust if you need different granularity - "merged_start": {"$first": "$start"}, - "merged_end": {"$last": {"$add": ["$start", "$duration"]}}, - "merged_text": {"$push": "$text"}, - "merged_duration": {"$sum": "$duration"}, - } - }, - { - "$project": { - "_id": 0, - "start": "$merged_start", - "end": "$merged_end", - "text": { - "$reduce": { - "input": "$merged_text", - "initialValue": "", - "in": {"$concat": ["$$value", " ", "$$this"]}, - } - }, - "duration": "$merged_duration", - } - }, - ] - - transcriptions = self.search_service.aggregate_transcriptions(pipeline=pipeline) - if len(transcriptions) > 0: - return jsonify(transcriptions) - return ( - jsonify({"error": "No transcriptions found or error in aggregation"}), - 404, - )