Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 63 additions & 45 deletions app/api/v1/search/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,71 @@ def search_transcriptions_by_video(self, search: SearchDTO):
}
},
]

# Aggregation for expanded transcription within each transcription
expanded_transcription_stage = {
"$lookup": {
"from": "videoTranscriptions", # Assuming collection name is videoTranscriptions
"let": {"videoId": "$video.videoId", "startTime": "$start"},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{"$eq": ["$video.videoId", "$$videoId"]},
{
"$gt": [
"$start",
{"$subtract": ["$$startTime", 10]},
]
},
{
"$lt": [
"$start",
{"$add": ["$$startTime", 10]},
]
},
]
}
}
},
{"$sort": {"start": 1}},
{
"$group": {
"_id": "$video.videoId",
"merged_start": {"$first": "$start"},
"merged_end": {
"$last": {"$add": ["$start", "$duration"]}
},
"merged_text": {"$push": "$text"},
"merged_duration": {"$sum": "$duration"},
}
},
{
"$project": {
"_id": 0,
"start": "$merged_start",
"end": "$merged_end",
"text": {
"$reduce": {
"input": "$merged_text",
"initialValue": "",
"in": {"$concat": ["$$value", " ", "$$this"]},
}
},
"duration": "$merged_duration",
}
},
],
"as": "expanded_transcription",
}
}

# Create the text search query
results_pipeline = common_pipeline + [
{"$unwind": "$transcriptions"},
{"$addFields": {"start": "$transcriptions.start"}},
expanded_transcription_stage,
{"$sort": search.order_by},
{"$skip": skip_count},
{"$limit": search.per_page},
Expand Down Expand Up @@ -79,48 +142,3 @@ def search_transcriptions_by_video(self, search: SearchDTO):
status=500,
mimetype="application/json",
)

def expanded_transcription(self, video_id, start_time):

# Aggregation pipeline to find 2 before and 2 after the target transcription
pipeline = [
{
"$match": {
"video._id": video_id,
"start": {"$gt": start_time - 10, "$lt": start_time + 10},
}
},
{"$sort": {"start": 1}}, # Ensure documents are sorted by start time
{
"$group": {
"_id": "$video._id", # Grouping by videoId, adjust if you need different granularity
"merged_start": {"$first": "$start"},
"merged_end": {"$last": {"$add": ["$start", "$duration"]}},
"merged_text": {"$push": "$text"},
"merged_duration": {"$sum": "$duration"},
}
},
{
"$project": {
"_id": 0,
"start": "$merged_start",
"end": "$merged_end",
"text": {
"$reduce": {
"input": "$merged_text",
"initialValue": "",
"in": {"$concat": ["$$value", " ", "$$this"]},
}
},
"duration": "$merged_duration",
}
},
]

transcriptions = self.search_service.aggregate_transcriptions(pipeline=pipeline)
if len(transcriptions) > 0:
return jsonify(transcriptions)
return (
jsonify({"error": "No transcriptions found or error in aggregation"}),
404,
)