-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_function-note_gen.py
More file actions
404 lines (325 loc) · 18.9 KB
/
lambda_function-note_gen.py
File metadata and controls
404 lines (325 loc) · 18.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
import json
import os
import boto3
from supabase import create_client, Client
from google import genai # Using Google Generative AI library
# Initialize Supabase client
# The code below reads SUPABASE_URL and SUPABASE_KEY from the environment.
# NOTE(review): this comment previously named SUPABASE_SERVICE_ROLE_KEY, which is not
# the variable that is read here -- confirm that SUPABASE_KEY holds the service role key.
# os.environ.get() returns None for an unset variable, so a missing value is passed
# straight into create_client() at import time.
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)
# Initialize Gemini client
# Ensure GEMINI_API_KEY environment variable is set in Lambda configuration
# (an unset variable yields None, which is handed to genai.Client as api_key).
gemini_api_key = os.environ.get("GEMINI_API_KEY")
client = genai.Client(api_key=gemini_api_key)
def update_video_status(video_id: str, status: str, error_message: str = None):
    """Write a new ``transcription_status`` for one row of the ``videos`` table.

    Best-effort: any exception raised while talking to Supabase is printed
    and swallowed, so this function never raises and always returns ``None``.

    Args:
        video_id: Value matched against the ``id`` column of ``videos``.
        status: Value stored in ``transcription_status``.
        error_message: When truthy, also stored in the ``error_message``
            column (assumes that column exists -- TODO confirm).
    """
    print(f"Attempting to update video {video_id} status to: {status}")

    fields = {'transcription_status': status}
    if error_message:
        fields['error_message'] = error_message  # Assuming an error_message column exists

    try:
        # Goes through the module-level Supabase client.
        query = supabase.table('videos').update(fields).eq('id', video_id)
        rows = query.execute().data

        # An empty or missing result means no row was reported back as updated.
        nothing_returned = rows is None or (isinstance(rows, list) and not rows)
        if nothing_returned:
            print(f"Warning: Update video status for {video_id} to {status} might not have been successful. Response data is empty.")
            return
        print(f"Video {video_id} status updated to {status} successfully.")
    except Exception as e:
        print(f"An unexpected error occurred while updating video status for {video_id} to {status}: {e}")
def _compile_math_span():
    """Compile the regex that locates dollar-delimited math spans.

    Exactly one capture group is set on each match:

    * group 1 -- body of a display block ``$$ ... $$`` (may span lines);
    * group 2 -- body of a "tight" inline span ``$...$`` whose opening ``$``
      is followed by a non-space and whose closing ``$`` is preceded by a
      non-space and not followed by a digit, so prose such as
      ``$5 and $10`` is not mistaken for math;
    * group 3 -- body of a "padded" inline span ``$ ... $`` with whitespace
      just inside both delimiters.

    Dollars escaped as ``\\$`` never open a span.
    """
    import re

    return re.compile(
        r'(?<!\\)\$\$([\s\S]*?)\$\$'
        r'|(?<![\\$])\$(?![\s$])([^$\n]*?)(?<![\s\\])\$(?![\d$])'
        r'|(?<![\\$])\$[ \t]+([^$\n]+?)[ \t]+\$(?![\d$])'
    )


def add_spaces_around_math(content: str) -> str:
    """Separate inline math from words that touch its delimiters.

    ``"text$math$"`` becomes ``"text $math$"`` and ``"$math$text"`` becomes
    ``"$math$ text"``. A space is inserted only *outside* an inline span and
    only when the neighbouring character is a letter or digit, so
    punctuation, brackets and Markdown emphasis markers stay attached.
    Display blocks (``$$...$$``) are returned untouched and no other
    whitespace in ``content`` is modified, which keeps Markdown indentation
    intact.

    Args:
        content: Markdown text that may contain ``$``-delimited math.

    Returns:
        The text with the missing outer spaces inserted.
    """
    def pad_inline(match):
        if match.group(1) is not None:
            return match.group(0)  # display math: leave exactly as written
        text = match.string
        start, end = match.span()
        lead = ' ' if start > 0 and text[start - 1].isalnum() else ''
        trail = ' ' if end < len(text) and text[end].isalnum() else ''
        return lead + match.group(0) + trail

    return _compile_math_span().sub(pad_inline, content)


def post_process_latex_content(content: str) -> str:
    """Normalise generated Markdown+LaTeX so its math uses ``$`` / ``$$``.

    Steps, in order:

    1. ``equation`` environments become ``$$...$$``; ``align`` environments
       become ``$$\\begin{aligned}...\\end{aligned}$$`` so that ``&`` and
       ``\\\\`` remain valid. An environment that is already wrapped in
       ``$$`` is not wrapped a second time.
    2. ``\\[...\\]`` becomes ``$$...$$`` and ``\\(...\\)`` becomes ``$...$``.
       ``\\\\[`` (a LaTeX line break with an optional argument) is not
       treated as an opening delimiter.
    3. Inside math spans only: a stray backslash touching a delimiter is
       dropped, a missing backslash is restored in front of the known
       function/symbol names, and whitespace just inside inline delimiters
       is trimmed. Prose outside math, commands that merely contain a known
       name (``\\dfrac``, ``\\arcsin``) and the arguments of ``\\text{}``-style
       commands are left alone. Whitespace inside and around display
       blocks is preserved.
    4. Inline math is separated from adjacent words.
    5. ``\\documentclass`` / ``\\usepackage`` lines and the ``document``
       environment markers are removed, and runs of blank lines collapse
       to a single blank line.

    Args:
        content: Raw model output.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    import re

    math_functions = (
        'frac', 'sqrt', 'sin', 'cos', 'tan', 'log', 'ln', 'exp', 'sum', 'prod', 'int',
        'lim', 'alpha', 'beta', 'gamma', 'delta', 'pi', 'theta', 'sigma', 'omega',
        'infty', 'partial', 'nabla', 'cdot', 'times', 'pm', 'leq', 'geq', 'neq',
    )
    # Alternative 1 swallows text-mode arguments so words inside them are not
    # rewritten; alternative 2 is a known name that is neither preceded by a
    # backslash nor part of a longer word.
    bare_function = re.compile(
        r'(\\(?:text[a-z]*|mathrm|operatorname\*?|mbox)\{[^{}]*\})'
        r'|(?<![\\a-zA-Z])(' + '|'.join(math_functions) + r')(?![a-zA-Z])'
    )

    def restore_backslash(match):
        protected, name = match.groups()
        return protected if protected is not None else '\\' + name

    def clean_math_body(body):
        # A lone backslash followed by whitespace right after the opening
        # delimiter, or a lone backslash right before the closing one, is
        # noise. Real commands and `\\` line breaks do not match.
        body = re.sub(r'\A(\s*)\\(?=\s)', r'\1', body)
        body = re.sub(r'(?<!\\)\\(\s*)\Z', r'\1', body)
        return bare_function.sub(restore_backslash, body)

    def normalise_span(match):
        display, tight, padded = match.groups()
        if display is not None:
            return '$$' + clean_math_body(display) + '$$'
        inline = clean_math_body(tight if tight is not None else padded).strip()
        # Never emit an empty `$$`, which would read as a display delimiter.
        return '$' + inline + '$' if inline else match.group(0)

    def inline_from_parens(match):
        body = match.group(1).strip()
        return '$' + body + '$' if body else match.group(0)

    processed = content

    # 1. Environments -> display math. `(?(1)...)` only consumes a trailing
    #    `$$` when a leading `$$` was consumed too, so a neighbouring display
    #    block is never torn apart.
    processed = re.sub(
        r'(\$\$\s*)?\\begin\{equation\*?\}(.*?)\\end\{equation\*?\}(?(1)\s*\$\$)',
        lambda m: '$$' + m.group(2) + '$$',
        processed,
        flags=re.DOTALL,
    )
    processed = re.sub(
        r'(\$\$\s*)?\\begin\{align\*?\}(.*?)\\end\{align\*?\}(?(1)\s*\$\$)',
        lambda m: '$$\\begin{aligned}' + m.group(2) + '\\end{aligned}$$',
        processed,
        flags=re.DOTALL,
    )

    # 2. Bracket/paren delimiters -> dollar delimiters (bodies may contain
    #    LaTeX commands, i.e. backslashes).
    processed = re.sub(
        r'(?<!\\)\\\[([\s\S]*?)(?<!\\)\\\]',
        lambda m: '$$' + m.group(1) + '$$',
        processed,
    )
    processed = re.sub(
        r'(?<!\\)\\\(([\s\S]*?)(?<!\\)\\\)',
        inline_from_parens,
        processed,
    )

    # 3. Repairs restricted to the inside of math spans.
    processed = _compile_math_span().sub(normalise_span, processed)

    # 4. Spacing between inline math and surrounding words.
    processed = add_spaces_around_math(processed)

    # 5. Document-level commands have no place in a Markdown note.
    processed = re.sub(r'\\documentclass.*?\n', '', processed)
    processed = re.sub(r'\\begin\{document\}', '', processed)
    processed = re.sub(r'\\end\{document\}', '', processed)
    processed = re.sub(r'\\usepackage.*?\n', '', processed)

    # Clean up multiple newlines
    processed = re.sub(r'\n\s*\n\s*\n+', '\n\n', processed)
    return processed.strip()
def _build_generation_prompt(raw_transcript: str) -> str:
    """Return the user prompt for Gemini with ``raw_transcript`` appended.

    The template is an f-string: literal braces in the LaTeX examples are
    doubled and backslashes are escaped so the model sees ``\\frac{...}``.
    """
    return f"""
AI AGENT INSTRUCTIONS: CONVERT TRANSCRIPT TO MARKDOWN CONVERTER WITH LATEX MATH
You are a specialized AI agent responsible for converting the transcript found at the end of this message into well-structured Markdown notes. Your primary focus is creating clean, readable documentation with properly formatted mathematical expressions that render correctly in KaTeX.
CORE RESPONSIBILITIES:
1. Transform spoken content into structured, academic-style notes
- Be detailed and include all important information from the transcript. These are academic notes, not a summary.
- There is no limit to the amount of notes needed to cover all the content in the transcript.
2. Organize content with clear hierarchy and flow
3. Format mathematical expressions using proper LaTeX syntax
4. Ensure KaTeX compatibility for all math expressions
MARKDOWN STRUCTURE GUIDELINES:
- Use appropriate heading levels (`#`, `##`, `###`) to create logical document hierarchy
- Employ bullet points and numbered lists for clarity
- Add emphasis with **bold** and *italic* text where appropriate
- Include code blocks for non-mathematical code or formulas
- Use blockquotes for important definitions or key concepts
LaTeX MATH FORMATTING RULES:
1. For KaTeX Compatibility:
- Inline math: Wrap in single dollar signs `$...$`
- Display math blocks: Wrap in double dollar signs `$$...$$`
- Always place display blocks on separate lines with blank lines above and below
2. Mathematical Expression Guidelines:
- Use `\\frac{{numerator}}{{denominator}}` for fractions
- Use `^{{}}` for superscripts and `_{{}}` for subscripts
- Use `\\sqrt{{}}` for square roots, `\\sqrt[n]{{}}` for nth roots
- Use proper LaTeX function names: `\\sin`, `\\cos`, `\\log`, `\\ln`, `\\exp`
- Use `\\sum`, `\\prod`, `\\int` for summation, product, and integral symbols
- Use `\\alpha`, `\\beta`, `\\gamma`, etc. for Greek letters
- Use `\\mathbf{{}}` for bold math symbols
- Use `\\text{{}}` for text within math expressions
3. Common Math Symbols and Operators:
- `\\pm` for ±, `\\mp` for ∓
- `\\times` for ×, `\\cdot` for ·
- `\\leq` for ≤, `\\geq` for ≥
- `\\neq` for ≠, `\\approx` for ≈
- `\\infty` for ∞
- `\\partial` for partial derivatives
- `\\nabla` for gradient operator
CONTENT ORGANIZATION:
1. Title: Create a clear, descriptive title
2. Overview/Summary: Brief introduction to the topic
3. Main Sections: Organize content thematically with subheadings
4. Key Equations: Highlight important formulas in display math blocks
5. Examples: Include worked examples where applicable
6. Definitions: Clearly mark and format important definitions
QUALITY STANDARDS:
- Accuracy: Ensure all mathematical expressions are syntactically correct
- Readability: Balance detail with clarity
- Consistency: Use consistent formatting throughout
- Completeness: Don't omit important information from the transcript
EXAMPLE OUTPUT FORMAT:
```markdown
# Topic Title
## Overview
Brief description of the content covered.
## Key Concepts
### Concept 1
Explanation with inline math like $E = mc^2$ when appropriate.
Important formula:
$$
\\int_{{-\\infty}}^{{\\infty}} e^{{-x^2}} dx = \\sqrt{{\\pi}}
$$
### Concept 2
More content with proper LaTeX formatting.
## Examples
### Example 1
Step-by-step solution showing:
$$
\\frac{{d}}{{dx}}[x^n] = nx^{{n-1}}
$$
## Summary
Key takeaways and important formulas.
```
ERROR PREVENTION CHECKLIST
Before finalizing output, verify:
- All math expressions use proper LaTeX syntax
- Display math blocks are properly separated with blank lines
- Inline math doesn't break across lines
- Heading hierarchy is logical and consistent
- All mathematical symbols render correctly in KaTeX
- No raw transcript artifacts remain (e.g., "um", "uh", speaker names)
SPECIAL INSTRUCITONS:
- If the transcript contains unclear mathematical expressions, make reasonable interpretations based on context
- When in doubt about mathematical notation, choose the most standard LaTeX representation
- Preserve the logical flow and key insights from the original transcript
- Add clarifying context where the spoken word might be ambiguous in written form
- If equations are referenced verbally (e.g., "equation 1"), create numbered equations using `\\tag{{}}`
Remember: Your output will be processed by KaTeX, so all LaTeX must be compatible with KaTeX's supported functions and syntax.
THE TRANSCRIPT IS:
{raw_transcript}
"""


def _generate_note_content(raw_transcript: str) -> str:
    """Ask Gemini for Markdown+LaTeX notes and return the raw response text.

    If the first candidate reports finish reason ``SAFETY`` the request is
    repeated once with temperature 0.3 instead of 0.1.

    Raises:
        ValueError: If the final response carries no text.
        Exception: Whatever the Gemini client raises is propagated.
    """
    generation_prompt = _build_generation_prompt(raw_transcript)
    print("Sending request to Gemini 2.5 Flash for unified Markdown+LaTeX content generation...")

    # Plain (non f-) string: single braces are literal here, so the example
    # in rule 4 reads \begin{equation} instead of leaking doubled braces.
    system_instructions = """
You are a helpful assistant that generates structured academic notes in Markdown format with embedded LaTeX math expressions.
CRITICAL REQUIREMENTS:
1) MUST use proper math delimiters: single $ for inline math and double $$ for display math.
2) EVERY LaTeX function MUST start with a backslash (\\): use \\frac not frac, \\sin not sin, \\sum not sum, \\alpha not alpha.
3) ALWAYS add spaces around inline math: write 'as $x$ approaches $c$' NOT 'as$x$approaches$c$'.
4) Never use \\[...\\] or \\(...\\) or \\begin{equation}.
5) Always use KaTeX-compatible LaTeX syntax within Markdown structure.
REMEMBER: Missing backslashes will break math rendering!
"""
    # Configure generation parameters
    generation_config = genai.types.GenerateContentConfig(
        system_instruction=system_instructions,
        candidate_count=1,
        max_output_tokens=8192,  # Sufficient for detailed academic notes
        temperature=0.1,  # Low temperature for consistent, factual output
    )
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-05-20",
        contents=generation_prompt,
        config=generation_config,
    )

    # A response may carry no candidates at all; guard before indexing.
    candidates = response.candidates or []
    if candidates and candidates[0].finish_reason == 'SAFETY':
        print("[WARNING] Response was blocked by safety filters, trying with higher temperature...")
        # Retry with slightly higher temperature
        generation_config.temperature = 0.3
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",
            contents=generation_prompt,
            config=generation_config,
        )

    generated_content = response.text
    if not generated_content:
        # Fail here with a clear message instead of passing None to the
        # regex-based post-processor.
        raise ValueError("Gemini returned no text content (the response may have been blocked or empty).")
    return generated_content


def _save_note(transcript_id, user_id, generated_content: str) -> None:
    """Insert the note for ``transcript_id`` or overwrite the existing one.

    Exceptions raised by the Supabase client are propagated to the caller.
    """
    existing_note_response = (
        supabase.table('notes').select('id').eq('transcript_id', transcript_id).maybe_single().execute()
    )
    # Handles both shapes a "no row" result can take: the call returning
    # None, or a response object whose ``data`` is None/empty
    # (NOTE(review): which one occurs depends on the client version).
    existing_note = getattr(existing_note_response, 'data', None)

    if existing_note:
        note_id = existing_note['id']
        print(f"Note already exists for transcript {transcript_id}, updating note ID: {note_id}")
        supabase.table('notes').update({
            'content': generated_content,  # Save generated content
            'markdown_content': None,  # Markdown content is not saved for LaTeX notes
        }).eq('id', note_id).execute()
    else:
        print(f"No existing note for transcript {transcript_id}, inserting new note.")
        supabase.table('notes').insert({
            'transcript_id': transcript_id,
            'user_id': user_id,  # Link note to user
            'content': generated_content,  # Save generated content
            'markdown_content': None,  # Markdown content is not saved for LaTeX notes
        }).execute()


def lambda_handler(event, context):
    """Generate notes for a transcribed video and store them in Supabase.

    Reads ``videoId``, ``userId`` and the optional ``noteFormat`` from
    ``event``, fetches the transcript row for the video, asks Gemini for
    Markdown+LaTeX notes, post-processes them, inserts or updates the row
    in ``notes`` and finally sets the video status to ``completed``.

    Args:
        event: Payload dict (assumes the event itself is the payload JSON).
        context: Lambda context object; not used.

    Returns:
        A dict with ``statusCode`` and a JSON-encoded ``body``:
        200 on success, 400 when ``videoId`` or ``userId`` is missing,
        404 when no transcript exists for the video, and 500 on a Gemini,
        database or other unexpected failure. On 404 and 500 the video
        status is set to ``note_generation_failed`` when a video id is known.
    """
    print("Received note generation event:", json.dumps(event))
    video_id = None  # Stays None until the payload has been read
    transcript_id = None
    try:
        # Extract data from the event payload sent by the transcription Lambda
        payload = event  # Assuming the event is the payload JSON
        video_id = payload.get('videoId')
        user_id = payload.get('userId')
        note_format = payload.get('noteFormat', 'Markdown')  # Only used for logging below

        if not video_id or not user_id:
            print("Missing videoId or userId in payload.")
            # No status update: there may be no video id to attach it to.
            return {
                'statusCode': 400,
                'body': json.dumps('Missing videoId or userId')
            }

        print(f"Generating notes for video ID: {video_id} for user: {user_id} in format: {note_format}")

        # 1. Fetch the transcript text and transcript_id using the video_id
        # (assumes one transcript per video).
        transcript_response = (
            supabase.table('transcripts').select('id, content').eq('video_id', video_id).maybe_single().execute()
        )
        # Same defensive read as in _save_note: the "no row" result may be
        # None or a response whose data is None.
        transcript = getattr(transcript_response, 'data', None)
        if not transcript:
            print(f"No transcript found for video ID: {video_id}")
            update_video_status(video_id, 'note_generation_failed', f'No transcript found for video ID: {video_id}')
            return {
                'statusCode': 404,
                'body': json.dumps(f'No transcript found for video ID: {video_id}')
            }
        transcript_id = transcript['id']
        raw_transcript = transcript['content']
        print(f"Fetched transcript for video {video_id}, transcript ID: {transcript_id}")

        # 2. Generate the notes. Only failures of this step are reported as
        # Gemini errors; anything else falls through to the outer handler.
        try:
            generated_content = _generate_note_content(raw_transcript)
        except Exception as gemini_error:
            print(f"Error during Gemini API calls: {gemini_error}")
            update_video_status(video_id, 'note_generation_failed', f'Error during Gemini API calls: {gemini_error}')
            return {
                'statusCode': 500,
                'body': json.dumps(f'Error during Gemini API calls: {gemini_error}')
            }
        print("Generated unified Markdown+LaTeX content successfully.")

        # Post-process LaTeX content to fix common issues
        generated_content = post_process_latex_content(generated_content)
        print("Applied LaTeX post-processing.")

        # 3. Save to Supabase (notes table)
        try:
            _save_note(transcript_id, user_id, generated_content)
            # Update video status to indicate notes are generated
            update_video_status(video_id, 'completed')  # Assuming 'completed' means notes are ready
            print(f"Updated video {video_id} status to 'completed'.")
            return {
                'statusCode': 200,
                'body': json.dumps('Notes generated and saved successfully!')
            }
        except Exception as db_error:
            print(f"Error saving generated notes to Supabase: {db_error}")
            update_video_status(video_id, 'note_generation_failed', f'Error saving generated notes to Supabase: {db_error}')
            return {
                'statusCode': 500,
                'body': json.dumps(f'Error saving generated notes to Supabase: {db_error}')
            }
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        if video_id:
            update_video_status(video_id, 'note_generation_failed', f'An unexpected error occurred: {e}')
        return {
            'statusCode': 500,
            'body': json.dumps(f'An unexpected error occurred: {e}')
        }