@@ -149,28 +149,34 @@ async def convert_markdown_chunks_to_cards(
         If the conversion fails.
     """
     for header_split in md_header_splits:
-        try:
-            card = ContentCreate(
-                content_metadata=header_split.metadata,
-                context_text=header_split.page_content,
-                context_title="--".join(
+        num_sub_chunks = int(len(header_split.page_content) / 2000 + 1)
+        for i in range(num_sub_chunks):
+            try:
+                title = "--".join(
                     [str(v) for v in header_split.metadata.values()]
                     + [header_split.page_content[:10]]
-                ),
-                context_tags=[tag_id],
-            )
-            await save_content_to_db(
-                asession=asession,
-                content=card,
-                exclude_archived=True,
-                workspace_id=workspace_id,
-            )
-        except Exception as e:
-            # TODO: this is a dumb way to handle errors in card creation
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Failed to process PDF file: {e}",
-            ) from e
+                )
+                metadata = header_split.metadata
+                metadata["sub_chunk"] = i
+
+                card = ContentCreate(
+                    content_text=header_split.page_content[i * 2000 : (i + 1) * 2000],
+                    content_title=title,
+                    content_metadata=metadata,
+                    context_tags=[tag_id],
+                )
+                await save_content_to_db(
+                    asession=asession,
+                    content=card,
+                    exclude_archived=True,
+                    workspace_id=workspace_id,
+                )
+            except Exception as e:
+                # TODO: this is a dumb way to handle errors in card creation
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail=f"Failed to process PDF file: {e}",
+                ) from e
 
     return {"detail": "Cards saved successfully"}
 
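The sub-chunk arithmetic in this hunk is worth pinning down. A minimal sketch, assuming the same fixed 2000-character window; `CHUNK_SIZE` and `split_into_sub_chunks` are hypothetical names, not part of this change:

```python
# Sketch of the sub-chunk slicing used in the loop above; CHUNK_SIZE and
# split_into_sub_chunks are illustrative names only.
CHUNK_SIZE = 2000


def split_into_sub_chunks(text: str, chunk_size: int = CHUNK_SIZE) -> list[str]:
    """Slice text into consecutive windows of at most chunk_size characters."""
    # int(len(text) / chunk_size + 1), as in the diff, yields one extra empty
    # window whenever len(text) is an exact multiple of chunk_size; ceiling
    # division sidesteps that edge case.
    num_sub_chunks = -(-len(text) // chunk_size)
    return [text[i * chunk_size : (i + 1) * chunk_size] for i in range(num_sub_chunks)]


assert split_into_sub_chunks("a" * 4000) == ["a" * 2000, "a" * 2000]
assert len(split_into_sub_chunks("a" * 4001)) == 3
assert split_into_sub_chunks("") == []
```

Separately, `metadata = header_split.metadata` aliases the split's dict rather than copying it, so every sub-chunk's card is built from the same mapping; if `ContentCreate` keeps that reference instead of copying it during validation, all cards would end up with the last `sub_chunk` value, and `dict(header_split.metadata)` would be the safer spelling.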
@@ -207,15 +213,18 @@ async def process_pdf_file(
     """
     # Update redis state operations
     redis = request.app.state.redis
-    job_status = redis.get(task_id)
-    if job_status is None:
+    job_status = await redis.get(task_id)
+    if not job_status:
         raise HTTPException(
             status_code=status.HTTP_404_NOT_FOUND,
             detail="Job not found",
         )
-    job_status = DocUploadResponse.model_validate(job_status)
-    job_status.status = DocStatusEnum.in_progress
-    redis.set(task_id, job_status.model_dump_json())
+
+    job_status_pydantic = DocUploadResponse.model_validate(
+        json.loads(job_status.decode("utf-8"))
+    )
+    job_status_pydantic.status = DocStatusEnum.in_progress
+    await redis.set(task_id, job_status_pydantic.model_dump_json())
 
     # Process PDF
     try:
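For the Redis round-trip in this hunk, a self-contained sketch, assuming a `redis.asyncio` client (which returns raw bytes by default) and a Pydantic v2 model; `JobStatus` and `load_job_status` are stand-in names, not the real `DocUploadResponse` schema:

```python
import json

from pydantic import BaseModel
from redis import asyncio as aioredis


class JobStatus(BaseModel):
    """Stand-in for DocUploadResponse; the real fields may differ."""

    task_id: str
    status: str


async def load_job_status(redis: aioredis.Redis, task_id: str) -> JobStatus | None:
    raw = await redis.get(task_id)  # None when the key does not exist
    if not raw:
        return None
    # GET returns bytes unless the client was built with decode_responses=True,
    # hence the explicit decode before json.loads / model_validate.
    return JobStatus.model_validate(json.loads(raw.decode("utf-8")))
```

Pydantic v2 also exposes `model_validate_json`, which accepts the raw `bytes` directly and would fold the decode and `json.loads` steps into one call.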
@@ -229,14 +238,14 @@ async def process_pdf_file(
         )
 
     except Exception as e:
-        job_status.status = DocStatusEnum.failed
-        redis.set(task_id, job_status.model_dump_json())
+        job_status_pydantic.status = DocStatusEnum.failed
+        await redis.set(task_id, job_status_pydantic.model_dump_json())
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Failed to process PDF file: {e}",
         ) from e
-    finally:
-        job_status.status = DocStatusEnum.success
-        redis.set(task_id, job_status.model_dump_json())
+
+    job_status_pydantic.status = DocStatusEnum.success
+    await redis.set(task_id, job_status_pydantic.model_dump_json())
 
     return job_status
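The status bookkeeping here hinges on a `try`/`except` subtlety: a `finally` clause runs even while the `except` branch is re-raising, so a success write placed there would stamp success over a job that just failed, which is why the success write belongs on the fall-through path. A runnable toy, with `FakeRedis`, `run_job`, and the plain string statuses as purely illustrative stand-ins:

```python
import asyncio


class FakeRedis:
    """In-memory stand-in so the control flow can be exercised as-is."""

    def __init__(self) -> None:
        self.store: dict[str, str] = {}

    async def set(self, key: str, value: str) -> None:
        self.store[key] = value


async def run_job(redis: FakeRedis, task_id: str, work) -> None:
    try:
        await work()
    except Exception:
        await redis.set(task_id, "failed")
        raise
    # Reached only when work() did not raise; a finally block here would
    # also execute during the re-raise above and overwrite "failed".
    await redis.set(task_id, "success")


async def main() -> None:
    redis = FakeRedis()

    async def boom() -> None:
        raise RuntimeError("simulated failure")

    try:
        await run_job(redis, "job-1", boom)
    except RuntimeError:
        pass
    assert redis.store["job-1"] == "failed"


asyncio.run(main())
```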