Skip to content

Commit b4427e1

Browse files
working version of document ingestion
1 parent f49532f commit b4427e1

File tree

5 files changed

+70
-38
lines changed

5 files changed

+70
-38
lines changed

core_backend/app/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
contents,
1717
dashboard,
1818
data_api,
19+
docmuncher,
1920
question_answer,
2021
tags,
2122
urgency_detection,
@@ -161,6 +162,7 @@ def create_app() -> FastAPI:
161162
app.include_router(contents.router)
162163
app.include_router(dashboard.router)
163164
app.include_router(data_api.router)
165+
app.include_router(docmuncher.router)
164166
app.include_router(question_answer.router)
165167
app.include_router(tags.router)
166168
app.include_router(urgency_detection.router)
+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Package initialization for the FastAPI application.
2+
3+
This module imports and exposes key components required for API routing, including the
4+
main FastAPI router and metadata tags used for API documentation.
5+
6+
Exports:
7+
- `router`: The main FastAPI APIRouter instance containing all route definitions.
8+
- `TAG_METADATA`: Metadata describing API tags for better documentation.
9+
10+
These components can be imported directly from the package for use in the application.
11+
"""
12+
13+
from .routers import TAG_METADATA, router
14+
15+
__all__ = ["router", "TAG_METADATA"]

core_backend/app/docmuncher/dependencies.py

+39-30
Original file line numberDiff line numberDiff line change
@@ -149,28 +149,34 @@ async def convert_markdown_chunks_to_cards(
149149
If the conversion fails.
150150
"""
151151
for header_split in md_header_splits:
152-
try:
153-
card = ContentCreate(
154-
content_metadata=header_split.metadata,
155-
context_text=header_split.page_content,
156-
context_title="--".join(
152+
num_sub_chunks = int(len(header_split.page_content) / 2000 + 1)
153+
for i in range(num_sub_chunks):
154+
try:
155+
title = "--".join(
157156
[str(v) for v in header_split.metadata.values()]
158157
+ [header_split.page_content[:10]]
159-
),
160-
context_tags=[tag_id],
161-
)
162-
await save_content_to_db(
163-
asession=asession,
164-
content=card,
165-
exclude_archived=True,
166-
workspace_id=workspace_id,
167-
)
168-
except Exception as e:
169-
# TODO: this is a dumb way to handle errors in card creation
170-
raise HTTPException(
171-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
172-
detail=f"Failed to process PDF file: {e}",
173-
) from e
158+
)
159+
metadata = header_split.metadata
160+
metadata["sub_chunk"] = i
161+
162+
card = ContentCreate(
163+
content_text=header_split.page_content[i * 2000 : (i + 1) * 2000],
164+
content_title=title,
165+
content_metadata=metadata,
166+
context_tags=[tag_id],
167+
)
168+
await save_content_to_db(
169+
asession=asession,
170+
content=card,
171+
exclude_archived=True,
172+
workspace_id=workspace_id,
173+
)
174+
except Exception as e:
175+
# TODO: this is a dumb way to handle errors in card creation
176+
raise HTTPException(
177+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
178+
detail=f"Failed to process PDF file: {e}",
179+
) from e
174180
return {"detail": "Cards saved successfully"}
175181

176182

@@ -207,15 +213,18 @@ async def process_pdf_file(
207213
"""
208214
# Update redis state operations
209215
redis = request.app.state.redis
210-
job_status = redis.get(task_id)
211-
if job_status is None:
216+
job_status = await redis.get(task_id)
217+
if not job_status:
212218
raise HTTPException(
213219
status_code=status.HTTP_404_NOT_FOUND,
214220
detail="Job not found",
215221
)
216-
job_status = DocUploadResponse.model_validate(job_status)
217-
job_status.status = DocStatusEnum.in_progress
218-
redis.set(task_id, job_status.model_dump_json())
222+
223+
job_status_pydantic = DocUploadResponse.model_validate(
224+
json.loads(job_status.decode("utf-8"))
225+
)
226+
job_status_pydantic.status = DocStatusEnum.in_progress
227+
await redis.set(task_id, job_status_pydantic.model_dump_json())
219228

220229
# Process PDF
221230
try:
@@ -229,14 +238,14 @@ async def process_pdf_file(
229238
)
230239

231240
except Exception as e:
232-
job_status.status = DocStatusEnum.failed
233-
redis.set(task_id, job_status.model_dump_json())
241+
job_status_pydantic.status = DocStatusEnum.failed
242+
await redis.set(task_id, job_status_pydantic.model_dump_json())
234243
raise HTTPException(
235244
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
236245
detail=f"Failed to process PDF file: {e}",
237246
) from e
238-
finally:
239-
job_status.status = DocStatusEnum.success
240-
redis.set(task_id, job_status.model_dump_json())
247+
248+
job_status_pydantic.status = DocStatusEnum.success
249+
await redis.set(task_id, job_status_pydantic.model_dump_json())
241250

242251
return job_status

core_backend/app/docmuncher/routers.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
from fastapi import (
55
APIRouter,
6-
BackgroundTasks,
76
Depends,
87
File,
98
HTTPException,
@@ -34,18 +33,18 @@
3433
"description": "_Requires user login._ Document management to create content",
3534
}
3635

37-
router = APIRouter(prefix="/content", tags=[TAG_METADATA["name"]])
36+
router = APIRouter(prefix="/docmuncher", tags=[TAG_METADATA["name"]])
3837
logger = setup_logger()
3938

4039

4140
@router.post("/upload", response_model=DocUploadResponse)
4241
async def upload_document(
4342
request: Request,
43+
# background_tasks: BackgroundTasks,
4444
file: Annotated[UploadFile, File(...)],
4545
calling_user_db: Annotated[UserDB, Depends(get_current_user)],
4646
workspace_name: Annotated[str, Depends(get_current_workspace_name)],
4747
asession: AsyncSession = Depends(get_async_session),
48-
background_tasks: BackgroundTasks = Depends(),
4948
) -> DocUploadResponse:
5049
"""Upload document to create content.
5150
@@ -119,26 +118,34 @@ async def upload_document(
119118
created_datetime_utc=created_datetime_utc,
120119
status=DocStatusEnum.not_started,
121120
)
122-
redis.set(task_id, task_status.model_dump_json())
121+
await redis.set(task_id, task_status.model_dump_json())
123122

124123
# Start background task
125-
background_tasks.add_task(
126-
process_pdf_file,
124+
await process_pdf_file(
127125
request=request,
128126
task_id=task_id,
129127
file=file,
130128
tag_id=tag_db.tag_id,
131129
workspace_id=workspace_db.workspace_id,
132130
asession=asession,
133131
)
132+
# background_tasks.add_task(
133+
# process_pdf_file,
134+
# request=request,
135+
# task_id=task_id,
136+
# file=file,
137+
# tag_id=tag_db.tag_id,
138+
# workspace_id=workspace_db.workspace_id,
139+
# asession=asession,
140+
# )
134141

135142
return task_status
136143

137144

138145
@router.get("/status", response_model=DocUploadResponse)
139146
async def get_doc_ingestion_status(
140147
request: Request,
141-
ingestion_job_id: str,
148+
ingestion_job_id: int,
142149
calling_user_db: Annotated[UserDB, Depends(get_current_user)],
143150
workspace_name: Annotated[str, Depends(get_current_workspace_name)],
144151
asession: AsyncSession = Depends(get_async_session),

core_backend/app/docmuncher/schemas.py

-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ class DocStatusEnum(str, Enum):
1616
class DocUploadResponse(BaseModel):
1717
"""Pydantic model for document upload response."""
1818

19-
doc_id: int
2019
doc_name: str
2120
ingestion_job_id: int
2221
created_datetime_utc: datetime

0 commit comments

Comments
 (0)