Skip to content

Commit 286103c

Browse files
Merge pull request #2 from IDinsight/ingestion-tidy
More ingestion tidying
2 parents 0742cd6 + f0bf7ad commit 286103c

File tree

3 files changed

+42
-15
lines changed

3 files changed

+42
-15
lines changed

backend/app/ingestion/models.py

+9-14
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,6 @@ class DocumentDB(Base):
8989
countries: Mapped[Optional[list[str]]] = mapped_column(JSONB, nullable=True)
9090
organizations: Mapped[Optional[list[str]]] = mapped_column(JSONB, nullable=True)
9191
regions: Mapped[Optional[list[str]]] = mapped_column(JSONB, nullable=True)
92-
notes: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
9392
drive_link: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
9493
year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
9594
date_added: Mapped[Optional[datetime]] = mapped_column(
@@ -108,8 +107,8 @@ async def save_document_to_db(
108107
asession: AsyncSession,
109108
metadata: dict,
110109
pdf_url: str,
111-
summary: str = None,
112-
title: str = None,
110+
summary: str,
111+
title: str,
113112
document_id: int,
114113
) -> None:
115114
"""
@@ -127,7 +126,6 @@ async def save_document_to_db(
127126
countries = metadata.get("Countries", [])
128127
organizations = metadata.get("Organization(s)", [])
129128
regions = metadata.get("Region(s)", [])
130-
notes = metadata.get("Notes", "")
131129
drive_link = metadata.get("Drive link", "")
132130
year = metadata.get("Year", None)
133131
document_id = metadata.get("ID", None)
@@ -152,9 +150,8 @@ async def save_document_to_db(
152150
countries=countries,
153151
organizations=organizations,
154152
regions=regions,
155-
notes=notes,
156153
drive_link=drive_link,
157-
year=metadata.get("Year"),
154+
year=year,
158155
date_added=date_added,
159156
document_id=document_id, # Using the document_id passed to the function
160157
pdf_url=pdf_url,
@@ -171,7 +168,7 @@ async def save_document_to_db(
171168
extracted_qa_pairs = []
172169

173170
qa_pairs = []
174-
for qa_idx, qa_pair in enumerate(extracted_qa_pairs):
171+
for _, qa_pair in enumerate(extracted_qa_pairs):
175172
try:
176173
question = qa_pair.get("question", "")
177174
answers = qa_pair.get("answers", [])
@@ -220,13 +217,11 @@ async def save_single_document(metadata: Dict[str, Any]) -> bool:
220217
file_name=metadata["file_name"],
221218
processed_pages=metadata["processed_pages"],
222219
asession=asession,
223-
metadata=metadata.get("fields", {}),
224-
pdf_url=metadata.get("pdf_url", ""), # Use empty string as fallback
225-
title=metadata.get("survey_name"),
226-
summary=metadata.get("summary"),
227-
document_id=metadata.get(
228-
"document_id"
229-
), # Pass document_id from metadata
220+
metadata=metadata["fields"],
221+
pdf_url=metadata["pdf_url"],
222+
title=metadata["survey_name"],
223+
summary=metadata["summary"],
224+
document_id=metadata["document_id"],
230225
)
231226
logger.info(
232227
f"File '{metadata['file_name']}' processed and saved successfully."

backend/app/ingestion/routers.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232

3333

3434
@router.post("/airtable/refresh", response_model=AirtableIngestionResponse)
35-
async def airtable_refresh_and_ingest():
35+
async def airtable_refresh_and_ingest() -> AirtableIngestionResponse:
3636
"""
3737
Refresh the list of documents by comparing Airtable 'ID' fields with database
3838
'document_id's. Automatically ingest the missing documents.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,32 @@
1+
"""remove_notes_column
2+
3+
Revision ID: 62342bab6861
4+
Revises: aaa104d6c3ed
5+
Create Date: 2025-03-05 11:34:59.706611
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "62342bab6861"
16+
down_revision: Union[str, None] = "aaa104d6c3ed"
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
22+
# ### commands auto generated by Alembic - please adjust! ###
23+
op.drop_column("documents", "notes")
24+
# ### end Alembic commands ###
25+
26+
27+
def downgrade() -> None:
28+
# ### commands auto generated by Alembic - please adjust! ###
29+
op.add_column(
30+
"documents", sa.Column("notes", sa.TEXT(), autoincrement=False, nullable=True)
31+
)
32+
# ### end Alembic commands ###

0 commit comments

Comments
 (0)