-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
45 lines (28 loc) · 1.3 KB
/
main.py
File metadata and controls
45 lines (28 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from src.text_extractor import extract_text_from_pdf
from src.table_extractor import extract_tables_from_pdf
from pathlib import Path
# Batch-extract text and tables from every PDF found in the input directory.
# For each "<name>.pdf" this writes "<name>_text.txt" plus one
# "<name>_table_<k>.csv" per extracted table into the output directory.

# Where the source PDFs are read from and where results are written.
input_dir = Path("data/inputs")
output_dir = Path("data/outputs")

# Create the output directory (and any missing parents) before writing to it;
# an already-existing directory is not an error.
output_dir.mkdir(parents=True, exist_ok=True)

# PDFs located directly inside the input directory (the pattern is not recursive).
pdf_files = input_dir.glob("*.pdf")

for pdf_path in pdf_files:
    # The stem (filename without extension) prefixes every output file name.
    base_name = pdf_path.stem

    # Both extractors run before any output file is opened.
    # NOTE(review): `pages` is presumably an iterable of per-page strings and
    # `tables` an iterable of pandas DataFrames -- confirm against the helpers
    # in src.text_extractor / src.table_extractor.
    pages = extract_text_from_pdf(pdf_path)
    tables = extract_tables_from_pdf(pdf_path, pages='all')

    # Text output: every page's text followed by a blank-line separator.
    text_output_path = output_dir / f"{base_name}_text.txt"
    with text_output_path.open("w", encoding="utf-8") as f:
        f.writelines(page_text + "\n\n" for page_text in pages)

    # Table output: one CSV per table, numbered from 1, written with ';' as
    # the field separator and ',' as the decimal mark, without the index column.
    for table_no, df in enumerate(tables, start=1):
        table_output_path = output_dir / f"{base_name}_table_{table_no}.csv"
        df.to_csv(table_output_path, sep=";", decimal=",", index=False)