-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_extractor.py
More file actions
101 lines (81 loc) · 3.47 KB
/
pdf_extractor.py
File metadata and controls
101 lines (81 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import tkinter as tk
from tkinter import messagebox
from typing import List, Tuple
import fitz # install with 'pip install pymupdf'
import re
def _parse_highlight(
    annot: fitz.Annot,
    wordlist: List[Tuple[float, float, float, float, str, int, int, int]],
) -> str:
    """Return the words of *wordlist* that lie under the highlight *annot*.

    Args:
        annot: Annotation whose ``vertices`` are consumed four points at a
            time, each group of four being treated as one highlighted quad.
        wordlist: Word tuples as produced by ``page.get_text("words")``; the
            first four items are used as the word's bounding rectangle and
            item 4 as its text.

    Returns:
        The matching words joined by single spaces, in ``wordlist`` order
        within each quad and in quad order overall. An empty string when the
        annotation has no vertices or no word intersects any quad.
    """
    points = annot.vertices
    if not points:
        # Annotations without vertex data would otherwise fail on len(None).
        return ""

    # Build each word's rectangle once instead of once per quad.
    word_boxes = [(fitz.Rect(w[:4]), w[4]) for w in wordlist]

    sentences = []
    # Step through complete groups of four points; a trailing partial group
    # is ignored, as before.
    for start in range(0, len(points) - 3, 4):
        # where the highlighted part is
        quad_rect = fitz.Quad(points[start:start + 4]).rect
        text = " ".join(
            word for box, word in word_boxes if box.intersects(quad_rect)
        )
        if text:
            # Skip quads that cover no words so the result never contains
            # doubled spaces.
            sentences.append(text)
    return " ".join(sentences)
def handle_page(page):
    """Return the text under every highlight annotation on *page*.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are yielded by the page.
    """
    wordlist = page.get_text("words")  # list of words on page
    wordlist.sort(key=lambda w: (w[1], w[0]))  # ascending y, then x
    # Let PyMuPDF filter by annotation type instead of comparing against the
    # bare type code 8 while walking the annotation chain by hand.
    return [
        _parse_highlight(annot, wordlist)
        for annot in page.annots(types=(fitz.PDF_ANNOT_HIGHLIGHT,))
    ]
# Patterns are tried in order; the first one that matches wins.
_POLICY_PATTERNS = (
    # "<First Last> ... <number> ... <Month D, YYYY> ... <Month D, YYYY>"
    re.compile(
        r'(?P<Name_of_Insured>[A-Z][a-z]+ [A-Z][a-z]+)\D+(?P<Policy_Number>\d+)\D+(?P<Effective_Date>[A-Z][a-z]+\s\d{1,2},\s\d{4})\D+(?P<Expiry_Date>[A-Z][a-z]+\s\d{1,2},\s\d{4})'
    ),
    # "<YYYY/MM/DD>, <YYYY/MM/DD>, <First Last> ... <number of 6+ digits>"
    re.compile(
        r'(?P<Effective_Date>\d{4}/\d{2}/\d{2}),\s*(?P<Expiry_Date>\d{4}/\d{2}/\d{2}),\s*(?P<Name_of_Insured>[A-Z][a-z]+\s[A-Z][a-z]+).*?\b(?P<Policy_Number>\d{6,})\b'
    ),
)


def extract_info(lst):
    """Pull the policy fields out of a list of highlighted text fragments.

    The fragments are joined with spaces and searched with each pattern in
    ``_POLICY_PATTERNS`` in turn.

    Args:
        lst: Strings to search (may be empty or ``None``).

    Returns:
        A dict with the keys ``Name_of_Insured``, ``Policy_Number``,
        ``Effective_Date`` and ``Expiry_Date`` taken from the first pattern
        that matches, or ``None`` when nothing matches.
    """
    if not lst:
        return None

    # Collapse whitespace runs: empty fragments would otherwise leave doubled
    # spaces, which the single-space name and date patterns cannot match.
    data_str = " ".join(" ".join(lst).split())

    for pattern in _POLICY_PATTERNS:
        match = pattern.search(data_str)
        if match:
            return match.groupdict()

    # If neither pattern matched, return None
    return None
def main(filepath: str) -> dict:
    """Collect the highlighted text of a document and extract policy fields.

    Args:
        filepath: Path of the document to open with PyMuPDF.

    Returns:
        Whatever ``extract_info`` returns for the highlights gathered from
        all pages: a dict of extracted fields, or ``None`` when no pattern
        matches.
    """
    highlights = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(filepath) as doc:
        for page in doc:
            highlights.extend(handle_page(page))
    return extract_info(highlights)
def process_pdf(filepath: str):
    """Extract policy details from *filepath* and show them in a Tk window.

    Runs ``main(filepath)``, formats the result as text and blocks in the Tk
    event loop until the user closes the window. When nothing could be
    extracted, the window shows a short notice instead of staying silent.

    Args:
        filepath: Path of the PDF file to process.
    """
    result = main(filepath)
    if result:
        # Prepare the formatted output
        rule = '----------------------------------------------------------------------------------------\n'
        output = ''.join([
            rule,
            'Sample PDF Output:\n\n',
            f"Name of Insured: {result['Name_of_Insured']}\n",
            f"Policy Number: {result['Policy_Number']}\n",
            f"Effective Date: {result['Effective_Date']}\n",
            f"Expiry Date: {result['Expiry_Date']}\n",
            rule,
            "Extracted Information (Dictionary):\n",
            str(result),
        ])
    else:
        output = "No policy information could be extracted from the highlighted text."

    # Create the GUI window
    window = tk.Tk()
    window.title("PDF Extractor")

    # Create a label to display the output
    output_label = tk.Label(window, text=output, justify="left")
    output_label.pack(padx=10, pady=10)

    # Create a button to close the application
    close_button = tk.Button(window, text="Close", command=window.quit)
    close_button.pack(padx=10, pady=10)

    # Make the title-bar close button end the event loop as well, so both
    # ways of closing take the same path.
    window.protocol("WM_DELETE_WINDOW", window.quit)

    # Run the GUI event loop
    window.mainloop()

    # quit() only stops the loop; remove the window explicitly afterwards.
    window.destroy()
# Prompt the user to select a PDF file. Guarded so that importing this module
# (for example to reuse extract_info) does not open any dialogs.
if __name__ == "__main__":
    from tkinter import filedialog

    # Own the root window explicitly and keep it hidden; otherwise the
    # dialogs create an implicit, empty root window that stays on screen.
    picker_root = tk.Tk()
    picker_root.withdraw()

    file_path = filedialog.askopenfilename(title="Select a PDF file")
    if not file_path:
        messagebox.showerror("Error", "No PDF file selected.")

    # Dispose of the hidden root before process_pdf() creates its own window.
    picker_root.destroy()

    if file_path:
        process_pdf(file_path)