Skip to content

Commit 65e2bcf

Browse files
committed
feat(): entities extraction from pdf
Entities are also inferred from images, tables, and schemas
1 parent d736ca8 commit 65e2bcf

File tree

6 files changed

+163
-2
lines changed

6 files changed

+163
-2
lines changed

.env.example

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# OpenAI API Key
2+
OPENAI_API_KEY=your_openai_api_key_here

SmartEntities.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import requests
2+
from pdf_processor import process_pdf
3+
from config import HEADERS, DIGRAPH_EXAMPLE
4+
5+
def generate_digraph(base64_images, *, timeout=120):
    """Request one Graphviz digraph per page image from the OpenAI chat API.

    Each image is sent in its own chat-completions request together with
    ``DIGRAPH_EXAMPLE`` as the reference output format.

    Args:
        base64_images: Iterable of base64-encoded page images, in page order.
            They are embedded in the request as ``data:image/jpeg`` URLs.
        timeout: Seconds to wait for each HTTP request before giving up.
            Keyword-only; passed straight to ``requests.post``.

    Returns:
        A list with one string per page, formatted as
        ``"Page <n>: <model answer>"``. Empty input yields an empty list.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    endpoint = "https://api.openai.com/v1/chat/completions"

    # The system message is identical for every page, so build it once.
    system_message = {
        "role": "system",
        "content": (
            "You are an AI specialized in extracting structured information from documents. "
            "Your task is to analyze the provided image and generate a Graphviz digraph that represents the entities and their relationships found within. "
            "Focus on identifying key concepts, hierarchical structures, and relevant data points regardless of the document type. "
            "The digraph should be clear, well-organized, and follow the structure of the example provided. "
            "Ensure that all entities are properly connected, labeled, and reflect the content and relationships present in the document."
        ),
    }

    page_answers = []
    for page_num, base64_image in enumerate(base64_images, start=1):
        payload = {
            "model": "gpt-4o",
            "messages": [
                system_message,
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Generate a digraph like the following for the meaningful entities in this image, following this example: {DIGRAPH_EXAMPLE} (Page {page_num})"},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                    ],
                },
            ],
        }

        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.post(endpoint, headers=HEADERS, json=payload, timeout=timeout)
        # Fail with the HTTP error itself rather than a KeyError on 'choices'
        # when the API rejects the request (bad key, rate limit, ...).
        response.raise_for_status()

        answer = response.json()['choices'][0]['message']['content']
        page_answers.append(f"Page {page_num}: {answer}")
        print(f"Processed page {page_num}")

    return page_answers
31+
32+
def merge_digraphs(page_answers, *, timeout=120):
    """Ask the model to merge the per-page digraphs into a single tree.

    Args:
        page_answers: List of per-page answer strings (as produced by
            ``generate_digraph``); they are joined with blank lines into the
            user prompt.
        timeout: Seconds to wait for the HTTP request before giving up.
            Keyword-only; passed straight to ``requests.post``.

    Returns:
        The raw text content of the model's reply, expected to be Graphviz
        Python code (possibly wrapped in a Markdown code fence).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    digraph_prompt = (
        "Merge the partial digraphs that I provide to you merging together all the detected entities, \n\n"
        + "\n\n".join(page_answers)
        + "\nYour answer digraph must be a tree and must contain only the code for a valid graphviz graph"
    )

    # The reference graph must be interpolated: without the f-prefix the model
    # would receive the literal text "{DIGRAPH_EXAMPLE}" instead of the example.
    system_prompt = (
        "You are an AI that generates only valid digraph code without any comments before or after the generated code. "
        "At the end, it always shows the generated viz with dot.render('ontology_graph', format='png'). "
        "You have to provide a graph that takes as reference the following graph: "
        f"{DIGRAPH_EXAMPLE}"
    )

    digraph_payload = {
        "model": "gpt-4o",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": digraph_prompt},
        ],
    }

    digraph_response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=HEADERS,
        json=digraph_payload,
        timeout=timeout,
    )
    # Report HTTP-level failures directly instead of a KeyError on 'choices'.
    digraph_response.raise_for_status()

    return digraph_response.json()['choices'][0]['message']['content']
46+
47+
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        # No fence: the reply is already bare code.
        return stripped

    # Drop the opening fence line, which may carry a language tag ("```python").
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def main():
    """Run the pipeline on ``./test.pdf``: extract, merge, print and execute.

    Pages are converted to base64 images, one digraph is requested per page,
    the partial digraphs are merged by the model, and the resulting Graphviz
    Python code is printed and then executed. Nothing happens when the PDF
    yields no images.
    """
    pdf_path = './test.pdf'
    base64_images = process_pdf(pdf_path)
    if not base64_images:
        return

    page_answers = generate_digraph(base64_images)
    # Remove the Markdown fence by inspecting the reply rather than slicing a
    # fixed number of characters, which mangles replies that are not wrapped
    # in exactly "```python" ... "```".
    digraph_code = _strip_code_fence(merge_digraphs(page_answers))

    print("\nDigraph code for all pages:")
    print(digraph_code)
    print("digraph_code_execution----------------------------------")
    # SECURITY: this executes code written by the model with the full
    # privileges of this process. Only run it on trusted documents, or
    # replace it with a sandboxed / parsed rendering step.
    exec(digraph_code)


if __name__ == "__main__":
    main()
62+
63+
64+
65+
66+

config.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from dotenv import load_dotenv
2+
import os
3+
4+
# Load a local .env file (see .env.example) before the key is read below.
load_dotenv()

# NOTE: this is None when OPENAI_API_KEY is not set, in which case the
# Authorization header below becomes the literal string "Bearer None".
API_KEY = os.environ.get("OPENAI_API_KEY")

# Headers sent with every request to the OpenAI HTTP API.
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer {}".format(API_KEY),
}
12+
13+
DIGRAPH_EXAMPLE = """
14+
from graphviz import Digraph
15+
16+
dot = Digraph(comment='Portfolio Structure')
17+
18+
# Root
19+
dot.node('ROOT', 'ROOT\\nportfolio: object')
20+
21+
# Portfolio node
22+
dot.node('portfolio', 'portfolio\\nname: string\\nseries: string\\nfees: object\\nwithdrawalRights: object\\n'
23+
'contactInformation: object\\nyearByYearReturns: object[]\\nbestWorstReturns: object[]\\n'
24+
'averageReturn: string\\ntargetInvestors: string[]\\ntaxInformation: string')
25+
26+
# Connect Root to Portfolio
27+
dot.edge('ROOT', 'portfolio')
28+
29+
# Nodes under Portfolio
30+
dot.node('fees', 'fees\\nsalesCharges: string\\nfundExpenses: object\\ntrailingCommissions: string')
31+
dot.node('withdrawalRights', 'withdrawalRights\\ntimeLimit: string\\nconditions: string[]')
32+
dot.node('contactInformation', 'contactInformation\\ncompanyName: string\\naddress: string\\nphone: string\\n'
33+
'email: string\\nwebsite: string')
34+
dot.node('yearByYearReturns', 'yearByYearReturns\\nyear: string\\nreturn: string')
35+
dot.node('bestWorstReturns', 'bestWorstReturns\\ntype: string\\nreturn: string\\ndate: string\\ninvestmentValue: string')
36+
37+
# Connect Portfolio to its components
38+
dot.edge('portfolio', 'fees')
39+
dot.edge('portfolio', 'withdrawalRights')
40+
dot.edge('portfolio', 'contactInformation')
41+
dot.edge('portfolio', 'yearByYearReturns')
42+
dot.edge('portfolio', 'bestWorstReturns')
43+
44+
# Sub-components
45+
dot.node('fundExpenses', 'fundExpenses\\nmanagementExpenseRatio: string\\ntradingExpenseRatio: string\\n'
46+
'totalExpenses: string')
47+
48+
# Connect sub-components
49+
dot.edge('fees', 'fundExpenses')
50+
"""

pdf_processor.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import base64
2+
import os
3+
import tempfile
4+
from pdf2image import convert_from_path
5+
from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError
6+
7+
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The file contents, base64-encoded and decoded to a ``str`` via UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, 'utf-8')
10+
11+
def load_pdf_as_images(pdf_path):
    """Render a PDF into page images with ``pdf2image.convert_from_path``.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        Whatever ``convert_from_path`` returns for the file, or ``None`` when
        it raises one of the pdf2image errors handled below (the error is
        printed, not re-raised).
    """
    handled_errors = (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError)
    try:
        pages = convert_from_path(pdf_path)
    except handled_errors as err:
        print(f"Error converting PDF: {err}")
        return None
    return pages
18+
19+
def save_image_to_temp(image):
    """Save *image* as a JPEG in a new temporary file and return its path.

    The file is created with ``delete=False``, so the caller is responsible
    for removing it once it is no longer needed.

    Args:
        image: Object exposing ``save(path, format)``, e.g. a PIL image.

    Returns:
        Path of the ``.jpg`` temporary file that now holds the image.

    Raises:
        Exception: Whatever ``image.save`` raises; the temporary file is
            removed before the error propagates.
    """
    # Reserve a unique path, then close the handle before writing to it.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        image.save(temp_path, 'JPEG')
    except Exception:
        # Do not leave an orphaned temp file behind when saving fails.
        os.unlink(temp_path)
        raise
    return temp_path
23+
24+
def process_pdf(pdf_path):
    """Convert every page of a PDF into a base64-encoded JPEG string.

    Each page image is written to a temporary JPEG file, read back as base64
    text, and the temporary file is then removed.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list of base64 strings in page order, or ``None`` when the PDF could
        not be loaded or produced no pages.
    """
    images = load_pdf_as_images(pdf_path)
    if not images:
        return None

    base64_images = []
    for page_num, image in enumerate(images, start=1):
        temp_image_path = save_image_to_temp(image)
        try:
            base64_images.append(encode_image(temp_image_path))
        finally:
            # Remove the scratch file even if encoding fails, so a failed run
            # does not leave page images behind in the temp directory.
            os.unlink(temp_image_path)
        print(f"Processed page {page_num}")

    return base64_images

requirements.txt

+8-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,8 @@
1-
pypdf==4.3.1
2-
streamlit==1.37.1
1+
certifi==2024.7.4
2+
charset-normalizer==3.3.2
3+
idna==3.8
4+
pdf2image==1.17.0
5+
pillow==10.4.0
6+
python-dotenv==1.0.1
7+
requests==2.32.3
8+
urllib3==2.2.2

test.pdf

254 KB
Binary file not shown.

0 commit comments

Comments
 (0)