feat(): entities extraction from pdf

lurenss · lurenss · commit 65e2bcfcb75c · 2024-08-29T16:09:54.000+02:00
Entities are also inferred from images, tables and schemas
diff --git a/.env.example b/.env.example
@@ -0,0 +1,2 @@
+# OpenAI API Key
+OPENAI_API_KEY=your_openai_api_key_here
diff --git a/SmartEntities.py b/SmartEntities.py
@@ -0,0 +1,66 @@
+import requests
+from pdf_processor import process_pdf
+from config import HEADERS, DIGRAPH_EXAMPLE
+
+def generate_digraph(base64_images):
+    page_answers = []
+    for page_num, base64_image in enumerate(base64_images, start=1):
+        payload = {
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You are an AI specialized in extracting structured information from documents. Your task is to analyze the provided image and generate a Graphviz digraph that represents the entities and their relationships found within. Focus on identifying key concepts, hierarchical structures, and relevant data points regardless of the document type. The digraph should be clear, well-organized, and follow the structure of the example provided. Ensure that all entities are properly connected, labeled, and reflect the content and relationships present in the document."
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": f"Generate a digraph like the following for the meaningful entities in this image, following this example: {DIGRAPH_EXAMPLE} (Page {page_num})"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                    ]
+                }
+            ],
+        }
+
+        response = requests.post("https://api.openai.com/v1/chat/completions", headers=HEADERS, json=payload)
+        answer = response.json()['choices'][0]['message']['content']
+        page_answers.append(f"Page {page_num}: {answer}")
+        print(f"Processed page {page_num}")
+
+    return page_answers
+
+def merge_digraphs(page_answers):
+    digraph_prompt = "Merge the partial digraphs that I provide to you merging together all the detected entities, \n\n" + "\n\n".join(page_answers) + \
+        "\nYour answer digraph must be a tree and must contain only the code for a valid graphviz graph"
+    digraph_payload = {
+        "model": "gpt-4o",
+        "messages": [
+            {"role": "system", "content": "You are an AI that generates only valid digraph code without any comments before or after the generated code. At the end, it always shows the generated viz with dot.render('ontology_graph', format='png'). You have to provide a graph that takes as reference the following graph: {DIGRAPH_EXAMPLE}"},
+            {"role": "user", "content": digraph_prompt}
+        ],
+    }
+
+    digraph_response = requests.post("https://api.openai.com/v1/chat/completions", headers=HEADERS, json=digraph_payload)
+    digraph_code = digraph_response.json()['choices'][0]['message']['content']
+    return digraph_code
+
+def main():
+    pdf_path = './test.pdf'
+    base64_images = process_pdf(pdf_path)
+
+    if base64_images:
+        page_answers = generate_digraph(base64_images)
+        digraph_code = merge_digraphs(page_answers)
+
+        print("\nDigraph code for all pages:")
+        print(digraph_code[9:-3])
+        print("digraph_code_execution----------------------------------")
+        exec(digraph_code[9:-3])
+
+if __name__ == "__main__":
+    main()
+
+
+
+
+
diff --git a/config.py b/config.py
@@ -0,0 +1,50 @@
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+API_KEY = os.getenv("OPENAI_API_KEY")
+
+HEADERS = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {API_KEY}"
+}
+
+# Example output embedded in the prompts: Python source that builds a
+# graphviz Digraph describing a portfolio document as a tree of nodes whose
+# labels list "field: type" pairs.
+# NOTE(review): the leading newline and the 4-space indentation are part of
+# the string exactly as it is sent to the model.
+DIGRAPH_EXAMPLE = """
+    from graphviz import Digraph
+
+    dot = Digraph(comment='Portfolio Structure')
+
+    # Root
+    dot.node('ROOT', 'ROOT\\nportfolio: object')
+
+    # Portfolio node
+    dot.node('portfolio', 'portfolio\\nname: string\\nseries: string\\nfees: object\\nwithdrawalRights: object\\n'
+                        'contactInformation: object\\nyearByYearReturns: object[]\\nbestWorstReturns: object[]\\n'
+                        'averageReturn: string\\ntargetInvestors: string[]\\ntaxInformation: string')
+
+    # Connect Root to Portfolio
+    dot.edge('ROOT', 'portfolio')
+
+    # Nodes under Portfolio
+    dot.node('fees', 'fees\\nsalesCharges: string\\nfundExpenses: object\\ntrailingCommissions: string')
+    dot.node('withdrawalRights', 'withdrawalRights\\ntimeLimit: string\\nconditions: string[]')
+    dot.node('contactInformation', 'contactInformation\\ncompanyName: string\\naddress: string\\nphone: string\\n'
+                                    'email: string\\nwebsite: string')
+    dot.node('yearByYearReturns', 'yearByYearReturns\\nyear: string\\nreturn: string')
+    dot.node('bestWorstReturns', 'bestWorstReturns\\ntype: string\\nreturn: string\\ndate: string\\ninvestmentValue: string')
+
+    # Connect Portfolio to its components
+    dot.edge('portfolio', 'fees')
+    dot.edge('portfolio', 'withdrawalRights')
+    dot.edge('portfolio', 'contactInformation')
+    dot.edge('portfolio', 'yearByYearReturns')
+    dot.edge('portfolio', 'bestWorstReturns')
+
+    # Sub-components
+    dot.node('fundExpenses', 'fundExpenses\\nmanagementExpenseRatio: string\\ntradingExpenseRatio: string\\n'
+                            'totalExpenses: string')
+
+    # Connect sub-components
+    dot.edge('fees', 'fundExpenses')
+"""
diff --git a/pdf_processor.py b/pdf_processor.py
@@ -0,0 +1,37 @@
+import base64
+import os
+import tempfile
+from pdf2image import convert_from_path
+from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError
+
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def load_pdf_as_images(pdf_path):
+    try:
+        images = convert_from_path(pdf_path)
+        return images
+    except (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError) as e:
+        print(f"Error converting PDF: {e}")
+        return None
+
+def save_image_to_temp(image):
+    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp_file:
+        image.save(temp_file.name, 'JPEG')
+        return temp_file.name
+
+def process_pdf(pdf_path):
+    images = load_pdf_as_images(pdf_path)
+    if not images:
+        return None
+
+    base64_images = []
+    for page_num, image in enumerate(images, start=1):
+        temp_image_path = save_image_to_temp(image)
+        base64_image = encode_image(temp_image_path)
+        base64_images.append(base64_image)
+        os.unlink(temp_image_path)
+        print(f"Processed page {page_num}")
+
+    return base64_images
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,8 @@
-pypdf==4.3.1
-streamlit==1.37.1
+certifi==2024.7.4
+charset-normalizer==3.3.2
+idna==3.8
+pdf2image==1.17.0
+pillow==10.4.0
+python-dotenv==1.0.1
+requests==2.32.3
+urllib3==2.2.2
diff --git a/test.pdf b/test.pdf

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# OpenAI API Key`
	`2`	`+OPENAI_API_KEY=your_openai_api_key_here`