Convert to pypdf

microsoft · Jan 7, 2025 · fc80618
1 parent 023de32
commit fc80618
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 110 deletions.
diff --git a/intelligence_toolkit/helpers/document_processor.py b/intelligence_toolkit/helpers/document_processor.py
@@ -2,7 +2,7 @@
 from collections import defaultdict
 import pandas as pd
 from json import dumps, loads
-import pdfplumber
+from pypdf import PdfReader
 import io
 from intelligence_toolkit.AI.text_splitter import TextSplitter
 
@@ -44,11 +44,11 @@ def add_chunks(filename, text, chunk_size):
                 add_chunks(filename, text, chunk_size)
         elif filename.endswith(".pdf"):
             page_texts = []
-            bytes = open(filepath, "rb").read()
-            pdf_reader = pdfplumber.open(io.BytesIO(bytes))
-            for px in range(len(pdf_reader.pages)):
-                page_text = pdf_reader.pages[px].extract_text()
-                page_texts.append(page_text)
+            pdf_reader = PdfReader(filepath)
+            num_pages = pdf_reader.get_num_pages()
+            for px in range(num_pages):
+                page = pdf_reader.pages[px]
+                page_texts.append(page.extract_text())
             text = " ".join(page_texts)
             add_chunks(filename, text, chunk_size)
         else:

diff --git a/intelligence_toolkit/query_text_data/input_processor.py b/intelligence_toolkit/query_text_data/input_processor.py
@@ -8,9 +8,6 @@
 from json import dumps, loads
 
 import networkx as nx
-import pandas as pd
-import pdfplumber
-
 import intelligence_toolkit.query_text_data.graph_builder as graph_builder
 from intelligence_toolkit.AI.text_splitter import TextSplitter
 from intelligence_toolkit.query_text_data.classes import ProcessedChunks
@@ -29,52 +26,6 @@ def concert_titled_texts_to_chunks(titled_texts):
             text_to_chunks[title] = text_chunks
     return text_to_chunks
 
-# def convert_file_bytes_to_chunks(
-#     input_file_bytes,
-#     analysis_window_size: PeriodOption = PeriodOption.NONE,
-#     callbacks=[],
-# ):
-#     text_to_chunks = defaultdict(list)
-#     splitter = TextSplitter()
-#     for fx, file_name in enumerate(input_file_bytes.keys()):
-#         old_file_name = file_name
-#         file_name = file_name.replace("(", "").replace(")", "").replace(" ", "_")
-#         for cb in callbacks:
-#             cb.on_batch_change(fx + 1, len(input_file_bytes.keys()))
-#         bytes = input_file_bytes[old_file_name]
-
-#         if file_name.endswith(".csv"):
-#             df = pd.read_csv(io.BytesIO(bytes))
-#             text_to_chunks = convert_df_to_chunks(df, file_name)
-#         else:
-#             if file_name.endswith(".pdf"):
-#                 page_texts = []
-#                 pdf_reader = pdfplumber.open(io.BytesIO(bytes))
-#                 for px in range(len(pdf_reader.pages)):
-#                     page_text = pdf_reader.pages[px].extract_text()
-#                     page_texts.append(page_text)
-#                 doc_text = " ".join(page_texts)
-#             elif file_name.endswith(".json"):
-#                 text_chunks = process_json_text(
-#                     loads(bytes.decode("utf-8")), analysis_window_size
-#                 )
-#             else:
-#                 doc_text = bytes.decode("utf-8")
-
-#             if not file_name.endswith(".json"):
-#                 text_chunks = splitter.split(doc_text)
-#                 for index, text in enumerate(text_chunks):
-#                     chunk = {
-#                         "title": file_name,
-#                         "text_chunk": text,
-#                         "chunk_id": index + 1,
-#                     }
-#                     text_chunks[index] = dumps(chunk, indent=2, ensure_ascii=False)
-
-#             text_to_chunks[file_name] = text_chunks
-#     return text_to_chunks
-
-
 def process_json_text(text_json, period: PeriodOption):
     def convert_to_year_quarter(datetm):
         month = datetm.month

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,6 @@ networkx = "3.3"
 numpy = "1.26.4"
 openai = "^1.37.1"
 pac-synth = "0.0.8"
-pdfplumber = "0.11.2"
 plotly = "5.22.0"
 plotly-express = "0.4.1"
 polars = "0.20.10"
@@ -58,6 +57,7 @@ torch = [
 sentence-transformers = "^3.1.1"
 graspologic = "^3.4.1"
 future = "^1.0.0"
+pypdf = "^5.1.0"
 
 [tool.poetry.group.dev.dependencies]
 newspaper3k = "^0.2.8"