Fixes QTD multi-file upload and replaces pdfplumber with pypdf (#84)
* New unstructured data prep

* Removing unstructured

* pdf processing

* Updating UI workflow

* Convert to pypdf

---------

Co-authored-by: Darren Edge <[email protected]>
darrenedge and Darren Edge authored Jan 16, 2025
1 parent 276436f commit 69abdfa
Showing 7 changed files with 963 additions and 781 deletions.
9 changes: 8 additions & 1 deletion app/workflows/query_text_data/workflow.py
@@ -97,10 +97,17 @@ async def create(sv: SessionVariables, workflow=None):
    file_pb, file_callback = functions.create_progress_callback(
        "Loaded {} of {} files..."
    )
+   for file in files:
+       with open(file.name, "wb") as f:
+           f.write(file.getbuffer())
+   input_files={file.name for file in files}
    qtd.process_data_from_files(
-       input_file_bytes={file.name: file.getvalue() for file in files},
+       input_files=input_files,
+       chunk_size=CHUNK_SIZE,
        callbacks=[file_callback],
    )
+   for file in input_files:
+       os.remove(file)
    file_pb.empty()
else:
    qtd.import_chunks_from_str(file_chunks)
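The workflow hunk above writes each upload to disk, hands the resulting file names to `process_data_from_files`, and removes the temporary copies once chunking is done. A minimal sketch of that pattern, assuming Streamlit-style upload objects that expose `name` and `getbuffer()`; the helper name `load_uploaded_files` is illustrative, not part of the commit:

```python
import os

def load_uploaded_files(qtd, files, chunk_size, file_callback):
    """Hypothetical helper mirroring the workflow change above."""
    # Persist each in-memory upload so it can be re-read by file path.
    for file in files:
        with open(file.name, "wb") as f:
            f.write(file.getbuffer())
    input_files = {file.name for file in files}
    # Chunk the on-disk files with the new path-based API.
    qtd.process_data_from_files(
        input_files=input_files,
        chunk_size=chunk_size,
        callbacks=[file_callback],
    )
    # Clean up the temporary copies once chunking is done.
    for file in input_files:
        os.remove(file)
```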
1,506 changes: 871 additions & 635 deletions example_notebooks/query_text_data.ipynb

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions intelligence_toolkit/helpers/document_processor.py
@@ -0,0 +1,58 @@

from collections import defaultdict
import pandas as pd
from json import dumps, loads
from pypdf import PdfReader
import io
from intelligence_toolkit.AI.text_splitter import TextSplitter

def convert_files_to_chunks(
input_filepaths,
chunk_size,
callbacks=[],
):
text_to_chunks = defaultdict(list)

def add_chunks(filename, text, chunk_size):
splitter = TextSplitter(chunk_size=chunk_size)
text_chunks = splitter.split(text)
for index, text in enumerate(text_chunks):
chunk = {"title": filename, "text_chunk": text, "chunk_id": index + 1}
text_to_chunks[filename].append(dumps(chunk, indent=2, ensure_ascii=False))

for fx, filepath in enumerate(input_filepaths):
filename = filepath.split("/")[-1]
filename = filename.replace("(", "").replace(")", "").replace(" ", "_")
for cb in callbacks:
cb.on_batch_change(fx + 1, len(input_filepaths))

if filename.endswith(".csv"):
df = pd.read_csv(filepath)
cols = df.columns.values
for ix, row in df.iterrows():
rec_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
elif filename.endswith(".json"):
json_obj = loads(open(filepath).read())
# check if json_obj is a list
if isinstance(json_obj, list):
for ix, js_rec in enumerate(json_obj):
rec_text = dumps(js_rec)
add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
else:
text = dumps(json_obj)
add_chunks(filename, text, chunk_size)
elif filename.endswith(".pdf"):
page_texts = []
pdf_reader = PdfReader(filepath)
num_pages = pdf_reader.get_num_pages()
for px in range(num_pages):
page = pdf_reader.pages[px]
page_texts.append(page.extract_text())
text = " ".join(page_texts)
add_chunks(filename, text, chunk_size)
else:
text = open(filepath).read()
add_chunks(filename, text, chunk_size)

return text_to_chunks
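The new helper takes file paths, a chunk size, and optional progress callbacks, and returns a mapping from document title to JSON-encoded chunks. A usage sketch under those assumptions; the `PrintProgress` callback and the file paths are illustrative:

```python
from intelligence_toolkit.helpers.document_processor import convert_files_to_chunks

class PrintProgress:
    """Hypothetical callback; the helper only calls on_batch_change(current, total)."""
    def on_batch_change(self, current, total):
        print(f"Processed {current} of {total} files")

# CSV rows and JSON array items are chunked per record; PDFs and plain text
# are joined into one document text and then split by chunk_size.
text_to_chunks = convert_files_to_chunks(
    input_filepaths=["report.pdf", "records.csv", "notes.txt"],
    chunk_size=500,
    callbacks=[PrintProgress()],
)
for title, chunks in text_to_chunks.items():
    print(title, len(chunks), "chunks")
```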
33 changes: 10 additions & 23 deletions intelligence_toolkit/query_text_data/api.py
@@ -14,6 +14,7 @@
import intelligence_toolkit.query_text_data.prompts as prompts
import intelligence_toolkit.query_text_data.query_rewriter as query_rewriter
import intelligence_toolkit.query_text_data.relevance_assessor as relevance_assessor
+import intelligence_toolkit.helpers.document_processor as document_processor
from intelligence_toolkit.AI.base_embedder import BaseEmbedder
from intelligence_toolkit.AI.client import OpenAIClient
from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
@@ -89,42 +90,28 @@ def set_embedder(self, text_embedder: BaseEmbedder) -> None:
        """
        self.text_embedder = text_embedder

-   def process_data_from_df(
-       self, df: pd.DataFrame, label: str
-   ) -> dict[str, list[str]]:
-       """
-       Process data from a DataFrame.
-       Args:
-           df (pd.DataFrame): The DataFrame
-           label (str): The label (e.g., filename) used as the prefix for the chunk names
-       Returns:
-           dict[str, list[str]]: The label to chunks mapping
-       """
-       self.label_to_chunks = input_processor.convert_df_to_chunks(df, label)
-       self.stage = QueryTextDataStage.CHUNKS_CREATED
-       return self.label_to_chunks
-
    def process_data_from_files(
        self,
-       input_file_bytes: bytes,
-       analysis_window_size: input_processor.PeriodOption = input_processor.PeriodOption.NONE,
+       input_files: list[str],
+       chunk_size: int = 1000,
        callbacks: list = [],
    ) -> dict[str, list[str]]:
        """
        Process data from files.
        Args:
-           input_file_bytes (bytes): The input file bytes
-           analysis_window_size (input_processor.PeriodOption): The analysis window size
            input_files (str): The list of input files
-           new_after_n_chars (int): The minimum partition size (characters)
-           max_characters (int): The maximum partition size (characters)
            callbacks (list): The list of callbacks
        Returns:
            dict[str, list[str]]: The label to chunks mapping
        """
-       self.label_to_chunks = input_processor.convert_file_bytes_to_chunks(
-           input_file_bytes, analysis_window_size, callbacks
+       self.label_to_chunks = document_processor.convert_files_to_chunks(
+           input_files,
+           chunk_size=chunk_size,
+           callbacks=callbacks
        )
        self.stage = QueryTextDataStage.CHUNKS_CREATED
        return self.label_to_chunks
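For callers of the API, the change amounts to swapping keyword arguments: the bytes-plus-period signature is replaced by file paths plus an explicit chunk size. A hedged before/after sketch, assuming `qtd` is an instance of this API class and using illustrative file paths:

```python
# Before this commit (sketch): chunking from in-memory bytes plus a period option.
# qtd.process_data_from_files(
#     input_file_bytes={"report.pdf": pdf_bytes},
#     analysis_window_size=input_processor.PeriodOption.NONE,
# )

# After this commit: chunking from file paths with an explicit chunk size.
label_to_chunks = qtd.process_data_from_files(
    input_files=["report.pdf", "records.csv"],
    chunk_size=1000,
    callbacks=[],
)
```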
67 changes: 0 additions & 67 deletions intelligence_toolkit/query_text_data/input_processor.py
@@ -8,9 +8,6 @@
from json import dumps, loads

import networkx as nx
-import pandas as pd
-import pdfplumber
-
import intelligence_toolkit.query_text_data.graph_builder as graph_builder
from intelligence_toolkit.AI.text_splitter import TextSplitter
from intelligence_toolkit.query_text_data.classes import ProcessedChunks
@@ -29,70 +26,6 @@ def concert_titled_texts_to_chunks(titled_texts):
        text_to_chunks[title] = text_chunks
    return text_to_chunks

-
-def convert_df_to_chunks(df, label):
-    label = label.replace("(", "").replace(")", "").replace(" ", "_")
-    splitter = TextSplitter()
-    text_to_chunks = defaultdict(list)
-    for ix, row in df.iterrows():
-        cols = df.columns.values
-        doc_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
-        text_chunks = splitter.split(doc_text)
-        for index, text in enumerate(text_chunks):
-            this_label = f"{label}_{ix + 1}"
-            chunk = {"title": this_label, "text_chunk": text, "chunk_id": index + 1}
-            text_to_chunks[this_label].append(
-                dumps(chunk, indent=2, ensure_ascii=False)
-            )
-    return text_to_chunks
-
-
-def convert_file_bytes_to_chunks(
-    input_file_bytes,
-    analysis_window_size: PeriodOption = PeriodOption.NONE,
-    callbacks=[],
-):
-    text_to_chunks = defaultdict(list)
-    splitter = TextSplitter()
-    for fx, file_name in enumerate(input_file_bytes.keys()):
-        old_file_name = file_name
-        file_name = file_name.replace("(", "").replace(")", "").replace(" ", "_")
-        for cb in callbacks:
-            cb.on_batch_change(fx + 1, len(input_file_bytes.keys()))
-        bytes = input_file_bytes[old_file_name]
-
-        if file_name.endswith(".csv"):
-            df = pd.read_csv(io.BytesIO(bytes))
-            text_to_chunks = convert_df_to_chunks(df, file_name)
-        else:
-            if file_name.endswith(".pdf"):
-                page_texts = []
-                pdf_reader = pdfplumber.open(io.BytesIO(bytes))
-                for px in range(len(pdf_reader.pages)):
-                    page_text = pdf_reader.pages[px].extract_text()
-                    page_texts.append(page_text)
-                doc_text = " ".join(page_texts)
-            elif file_name.endswith(".json"):
-                text_chunks = process_json_text(
-                    loads(bytes.decode("utf-8")), analysis_window_size
-                )
-            else:
-                doc_text = bytes.decode("utf-8")
-
-            if not file_name.endswith(".json"):
-                text_chunks = splitter.split(doc_text)
-            for index, text in enumerate(text_chunks):
-                chunk = {
-                    "title": file_name,
-                    "text_chunk": text,
-                    "chunk_id": index + 1,
-                }
-                text_chunks[index] = dumps(chunk, indent=2, ensure_ascii=False)
-
-            text_to_chunks[file_name] = text_chunks
-    return text_to_chunks
-
-
def process_json_text(text_json, period: PeriodOption):
    def convert_to_year_quarter(datetm):
        month = datetm.month
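The removed block above extracted PDF text with pdfplumber from an in-memory stream; the new document_processor helper does the same with pypdf. A minimal comparison sketch, with an illustrative file path:

```python
# Removed approach (pdfplumber), shown for comparison:
#   import io, pdfplumber
#   pages = pdfplumber.open(io.BytesIO(pdf_bytes)).pages
#   text = " ".join(page.extract_text() or "" for page in pages)

# Replacement approach (pypdf), as used in the new document_processor helper:
from pypdf import PdfReader

reader = PdfReader("report.pdf")  # illustrative path; a binary stream also works
text = " ".join(page.extract_text() for page in reader.pages)
print(f"Extracted {len(text)} characters from {len(reader.pages)} pages")
```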
69 changes: 15 additions & 54 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Expand Up @@ -23,7 +23,6 @@ networkx = "3.3"
numpy = "1.26.4"
openai = "^1.37.1"
pac-synth = "0.0.8"
pdfplumber = "0.11.2"
plotly = "5.22.0"
plotly-express = "0.4.1"
polars = "0.20.10"
@@ -58,6 +57,7 @@ torch = [
sentence-transformers = "^3.1.1"
graspologic = "^3.4.1"
future = "^1.0.0"
+pypdf = "^5.1.0"

[tool.poetry.group.dev.dependencies]
newspaper3k = "^0.2.8"
