Fixes QTD multi-file upload and replaces pdfplumber with pypdf (#84)
* New unstructured data prep

* Removing unstructured

* pdf processing

* Updating UI workflow

* Convert to pypdf

---------

Co-authored-by: Darren Edge <[email protected]>
darrenedge and Darren Edge authored Jan 16, 2025
1 parent 276436f commit 69abdfa
Showing 7 changed files with 963 additions and 781 deletions.
9 changes: 8 additions & 1 deletion app/workflows/query_text_data/workflow.py
@@ -97,10 +97,17 @@ async def create(sv: SessionVariables, workflow=None):
    file_pb, file_callback = functions.create_progress_callback(
        "Loaded {} of {} files..."
    )
+   for file in files:
+       with open(file.name, "wb") as f:
+           f.write(file.getbuffer())
+   input_files={file.name for file in files}
    qtd.process_data_from_files(
-       input_file_bytes={file.name: file.getvalue() for file in files},
+       input_files=input_files,
+       chunk_size=CHUNK_SIZE,
        callbacks=[file_callback],
    )
+   for file in input_files:
+       os.remove(file)
    file_pb.empty()
else:
    qtd.import_chunks_from_str(file_chunks)
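The workflow hunk above writes each upload to disk, hands the resulting file names to `process_data_from_files`, and removes the temporary copies once chunking is done. A minimal sketch of that pattern, assuming Streamlit-style upload objects that expose `name` and `getbuffer()`; the helper name `load_uploaded_files` is illustrative, not part of the commit:

```python
import os

def load_uploaded_files(qtd, files, chunk_size, file_callback):
    """Hypothetical helper mirroring the workflow change above."""
    # Persist each in-memory upload so it can be re-read by file path.
    for file in files:
        with open(file.name, "wb") as f:
            f.write(file.getbuffer())
    input_files = {file.name for file in files}
    # Chunk the on-disk files with the new path-based API.
    qtd.process_data_from_files(
        input_files=input_files,
        chunk_size=chunk_size,
        callbacks=[file_callback],
    )
    # Clean up the temporary copies once chunking is done.
    for file in input_files:
        os.remove(file)
```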
1,506 changes: 871 additions & 635 deletions example_notebooks/query_text_data.ipynb

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions intelligence_toolkit/helpers/document_processor.py
@@ -0,0 +1,58 @@

from collections import defaultdict
import pandas as pd
from json import dumps, loads
from pypdf import PdfReader
import io
from intelligence_toolkit.AI.text_splitter import TextSplitter

def convert_files_to_chunks(
input_filepaths,
chunk_size,
callbacks=[],
):
text_to_chunks = defaultdict(list)

def add_chunks(filename, text, chunk_size):
splitter = TextSplitter(chunk_size=chunk_size)
text_chunks = splitter.split(text)
for index, text in enumerate(text_chunks):
chunk = {"title": filename, "text_chunk": text, "chunk_id": index + 1}
text_to_chunks[filename].append(dumps(chunk, indent=2, ensure_ascii=False))

for fx, filepath in enumerate(input_filepaths):
filename = filepath.split("/")[-1]
filename = filename.replace("(", "").replace(")", "").replace(" ", "_")
for cb in callbacks:
cb.on_batch_change(fx + 1, len(input_filepaths))

if filename.endswith(".csv"):
df = pd.read_csv(filepath)
cols = df.columns.values
for ix, row in df.iterrows():
rec_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
elif filename.endswith(".json"):
json_obj = loads(open(filepath).read())
# check if json_obj is a list
if isinstance(json_obj, list):
for ix, js_rec in enumerate(json_obj):
rec_text = dumps(js_rec)
add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
else:
text = dumps(json_obj)
add_chunks(filename, text, chunk_size)
elif filename.endswith(".pdf"):
page_texts = []
pdf_reader = PdfReader(filepath)
num_pages = pdf_reader.get_num_pages()
for px in range(num_pages):
page = pdf_reader.pages[px]
page_texts.append(page.extract_text())
text = " ".join(page_texts)
add_chunks(filename, text, chunk_size)
else:
text = open(filepath).read()
add_chunks(filename, text, chunk_size)

return text_to_chunks
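The new helper takes file paths, a chunk size, and optional progress callbacks, and returns a mapping from document title to JSON-encoded chunks. A usage sketch under those assumptions; the `PrintProgress` callback and the file paths are illustrative:

```python
from intelligence_toolkit.helpers.document_processor import convert_files_to_chunks

class PrintProgress:
    """Hypothetical callback; the helper only calls on_batch_change(current, total)."""
    def on_batch_change(self, current, total):
        print(f"Processed {current} of {total} files")

# CSV rows and JSON array items are chunked per record; PDFs and plain text
# are joined into one document text and then split by chunk_size.
text_to_chunks = convert_files_to_chunks(
    input_filepaths=["report.pdf", "records.csv", "notes.txt"],
    chunk_size=500,
    callbacks=[PrintProgress()],
)
for title, chunks in text_to_chunks.items():
    print(title, len(chunks), "chunks")
```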
33 changes: 10 additions & 23 deletions intelligence_toolkit/query_text_data/api.py
@@ -14,6 +14,7 @@
import intelligence_toolkit.query_text_data.prompts as prompts
import intelligence_toolkit.query_text_data.query_rewriter as query_rewriter
import intelligence_toolkit.query_text_data.relevance_assessor as relevance_assessor
+import intelligence_toolkit.helpers.document_processor as document_processor
from intelligence_toolkit.AI.base_embedder import BaseEmbedder
from intelligence_toolkit.AI.client import OpenAIClient
from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
@@ -89,42 +90,28 @@ def set_embedder(self, text_embedder: BaseEmbedder) -> None:
        """
        self.text_embedder = text_embedder

-   def process_data_from_df(
-       self, df: pd.DataFrame, label: str
-   ) -> dict[str, list[str]]:
-       """
-       Process data from a DataFrame.
-       Args:
-           df (pd.DataFrame): The DataFrame
-           label (str): The label (e.g., filename) used as the prefix for the chunk names
-       Returns:
-           dict[str, list[str]]: The label to chunks mapping
-       """
-       self.label_to_chunks = input_processor.convert_df_to_chunks(df, label)
-       self.stage = QueryTextDataStage.CHUNKS_CREATED
-       return self.label_to_chunks
-
    def process_data_from_files(
        self,
-       input_file_bytes: bytes,
-       analysis_window_size: input_processor.PeriodOption = input_processor.PeriodOption.NONE,
+       input_files: list[str],
+       chunk_size: int = 1000,
        callbacks: list = [],
    ) -> dict[str, list[str]]:
        """
        Process data from files.
        Args:
-           input_file_bytes (bytes): The input file bytes
-           analysis_window_size (input_processor.PeriodOption): The analysis window size
            input_files (str): The list of input files
-           new_after_n_chars (int): The minimum partition size (characters)
-           max_characters (int): The maximum partition size (characters)
            callbacks (list): The list of callbacks
        Returns:
            dict[str, list[str]]: The label to chunks mapping
        """
-       self.label_to_chunks = input_processor.convert_file_bytes_to_chunks(
-           input_file_bytes, analysis_window_size, callbacks
+       self.label_to_chunks = document_processor.convert_files_to_chunks(
+           input_files,
+           chunk_size=chunk_size,
+           callbacks=callbacks
        )
        self.stage = QueryTextDataStage.CHUNKS_CREATED
        return self.label_to_chunks
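For callers of the API, the change amounts to swapping keyword arguments: the bytes-plus-period signature is replaced by file paths plus an explicit chunk size. A hedged before/after sketch, assuming `qtd` is an instance of this API class and using illustrative file paths:

```python
# Before this commit (sketch): chunking from in-memory bytes plus a period option.
# qtd.process_data_from_files(
#     input_file_bytes={"report.pdf": pdf_bytes},
#     analysis_window_size=input_processor.PeriodOption.NONE,
# )

# After this commit: chunking from file paths with an explicit chunk size.
label_to_chunks = qtd.process_data_from_files(
    input_files=["report.pdf", "records.csv"],
    chunk_size=1000,
    callbacks=[],
)
```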
67 changes: 0 additions & 67 deletions intelligence_toolkit/query_text_data/input_processor.py
@@ -8,9 +8,6 @@
from json import dumps, loads

import networkx as nx
-import pandas as pd
-import pdfplumber
-
import intelligence_toolkit.query_text_data.graph_builder as graph_builder
from intelligence_toolkit.AI.text_splitter import TextSplitter
from intelligence_toolkit.query_text_data.classes import ProcessedChunks
@@ -29,70 +26,6 @@ def concert_titled_texts_to_chunks(titled_texts):
        text_to_chunks[title] = text_chunks
    return text_to_chunks

-
-def convert_df_to_chunks(df, label):
-    label = label.replace("(", "").replace(")", "").replace(" ", "_")
-    splitter = TextSplitter()
-    text_to_chunks = defaultdict(list)
-    for ix, row in df.iterrows():
-        cols = df.columns.values
-        doc_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
-        text_chunks = splitter.split(doc_text)
-        for index, text in enumerate(text_chunks):
-            this_label = f"{label}_{ix + 1}"
-            chunk = {"title": this_label, "text_chunk": text, "chunk_id": index + 1}
-            text_to_chunks[this_label].append(
-                dumps(chunk, indent=2, ensure_ascii=False)
-            )
-    return text_to_chunks
-
-
-def convert_file_bytes_to_chunks(
-    input_file_bytes,
-    analysis_window_size: PeriodOption = PeriodOption.NONE,
-    callbacks=[],
-):
-    text_to_chunks = defaultdict(list)
-    splitter = TextSplitter()
-    for fx, file_name in enumerate(input_file_bytes.keys()):
-        old_file_name = file_name
-        file_name = file_name.replace("(", "").replace(")", "").replace(" ", "_")
-        for cb in callbacks:
-            cb.on_batch_change(fx + 1, len(input_file_bytes.keys()))
-        bytes = input_file_bytes[old_file_name]
-
-        if file_name.endswith(".csv"):
-            df = pd.read_csv(io.BytesIO(bytes))
-            text_to_chunks = convert_df_to_chunks(df, file_name)
-        else:
-            if file_name.endswith(".pdf"):
-                page_texts = []
-                pdf_reader = pdfplumber.open(io.BytesIO(bytes))
-                for px in range(len(pdf_reader.pages)):
-                    page_text = pdf_reader.pages[px].extract_text()
-                    page_texts.append(page_text)
-                doc_text = " ".join(page_texts)
-            elif file_name.endswith(".json"):
-                text_chunks = process_json_text(
-                    loads(bytes.decode("utf-8")), analysis_window_size
-                )
-            else:
-                doc_text = bytes.decode("utf-8")
-
-            if not file_name.endswith(".json"):
-                text_chunks = splitter.split(doc_text)
-            for index, text in enumerate(text_chunks):
-                chunk = {
-                    "title": file_name,
-                    "text_chunk": text,
-                    "chunk_id": index + 1,
-                }
-                text_chunks[index] = dumps(chunk, indent=2, ensure_ascii=False)
-
-            text_to_chunks[file_name] = text_chunks
-    return text_to_chunks
-
-
def process_json_text(text_json, period: PeriodOption):
    def convert_to_year_quarter(datetm):
        month = datetm.month
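The removed block above extracted PDF text with pdfplumber from an in-memory stream; the new document_processor helper does the same with pypdf. A minimal comparison sketch, with an illustrative file path:

```python
# Removed approach (pdfplumber), shown for comparison:
#   import io, pdfplumber
#   pages = pdfplumber.open(io.BytesIO(pdf_bytes)).pages
#   text = " ".join(page.extract_text() or "" for page in pages)

# Replacement approach (pypdf), as used in the new document_processor helper:
from pypdf import PdfReader

reader = PdfReader("report.pdf")  # illustrative path; a binary stream also works
text = " ".join(page.extract_text() for page in reader.pages)
print(f"Extracted {len(text)} characters from {len(reader.pages)} pages")
```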
69 changes: 15 additions & 54 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Expand Up @@ -23,7 +23,6 @@ networkx = "3.3"
numpy = "1.26.4"
openai = "^1.37.1"
pac-synth = "0.0.8"
pdfplumber = "0.11.2"
plotly = "5.22.0"
plotly-express = "0.4.1"
polars = "0.20.10"
@@ -58,6 +57,7 @@ torch = [
sentence-transformers = "^3.1.1"
graspologic = "^3.4.1"
future = "^1.0.0"
+pypdf = "^5.1.0"

[tool.poetry.group.dev.dependencies]
newspaper3k = "^0.2.8"
