Fixes QTD multi-file upload and replaces pdfplumber with pypdf #84

Merged · 6 commits · Jan 16, 2025
9 changes: 8 additions & 1 deletion app/workflows/query_text_data/workflow.py
@@ -97,10 +97,17 @@ async def create(sv: SessionVariables, workflow=None):
file_pb, file_callback = functions.create_progress_callback(
"Loaded {} of {} files..."
)
for file in files:
with open(file.name, "wb") as f:
f.write(file.getbuffer())
input_files={file.name for file in files}
qtd.process_data_from_files(
input_file_bytes={file.name: file.getvalue() for file in files},
input_files=input_files,
chunk_size=CHUNK_SIZE,
callbacks=[file_callback],
)
for file in input_files:
os.remove(file)
file_pb.empty()
else:
qtd.import_chunks_from_str(file_chunks)
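Note on the workflow change above: the old call passed raw upload bytes via `input_file_bytes`; the new code first writes each in-memory upload to disk, passes the resulting paths as `input_files` (with `chunk_size=CHUNK_SIZE`), and deletes the temporary copies afterwards. A minimal sketch of that pattern outside Streamlit (the helper name and the `CHUNK_SIZE` value are illustrative, not part of the PR):

```python
import os

CHUNK_SIZE = 1000  # illustrative; the workflow defines its own constant

def process_uploads(uploaded_files, qtd):
    """uploaded_files: objects exposing .name and .getbuffer(), e.g. Streamlit UploadedFile."""
    # Persist each in-memory upload to a real path so the path-based processor can open it.
    for file in uploaded_files:
        with open(file.name, "wb") as f:
            f.write(file.getbuffer())
    input_files = {file.name for file in uploaded_files}
    try:
        return qtd.process_data_from_files(
            input_files=input_files,
            chunk_size=CHUNK_SIZE,
        )
    finally:
        # Clean up the temporary copies even if processing fails.
        for path in input_files:
            os.remove(path)
```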
1,506 changes: 871 additions & 635 deletions example_notebooks/query_text_data.ipynb

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions intelligence_toolkit/helpers/document_processor.py
@@ -0,0 +1,58 @@

from collections import defaultdict
import pandas as pd
from json import dumps, loads
from pypdf import PdfReader
import io
from intelligence_toolkit.AI.text_splitter import TextSplitter

def convert_files_to_chunks(
input_filepaths,
chunk_size,
callbacks=[],
):
text_to_chunks = defaultdict(list)

def add_chunks(filename, text, chunk_size):
splitter = TextSplitter(chunk_size=chunk_size)
text_chunks = splitter.split(text)
for index, text in enumerate(text_chunks):
chunk = {"title": filename, "text_chunk": text, "chunk_id": index + 1}
text_to_chunks[filename].append(dumps(chunk, indent=2, ensure_ascii=False))

for fx, filepath in enumerate(input_filepaths):
filename = filepath.split("/")[-1]
filename = filename.replace("(", "").replace(")", "").replace(" ", "_")
for cb in callbacks:
cb.on_batch_change(fx + 1, len(input_filepaths))

if filename.endswith(".csv"):
df = pd.read_csv(filepath)
cols = df.columns.values
for ix, row in df.iterrows():
rec_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
elif filename.endswith(".json"):
json_obj = loads(open(filepath).read())
# check if json_obj is a list
if isinstance(json_obj, list):
for ix, js_rec in enumerate(json_obj):
rec_text = dumps(js_rec)
add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
else:
text = dumps(json_obj)
add_chunks(filename, text, chunk_size)
elif filename.endswith(".pdf"):
page_texts = []
pdf_reader = PdfReader(filepath)
num_pages = pdf_reader.get_num_pages()
for px in range(num_pages):
page = pdf_reader.pages[px]
page_texts.append(page.extract_text())
text = " ".join(page_texts)
add_chunks(filename, text, chunk_size)
else:
text = open(filepath).read()
add_chunks(filename, text, chunk_size)

return text_to_chunks
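The new helper converts CSV, JSON, PDF, and plain-text files into JSON-encoded chunk strings keyed by a sanitized filename (CSV rows and JSON-array records get a per-record suffix). A usage sketch, with illustrative file paths:

```python
from json import loads
from intelligence_toolkit.helpers.document_processor import convert_files_to_chunks

text_to_chunks = convert_files_to_chunks(
    input_filepaths=["data/report (final).pdf", "data/records.csv"],  # illustrative paths
    chunk_size=1000,
)
for label, chunks in text_to_chunks.items():
    first = loads(chunks[0])  # each chunk is a JSON string with title, text_chunk, chunk_id
    print(label, len(chunks), first["chunk_id"])
```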
33 changes: 10 additions & 23 deletions intelligence_toolkit/query_text_data/api.py
@@ -14,6 +14,7 @@
import intelligence_toolkit.query_text_data.prompts as prompts
import intelligence_toolkit.query_text_data.query_rewriter as query_rewriter
import intelligence_toolkit.query_text_data.relevance_assessor as relevance_assessor
import intelligence_toolkit.helpers.document_processor as document_processor
from intelligence_toolkit.AI.base_embedder import BaseEmbedder
from intelligence_toolkit.AI.client import OpenAIClient
from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
@@ -89,42 +90,28 @@ def set_embedder(self, text_embedder: BaseEmbedder) -> None:
"""
self.text_embedder = text_embedder

def process_data_from_df(
self, df: pd.DataFrame, label: str
) -> dict[str, list[str]]:
"""
Process data from a DataFrame.

Args:
df (pd.DataFrame): The DataFrame
label (str): The label (e.g., filename) used as the prefix for the chunk names

Returns:
dict[str, list[str]]: The label to chunks mapping
"""
self.label_to_chunks = input_processor.convert_df_to_chunks(df, label)
self.stage = QueryTextDataStage.CHUNKS_CREATED
return self.label_to_chunks

def process_data_from_files(
self,
input_file_bytes: bytes,
analysis_window_size: input_processor.PeriodOption = input_processor.PeriodOption.NONE,
input_files: list[str],
chunk_size: int = 1000,
callbacks: list = [],
) -> dict[str, list[str]]:
"""
Process data from files.

Args:
input_file_bytes (bytes): The input file bytes
analysis_window_size (input_processor.PeriodOption): The analysis window size
input_files (str): The list of input files
new_after_n_chars (int): The minimum partition size (characters)
max_characters (int): The maximum partition size (characters)
callbacks (list): The list of callbacks

Returns:
dict[str, list[str]]: The label to chunks mapping
"""
self.label_to_chunks = input_processor.convert_file_bytes_to_chunks(
input_file_bytes, analysis_window_size, callbacks
self.label_to_chunks = document_processor.convert_files_to_chunks(
input_files,
chunk_size=chunk_size,
callbacks=callbacks
)
self.stage = QueryTextDataStage.CHUNKS_CREATED
return self.label_to_chunks
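With this change, `process_data_from_files` takes file paths rather than raw bytes, drops the analysis-window option, and exposes `chunk_size`. A sketch of the updated call, assuming an already-configured `QueryTextData` instance `qtd` (paths are illustrative):

```python
label_to_chunks = qtd.process_data_from_files(
    input_files=["docs/policy.pdf", "docs/faq.json"],  # paths on disk, not bytes
    chunk_size=1000,                                   # characters per chunk
    callbacks=[],                                      # optional progress callbacks
)
print(sum(len(chunks) for chunks in label_to_chunks.values()), "chunks created")
```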
67 changes: 0 additions & 67 deletions intelligence_toolkit/query_text_data/input_processor.py
@@ -8,9 +8,6 @@
from json import dumps, loads

import networkx as nx
import pandas as pd
import pdfplumber

import intelligence_toolkit.query_text_data.graph_builder as graph_builder
from intelligence_toolkit.AI.text_splitter import TextSplitter
from intelligence_toolkit.query_text_data.classes import ProcessedChunks
@@ -29,70 +26,6 @@ def concert_titled_texts_to_chunks(titled_texts):
text_to_chunks[title] = text_chunks
return text_to_chunks


def convert_df_to_chunks(df, label):
label = label.replace("(", "").replace(")", "").replace(" ", "_")
splitter = TextSplitter()
text_to_chunks = defaultdict(list)
for ix, row in df.iterrows():
cols = df.columns.values
doc_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
text_chunks = splitter.split(doc_text)
for index, text in enumerate(text_chunks):
this_label = f"{label}_{ix + 1}"
chunk = {"title": this_label, "text_chunk": text, "chunk_id": index + 1}
text_to_chunks[this_label].append(
dumps(chunk, indent=2, ensure_ascii=False)
)
return text_to_chunks


def convert_file_bytes_to_chunks(
input_file_bytes,
analysis_window_size: PeriodOption = PeriodOption.NONE,
callbacks=[],
):
text_to_chunks = defaultdict(list)
splitter = TextSplitter()
for fx, file_name in enumerate(input_file_bytes.keys()):
old_file_name = file_name
file_name = file_name.replace("(", "").replace(")", "").replace(" ", "_")
for cb in callbacks:
cb.on_batch_change(fx + 1, len(input_file_bytes.keys()))
bytes = input_file_bytes[old_file_name]

if file_name.endswith(".csv"):
df = pd.read_csv(io.BytesIO(bytes))
text_to_chunks = convert_df_to_chunks(df, file_name)
else:
if file_name.endswith(".pdf"):
page_texts = []
pdf_reader = pdfplumber.open(io.BytesIO(bytes))
for px in range(len(pdf_reader.pages)):
page_text = pdf_reader.pages[px].extract_text()
page_texts.append(page_text)
doc_text = " ".join(page_texts)
elif file_name.endswith(".json"):
text_chunks = process_json_text(
loads(bytes.decode("utf-8")), analysis_window_size
)
else:
doc_text = bytes.decode("utf-8")

if not file_name.endswith(".json"):
text_chunks = splitter.split(doc_text)
for index, text in enumerate(text_chunks):
chunk = {
"title": file_name,
"text_chunk": text,
"chunk_id": index + 1,
}
text_chunks[index] = dumps(chunk, indent=2, ensure_ascii=False)

text_to_chunks[file_name] = text_chunks
return text_to_chunks


def process_json_text(text_json, period: PeriodOption):
def convert_to_year_quarter(datetm):
month = datetm.month
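The removed reader worked on in-memory bytes via pdfplumber; its replacement in `document_processor.py` reads from file paths with pypdf. If a bytes-based entry point were ever needed again, pypdf also accepts a file-like stream, as in this sketch (function name is illustrative):

```python
import io
from pypdf import PdfReader

def extract_pdf_text_from_bytes(data: bytes) -> str:
    # PdfReader accepts any binary file-like object, mirroring the removed BytesIO path.
    reader = PdfReader(io.BytesIO(data))
    return " ".join((page.extract_text() or "") for page in reader.pages)
```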
69 changes: 15 additions & 54 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -23,7 +23,6 @@ networkx = "3.3"
numpy = "1.26.4"
openai = "^1.37.1"
pac-synth = "0.0.8"
pdfplumber = "0.11.2"
plotly = "5.22.0"
plotly-express = "0.4.1"
polars = "0.20.10"
@@ -58,6 +57,7 @@ torch = [
sentence-transformers = "^3.1.1"
graspologic = "^3.4.1"
future = "^1.0.0"
pypdf = "^5.1.0"

[tool.poetry.group.dev.dependencies]
newspaper3k = "^0.2.8"