Skip to content

Commit

Permalink
Convert to pypdf
Browse files Browse the repository at this point in the history
  • Loading branch information
Darren Edge committed Jan 7, 2025
1 parent 023de32 commit fc80618
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 110 deletions.
12 changes: 6 additions & 6 deletions intelligence_toolkit/helpers/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from collections import defaultdict
import pandas as pd
from json import dumps, loads
- import pdfplumber
+ from pypdf import PdfReader
import io
from intelligence_toolkit.AI.text_splitter import TextSplitter

Expand Down Expand Up @@ -44,11 +44,11 @@ def add_chunks(filename, text, chunk_size):
add_chunks(filename, text, chunk_size)
elif filename.endswith(".pdf"):
page_texts = []
- bytes = open(filepath, "rb").read()
- pdf_reader = pdfplumber.open(io.BytesIO(bytes))
- for px in range(len(pdf_reader.pages)):
- page_text = pdf_reader.pages[px].extract_text()
- page_texts.append(page_text)
+ pdf_reader = PdfReader(filepath)
+ num_pages = pdf_reader.get_num_pages()
+ for px in range(num_pages):
+ page = pdf_reader.pages[px]
+ page_texts.append(page.extract_text())
text = " ".join(page_texts)
add_chunks(filename, text, chunk_size)
else:
Expand Down
49 changes: 0 additions & 49 deletions intelligence_toolkit/query_text_data/input_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
from json import dumps, loads

import networkx as nx
import pandas as pd
import pdfplumber

import intelligence_toolkit.query_text_data.graph_builder as graph_builder
from intelligence_toolkit.AI.text_splitter import TextSplitter
from intelligence_toolkit.query_text_data.classes import ProcessedChunks
Expand All @@ -29,52 +26,6 @@ def concert_titled_texts_to_chunks(titled_texts):
text_to_chunks[title] = text_chunks
return text_to_chunks

# def convert_file_bytes_to_chunks(
# input_file_bytes,
# analysis_window_size: PeriodOption = PeriodOption.NONE,
# callbacks=[],
# ):
# text_to_chunks = defaultdict(list)
# splitter = TextSplitter()
# for fx, file_name in enumerate(input_file_bytes.keys()):
# old_file_name = file_name
# file_name = file_name.replace("(", "").replace(")", "").replace(" ", "_")
# for cb in callbacks:
# cb.on_batch_change(fx + 1, len(input_file_bytes.keys()))
# bytes = input_file_bytes[old_file_name]

# if file_name.endswith(".csv"):
# df = pd.read_csv(io.BytesIO(bytes))
# text_to_chunks = convert_df_to_chunks(df, file_name)
# else:
# if file_name.endswith(".pdf"):
# page_texts = []
# pdf_reader = pdfplumber.open(io.BytesIO(bytes))
# for px in range(len(pdf_reader.pages)):
# page_text = pdf_reader.pages[px].extract_text()
# page_texts.append(page_text)
# doc_text = " ".join(page_texts)
# elif file_name.endswith(".json"):
# text_chunks = process_json_text(
# loads(bytes.decode("utf-8")), analysis_window_size
# )
# else:
# doc_text = bytes.decode("utf-8")

# if not file_name.endswith(".json"):
# text_chunks = splitter.split(doc_text)
# for index, text in enumerate(text_chunks):
# chunk = {
# "title": file_name,
# "text_chunk": text,
# "chunk_id": index + 1,
# }
# text_chunks[index] = dumps(chunk, indent=2, ensure_ascii=False)

# text_to_chunks[file_name] = text_chunks
# return text_to_chunks


def process_json_text(text_json, period: PeriodOption):
def convert_to_year_quarter(datetm):
month = datetm.month
Expand Down
69 changes: 15 additions & 54 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ networkx = "3.3"
numpy = "1.26.4"
openai = "^1.37.1"
pac-synth = "0.0.8"
- pdfplumber = "0.11.2"
plotly = "5.22.0"
plotly-express = "0.4.1"
polars = "0.20.10"
Expand Down Expand Up @@ -58,6 +57,7 @@ torch = [
sentence-transformers = "^3.1.1"
graspologic = "^3.4.1"
future = "^1.0.0"
+ pypdf = "^5.1.0"

[tool.poetry.group.dev.dependencies]
newspaper3k = "^0.2.8"
Expand Down

0 comments on commit fc80618

Please sign in to comment.