Skip to content

Commit 7280787

Browse files
committed
Improve DocSum file handling
Use a temporary file only when necessary, and use aiofiles' own functionality for that. Signed-off-by: Eero Tamminen <[email protected]>
1 parent 241971b commit 7280787

File tree

1 file changed

+43
-34
lines changed

1 file changed

+43
-34
lines changed

DocSum/docsum.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Copyright (C) 2024 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import asyncio
54
import base64
65
import os
76
import subprocess
@@ -55,15 +54,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
5554
return inputs
5655

5756

58-
def read_pdf(file):
57+
def read_pdf(file: str):
5958
from langchain.document_loaders import PyPDFLoader
6059

6160
loader = PyPDFLoader(file)
6261
docs = loader.load_and_split()
6362
return docs
6463

6564

66-
def encode_file_to_base64(file_path):
65+
async def encode_file_to_base64(f: UploadFile):
6766
"""Encode the content of a file to a base64 string.
6867
6968
Args:
@@ -72,8 +71,7 @@ def encode_file_to_base64(file_path):
7271
Returns:
7372
str: The base64 encoded string of the file content.
7473
"""
75-
with open(file_path, "rb") as f:
76-
base64_str = base64.b64encode(f.read()).decode("utf-8")
74+
base64_str = await base64.b64encode(f.read()).decode("utf-8")
7775
return base64_str
7876

7977

@@ -90,6 +88,7 @@ def video2audio(
9088
"""
9189
video_data = base64.b64decode(video_base64)
9290

91+
# TODO: why is this processing not async?
9392
uid = str(uuid.uuid4())
9493
temp_video_path = f"{uid}.mp4"
9594
temp_audio_path = f"{uid}.mp3"
@@ -115,29 +114,50 @@ def video2audio(
115114
return audio_base64
116115

117116

118-
def read_text_from_file(file, save_file_name):
117+
async def read_text_from_file(file: UploadFile):
118+
ctype = file.headers["content-type"]
119+
valid = (
120+
"text/plain",
121+
"application/pdf",
122+
"application/octet-stream",
123+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
124+
)
125+
126+
file_content = None
127+
if ctype not in valid:
128+
return file_content
129+
130+
import aiofiles
119131
import docx2txt
120132
from langchain.text_splitter import CharacterTextSplitter
121133

122134
# read text file
123-
if file.headers["content-type"] == "text/plain":
135+
if ctype == "text/plain":
124136
file.file.seek(0)
125137
content = file.file.read().decode("utf-8")
126-
# Split text
138+
# Split text into multiple documents
127139
text_splitter = CharacterTextSplitter()
128-
texts = text_splitter.split_text(content)
129-
# Create multiple documents
130-
file_content = texts
131-
# read pdf file
132-
elif file.headers["content-type"] == "application/pdf":
133-
documents = read_pdf(save_file_name)
134-
file_content = [doc.page_content for doc in documents]
135-
# read docx file
136-
elif (
137-
file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
138-
or file.headers["content-type"] == "application/octet-stream"
139-
):
140-
file_content = docx2txt.process(save_file_name)
140+
return text_splitter.split_text(content)
141+
142+
# the remaining file types need a temp file
143+
async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
144+
await tmp.write(await file.read())
145+
await tmp.flush()
146+
147+
# read pdf file
148+
if ctype == "application/pdf":
149+
documents = read_pdf(tmp.name)
150+
file_content = [doc.page_content for doc in documents]
151+
152+
# read docx file
153+
if ctype in (
154+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
155+
"application/octet-stream",
156+
):
157+
file_content = docx2txt.process(tmp.name)
158+
159+
# remove temp file
160+
await tmp.close()
141161

142162
return file_content
143163

@@ -201,25 +221,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
201221
file_summaries = []
202222
if files:
203223
for file in files:
204-
# Fix concurrency issue with the same file name
205-
# https://github.com/opea-project/GenAIExamples/issues/1279
206-
uid = str(uuid.uuid4())
207-
file_path = f"/tmp/{uid}"
208-
209-
import aiofiles
210-
211-
async with aiofiles.open(file_path, "wb") as f:
212-
await f.write(await file.read())
213224

214225
if data_type == "text":
215-
docs = read_text_from_file(file, file_path)
226+
docs = await read_text_from_file(file)
216227
elif data_type in ["audio", "video"]:
217-
docs = encode_file_to_base64(file_path)
228+
docs = await encode_file_to_base64(file)
218229
else:
219230
raise ValueError(f"Data type not recognized: {data_type}")
220231

221-
os.remove(file_path)
222-
223232
if isinstance(docs, list):
224233
file_summaries.extend(docs)
225234
else:

0 commit comments

Comments
 (0)