-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_processing.py
55 lines (50 loc) · 2.33 KB
/
file_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from zipfile import BadZipFile
def process_pdf(file_path, chunk_size=500, chunk_overlap=100):
    """Extract the text of a PDF and split it into overlapping chunks.

    The text of every page is concatenated, each page followed by a newline,
    wrapped in a single ``Document`` and handed to
    ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by ``split_documents`` for the extracted text.
    """
    reader = PdfReader(file_path)
    # A page that yields no text (None or "") still contributes its trailing
    # newline; the `or ""` guard keeps such a page from raising TypeError on
    # concatenation. A single join also avoids rebuilding the string per page.
    text = "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
    documents = [Document(page_content=text)]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)
def process_docx(file_path, chunk_size=500, chunk_overlap=100):
    """Extract the paragraph text of a DOCX file and split it into chunks.

    Paragraph texts are joined with newlines, wrapped in a single
    ``Document`` and handed to ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Location of the DOCX file, passed to ``DocxDocument``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by ``split_documents`` for the extracted text.

    Raises:
        ValueError: If reading the file raises ``BadZipFile`` (not a valid
            archive) or ``KeyError`` (an expected archive member is absent).
            The original exception is attached as ``__cause__``.
    """
    # Only the parsing step is guarded, so a KeyError raised later by the
    # splitter is not misreported as a broken archive.
    try:
        doc = DocxDocument(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    except BadZipFile as e:
        raise ValueError(
            "The DOCX file is corrupted or not a valid DOCX file."
        ) from e
    except KeyError as e:
        # zipfile's KeyError already carries a full sentence ("There is no
        # item named ... in the archive"), so it is reported verbatim rather
        # than wrapped in the same sentence a second time.
        detail = e.args[0] if e.args else e
        raise ValueError(
            f"The DOCX file is missing a required part: {detail}"
        ) from e

    documents = [Document(page_content=text)]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)
def process_txt(file_path, chunk_size=500, chunk_overlap=100):
    """Read a UTF-8 text file and split its contents into chunks.

    Args:
        file_path: Location of the text file; opened for reading as UTF-8.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by ``split_documents`` for the file's contents.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()

    # The whole file becomes one Document; the splitter does the chunking.
    whole_file = Document(page_content=contents)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents([whole_file])
def process_file(file_path, chunk_size=500, chunk_overlap=100):
    """Dispatch a file to the matching processor based on its extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.txt`` are routed to ``process_pdf``, ``process_docx`` and
    ``process_txt`` respectively.

    Args:
        file_path: Path of the file to process.
        chunk_size: Passed through to the selected processor.
        chunk_overlap: Passed through to the selected processor.

    Returns:
        Whatever the selected processor returns.

    Raises:
        ValueError: If the extension is not one of the supported three.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Reject unknown formats up front, then fall through the supported ones.
    if suffix not in (".pdf", ".docx", ".txt"):
        raise ValueError(f"Unsupported file format: {suffix}")
    if suffix == ".pdf":
        return process_pdf(file_path, chunk_size, chunk_overlap)
    if suffix == ".docx":
        return process_docx(file_path, chunk_size, chunk_overlap)
    return process_txt(file_path, chunk_size, chunk_overlap)
if __name__ == "__main__":
    # Manual smoke test: process one file and print its first chunk.
    import sys

    # A path given on the command line overrides the developer's sample file.
    sample_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/Users/leonidstepanov/Desktop/site 2/Uploads/4.pdf"
    )
    try:
        result = process_file(sample_path, 500, 100)
        if result:
            print(result[0].page_content)
        else:
            print("No content processed.")
    except Exception as e:
        # Top-level boundary of a script: report any failure instead of a
        # traceback. The message no longer claims the file is a DOCX, since
        # any supported format can end up here.
        print(f"Ошибка при обработке файла: {e}")