Skip to content
This repository has been archived by the owner on Jan 5, 2025. It is now read-only.

Commit

Permalink
Merge pull request #232 from lvalics/main
Browse files Browse the repository at this point in the history
Full Screen Fix.
  • Loading branch information
codebanesr authored Jan 28, 2024
2 parents 28abd2f + 98e6572 commit ff9edf8
Show file tree
Hide file tree
Showing 16 changed files with 1,149 additions and 162 deletions.
75 changes: 51 additions & 24 deletions dj_backend_server/api/data_sources/pdf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@
from web.utils.delete_foler import delete_folder
from web.models.failed_jobs import FailedJob
from web.models.pdf_data_sources import PdfDataSource
from pypdfium2 import PdfDocument



@csrf_exempt
def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: Optional[bool] = False, text_data: Optional[str] = None):
def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: Optional[bool] = False, ocr_pdf_file: Optional[bool] = False, text_data: Optional[str] = None):
"""
This function handles PDF files and other types of files in a shared folder. It processes the text data if provided directly,
otherwise it reads from the files in the shared folder. It processes each file based on its extension, converts .doc, .docx, .xls,
Expand All @@ -36,48 +38,55 @@ def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: Optional
shared_folder (str): The name of the shared folder where the files are located.
namespace (str): The namespace for the vector database.
delete_folder_flag (bool): A flag indicating whether to delete the folder after processing the files.
ocr_pdf_file (bool): A flag indicating whether PDF files should be sent to the OCR API service.
text_data (Optional[str], optional): The text data to be processed. If this is provided, the function will not read from
the files. Defaults to None.
Raises:
Exception: If an error occurs during the processing of the files or the conversion of the text data to a vector database.
"""
print ("Debug: pdf_handler")

# If text data is provided directly, process it without reading from files
if text_data:
process_text_data(text_data, namespace)
print ("Debug: text_data is provided directly, process it without reading from files")
return

# Convert delete_folder_flag to boolean (send 0 - FALSE or 1 - TRUE)
# Convert delete_folder_flag and ocr_pdf_file to boolean (send 0 - FALSE or 1 - TRUE)
delete_folder_flag = bool(delete_folder_flag) if delete_folder_flag is not None else False
ocr_pdf_file = bool(ocr_pdf_file) if ocr_pdf_file is not None else False
print (f"Debug: delete_folder_flag: {delete_folder_flag}, ocr_pdf_file: {ocr_pdf_file}")

# Check if the shared_folder is provided, if not, return early as there are no files to process
if not shared_folder:
print("No shared folder provided for file processing.")
return

try:
# TODO: Revisit this check once there are multiple external libraries to choose from.
if os.environ.get("PDF_LIBRARY") == "external":
if shared_folder:
directory_path = os.path.join("website_data_sources", shared_folder)
print(f"Debug: Processing folder {directory_path}")

if os.path.exists(directory_path):
print(f"Debug: Directory exists. Files: {os.listdir(directory_path)}")
else:
print(f"Debug: No shared folder provided for file processing.")
return

# Process each file in the directory based on its extension
for filename in os.listdir(directory_path):
file_path = os.path.join(directory_path, filename)
if filename.endswith(".pdf"):
directory_path = os.path.join("website_data_sources", shared_folder)
print(f"Debug: Processing folder {directory_path}")

if not os.path.exists(directory_path):
print(f"Debug: Directory {directory_path} does not exist.")
return

print(f"Debug: Directory exists. Files: {os.listdir(directory_path)}")

# Process each file in the directory based on its extension
for filename in os.listdir(directory_path):
file_path = os.path.join(directory_path, filename)
if filename.endswith(".pdf"):
if ocr_pdf_file == True:
process_pdf(file_path, directory_path)
elif filename.endswith((".txt", ".csv", ".json")):
save_as_txt(file_path)
elif filename.endswith((".doc", ".docx", ".xls", ".xlsx")):
convert_to_txt(file_path)
print(f"Debug: OCR PDF file {ocr_pdf_file}")
else:
process_pdf_with_pypdfium(file_path, directory_path)
print(f"Debug: Not need to send to OCR API {ocr_pdf_file}")
elif filename.endswith((".txt", ".csv", ".json")):
save_as_txt(file_path)
elif filename.endswith((".doc", ".docx", ".xls", ".xlsx")):
convert_to_txt(file_path)

txt_to_vectordb(shared_folder, namespace, delete_folder_flag)

Expand All @@ -89,11 +98,29 @@ def pdf_handler(shared_folder: str, namespace: str, delete_folder_flag: Optional
print("Exception occurred:", e)
traceback.print_exc()

@csrf_exempt
def process_pdf_with_pypdfium(file_path, directory_path):
    """
    Extract the text of a PDF with pypdfium2 and save it as a .txt file.

    The text of every page is concatenated (without separators) and written to a
    file that has the same path as the PDF but with a '.txt' extension.

    Args:
        file_path (str): Path of the PDF file to read.
        directory_path (str): Folder that contains the PDF; only used in the debug output.
    """
    text_pages = []
    pdf = PdfDocument(file_path)
    try:
        for page_index in range(len(pdf)):
            page = pdf.get_page(page_index)
            try:
                text_page = page.get_textpage()  # text page handle for this page
                try:
                    text_pages.append(text_page.get_text_range())
                finally:
                    # Child handles must be released before their parents.
                    text_page.close()
            finally:
                page.close()
    finally:
        # Always release the document, even if extraction of a page failed.
        pdf.close()

    text = ''.join(text_pages)
    txt_file_path = os.path.splitext(file_path)[0] + '.txt'
    # Report the size only: dumping the whole document would flood the logs.
    print(f"Debug: Writing text to {txt_file_path}, directory: {directory_path}, characters: {len(text)}")

    # Explicit encoding so non-ASCII text does not depend on the platform locale.
    with open(txt_file_path, 'w', encoding='utf-8') as f:
        f.write(text)

@csrf_exempt
def process_pdf(FilePath,directory_path):
#pdf_data_source = PdfDataSource.objects.get(folder_name=FilePath)
#pdf_data_source = PdfDataSource.objects.get(folder_name=directory_path)
UserName = os.environ.get("OCR_USERNAME")
LicenseCode = os.environ.get("OCR_LICCODE")
gettext = True
Expand Down
4 changes: 3 additions & 1 deletion dj_backend_server/api/pdf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def upload_pdf_api(request):
API endpoint for uploading PDF files. It expects a POST request with the following parameters:
- 'X-Bot-Token' header: A token to authenticate the chatbot.
- 'delete_folder_flag': A flag indicating whether to delete the folder after processing (0 or 1).
- 'ocr_pdf_file': A flag indicating whether the file needs to be sent to the OCR API (0 or 1).
- 'pdffiles': The PDF file(s) to be uploaded. Can be a single file or multiple files.
"""

Expand All @@ -28,6 +29,7 @@ def upload_pdf_api(request):
return JsonResponse({'error': 'Invalid token'}, status=403)

delete_folder_flag = request.POST.get('delete_folder_flag', '0') == '1'
ocr_pdf_file = request.POST.get('ocr_pdf_file', '0') == '1'
files = request.FILES.getlist('pdffiles')
text_data = request.POST.get('text_data', '')

Expand All @@ -37,6 +39,6 @@ def upload_pdf_api(request):
print (f"text_data: {data_source}")

# Trigger the PdfDataSourceWasAdded event
pdf_data_source_added.send(sender='create_via_pdf_flow', bot_id=bot.id, data_source_id=data_source.id, delete_folder_flag=delete_folder_flag)
pdf_data_source_added.send(sender='create_via_pdf_flow', bot_id=bot.id, data_source_id=data_source.id, delete_folder_flag=delete_folder_flag, ocr_pdf_file=ocr_pdf_file, text_data=text_data)
return JsonResponse({'message': 'PDF uploaded and chatbot created successfully', 'data_source_id': data_source.id, 'bot_id': bot.id})

4 changes: 2 additions & 2 deletions dj_backend_server/api/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from web.workers.crawler import start_recursive_crawler

@shared_task
def pdf_handler_task(shared_folder, namespace, delete_folder_flag):
return pdf_handler(shared_folder=shared_folder, namespace=namespace, delete_folder_flag=delete_folder_flag)
def pdf_handler_task(shared_folder, namespace, delete_folder_flag, ocr_pdf_file):
return pdf_handler(shared_folder=shared_folder, namespace=namespace, delete_folder_flag=delete_folder_flag, ocr_pdf_file=ocr_pdf_file)

@shared_task
def website_handler_task(shared_folder, namespace):
Expand Down
3 changes: 2 additions & 1 deletion dj_backend_server/api/views/views_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def ingest(request):

if type_ == 'pdf':
delete_folder_flag = data.get('delete_folder_flag', False)
pdf_handler_task.delay(shared_folder, namespace, delete_folder_flag)
ocr_pdf_file = data.get('ocr_pdf_file', False)
pdf_handler_task.delay(shared_folder, namespace, delete_folder_flag, ocr_pdf_file)
elif type_ == 'website':
print("Calling website handler task")
website_handler_task.delay(shared_folder, namespace)
Expand Down
11 changes: 6 additions & 5 deletions dj_backend_server/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ grpcio-tools==1.56.2
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==0.17.3
httpx==0.24.1
httpcore==1.0.2
httpx==0.25.2
hyperframe==6.0.1
idna==3.4
idna==3.6
kombu==5.3.1
langchain==0.0.247
langsmith==0.0.15
Expand All @@ -54,7 +54,7 @@ pycparser==2.21
pydantic==1.10.12
PyMySQL==1.1.0
pypdf==3.14.0
pypdfium2==4.18.0
pypdfium2==4.26.0
python-dateutil==2.8.2
python-dotenv==1.0.0
python-docx==1.1.0
Expand All @@ -78,4 +78,5 @@ urllib3==1.26.16
vine==5.0.0
wcwidth==0.2.6
yarl==1.9.2
django-cors-headers==4.3.1
django-cors-headers==4.3.1
ollama==0.1.4
2 changes: 2 additions & 0 deletions dj_backend_server/web/listeners/ingest_pdf_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def ingest_pdf_datasource(sender, **kwargs):
bot_id = kwargs['bot_id']
pdf_data_source_id = kwargs['data_source_id']
delete_folder_flag = kwargs['delete_folder_flag']
ocr_pdf_file = kwargs.get('ocr_pdf_file', False)

try:
pdf_data_source = PdfDataSource.objects.get(id=pdf_data_source_id)
Expand All @@ -24,6 +25,7 @@ def ingest_pdf_datasource(sender, **kwargs):
'shared_folder': pdf_data_source.folder_name,
'namespace': str(bot_id),
'delete_folder_flag': delete_folder_flag,
'ocr_pdf_file': ocr_pdf_file,
}

try:
Expand Down
Loading

0 comments on commit ff9edf8

Please sign in to comment.