From 3343c0492c2cc74d9786ea6afe330ee427db3875 Mon Sep 17 00:00:00 2001 From: lvalics Date: Thu, 22 Feb 2024 12:18:36 +0000 Subject: [PATCH] Fix PDF upload with langchain; metadata handling is still a work in progress. --- .gitignore | 2 + .../api/data_sources/pdf_handler.py | 38 ++++++----- dj_backend_server/api/utils/make_chain.py | 14 ++-- dj_backend_server/api/views/views_message.py | 64 +++++++++++-------- .../web/templates/widgets/metadata.html | 5 +- 5 files changed, 68 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index 2a1061ec..a7513790 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ dj_backend_server.code-workspace .aiderignore dj_backend_server/.vscode/settings.json +dj_backend_server/a.py +dj_backend_server/1.pdf diff --git a/dj_backend_server/api/data_sources/pdf_handler.py b/dj_backend_server/api/data_sources/pdf_handler.py index 119e740b..4bb5ef3f 100644 --- a/dj_backend_server/api/data_sources/pdf_handler.py +++ b/dj_backend_server/api/data_sources/pdf_handler.py @@ -126,26 +126,31 @@ def pdf_handler( @csrf_exempt def process_pdf_with_pypdfium(file_path, directory_path): - pdf = PdfDocument(file_path) - text_pages = [] + pdf_document = PdfDocument(file_path) + text_pages_with_numbers = [] - for page_index in range(len(pdf)): - page = pdf.get_page(page_index) + for page_index in range(len(pdf_document)): + page = pdf_document.get_page(page_index) text_page = page.get_textpage() # get a text page handle for this page text = text_page.get_text_range() # extract text from the text page - text_pages.append(text) + text_pages_with_numbers.append( + (page_index + 1, text) + ) # Store page number and text text_page.close() # close the text page handle - text = "".join(text_pages) + # Combine texts from all pages, prepending each with its page number + combined_text = "\n".join( + [f"Page {num}: {text}" for num, text in text_pages_with_numbers] + ) txt_file_path = os.path.splitext(file_path)[0] + ".txt" logging.debug( - f"Debug: 
Writing text to {txt_file_path}, directory_path: {directory_path}, text: {text}" + f"Debug: Writing text with page numbers to {txt_file_path}, directory_path: {directory_path}" ) with open(txt_file_path, "w") as f: - f.write(text) + f.write(combined_text) - pdf.close() + pdf_document.close() @csrf_exempt @@ -159,6 +164,7 @@ def process_pdf(FilePath, directory_path): resturl = "http://www.ocrwebservice.com/restservices/processDocument" RequestUrl = f"{resturl}?pagerange={pagerange}&language={language}&outputformat={outputformat}&gettext={gettext}" + logging.debug(f"Debug: RequestUrl: {RequestUrl}") try: with open(FilePath, "rb") as image_file: @@ -216,7 +222,7 @@ def process_pdf(FilePath, directory_path): f"\nThe text: {{text}}. " ) - # print (f"Debug: initial_prompt: {initial_prompt}") + logging.debug(f"Debug: initial_prompt: {initial_prompt}") # Call LLM and write the result into a new text file process_text_with_llm(txt_file, mode, initial_prompt) @@ -291,13 +297,8 @@ def txt_to_vectordb( ) docs = text_splitter.split_documents(raw_docs) - logging.debug("external files docs -->", docs) - if not docs: - print("No documents were processed successfully.") - return - embeddings = get_embeddings() logging.debug( @@ -311,6 +312,11 @@ def txt_to_vectordb( "bot_id": str(pdf_data_source.chatbot.id), "last_update": pdf_data_source.updated_at.strftime("%Y-%m-%d %H:%M:%S"), "type": "document", + "doc_type": ( + pdf_data_source.files_info[0]["original_name"].split(".")[-1] + if pdf_data_source.files_info + else "unknown" + ), "page": "1", # @TODO to extract the page number. "folder": pdf_data_source.folder_name, "original_filename": ( @@ -321,7 +327,7 @@ def txt_to_vectordb( }, ) logging.debug( - f"Vector store initialized successfully for namespace: {namespace}." + f"Vector store initialized successfully for metadata: {metadata}." ) logging.debug(f"Folder need or not to delete. 
{delete_folder_flag}") diff --git a/dj_backend_server/api/utils/make_chain.py b/dj_backend_server/api/utils/make_chain.py index 34fd940b..17838592 100644 --- a/dj_backend_server/api/utils/make_chain.py +++ b/dj_backend_server/api/utils/make_chain.py @@ -145,14 +145,11 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str): # Send the formatted prompt to LLM and get the result llm = get_llm() - result = llm(prompt=initial_prompt.format(text=text), temperature=0) - - # Check if result is a string - if isinstance(result, str): - response = result - elif isinstance(result, dict): - # Extract only the response from the result - response = result["choices"][0]["message"]["content"] + result = llm.invoke(input=initial_prompt.format(text=text), temperature=0) + + # Extract the response from the result + if hasattr(result, "content"): + response = result.content else: print( f"Error: LLM result is not a dictionary or a string. It is a {type(result)} with value {result}" @@ -166,6 +163,7 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str): print(f"Write with value {txt_file_path}") else: # Write the response into a new text file + result_file_path = txt_file_path.replace(".txt", "_processed.txt") result_file_path = txt_file_path.replace(".txt", ".txt") with open(result_file_path, "w") as result_file: result_file.write(response) diff --git a/dj_backend_server/api/views/views_message.py b/dj_backend_server/api/views/views_message.py index f41c63fd..19952e3e 100644 --- a/dj_backend_server/api/views/views_message.py +++ b/dj_backend_server/api/views/views_message.py @@ -249,39 +249,15 @@ def send_chat(request): ) bot_response = ChatbotResponse(response.json()) - for metadata_entry in response_json.get("metadata", []): - print(f"Last Update: {metadata_entry.get('last_update')}") - print(f"Type: {metadata_entry.get('type')}") - print(f"Original Filename: {metadata_entry.get('original_filename')}") - print(f"Source: 
{metadata_entry.get('source')}") - print(f"Folder: {metadata_entry.get('folder')}") - print(f"Page: {metadata_entry.get('page')}") - print(f"Bot ID: {metadata_entry.get('bot_id')}") - print(f"ID: {metadata_entry.get('_id')}") - print(f"Collection Name: {metadata_entry.get('_collection_name')}") - - metadata_items = [ - { - "source": entry.get("source"), - "original_filename": entry.get("original_filename"), - } - for entry in response_json.get("metadata", []) - ] - metadata_html = render_to_string( - "widgets/metadata.html", - { - "APP_URL": settings.APP_URL, - "session_id": session_id, - "metadata_items": metadata_items, - }, - ) + feedback_form_html = render_to_string( "widgets/feedback.html", {"APP_URL": settings.APP_URL, "session_id": session_id}, ) - print(f"Response in JSON {session_id}") - html_compose = metadata_html + feedback_form_html + html_compose = ( + metadata_html_append(response_json, session_id) + feedback_form_html + ) return JsonResponse( { "type": "text", @@ -334,3 +310,35 @@ def handle_feedback(request): return JsonResponse({"error": "Chat history not found"}, status=404) except Exception as e: return JsonResponse({"error": "An error occurred"}, status=500) + + +def metadata_html_append(response_json, session_id): + + for metadata_entry in response_json.get("metadata", []): + type = metadata_entry.get("type") + print(f"Type: {type}") + print(f"Original Filename: {metadata_entry.get('original_filename')}") + print(f"Source: {metadata_entry.get('source')}") + print(f"Page: {metadata_entry.get('page')}") + print(f"Bot ID: {metadata_entry.get('bot_id')}") + print(f"ID: {metadata_entry.get('_id')}") + + seen_filenames = set() + metadata_items = [] + # if the original_filename is the same in for, then show it only one time. 
+ for entry in response_json.get("metadata", []): + original_filename = entry.get("original_filename") + if original_filename not in seen_filenames: + metadata_items.append( + {"source": entry.get("source"), "original_filename": original_filename} + ) + seen_filenames.add(original_filename) + + return render_to_string( + "widgets/metadata.html", + { + "APP_URL": settings.APP_URL, + "session_id": session_id, + "metadata_items": metadata_items, + }, + ) diff --git a/dj_backend_server/web/templates/widgets/metadata.html b/dj_backend_server/web/templates/widgets/metadata.html index a5020a28..98e24ff2 100644 --- a/dj_backend_server/web/templates/widgets/metadata.html +++ b/dj_backend_server/web/templates/widgets/metadata.html @@ -2,13 +2,12 @@ {% load static %}
-
+
{% for item in metadata_items %}
- {% trans "Source:" %} {{ item.source }}
- {% trans "Original Filename:" %} {{ item.original_filename }} + {% trans "Source of data" %}: {% trans "Download" %}
{% endfor %}