Fixed PDF upload with LangChain; metadata is still a work in progress.

openchatai · Feb 22, 2024 · 3343c04 · 3343c04
1 parent cc24b4e
commit 3343c04
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 55 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,5 @@ dj_backend_server.code-workspace
 .aiderignore
 dj_backend_server/.vscode/settings.json
 
+dj_backend_server/a.py
+dj_backend_server/1.pdf
diff --git a/dj_backend_server/api/data_sources/pdf_handler.py b/dj_backend_server/api/data_sources/pdf_handler.py
@@ -126,26 +126,31 @@ def pdf_handler(
 
 @csrf_exempt
 def process_pdf_with_pypdfium(file_path, directory_path):
-    pdf = PdfDocument(file_path)
-    text_pages = []
+    pdf_document = PdfDocument(file_path)
+    text_pages_with_numbers = []
 
-    for page_index in range(len(pdf)):
-        page = pdf.get_page(page_index)
+    for page_index in range(len(pdf_document)):
+        page = pdf_document.get_page(page_index)
         text_page = page.get_textpage()  # get a text page handle for this page
         text = text_page.get_text_range()  # extract text from the text page
-        text_pages.append(text)
+        text_pages_with_numbers.append(
+            (page_index + 1, text)
+        )  # Store page number and text
         text_page.close()  # close the text page handle
 
-    text = "".join(text_pages)
+    # Combine texts from all pages, prepending each with its page number
+    combined_text = "\n".join(
+        [f"Page {num}: {text}" for num, text in text_pages_with_numbers]
+    )
     txt_file_path = os.path.splitext(file_path)[0] + ".txt"
     logging.debug(
-        f"Debug: Writing text to {txt_file_path}, directory_path: {directory_path}, text: {text}"
+        f"Debug: Writing text with page numbers to {txt_file_path}, directory_path: {directory_path}"
     )
 
     with open(txt_file_path, "w") as f:
-        f.write(text)
+        f.write(combined_text)
 
-    pdf.close()
+    pdf_document.close()
 
 
 @csrf_exempt
@@ -159,6 +164,7 @@ def process_pdf(FilePath, directory_path):
     resturl = "http://www.ocrwebservice.com/restservices/processDocument"
 
     RequestUrl = f"{resturl}?pagerange={pagerange}&language={language}&outputformat={outputformat}&gettext={gettext}"
+    logging.debug(f"Debug: RequestUrl: {RequestUrl}")
 
     try:
         with open(FilePath, "rb") as image_file:
@@ -216,7 +222,7 @@ def process_pdf(FilePath, directory_path):
                 f"\nThe text: {{text}}. "
             )
 
-            # print (f"Debug: initial_prompt: {initial_prompt}")
+            logging.debug(f"Debug: initial_prompt: {initial_prompt}")
 
             # Call LLM and write the result into a new text file
             process_text_with_llm(txt_file, mode, initial_prompt)
@@ -291,13 +297,8 @@ def txt_to_vectordb(
         )
 
         docs = text_splitter.split_documents(raw_docs)
-
         logging.debug("external files docs -->", docs)
 
-        if not docs:
-            print("No documents were processed successfully.")
-            return
-
         embeddings = get_embeddings()
 
         logging.debug(
@@ -311,6 +312,11 @@ def txt_to_vectordb(
                 "bot_id": str(pdf_data_source.chatbot.id),
                 "last_update": pdf_data_source.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
                 "type": "document",
+                "doc_type": (
+                    pdf_data_source.files_info[0]["original_name"].split(".")[-1]
+                    if pdf_data_source.files_info
+                    else "unknown"
+                ),
                 "page": "1",  # @TODO to extract the page number.
                 "folder": pdf_data_source.folder_name,
                 "original_filename": (
@@ -321,7 +327,7 @@ def txt_to_vectordb(
             },
         )
         logging.debug(
-            f"Vector store initialized successfully for namespace: {namespace}."
+            f"Vector store initialized successfully for metadata: {metadata}."
         )
 
         logging.debug(f"Folder need or not to delete. {delete_folder_flag}")

diff --git a/dj_backend_server/api/utils/make_chain.py b/dj_backend_server/api/utils/make_chain.py
@@ -145,14 +145,11 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str):
 
     # Send the formatted prompt to LLM and get the result
     llm = get_llm()
-    result = llm(prompt=initial_prompt.format(text=text), temperature=0)
-
-    # Check if result is a string
-    if isinstance(result, str):
-        response = result
-    elif isinstance(result, dict):
-        # Extract only the response from the result
-        response = result["choices"][0]["message"]["content"]
+    result = llm.invoke(input=initial_prompt.format(text=text), temperature=0)
+
+    # Extract the response from the result
+    if hasattr(result, "content"):
+        response = result.content
     else:
         print(
             f"Error: LLM result is not a dictionary or a string. It is a {type(result)} with value {result}"
@@ -166,6 +163,7 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str):
         print(f"Write with value {txt_file_path}")
     else:
         # Write the response into a new text file
+        result_file_path = txt_file_path.replace(".txt", "_processed.txt")
         result_file_path = txt_file_path.replace(".txt", ".txt")
         with open(result_file_path, "w") as result_file:
             result_file.write(response)

diff --git a/dj_backend_server/api/views/views_message.py b/dj_backend_server/api/views/views_message.py
@@ -249,39 +249,15 @@ def send_chat(request):
                 )
 
         bot_response = ChatbotResponse(response.json())
-        for metadata_entry in response_json.get("metadata", []):
-            print(f"Last Update: {metadata_entry.get('last_update')}")
-            print(f"Type: {metadata_entry.get('type')}")
-            print(f"Original Filename: {metadata_entry.get('original_filename')}")
-            print(f"Source: {metadata_entry.get('source')}")
-            print(f"Folder: {metadata_entry.get('folder')}")
-            print(f"Page: {metadata_entry.get('page')}")
-            print(f"Bot ID: {metadata_entry.get('bot_id')}")
-            print(f"ID: {metadata_entry.get('_id')}")
-            print(f"Collection Name: {metadata_entry.get('_collection_name')}")
-
-        metadata_items = [
-            {
-                "source": entry.get("source"),
-                "original_filename": entry.get("original_filename"),
-            }
-            for entry in response_json.get("metadata", [])
-        ]
-        metadata_html = render_to_string(
-            "widgets/metadata.html",
-            {
-                "APP_URL": settings.APP_URL,
-                "session_id": session_id,
-                "metadata_items": metadata_items,
-            },
-        )
+
         feedback_form_html = render_to_string(
             "widgets/feedback.html",
             {"APP_URL": settings.APP_URL, "session_id": session_id},
         )
 
-        print(f"Response in JSON {session_id}")
-        html_compose = metadata_html + feedback_form_html
+        html_compose = (
+            metadata_html_append(response_json, session_id) + feedback_form_html
+        )
         return JsonResponse(
             {
                 "type": "text",
@@ -334,3 +310,35 @@ def handle_feedback(request):
             return JsonResponse({"error": "Chat history not found"}, status=404)
     except Exception as e:
         return JsonResponse({"error": "An error occurred"}, status=500)
+
+
+def metadata_html_append(response_json, session_id):
+
+    for metadata_entry in response_json.get("metadata", []):
+        type = metadata_entry.get("type")
+        print(f"Type: {type}")
+        print(f"Original Filename: {metadata_entry.get('original_filename')}")
+        print(f"Source: {metadata_entry.get('source')}")
+        print(f"Page: {metadata_entry.get('page')}")
+        print(f"Bot ID: {metadata_entry.get('bot_id')}")
+        print(f"ID: {metadata_entry.get('_id')}")
+
+    seen_filenames = set()
+    metadata_items = []
+    # if the original_filename is the same in for, then show it only one time.
+    for entry in response_json.get("metadata", []):
+        original_filename = entry.get("original_filename")
+        if original_filename not in seen_filenames:
+            metadata_items.append(
+                {"source": entry.get("source"), "original_filename": original_filename}
+            )
+            seen_filenames.add(original_filename)
+
+    return render_to_string(
+        "widgets/metadata.html",
+        {
+            "APP_URL": settings.APP_URL,
+            "session_id": session_id,
+            "metadata_items": metadata_items,
+        },
+    )
diff --git a/dj_backend_server/web/templates/widgets/metadata.html b/dj_backend_server/web/templates/widgets/metadata.html
@@ -2,13 +2,12 @@
 {% load static %}
 
 <div class="my-5">
-    <div class="w-full block h-px bg-gray-200 my-2"></div>
+    <div class="w-full block h-px bg-white-200 my-2"></div>
     <div style="color: #000; background-color: #c3c3c3; transition: all 0.2s; padding: 0.25rem 0.5rem; border-radius: 0.375rem;">
         <div>
             {% for item in metadata_items %}
                 <div style="margin-bottom: 10px;">
-                    <strong>{% trans "Source:" %}</strong> {{ item.source }}<br>
-                    <strong>{% trans "Original Filename:" %}</strong> {{ item.original_filename }}
+                    <strong>{% trans "Source of data" %}:</strong> <a href = "{{ APP_URL }}/{{ item.source }}" target="_blank" title="{{ item.original_filename }}">{% trans "Download" %}</a><br>
                 </div>
             {% endfor %}
         </div>