Skip to content
This repository has been archived by the owner on Jan 5, 2025. It is now read-only.

Commit

Permalink
PDF upload fixed with langchain; metadata is still a work in progress.
Browse files Browse the repository at this point in the history
  • Loading branch information
lvalics committed Feb 22, 2024
1 parent cc24b4e commit 3343c04
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 55 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ dj_backend_server.code-workspace
.aiderignore
dj_backend_server/.vscode/settings.json

dj_backend_server/a.py
dj_backend_server/1.pdf
38 changes: 22 additions & 16 deletions dj_backend_server/api/data_sources/pdf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,31 @@ def pdf_handler(

@csrf_exempt
def process_pdf_with_pypdfium(file_path, directory_path):
pdf = PdfDocument(file_path)
text_pages = []
pdf_document = PdfDocument(file_path)
text_pages_with_numbers = []

for page_index in range(len(pdf)):
page = pdf.get_page(page_index)
for page_index in range(len(pdf_document)):
page = pdf_document.get_page(page_index)
text_page = page.get_textpage() # get a text page handle for this page
text = text_page.get_text_range() # extract text from the text page
text_pages.append(text)
text_pages_with_numbers.append(
(page_index + 1, text)
) # Store page number and text
text_page.close() # close the text page handle

text = "".join(text_pages)
# Combine texts from all pages, prepending each with its page number
combined_text = "\n".join(
[f"Page {num}: {text}" for num, text in text_pages_with_numbers]
)
txt_file_path = os.path.splitext(file_path)[0] + ".txt"
logging.debug(
f"Debug: Writing text to {txt_file_path}, directory_path: {directory_path}, text: {text}"
f"Debug: Writing text with page numbers to {txt_file_path}, directory_path: {directory_path}"
)

with open(txt_file_path, "w") as f:
f.write(text)
f.write(combined_text)

pdf.close()
pdf_document.close()


@csrf_exempt
Expand All @@ -159,6 +164,7 @@ def process_pdf(FilePath, directory_path):
resturl = "http://www.ocrwebservice.com/restservices/processDocument"

RequestUrl = f"{resturl}?pagerange={pagerange}&language={language}&outputformat={outputformat}&gettext={gettext}"
logging.debug(f"Debug: RequestUrl: {RequestUrl}")

try:
with open(FilePath, "rb") as image_file:
Expand Down Expand Up @@ -216,7 +222,7 @@ def process_pdf(FilePath, directory_path):
f"\nThe text: {{text}}. "
)

# print (f"Debug: initial_prompt: {initial_prompt}")
logging.debug(f"Debug: initial_prompt: {initial_prompt}")

# Call LLM and write the result into a new text file
process_text_with_llm(txt_file, mode, initial_prompt)
Expand Down Expand Up @@ -291,13 +297,8 @@ def txt_to_vectordb(
)

docs = text_splitter.split_documents(raw_docs)

logging.debug("external files docs -->", docs)

if not docs:
print("No documents were processed successfully.")
return

embeddings = get_embeddings()

logging.debug(
Expand All @@ -311,6 +312,11 @@ def txt_to_vectordb(
"bot_id": str(pdf_data_source.chatbot.id),
"last_update": pdf_data_source.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
"type": "document",
"doc_type": (
pdf_data_source.files_info[0]["original_name"].split(".")[-1]
if pdf_data_source.files_info
else "unknown"
),
"page": "1", # @TODO to extract the page number.
"folder": pdf_data_source.folder_name,
"original_filename": (
Expand All @@ -321,7 +327,7 @@ def txt_to_vectordb(
},
)
logging.debug(
f"Vector store initialized successfully for namespace: {namespace}."
f"Vector store initialized successfully for metadata: {metadata}."
)

logging.debug(f"Folder need or not to delete. {delete_folder_flag}")
Expand Down
14 changes: 6 additions & 8 deletions dj_backend_server/api/utils/make_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,14 +145,11 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str):

# Send the formatted prompt to LLM and get the result
llm = get_llm()
result = llm(prompt=initial_prompt.format(text=text), temperature=0)

# Check if result is a string
if isinstance(result, str):
response = result
elif isinstance(result, dict):
# Extract only the response from the result
response = result["choices"][0]["message"]["content"]
result = llm.invoke(input=initial_prompt.format(text=text), temperature=0)

# Extract the response from the result
if hasattr(result, "content"):
response = result.content
else:
print(
f"Error: LLM result is not a dictionary or a string. It is a {type(result)} with value {result}"
Expand All @@ -166,6 +163,7 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str):
print(f"Write with value {txt_file_path}")
else:
# Write the response into a new text file
result_file_path = txt_file_path.replace(".txt", "_processed.txt")
result_file_path = txt_file_path.replace(".txt", ".txt")
with open(result_file_path, "w") as result_file:
result_file.write(response)
Expand Down
64 changes: 36 additions & 28 deletions dj_backend_server/api/views/views_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,39 +249,15 @@ def send_chat(request):
)

bot_response = ChatbotResponse(response.json())
for metadata_entry in response_json.get("metadata", []):
print(f"Last Update: {metadata_entry.get('last_update')}")
print(f"Type: {metadata_entry.get('type')}")
print(f"Original Filename: {metadata_entry.get('original_filename')}")
print(f"Source: {metadata_entry.get('source')}")
print(f"Folder: {metadata_entry.get('folder')}")
print(f"Page: {metadata_entry.get('page')}")
print(f"Bot ID: {metadata_entry.get('bot_id')}")
print(f"ID: {metadata_entry.get('_id')}")
print(f"Collection Name: {metadata_entry.get('_collection_name')}")

metadata_items = [
{
"source": entry.get("source"),
"original_filename": entry.get("original_filename"),
}
for entry in response_json.get("metadata", [])
]
metadata_html = render_to_string(
"widgets/metadata.html",
{
"APP_URL": settings.APP_URL,
"session_id": session_id,
"metadata_items": metadata_items,
},
)

feedback_form_html = render_to_string(
"widgets/feedback.html",
{"APP_URL": settings.APP_URL, "session_id": session_id},
)

print(f"Response in JSON {session_id}")
html_compose = metadata_html + feedback_form_html
html_compose = (
metadata_html_append(response_json, session_id) + feedback_form_html
)
return JsonResponse(
{
"type": "text",
Expand Down Expand Up @@ -334,3 +310,35 @@ def handle_feedback(request):
return JsonResponse({"error": "Chat history not found"}, status=404)
except Exception as e:
return JsonResponse({"error": "An error occurred"}, status=500)


def metadata_html_append(response_json, session_id):
    """Render the metadata widget HTML for a chatbot response.

    Prints a debug dump of every metadata entry, then renders
    ``widgets/metadata.html`` with one item per distinct
    ``original_filename``; the first entry seen for a filename wins.

    Args:
        response_json: Dict-like object exposing ``.get``; its optional
            "metadata" key is expected to hold a list of dict entries.
        session_id: Session identifier passed through to the template context.

    Returns:
        The rendered HTML as returned by ``render_to_string``.
    """
    # Treat both a missing key and an explicit null value as "no metadata";
    # ``.get("metadata", [])`` alone would return None for a null value and
    # the loop below would raise TypeError.
    metadata_entries = response_json.get("metadata") or []

    seen_filenames = set()
    metadata_items = []
    for entry in metadata_entries:
        # Debug output for each entry (kept as-is from the original code).
        entry_type = entry.get("type")  # renamed so the builtin `type` is not shadowed
        print(f"Type: {entry_type}")
        print(f"Original Filename: {entry.get('original_filename')}")
        print(f"Source: {entry.get('source')}")
        print(f"Page: {entry.get('page')}")
        print(f"Bot ID: {entry.get('bot_id')}")
        print(f"ID: {entry.get('_id')}")

        # Several chunks can come from the same file; show each file once.
        original_filename = entry.get("original_filename")
        if original_filename in seen_filenames:
            continue
        seen_filenames.add(original_filename)
        metadata_items.append(
            {"source": entry.get("source"), "original_filename": original_filename}
        )

    return render_to_string(
        "widgets/metadata.html",
        {
            "APP_URL": settings.APP_URL,
            "session_id": session_id,
            "metadata_items": metadata_items,
        },
    )
5 changes: 2 additions & 3 deletions dj_backend_server/web/templates/widgets/metadata.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
{% load static %}

<div class="my-5">
<div class="w-full block h-px bg-gray-200 my-2"></div>
<div class="w-full block h-px bg-white-200 my-2"></div>
<div style="color: #000; background-color: #c3c3c3; transition: all 0.2s; padding: 0.25rem 0.5rem; border-radius: 0.375rem;">
<div>
{% for item in metadata_items %}
<div style="margin-bottom: 10px;">
<strong>{% trans "Source:" %}</strong> {{ item.source }}<br>
<strong>{% trans "Original Filename:" %}</strong> {{ item.original_filename }}
<strong>{% trans "Source of data" %}:</strong> <a href = "{{ APP_URL }}/{{ item.source }}" target="_blank" title="{{ item.original_filename }}">{% trans "Download" %}</a><br>
</div>
{% endfor %}
</div>
Expand Down

0 comments on commit 3343c04

Please sign in to comment.