Dev to Staging #1210

Merged: 125 commits into staging from dev on Apr 1, 2025

Changes from all commits (125 commits)
26581db
Read only mode for unauthenticated users (#1046)
kartikpersistent Jan 30, 2025
449552d
langchain updates (#1048)
prakriti-solankey Jan 30, 2025
2d86c5c
testing script changed for better logging errors and results of vario…
kaustubh-darekar Jan 30, 2025
578efad
Deepseek models integration (#1051)
kaustubh-darekar Jan 30, 2025
339488a
fixed top-line of drop-area (#1049)
kartikpersistent Jan 30, 2025
1dbd902
Schema viz (#1035)
prakriti-solankey Jan 30, 2025
6f3f863
updated to new ndl minor version and fixed sources modal display for …
kartikpersistent Feb 3, 2025
38eb72e
Chunk size overlap config (#1059)
prakriti-solankey Feb 7, 2025
6e60361
fix-load-existing-schema (#1061)
dhiaaeddine16 Feb 10, 2025
228ab9b
added bug report feature request and format fixes
kartikpersistent Feb 11, 2025
fbd9d3e
configured dependenabot for python
kartikpersistent Feb 11, 2025
6893a26
configuration fix
kartikpersistent Feb 11, 2025
4b831df
Fixed the logging time issue
praveshkumar1988 Feb 11, 2025
d53ba43
Backend connection config (#1060)
prakriti-solankey Feb 11, 2025
71e013e
Unable to get the status of document node resolved due to leading spa…
kaustubh-darekar Feb 11, 2025
d9a89f8
updated dependency
kartikpersistent Feb 11, 2025
2c8fe2a
Merge branch 'dev' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Feb 11, 2025
192a1bc
always show schema button
prakriti-solankey Feb 11, 2025
e8e576f
always show schema button
prakriti-solankey Feb 11, 2025
ed69115
uri
prakriti-solankey Feb 11, 2025
3624feb
Update README.md
kartikpersistent Feb 11, 2025
738eecc
Update README.md
kartikpersistent Feb 11, 2025
69a1003
Update README.md
kartikpersistent Feb 12, 2025
4c48124
Update README.md
kartikpersistent Feb 12, 2025
bd917ae
Fixed the create community issue for backend connection configuration
praveshkumar1988 Feb 12, 2025
8b8368b
removal of unused code
prakriti-solankey Feb 12, 2025
5591741
Support added for gpt 3o mini & gemini flash 2.0 in dev (#1069)
kaustubh-darekar Feb 12, 2025
6762367
Cancelling the API's on Unmounting phase (#1068)
kartikpersistent Feb 13, 2025
7f075d5
Merge branch 'staging' into dev
prakriti-solankey Feb 13, 2025
6a6c82c
removed unused neo4j-driver
kartikpersistent Feb 13, 2025
f198ca5
added auth0 in the frame src
kartikpersistent Feb 13, 2025
a72f3cf
message change
prakriti-solankey Feb 14, 2025
43d3bed
Update docker-compose.yml
kartikpersistent Feb 16, 2025
7bd5dd3
Bump tailwindcss from 3.4.9 to 4.0.6 in /frontend (#1091)
dependabot[bot] Feb 17, 2025
c198260
message check
prakriti-solankey Feb 17, 2025
a0cd597
V0.7.1 documentation updates (#1094)
kartikpersistent Feb 17, 2025
1cc723c
Merge branch 'staging' into dev
kartikpersistent Feb 17, 2025
db20e29
Merge branch 'staging' into dev
prakriti-solankey Feb 17, 2025
896bdee
Bump react-dropzone from 14.2.3 to 14.3.5 in /frontend (#1084)
dependabot[bot] Feb 18, 2025
8938cd1
Bump @typescript-eslint/eslint-plugin from 6.21.0 to 7.0.0 in /fronte…
dependabot[bot] Feb 18, 2025
4ea5305
Bump eslint-plugin-react-hooks from 4.6.2 to 5.1.0 in /frontend (#1082)
dependabot[bot] Feb 18, 2025
74e8bdc
Bump typescript from 5.5.4 to 5.7.3 in /frontend (#1081)
dependabot[bot] Feb 18, 2025
5444e6b
fix-additional-instructions (#1089)
dhiaaeddine16 Feb 18, 2025
041837d
V0.7.1 minor fixes (#1097)
praveshkumar1988 Feb 19, 2025
e81655d
remove try except from llm.py
praveshkumar1988 Feb 19, 2025
78f1015
Remove example.env from main folder (#1099)
praveshkumar1988 Feb 19, 2025
fcb6bcc
moved to taulwind 3
kartikpersistent Feb 20, 2025
5c2029c
tailwind 4 migration
kartikpersistent Feb 20, 2025
36f2548
format fixes
kartikpersistent Feb 20, 2025
7e7a2c3
Source list api convert to post (#1102)
kartikpersistent Feb 20, 2025
7186ac4
Merge branch 'staging' into dev
prakriti-solankey Feb 20, 2025
7f8b8c9
height issue
prakriti-solankey Feb 20, 2025
cf92222
fix: Profile CSS Fix
kartikpersistent Feb 20, 2025
2274706
fix: display flex issue fix
kartikpersistent Feb 20, 2025
28780c5
Merge branch 'staging' into dev
prakriti-solankey Feb 21, 2025
c1b7b4d
Update dependabot.yml (#1122)
kaustubh-darekar Feb 24, 2025
5ca76aa
added automated linting and formatting through husky hooks
kartikpersistent Feb 24, 2025
0cf3f32
renamed the files
kartikpersistent Feb 24, 2025
2c9d1d6
husky setup fix
kartikpersistent Feb 24, 2025
381dc16
added permission
kartikpersistent Feb 24, 2025
97f0fd2
test commiy
kartikpersistent Feb 24, 2025
e4f1e91
type checking through husky hooks
kartikpersistent Feb 24, 2025
cc158d1
something bad code
kartikpersistent Feb 24, 2025
b88c7df
some bad code
kartikpersistent Feb 24, 2025
17ff72c
some bad code
kartikpersistent Feb 24, 2025
2f3f164
testing pre-commit code
kartikpersistent Feb 24, 2025
36c9c53
testing pre-commit code
kartikpersistent Feb 24, 2025
2bb53b1
lint setup on staged commits
kartikpersistent Feb 24, 2025
53da28c
test commt
kartikpersistent Feb 24, 2025
06d9b4f
test commit with errors
kartikpersistent Feb 24, 2025
2d084da
fix
kartikpersistent Feb 24, 2025
0b46eba
added pypandoc-binary package for OSError: No pandoc was found during…
kaustubh-darekar Feb 24, 2025
269d76b
added document plus icon
kartikpersistent Feb 25, 2025
f79feb8
Bump axios from 1.7.3 to 1.7.9 in /frontend (#1113)
dependabot[bot] Feb 25, 2025
ddb5852
Bump eslint-plugin-react-refresh from 0.4.9 to 0.4.19 in /frontend (#…
dependabot[bot] Feb 25, 2025
fcae55a
Bump postcss from 8.4.41 to 8.5.3 in /frontend (#1114)
dependabot[bot] Feb 25, 2025
69db442
Bump react-icons from 5.2.1 to 5.5.0 in /frontend (#1115)
dependabot[bot] Feb 25, 2025
b761f23
different url web page having same title issue fixed (#1110)
kaustubh-darekar Feb 25, 2025
3a222cc
Text file encoding issue (#1126)
kaustubh-darekar Feb 25, 2025
ede3095
Resolved UnicodeDecodeError issue for files having other than utf-8 e…
kaustubh-darekar Feb 26, 2025
370ab9e
Sanitizing additional instruction (#1130)
kaustubh-darekar Feb 26, 2025
ebbabd3
resolved UnboundLocalError: local variable 'graphDb_data_Access' refe…
kaustubh-darekar Feb 26, 2025
aca4f81
connection not there message for data resources (#1131)
prakriti-solankey Feb 26, 2025
455269b
dockerfile updates and utils functions change
prakriti-solankey Feb 26, 2025
e81aa00
fix: readonly issue fix
kartikpersistent Feb 27, 2025
a1ed635
Resolved uploaded file extraction failing on deployed version (#1136)
kaustubh-darekar Feb 27, 2025
cf11494
UI fixes v0.7.2 (#1138)
kartikpersistent Mar 3, 2025
979434d
Update BreakDownPopOver.tsx
kartikpersistent Mar 3, 2025
41b0370
chunk_count_val
prakriti-solankey Mar 3, 2025
bfe127f
type error
prakriti-solankey Mar 3, 2025
93ff881
spell fixes and protected route fixes
kartikpersistent Mar 3, 2025
a1a998e
top entities not found - bug resolved (#1150)
kaustubh-darekar Mar 4, 2025
506dfb0
limiting content fetching to current wikipedia page (#1151)
kaustubh-darekar Mar 4, 2025
3c8d669
added the link for login redirectig
kartikpersistent Mar 4, 2025
a8fb41a
removed loading statw
kartikpersistent Mar 5, 2025
de69dbd
added the padding and changed the message
kartikpersistent Mar 5, 2025
1ee0112
Bump re-resizable from 6.9.17 to 6.11.2 in /frontend (#1149)
dependabot[bot] Mar 5, 2025
e26a2e2
Bump eslint-plugin-react from 7.35.0 to 7.37.4 in /frontend (#1148)
dependabot[bot] Mar 5, 2025
2fe68f7
Bump @types/node from 20.14.14 to 22.13.9 in /frontend (#1152)
dependabot[bot] Mar 5, 2025
6f1e96d
Bump eslint-config-prettier from 8.10.0 to 10.0.2 in /frontend (#1146)
dependabot[bot] Mar 5, 2025
9e427c8
Bump react-dropzone from 14.3.5 to 14.3.8 in /frontend (#1145)
dependabot[bot] Mar 5, 2025
0090ae1
Update dependabot.yml
kartikpersistent Mar 5, 2025
738bc5b
Update the query to check DB is gds version (#1153)
praveshkumar1988 Mar 6, 2025
36c3fa4
Entity details shown for entity mode (#1154)
kaustubh-darekar Mar 6, 2025
dc0b83c
Merge branch 'staging' into dev
kartikpersistent Mar 6, 2025
e1fa2d5
bracket missing
prakriti-solankey Mar 6, 2025
ab75932
fix: auth 0 fix
kartikpersistent Mar 6, 2025
5f39980
Merge branch 'staging' into dev
kartikpersistent Mar 6, 2025
1014119
fixes (#1170)
kartikpersistent Mar 10, 2025
0f9c9a2
Bump @neo4j-nvl/react from 0.3.6 to 0.3.7 in /frontend (#1163)
dependabot[bot] Mar 17, 2025
d74b5ea
Bump @tailwindcss/postcss from 4.0.7 to 4.0.12 in /frontend (#1162)
dependabot[bot] Mar 17, 2025
11003a9
Bump prettier from 2.8.8 to 3.5.3 in /frontend (#1161)
dependabot[bot] Mar 17, 2025
cb907be
Bump @types/node from 22.13.9 to 22.13.10 in /frontend (#1160)
dependabot[bot] Mar 17, 2025
c67abd5
Bump axios from 1.7.9 to 1.8.2 in /frontend (#1159)
dependabot[bot] Mar 17, 2025
927c372
gitignore changes
kartikpersistent Mar 17, 2025
8604b5b
border missing for graph
prakriti-solankey Mar 17, 2025
7bde2ab
openai 4.5 and claude 3.7 added (#1181)
kaustubh-darekar Mar 18, 2025
59aebba
Handled deadlock errors in executing cypher query (#1187)
kaustubh-darekar Mar 19, 2025
7eb344e
Updating dependencies (#1189)
kaustubh-darekar Mar 19, 2025
93765a3
updating node & rel count in between extraction process (#1191)
kaustubh-darekar Mar 19, 2025
4dd1299
fix: Database name not being passed
kartikpersistent Mar 20, 2025
14e81c7
added generic type for queue
kartikpersistent Mar 21, 2025
7273d88
Fix : default value of function param
praveshkumar1988 Mar 24, 2025
4db55ab
log the info only when last chunk uploaded and merge the file
praveshkumar1988 Mar 24, 2025
81e7255
Product tour v1 (#1186)
kartikpersistent Mar 25, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -172,4 +172,5 @@ google-cloud-cli-linux-x86_64.tar.gz
.vennv
newenv
files

startupbackend.sh
startupfrontend.sh
86 changes: 43 additions & 43 deletions backend/requirements.txt
@@ -1,63 +1,63 @@
asyncio==3.4.3
boto3==1.36.2
botocore==1.36.2
certifi==2024.8.30
fastapi==0.115.6
boto3==1.37.11
botocore==1.37.11
certifi==2025.1.31
fastapi==0.115.11
fastapi-health==0.4.0
google-api-core==2.24.0
google-auth==2.37.0
google-api-core==2.24.2
google-auth==2.38.0
google_auth_oauthlib==1.2.1
google-cloud-core==2.4.1
json-repair==0.30.2
google-cloud-core==2.4.3
json-repair==0.30.3
pip-install==1.3.5
langchain==0.3.15
langchain-aws==0.2.11
langchain-anthropic==0.3.3
langchain-fireworks==0.2.6
langchain-community==0.3.15
langchain-core==0.3.31
langchain==0.3.20
langchain-aws==0.2.15
langchain-anthropic==0.3.9
langchain-fireworks==0.2.7
langchain-community==0.3.19
langchain-core==0.3.45
langchain-experimental==0.3.4
langchain-google-vertexai==2.0.11
langchain-groq==0.2.3
langchain-openai==0.3.1
langchain-text-splitters==0.3.5
langchain-google-vertexai==2.0.15
langchain-groq==0.2.5
langchain-openai==0.3.8
langchain-text-splitters==0.3.6
langchain-huggingface==0.1.2
langdetect==1.0.9
langsmith==0.2.11
langsmith==0.3.13
langserve==0.3.1
neo4j-rust-ext
nltk==3.9.1
openai==1.59.9
opencv-python==4.10.0.84
psutil==6.1.0
pydantic==2.9.2
openai==1.66.2
opencv-python==4.11.0.86
psutil==7.0.0
pydantic==2.10.6
python-dotenv==1.0.1
python-magic==0.4.27
PyPDF2==3.0.1
PyMuPDF==1.24.14
starlette==0.41.3
sse-starlette==2.1.3
PyMuPDF==1.25.3
starlette==0.46.1
sse-starlette==2.2.1
starlette-session==0.4.3
tqdm==4.67.1
unstructured[all-docs]
unstructured==0.16.11
unstructured-client==0.28.1
unstructured-inference==0.8.1
urllib3==2.2.2
uvicorn==0.32.1
unstructured==0.16.25
unstructured-client==0.31.1
unstructured-inference==0.8.9
urllib3==2.3.0
uvicorn==0.34.0
gunicorn==23.0.0
wikipedia==1.4.0
wrapt==1.16.0
yarl==1.9.4
youtube-transcript-api==0.6.3
zipp==3.17.0
sentence-transformers==3.3.1
google-cloud-logging==3.11.3
pypandoc==1.13
graphdatascience==1.12
Secweb==1.11.0
ragas==0.2.11
wrapt==1.17.2
yarl==1.18.3
youtube-transcript-api==1.0.0
zipp==3.21.0
sentence-transformers==3.4.1
google-cloud-logging==3.11.4
pypandoc==1.15
graphdatascience==1.14
Secweb==1.18.1
ragas==0.2.14
rouge_score==0.1.2
langchain-neo4j==0.3.0
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
chardet==5.2.0
9 changes: 5 additions & 4 deletions backend/score.py
@@ -576,9 +576,10 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
end = time.time()
elapsed_time = end - start
json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
logger.log_struct(json_obj, "INFO")
if int(chunkNumber) == int(totalChunks):
json_obj = {'api_name':'upload','db_url':uri,'userName':userName, 'database':database, 'chunkNumber':chunkNumber,'totalChunks':totalChunks,
'original_file_name':originalname,'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
logger.log_struct(json_obj, "INFO")
if int(chunkNumber) == int(totalChunks):
return create_api_response('Success',data=result, message='Source Node Created Successfully')
else:
@@ -894,7 +895,7 @@ async def retry_processing(uri=Form(None), userName=Form(None), password=Form(No
try:
start = time.time()
graph = create_graph_database_connection(uri, userName, password, database)
chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name})
end = time.time()
elapsed_time = end - start
json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition,
4 changes: 2 additions & 2 deletions backend/src/QA_integration.py
@@ -380,7 +380,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
retriever = neo_db.as_retriever(
search_type="similarity_score_threshold",
search_kwargs={
'k': search_k,
'top_k': search_k,
'effective_search_ratio': ef_ratio,
'score_threshold': score_threshold,
'filter': {'fileName': {'$in': document_names}}
@@ -390,7 +390,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
else:
retriever = neo_db.as_retriever(
search_type="similarity_score_threshold",
search_kwargs={'k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
search_kwargs={'top_k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold}
)
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
return retriever
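Note on this hunk: the retriever keyword changes from 'k' to 'top_k', presumably tracking the langchain-neo4j 0.3.0 to 0.4.0 bump in backend/requirements.txt above. A minimal usage sketch, assuming neo_db is a langchain_neo4j Neo4jVector store and using illustrative values:

    retriever = neo_db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            'top_k': 5,                    # renamed from 'k' in this PR
            'effective_search_ratio': 2,   # illustrative value
            'score_threshold': 0.5,        # illustrative value
            'filter': {'fileName': {'$in': ['report.pdf']}},  # optional per-file filter
        },
    )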
7 changes: 5 additions & 2 deletions backend/src/document_sources/youtube.py
@@ -1,6 +1,7 @@
from langchain.docstore.document import Document
from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import GenericProxyConfig
import logging
from urllib.parse import urlparse,parse_qs
from difflib import SequenceMatcher
@@ -12,8 +13,10 @@
def get_youtube_transcript(youtube_id):
try:
proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
proxies = { 'https': proxy }
transcript_pieces = YouTubeTranscriptApi.get_transcript(youtube_id, proxies = proxies)
proxy_config = GenericProxyConfig(http_url=proxy, https_url=proxy) if proxy else None
youtube_api = YouTubeTranscriptApi(proxy_config=proxy_config)
transcript_pieces = youtube_api.fetch(youtube_id, preserve_formatting=True)
transcript_pieces = transcript_pieces.to_raw_data()
return transcript_pieces
except Exception as e:
message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
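This hunk migrates to the youtube-transcript-api 1.0.0 interface (bumped from 0.6.3 in backend/requirements.txt): the module-level get_transcript(..., proxies=...) call is replaced by an instance configured with an optional GenericProxyConfig. A self-contained sketch of the new call path, with an illustrative video id:

    import os
    from youtube_transcript_api import YouTubeTranscriptApi
    from youtube_transcript_api.proxies import GenericProxyConfig

    proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")  # optional proxy URL, may be unset
    proxy_config = GenericProxyConfig(http_url=proxy, https_url=proxy) if proxy else None
    youtube_api = YouTubeTranscriptApi(proxy_config=proxy_config)
    transcript = youtube_api.fetch("dQw4w9WgXcQ", preserve_formatting=True)  # illustrative id
    transcript_pieces = transcript.to_raw_data()  # plain list of dicts, as the old API returned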
20 changes: 17 additions & 3 deletions backend/src/graphDB_dataAccess.py
@@ -1,5 +1,7 @@
import logging
import os
import time
from neo4j.exceptions import TransientError
from langchain_neo4j import Neo4jGraph
from src.shared.common_fn import create_gcs_bucket_folder_name_hashed, delete_uploaded_local_file, load_embedding_model
from src.document_sources.gcs_bucket import delete_file_from_gcs
@@ -16,7 +18,7 @@ class graphDBdataAccess:
def __init__(self, graph: Neo4jGraph):
self.graph = graph

def update_exception_db(self, file_name, exp_msg, retry_condition):
def update_exception_db(self, file_name, exp_msg, retry_condition=None):
try:
job_status = "Failed"
result = self.get_current_status_document_node(file_name)
@@ -254,8 +256,20 @@ def connection_check_and_get_vector_dimensions(self,database):
else:
return {'message':"Connection Successful","gds_status": gds_status,"write_access":write_access}

def execute_query(self, query, param=None):
return self.graph.query(query, param)
def execute_query(self, query, param=None,max_retries=3, delay=2):
retries = 0
while retries < max_retries:
try:
return self.graph.query(query, param)
except TransientError as e:
if "DeadlockDetected" in str(e):
retries += 1
logging.info(f"Deadlock detected. Retrying {retries}/{max_retries} in {delay} seconds...")
time.sleep(delay) # Wait before retrying
else:
raise
logging.error("Failed to execute query after maximum retries due to persistent deadlocks.")
raise RuntimeError("Query execution failed after multiple retries due to deadlock.")

def get_current_status_document_node(self, file_name):
query = """
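The execute_query change above (from "Handled deadlock errors in executing cypher query (#1187)") retries Neo4j TransientError deadlocks up to three times with a fixed two-second delay before giving up; exponential backoff would be the usual alternative under heavier write contention. Callers need no changes. A usage sketch with an illustrative query:

    dao = graphDBdataAccess(graph)  # graph: an existing langchain_neo4j Neo4jGraph connection
    rows = dao.execute_query(
        "MATCH (d:Document {fileName: $name}) RETURN d.status AS status",  # illustrative query
        param={"name": "report.pdf"},
    )
    # Deadlocked queries are retried transparently; after three failed attempts
    # execute_query raises RuntimeError rather than returning partial results.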
29 changes: 9 additions & 20 deletions backend/src/main.py
@@ -400,7 +400,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
obj_source_node.processing_time = processed_time
obj_source_node.processed_chunk = select_chunks_upto+select_chunks_with_retry
if retry_condition == START_FROM_BEGINNING:
result = graph.query(QUERY_TO_GET_NODES_AND_RELATIONS_OF_A_DOCUMENT, params={"filename":file_name})
result = execute_graph_query(graph,QUERY_TO_GET_NODES_AND_RELATIONS_OF_A_DOCUMENT, params={"filename":file_name})
obj_source_node.node_count = result[0]['nodes']
obj_source_node.relationship_count = result[0]['rels']
else:
@@ -503,21 +503,10 @@ async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password,
logging.info(f'Time taken to create relationship between chunk and entities: {elapsed_relationship:.2f} seconds')
latency_processing_chunk["relationship_between_chunk_entity"] = f'{elapsed_relationship:.2f}'

distinct_nodes = set()
relations = []
for graph_document in graph_documents:
#get distinct nodes
for node in graph_document.nodes:
node_id = node.id
node_type= node.type
if (node_id, node_type) not in distinct_nodes:
distinct_nodes.add((node_id, node_type))
#get all relations
for relation in graph_document.relationships:
relations.append(relation.type)

node_count += len(distinct_nodes)
rel_count += len(relations)
graphDb_data_Access = graphDBdataAccess(graph)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
node_count = count_response[file_name].get('nodeCount',"0")
rel_count = count_response[file_name].get('relationshipCount',"0")
return node_count,rel_count,latency_processing_chunk

def get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_overlap, retry_condition):
@@ -539,7 +528,7 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_o

else:
chunkId_chunkDoc_list=[]
chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS, params={"filename":file_name})

if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks :
raise LLMGraphBuilderException(f"Chunks are not created for {file_name}. Please re-upload file and try again.")
@@ -550,13 +539,13 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_o

if retry_condition == START_FROM_LAST_PROCESSED_POSITION:
logging.info(f"Retry : start_from_last_processed_position")
starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, params={"filename":file_name})
starting_chunk = execute_graph_query(graph,QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, params={"filename":file_name})

if starting_chunk and starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]

elif starting_chunk and starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, params={"filename":file_name})
starting_chunk = execute_graph_query(graph,QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, params={"filename":file_name})
return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]

else:
@@ -741,7 +730,7 @@ def set_status_retry(graph, file_name, retry_condition):
if retry_condition == DELETE_ENTITIES_AND_START_FROM_BEGINNING or retry_condition == START_FROM_BEGINNING:
obj_source_node.processed_chunk=0
if retry_condition == DELETE_ENTITIES_AND_START_FROM_BEGINNING:
graph.query(QUERY_TO_DELETE_EXISTING_ENTITIES, params={"filename":file_name})
execute_graph_query(graph,QUERY_TO_DELETE_EXISTING_ENTITIES, params={"filename":file_name})
obj_source_node.node_count=0
obj_source_node.relationship_count=0
logging.info(obj_source_node)
19 changes: 9 additions & 10 deletions backend/src/make_relationships.py
@@ -1,6 +1,6 @@
from langchain_neo4j import Neo4jGraph
from langchain.docstore.document import Document
from src.shared.common_fn import load_embedding_model
from src.shared.common_fn import load_embedding_model,execute_graph_query
import logging
from typing import List
import os
@@ -33,7 +33,7 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
MERGE (c)-[:HAS_ENTITY]->(n)
"""
graph.query(unwind_query, params={"batch_data": batch_data})
execute_graph_query(graph,unwind_query, params={"batch_data": batch_data})


def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
@@ -59,7 +59,7 @@ def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
SET c.embedding = row.embeddings
MERGE (c)-[:PART_OF]->(d)
"""
graph.query(query_to_create_embedding, params={"fileName":file_name, "data":data_for_query})
execute_graph_query(graph,query_to_create_embedding, params={"fileName":file_name, "data":data_for_query})

def create_relation_between_chunks(graph, file_name, chunks: List[Document])->list:
logging.info("creating FIRST_CHUNK and NEXT_CHUNK relationships between chunks")
@@ -127,7 +127,7 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
MATCH (d:Document {fileName: data.f_name})
MERGE (c)-[:PART_OF]->(d)
"""
graph.query(query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data})
execute_graph_query(graph,query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data})

query_to_create_FIRST_relation = """
UNWIND $relationships AS relationship
@@ -136,7 +136,7 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
FOREACH(r IN CASE WHEN relationship.type = 'FIRST_CHUNK' THEN [1] ELSE [] END |
MERGE (d)-[:FIRST_CHUNK]->(c))
"""
graph.query(query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships})
execute_graph_query(graph,query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships})

query_to_create_NEXT_CHUNK_relation = """
UNWIND $relationships AS relationship
@@ -145,17 +145,16 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
MATCH (pc:Chunk {id: relationship.previous_chunk_id})
FOREACH(r IN CASE WHEN relationship.type = 'NEXT_CHUNK' THEN [1] ELSE [] END |
MERGE (c)<-[:NEXT_CHUNK]-(pc))
"""
graph.query(query_to_create_NEXT_CHUNK_relation, params={"relationships": relationships})

"""
execute_graph_query(graph,query_to_create_NEXT_CHUNK_relation, params={"relationships": relationships})
return lst_chunks_including_hash


def create_chunk_vector_index(graph):
start_time = time.time()
try:
vector_index = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and type = 'VECTOR' AND name = 'vector' return options")

vector_index_query = "SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and type = 'VECTOR' AND name = 'vector' return options"
vector_index = execute_graph_query(graph,vector_index_query)
if not vector_index:
vector_store = Neo4jVector(embedding=EMBEDDING_FUNCTION,
graph=graph,
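Throughout this file, direct graph.query(...) calls are routed through execute_graph_query, newly imported from src.shared.common_fn. Its body is not visible in this diff; a hypothetical sketch, assuming it applies the same deadlock-retry pattern that this PR adds to graphDBdataAccess.execute_query:

    import logging
    import time
    from neo4j.exceptions import TransientError

    def execute_graph_query(graph, query, params=None, max_retries=3, delay=2):
        # Hypothetical reconstruction -- the real helper lives in src/shared/common_fn.py
        # and is not shown in this PR's visible hunks.
        for attempt in range(1, max_retries + 1):
            try:
                return graph.query(query, params)
            except TransientError as e:
                if "DeadlockDetected" not in str(e):
                    raise
                logging.info(f"Deadlock detected. Retrying {attempt}/{max_retries} in {delay} seconds...")
                time.sleep(delay)  # fixed delay between attempts
        raise RuntimeError("Query execution failed after multiple retries due to deadlock.")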
14 changes: 7 additions & 7 deletions backend/src/post_processing.py
@@ -4,7 +4,7 @@
from langchain_neo4j import Neo4jGraph
import os
from src.graph_query import get_graphDB_driver
from src.shared.common_fn import load_embedding_model
from src.shared.common_fn import load_embedding_model,execute_graph_query
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from src.shared.constants import GRAPH_CLEANUP_PROMPT
@@ -179,8 +179,8 @@ def fetch_entities_for_embedding(graph):
MATCH (e)
WHERE NOT (e:Chunk OR e:Document OR e:`__Community__`) AND e.embedding IS NULL AND e.id IS NOT NULL
RETURN elementId(e) AS elementId, e.id + " " + coalesce(e.description, "") AS text
"""
result = graph.query(query)
"""
result = execute_graph_query(graph,query)
return [{"elementId": record["elementId"], "text": record["text"]} for record in result]

def update_embeddings(rows, graph):
Expand All @@ -194,7 +194,7 @@ def update_embeddings(rows, graph):
MATCH (e) WHERE elementId(e) = row.elementId
CALL db.create.setNodeVectorProperty(e, "embedding", row.embedding)
"""
return graph.query(query,params={'rows':rows})
return execute_graph_query(graph,query,params={'rows':rows})

def graph_schema_consolidation(graph):
graphDb_data_Access = graphDBdataAccess(graph)
@@ -223,14 +223,14 @@ def graph_schema_consolidation(graph):
SET n:`{new_label}`
REMOVE n:`{old_label}`
"""
graph.query(query)
execute_graph_query(graph,query)

for old_label, new_label in relation_mapping.items():
query = f"""
MATCH (n)-[r:`{old_label}`]->(m)
CREATE (n)-[r2:`{new_label}`]->(m)
DELETE r
"""
graph.query(query)
execute_graph_query(graph,query)

return None