Improve Document/Chunk ID management (#222)
* Make sure the created chunk IDs are unique

* Remove unused id_prefix

* Rm unused imports

* Changelog + deprecate field

* Fix mypy and UR

* Do not change chunk UID after embeddings

* Address comments

* Update lock file

* Regenerate lock file after merge

* Changelog + deprecate field

* Recreate lock file

* WIP: e2e tests

* Fix CI

* Ruff (why on so many files?)

* Fix doc

* Undo change to conftest.py

* E2E tests
stellasia authored Jan 2, 2025
1 parent 39fd4f7 commit 39a4b73
Showing 35 changed files with 1,127 additions and 1,034 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,11 @@

### Changed
- Updated LLM implementations to handle message history consistently across providers.
- The `id_prefix` parameter in the `LexicalGraphConfig` is deprecated.

### Fixed
- IDs for the Document and Chunk nodes in the lexical graph are now randomly generated and unique across multiple runs, fixing an issue where relationships were created between chunks originating from different pipeline runs.


## 1.3.0

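The changelog entry above notes that Document and Chunk IDs are now randomly generated and unique across runs. As a minimal sketch of that idea only — the `uid` field name, the pydantic base, and the uuid4 default are assumptions, not taken from this diff:

.. code:: python

    from uuid import uuid4
    from pydantic import BaseModel, Field

    class TextChunk(BaseModel):
        """Hypothetical chunk model: every instance gets a fresh, globally unique uid."""
        text: str
        index: int
        # Assumption: a random UUID per chunk, so IDs never collide across pipeline runs.
        uid: str = Field(default_factory=lambda: str(uuid4()))

    chunk_a = TextChunk(text="some text", index=0)
    chunk_b = TextChunk(text="some text", index=0)
    assert chunk_a.uid != chunk_b.uid  # unique even for identical content and index

Because each instance draws a fresh UUID, chunks produced by separate pipeline runs can no longer share an ID, which is what made the cross-run relationship bug possible.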
6 changes: 6 additions & 0 deletions docs/source/types.rst
@@ -39,6 +39,12 @@ RagResultModel

.. autoclass:: neo4j_graphrag.generation.types.RagResultModel

DocumentInfo
============

.. autoclass:: neo4j_graphrag.experimental.components.types.DocumentInfo


TextChunk
=========

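With `DocumentInfo` now documented as a public type, a hedged construction example; the `path` and `metadata` field names are assumptions inferred from how the class is used elsewhere in this diff:

.. code:: python

    from neo4j_graphrag.experimental.components.types import DocumentInfo

    # Assumed fields: a source path plus optional metadata about the document.
    doc_info = DocumentInfo(
        path="data/Harry Potter and the Chamber of Secrets Summary.pdf",
        metadata={"author": "unknown"},
    )
    print(doc_info.path)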
40 changes: 27 additions & 13 deletions docs/source/user_guide_kg_builder.rst
@@ -672,7 +672,7 @@ Example usage:
from neo4j_graphrag.experimental.components.lexical_graph_builder import LexicalGraphBuilder
from neo4j_graphrag.experimental.components.types import LexicalGraphConfig
- lexical_graph_builder = LexicalGraphBuilder(config=LexicalGraphConfig(id_prefix="example"))
+ lexical_graph_builder = LexicalGraphBuilder(config=LexicalGraphConfig())
graph = await lexical_graph_builder.run(
text_chunks=TextChunks(chunks=[
TextChunk(text="some text", index=0),
@@ -713,7 +713,6 @@ Optionally, the document and chunk node labels can be configured using a `LexicalGraphConfig`
# optionally, define a LexicalGraphConfig object
# shown below with the default values
config = LexicalGraphConfig(
- id_prefix="", # used to prefix the chunk and document IDs
chunk_node_label="Chunk",
document_node_label="Document",
chunk_to_document_relationship_type="PART_OF_DOCUMENT",
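The configuration block above is cut off in this view. A hedged sketch of feeding such a config to the `LexicalGraphBuilder` shown earlier; fields not listed are assumed to keep their defaults:

.. code:: python

    import asyncio

    from neo4j_graphrag.experimental.components.lexical_graph_builder import LexicalGraphBuilder
    from neo4j_graphrag.experimental.components.types import LexicalGraphConfig, TextChunk, TextChunks

    async def main():
        config = LexicalGraphConfig(
            chunk_node_label="Chunk",
            document_node_label="Document",
            chunk_to_document_relationship_type="PART_OF_DOCUMENT",
        )
        builder = LexicalGraphBuilder(config=config)
        # Build the lexical graph for a single illustrative chunk.
        return await builder.run(
            text_chunks=TextChunks(chunks=[TextChunk(text="some text", index=0)])
        )

    graph = asyncio.run(main())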
@@ -998,7 +997,7 @@ without making assumptions about entity similarity. The Entity Resolver
is responsible for refining the created knowledge graph by merging entity
nodes that represent the same real-world object.

- In practice, this package implements a single resolver that merges nodes
+ In practice, this package implements a simple resolver that merges nodes
with the same label and identical "name" property.

.. warning::
@@ -1018,15 +1017,30 @@ It can be used like this:
.. warning::

- By default, all nodes with the __Entity__ label will be resolved.
- To exclude specific nodes, a filter_query can be added to the query.
- For example, if a `:Resolved` label has been applied to already resolved entities
- in the graph, these entities can be excluded with the following approach:
+ By default, all nodes with the `__Entity__` label will be resolved.
+ This behavior can be controlled using the `filter_query` parameter described below.

- .. code:: python
+ Filter Query Parameter
+ ----------------------

- from neo4j_graphrag.experimental.components.resolver import (
-     SinglePropertyExactMatchResolver,
- )
- resolver = SinglePropertyExactMatchResolver(driver, filter_query="WHERE not entity:Resolved")
- res = await resolver.run()
+ To exclude specific nodes from the resolution, a `filter_query` can be added to the query.
+ For example, if a `:Resolved` label has been applied to already resolved entities
+ in the graph, these entities can be excluded with the following approach:

+ .. code:: python
+ from neo4j_graphrag.experimental.components.resolver import (
+     SinglePropertyExactMatchResolver,
+ )
+ filter_query = "WHERE NOT entity:Resolved"
+ resolver = SinglePropertyExactMatchResolver(driver, filter_query=filter_query)
+ res = await resolver.run()
+ A similar approach can be used to exclude entities created from a previous pipeline
+ run on the same document, assuming a label `OldDocument` has been assigned to the
+ previously created document node:

+ .. code:: python
+ filter_query = "WHERE NOT EXISTS((entity)-[:FROM_DOCUMENT]->(:OldDocument))"
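To make the last snippet self-contained, a hedged end-to-end sketch that plugs the `OldDocument` filter into the resolver; the connection details are placeholders and the `OldDocument` label is the assumption stated above:

.. code:: python

    import asyncio

    import neo4j
    from neo4j_graphrag.experimental.components.resolver import (
        SinglePropertyExactMatchResolver,
    )

    async def main() -> None:
        # Illustrative connection details; adjust to your deployment.
        with neo4j.GraphDatabase.driver(
            "neo4j://localhost:7687", auth=("neo4j", "password")
        ) as driver:
            # Skip entities attached to a document node labelled :OldDocument (a previous run).
            filter_query = "WHERE NOT EXISTS((entity)-[:FROM_DOCUMENT]->(:OldDocument))"
            resolver = SinglePropertyExactMatchResolver(driver, filter_query=filter_query)
            await resolver.run()

    asyncio.run(main())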
2 changes: 1 addition & 1 deletion examples/build_graph/simple_kg_builder_from_pdf.py
@@ -23,7 +23,7 @@
DATABASE = "neo4j"


- root_dir = Path(__file__).parents[4]
+ root_dir = Path(__file__).parents[1]
file_path = root_dir / "data" / "Harry Potter and the Chamber of Secrets Summary.pdf"


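The one-line change above only affects how the repository layout is resolved. A quick illustration of `Path.parents` indexing (the `examples/data` location is an assumption implied by the new value):

.. code:: python

    from pathlib import Path

    script = Path("examples/build_graph/simple_kg_builder_from_pdf.py")
    # parents[0] -> examples/build_graph
    # parents[1] -> examples   (assumed to contain the "data" folder)
    # parents[4] on the real absolute __file__ would climb past the repository root.
    root_dir = script.parents[1]
    print(root_dir / "data")  # examples/data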
@@ -4,8 +4,8 @@
EntityRelationExtractor,
OnError,
)
- from neo4j_graphrag.experimental.components.pdf_loader import DocumentInfo
from neo4j_graphrag.experimental.components.types import (
+ DocumentInfo,
LexicalGraphConfig,
Neo4jGraph,
TextChunks,
@@ -13,7 +13,6 @@ async def main() -> GraphResult:
# optionally, define a LexicalGraphConfig object
# shown below with default values
config = LexicalGraphConfig(
- id_prefix="", # used to prefix the chunk and document IDs
chunk_node_label="Chunk",
document_node_label="Document",
chunk_to_document_relationship_type="PART_OF_DOCUMENT",
@@ -3,11 +3,8 @@
from pathlib import Path
from typing import Dict, Optional

- from neo4j_graphrag.experimental.components.pdf_loader import (
-     DataLoader,
-     DocumentInfo,
-     PdfDocument,
- )
+ from neo4j_graphrag.experimental.components.pdf_loader import DataLoader
+ from neo4j_graphrag.experimental.components.types import DocumentInfo, PdfDocument


class MyLoader(DataLoader):
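The hunk above shows only MyLoader's updated imports. A hedged sketch of what a minimal custom loader might look like with the relocated types; the `run` signature and the `PdfDocument(text=..., document_info=...)` fields are assumptions inferred from the imports, not from this diff:

.. code:: python

    from pathlib import Path
    from typing import Dict, Optional

    from neo4j_graphrag.experimental.components.pdf_loader import DataLoader
    from neo4j_graphrag.experimental.components.types import DocumentInfo, PdfDocument


    class MyLoader(DataLoader):
        async def run(
            self, filepath: Path, metadata: Optional[Dict[str, str]] = None
        ) -> PdfDocument:
            # Read the raw text however the application needs to (plain read shown here).
            text = filepath.read_text()
            return PdfDocument(
                text=text,
                document_info=DocumentInfo(path=str(filepath), metadata=metadata),
            )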
@@ -33,7 +33,6 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")
# optional: define some custom node labels for the lexical graph:
lexical_graph_config = LexicalGraphConfig(
- id_prefix="example",
chunk_node_label="TextPart",
)
pipe.add_component(
@@ -164,7 +164,6 @@ async def define_and_run_pipeline(
async def main(driver: neo4j.Driver) -> PipelineResult:
# optional: define some custom node labels for the lexical graph:
lexical_graph_config = LexicalGraphConfig(
- id_prefix="example",
chunk_node_label="TextPart",
document_node_label="Text",
)
@@ -184,7 +184,6 @@ async def read_chunk_and_perform_entity_extraction(
async def main(driver: neo4j.Driver) -> PipelineResult:
# optional: define some custom node labels for the lexical graph:
lexical_graph_config = LexicalGraphConfig(
- id_prefix="example",
document_node_label="Book", # default: "Document"
chunk_node_label="Chapter", # default "Chunk"
chunk_text_property="content", # default: "text"
