From e4342e8bea98d8717919536bf8e90db145b6a2f9 Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 13 Dec 2024 15:06:47 +0100 Subject: [PATCH 01/13] WIP: improve logging --- .../simple_kg_builder_from_text.py | 5 +++ .../components/entity_relation_extractor.py | 9 ++-- .../experimental/components/types.py | 25 ++++++++++- .../experimental/pipeline/config/runner.py | 3 ++ .../experimental/pipeline/pipeline.py | 24 ++++++----- src/neo4j_graphrag/utils.py | 42 ++++++++++++++++++- tests/e2e/test_kg_writer_component_e2e.py | 4 +- .../experimental/components/test_embedder.py | 10 +++-- .../components/test_lexical_graph_builder.py | 4 +- 9 files changed, 101 insertions(+), 25 deletions(-) diff --git a/examples/build_graph/simple_kg_builder_from_text.py b/examples/build_graph/simple_kg_builder_from_text.py index 67ed6776d..0295331fd 100644 --- a/examples/build_graph/simple_kg_builder_from_text.py +++ b/examples/build_graph/simple_kg_builder_from_text.py @@ -8,6 +8,7 @@ """ import asyncio +import logging import neo4j from neo4j_graphrag.embeddings import OpenAIEmbeddings @@ -20,6 +21,10 @@ from neo4j_graphrag.llm import LLMInterface from neo4j_graphrag.llm.openai_llm import OpenAILLM +logging.basicConfig() +logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) + + # Neo4j db infos URI = "neo4j://localhost:7687" AUTH = ("neo4j", "password") diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index c1f7a6108..5432fa70c 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -38,6 +38,7 @@ from neo4j_graphrag.experimental.pipeline.exceptions import InvalidJSONError from neo4j_graphrag.generation.prompts import ERExtractionTemplate, PromptTemplate from neo4j_graphrag.llm import LLMInterface +from neo4j_graphrag.utils import prettyfier logger = logging.getLogger(__name__) @@ -221,8 +222,9 @@ async def extract_for_chunk( ) else: logger.error( - f"LLM response is not valid JSON {llm_result.content} for chunk_index={chunk.index}" + f"LLM response is not valid JSON for chunk_index={chunk.index}" ) + logger.debug(f"Invalid JSON: {llm_result.content}") result = {"nodes": [], "relationships": []} try: chunk_graph = Neo4jGraph(**result) @@ -233,8 +235,9 @@ async def extract_for_chunk( ) else: logger.error( - f"LLM response has improper format {result} for chunk_index={chunk.index}" + f"LLM response has improper format for chunk_index={chunk.index}" ) + logger.debug(f"Invalid JSON format: {result}") chunk_graph = Neo4jGraph() return chunk_graph @@ -336,5 +339,5 @@ async def run( ] chunk_graphs: list[Neo4jGraph] = list(await asyncio.gather(*tasks)) graph = self.combine_chunk_graphs(lexical_graph, chunk_graphs) - logger.debug(f"{self.__class__.__name__}: {graph}") + logger.debug(f"Extracted graph: {prettyfier(graph)}") return graph diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py index 689e6b6ce..7be8cd0fd 100644 --- a/src/neo4j_graphrag/experimental/components/types.py +++ b/src/neo4j_graphrag/experimental/components/types.py @@ -15,13 +15,17 @@ from __future__ import annotations import uuid -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, TYPE_CHECKING -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, RootModel from neo4j_graphrag.experimental.pipeline.component import DataModel +if TYPE_CHECKING: + from pydantic._internal import _repr + + class DocumentInfo(DataModel): """A document loaded by a DataLoader. @@ -75,6 +79,20 @@ class TextChunks(DataModel): chunks: list[TextChunk] +# class Embeddings(RootModel): +# """A wrapper around list[float] to represent embeddings. +# Used to improve logging of vectors by not showing the full vector. +# """ +# root: list[float] +# +# # def __rep_str__(self, sep: str = ", ") -> str: +# # return f"" +# +# def __repr_args__(self) -> _repr.ReprArgs: +# yield 'dimension', len(self.root) +# yield 'vector', self.root[:3] +# + class Neo4jNode(BaseModel): """Represents a Neo4j node. @@ -129,6 +147,9 @@ class Neo4jGraph(DataModel): nodes: list[Neo4jNode] = [] relationships: list[Neo4jRelationship] = [] + # def __str__(self) -> str: + # return f"" + class ResolutionStats(DataModel): number_of_nodes_to_resolve: int diff --git a/src/neo4j_graphrag/experimental/pipeline/config/runner.py b/src/neo4j_graphrag/experimental/pipeline/config/runner.py index a1a225858..c31f54a9f 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/runner.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/runner.py @@ -70,6 +70,7 @@ class PipelineConfigWrapper(BaseModel): ] = Field(discriminator=Discriminator(_get_discriminator_value)) def parse(self, resolved_data: dict[str, Any] | None = None) -> PipelineDefinition: + logger.debug("PIPELINE_CONFIG: start parsing config...") return self.config.parse(resolved_data) def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: @@ -101,10 +102,12 @@ def from_config( cls, config: AbstractPipelineConfig | dict[str, Any], do_cleaning: bool = False ) -> Self: wrapper = PipelineConfigWrapper.model_validate({"config": config}) + logger.debug(f"PIPELINE_RUNNER: instantiate Pipeline from config type: {wrapper.config.template_}") return cls(wrapper.parse(), config=wrapper.config, do_cleaning=do_cleaning) @classmethod def from_config_file(cls, file_path: Union[str, Path]) -> Self: + logger.info(f"PIPELINE_RUNNER: reading config file from {file_path}") if not isinstance(file_path, str): file_path = str(file_path) data = ConfigReader().read(file_path) diff --git a/src/neo4j_graphrag/experimental/pipeline/pipeline.py b/src/neo4j_graphrag/experimental/pipeline/pipeline.py index e3ded494d..a58976676 100644 --- a/src/neo4j_graphrag/experimental/pipeline/pipeline.py +++ b/src/neo4j_graphrag/experimental/pipeline/pipeline.py @@ -24,6 +24,8 @@ from timeit import default_timer from typing import Any, AsyncGenerator, Optional +from neo4j_graphrag.utils import prettyfier + try: import pygraphviz as pgv except ImportError: @@ -90,21 +92,19 @@ async def execute(self, **kwargs: Any) -> RunResult | None: if the task run successfully, None if the status update was unsuccessful. """ - logger.debug(f"Running component {self.name} with {kwargs}") - start_time = default_timer() component_result = await self.component.run(**kwargs) run_result = RunResult( result=component_result, ) - end_time = default_timer() - logger.debug(f"Component {self.name} finished in {end_time - start_time}s") return run_result async def run(self, inputs: dict[str, Any]) -> RunResult | None: """Main method to execute the task.""" - logger.debug(f"TASK START {self.name=} {inputs=}") + logger.debug(f"TASK START {self.name=} input={prettyfier(inputs)}") + start_time = default_timer() res = await self.execute(**inputs) - logger.debug(f"TASK RESULT {self.name=} {res=}") + end_time = default_timer() + logger.debug(f"TASK FINISHED {self.name} in {end_time - start_time} res={prettyfier(res)}") return res @@ -141,7 +141,7 @@ async def run_task(self, task: TaskPipelineNode, data: dict[str, Any]) -> None: try: await self.set_task_status(task.name, RunStatus.RUNNING) except PipelineStatusUpdateError: - logger.info(f"Component {task.name} already running or done") + logger.debug(f"ORCHESTRATOR: TASK ABORTED: {task.name} is already running or done, aborting") return None res = await task.run(inputs) await self.set_task_status(task.name, RunStatus.DONE) @@ -198,7 +198,8 @@ async def check_dependencies_complete(self, task: TaskPipelineNode) -> None: d_status = await self.get_status_for_component(d.start) if d_status != RunStatus.DONE: logger.debug( - f"Missing dependency {d.start} for {task.name} (status: {d_status}). " + f"ORCHESTRATOR {self.run_id}: TASK DELAYED: Missing dependency {d.start} for {task.name} " + f"(status: {d_status}). " "Will try again when dependency is complete." ) raise PipelineMissingDependencyError() @@ -227,6 +228,7 @@ async def next( await self.check_dependencies_complete(next_node) except PipelineMissingDependencyError: continue + logger.debug(f"ORCHESTRATOR {self.run_id}: enqueuing next task: {next_node.name}") yield next_node return @@ -315,7 +317,6 @@ async def run(self, data: dict[str, Any]) -> None: (node without any parent). Then the callback on_task_complete will handle the task dependencies. """ - logger.debug(f"PIPELINE START {data=}") tasks = [self.run_task(root, data) for root in self.pipeline.roots()] await asyncio.gather(*tasks) @@ -624,15 +625,16 @@ def validate_parameter_mapping_for_task(self, task: TaskPipelineNode) -> bool: return True async def run(self, data: dict[str, Any]) -> PipelineResult: - logger.debug("Starting pipeline") + logger.debug("PIPELINE START") start_time = default_timer() self.invalidate() self.validate_input_data(data) orchestrator = Orchestrator(self) + logger.debug(f"PIPELINE ORCHESTRATOR: {orchestrator.run_id}") await orchestrator.run(data) end_time = default_timer() logger.debug( - f"Pipeline {orchestrator.run_id} finished in {end_time - start_time}s" + f"PIPELINE FINISHED {orchestrator.run_id} in {end_time - start_time}s" ) return PipelineResult( run_id=orchestrator.run_id, diff --git a/src/neo4j_graphrag/utils.py b/src/neo4j_graphrag/utils.py index e86f7588a..b4331c883 100644 --- a/src/neo4j_graphrag/utils.py +++ b/src/neo4j_graphrag/utils.py @@ -14,7 +14,9 @@ # limitations under the License. from __future__ import annotations -from typing import Optional +from typing import Optional, Any + +from pydantic import BaseModel def validate_search_query_input( @@ -22,3 +24,41 @@ def validate_search_query_input( ) -> None: if not (bool(query_vector) ^ bool(query_text)): raise ValueError("You must provide exactly one of query_vector or query_text.") + + + +class Prettyfier: + """Prettyfy object for logging. + + I.e.: truncate long lists. + """ + def __init__(self, max_items_in_list: int = 5): + self.max_items_in_list = max_items_in_list + + def _prettyfy_dict(self, value: dict[Any, Any]) -> dict[Any, Any]: + return { + k: self(v) # prettyfy each value + for k, v in value.items() + } + + def _prettyfy_list(self, value: list[Any]) -> list[Any]: + items = [ + self(v) # prettify each item + for v in value[:self.max_items_in_list] + ] + remaining_items = len(value) - len(items) + if remaining_items > 0: + items.append(f"...truncated {remaining_items} items...") + return items + + def __call__(self, value: Any) -> Any: + if isinstance(value, dict): + return self._prettyfy_dict(value) + if isinstance(value, BaseModel): + return self(value.model_dump()) + if isinstance(value, list): + return self._prettyfy_list(value) + return value + + +prettyfier = Prettyfier() diff --git a/tests/e2e/test_kg_writer_component_e2e.py b/tests/e2e/test_kg_writer_component_e2e.py index 2fc0ab90a..8774e30cf 100644 --- a/tests/e2e/test_kg_writer_component_e2e.py +++ b/tests/e2e/test_kg_writer_component_e2e.py @@ -76,7 +76,7 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: if start_node.embedding_properties: # for mypy for key, val in start_node.embedding_properties.items(): assert key in node_a.keys() - assert node_a.get(key) == [1.0, 2.0, 3.0] + assert val.root == node_a.get(key) node_b = record["b"] assert end_node.label in list(node_b.labels) @@ -100,7 +100,7 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: if node_with_two_embeddings.embedding_properties: # for mypy for key, val in node_with_two_embeddings.embedding_properties.items(): assert key in node_c.keys() - assert val == node_c.get(key) + assert val.root == node_c.get(key) @pytest.mark.asyncio diff --git a/tests/unit/experimental/components/test_embedder.py b/tests/unit/experimental/components/test_embedder.py index 5c72e0d21..f7da60c8d 100644 --- a/tests/unit/experimental/components/test_embedder.py +++ b/tests/unit/experimental/components/test_embedder.py @@ -16,7 +16,11 @@ import pytest from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder -from neo4j_graphrag.experimental.components.types import TextChunk, TextChunks +from neo4j_graphrag.experimental.components.types import ( + Embeddings, + TextChunk, + TextChunks, +) @pytest.mark.asyncio @@ -33,6 +37,4 @@ async def test_text_chunk_embedder_run(embedder: MagicMock) -> None: assert isinstance(chunk, TextChunk) assert chunk.metadata is not None assert "embedding" in chunk.metadata.keys() - assert isinstance(chunk.metadata["embedding"], list) - for i in chunk.metadata["embedding"]: - assert isinstance(i, float) + assert isinstance(chunk.metadata["embedding"], Embeddings) diff --git a/tests/unit/experimental/components/test_lexical_graph_builder.py b/tests/unit/experimental/components/test_lexical_graph_builder.py index 4621c77e9..9a9887b40 100644 --- a/tests/unit/experimental/components/test_lexical_graph_builder.py +++ b/tests/unit/experimental/components/test_lexical_graph_builder.py @@ -28,7 +28,7 @@ LexicalGraphConfig, Neo4jNode, TextChunk, - TextChunks, + TextChunks, Embeddings, ) @@ -64,7 +64,7 @@ def test_lexical_graph_builder_create_chunk_node_metadata_embedding() -> None: assert isinstance(node, Neo4jNode) assert node.id is not None assert node.properties == {"index": 0, "text": "text chunk", "status": "ok"} - assert node.embedding_properties == {"embedding": [1, 2, 3]} + assert node.embedding_properties == {"embedding": Embeddings([1, 2, 3])} @pytest.mark.asyncio From 3033ef8a5f16b4c5bc47ac8f57e747472498c115 Mon Sep 17 00:00:00 2001 From: estelle Date: Thu, 2 Jan 2025 18:56:41 +0100 Subject: [PATCH 02/13] Remove unused code --- .../simple_kg_builder_from_text.py | 3 ++- .../components/entity_relation_extractor.py | 8 ++---- .../experimental/components/types.py | 25 ++----------------- .../experimental/pipeline/config/runner.py | 7 ++++-- .../experimental/pipeline/pipeline.py | 12 ++++++--- tests/e2e/test_simplekgpipeline_e2e.py | 1 - .../experimental/components/test_embedder.py | 3 +-- .../components/test_lexical_graph_builder.py | 4 +-- 8 files changed, 23 insertions(+), 40 deletions(-) diff --git a/examples/build_graph/simple_kg_builder_from_text.py b/examples/build_graph/simple_kg_builder_from_text.py index 0295331fd..901744a75 100644 --- a/examples/build_graph/simple_kg_builder_from_text.py +++ b/examples/build_graph/simple_kg_builder_from_text.py @@ -22,7 +22,8 @@ from neo4j_graphrag.llm.openai_llm import OpenAILLM logging.basicConfig() -logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) +# logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) +logging.getLogger("neo4j_graphrag").setLevel(logging.INFO) # Neo4j db infos diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index 5432fa70c..6d53e4690 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -217,9 +217,7 @@ async def extract_for_chunk( result = json.loads(llm_generated_json) except (json.JSONDecodeError, InvalidJSONError) as e: if self.on_error == OnError.RAISE: - raise LLMGenerationError( - f"LLM response is not valid JSON {llm_result.content}: {e}" - ) + raise LLMGenerationError("LLM response is not valid JSON") from e else: logger.error( f"LLM response is not valid JSON for chunk_index={chunk.index}" @@ -230,9 +228,7 @@ async def extract_for_chunk( chunk_graph = Neo4jGraph(**result) except ValidationError as e: if self.on_error == OnError.RAISE: - raise LLMGenerationError( - f"LLM response has improper format {result}: {e}" - ) + raise LLMGenerationError("LLM response has improper format") from e else: logger.error( f"LLM response has improper format for chunk_index={chunk.index}" diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py index 7be8cd0fd..689e6b6ce 100644 --- a/src/neo4j_graphrag/experimental/components/types.py +++ b/src/neo4j_graphrag/experimental/components/types.py @@ -15,17 +15,13 @@ from __future__ import annotations import uuid -from typing import Any, Dict, Optional, TYPE_CHECKING +from typing import Any, Dict, Optional -from pydantic import BaseModel, Field, field_validator, RootModel +from pydantic import BaseModel, Field, field_validator from neo4j_graphrag.experimental.pipeline.component import DataModel -if TYPE_CHECKING: - from pydantic._internal import _repr - - class DocumentInfo(DataModel): """A document loaded by a DataLoader. @@ -79,20 +75,6 @@ class TextChunks(DataModel): chunks: list[TextChunk] -# class Embeddings(RootModel): -# """A wrapper around list[float] to represent embeddings. -# Used to improve logging of vectors by not showing the full vector. -# """ -# root: list[float] -# -# # def __rep_str__(self, sep: str = ", ") -> str: -# # return f"" -# -# def __repr_args__(self) -> _repr.ReprArgs: -# yield 'dimension', len(self.root) -# yield 'vector', self.root[:3] -# - class Neo4jNode(BaseModel): """Represents a Neo4j node. @@ -147,9 +129,6 @@ class Neo4jGraph(DataModel): nodes: list[Neo4jNode] = [] relationships: list[Neo4jRelationship] = [] - # def __str__(self) -> str: - # return f"" - class ResolutionStats(DataModel): number_of_nodes_to_resolve: int diff --git a/src/neo4j_graphrag/experimental/pipeline/config/runner.py b/src/neo4j_graphrag/experimental/pipeline/config/runner.py index c31f54a9f..c13217540 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/runner.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/runner.py @@ -48,6 +48,7 @@ from neo4j_graphrag.experimental.pipeline.config.types import PipelineType from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult from neo4j_graphrag.experimental.pipeline.types import PipelineDefinition +from neo4j_graphrag.utils import prettyfier logger = logging.getLogger(__name__) @@ -102,7 +103,9 @@ def from_config( cls, config: AbstractPipelineConfig | dict[str, Any], do_cleaning: bool = False ) -> Self: wrapper = PipelineConfigWrapper.model_validate({"config": config}) - logger.debug(f"PIPELINE_RUNNER: instantiate Pipeline from config type: {wrapper.config.template_}") + logger.debug( + f"PIPELINE_RUNNER: instantiate Pipeline from config type: {wrapper.config.template_}" + ) return cls(wrapper.parse(), config=wrapper.config, do_cleaning=do_cleaning) @classmethod @@ -122,7 +125,7 @@ async def run(self, user_input: dict[str, Any]) -> PipelineResult: else: run_param = deep_update(self.run_params, user_input) logger.info( - f"PIPELINE_RUNNER: starting pipeline {self.pipeline} with run_params={run_param}" + f"PIPELINE_RUNNER: starting pipeline {self.pipeline} with run_params={prettyfier(run_param)}" ) result = await self.pipeline.run(data=run_param) if self.do_cleaning: diff --git a/src/neo4j_graphrag/experimental/pipeline/pipeline.py b/src/neo4j_graphrag/experimental/pipeline/pipeline.py index a58976676..a3f9b3c16 100644 --- a/src/neo4j_graphrag/experimental/pipeline/pipeline.py +++ b/src/neo4j_graphrag/experimental/pipeline/pipeline.py @@ -104,7 +104,9 @@ async def run(self, inputs: dict[str, Any]) -> RunResult | None: start_time = default_timer() res = await self.execute(**inputs) end_time = default_timer() - logger.debug(f"TASK FINISHED {self.name} in {end_time - start_time} res={prettyfier(res)}") + logger.debug( + f"TASK FINISHED {self.name} in {end_time - start_time} res={prettyfier(res)}" + ) return res @@ -141,7 +143,9 @@ async def run_task(self, task: TaskPipelineNode, data: dict[str, Any]) -> None: try: await self.set_task_status(task.name, RunStatus.RUNNING) except PipelineStatusUpdateError: - logger.debug(f"ORCHESTRATOR: TASK ABORTED: {task.name} is already running or done, aborting") + logger.debug( + f"ORCHESTRATOR: TASK ABORTED: {task.name} is already running or done, aborting" + ) return None res = await task.run(inputs) await self.set_task_status(task.name, RunStatus.DONE) @@ -228,7 +232,9 @@ async def next( await self.check_dependencies_complete(next_node) except PipelineMissingDependencyError: continue - logger.debug(f"ORCHESTRATOR {self.run_id}: enqueuing next task: {next_node.name}") + logger.debug( + f"ORCHESTRATOR {self.run_id}: enqueuing next task: {next_node.name}" + ) yield next_node return diff --git a/tests/e2e/test_simplekgpipeline_e2e.py b/tests/e2e/test_simplekgpipeline_e2e.py index 65eb9e59c..d30ec3a66 100644 --- a/tests/e2e/test_simplekgpipeline_e2e.py +++ b/tests/e2e/test_simplekgpipeline_e2e.py @@ -20,7 +20,6 @@ import neo4j import pytest from neo4j import Driver - from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import ( FixedSizeSplitter, ) diff --git a/tests/unit/experimental/components/test_embedder.py b/tests/unit/experimental/components/test_embedder.py index f7da60c8d..581ec532e 100644 --- a/tests/unit/experimental/components/test_embedder.py +++ b/tests/unit/experimental/components/test_embedder.py @@ -17,7 +17,6 @@ import pytest from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder from neo4j_graphrag.experimental.components.types import ( - Embeddings, TextChunk, TextChunks, ) @@ -37,4 +36,4 @@ async def test_text_chunk_embedder_run(embedder: MagicMock) -> None: assert isinstance(chunk, TextChunk) assert chunk.metadata is not None assert "embedding" in chunk.metadata.keys() - assert isinstance(chunk.metadata["embedding"], Embeddings) + assert isinstance(chunk.metadata["embedding"], list) diff --git a/tests/unit/experimental/components/test_lexical_graph_builder.py b/tests/unit/experimental/components/test_lexical_graph_builder.py index 9a9887b40..4621c77e9 100644 --- a/tests/unit/experimental/components/test_lexical_graph_builder.py +++ b/tests/unit/experimental/components/test_lexical_graph_builder.py @@ -28,7 +28,7 @@ LexicalGraphConfig, Neo4jNode, TextChunk, - TextChunks, Embeddings, + TextChunks, ) @@ -64,7 +64,7 @@ def test_lexical_graph_builder_create_chunk_node_metadata_embedding() -> None: assert isinstance(node, Neo4jNode) assert node.id is not None assert node.properties == {"index": 0, "text": "text chunk", "status": "ok"} - assert node.embedding_properties == {"embedding": Embeddings([1, 2, 3])} + assert node.embedding_properties == {"embedding": [1, 2, 3]} @pytest.mark.asyncio From 80708e480fdfcc52b91f3478055ad35cdb5e6046 Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 3 Jan 2025 10:22:56 +0100 Subject: [PATCH 03/13] Cut long string, configure via env vars, restructure utils folder --- .../simple_kg_builder_from_text.py | 4 +- .../components/entity_relation_extractor.py | 2 +- .../experimental/pipeline/config/runner.py | 2 +- .../experimental/pipeline/pipeline.py | 2 +- src/neo4j_graphrag/types.py | 2 +- src/neo4j_graphrag/utils/__init__.py | 0 .../{utils.py => utils/logging.py} | 44 +++++++++++++------ src/neo4j_graphrag/utils/validation.py | 24 ++++++++++ 8 files changed, 60 insertions(+), 20 deletions(-) create mode 100644 src/neo4j_graphrag/utils/__init__.py rename src/neo4j_graphrag/{utils.py => utils/logging.py} (55%) create mode 100644 src/neo4j_graphrag/utils/validation.py diff --git a/examples/build_graph/simple_kg_builder_from_text.py b/examples/build_graph/simple_kg_builder_from_text.py index 901744a75..288a21ab5 100644 --- a/examples/build_graph/simple_kg_builder_from_text.py +++ b/examples/build_graph/simple_kg_builder_from_text.py @@ -22,8 +22,8 @@ from neo4j_graphrag.llm.openai_llm import OpenAILLM logging.basicConfig() -# logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) -logging.getLogger("neo4j_graphrag").setLevel(logging.INFO) +logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) +# logging.getLogger("neo4j_graphrag").setLevel(logging.INFO) # Neo4j db infos diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index 6d53e4690..7127a36e7 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -38,7 +38,7 @@ from neo4j_graphrag.experimental.pipeline.exceptions import InvalidJSONError from neo4j_graphrag.generation.prompts import ERExtractionTemplate, PromptTemplate from neo4j_graphrag.llm import LLMInterface -from neo4j_graphrag.utils import prettyfier +from neo4j_graphrag.utils.logging import prettyfier logger = logging.getLogger(__name__) diff --git a/src/neo4j_graphrag/experimental/pipeline/config/runner.py b/src/neo4j_graphrag/experimental/pipeline/config/runner.py index c13217540..07b04dfff 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/runner.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/runner.py @@ -48,7 +48,7 @@ from neo4j_graphrag.experimental.pipeline.config.types import PipelineType from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult from neo4j_graphrag.experimental.pipeline.types import PipelineDefinition -from neo4j_graphrag.utils import prettyfier +from neo4j_graphrag.utils.logging import prettyfier logger = logging.getLogger(__name__) diff --git a/src/neo4j_graphrag/experimental/pipeline/pipeline.py b/src/neo4j_graphrag/experimental/pipeline/pipeline.py index a3f9b3c16..007d3ca4a 100644 --- a/src/neo4j_graphrag/experimental/pipeline/pipeline.py +++ b/src/neo4j_graphrag/experimental/pipeline/pipeline.py @@ -24,7 +24,7 @@ from timeit import default_timer from typing import Any, AsyncGenerator, Optional -from neo4j_graphrag.utils import prettyfier +from neo4j_graphrag.utils.logging import prettyfier try: import pygraphviz as pgv diff --git a/src/neo4j_graphrag/types.py b/src/neo4j_graphrag/types.py index 5a45141dd..3b2286acf 100644 --- a/src/neo4j_graphrag/types.py +++ b/src/neo4j_graphrag/types.py @@ -26,7 +26,7 @@ model_validator, ) -from neo4j_graphrag.utils import validate_search_query_input +from neo4j_graphrag.utils.validation import validate_search_query_input class RawSearchResult(BaseModel): diff --git a/src/neo4j_graphrag/utils/__init__.py b/src/neo4j_graphrag/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/neo4j_graphrag/utils.py b/src/neo4j_graphrag/utils/logging.py similarity index 55% rename from src/neo4j_graphrag/utils.py rename to src/neo4j_graphrag/utils/logging.py index b4331c883..eee12209c 100644 --- a/src/neo4j_graphrag/utils.py +++ b/src/neo4j_graphrag/utils/logging.py @@ -14,26 +14,32 @@ # limitations under the License. from __future__ import annotations -from typing import Optional, Any +import os +from typing import Any, Optional from pydantic import BaseModel +DEFAULT_MAX_LIST_LENGTH: int = 5 +DEFAULT_MAX_STRING_LENGTH: int = 200 -def validate_search_query_input( - query_text: Optional[str] = None, query_vector: Optional[list[float]] = None -) -> None: - if not (bool(query_vector) ^ bool(query_text)): - raise ValueError("You must provide exactly one of query_vector or query_text.") +class Prettyfier: + """Prettyfy any object for logging. + I.e.: truncate long lists and strings, even nested. -class Prettyfier: - """Prettyfy object for logging. + Max list and string length can be configured using env variables: + - LOGGING__MAX_LIST_LENGTH (int) + - LOGGING__MAX_STRING_LENGTH (int) + """ - I.e.: truncate long lists. - """ - def __init__(self, max_items_in_list: int = 5): - self.max_items_in_list = max_items_in_list + def __init__(self) -> None: + self.max_list_length = int(os.environ.get( + "LOGGING__MAX_LIST_LENGTH", DEFAULT_MAX_LIST_LENGTH + )) + self.max_string_length = int(os.environ.get( + "LOGGING__MAX_STRING_LENGTH", DEFAULT_MAX_STRING_LENGTH + )) def _prettyfy_dict(self, value: dict[Any, Any]) -> dict[Any, Any]: return { @@ -44,20 +50,30 @@ def _prettyfy_dict(self, value: dict[Any, Any]) -> dict[Any, Any]: def _prettyfy_list(self, value: list[Any]) -> list[Any]: items = [ self(v) # prettify each item - for v in value[:self.max_items_in_list] + for v in value[: self.max_list_length] ] remaining_items = len(value) - len(items) if remaining_items > 0: - items.append(f"...truncated {remaining_items} items...") + items.append(f"... ({remaining_items} items)") return items + def _prettyfy_str(self, value: str) -> str: + new_value = value[: self.max_string_length] + remaining_chars = len(value) - len(new_value) + if remaining_chars > 0: + new_value += f"... ({remaining_chars} chars)" + return new_value + def __call__(self, value: Any) -> Any: + """Takes any value and returns a prettified version for logging.""" if isinstance(value, dict): return self._prettyfy_dict(value) if isinstance(value, BaseModel): return self(value.model_dump()) if isinstance(value, list): return self._prettyfy_list(value) + if isinstance(value, str): + return self._prettyfy_str(value) return value diff --git a/src/neo4j_graphrag/utils/validation.py b/src/neo4j_graphrag/utils/validation.py new file mode 100644 index 000000000..e86f7588a --- /dev/null +++ b/src/neo4j_graphrag/utils/validation.py @@ -0,0 +1,24 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Optional + + +def validate_search_query_input( + query_text: Optional[str] = None, query_vector: Optional[list[float]] = None +) -> None: + if not (bool(query_vector) ^ bool(query_text)): + raise ValueError("You must provide exactly one of query_vector or query_text.") From 37f964ae5a92a981b3d53a83f026ba82bdfb12c9 Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 3 Jan 2025 10:23:17 +0100 Subject: [PATCH 04/13] ruff --- src/neo4j_graphrag/utils/logging.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/neo4j_graphrag/utils/logging.py b/src/neo4j_graphrag/utils/logging.py index eee12209c..5917ff10b 100644 --- a/src/neo4j_graphrag/utils/logging.py +++ b/src/neo4j_graphrag/utils/logging.py @@ -34,12 +34,12 @@ class Prettyfier: """ def __init__(self) -> None: - self.max_list_length = int(os.environ.get( - "LOGGING__MAX_LIST_LENGTH", DEFAULT_MAX_LIST_LENGTH - )) - self.max_string_length = int(os.environ.get( - "LOGGING__MAX_STRING_LENGTH", DEFAULT_MAX_STRING_LENGTH - )) + self.max_list_length = int( + os.environ.get("LOGGING__MAX_LIST_LENGTH", DEFAULT_MAX_LIST_LENGTH) + ) + self.max_string_length = int( + os.environ.get("LOGGING__MAX_STRING_LENGTH", DEFAULT_MAX_STRING_LENGTH) + ) def _prettyfy_dict(self, value: dict[Any, Any]) -> dict[Any, Any]: return { From 67174eba53e4f150a9fb9987c62fbacaa136d5fd Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 3 Jan 2025 10:38:30 +0100 Subject: [PATCH 05/13] Fix tests --- src/neo4j_graphrag/utils/logging.py | 2 +- tests/e2e/test_kg_writer_component_e2e.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/neo4j_graphrag/utils/logging.py b/src/neo4j_graphrag/utils/logging.py index 5917ff10b..7d6a10911 100644 --- a/src/neo4j_graphrag/utils/logging.py +++ b/src/neo4j_graphrag/utils/logging.py @@ -15,7 +15,7 @@ from __future__ import annotations import os -from typing import Any, Optional +from typing import Any from pydantic import BaseModel diff --git a/tests/e2e/test_kg_writer_component_e2e.py b/tests/e2e/test_kg_writer_component_e2e.py index 8774e30cf..75f8e5578 100644 --- a/tests/e2e/test_kg_writer_component_e2e.py +++ b/tests/e2e/test_kg_writer_component_e2e.py @@ -76,7 +76,7 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: if start_node.embedding_properties: # for mypy for key, val in start_node.embedding_properties.items(): assert key in node_a.keys() - assert val.root == node_a.get(key) + assert val == node_a.get(key) node_b = record["b"] assert end_node.label in list(node_b.labels) @@ -100,7 +100,7 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: if node_with_two_embeddings.embedding_properties: # for mypy for key, val in node_with_two_embeddings.embedding_properties.items(): assert key in node_c.keys() - assert val.root == node_c.get(key) + assert val == node_c.get(key) @pytest.mark.asyncio From 741a10314f3c7d961f9f8c95350080f991e246b4 Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 3 Jan 2025 11:12:39 +0100 Subject: [PATCH 06/13] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a248cb532..58ae6a61a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ ### Fixed - IDs for the Document and Chunk nodes in the lexical graph are now randomly generated and unique across multiple runs, fixing issues in the lexical graph where relationships were created between chunks that were created by different pipeline runs. - +- Improve logging for a better debugging experience: long lists and strings are now truncated. The max length can be controlled using the `LOGGING__MAX_LIST_LENGTH` and `LOGGING__MAX_STRING_LENGTH` env variables. ## 1.3.0 From 2f47d004cf50b9e8f64fb7c366ece5b471ba2b66 Mon Sep 17 00:00:00 2001 From: estelle Date: Fri, 3 Jan 2025 11:16:10 +0100 Subject: [PATCH 07/13] There is no reason not to test that --- tests/unit/experimental/components/test_embedder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/experimental/components/test_embedder.py b/tests/unit/experimental/components/test_embedder.py index 581ec532e..627a35c13 100644 --- a/tests/unit/experimental/components/test_embedder.py +++ b/tests/unit/experimental/components/test_embedder.py @@ -37,3 +37,5 @@ async def test_text_chunk_embedder_run(embedder: MagicMock) -> None: assert chunk.metadata is not None assert "embedding" in chunk.metadata.keys() assert isinstance(chunk.metadata["embedding"], list) + for i in chunk.metadata["embedding"]: + assert isinstance(i, float) From b91972050157b56b183824169a91e81a41653dac Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 6 Jan 2025 12:10:51 +0100 Subject: [PATCH 08/13] Rename --- .../experimental/components/entity_relation_extractor.py | 4 ++-- src/neo4j_graphrag/experimental/pipeline/config/runner.py | 4 ++-- src/neo4j_graphrag/experimental/pipeline/pipeline.py | 6 +++--- src/neo4j_graphrag/utils/logging.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py index 7127a36e7..376971e5f 100644 --- a/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py +++ b/src/neo4j_graphrag/experimental/components/entity_relation_extractor.py @@ -38,7 +38,7 @@ from neo4j_graphrag.experimental.pipeline.exceptions import InvalidJSONError from neo4j_graphrag.generation.prompts import ERExtractionTemplate, PromptTemplate from neo4j_graphrag.llm import LLMInterface -from neo4j_graphrag.utils.logging import prettyfier +from neo4j_graphrag.utils.logging import prettify logger = logging.getLogger(__name__) @@ -335,5 +335,5 @@ async def run( ] chunk_graphs: list[Neo4jGraph] = list(await asyncio.gather(*tasks)) graph = self.combine_chunk_graphs(lexical_graph, chunk_graphs) - logger.debug(f"Extracted graph: {prettyfier(graph)}") + logger.debug(f"Extracted graph: {prettify(graph)}") return graph diff --git a/src/neo4j_graphrag/experimental/pipeline/config/runner.py b/src/neo4j_graphrag/experimental/pipeline/config/runner.py index 07b04dfff..370695f71 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/runner.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/runner.py @@ -48,7 +48,7 @@ from neo4j_graphrag.experimental.pipeline.config.types import PipelineType from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult from neo4j_graphrag.experimental.pipeline.types import PipelineDefinition -from neo4j_graphrag.utils.logging import prettyfier +from neo4j_graphrag.utils.logging import prettify logger = logging.getLogger(__name__) @@ -125,7 +125,7 @@ async def run(self, user_input: dict[str, Any]) -> PipelineResult: else: run_param = deep_update(self.run_params, user_input) logger.info( - f"PIPELINE_RUNNER: starting pipeline {self.pipeline} with run_params={prettyfier(run_param)}" + f"PIPELINE_RUNNER: starting pipeline {self.pipeline} with run_params={prettify(run_param)}" ) result = await self.pipeline.run(data=run_param) if self.do_cleaning: diff --git a/src/neo4j_graphrag/experimental/pipeline/pipeline.py b/src/neo4j_graphrag/experimental/pipeline/pipeline.py index 007d3ca4a..b8dfcfad5 100644 --- a/src/neo4j_graphrag/experimental/pipeline/pipeline.py +++ b/src/neo4j_graphrag/experimental/pipeline/pipeline.py @@ -24,7 +24,7 @@ from timeit import default_timer from typing import Any, AsyncGenerator, Optional -from neo4j_graphrag.utils.logging import prettyfier +from neo4j_graphrag.utils.logging import prettify try: import pygraphviz as pgv @@ -100,12 +100,12 @@ async def execute(self, **kwargs: Any) -> RunResult | None: async def run(self, inputs: dict[str, Any]) -> RunResult | None: """Main method to execute the task.""" - logger.debug(f"TASK START {self.name=} input={prettyfier(inputs)}") + logger.debug(f"TASK START {self.name=} input={prettify(inputs)}") start_time = default_timer() res = await self.execute(**inputs) end_time = default_timer() logger.debug( - f"TASK FINISHED {self.name} in {end_time - start_time} res={prettyfier(res)}" + f"TASK FINISHED {self.name} in {end_time - start_time} res={prettify(res)}" ) return res diff --git a/src/neo4j_graphrag/utils/logging.py b/src/neo4j_graphrag/utils/logging.py index 7d6a10911..94ec8499d 100644 --- a/src/neo4j_graphrag/utils/logging.py +++ b/src/neo4j_graphrag/utils/logging.py @@ -23,7 +23,7 @@ DEFAULT_MAX_STRING_LENGTH: int = 200 -class Prettyfier: +class Prettifyer: """Prettyfy any object for logging. I.e.: truncate long lists and strings, even nested. @@ -77,4 +77,4 @@ def __call__(self, value: Any) -> Any: return value -prettyfier = Prettyfier() +prettify = Prettifyer() From 516b220fd1b99674caf645f60161a7a483525280 Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 6 Jan 2025 12:10:58 +0100 Subject: [PATCH 09/13] Add tests --- tests/unit/utils/__init__.py | 0 tests/unit/utils/test_prettifyer.py | 120 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 tests/unit/utils/__init__.py create mode 100644 tests/unit/utils/test_prettifyer.py diff --git a/tests/unit/utils/__init__.py b/tests/unit/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/utils/test_prettifyer.py b/tests/unit/utils/test_prettifyer.py new file mode 100644 index 000000000..badf58275 --- /dev/null +++ b/tests/unit/utils/test_prettifyer.py @@ -0,0 +1,120 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import call, patch, Mock + +from neo4j_graphrag.utils.logging import Prettifyer, prettify + + +def test_prettifyer_short_str() -> None: + p = Prettifyer() + value = "ab" + pretty_value = p._prettyfy_str(value) + assert pretty_value == "ab" + + +def test_prettifyer_long_str() -> None: + p = Prettifyer() + value = "ab" * 200 + pretty_value = p._prettyfy_str(value) + assert pretty_value == "ab" * 100 + f"... (200 chars)" + + +@patch("neo4j_graphrag.utils.logging.os.environ") +def test_prettifyer_str_custom_max_length(mock_env: Mock) -> None: + mock_env.return_value = {"LOGGING__MAX_STRING_LENGTH": "1"} + p = Prettifyer() + value = "abc" * 100 + pretty_value = p._prettyfy_str(value) + assert pretty_value == "a" + "... (299 chars)" + + +def test_prettifyer_short_list() -> None: + p = Prettifyer() + value = list("abc") + pretty_value = p._prettyfy_list(value) + assert pretty_value == ["a", "b", "c"] + + +def test_prettifyer_long_list() -> None: + p = Prettifyer() + value = list("abc") * 10 + pretty_value = p._prettyfy_list(value) + assert pretty_value == ["a", "b", "c", "a", "b", "... (25 items)"] + + +@patch("neo4j_graphrag.utils.logging.os.environ") +def test_prettifyer_list_custom_max_length(mock_env: Mock) -> None: + mock_env.return_value = {"LOGGING__MAX_LIST_LENGTH": "1"} + p = Prettifyer() + value = list("abc") * 10 + pretty_value = p._prettyfy_list(value) + assert pretty_value == ["a", "... (29 items)"] + + +def test_prettifyer_list_nested() -> None: + with patch.object( + Prettifyer, "_prettyfy_str", return_value="mocked string" + ) as mock: + p = Prettifyer() + value = ["abc" * 200] * 6 + pretty_value = p._prettyfy_list(value) + mock.assert_has_calls([call("abc" * 200)] * p.max_list_length) + assert pretty_value == ["mocked string"] * 5 + ["... (1 items)"] + + +def test_prettifyer_dict_nested() -> None: + with patch.object( + Prettifyer, "_prettyfy_str", return_value="mocked string" + ) as mock_str: + with patch.object( + Prettifyer, "_prettyfy_list", return_value=["mocked list"] + ) as mock_list: + p = Prettifyer() + value = { + "key1": "string", + "key2": ["a", "list"], + } + pretty_value = p._prettyfy_dict(value) + mock_str.assert_has_calls([call("string")]) + mock_list.assert_has_calls( + [ + call(["a", "list"]), + ] + ) + assert pretty_value == { + "key1": "mocked string", + "key2": ["mocked list"], + } + + +def test_prettify_function() -> None: + assert prettify({ + "key": { + "key0.1": "ab" * 200, + "key0.2": ["a"] * 10, + "key0.3": { + "key0.3.1": "a short strng" + } + } + }) == { + "key": { + "key0.1": "ab" * 100 + f"... (200 chars)", + "key0.2": ["a"] * 5 + ["... (5 items)"], + "key0.3": { + "key0.3.1": "a short strng" + } + } + } From ea0d0daeb1f88a850205d8db2c1b050ac81615da Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 6 Jan 2025 12:12:42 +0100 Subject: [PATCH 10/13] Update log message --- src/neo4j_graphrag/experimental/pipeline/config/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neo4j_graphrag/experimental/pipeline/config/runner.py b/src/neo4j_graphrag/experimental/pipeline/config/runner.py index 370695f71..c1bef9fee 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/runner.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/runner.py @@ -104,7 +104,7 @@ def from_config( ) -> Self: wrapper = PipelineConfigWrapper.model_validate({"config": config}) logger.debug( - f"PIPELINE_RUNNER: instantiate Pipeline from config type: {wrapper.config.template_}" + f"PIPELINE_RUNNER: instantiating Pipeline from config type: {wrapper.config.template_}" ) return cls(wrapper.parse(), config=wrapper.config, do_cleaning=do_cleaning) From decd3b8b262fd740fb6d53b0d976f4c58d109b06 Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 6 Jan 2025 12:14:35 +0100 Subject: [PATCH 11/13] Ruff --- tests/unit/utils/test_prettifyer.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/unit/utils/test_prettifyer.py b/tests/unit/utils/test_prettifyer.py index badf58275..a5b138506 100644 --- a/tests/unit/utils/test_prettifyer.py +++ b/tests/unit/utils/test_prettifyer.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from unittest.mock import call, patch, Mock +from unittest.mock import Mock, call, patch from neo4j_graphrag.utils.logging import Prettifyer, prettify @@ -101,20 +101,18 @@ def test_prettifyer_dict_nested() -> None: def test_prettify_function() -> None: - assert prettify({ - "key": { - "key0.1": "ab" * 200, - "key0.2": ["a"] * 10, - "key0.3": { - "key0.3.1": "a short strng" + assert prettify( + { + "key": { + "key0.1": "ab" * 200, + "key0.2": ["a"] * 10, + "key0.3": {"key0.3.1": "a short strng"}, } } - }) == { + ) == { "key": { "key0.1": "ab" * 100 + f"... (200 chars)", "key0.2": ["a"] * 5 + ["... (5 items)"], - "key0.3": { - "key0.3.1": "a short strng" - } + "key0.3": {"key0.3.1": "a short strng"}, } } From 9fa0974cbe34e9fa5d8dba022e7c05deba36963a Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 6 Jan 2025 12:16:41 +0100 Subject: [PATCH 12/13] Ruff again --- tests/unit/utils/test_prettifyer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/utils/test_prettifyer.py b/tests/unit/utils/test_prettifyer.py index a5b138506..fe37a2c3c 100644 --- a/tests/unit/utils/test_prettifyer.py +++ b/tests/unit/utils/test_prettifyer.py @@ -29,7 +29,7 @@ def test_prettifyer_long_str() -> None: p = Prettifyer() value = "ab" * 200 pretty_value = p._prettyfy_str(value) - assert pretty_value == "ab" * 100 + f"... (200 chars)" + assert pretty_value == "ab" * 100 + "... (200 chars)" @patch("neo4j_graphrag.utils.logging.os.environ") @@ -106,13 +106,13 @@ def test_prettify_function() -> None: "key": { "key0.1": "ab" * 200, "key0.2": ["a"] * 10, - "key0.3": {"key0.3.1": "a short strng"}, + "key0.3": {"key0.3.1": "a short string"}, } } ) == { "key": { - "key0.1": "ab" * 100 + f"... (200 chars)", + "key0.1": "ab" * 100 + "... (200 chars)", "key0.2": ["a"] * 5 + ["... (5 items)"], - "key0.3": {"key0.3.1": "a short strng"}, + "key0.3": {"key0.3.1": "a short string"}, } } From d395b38ffb701e198ec0b14b45655c98490da67c Mon Sep 17 00:00:00 2001 From: estelle Date: Mon, 6 Jan 2025 12:18:48 +0100 Subject: [PATCH 13/13] Spelling --- src/neo4j_graphrag/utils/logging.py | 16 ++++----- tests/unit/utils/test_prettifyer.py | 56 ++++++++++++++--------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/neo4j_graphrag/utils/logging.py b/src/neo4j_graphrag/utils/logging.py index 94ec8499d..e771023bb 100644 --- a/src/neo4j_graphrag/utils/logging.py +++ b/src/neo4j_graphrag/utils/logging.py @@ -23,7 +23,7 @@ DEFAULT_MAX_STRING_LENGTH: int = 200 -class Prettifyer: +class Prettifier: """Prettyfy any object for logging. I.e.: truncate long lists and strings, even nested. @@ -41,13 +41,13 @@ def __init__(self) -> None: os.environ.get("LOGGING__MAX_STRING_LENGTH", DEFAULT_MAX_STRING_LENGTH) ) - def _prettyfy_dict(self, value: dict[Any, Any]) -> dict[Any, Any]: + def _prettify_dict(self, value: dict[Any, Any]) -> dict[Any, Any]: return { k: self(v) # prettyfy each value for k, v in value.items() } - def _prettyfy_list(self, value: list[Any]) -> list[Any]: + def _prettify_list(self, value: list[Any]) -> list[Any]: items = [ self(v) # prettify each item for v in value[: self.max_list_length] @@ -57,7 +57,7 @@ def _prettyfy_list(self, value: list[Any]) -> list[Any]: items.append(f"... ({remaining_items} items)") return items - def _prettyfy_str(self, value: str) -> str: + def _prettify_str(self, value: str) -> str: new_value = value[: self.max_string_length] remaining_chars = len(value) - len(new_value) if remaining_chars > 0: @@ -67,14 +67,14 @@ def _prettyfy_str(self, value: str) -> str: def __call__(self, value: Any) -> Any: """Takes any value and returns a prettified version for logging.""" if isinstance(value, dict): - return self._prettyfy_dict(value) + return self._prettify_dict(value) if isinstance(value, BaseModel): return self(value.model_dump()) if isinstance(value, list): - return self._prettyfy_list(value) + return self._prettify_list(value) if isinstance(value, str): - return self._prettyfy_str(value) + return self._prettify_str(value) return value -prettify = Prettifyer() +prettify = Prettifier() diff --git a/tests/unit/utils/test_prettifyer.py b/tests/unit/utils/test_prettifyer.py index fe37a2c3c..a31fb8c72 100644 --- a/tests/unit/utils/test_prettifyer.py +++ b/tests/unit/utils/test_prettifyer.py @@ -15,79 +15,79 @@ from unittest.mock import Mock, call, patch -from neo4j_graphrag.utils.logging import Prettifyer, prettify +from neo4j_graphrag.utils.logging import Prettifier, prettify -def test_prettifyer_short_str() -> None: - p = Prettifyer() +def test_prettifier_short_str() -> None: + p = Prettifier() value = "ab" - pretty_value = p._prettyfy_str(value) + pretty_value = p._prettify_str(value) assert pretty_value == "ab" -def test_prettifyer_long_str() -> None: - p = Prettifyer() +def test_prettifier_long_str() -> None: + p = Prettifier() value = "ab" * 200 - pretty_value = p._prettyfy_str(value) + pretty_value = p._prettify_str(value) assert pretty_value == "ab" * 100 + "... (200 chars)" @patch("neo4j_graphrag.utils.logging.os.environ") -def test_prettifyer_str_custom_max_length(mock_env: Mock) -> None: +def test_prettifier_str_custom_max_length(mock_env: Mock) -> None: mock_env.return_value = {"LOGGING__MAX_STRING_LENGTH": "1"} - p = Prettifyer() + p = Prettifier() value = "abc" * 100 - pretty_value = p._prettyfy_str(value) + pretty_value = p._prettify_str(value) assert pretty_value == "a" + "... (299 chars)" -def test_prettifyer_short_list() -> None: - p = Prettifyer() +def test_prettifier_short_list() -> None: + p = Prettifier() value = list("abc") - pretty_value = p._prettyfy_list(value) + pretty_value = p._prettify_list(value) assert pretty_value == ["a", "b", "c"] -def test_prettifyer_long_list() -> None: - p = Prettifyer() +def test_prettifier_long_list() -> None: + p = Prettifier() value = list("abc") * 10 - pretty_value = p._prettyfy_list(value) + pretty_value = p._prettify_list(value) assert pretty_value == ["a", "b", "c", "a", "b", "... (25 items)"] @patch("neo4j_graphrag.utils.logging.os.environ") -def test_prettifyer_list_custom_max_length(mock_env: Mock) -> None: +def test_prettifier_list_custom_max_length(mock_env: Mock) -> None: mock_env.return_value = {"LOGGING__MAX_LIST_LENGTH": "1"} - p = Prettifyer() + p = Prettifier() value = list("abc") * 10 - pretty_value = p._prettyfy_list(value) + pretty_value = p._prettify_list(value) assert pretty_value == ["a", "... (29 items)"] -def test_prettifyer_list_nested() -> None: +def test_prettifier_list_nested() -> None: with patch.object( - Prettifyer, "_prettyfy_str", return_value="mocked string" + Prettifier, "_prettify_str", return_value="mocked string" ) as mock: - p = Prettifyer() + p = Prettifier() value = ["abc" * 200] * 6 - pretty_value = p._prettyfy_list(value) + pretty_value = p._prettify_list(value) mock.assert_has_calls([call("abc" * 200)] * p.max_list_length) assert pretty_value == ["mocked string"] * 5 + ["... (1 items)"] -def test_prettifyer_dict_nested() -> None: +def test_prettifier_dict_nested() -> None: with patch.object( - Prettifyer, "_prettyfy_str", return_value="mocked string" + Prettifier, "_prettify_str", return_value="mocked string" ) as mock_str: with patch.object( - Prettifyer, "_prettyfy_list", return_value=["mocked list"] + Prettifier, "_prettify_list", return_value=["mocked list"] ) as mock_list: - p = Prettifyer() + p = Prettifier() value = { "key1": "string", "key2": ["a", "list"], } - pretty_value = p._prettyfy_dict(value) + pretty_value = p._prettify_dict(value) mock_str.assert_has_calls([call("string")]) mock_list.assert_has_calls( [