diff --git a/src/databricks/labs/dqx/profiler/generator.py b/src/databricks/labs/dqx/profiler/generator.py index 05475ae37..ac8ab37dc 100644 --- a/src/databricks/labs/dqx/profiler/generator.py +++ b/src/databricks/labs/dqx/profiler/generator.py @@ -1,14 +1,13 @@ import logging +import datetime import json from collections.abc import Callable - from pyspark.sql import SparkSession - from databricks.sdk import WorkspaceClient + from databricks.labs.dqx.base import DQEngineBase from databricks.labs.dqx.config import LLMModelConfig from databricks.labs.dqx.engine import DQEngine -from databricks.labs.dqx.profiler.common import val_maybe_to_str from databricks.labs.dqx.profiler.profiler import DQProfile from databricks.labs.dqx.telemetry import telemetry_logger from databricks.labs.dqx.errors import MissingParameterError @@ -153,49 +152,59 @@ def dq_generate_min_max(column: str, level: str = "error", **params: dict): Generates a data quality rule to check if a column's value is within a specified range. Args: - column: The name of the column to check. - level: The criticality level of the rule (default is "error"). - params: Additional parameters, including the minimum and maximum values. + column: The name of the column to check. + level: The criticality level of the rule (default is "error"). + params: Additional parameters, including the minimum and maximum values. Returns: - A dictionary representing the data quality rule, or None if no limits are provided. + A dictionary representing the data quality rule, or None if no limits are provided. """ min_limit = params.get("min") max_limit = params.get("max") - if not isinstance(min_limit, int) or not isinstance(max_limit, int): - return None # TODO handle timestamp and dates: https://github.com/databrickslabs/dqx/issues/71 + if min_limit is None and max_limit is None: + return None + + def _is_num(value): + return isinstance(value, int) - if min_limit is not None and max_limit is not None: + def _is_temporal(value): + return isinstance(value, (datetime.date, datetime.datetime)) + + def _same_family(value_a, value_b): + # numeric with numeric OR temporal with temporal + if value_a is None or value_b is None: + return True + return (_is_num(value_a) and _is_num(value_b)) or (_is_temporal(value_a) and _is_temporal(value_b)) + + # Both bounds + if min_limit is not None and max_limit is not None and _same_family(min_limit, max_limit): return { "check": { "function": "is_in_range", "arguments": { "column": column, - "min_limit": val_maybe_to_str(min_limit, include_sql_quotes=False), - "max_limit": val_maybe_to_str(max_limit, include_sql_quotes=False), + # pass through Python ints or datetime/date without stringification + "min_limit": min_limit, + "max_limit": max_limit, }, }, "name": f"{column}_isnt_in_range", "criticality": level, } - if max_limit is not None: + # Only max + if max_limit is not None and (_is_num(max_limit) or _is_temporal(max_limit)): return { - "check": { - "function": "is_not_greater_than", - "arguments": {"column": column, "limit": val_maybe_to_str(max_limit, include_sql_quotes=False)}, - }, + "check": {"function": "is_not_greater_than", "arguments": {"column": column, "limit": max_limit}}, "name": f"{column}_not_greater_than", "criticality": level, } - if min_limit is not None: + # Only min + if min_limit is not None and (_is_num(min_limit) or _is_temporal(min_limit)): return { - "check": { - "function": "is_not_less_than", - "arguments": {"column": column, "limit": val_maybe_to_str(min_limit, include_sql_quotes=False)}, - }, + "check": {"function": "is_not_less_than", "arguments": {"column": column, "limit": min_limit}}, "name": f"{column}_not_less_than", "criticality": level, } diff --git a/tests/integration/test_rules_generator.py b/tests/integration/test_rules_generator.py index 8fa49459c..ee8139ec8 100644 --- a/tests/integration/test_rules_generator.py +++ b/tests/integration/test_rules_generator.py @@ -1,5 +1,5 @@ +import logging import datetime -from decimal import Decimal from databricks.labs.dqx.profiler.generator import DQGenerator from databricks.labs.dqx.profiler.profiler import DQProfile @@ -22,19 +22,6 @@ parameters={"min": datetime.date(2020, 1, 1), "max": None}, description="Real min/max values were used", ), - DQProfile( - name="min_max", - column="product_expiry_ts", - parameters={"min": None, "max": datetime.datetime(2020, 1, 1)}, - description="Real min/max values were used", - ), - DQProfile(name="is_random", column="vendor_id", parameters={"in": ["1", "4", "2"]}), - DQProfile( - name='min_max', - column='d1', - description='Real min/max values were used', - parameters={'max': Decimal('333323.00'), 'min': Decimal('1.23')}, - ), ] @@ -71,6 +58,14 @@ def test_generate_dq_rules(ws): "name": "rate_code_id_isnt_in_range", "criticality": "error", }, + { + "check": { + "function": "is_not_less_than", + "arguments": {"column": "product_launch_date", "limit": datetime.date(2020, 1, 1)}, + }, + "name": "product_launch_date_not_less_than", + "criticality": "error", + }, ] assert expectations == expected @@ -108,13 +103,27 @@ def test_generate_dq_rules_warn(ws): "name": "rate_code_id_isnt_in_range", "criticality": "warn", }, + { + "check": { + "function": "is_not_less_than", + "arguments": {"column": "product_launch_date", "limit": datetime.date(2020, 1, 1)}, + }, + "name": "product_launch_date_not_less_than", + "criticality": "warn", + }, ] assert expectations == expected def test_generate_dq_rules_logging(ws, caplog): + # capture INFO from the generator module where the skip log is emitted + caplog.set_level(logging.INFO, logger="databricks.labs.dqx.profiler.generator") + generator = DQGenerator(ws) - generator.generate_dq_rules(test_rules) + # add an unknown rule to trigger the "skipping..." log + unknown_rule = DQProfile(name="is_random", column="vendor_id") + generator.generate_dq_rules(test_rules + [unknown_rule]) + assert "No rule 'is_random' for column 'vendor_id'. skipping..." in caplog.text diff --git a/tests/unit/test_generator_temporal.py b/tests/unit/test_generator_temporal.py new file mode 100644 index 000000000..7296ceadb --- /dev/null +++ b/tests/unit/test_generator_temporal.py @@ -0,0 +1,32 @@ +import datetime + +from databricks.labs.dqx.profiler.generator import DQGenerator + + +def test_date_both_bounds_is_in_range(): + result = DQGenerator.dq_generate_min_max( + "dcol", **{"min": datetime.date(2020, 1, 1), "max": datetime.date(2020, 12, 31)} + ) + assert result["check"]["function"] == "is_in_range" + args = result["check"]["arguments"] + assert args["column"] == "dcol" + assert args["min_limit"] == datetime.date(2020, 1, 1) + assert args["max_limit"] == datetime.date(2020, 12, 31) + + +def test_timestamp_only_min_is_not_less_than(): + timestamp = datetime.datetime(2024, 6, 1, 12, 0, 0) + result = DQGenerator.dq_generate_min_max("tscol", **{"min": timestamp, "max": None}) + assert result["check"]["function"] == "is_not_less_than" + args = result["check"]["arguments"] + assert args["column"] == "tscol" + assert args["limit"] == timestamp + + +def test_timestamp_only_max_is_not_greater_than(): + timestamp = datetime.datetime(2024, 6, 30, 23, 59, 59) + result = DQGenerator.dq_generate_min_max("tscol", **{"min": None, "max": timestamp}) + assert result["check"]["function"] == "is_not_greater_than" + args = result["check"]["arguments"] + assert args["column"] == "tscol" + assert args["limit"] == timestamp