From 3397bf90577e6d2a979aa9ceccd4d598ca19fdc2 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 12 Dec 2024 13:28:57 -0500 Subject: [PATCH 1/6] test --- .../test_canonical_expectations.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py index 96661debf686..ddb9b009a72b 100644 --- a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py +++ b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py @@ -4,6 +4,7 @@ import pandas as pd import great_expectations.expectations as gxe +from great_expectations.core.expectation_suite import ExpectationSuite from tests.integration.conftest import parameterize_batch_for_data_sources from tests.integration.test_utils.data_source_config import ( BigQueryDatasourceTestConfig, @@ -154,3 +155,27 @@ def test_expect_column_mean_to_be_between(batch_for_datasource): expectation = gxe.ExpectColumnMeanToBeBetween(column="a", min_value=2, max_value=3) result = batch_for_datasource.validate(expectation) assert result.success + + +@parameterize_batch_for_data_sources( + data_source_configs=[SparkFilesystemCsvDatasourceTestConfig()], + data=pd.DataFrame( + { + "names": ["Bob", "Alice", "Charlie"], + "emails": ["bob@gmail.com", "alice@gmail.com", "charlie@gmail.com"], + "dates": ["0", "1", "2"], + } + ), +) +def test_faulty_strtime_causes_entire_suite_to_fail(batch_for_datasource): + suite = ExpectationSuite( + name="faulty", + expectations=[ + gxe.ExpectColumnValuesToMatchStrftimeFormat(column="dates", strftime_format="%Y-%m-%d"), + gxe.ExpectColumnValuesToNotBeNull(column="names"), + gxe.ExpectColumnValuesToMatchRegex(column="emails", regex="@gmail.com"), + ], + ) + result = batch_for_datasource.validate(suite) + assert not result.success + assert all(res.success is False and res.exception_info for res in result.results) From b86b7c15030b690c47125b7dd08e3b1ef807bf80 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 12 Dec 2024 13:39:05 -0500 Subject: [PATCH 2/6] add test --- .../test_canonical_expectations.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py index ddb9b009a72b..c6bdf2eee7a7 100644 --- a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py +++ b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py @@ -161,21 +161,28 @@ def test_expect_column_mean_to_be_between(batch_for_datasource): data_source_configs=[SparkFilesystemCsvDatasourceTestConfig()], data=pd.DataFrame( { - "names": ["Bob", "Alice", "Charlie"], - "emails": ["bob@gmail.com", "alice@gmail.com", "charlie@gmail.com"], - "dates": ["0", "1", "2"], + "col1": [1, 2, 3, 4, 5], + "col2": ["A", "B", "C", "D", None], + "col3": [1.1, None, 3.3, 4.4, 5.5], } ), ) -def test_faulty_strtime_causes_entire_suite_to_fail(batch_for_datasource): +def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasource): suite = ExpectationSuite( name="faulty", expectations=[ - gxe.ExpectColumnValuesToMatchStrftimeFormat(column="dates", strftime_format="%Y-%m-%d"), - gxe.ExpectColumnValuesToNotBeNull(column="names"), - gxe.ExpectColumnValuesToMatchRegex(column="emails", regex="@gmail.com"), + gxe.ExpectColumnValuesToNotBeNull(column="col1", result_format="COMPLETE"), + gxe.ExpectColumnValuesToBeInSet( + column="col2", + value_set=["A", "B", "C"], + row_condition="col3 IS NOT NULL", + mostly=0.665, + # condition_parser='spark', + result_format="COMPLETE", + ), ], ) + result = batch_for_datasource.validate(suite) assert not result.success assert all(res.success is False and res.exception_info for res in result.results) From 0b7cba6408eb0e23e533d69b67b15dba3f1e7620 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 12 Dec 2024 13:48:51 -0500 Subject: [PATCH 3/6] fix test --- .../test_canonical_expectations.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py index c6bdf2eee7a7..f8263f860a1c 100644 --- a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py +++ b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py @@ -158,7 +158,7 @@ def test_expect_column_mean_to_be_between(batch_for_datasource): @parameterize_batch_for_data_sources( - data_source_configs=[SparkFilesystemCsvDatasourceTestConfig()], + data_source_configs=[PostgreSQLDatasourceTestConfig()], data=pd.DataFrame( { "col1": [1, 2, 3, 4, 5], @@ -171,18 +171,27 @@ def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasour suite = ExpectationSuite( name="faulty", expectations=[ - gxe.ExpectColumnValuesToNotBeNull(column="col1", result_format="COMPLETE"), + gxe.ExpectColumnValuesToNotBeNull(column="col1"), + gxe.ExpectColumnValuesToNotBeNull(column="col2"), gxe.ExpectColumnValuesToBeInSet( column="col2", value_set=["A", "B", "C"], row_condition="col3 IS NOT NULL", mostly=0.665, - # condition_parser='spark', - result_format="COMPLETE", + condition_parser=None, # Should be specified as 'great_expectations' ), ], ) result = batch_for_datasource.validate(suite) assert not result.success - assert all(res.success is False and res.exception_info for res in result.results) + # Resolution of the 'table.row_count' metric (which is used by all expectations above) fails + # because the condition_parser is inaccurate. + # BUG - The error message should not permeates across the suite due to the shared metric + # dependency. + assert all( + res.success is False + and "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser." + in str(res.exception_info) + for res in result.results + ) From 640759c1aab78ef0021b911385ec1dc04bb803ca Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 12 Dec 2024 14:15:08 -0500 Subject: [PATCH 4/6] fix test --- .../test_canonical_expectations.py | 41 ---------- .../test_known_issues.py | 74 +++++++++++++++++++ 2 files changed, 74 insertions(+), 41 deletions(-) create mode 100644 tests/integration/data_sources_and_expectations/test_known_issues.py diff --git a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py index f8263f860a1c..96661debf686 100644 --- a/tests/integration/data_sources_and_expectations/test_canonical_expectations.py +++ b/tests/integration/data_sources_and_expectations/test_canonical_expectations.py @@ -4,7 +4,6 @@ import pandas as pd import great_expectations.expectations as gxe -from great_expectations.core.expectation_suite import ExpectationSuite from tests.integration.conftest import parameterize_batch_for_data_sources from tests.integration.test_utils.data_source_config import ( BigQueryDatasourceTestConfig, @@ -155,43 +154,3 @@ def test_expect_column_mean_to_be_between(batch_for_datasource): expectation = gxe.ExpectColumnMeanToBeBetween(column="a", min_value=2, max_value=3) result = batch_for_datasource.validate(expectation) assert result.success - - -@parameterize_batch_for_data_sources( - data_source_configs=[PostgreSQLDatasourceTestConfig()], - data=pd.DataFrame( - { - "col1": [1, 2, 3, 4, 5], - "col2": ["A", "B", "C", "D", None], - "col3": [1.1, None, 3.3, 4.4, 5.5], - } - ), -) -def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasource): - suite = ExpectationSuite( - name="faulty", - expectations=[ - gxe.ExpectColumnValuesToNotBeNull(column="col1"), - gxe.ExpectColumnValuesToNotBeNull(column="col2"), - gxe.ExpectColumnValuesToBeInSet( - column="col2", - value_set=["A", "B", "C"], - row_condition="col3 IS NOT NULL", - mostly=0.665, - condition_parser=None, # Should be specified as 'great_expectations' - ), - ], - ) - - result = batch_for_datasource.validate(suite) - assert not result.success - # Resolution of the 'table.row_count' metric (which is used by all expectations above) fails - # because the condition_parser is inaccurate. - # BUG - The error message should not permeates across the suite due to the shared metric - # dependency. - assert all( - res.success is False - and "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser." - in str(res.exception_info) - for res in result.results - ) diff --git a/tests/integration/data_sources_and_expectations/test_known_issues.py b/tests/integration/data_sources_and_expectations/test_known_issues.py new file mode 100644 index 000000000000..44f74b726048 --- /dev/null +++ b/tests/integration/data_sources_and_expectations/test_known_issues.py @@ -0,0 +1,74 @@ +""" +Responsible for highlighting known bugs we're working to resolve. +""" + +import pandas as pd + +import great_expectations.expectations as gxe +from great_expectations.core.expectation_suite import ExpectationSuite +from tests.integration.conftest import parameterize_batch_for_data_sources +from tests.integration.test_utils.data_source_config.pandas_data_frame import ( + PandasDataFrameDatasourceTestConfig, +) +from tests.integration.test_utils.data_source_config.postgres import PostgreSQLDatasourceTestConfig + + +@parameterize_batch_for_data_sources( + data_source_configs=[PostgreSQLDatasourceTestConfig()], + data=pd.DataFrame( + { + "col1": [1, 2, 3, 4, 5], + "col2": ["A", "B", "C", "D", None], + "col3": [1.1, None, 3.3, 4.4, 5.5], + } + ), +) +def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasource): + suite = ExpectationSuite( + name="faulty", + expectations=[ + gxe.ExpectColumnValuesToNotBeNull(column="col1"), + gxe.ExpectColumnValuesToNotBeNull(column="col2"), + gxe.ExpectColumnValuesToBeInSet( + column="col2", + value_set=["A", "B", "C"], + row_condition="col3 IS NOT NULL", + mostly=0.665, + condition_parser=None, # Should be specified as 'great_expectations' + ), + ], + ) + + result = batch_for_datasource.validate(suite) + + # Resolution of the 'table.row_count' metric (which is used by all expectations above) fails + # because the condition_parser is inaccurate. + # BUG - The error message should not permeate across the suite due to the shared metric + # dependency. + assert result.success is False + assert all( + res.success is False + and "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser." + in str(res.exception_info) + for res in result.results + ) + + +@parameterize_batch_for_data_sources( + data_source_configs=[PandasDataFrameDatasourceTestConfig()], + data=pd.DataFrame( + { + "col1": [1, 2, 3, 4, 5], + } + ), +) +def test_catch_exceptions_is_not_respected(batch_for_datasource): + expectation = gxe.ExpectColumnValuesToMatchStrftimeFormat( + column="col1", strftime_format="%Y-%m-%d", catch_exceptions=False + ) + result = batch_for_datasource.validate(expectation) + + assert result.success is False + assert "please call the expectation before converting from string format" in str( + result.exception_info + ) From 0fb90231cad2507c5175c7158b0d2cddd938f507 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 12 Dec 2024 14:18:45 -0500 Subject: [PATCH 5/6] update tests --- .../test_known_issues.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/integration/data_sources_and_expectations/test_known_issues.py b/tests/integration/data_sources_and_expectations/test_known_issues.py index 44f74b726048..1148f23718f0 100644 --- a/tests/integration/data_sources_and_expectations/test_known_issues.py +++ b/tests/integration/data_sources_and_expectations/test_known_issues.py @@ -24,6 +24,15 @@ ), ) def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasource): + """ + This test demonstrates the bug where a missing condition_parser causes the entire suite to fail. + + The bug is that the error message from the third expectation is propagated to all subsequent + expectations. + + All expectations share a dependency on the 'table.row_count' metric, causing the issue to + propagate. + """ suite = ExpectationSuite( name="faulty", expectations=[ @@ -41,10 +50,6 @@ def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasour result = batch_for_datasource.validate(suite) - # Resolution of the 'table.row_count' metric (which is used by all expectations above) fails - # because the condition_parser is inaccurate. - # BUG - The error message should not permeate across the suite due to the shared metric - # dependency. assert result.success is False assert all( res.success is False @@ -63,6 +68,12 @@ def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasour ), ) def test_catch_exceptions_is_not_respected(batch_for_datasource): + """ + This test demonstrates the bug where catch_exceptions is not respected. + + We do not currently have any logic that respects user configuration + and disables the default value of True. + """ expectation = gxe.ExpectColumnValuesToMatchStrftimeFormat( column="col1", strftime_format="%Y-%m-%d", catch_exceptions=False ) From 06cc26c3efab4307097f46f9bd3207ac6b7789de Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 12 Dec 2024 14:22:23 -0500 Subject: [PATCH 6/6] update tests --- .../data_sources_and_expectations/test_known_issues.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/data_sources_and_expectations/test_known_issues.py b/tests/integration/data_sources_and_expectations/test_known_issues.py index 1148f23718f0..1d4abe836faf 100644 --- a/tests/integration/data_sources_and_expectations/test_known_issues.py +++ b/tests/integration/data_sources_and_expectations/test_known_issues.py @@ -23,15 +23,17 @@ } ), ) -def test_missing_condition_parser_causes_entire_suite_to_fail(batch_for_datasource): +def test_shared_metric_dependencies_cause_entire_suite_to_fail(batch_for_datasource): """ This test demonstrates the bug where a missing condition_parser causes the entire suite to fail. The bug is that the error message from the third expectation is propagated to all subsequent expectations. - All expectations share a dependency on the 'table.row_count' metric, causing the issue to - propagate. + This highlights a larger issue where shared metric dependencies can cause issues to propagate + across expectations. A missing condition parser is simply one example of this issue. + + All expectations here share a dependency on the 'table.row_count' metric. """ suite = ExpectationSuite( name="faulty",