diff --git a/CHANGELOG.md b/CHANGELOG.md index a198bc0..216cb4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,13 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Term frequency adjustments are no longer limited to one in Clickhouse server (or `chdb` when `debug_mode` is switched on) [#46](https://github.com/ADBond/splinkclickhouse/pull/46). + +### Changed + +- Dropped support for Splink <= `4.0.5` [#46](https://github.com/ADBond/splinkclickhouse/pull/46). + ## [0.3.2] - 2024-10-23 ### Added -- SQL UDF `days_since_epoch` to parse a date representing a string to the number of days since `1970-01-01` [#39](https://github.com/ADBond/splinkclickhouse/pull/39) -- Custom Clickhouse `ColumnExpression` with additional transform `parse_date_to_int` to parse string to days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39) -- Custom date comparison and comparison levels working with integer type representing days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39) +- SQL UDF `days_since_epoch` to parse a string representing a date to the number of days since `1970-01-01` [#39](https://github.com/ADBond/splinkclickhouse/pull/39). +- Custom Clickhouse `ColumnExpression` with additional transform `parse_date_to_int` to parse string to days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39). +- Custom date comparison and comparison levels working with integer type representing days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39). ## [0.3.1] - 2024-10-14 diff --git a/README.md b/README.md index 833b377..a2fcc74 100644 --- a/README.md +++ b/README.md @@ -238,12 +238,6 @@ import splink.comparison_level as cl first_name_comparison = cl.DamerauLevenshteinAtThresholds("NULLIF(first_name, '')") ``` -### Term-frequency adjustments - -Currently at most one term frequency adjustment can be used with `ClickhouseAPI`. 
- -This also applies to `ChDBAPI` but _only in `debug_mode`_. With `debug_mode` off there is no limit on term frequency adjustments. - ### `ClickhouseAPI` pandas registration `ClickhouseAPI` will allow registration of pandas dataframes, by inferring the types of columns. It currently only does this for string, integer, and float columns, and will always make them `Nullable`. diff --git a/pyproject.toml b/pyproject.toml index 4c24b7f..4621a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ classifiers = [ ] requires-python = ">=3.9" dependencies = [ - "splink >= 4.0.2", + "splink >= 4.0.6", "clickhouse_connect >= 0.7.0", ] [project.urls] diff --git a/scripts/getting_started_clickhouse.py b/scripts/getting_started_clickhouse.py index 371288f..0517e29 100644 --- a/scripts/getting_started_clickhouse.py +++ b/scripts/getting_started_clickhouse.py @@ -24,11 +24,10 @@ db_api = ClickhouseAPI(client) -# TODO: tf adjustments need deep work (can have _one_ but not more) settings = SettingsCreator( link_type="dedupe_only", comparisons=[ - cl.JaroWinklerAtThresholds("first_name"), + cl.NameComparison("first_name"), cl.JaroAtThresholds("surname"), cl.DateOfBirthComparison( "dob", @@ -37,7 +36,7 @@ cl.DamerauLevenshteinAtThresholds("city").configure( term_frequency_adjustments=True ), - cl.JaccardAtThresholds("email"), + cl.EmailComparison("email"), ], blocking_rules_to_generate_predictions=[ block_on("first_name", "dob"), diff --git a/tests/conftest.py b/tests/conftest.py index f5f25a5..eedaa57 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -88,8 +88,12 @@ def fake_1000_settings(version): return SettingsCreator( link_type="dedupe_only", comparisons=[ - cl.JaroWinklerAtThresholds("first_name"), - cl.JaroAtThresholds("surname"), + cl.JaroWinklerAtThresholds("first_name").configure( + term_frequency_adjustments=True + ), + cl.JaroAtThresholds("surname").configure( + term_frequency_adjustments=True + ), cl.DateOfBirthComparison( "dob", 
input_is_string=True, @@ -112,8 +116,10 @@ def fake_1000_settings(version): comparisons=[ cl.JaroWinklerAtThresholds( ColumnExpression("first_name").regex_extract(".*") - ), - cl.JaroAtThresholds(ColumnExpression("surname").regex_extract(".*")), + ).configure(term_frequency_adjustments=True), + cl.JaroAtThresholds( + ColumnExpression("surname").regex_extract(".*") + ).configure(term_frequency_adjustments=True), cl.DateOfBirthComparison( ColumnExpression("dob").regex_extract(".*"), input_is_string=True, diff --git a/uv.lock b/uv.lock index 9d509db..39be467 100644 --- a/uv.lock +++ b/uv.lock @@ -879,7 +879,7 @@ wheels = [ [[package]] name = "splink" -version = "4.0.5" +version = "4.0.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "altair" }, @@ -891,9 +891,9 @@ dependencies = [ { name = "pandas" }, { name = "sqlglot" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9e/51/66dd1871f1ed6edaad43dc1121dd1e59d4ef0c5d3cd993b23b5c751ab94e/splink-4.0.5.tar.gz", hash = "sha256:72dbdaa7a1211733018d01a80b87f3bfecd32216a1693b1c67fe31db9034f356", size = 3654992 } +sdist = { url = "https://files.pythonhosted.org/packages/f0/86/23c722b0742b77c0762a2bbe627227faa7ff812d740c08ac7df20c3f5961/splink-4.0.6.tar.gz", hash = "sha256:1b2c860edb5c7eae3649706c1e8e98247c9d051a3109341951ee883d424bf6ca", size = 3658160 } wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/68/bb9108f4341e41b95d203c9c8f47d7f52a7d6e96348b83dc3ba1f075e91d/splink-4.0.5-py3-none-any.whl", hash = "sha256:0afc28e12fc863030ad1add89dffa54c91a35b50f14fee64ac78bfa43f5d8866", size = 3717815 }, + { url = "https://files.pythonhosted.org/packages/9f/5a/8b28279da9947884fc46bafbeba06f41b9f9a43126e87397668aa2527bf7/splink-4.0.6-py3-none-any.whl", hash = "sha256:697377c5a401368e58ce11e3b8d4b28cfe9452625e81d19994709cad37621830", size = 3721667 }, ] [[package]] @@ -923,7 +923,7 @@ dev = [ requires-dist = [ { name = "chdb", marker = "extra == 'chdb'", specifier = ">=2.0.1" }, { 
name = "clickhouse-connect", specifier = ">=0.7.0" }, - { name = "splink", specifier = ">=4.0.2" }, + { name = "splink", specifier = ">=4.0.6" }, ] [package.metadata.requires-dev]