From dd1bc8d1ad363050c62d1aa7584680369d21f2b6 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:47:05 +0000 Subject: [PATCH 1/9] bump minimum splink version this includes a change that alters SQL so that we can have multiple tf adjustments in Clickhouse server, or chdb + debug mode --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4c24b7f..4621a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ classifiers = [ ] requires-python = ">=3.9" dependencies = [ - "splink >= 4.0.2", + "splink >= 4.0.6", "clickhouse_connect >= 0.7.0", ] [project.urls] From e1baa4ebc49d42770efafef68802ac425c828955 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:47:16 +0000 Subject: [PATCH 2/9] Remove README caveat --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index 833b377..a2fcc74 100644 --- a/README.md +++ b/README.md @@ -238,12 +238,6 @@ import splink.comparison_level as cl first_name_comparison = cl.DamerauLevenshteinAtThresholds("NULLIF(first_name, '')") ``` -### Term-frequency adjustments - -Currently at most one term frequency adjustment can be used with `ClickhouseAPI`. - -This also applies to `ChDBAPI` but _only in `debug_mode`_. With `debug_mode` off there is no limit on term frequency adjustments. - ### `ClickhouseAPI` pandas registration `ClickhouseAPI` will allow registration of pandas dataframes, by inferring the types of columns. It currently only does this for string, integer, and float columns, and will always make them `Nullable`. From 3d767e3a17c8b00ed0c4acd148b824fabb357390 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:49:26 +0000 Subject: [PATCH 3/9] chuck a bunch of tf adjustments into test fixtures --- tests/conftest.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f5f25a5..88d8daa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -88,8 +88,12 @@ def fake_1000_settings(version): return SettingsCreator( link_type="dedupe_only", comparisons=[ - cl.JaroWinklerAtThresholds("first_name"), - cl.JaroAtThresholds("surname"), + cl.JaroWinklerAtThresholds("first_name").configure( + term_frequency_adjustments=True + ), + cl.JaroAtThresholds("surname").configure( + term_frequency_adjustments=True + ), cl.DateOfBirthComparison( "dob", input_is_string=True, @@ -112,8 +116,12 @@ def fake_1000_settings(version): comparisons=[ cl.JaroWinklerAtThresholds( ColumnExpression("first_name").regex_extract(".*") + ).configure( + term_frequency_adjustments=True + ), + cl.JaroAtThresholds(ColumnExpression("surname").regex_extract(".*")).configure( + term_frequency_adjustments=True ), - cl.JaroAtThresholds(ColumnExpression("surname").regex_extract(".*")), cl.DateOfBirthComparison( ColumnExpression("dob").regex_extract(".*"), input_is_string=True, From b89ddaa97c41699643ee44eb003c8458a7d4ffd3 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:50:28 +0000 Subject: [PATCH 4/9] Clickhouse server getting started use same model as others now that we have no limit on tf adjustments --- scripts/getting_started_clickhouse.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/getting_started_clickhouse.py b/scripts/getting_started_clickhouse.py index 371288f..0517e29 100644 --- a/scripts/getting_started_clickhouse.py +++ b/scripts/getting_started_clickhouse.py @@ -24,11 +24,10 @@ db_api = ClickhouseAPI(client) -# TODO: tf adjustments need deep work (can have _one_ but not more) settings = SettingsCreator( link_type="dedupe_only", comparisons=[ - cl.JaroWinklerAtThresholds("first_name"), + cl.NameComparison("first_name"), cl.JaroAtThresholds("surname"), cl.DateOfBirthComparison( "dob", @@ -37,7 +36,7 @@ cl.DamerauLevenshteinAtThresholds("city").configure( term_frequency_adjustments=True ), - cl.JaccardAtThresholds("email"), + cl.EmailComparison("email"), ], blocking_rules_to_generate_predictions=[ block_on("first_name", "dob"), From 51bb1638fba0ba5141a97b7493cae172ba232bcc Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:52:24 +0000 Subject: [PATCH 5/9] changelog update --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a198bc0..a21deb2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Term frequency adjustments are now not limited in Clickhouse server (or `chdb` when `debug_mode` is switched on). + +### Changed + +- Dropped support for Splink <= `4.0.5`. + ## [0.3.2] - 2024-10-23 ### Added From 1b77dc7db249a9189b4d2869a2f2e0296a5e5573 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:52:40 +0000 Subject: [PATCH 6/9] Consistent full stop format in changelog --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a21deb2..fb0c585 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,9 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- SQL UDF `days_since_epoch` to parse a date representing a string to the number of days since `1970-01-01` [#39](https://github.com/ADBond/splinkclickhouse/pull/39) -- Custom Clickhouse `ColumnExpression` with additional transform `parse_date_to_int` to parse string to days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39) -- Custom date comparison and comparison levels working with integer type representing days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39) +- SQL UDF `days_since_epoch` to parse a date representing a string to the number of days since `1970-01-01` [#39](https://github.com/ADBond/splinkclickhouse/pull/39). +- Custom Clickhouse `ColumnExpression` with additional transform `parse_date_to_int` to parse string to days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39). +- Custom date comparison and comparison levels working with integer type representing days since epoch [#39](https://github.com/ADBond/splinkclickhouse/pull/39). ## [0.3.1] - 2024-10-14 From d153c31bbeb0795d9c72cae63562a7b5dfb6427b Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:52:17 +0000 Subject: [PATCH 7/9] format conftest --- tests/conftest.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 88d8daa..eedaa57 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -116,12 +116,10 @@ def fake_1000_settings(version): comparisons=[ cl.JaroWinklerAtThresholds( ColumnExpression("first_name").regex_extract(".*") - ).configure( - term_frequency_adjustments=True - ), - cl.JaroAtThresholds(ColumnExpression("surname").regex_extract(".*")).configure( - term_frequency_adjustments=True - ), + ).configure(term_frequency_adjustments=True), + cl.JaroAtThresholds( + ColumnExpression("surname").regex_extract(".*") + ).configure(term_frequency_adjustments=True), cl.DateOfBirthComparison( ColumnExpression("dob").regex_extract(".*"), input_is_string=True, From 99b209abe718fc4f93d99d0efa3f2582c58bb596 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:52:22 +0000 Subject: [PATCH 8/9] update lockfile --- uv.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/uv.lock b/uv.lock index 9d509db..39be467 100644 --- a/uv.lock +++ b/uv.lock @@ -879,7 +879,7 @@ wheels = [ [[package]] name = "splink" -version = "4.0.5" +version = "4.0.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "altair" }, @@ -891,9 +891,9 @@ dependencies = [ { name = "pandas" }, { name = "sqlglot" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9e/51/66dd1871f1ed6edaad43dc1121dd1e59d4ef0c5d3cd993b23b5c751ab94e/splink-4.0.5.tar.gz", hash = "sha256:72dbdaa7a1211733018d01a80b87f3bfecd32216a1693b1c67fe31db9034f356", size = 3654992 } +sdist = { url = "https://files.pythonhosted.org/packages/f0/86/23c722b0742b77c0762a2bbe627227faa7ff812d740c08ac7df20c3f5961/splink-4.0.6.tar.gz", hash = "sha256:1b2c860edb5c7eae3649706c1e8e98247c9d051a3109341951ee883d424bf6ca", size = 3658160 } wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/68/bb9108f4341e41b95d203c9c8f47d7f52a7d6e96348b83dc3ba1f075e91d/splink-4.0.5-py3-none-any.whl", hash = "sha256:0afc28e12fc863030ad1add89dffa54c91a35b50f14fee64ac78bfa43f5d8866", size = 3717815 }, + { url = "https://files.pythonhosted.org/packages/9f/5a/8b28279da9947884fc46bafbeba06f41b9f9a43126e87397668aa2527bf7/splink-4.0.6-py3-none-any.whl", hash = "sha256:697377c5a401368e58ce11e3b8d4b28cfe9452625e81d19994709cad37621830", size = 3721667 }, ] [[package]] @@ -923,7 +923,7 @@ dev = [ requires-dist = [ { name = "chdb", marker = "extra == 'chdb'", specifier = ">=2.0.1" }, { name = "clickhouse-connect", specifier = ">=0.7.0" }, - { name = "splink", specifier = ">=4.0.2" }, + { name = "splink", specifier = ">=4.0.6" }, ] [package.metadata.requires-dev] From 89cff6cbda02e5341d190a1ecf90c3a30f27470b Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:53:02 +0000 Subject: [PATCH 9/9] update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb0c585..216cb4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,11 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Term frequency adjustments are now not limited in Clickhouse server (or `chdb` when `debug_mode` is switched on). +- Term frequency adjustments are now not limited in Clickhouse server (or `chdb` when `debug_mode` is switched on) [#46](https://github.com/ADBond/splinkclickhouse/pull/46). ### Changed -- Dropped support for Splink <= `4.0.5`. +- Dropped support for Splink <= `4.0.5` [#46](https://github.com/ADBond/splinkclickhouse/pull/46). ## [0.3.2] - 2024-10-23