Merge pull request #37 from ADBond/bug/compat/fix-for-splink404
Fix for splink 4.0.4
ADBond authored Oct 14, 2024
2 parents a2226e0 + 8a60e03 commit 9593e3d
Showing 7 changed files with 180 additions and 109 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -10,3 +10,5 @@ tmp*

*.csv
*.json

*.log
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- `ClickhouseAPI` now has a function `.set_union_default_mode()` to allow manually setting the client state necessary for clustering if the session has timed out, e.g. when running interactively [#36](https://github.com/ADBond/splinkclickhouse/pull/36) (usage sketch below this diff).
- Added support for Splink 4.0.4 [#37](https://github.com/ADBond/splinkclickhouse/pull/37).

### Fixed

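As context for the first changelog entry above, a minimal usage sketch of `.set_union_default_mode()`. The `clickhouse_connect` connection details are illustrative assumptions, not part of this commit:

```python
import clickhouse_connect

from splinkclickhouse import ClickhouseAPI

# Connection parameters here are assumed for illustration
client = clickhouse_connect.get_client(host="localhost", port=8123)
db_api = ClickhouseAPI(client)

# If an interactive session has timed out, manually reinstate the
# client state that clustering relies on:
db_api.set_union_default_mode()
```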
3 changes: 3 additions & 0 deletions README.md
@@ -1,3 +1,6 @@
[![pypi](https://img.shields.io/github/v/release/adbond/splinkclickhouse?include_prereleases)](https://pypi.org/project/splinkclickhouse/#history)
[![Downloads](https://static.pepy.tech/badge/splinkclickhouse)](https://pepy.tech/project/splinkclickhouse)

# `splinkclickhouse`

Basic [Clickhouse](https://clickhouse.com/docs/en/intro) support for use as a backend with the data-linkage and deduplication package [Splink](https://moj-analytical-services.github.io/splink/).
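For orientation, a hedged quick-start sketch of the backend in use. The `chdb.dbapi` connection and `ChDBAPI` wiring follow the package's documented pattern as best understood; the input file and comparison choices are illustrative assumptions, while the `Linker` and `predict` calls mirror the tests in this commit:

```python
import chdb.dbapi as dbapi
import pandas as pd
import splink.comparison_library as cl
from splink import Linker, SettingsCreator, block_on

from splinkclickhouse import ChDBAPI

# In-process Clickhouse engine via chdb (assumed wiring)
con = dbapi.connect()
db_api = ChDBAPI(con)

df = pd.read_csv("fake_1000.csv")  # illustrative input file

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
    ],
    blocking_rules_to_generate_predictions=[block_on("dob")],
)

linker = Linker(df, settings, db_api)
df_predictions = linker.inference.predict()
```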
5 changes: 5 additions & 0 deletions splinkclickhouse/chdb/database_api.py
@@ -76,6 +76,11 @@ def _setup_for_execute_sql(self, sql: str, physical_name: str) -> str:
# TODO: very sorry for this
# avoids 'double selection' issue in creating __splink__block_counts
sql = sql.replace(", count_l, count_r,", ",")
# some excessively brittle SQL replacements to handle Clickhouse name resolution
sql = sql.replace(
"SELECT DISTINCT r.representative",
"SELECT DISTINCT r.representative AS representative",
)

sql = f"CREATE TABLE {physical_name} ORDER BY tuple() AS {sql}"
return sql
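To make the intent of the substitution above concrete, a standalone sketch. The query text is an assumed example of the shape Splink emits, not a string from this codebase:

```python
# Inner query as Splink might emit it (assumed shape): the bare
# qualified column can trip Clickhouse's name resolution once the
# query is wrapped in CREATE TABLE ... AS.
sql = "SELECT DISTINCT r.representative FROM __splink__clusters AS r"

patched = sql.replace(
    "SELECT DISTINCT r.representative",
    "SELECT DISTINCT r.representative AS representative",
)

# The wrapped statement now exposes a plainly named output column:
print(f"CREATE TABLE out_table ORDER BY tuple() AS {patched}")
```

The identical substitution appears in the `clickhouse` backend's `database_api.py` below.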
5 changes: 5 additions & 0 deletions splinkclickhouse/clickhouse/database_api.py
@@ -64,6 +64,11 @@ def _setup_for_execute_sql(self, sql: str, physical_name: str) -> str:
# TODO: very sorry for this
# avoids 'double selection' issue in creating __splink__block_counts
sql = sql.replace(", count_l, count_r,", ",")
# some excessively brittle SQL replacements to handle Clickhouse name resolution
sql = sql.replace(
"SELECT DISTINCT r.representative",
"SELECT DISTINCT r.representative AS representative",
)

sql = f"CREATE TABLE {physical_name} ORDER BY tuple() AS {sql}"
return sql
33 changes: 33 additions & 0 deletions tests/test_debug_mode.py
@@ -9,7 +9,40 @@
from splink import Linker, block_on


@mark.parametrize("debug_mode", [False, True])
def test_training(api_info, fake_1000, fake_1000_settings_factory, debug_mode):
db_api = api_info["db_api_factory"]()
df = fake_1000
fake_1000_settings = fake_1000_settings_factory(api_info["version"])
linker = Linker(df, fake_1000_settings, db_api)
db_api.debug_mode = debug_mode

# training
linker.training.estimate_u_using_random_sampling(max_pairs=6e5)
linker.training.estimate_probability_two_random_records_match(
[block_on("dob"), block_on("first_name", "surname")], recall=0.8
)
linker.training.estimate_parameters_using_expectation_maximisation(
block_on("dob"),
)
linker.training.estimate_parameters_using_expectation_maximisation(
block_on("first_name", "surname"),
)


@mark.parametrize("debug_mode", [False, True])
def test_predict(api_info, fake_1000, fake_1000_settings_factory, debug_mode):
db_api = api_info["db_api_factory"]()
df = fake_1000
fake_1000_settings = fake_1000_settings_factory(api_info["version"])
linker = Linker(df, fake_1000_settings, db_api)
db_api.debug_mode = debug_mode

linker.inference.predict()


# all-in-one workflow
@mark.skip("Until upstream clustering+debug fix comes through")
@mark.parametrize("debug_mode", [False, True])
def test_full_basic_run(api_info, fake_1000, fake_1000_settings_factory, debug_mode):
db_api = api_info["db_api_factory"]()