diff --git a/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb b/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb
index 156e5cb99c..8e98844baa 100644
--- a/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb
+++ b/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb
@@ -1,1197 +1,1198 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Evaluation when you have fully labelled data\n",
- "\n",
- "In this example, our data contains a fully-populated ground-truth column called `cluster` that enables us to perform accuracy analysis of the final model\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation when you have fully labelled data\n",
+ "\n",
+ "In this example, our data contains a fully-populated ground-truth column called `cluster`, which enables us to perform accuracy analysis of the final model.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:16.264709Z",
+ "iopub.status.busy": "2024-06-07T09:09:16.264397Z",
+ "iopub.status.idle": "2024-06-07T09:09:16.269613Z",
+ "shell.execute_reply": "2024-06-07T09:09:16.268968Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:16.273849Z",
+ "iopub.status.busy": "2024-06-07T09:09:16.273306Z",
+ "iopub.status.idle": "2024-06-07T09:09:17.467426Z",
+ "shell.execute_reply": "2024-06-07T09:09:17.466787Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " city | \n",
+ " email | \n",
+ " cluster | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " Robert | \n",
+ " Alan | \n",
+ " 1971-06-24 | \n",
+ " NaN | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " Robert | \n",
+ " Allen | \n",
+ " 1971-05-24 | \n",
+ " NaN | \n",
+ " roberta25@smith.net | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " unique_id first_name surname dob city email cluster\n",
+ "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0\n",
+ "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0"
]
- },
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.fake_1000\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:17.501913Z",
+ "iopub.status.busy": "2024-06-07T09:09:17.501641Z",
+ "iopub.status.idle": "2024-06-07T09:09:17.581434Z",
+ "shell.execute_reply": "2024-06-07T09:09:17.580667Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import SettingsCreator, Linker, block_on, DuckDBAPI\n",
+ "\n",
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"dedupe_only\",\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"first_name\"),\n",
+ " block_on(\"surname\"),\n",
+ " block_on(\"dob\"),\n",
+ " block_on(\"email\"),\n",
+ " ],\n",
+ " comparisons=[\n",
+ " cl.ForenameSurnameComparison(\"first_name\", \"surname\"),\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"dob\",\n",
+ " input_is_string=True,\n",
+ " ),\n",
+ " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
+ " cl.EmailComparison(\"email\"),\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:17.585114Z",
+ "iopub.status.busy": "2024-06-07T09:09:17.584837Z",
+ "iopub.status.idle": "2024-06-07T09:09:17.847471Z",
+ "shell.execute_reply": "2024-06-07T09:09:17.846845Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:16.264709Z",
- "iopub.status.busy": "2024-06-07T09:09:16.264397Z",
- "iopub.status.idle": "2024-06-07T09:09:16.269613Z",
- "shell.execute_reply": "2024-06-07T09:09:16.268968Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.00333.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 300.13 are expected to match. With 499,500 total possible comparisons, we expect a total of around 1,664.29 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)\n",
+ "deterministic_rules = [\n",
+ " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
+ " \"l.email = r.email\",\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.7\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:17.850459Z",
+ "iopub.status.busy": "2024-06-07T09:09:17.850216Z",
+ "iopub.status.idle": "2024-06-07T09:09:18.931010Z",
+ "shell.execute_reply": "2024-06-07T09:09:18.930397Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:16.273849Z",
- "iopub.status.busy": "2024-06-07T09:09:16.273306Z",
- "iopub.status.idle": "2024-06-07T09:09:17.467426Z",
- "shell.execute_reply": "2024-06-07T09:09:17.466787Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " city | \n",
- " email | \n",
- " cluster | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " Robert | \n",
- " Alan | \n",
- " 1971-06-24 | \n",
- " NaN | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " Robert | \n",
- " Allen | \n",
- " 1971-05-24 | \n",
- " NaN | \n",
- " roberta25@smith.net | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id first_name surname dob city email cluster\n",
- "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0\n",
- "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink import splink_datasets\n",
- "\n",
- "df = splink_datasets.fake_1000\n",
- "df.head(2)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
+ "----- Estimating u probabilities using random sampling -----\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name_surname (no m values are trained).\n",
+ " - dob (no m values are trained).\n",
+ " - city (no m values are trained).\n",
+ " - email (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:18.934824Z",
+ "iopub.status.busy": "2024-06-07T09:09:18.934551Z",
+ "iopub.status.idle": "2024-06-07T09:09:20.495494Z",
+ "shell.execute_reply": "2024-06-07T09:09:20.494833Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:17.501913Z",
- "iopub.status.busy": "2024-06-07T09:09:17.501641Z",
- "iopub.status.idle": "2024-06-07T09:09:17.581434Z",
- "shell.execute_reply": "2024-06-07T09:09:17.580667Z"
- }
- },
- "outputs": [],
- "source": [
- "from splink import SettingsCreator, Linker, block_on, DuckDBAPI\n",
- "\n",
- "import splink.comparison_library as cl\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"first_name\"),\n",
- " block_on(\"surname\"),\n",
- " block_on(\"dob\"),\n",
- " block_on(\"email\"),\n",
- " ],\n",
- " comparisons=[\n",
- " cl.ForenameSurnameComparison(\"first_name\", \"surname\"),\n",
- " cl.DateOfBirthComparison(\n",
- " \"dob\",\n",
- " input_is_string=True,\n",
- " ),\n",
- " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
- " cl.EmailComparison(\"email\"),\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"dob\" = r.\"dob\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name_surname\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - dob\n",
+ "\n",
+ "WARNING:\n",
+ "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.751 in the m_probability of first_name_surname, level `(Exact match on first_name) AND (Exact match on surname)`\n",
+ "Iteration 2: Largest change in params was 0.196 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.0536 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.0189 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 0.00731 in probability_two_random_records_match\n",
+ "Iteration 6: Largest change in params was 0.0029 in probability_two_random_records_match\n",
+ "Iteration 7: Largest change in params was 0.00116 in probability_two_random_records_match\n",
+ "Iteration 8: Largest change in params was 0.000469 in probability_two_random_records_match\n",
+ "Iteration 9: Largest change in params was 0.000189 in probability_two_random_records_match\n",
+ "Iteration 10: Largest change in params was 7.62e-05 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 10 iterations\n",
+ "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - dob (no m values are trained).\n",
+ " - email (some m values are not trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"email\" = r.\"email\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name_surname\n",
+ " - dob\n",
+ " - city\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - email\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.438 in the m_probability of dob, level `Exact match on dob`\n",
+ "Iteration 2: Largest change in params was 0.122 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.0286 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.01 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 0.00448 in probability_two_random_records_match\n",
+ "Iteration 6: Largest change in params was 0.00237 in probability_two_random_records_match\n",
+ "Iteration 7: Largest change in params was 0.0014 in probability_two_random_records_match\n",
+ "Iteration 8: Largest change in params was 0.000893 in probability_two_random_records_match\n",
+ "Iteration 9: Largest change in params was 0.000597 in probability_two_random_records_match\n",
+ "Iteration 10: Largest change in params was 0.000413 in probability_two_random_records_match\n",
+ "Iteration 11: Largest change in params was 0.000292 in probability_two_random_records_match\n",
+ "Iteration 12: Largest change in params was 0.000211 in probability_two_random_records_match\n",
+ "Iteration 13: Largest change in params was 0.000154 in probability_two_random_records_match\n",
+ "Iteration 14: Largest change in params was 0.000113 in probability_two_random_records_match\n",
+ "Iteration 15: Largest change in params was 8.4e-05 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 15 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - email (some m values are not trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - dob\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - first_name_surname\n",
+ "\n",
+ "WARNING:\n",
+ "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.473 in probability_two_random_records_match\n",
+ "Iteration 2: Largest change in params was 0.0452 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.00766 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.00135 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 0.00025 in probability_two_random_records_match\n",
+ "Iteration 6: Largest change in params was 0.000468 in the m_probability of email, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was 0.00776 in the m_probability of email, level `All other comparisons`\n",
+ "Iteration 8: Largest change in params was 0.00992 in the m_probability of email, level `All other comparisons`\n",
+ "Iteration 9: Largest change in params was 0.00277 in probability_two_random_records_match\n",
+ "Iteration 10: Largest change in params was 0.000972 in probability_two_random_records_match\n",
+ "Iteration 11: Largest change in params was 0.000337 in probability_two_random_records_match\n",
+ "Iteration 12: Largest change in params was 0.000118 in probability_two_random_records_match\n",
+ "Iteration 13: Largest change in params was 4.14e-05 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 13 iterations\n",
+ "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - email (some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"dob\"), estimate_without_term_frequencies=True\n",
+ ")  # blocking on dob: trains every comparison except dob itself\n",
+ "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"email\"), estimate_without_term_frequencies=True\n",
+ ")  # blocking on email: trains every comparison except email, so dob gets covered here\n",
+ "session_name = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"first_name\", \"surname\"), estimate_without_term_frequencies=True\n",
+ ")  # own variable so the dob session handle above is not overwritten"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:20.498372Z",
+ "iopub.status.busy": "2024-06-07T09:09:20.498155Z",
+ "iopub.status.idle": "2024-06-07T09:09:20.768827Z",
+ "shell.execute_reply": "2024-06-07T09:09:20.768326Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:17.585114Z",
- "iopub.status.busy": "2024-06-07T09:09:17.584837Z",
- "iopub.status.idle": "2024-06-07T09:09:17.847471Z",
- "shell.execute_reply": "2024-06-07T09:09:17.846845Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.00333.\n",
- "This means that amongst all possible pairwise record comparisons, one in 300.13 are expected to match. With 499,500 total possible comparisons, we expect a total of around 1,664.29 matching pairs\n"
- ]
- }
- ],
- "source": [
- "db_api = DuckDBAPI()\n",
- "linker = Linker(df, settings, db_api=db_api)\n",
- "deterministic_rules = [\n",
- " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
- " \"l.email = r.email\",\n",
- "]\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(\n",
- " deterministic_rules, recall=0.7\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:17.850459Z",
- "iopub.status.busy": "2024-06-07T09:09:17.850216Z",
- "iopub.status.idle": "2024-06-07T09:09:18.931010Z",
- "shell.execute_reply": "2024-06-07T09:09:18.930397Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
- "----- Estimating u probabilities using random sampling -----\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name_surname (no m values are trained).\n",
- " - dob (no m values are trained).\n",
- " - city (no m values are trained).\n",
- " - email (no m values are trained).\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " truth_threshold | \n",
+ " match_probability | \n",
+ " total_clerical_labels | \n",
+ " p | \n",
+ " n | \n",
+ " tp | \n",
+ " tn | \n",
+ " fp | \n",
+ " fn | \n",
+ " P_rate | \n",
+ " ... | \n",
+ " precision | \n",
+ " recall | \n",
+ " specificity | \n",
+ " npv | \n",
+ " accuracy | \n",
+ " f1 | \n",
+ " f2 | \n",
+ " f0_5 | \n",
+ " p4 | \n",
+ " phi | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " -17.8 | \n",
+ " 0.000004 | \n",
+ " 499500.0 | \n",
+ " 2031.0 | \n",
+ " 497469.0 | \n",
+ " 1650.0 | \n",
+ " 495130.0 | \n",
+ " 2339.0 | \n",
+ " 381.0 | \n",
+ " 0.004066 | \n",
+ " ... | \n",
+ " 0.413638 | \n",
+ " 0.812408 | \n",
+ " 0.995298 | \n",
+ " 0.999231 | \n",
+ " 0.994555 | \n",
+ " 0.548173 | \n",
+ " 0.681086 | \n",
+ " 0.458665 | \n",
+ " 0.707466 | \n",
+ " 0.577474 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " -17.7 | \n",
+ " 0.000005 | \n",
+ " 499500.0 | \n",
+ " 2031.0 | \n",
+ " 497469.0 | \n",
+ " 1650.0 | \n",
+ " 495225.0 | \n",
+ " 2244.0 | \n",
+ " 381.0 | \n",
+ " 0.004066 | \n",
+ " ... | \n",
+ " 0.423729 | \n",
+ " 0.812408 | \n",
+ " 0.995489 | \n",
+ " 0.999231 | \n",
+ " 0.994745 | \n",
+ " 0.556962 | \n",
+ " 0.686470 | \n",
+ " 0.468564 | \n",
+ " 0.714769 | \n",
+ " 0.584558 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " -17.1 | \n",
+ " 0.000007 | \n",
+ " 499500.0 | \n",
+ " 2031.0 | \n",
+ " 497469.0 | \n",
+ " 1650.0 | \n",
+ " 495311.0 | \n",
+ " 2158.0 | \n",
+ " 381.0 | \n",
+ " 0.004066 | \n",
+ " ... | \n",
+ " 0.433298 | \n",
+ " 0.812408 | \n",
+ " 0.995662 | \n",
+ " 0.999231 | \n",
+ " 0.994917 | \n",
+ " 0.565165 | \n",
+ " 0.691418 | \n",
+ " 0.477901 | \n",
+ " 0.721512 | \n",
+ " 0.591197 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " -17.0 | \n",
+ " 0.000008 | \n",
+ " 499500.0 | \n",
+ " 2031.0 | \n",
+ " 497469.0 | \n",
+ " 1650.0 | \n",
+ " 495354.0 | \n",
+ " 2115.0 | \n",
+ " 381.0 | \n",
+ " 0.004066 | \n",
+ " ... | \n",
+ " 0.438247 | \n",
+ " 0.812408 | \n",
+ " 0.995748 | \n",
+ " 0.999231 | \n",
+ " 0.995003 | \n",
+ " 0.569358 | \n",
+ " 0.693919 | \n",
+ " 0.482710 | \n",
+ " 0.724931 | \n",
+ " 0.594601 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " -16.9 | \n",
+ " 0.000008 | \n",
+ " 499500.0 | \n",
+ " 2031.0 | \n",
+ " 497469.0 | \n",
+ " 1650.0 | \n",
+ " 495386.0 | \n",
+ " 2083.0 | \n",
+ " 381.0 | \n",
+ " 0.004066 | \n",
+ " ... | \n",
+ " 0.442004 | \n",
+ " 0.812408 | \n",
+ " 0.995813 | \n",
+ " 0.999231 | \n",
+ " 0.995067 | \n",
+ " 0.572519 | \n",
+ " 0.695792 | \n",
+ " 0.486353 | \n",
+ " 0.727497 | \n",
+ " 0.597173 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 25 columns
\n",
+ "
"
],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)"
+ "text/plain": [
+ " truth_threshold match_probability total_clerical_labels p \\\n",
+ "0 -17.8 0.000004 499500.0 2031.0 \n",
+ "1 -17.7 0.000005 499500.0 2031.0 \n",
+ "2 -17.1 0.000007 499500.0 2031.0 \n",
+ "3 -17.0 0.000008 499500.0 2031.0 \n",
+ "4 -16.9 0.000008 499500.0 2031.0 \n",
+ "\n",
+ " n tp tn fp fn P_rate ... precision \\\n",
+ "0 497469.0 1650.0 495130.0 2339.0 381.0 0.004066 ... 0.413638 \n",
+ "1 497469.0 1650.0 495225.0 2244.0 381.0 0.004066 ... 0.423729 \n",
+ "2 497469.0 1650.0 495311.0 2158.0 381.0 0.004066 ... 0.433298 \n",
+ "3 497469.0 1650.0 495354.0 2115.0 381.0 0.004066 ... 0.438247 \n",
+ "4 497469.0 1650.0 495386.0 2083.0 381.0 0.004066 ... 0.442004 \n",
+ "\n",
+ " recall specificity npv accuracy f1 f2 f0_5 \\\n",
+ "0 0.812408 0.995298 0.999231 0.994555 0.548173 0.681086 0.458665 \n",
+ "1 0.812408 0.995489 0.999231 0.994745 0.556962 0.686470 0.468564 \n",
+ "2 0.812408 0.995662 0.999231 0.994917 0.565165 0.691418 0.477901 \n",
+ "3 0.812408 0.995748 0.999231 0.995003 0.569358 0.693919 0.482710 \n",
+ "4 0.812408 0.995813 0.999231 0.995067 0.572519 0.695792 0.486353 \n",
+ "\n",
+ " p4 phi \n",
+ "0 0.707466 0.577474 \n",
+ "1 0.714769 0.584558 \n",
+ "2 0.721512 0.591197 \n",
+ "3 0.724931 0.594601 \n",
+ "4 0.727497 0.597173 \n",
+ "\n",
+ "[5 rows x 25 columns]"
]
- },
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_column(\n",
+ " \"cluster\", output_type=\"table\"\n",
+ ").as_pandas_dataframe(limit=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:20.771736Z",
+ "iopub.status.busy": "2024-06-07T09:09:20.771453Z",
+ "iopub.status.idle": "2024-06-07T09:09:21.322647Z",
+ "shell.execute_reply": "2024-06-07T09:09:21.322088Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:18.934824Z",
- "iopub.status.busy": "2024-06-07T09:09:18.934551Z",
- "iopub.status.idle": "2024-06-07T09:09:20.495494Z",
- "shell.execute_reply": "2024-06-07T09:09:20.494833Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"dob\" = r.\"dob\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name_surname\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - dob\n",
- "\n",
- "WARNING:\n",
- "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was -0.751 in the m_probability of first_name_surname, level `(Exact match on first_name) AND (Exact match on surname)`\n",
- "Iteration 2: Largest change in params was 0.196 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was 0.0536 in probability_two_random_records_match\n",
- "Iteration 4: Largest change in params was 0.0189 in probability_two_random_records_match\n",
- "Iteration 5: Largest change in params was 0.00731 in probability_two_random_records_match\n",
- "Iteration 6: Largest change in params was 0.0029 in probability_two_random_records_match\n",
- "Iteration 7: Largest change in params was 0.00116 in probability_two_random_records_match\n",
- "Iteration 8: Largest change in params was 0.000469 in probability_two_random_records_match\n",
- "Iteration 9: Largest change in params was 0.000189 in probability_two_random_records_match\n",
- "Iteration 10: Largest change in params was 7.62e-05 in probability_two_random_records_match\n",
- "\n",
- "EM converged after 10 iterations\n",
- "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - dob (no m values are trained).\n",
- " - email (some m values are not trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"email\" = r.\"email\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name_surname\n",
- " - dob\n",
- " - city\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - email\n",
- "\n",
- "Iteration 1: Largest change in params was -0.438 in the m_probability of dob, level `Exact match on dob`\n",
- "Iteration 2: Largest change in params was 0.122 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was 0.0286 in probability_two_random_records_match\n",
- "Iteration 4: Largest change in params was 0.01 in probability_two_random_records_match\n",
- "Iteration 5: Largest change in params was 0.00448 in probability_two_random_records_match\n",
- "Iteration 6: Largest change in params was 0.00237 in probability_two_random_records_match\n",
- "Iteration 7: Largest change in params was 0.0014 in probability_two_random_records_match\n",
- "Iteration 8: Largest change in params was 0.000893 in probability_two_random_records_match\n",
- "Iteration 9: Largest change in params was 0.000597 in probability_two_random_records_match\n",
- "Iteration 10: Largest change in params was 0.000413 in probability_two_random_records_match\n",
- "Iteration 11: Largest change in params was 0.000292 in probability_two_random_records_match\n",
- "Iteration 12: Largest change in params was 0.000211 in probability_two_random_records_match\n",
- "Iteration 13: Largest change in params was 0.000154 in probability_two_random_records_match\n",
- "Iteration 14: Largest change in params was 0.000113 in probability_two_random_records_match\n",
- "Iteration 15: Largest change in params was 8.4e-05 in probability_two_random_records_match\n",
- "\n",
- "EM converged after 15 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - email (some m values are not trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - dob\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - first_name_surname\n",
- "\n",
- "WARNING:\n",
- "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was 0.473 in probability_two_random_records_match\n",
- "Iteration 2: Largest change in params was 0.0452 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was 0.00766 in probability_two_random_records_match\n",
- "Iteration 4: Largest change in params was 0.00135 in probability_two_random_records_match\n",
- "Iteration 5: Largest change in params was 0.00025 in probability_two_random_records_match\n",
- "Iteration 6: Largest change in params was 0.000468 in the m_probability of email, level `All other comparisons`\n",
- "Iteration 7: Largest change in params was 0.00776 in the m_probability of email, level `All other comparisons`\n",
- "Iteration 8: Largest change in params was 0.00992 in the m_probability of email, level `All other comparisons`\n",
- "Iteration 9: Largest change in params was 0.00277 in probability_two_random_records_match\n",
- "Iteration 10: Largest change in params was 0.000972 in probability_two_random_records_match\n",
- "Iteration 11: Largest change in params was 0.000337 in probability_two_random_records_match\n",
- "Iteration 12: Largest change in params was 0.000118 in probability_two_random_records_match\n",
- "Iteration 13: Largest change in params was 4.14e-05 in probability_two_random_records_match\n",
- "\n",
- "EM converged after 13 iterations\n",
- "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - email (some m values are not trained).\n"
- ]
- }
- ],
- "source": [
- "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"dob\"), estimate_without_term_frequencies=True\n",
- ")\n",
- "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"email\"), estimate_without_term_frequencies=True\n",
- ")\n",
- "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"first_name\", \"surname\"), estimate_without_term_frequencies=True\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:20.498372Z",
- "iopub.status.busy": "2024-06-07T09:09:20.498155Z",
- "iopub.status.idle": "2024-06-07T09:09:20.768827Z",
- "shell.execute_reply": "2024-06-07T09:09:20.768326Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " truth_threshold | \n",
- " match_probability | \n",
- " total_clerical_labels | \n",
- " p | \n",
- " n | \n",
- " tp | \n",
- " tn | \n",
- " fp | \n",
- " fn | \n",
- " P_rate | \n",
- " ... | \n",
- " precision | \n",
- " recall | \n",
- " specificity | \n",
- " npv | \n",
- " accuracy | \n",
- " f1 | \n",
- " f2 | \n",
- " f0_5 | \n",
- " p4 | \n",
- " phi | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " -17.8 | \n",
- " 0.000004 | \n",
- " 499500.0 | \n",
- " 2031.0 | \n",
- " 497469.0 | \n",
- " 1650.0 | \n",
- " 495130.0 | \n",
- " 2339.0 | \n",
- " 381.0 | \n",
- " 0.004066 | \n",
- " ... | \n",
- " 0.413638 | \n",
- " 0.812408 | \n",
- " 0.995298 | \n",
- " 0.999231 | \n",
- " 0.994555 | \n",
- " 0.548173 | \n",
- " 0.681086 | \n",
- " 0.458665 | \n",
- " 0.707466 | \n",
- " 0.577474 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " -17.7 | \n",
- " 0.000005 | \n",
- " 499500.0 | \n",
- " 2031.0 | \n",
- " 497469.0 | \n",
- " 1650.0 | \n",
- " 495225.0 | \n",
- " 2244.0 | \n",
- " 381.0 | \n",
- " 0.004066 | \n",
- " ... | \n",
- " 0.423729 | \n",
- " 0.812408 | \n",
- " 0.995489 | \n",
- " 0.999231 | \n",
- " 0.994745 | \n",
- " 0.556962 | \n",
- " 0.686470 | \n",
- " 0.468564 | \n",
- " 0.714769 | \n",
- " 0.584558 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " -17.1 | \n",
- " 0.000007 | \n",
- " 499500.0 | \n",
- " 2031.0 | \n",
- " 497469.0 | \n",
- " 1650.0 | \n",
- " 495311.0 | \n",
- " 2158.0 | \n",
- " 381.0 | \n",
- " 0.004066 | \n",
- " ... | \n",
- " 0.433298 | \n",
- " 0.812408 | \n",
- " 0.995662 | \n",
- " 0.999231 | \n",
- " 0.994917 | \n",
- " 0.565165 | \n",
- " 0.691418 | \n",
- " 0.477901 | \n",
- " 0.721512 | \n",
- " 0.591197 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " -17.0 | \n",
- " 0.000008 | \n",
- " 499500.0 | \n",
- " 2031.0 | \n",
- " 497469.0 | \n",
- " 1650.0 | \n",
- " 495354.0 | \n",
- " 2115.0 | \n",
- " 381.0 | \n",
- " 0.004066 | \n",
- " ... | \n",
- " 0.438247 | \n",
- " 0.812408 | \n",
- " 0.995748 | \n",
- " 0.999231 | \n",
- " 0.995003 | \n",
- " 0.569358 | \n",
- " 0.693919 | \n",
- " 0.482710 | \n",
- " 0.724931 | \n",
- " 0.594601 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " -16.9 | \n",
- " 0.000008 | \n",
- " 499500.0 | \n",
- " 2031.0 | \n",
- " 497469.0 | \n",
- " 1650.0 | \n",
- " 495386.0 | \n",
- " 2083.0 | \n",
- " 381.0 | \n",
- " 0.004066 | \n",
- " ... | \n",
- " 0.442004 | \n",
- " 0.812408 | \n",
- " 0.995813 | \n",
- " 0.999231 | \n",
- " 0.995067 | \n",
- " 0.572519 | \n",
- " 0.695792 | \n",
- " 0.486353 | \n",
- " 0.727497 | \n",
- " 0.597173 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 25 columns
\n",
- "
"
- ],
- "text/plain": [
- " truth_threshold match_probability total_clerical_labels p \\\n",
- "0 -17.8 0.000004 499500.0 2031.0 \n",
- "1 -17.7 0.000005 499500.0 2031.0 \n",
- "2 -17.1 0.000007 499500.0 2031.0 \n",
- "3 -17.0 0.000008 499500.0 2031.0 \n",
- "4 -16.9 0.000008 499500.0 2031.0 \n",
- "\n",
- " n tp tn fp fn P_rate ... precision \\\n",
- "0 497469.0 1650.0 495130.0 2339.0 381.0 0.004066 ... 0.413638 \n",
- "1 497469.0 1650.0 495225.0 2244.0 381.0 0.004066 ... 0.423729 \n",
- "2 497469.0 1650.0 495311.0 2158.0 381.0 0.004066 ... 0.433298 \n",
- "3 497469.0 1650.0 495354.0 2115.0 381.0 0.004066 ... 0.438247 \n",
- "4 497469.0 1650.0 495386.0 2083.0 381.0 0.004066 ... 0.442004 \n",
- "\n",
- " recall specificity npv accuracy f1 f2 f0_5 \\\n",
- "0 0.812408 0.995298 0.999231 0.994555 0.548173 0.681086 0.458665 \n",
- "1 0.812408 0.995489 0.999231 0.994745 0.556962 0.686470 0.468564 \n",
- "2 0.812408 0.995662 0.999231 0.994917 0.565165 0.691418 0.477901 \n",
- "3 0.812408 0.995748 0.999231 0.995003 0.569358 0.693919 0.482710 \n",
- "4 0.812408 0.995813 0.999231 0.995067 0.572519 0.695792 0.486353 \n",
- "\n",
- " p4 phi \n",
- "0 0.707466 0.577474 \n",
- "1 0.714769 0.584558 \n",
- "2 0.721512 0.591197 \n",
- "3 0.724931 0.594601 \n",
- "4 0.727497 0.597173 \n",
- "\n",
- "[5 rows x 25 columns]"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_column(\n",
- " \"cluster\", output_type=\"table\"\n",
- ").as_pandas_dataframe(limit=5)"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_column(\"cluster\", output_type=\"roc\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:21.327370Z",
+ "iopub.status.busy": "2024-06-07T09:09:21.327111Z",
+ "iopub.status.idle": "2024-06-07T09:09:22.635682Z",
+ "shell.execute_reply": "2024-06-07T09:09:22.635098Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:20.771736Z",
- "iopub.status.busy": "2024-06-07T09:09:20.771453Z",
- "iopub.status.idle": "2024-06-07T09:09:21.322647Z",
- "shell.execute_reply": "2024-06-07T09:09:21.322088Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_column(\"cluster\", output_type=\"roc\")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:21.327370Z",
- "iopub.status.busy": "2024-06-07T09:09:21.327111Z",
- "iopub.status.idle": "2024-06-07T09:09:22.635682Z",
- "shell.execute_reply": "2024-06-07T09:09:22.635098Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_column(\n",
- " \"cluster\",\n",
- " output_type=\"threshold_selection\",\n",
- " threshold_match_probability=0.5,\n",
- " add_metrics=[\"f1\"],\n",
- ")"
+ "text/plain": [
+ "alt.HConcatChart(...)"
]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_column(\n",
+ " \"cluster\",\n",
+ " output_type=\"threshold_selection\",\n",
+ " threshold_match_probability=0.5,\n",
+ " add_metrics=[\"f1\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:22.638822Z",
+ "iopub.status.busy": "2024-06-07T09:09:22.638569Z",
+ "iopub.status.idle": "2024-06-07T09:09:22.853941Z",
+ "shell.execute_reply": "2024-06-07T09:09:22.853250Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:22.638822Z",
- "iopub.status.busy": "2024-06-07T09:09:22.638569Z",
- "iopub.status.idle": "2024-06-07T09:09:22.853941Z",
- "shell.execute_reply": "2024-06-07T09:09:22.853250Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " clerical_match_score | \n",
- " found_by_blocking_rules | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " surname_l | \n",
- " surname_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " ... | \n",
- " email_l | \n",
- " email_r | \n",
- " gamma_email | \n",
- " tf_email_l | \n",
- " tf_email_r | \n",
- " bf_email | \n",
- " bf_tf_adj_email | \n",
- " cluster_l | \n",
- " cluster_r | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " False | \n",
- " -15.568945 | \n",
- " 0.000021 | \n",
- " 452 | \n",
- " 454 | \n",
- " Daves | \n",
- " Reuben | \n",
- " None | \n",
- " Davies | \n",
- " ... | \n",
- " rd@lewis.com | \n",
- " idlewrs.cocm | \n",
- " 0 | \n",
- " 0.003802 | \n",
- " 0.001267 | \n",
- " 0.01099 | \n",
- " 1.0 | \n",
- " 115 | \n",
- " 115 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " False | \n",
- " -14.884057 | \n",
- " 0.000033 | \n",
- " 715 | \n",
- " 717 | \n",
- " Joes | \n",
- " Jones | \n",
- " None | \n",
- " Mia | \n",
- " ... | \n",
- " None | \n",
- " mia.j63@martinez.biz | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.005070 | \n",
- " 1.00000 | \n",
- " 1.0 | \n",
- " 182 | \n",
- " 182 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " False | \n",
- " -14.884057 | \n",
- " 0.000033 | \n",
- " 626 | \n",
- " 628 | \n",
- " Davidson | \n",
- " None | \n",
- " geeorGe | \n",
- " Geeorge | \n",
- " ... | \n",
- " None | \n",
- " gdavidson@johnson-brown.com | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.005070 | \n",
- " 1.00000 | \n",
- " 1.0 | \n",
- " 158 | \n",
- " 158 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " False | \n",
- " -13.761589 | \n",
- " 0.000072 | \n",
- " 983 | \n",
- " 984 | \n",
- " Milller | \n",
- " Miller | \n",
- " Jessica | \n",
- " aessicJ | \n",
- " ... | \n",
- " None | \n",
- " jessica.miller@johnson.com | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.007605 | \n",
- " 1.00000 | \n",
- " 1.0 | \n",
- " 246 | \n",
- " 246 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " True | \n",
- " -11.637585 | \n",
- " 0.000314 | \n",
- " 594 | \n",
- " 595 | \n",
- " Kik | \n",
- " Kiirk | \n",
- " Grace | \n",
- " Grace | \n",
- " ... | \n",
- " gk@frey-robinson.org | \n",
- " rgk@frey-robinon.org | \n",
- " 0 | \n",
- " 0.001267 | \n",
- " 0.001267 | \n",
- " 0.01099 | \n",
- " 1.0 | \n",
- " 146 | \n",
- " 146 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 38 columns
\n",
- "
"
- ],
- "text/plain": [
- " clerical_match_score found_by_blocking_rules match_weight \\\n",
- "0 1.0 False -15.568945 \n",
- "1 1.0 False -14.884057 \n",
- "2 1.0 False -14.884057 \n",
- "3 1.0 False -13.761589 \n",
- "4 1.0 True -11.637585 \n",
- "\n",
- " match_probability unique_id_l unique_id_r surname_l surname_r \\\n",
- "0 0.000021 452 454 Daves Reuben \n",
- "1 0.000033 715 717 Joes Jones \n",
- "2 0.000033 626 628 Davidson None \n",
- "3 0.000072 983 984 Milller Miller \n",
- "4 0.000314 594 595 Kik Kiirk \n",
- "\n",
- " first_name_l first_name_r ... email_l \\\n",
- "0 None Davies ... rd@lewis.com \n",
- "1 None Mia ... None \n",
- "2 geeorGe Geeorge ... None \n",
- "3 Jessica aessicJ ... None \n",
- "4 Grace Grace ... gk@frey-robinson.org \n",
- "\n",
- " email_r gamma_email tf_email_l tf_email_r bf_email \\\n",
- "0 idlewrs.cocm 0 0.003802 0.001267 0.01099 \n",
- "1 mia.j63@martinez.biz -1 NaN 0.005070 1.00000 \n",
- "2 gdavidson@johnson-brown.com -1 NaN 0.005070 1.00000 \n",
- "3 jessica.miller@johnson.com -1 NaN 0.007605 1.00000 \n",
- "4 rgk@frey-robinon.org 0 0.001267 0.001267 0.01099 \n",
- "\n",
- " bf_tf_adj_email cluster_l cluster_r match_key \n",
- "0 1.0 115 115 4 \n",
- "1 1.0 182 182 4 \n",
- "2 1.0 158 158 4 \n",
- "3 1.0 246 246 4 \n",
- "4 1.0 146 146 0 \n",
- "\n",
- "[5 rows x 38 columns]"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " clerical_match_score | \n",
+ " found_by_blocking_rules | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " surname_l | \n",
+ " surname_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " ... | \n",
+ " email_l | \n",
+ " email_r | \n",
+ " gamma_email | \n",
+ " tf_email_l | \n",
+ " tf_email_r | \n",
+ " bf_email | \n",
+ " bf_tf_adj_email | \n",
+ " cluster_l | \n",
+ " cluster_r | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -15.568945 | \n",
+ " 0.000021 | \n",
+ " 452 | \n",
+ " 454 | \n",
+ " Daves | \n",
+ " Reuben | \n",
+ " None | \n",
+ " Davies | \n",
+ " ... | \n",
+ " rd@lewis.com | \n",
+ " idlewrs.cocm | \n",
+ " 0 | \n",
+ " 0.003802 | \n",
+ " 0.001267 | \n",
+ " 0.01099 | \n",
+ " 1.0 | \n",
+ " 115 | \n",
+ " 115 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -14.884057 | \n",
+ " 0.000033 | \n",
+ " 715 | \n",
+ " 717 | \n",
+ " Joes | \n",
+ " Jones | \n",
+ " None | \n",
+ " Mia | \n",
+ " ... | \n",
+ " None | \n",
+ " mia.j63@martinez.biz | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.005070 | \n",
+ " 1.00000 | \n",
+ " 1.0 | \n",
+ " 182 | \n",
+ " 182 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -14.884057 | \n",
+ " 0.000033 | \n",
+ " 626 | \n",
+ " 628 | \n",
+ " Davidson | \n",
+ " None | \n",
+ " geeorGe | \n",
+ " Geeorge | \n",
+ " ... | \n",
+ " None | \n",
+ " gdavidson@johnson-brown.com | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.005070 | \n",
+ " 1.00000 | \n",
+ " 1.0 | \n",
+ " 158 | \n",
+ " 158 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -13.761589 | \n",
+ " 0.000072 | \n",
+ " 983 | \n",
+ " 984 | \n",
+ " Milller | \n",
+ " Miller | \n",
+ " Jessica | \n",
+ " aessicJ | \n",
+ " ... | \n",
+ " None | \n",
+ " jessica.miller@johnson.com | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.007605 | \n",
+ " 1.00000 | \n",
+ " 1.0 | \n",
+ " 246 | \n",
+ " 246 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1.0 | \n",
+ " True | \n",
+ " -11.637585 | \n",
+ " 0.000314 | \n",
+ " 594 | \n",
+ " 595 | \n",
+ " Kik | \n",
+ " Kiirk | \n",
+ " Grace | \n",
+ " Grace | \n",
+ " ... | \n",
+ " gk@frey-robinson.org | \n",
+ " rgk@frey-robinon.org | \n",
+ " 0 | \n",
+ " 0.001267 | \n",
+ " 0.001267 | \n",
+ " 0.01099 | \n",
+ " 1.0 | \n",
+ " 146 | \n",
+ " 146 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 38 columns
\n",
+ "
"
],
- "source": [
- "# Plot some false positives\n",
- "linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\", include_false_negatives=True, include_false_positives=True\n",
- ").as_pandas_dataframe(limit=5)"
+ "text/plain": [
+ " clerical_match_score found_by_blocking_rules match_weight \\\n",
+ "0 1.0 False -15.568945 \n",
+ "1 1.0 False -14.884057 \n",
+ "2 1.0 False -14.884057 \n",
+ "3 1.0 False -13.761589 \n",
+ "4 1.0 True -11.637585 \n",
+ "\n",
+ " match_probability unique_id_l unique_id_r surname_l surname_r \\\n",
+ "0 0.000021 452 454 Daves Reuben \n",
+ "1 0.000033 715 717 Joes Jones \n",
+ "2 0.000033 626 628 Davidson None \n",
+ "3 0.000072 983 984 Milller Miller \n",
+ "4 0.000314 594 595 Kik Kiirk \n",
+ "\n",
+ " first_name_l first_name_r ... email_l \\\n",
+ "0 None Davies ... rd@lewis.com \n",
+ "1 None Mia ... None \n",
+ "2 geeorGe Geeorge ... None \n",
+ "3 Jessica aessicJ ... None \n",
+ "4 Grace Grace ... gk@frey-robinson.org \n",
+ "\n",
+ " email_r gamma_email tf_email_l tf_email_r bf_email \\\n",
+ "0 idlewrs.cocm 0 0.003802 0.001267 0.01099 \n",
+ "1 mia.j63@martinez.biz -1 NaN 0.005070 1.00000 \n",
+ "2 gdavidson@johnson-brown.com -1 NaN 0.005070 1.00000 \n",
+ "3 jessica.miller@johnson.com -1 NaN 0.007605 1.00000 \n",
+ "4 rgk@frey-robinon.org 0 0.001267 0.001267 0.01099 \n",
+ "\n",
+ " bf_tf_adj_email cluster_l cluster_r match_key \n",
+ "0 1.0 115 115 4 \n",
+ "1 1.0 182 182 4 \n",
+ "2 1.0 158 158 4 \n",
+ "3 1.0 246 246 4 \n",
+ "4 1.0 146 146 0 \n",
+ "\n",
+ "[5 rows x 38 columns]"
]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show some prediction errors (false negatives and false positives)\n",
+ "linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\", include_false_negatives=True, include_false_positives=True\n",
+ ").as_pandas_dataframe(limit=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:09:22.857193Z",
+ "iopub.status.busy": "2024-06-07T09:09:22.856931Z",
+ "iopub.status.idle": "2024-06-07T09:09:23.602967Z",
+ "shell.execute_reply": "2024-06-07T09:09:23.602410Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:09:22.857193Z",
- "iopub.status.busy": "2024-06-07T09:09:22.856931Z",
- "iopub.status.idle": "2024-06-07T09:09:23.602967Z",
- "shell.execute_reply": "2024-06-07T09:09:23.602410Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "records = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\", include_false_negatives=True, include_false_positives=True\n",
- ").as_record_dict(limit=5)\n",
- "\n",
- "linker.visualisations.waterfall_chart(records)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "records = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\", include_false_negatives=True, include_false_positives=True\n",
+ ").as_record_dict(limit=5)\n",
+ "\n",
+ "linker.visualisations.waterfall_chart(records)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb
index 5c0fb59e15..4678898c53 100644
--- a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb
+++ b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb
@@ -1,1732 +1,1733 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Linking a dataset of real historical persons\n",
- "\n",
- "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:39.966015Z",
- "iopub.status.busy": "2024-07-23T15:42:39.965625Z",
- "iopub.status.idle": "2024-07-23T15:42:39.986378Z",
- "shell.execute_reply": "2024-07-23T15:42:39.985434Z"
- },
- "tags": [
- "hide_input",
- "hide_output"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linking a dataset of real historical persons\n",
+ "\n",
+ "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from Wikidata. Duplicate records are introduced with a variety of errors.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:39.966015Z",
+ "iopub.status.busy": "2024-07-23T15:42:39.965625Z",
+ "iopub.status.idle": "2024-07-23T15:42:39.986378Z",
+ "shell.execute_reply": "2024-07-23T15:42:39.985434Z"
},
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:39.991114Z",
- "iopub.status.busy": "2024-07-23T15:42:39.990758Z",
- "iopub.status.idle": "2024-07-23T15:42:42.012126Z",
- "shell.execute_reply": "2024-07-23T15:42:42.011207Z"
- },
- "tags": [
- "hide_output"
- ]
- },
- "outputs": [],
- "source": [
- "from splink import splink_datasets\n",
- "\n",
- "df = splink_datasets.historical_50k"
- ]
+ "tags": [
+ "hide_input",
+ "hide_output"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:39.991114Z",
+ "iopub.status.busy": "2024-07-23T15:42:39.990758Z",
+ "iopub.status.idle": "2024-07-23T15:42:42.012126Z",
+ "shell.execute_reply": "2024-07-23T15:42:42.011207Z"
},
+ "tags": [
+ "hide_output"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "from splink import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.historical_50k"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:42.017509Z",
+ "iopub.status.busy": "2024-07-23T15:42:42.016425Z",
+ "iopub.status.idle": "2024-07-23T15:42:42.048882Z",
+ "shell.execute_reply": "2024-07-23T15:42:42.047960Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:42.017509Z",
- "iopub.status.busy": "2024-07-23T15:42:42.016425Z",
- "iopub.status.idle": "2024-07-23T15:42:42.048882Z",
- "shell.execute_reply": "2024-07-23T15:42:42.047960Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id | \n",
- " cluster | \n",
- " full_name | \n",
- " first_and_surname | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " birth_place | \n",
- " postcode_fake | \n",
- " gender | \n",
- " occupation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Q2296770-1 | \n",
- " Q2296770 | \n",
- " thomas clifford, 1st baron clifford of chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Q2296770-2 | \n",
- " Q2296770 | \n",
- " thomas of chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Q2296770-3 | \n",
- " Q2296770 | \n",
- " tom 1st baron clifford of chudleigh | \n",
- " tom chudleigh | \n",
- " tom | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Q2296770-4 | \n",
- " Q2296770 | \n",
- " thomas 1st chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8hu | \n",
- " None | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Q2296770-5 | \n",
- " Q2296770 | \n",
- " thomas clifford, 1st baron chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " None | \n",
- " politician | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id cluster full_name \\\n",
- "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n",
- "1 Q2296770-2 Q2296770 thomas of chudleigh \n",
- "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n",
- "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n",
- "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n",
- "\n",
- " first_and_surname first_name surname dob birth_place \\\n",
- "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "2 tom chudleigh tom chudleigh 1630-08-01 devon \n",
- "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "\n",
- " postcode_fake gender occupation \n",
- "0 tq13 8df male politician \n",
- "1 tq13 8df male politician \n",
- "2 tq13 8df male politician \n",
- "3 tq13 8hu None politician \n",
- "4 tq13 8df None politician "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id | \n",
+ " cluster | \n",
+ " full_name | \n",
+ " first_and_surname | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " birth_place | \n",
+ " postcode_fake | \n",
+ " gender | \n",
+ " occupation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Q2296770-1 | \n",
+ " Q2296770 | \n",
+ " thomas clifford, 1st baron clifford of chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Q2296770-2 | \n",
+ " Q2296770 | \n",
+ " thomas of chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Q2296770-3 | \n",
+ " Q2296770 | \n",
+ " tom 1st baron clifford of chudleigh | \n",
+ " tom chudleigh | \n",
+ " tom | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Q2296770-4 | \n",
+ " Q2296770 | \n",
+ " thomas 1st chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8hu | \n",
+ " None | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Q2296770-5 | \n",
+ " Q2296770 | \n",
+ " thomas clifford, 1st baron chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " None | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "df.head()"
+ "text/plain": [
+ " unique_id cluster full_name \\\n",
+ "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n",
+ "1 Q2296770-2 Q2296770 thomas of chudleigh \n",
+ "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n",
+ "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n",
+ "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n",
+ "\n",
+ " first_and_surname first_name surname dob birth_place \\\n",
+ "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "2 tom chudleigh tom chudleigh 1630-08-01 devon \n",
+ "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "\n",
+ " postcode_fake gender occupation \n",
+ "0 tq13 8df male politician \n",
+ "1 tq13 8df male politician \n",
+ "2 tq13 8df male politician \n",
+ "3 tq13 8hu None politician \n",
+ "4 tq13 8df None politician "
]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:42.098361Z",
+ "iopub.status.busy": "2024-07-23T15:42:42.097959Z",
+ "iopub.status.idle": "2024-07-23T15:42:43.350760Z",
+ "shell.execute_reply": "2024-07-23T15:42:43.350171Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:42.098361Z",
- "iopub.status.busy": "2024-07-23T15:42:42.097959Z",
- "iopub.status.idle": "2024-07-23T15:42:43.350760Z",
- "shell.execute_reply": "2024-07-23T15:42:43.350171Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import DuckDBAPI\n",
- "from splink.exploratory import profile_columns\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "profile_columns(df, db_api, column_expressions=[\"first_name\", \"substr(surname,1,2)\"])"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "from splink import DuckDBAPI\n",
+    "from splink.exploratory import profile_columns\n",
+    "\n",
+    "# Register the pandas dataframe with the DuckDB backend; the registered\n",
+    "# table (df_sdf) is what the exploratory functions are given.\n",
+    "db_api = DuckDBAPI()\n",
+    "df_sdf = db_api.register(df)\n",
+    "\n",
+    "columns_to_profile = [\"first_name\", \"substr(surname,1,2)\"]\n",
+    "profile_columns(df_sdf, column_expressions=columns_to_profile)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:43.354847Z",
+ "iopub.status.busy": "2024-07-23T15:42:43.354580Z",
+ "iopub.status.idle": "2024-07-23T15:42:44.443241Z",
+ "shell.execute_reply": "2024-07-23T15:42:44.442459Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:43.354847Z",
- "iopub.status.busy": "2024-07-23T15:42:43.354580Z",
- "iopub.status.idle": "2024-07-23T15:42:44.443241Z",
- "shell.execute_reply": "2024-07-23T15:42:44.442459Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import DuckDBAPI, block_on\n",
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "blocking_rules = [\n",
- " block_on(\"substr(first_name,1,3)\", \"substr(surname,1,4)\"),\n",
- " block_on(\"surname\", \"dob\"),\n",
- " block_on(\"first_name\", \"dob\"),\n",
- " block_on(\"postcode_fake\", \"first_name\"),\n",
- " block_on(\"postcode_fake\", \"surname\"),\n",
- " block_on(\"dob\", \"birth_place\"),\n",
- " block_on(\"substr(postcode_fake,1,3)\", \"dob\"),\n",
- " block_on(\"substr(postcode_fake,1,3)\", \"first_name\"),\n",
- " block_on(\"substr(postcode_fake,1,3)\", \"surname\"),\n",
- " block_on(\"substr(first_name,1,2)\", \"substr(surname,1,2)\", \"substr(dob,1,4)\"),\n",
- "]\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=df,\n",
- " blocking_rules=blocking_rules,\n",
- " db_api=db_api,\n",
- " link_type=\"dedupe_only\",\n",
- ")"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "from splink import block_on\n",
+    "from splink.blocking_analysis import (\n",
+    "    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+    ")\n",
+    "\n",
+    "blocking_rules = [\n",
+    "    block_on(\"substr(first_name,1,3)\", \"substr(surname,1,4)\"),\n",
+    "    block_on(\"surname\", \"dob\"),\n",
+    "    block_on(\"first_name\", \"dob\"),\n",
+    "    block_on(\"postcode_fake\", \"first_name\"),\n",
+    "    block_on(\"postcode_fake\", \"surname\"),\n",
+    "    block_on(\"dob\", \"birth_place\"),\n",
+    "    block_on(\"substr(postcode_fake,1,3)\", \"dob\"),\n",
+    "    block_on(\"substr(postcode_fake,1,3)\", \"first_name\"),\n",
+    "    block_on(\"substr(postcode_fake,1,3)\", \"surname\"),\n",
+    "    block_on(\"substr(first_name,1,2)\", \"substr(surname,1,2)\", \"substr(dob,1,4)\"),\n",
+    "]\n",
+    "\n",
+    "# df_sdf is the table registered with db_api in the previous cell, so no\n",
+    "# separate db_api argument is passed here.\n",
+    "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+    "    df_sdf,\n",
+    "    blocking_rules=blocking_rules,\n",
+    "    link_type=\"dedupe_only\",\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:44.447462Z",
+ "iopub.status.busy": "2024-07-23T15:42:44.446856Z",
+ "iopub.status.idle": "2024-07-23T15:42:44.676523Z",
+ "shell.execute_reply": "2024-07-23T15:42:44.675666Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+    "import splink.comparison_library as cl\n",
+    "from splink import Linker, SettingsCreator\n",
+    "\n",
+    "# The name comparison below refers to a concatenated first+surname column\n",
+    "# (needed to apply term frequencies to that comparison), so create it first.\n",
+    "df[\"first_name_surname_concat\"] = df[\"first_name\"] + \" \" + df[\"surname\"]\n",
+    "\n",
+    "name_comparison = cl.ForenameSurnameComparison(\n",
+    "    \"first_name\",\n",
+    "    \"surname\",\n",
+    "    forename_surname_concat_col_name=\"first_name_surname_concat\",\n",
+    ")\n",
+    "\n",
+    "settings = SettingsCreator(\n",
+    "    link_type=\"dedupe_only\",\n",
+    "    blocking_rules_to_generate_predictions=blocking_rules,\n",
+    "    comparisons=[\n",
+    "        name_comparison,\n",
+    "        cl.DateOfBirthComparison(\"dob\", input_is_string=True),\n",
+    "        cl.PostcodeComparison(\"postcode_fake\"),\n",
+    "        cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n",
+    "        cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n",
+    "    ],\n",
+    "    retain_intermediate_calculation_columns=True,\n",
+    ")\n",
+    "\n",
+    "# Register again now that df carries the extra column, and hand that\n",
+    "# registered table to the linker.\n",
+    "df_sdf = db_api.register(df)\n",
+    "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:44.680176Z",
+ "iopub.status.busy": "2024-07-23T15:42:44.679929Z",
+ "iopub.status.idle": "2024-07-23T15:42:45.051292Z",
+ "shell.execute_reply": "2024-07-23T15:42:45.050482Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:44.447462Z",
- "iopub.status.busy": "2024-07-23T15:42:44.446856Z",
- "iopub.status.idle": "2024-07-23T15:42:44.676523Z",
- "shell.execute_reply": "2024-07-23T15:42:44.675666Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_library as cl\n",
- "\n",
- "from splink import Linker, SettingsCreator\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " blocking_rules_to_generate_predictions=blocking_rules,\n",
- " comparisons=[\n",
- " cl.ForenameSurnameComparison(\n",
- " \"first_name\",\n",
- " \"surname\",\n",
- " forename_surname_concat_col_name=\"first_name_surname_concat\",\n",
- " ),\n",
- " cl.DateOfBirthComparison(\n",
- " \"dob\", input_is_string=True\n",
- " ),\n",
- " cl.PostcodeComparison(\"postcode_fake\"),\n",
- " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n",
- " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")\n",
- "# Needed to apply term frequencies to first+surname comparison\n",
- "df[\"first_name_surname_concat\"] = df[\"first_name\"] + \" \" + df[\"surname\"]\n",
- "linker = Linker(df, settings, db_api=db_api)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.000136.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 7,362.31 are expected to match. With 1,279,041,753 total possible comparisons, we expect a total of around 173,728.33 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Rules passed to the prior estimate, together with the recall we assume\n",
+    "# for them.\n",
+    "deterministic_rules = [\n",
+    "    block_on(\"first_name\", \"surname\", \"dob\"),\n",
+    "    block_on(\"substr(first_name,1,2)\", \"surname\", \"substr(postcode_fake,1,2)\"),\n",
+    "    block_on(\"dob\", \"postcode_fake\"),\n",
+    "]\n",
+    "\n",
+    "linker.training.estimate_probability_two_random_records_match(\n",
+    "    deterministic_rules,\n",
+    "    recall=0.6,\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:45.055454Z",
+ "iopub.status.busy": "2024-07-23T15:42:45.055045Z",
+ "iopub.status.idle": "2024-07-23T15:42:51.549320Z",
+ "shell.execute_reply": "2024-07-23T15:42:51.548201Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:44.680176Z",
- "iopub.status.busy": "2024-07-23T15:42:44.679929Z",
- "iopub.status.idle": "2024-07-23T15:42:45.051292Z",
- "shell.execute_reply": "2024-07-23T15:42:45.050482Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.000136.\n",
- "This means that amongst all possible pairwise record comparisons, one in 7,362.31 are expected to match. With 1,279,041,753 total possible comparisons, we expect a total of around 173,728.33 matching pairs\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_probability_two_random_records_match(\n",
- " [\n",
- " block_on(\"first_name\", \"surname\", \"dob\"),\n",
- " block_on(\"substr(first_name,1,2)\", \"surname\", \"substr(postcode_fake,1,2)\"),\n",
- " block_on(\"dob\", \"postcode_fake\"),\n",
- " ],\n",
- " recall=0.6,\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "----- Estimating u probabilities using random sampling -----\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name_surname (no m values are trained).\n",
+ " - dob (no m values are trained).\n",
+ " - postcode_fake (no m values are trained).\n",
+ " - birth_place (no m values are trained).\n",
+ " - occupation (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=5e6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:51.552865Z",
+ "iopub.status.busy": "2024-07-23T15:42:51.552648Z",
+ "iopub.status.idle": "2024-07-23T15:42:53.882280Z",
+ "shell.execute_reply": "2024-07-23T15:42:53.881546Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:45.055454Z",
- "iopub.status.busy": "2024-07-23T15:42:45.055045Z",
- "iopub.status.idle": "2024-07-23T15:42:51.549320Z",
- "shell.execute_reply": "2024-07-23T15:42:51.548201Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "----- Estimating u probabilities using random sampling -----\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name_surname (no m values are trained).\n",
- " - dob (no m values are trained).\n",
- " - postcode_fake (no m values are trained).\n",
- " - birth_place (no m values are trained).\n",
- " - occupation (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=5e6)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - dob\n",
+ " - postcode_fake\n",
+ " - birth_place\n",
+ " - occupation\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - first_name_surname\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.248 in probability_two_random_records_match\n",
+ "Iteration 2: Largest change in params was -0.0935 in the m_probability of postcode_fake, level `Exact match on full postcode`\n",
+ "Iteration 3: Largest change in params was -0.0239 in the m_probability of birth_place, level `Exact match on birth_place`\n",
+ "Iteration 4: Largest change in params was 0.00984 in the m_probability of birth_place, level `All other comparisons`\n",
+ "Iteration 5: Largest change in params was -0.00477 in the m_probability of birth_place, level `Exact match on birth_place`\n",
+ "Iteration 6: Largest change in params was 0.00274 in the m_probability of birth_place, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was 0.00189 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 8: Largest change in params was 0.00129 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 9: Largest change in params was 0.000863 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 10: Largest change in params was 0.000576 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 11: Largest change in params was 0.000383 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 12: Largest change in params was 0.000254 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 13: Largest change in params was 0.000169 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 14: Largest change in params was 0.000112 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "Iteration 15: Largest change in params was 7.43e-05 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
+ "\n",
+ "EM converged after 15 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name_surname (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Blocking on first name + surname means the name comparison itself cannot\n",
+    "# be estimated in this session; the dob-blocked session that follows covers it.\n",
+    "training_blocking_rule = block_on(\"first_name\", \"surname\")\n",
+    "training_session_names = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+    "    training_blocking_rule,\n",
+    "    estimate_without_term_frequencies=True,\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:53.887098Z",
+ "iopub.status.busy": "2024-07-23T15:42:53.886728Z",
+ "iopub.status.idle": "2024-07-23T15:42:55.732120Z",
+ "shell.execute_reply": "2024-07-23T15:42:55.731277Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:51.552865Z",
- "iopub.status.busy": "2024-07-23T15:42:51.552648Z",
- "iopub.status.idle": "2024-07-23T15:42:53.882280Z",
- "shell.execute_reply": "2024-07-23T15:42:53.881546Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - dob\n",
- " - postcode_fake\n",
- " - birth_place\n",
- " - occupation\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - first_name_surname\n",
- "\n",
- "Iteration 1: Largest change in params was 0.248 in probability_two_random_records_match\n",
- "Iteration 2: Largest change in params was -0.0935 in the m_probability of postcode_fake, level `Exact match on full postcode`\n",
- "Iteration 3: Largest change in params was -0.0239 in the m_probability of birth_place, level `Exact match on birth_place`\n",
- "Iteration 4: Largest change in params was 0.00984 in the m_probability of birth_place, level `All other comparisons`\n",
- "Iteration 5: Largest change in params was -0.00477 in the m_probability of birth_place, level `Exact match on birth_place`\n",
- "Iteration 6: Largest change in params was 0.00274 in the m_probability of birth_place, level `All other comparisons`\n",
- "Iteration 7: Largest change in params was 0.00189 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 8: Largest change in params was 0.00129 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 9: Largest change in params was 0.000863 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 10: Largest change in params was 0.000576 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 11: Largest change in params was 0.000383 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 12: Largest change in params was 0.000254 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 13: Largest change in params was 0.000169 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 14: Largest change in params was 0.000112 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "Iteration 15: Largest change in params was 7.43e-05 in the m_probability of dob, level `Abs date difference <= 10 year`\n",
- "\n",
- "EM converged after 15 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name_surname (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "training_blocking_rule = block_on(\"first_name\", \"surname\")\n",
- "training_session_names = (\n",
- " linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " training_blocking_rule, estimate_without_term_frequencies=True\n",
- " )\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"dob\" = r.\"dob\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name_surname\n",
+ " - postcode_fake\n",
+ " - birth_place\n",
+ " - occupation\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - dob\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.472 in the m_probability of first_name_surname, level `Exact match on first_name_surname_concat`\n",
+ "Iteration 2: Largest change in params was 0.0536 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 0.0179 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "Iteration 4: Largest change in params was 0.00547 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "Iteration 5: Largest change in params was 0.00169 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "Iteration 6: Largest change in params was 0.00053 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was 0.000168 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "Iteration 8: Largest change in params was 5.38e-05 in the m_probability of first_name_surname, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 8 iterations\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Blocking on dob lets this session estimate the name comparison, which the\n",
+    "# name-blocked session could not; dob itself is excluded from this session.\n",
+    "training_blocking_rule = block_on(\"dob\")\n",
+    "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+    "    training_blocking_rule,\n",
+    "    estimate_without_term_frequencies=True,\n",
+    ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The final match weights can be viewed in the match weights chart:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:55.736125Z",
+ "iopub.status.busy": "2024-07-23T15:42:55.735824Z",
+ "iopub.status.idle": "2024-07-23T15:42:56.076294Z",
+ "shell.execute_reply": "2024-07-23T15:42:56.075727Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:53.887098Z",
- "iopub.status.busy": "2024-07-23T15:42:53.886728Z",
- "iopub.status.idle": "2024-07-23T15:42:55.732120Z",
- "shell.execute_reply": "2024-07-23T15:42:55.731277Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"dob\" = r.\"dob\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name_surname\n",
- " - postcode_fake\n",
- " - birth_place\n",
- " - occupation\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - dob\n",
- "\n",
- "Iteration 1: Largest change in params was -0.472 in the m_probability of first_name_surname, level `Exact match on first_name_surname_concat`\n",
- "Iteration 2: Largest change in params was 0.0536 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 0.0179 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "Iteration 4: Largest change in params was 0.00547 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "Iteration 5: Largest change in params was 0.00169 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "Iteration 6: Largest change in params was 0.00053 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "Iteration 7: Largest change in params was 0.000168 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "Iteration 8: Largest change in params was 5.38e-05 in the m_probability of first_name_surname, level `All other comparisons`\n",
- "\n",
- "EM converged after 8 iterations\n",
- "\n",
- "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "training_blocking_rule = block_on(\"dob\")\n",
- "training_session_dob = (\n",
- " linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " training_blocking_rule, estimate_without_term_frequencies=True\n",
- " )\n",
- ")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The final match weights can be viewed in the match weights chart:\n"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:56.079832Z",
+ "iopub.status.busy": "2024-07-23T15:42:56.079361Z",
+ "iopub.status.idle": "2024-07-23T15:42:59.455445Z",
+ "shell.execute_reply": "2024-07-23T15:42:59.454721Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:55.736125Z",
- "iopub.status.busy": "2024-07-23T15:42:55.735824Z",
- "iopub.status.idle": "2024-07-23T15:42:56.076294Z",
- "shell.execute_reply": "2024-07-23T15:42:56.075727Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.visualisations.match_weights_chart()"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.unlinkables_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:42:59.461904Z",
+ "iopub.status.busy": "2024-07-23T15:42:59.461570Z",
+ "iopub.status.idle": "2024-07-23T15:43:01.857390Z",
+ "shell.execute_reply": "2024-07-23T15:43:01.856478Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 0.66 seconds\n",
+ "Predict time: 1.32 seconds\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:56.079832Z",
- "iopub.status.busy": "2024-07-23T15:42:56.079361Z",
- "iopub.status.idle": "2024-07-23T15:42:59.455445Z",
- "shell.execute_reply": "2024-07-23T15:42:59.454721Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " surname_l | \n",
+ " surname_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " first_name_surname_concat_l | \n",
+ " first_name_surname_concat_r | \n",
+ " ... | \n",
+ " bf_birth_place | \n",
+ " bf_tf_adj_birth_place | \n",
+ " occupation_l | \n",
+ " occupation_r | \n",
+ " gamma_occupation | \n",
+ " tf_occupation_l | \n",
+ " tf_occupation_r | \n",
+ " bf_occupation | \n",
+ " bf_tf_adj_occupation | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 11.155625 | \n",
+ " 0.999562 | \n",
+ " Q19654778-17 | \n",
+ " Q19654778-4 | \n",
+ " chattock | \n",
+ " chattock | \n",
+ " richard | \n",
+ " ritchie | \n",
+ " richard chattock | \n",
+ " ritchie chattock | \n",
+ " ... | \n",
+ " 0.164723 | \n",
+ " 1.000000 | \n",
+ " photographer | \n",
+ " photographer | \n",
+ " 1 | \n",
+ " 0.018862 | \n",
+ " 0.018862 | \n",
+ " 23.537422 | \n",
+ " 2.020099 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 21.080818 | \n",
+ " 1.000000 | \n",
+ " Q2331144-2 | \n",
+ " Q2331144-9 | \n",
+ " caine | \n",
+ " caine | \n",
+ " sir | \n",
+ " hall | \n",
+ " sir caine | \n",
+ " hall caine | \n",
+ " ... | \n",
+ " 165.631265 | \n",
+ " 20.031894 | \n",
+ " novelist | \n",
+ " writer | \n",
+ " 0 | \n",
+ " 0.007078 | \n",
+ " 0.053264 | \n",
+ " 0.107239 | \n",
+ " 1.000000 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 20.499240 | \n",
+ " 0.999999 | \n",
+ " Q3377781-1 | \n",
+ " Q3377781-4 | \n",
+ " meux | \n",
+ " meux | \n",
+ " hedworth | \n",
+ " admiral | \n",
+ " hedworth meux | \n",
+ " admiral meux | \n",
+ " ... | \n",
+ " 165.631265 | \n",
+ " 0.094897 | \n",
+ " politician | \n",
+ " politician | \n",
+ " 1 | \n",
+ " 0.088932 | \n",
+ " 0.088932 | \n",
+ " 23.537422 | \n",
+ " 0.428451 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 20.499240 | \n",
+ " 0.999999 | \n",
+ " Q3377781-2 | \n",
+ " Q3377781-4 | \n",
+ " meux | \n",
+ " meux | \n",
+ " hedworth | \n",
+ " admiral | \n",
+ " hedworth meux | \n",
+ " admiral meux | \n",
+ " ... | \n",
+ " 165.631265 | \n",
+ " 0.094897 | \n",
+ " politician | \n",
+ " politician | \n",
+ " 1 | \n",
+ " 0.088932 | \n",
+ " 0.088932 | \n",
+ " 23.537422 | \n",
+ " 0.428451 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 20.499240 | \n",
+ " 0.999999 | \n",
+ " Q3377781-3 | \n",
+ " Q3377781-4 | \n",
+ " meux | \n",
+ " meux | \n",
+ " hedworth | \n",
+ " admiral | \n",
+ " hedworth meux | \n",
+ " admiral meux | \n",
+ " ... | \n",
+ " 165.631265 | \n",
+ " 0.094897 | \n",
+ " politician | \n",
+ " politician | \n",
+ " 1 | \n",
+ " 0.088932 | \n",
+ " 0.088932 | \n",
+ " 23.537422 | \n",
+ " 0.428451 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 42 columns
\n",
+ "
"
],
- "source": [
- "linker.evaluation.unlinkables_chart()"
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r surname_l \\\n",
+ "0 11.155625 0.999562 Q19654778-17 Q19654778-4 chattock \n",
+ "1 21.080818 1.000000 Q2331144-2 Q2331144-9 caine \n",
+ "2 20.499240 0.999999 Q3377781-1 Q3377781-4 meux \n",
+ "3 20.499240 0.999999 Q3377781-2 Q3377781-4 meux \n",
+ "4 20.499240 0.999999 Q3377781-3 Q3377781-4 meux \n",
+ "\n",
+ " surname_r first_name_l first_name_r first_name_surname_concat_l \\\n",
+ "0 chattock richard ritchie richard chattock \n",
+ "1 caine sir hall sir caine \n",
+ "2 meux hedworth admiral hedworth meux \n",
+ "3 meux hedworth admiral hedworth meux \n",
+ "4 meux hedworth admiral hedworth meux \n",
+ "\n",
+ " first_name_surname_concat_r ... bf_birth_place bf_tf_adj_birth_place \\\n",
+ "0 ritchie chattock ... 0.164723 1.000000 \n",
+ "1 hall caine ... 165.631265 20.031894 \n",
+ "2 admiral meux ... 165.631265 0.094897 \n",
+ "3 admiral meux ... 165.631265 0.094897 \n",
+ "4 admiral meux ... 165.631265 0.094897 \n",
+ "\n",
+ " occupation_l occupation_r gamma_occupation tf_occupation_l \\\n",
+ "0 photographer photographer 1 0.018862 \n",
+ "1 novelist writer 0 0.007078 \n",
+ "2 politician politician 1 0.088932 \n",
+ "3 politician politician 1 0.088932 \n",
+ "4 politician politician 1 0.088932 \n",
+ "\n",
+ " tf_occupation_r bf_occupation bf_tf_adj_occupation match_key \n",
+ "0 0.018862 23.537422 2.020099 4 \n",
+ "1 0.053264 0.107239 1.000000 4 \n",
+ "2 0.088932 23.537422 0.428451 4 \n",
+ "3 0.088932 23.537422 0.428451 4 \n",
+ "4 0.088932 23.537422 0.428451 4 \n",
+ "\n",
+ "[5 rows x 42 columns]"
]
- },
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_predict = linker.inference.predict()\n",
+ "df_e = df_predict.as_pandas_dataframe(limit=5)\n",
+ "df_e"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also view rows in this dataset as a waterfall chart as follows:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:43:01.862898Z",
+ "iopub.status.busy": "2024-07-23T15:43:01.862461Z",
+ "iopub.status.idle": "2024-07-23T15:43:03.200386Z",
+ "shell.execute_reply": "2024-07-23T15:43:03.198944Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:42:59.461904Z",
- "iopub.status.busy": "2024-07-23T15:42:59.461570Z",
- "iopub.status.idle": "2024-07-23T15:43:01.857390Z",
- "shell.execute_reply": "2024-07-23T15:43:01.856478Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Blocking time: 0.66 seconds\n",
- "Predict time: 1.32 seconds\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " surname_l | \n",
- " surname_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " first_name_surname_concat_l | \n",
- " first_name_surname_concat_r | \n",
- " ... | \n",
- " bf_birth_place | \n",
- " bf_tf_adj_birth_place | \n",
- " occupation_l | \n",
- " occupation_r | \n",
- " gamma_occupation | \n",
- " tf_occupation_l | \n",
- " tf_occupation_r | \n",
- " bf_occupation | \n",
- " bf_tf_adj_occupation | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 11.155625 | \n",
- " 0.999562 | \n",
- " Q19654778-17 | \n",
- " Q19654778-4 | \n",
- " chattock | \n",
- " chattock | \n",
- " richard | \n",
- " ritchie | \n",
- " richard chattock | \n",
- " ritchie chattock | \n",
- " ... | \n",
- " 0.164723 | \n",
- " 1.000000 | \n",
- " photographer | \n",
- " photographer | \n",
- " 1 | \n",
- " 0.018862 | \n",
- " 0.018862 | \n",
- " 23.537422 | \n",
- " 2.020099 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 21.080818 | \n",
- " 1.000000 | \n",
- " Q2331144-2 | \n",
- " Q2331144-9 | \n",
- " caine | \n",
- " caine | \n",
- " sir | \n",
- " hall | \n",
- " sir caine | \n",
- " hall caine | \n",
- " ... | \n",
- " 165.631265 | \n",
- " 20.031894 | \n",
- " novelist | \n",
- " writer | \n",
- " 0 | \n",
- " 0.007078 | \n",
- " 0.053264 | \n",
- " 0.107239 | \n",
- " 1.000000 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 20.499240 | \n",
- " 0.999999 | \n",
- " Q3377781-1 | \n",
- " Q3377781-4 | \n",
- " meux | \n",
- " meux | \n",
- " hedworth | \n",
- " admiral | \n",
- " hedworth meux | \n",
- " admiral meux | \n",
- " ... | \n",
- " 165.631265 | \n",
- " 0.094897 | \n",
- " politician | \n",
- " politician | \n",
- " 1 | \n",
- " 0.088932 | \n",
- " 0.088932 | \n",
- " 23.537422 | \n",
- " 0.428451 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 20.499240 | \n",
- " 0.999999 | \n",
- " Q3377781-2 | \n",
- " Q3377781-4 | \n",
- " meux | \n",
- " meux | \n",
- " hedworth | \n",
- " admiral | \n",
- " hedworth meux | \n",
- " admiral meux | \n",
- " ... | \n",
- " 165.631265 | \n",
- " 0.094897 | \n",
- " politician | \n",
- " politician | \n",
- " 1 | \n",
- " 0.088932 | \n",
- " 0.088932 | \n",
- " 23.537422 | \n",
- " 0.428451 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 20.499240 | \n",
- " 0.999999 | \n",
- " Q3377781-3 | \n",
- " Q3377781-4 | \n",
- " meux | \n",
- " meux | \n",
- " hedworth | \n",
- " admiral | \n",
- " hedworth meux | \n",
- " admiral meux | \n",
- " ... | \n",
- " 165.631265 | \n",
- " 0.094897 | \n",
- " politician | \n",
- " politician | \n",
- " 1 | \n",
- " 0.088932 | \n",
- " 0.088932 | \n",
- " 23.537422 | \n",
- " 0.428451 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 42 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r surname_l \\\n",
- "0 11.155625 0.999562 Q19654778-17 Q19654778-4 chattock \n",
- "1 21.080818 1.000000 Q2331144-2 Q2331144-9 caine \n",
- "2 20.499240 0.999999 Q3377781-1 Q3377781-4 meux \n",
- "3 20.499240 0.999999 Q3377781-2 Q3377781-4 meux \n",
- "4 20.499240 0.999999 Q3377781-3 Q3377781-4 meux \n",
- "\n",
- " surname_r first_name_l first_name_r first_name_surname_concat_l \\\n",
- "0 chattock richard ritchie richard chattock \n",
- "1 caine sir hall sir caine \n",
- "2 meux hedworth admiral hedworth meux \n",
- "3 meux hedworth admiral hedworth meux \n",
- "4 meux hedworth admiral hedworth meux \n",
- "\n",
- " first_name_surname_concat_r ... bf_birth_place bf_tf_adj_birth_place \\\n",
- "0 ritchie chattock ... 0.164723 1.000000 \n",
- "1 hall caine ... 165.631265 20.031894 \n",
- "2 admiral meux ... 165.631265 0.094897 \n",
- "3 admiral meux ... 165.631265 0.094897 \n",
- "4 admiral meux ... 165.631265 0.094897 \n",
- "\n",
- " occupation_l occupation_r gamma_occupation tf_occupation_l \\\n",
- "0 photographer photographer 1 0.018862 \n",
- "1 novelist writer 0 0.007078 \n",
- "2 politician politician 1 0.088932 \n",
- "3 politician politician 1 0.088932 \n",
- "4 politician politician 1 0.088932 \n",
- "\n",
- " tf_occupation_r bf_occupation bf_tf_adj_occupation match_key \n",
- "0 0.018862 23.537422 2.020099 4 \n",
- "1 0.053264 0.107239 1.000000 4 \n",
- "2 0.088932 23.537422 0.428451 4 \n",
- "3 0.088932 23.537422 0.428451 4 \n",
- "4 0.088932 23.537422 0.428451 4 \n",
- "\n",
- "[5 rows x 42 columns]"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "df_predict = linker.inference.predict()\n",
- "df_e = df_predict.as_pandas_dataframe(limit=5)\n",
- "df_e"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "records_to_plot = df_e.to_dict(orient=\"records\")\n",
+ "linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:43:03.208830Z",
+ "iopub.status.busy": "2024-07-23T15:43:03.207925Z",
+ "iopub.status.idle": "2024-07-23T15:43:03.888871Z",
+ "shell.execute_reply": "2024-07-23T15:43:03.888071Z"
+ }
+ },
+ "outputs": [
{
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can also view rows in this dataset as a waterfall chart as follows:\n"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 1, num representatives needing updating: 810\n",
+ "Completed iteration 2, num representatives needing updating: 183\n",
+ "Completed iteration 3, num representatives needing updating: 59\n",
+ "Completed iteration 4, num representatives needing updating: 6\n",
+ "Completed iteration 5, num representatives needing updating: 1\n",
+ "Completed iteration 6, num representatives needing updating: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " df_predict, threshold_match_probability=0.95\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:43:03.893162Z",
+ "iopub.status.busy": "2024-07-23T15:43:03.892847Z",
+ "iopub.status.idle": "2024-07-23T15:43:04.163632Z",
+ "shell.execute_reply": "2024-07-23T15:43:04.162854Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:43:01.862898Z",
- "iopub.status.busy": "2024-07-23T15:43:01.862461Z",
- "iopub.status.idle": "2024-07-23T15:43:03.200386Z",
- "shell.execute_reply": "2024-07-23T15:43:03.198944Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
],
- "source": [
- "records_to_plot = df_e.to_dict(orient=\"records\")\n",
- "linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)"
+ "text/plain": [
+ ""
]
- },
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from IPython.display import IFrame\n",
+ "\n",
+ "linker.visualisations.cluster_studio_dashboard(\n",
+ " df_predict,\n",
+ " clusters,\n",
+ " \"dashboards/50k_cluster.html\",\n",
+ " sampling_method=\"by_cluster_size\",\n",
+ " overwrite=True,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:43:04.167619Z",
+ "iopub.status.busy": "2024-07-23T15:43:04.167286Z",
+ "iopub.status.idle": "2024-07-23T15:43:20.892263Z",
+ "shell.execute_reply": "2024-07-23T15:43:20.891045Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:43:03.208830Z",
- "iopub.status.busy": "2024-07-23T15:43:03.207925Z",
- "iopub.status.idle": "2024-07-23T15:43:03.888871Z",
- "shell.execute_reply": "2024-07-23T15:43:03.888071Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 1, num representatives needing updating: 810\n",
- "Completed iteration 2, num representatives needing updating: 183\n",
- "Completed iteration 3, num representatives needing updating: 59\n",
- "Completed iteration 4, num representatives needing updating: 6\n",
- "Completed iteration 5, num representatives needing updating: 1\n",
- "Completed iteration 6, num representatives needing updating: 0\n"
- ]
- }
- ],
- "source": [
- "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " df_predict, threshold_match_probability=0.95\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 1.37 seconds\n",
+ "Predict time: 1.38 seconds\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:43:03.893162Z",
- "iopub.status.busy": "2024-07-23T15:43:03.892847Z",
- "iopub.status.idle": "2024-07-23T15:43:04.163632Z",
- "shell.execute_reply": "2024-07-23T15:43:04.162854Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from IPython.display import IFrame\n",
- "\n",
- "linker.visualisations.cluster_studio_dashboard(\n",
- " df_predict,\n",
- " clusters,\n",
- " \"dashboards/50k_cluster.html\",\n",
- " sampling_method=\"by_cluster_size\",\n",
- " overwrite=True,\n",
- ")\n",
- "\n",
- "\n",
- "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_column(\n",
+ " \"cluster\", output_type=\"accuracy\", match_weight_round_to_nearest=0.02\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:43:20.938453Z",
+ "iopub.status.busy": "2024-07-23T15:43:20.938120Z",
+ "iopub.status.idle": "2024-07-23T15:43:46.638355Z",
+ "shell.execute_reply": "2024-07-23T15:43:46.637640Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:43:04.167619Z",
- "iopub.status.busy": "2024-07-23T15:43:04.167286Z",
- "iopub.status.idle": "2024-07-23T15:43:20.892263Z",
- "shell.execute_reply": "2024-07-23T15:43:20.891045Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Blocking time: 1.37 seconds\n",
- "Predict time: 1.38 seconds\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_column(\n",
- " \"cluster\", output_type=\"accuracy\", match_weight_round_to_nearest=0.02\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 1.80 seconds\n",
+ "Predict time: 0.59 seconds\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:43:20.938453Z",
- "iopub.status.busy": "2024-07-23T15:43:20.938120Z",
- "iopub.status.idle": "2024-07-23T15:43:46.638355Z",
- "shell.execute_reply": "2024-07-23T15:43:46.637640Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Blocking time: 1.80 seconds\n",
- "Predict time: 0.59 seconds\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "records = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\",\n",
- " threshold_match_probability=0.999,\n",
- " include_false_negatives=False,\n",
- " include_false_positives=True,\n",
- ").as_record_dict()\n",
- "linker.visualisations.waterfall_chart(records)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "records = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\",\n",
+ " threshold_match_probability=0.999,\n",
+ " include_false_negatives=False,\n",
+ " include_false_positives=True,\n",
+ ").as_record_dict()\n",
+ "linker.visualisations.waterfall_chart(records)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-23T15:43:46.673037Z",
+ "iopub.status.busy": "2024-07-23T15:43:46.672255Z",
+ "iopub.status.idle": "2024-07-23T15:43:51.079488Z",
+ "shell.execute_reply": "2024-07-23T15:43:51.078834Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 1.08 seconds\n",
+ "Predict time: 0.48 seconds\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-23T15:43:46.673037Z",
- "iopub.status.busy": "2024-07-23T15:43:46.672255Z",
- "iopub.status.idle": "2024-07-23T15:43:51.079488Z",
- "shell.execute_reply": "2024-07-23T15:43:51.078834Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Blocking time: 1.08 seconds\n",
- "Predict time: 0.48 seconds\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "# Some of the false negatives will be because they weren't detected by the blocking rules\n",
- "records = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\",\n",
- " threshold_match_probability=0.5,\n",
- " include_false_negatives=True,\n",
- " include_false_positives=False,\n",
- ").as_record_dict(limit=50)\n",
- "\n",
- "linker.visualisations.waterfall_chart(records)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "4cf4ede97b3d45be967484f88714ab49": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_ecb0553265084101970f63e3164eb846",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_767db5e0436a4ceda3bcac823cbdc818",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "767db5e0436a4ceda3bcac823cbdc818": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "ecb0553265084101970f63e3164eb846": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
- }
- }
+ ],
+ "source": [
+ "# Some of the false negatives will be because they weren't detected by the blocking rules\n",
+ "records = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\",\n",
+ " threshold_match_probability=0.5,\n",
+ " include_false_negatives=True,\n",
+ " include_false_positives=False,\n",
+ ").as_record_dict(limit=50)\n",
+ "\n",
+ "linker.visualisations.waterfall_chart(records)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
},
- "nbformat": 4,
- "nbformat_minor": 4
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "4cf4ede97b3d45be967484f88714ab49": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_ecb0553265084101970f63e3164eb846",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_767db5e0436a4ceda3bcac823cbdc818",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "767db5e0436a4ceda3bcac823cbdc818": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "ecb0553265084101970f63e3164eb846": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb
index 62543ceb2d..5d0ce6fabc 100644
--- a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb
+++ b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb
@@ -1,826 +1,826 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Linking a dataset of real historical persons with Deterrministic Rules\n",
- "\n",
- "While Splink is primarily a tool for probabilistic records linkage, it includes functionality to perform deterministic (i.e. rules based) linkage.\n",
- "\n",
- "Significant work has gone into optimising the performance of rules based matching, so Splink is likely to be significantly faster than writing the basic SQL by hand.\n",
- "\n",
- "In this example, we deduplicate a 50k row dataset based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced. The probabilistic dedupe of the same dataset can be found at `Deduplicate 50k rows historical persons`.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:10:59.567669Z",
- "iopub.status.busy": "2024-06-07T09:10:59.567311Z",
- "iopub.status.idle": "2024-06-07T09:10:59.591784Z",
- "shell.execute_reply": "2024-06-07T09:10:59.590923Z"
- }
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
- },
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linking a dataset of real historical persons with Deterministic Rules\n",
+ "\n",
+ "While Splink is primarily a tool for probabilistic records linkage, it includes functionality to perform deterministic (i.e. rules based) linkage.\n",
+ "\n",
+ "Significant work has gone into optimising the performance of rules based matching, so Splink is likely to be significantly faster than writing the basic SQL by hand.\n",
+ "\n",
+ "In this example, we deduplicate a 50k row dataset based on historical persons scraped from Wikidata. Duplicate records are introduced with a variety of errors. The probabilistic dedupe of the same dataset can be found at `Deduplicate 50k rows historical persons`.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:10:59.567669Z",
+ "iopub.status.busy": "2024-06-07T09:10:59.567311Z",
+ "iopub.status.idle": "2024-06-07T09:10:59.591784Z",
+ "shell.execute_reply": "2024-06-07T09:10:59.590923Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:10:59.595969Z",
+ "iopub.status.busy": "2024-06-07T09:10:59.595667Z",
+ "iopub.status.idle": "2024-06-07T09:11:01.007136Z",
+ "shell.execute_reply": "2024-06-07T09:11:01.006553Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:10:59.595969Z",
- "iopub.status.busy": "2024-06-07T09:10:59.595667Z",
- "iopub.status.idle": "2024-06-07T09:11:01.007136Z",
- "shell.execute_reply": "2024-06-07T09:11:01.006553Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id | \n",
- " cluster | \n",
- " full_name | \n",
- " first_and_surname | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " birth_place | \n",
- " postcode_fake | \n",
- " gender | \n",
- " occupation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Q2296770-1 | \n",
- " Q2296770 | \n",
- " thomas clifford, 1st baron clifford of chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Q2296770-2 | \n",
- " Q2296770 | \n",
- " thomas of chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Q2296770-3 | \n",
- " Q2296770 | \n",
- " tom 1st baron clifford of chudleigh | \n",
- " tom chudleigh | \n",
- " tom | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Q2296770-4 | \n",
- " Q2296770 | \n",
- " thomas 1st chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8hu | \n",
- " None | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Q2296770-5 | \n",
- " Q2296770 | \n",
- " thomas clifford, 1st baron chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " None | \n",
- " politician | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id cluster full_name \\\n",
- "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n",
- "1 Q2296770-2 Q2296770 thomas of chudleigh \n",
- "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n",
- "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n",
- "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n",
- "\n",
- " first_and_surname first_name surname dob birth_place \\\n",
- "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "2 tom chudleigh tom chudleigh 1630-08-01 devon \n",
- "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "\n",
- " postcode_fake gender occupation \n",
- "0 tq13 8df male politician \n",
- "1 tq13 8df male politician \n",
- "2 tq13 8df male politician \n",
- "3 tq13 8hu None politician \n",
- "4 tq13 8df None politician "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id | \n",
+ " cluster | \n",
+ " full_name | \n",
+ " first_and_surname | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " birth_place | \n",
+ " postcode_fake | \n",
+ " gender | \n",
+ " occupation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Q2296770-1 | \n",
+ " Q2296770 | \n",
+ " thomas clifford, 1st baron clifford of chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Q2296770-2 | \n",
+ " Q2296770 | \n",
+ " thomas of chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Q2296770-3 | \n",
+ " Q2296770 | \n",
+ " tom 1st baron clifford of chudleigh | \n",
+ " tom chudleigh | \n",
+ " tom | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Q2296770-4 | \n",
+ " Q2296770 | \n",
+ " thomas 1st chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8hu | \n",
+ " None | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Q2296770-5 | \n",
+ " Q2296770 | \n",
+ " thomas clifford, 1st baron chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " None | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "import pandas as pd\n",
- "\n",
- "from splink import splink_datasets\n",
- "\n",
- "pd.options.display.max_rows = 1000\n",
- "df = splink_datasets.historical_50k\n",
- "df.head()"
+ "text/plain": [
+ " unique_id cluster full_name \\\n",
+ "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n",
+ "1 Q2296770-2 Q2296770 thomas of chudleigh \n",
+ "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n",
+ "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n",
+ "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n",
+ "\n",
+ " first_and_surname first_name surname dob birth_place \\\n",
+ "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "2 tom chudleigh tom chudleigh 1630-08-01 devon \n",
+ "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "\n",
+ " postcode_fake gender occupation \n",
+ "0 tq13 8df male politician \n",
+ "1 tq13 8df male politician \n",
+ "2 tq13 8df male politician \n",
+ "3 tq13 8hu None politician \n",
+ "4 tq13 8df None politician "
]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "When defining the settings object, specity your deterministic rules in the `blocking_rules_to_generate_predictions` key.\n",
- "\n",
- "For a deterministic linkage, the linkage methodology is based solely on these rules, so there is no need to define `comparisons` nor any other parameters required for model training in a probabilistic model.\n"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Prior to running the linkage, it's usually a good idea to check how many record comparisons will be generated by your deterministic rules:\n"
- ]
- },
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from splink import splink_datasets\n",
+ "\n",
+ "pd.options.display.max_rows = 1000\n",
+ "df = splink_datasets.historical_50k\n",
+ "df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "When defining the settings object, specify your deterministic rules in the `blocking_rules_to_generate_predictions` key.\n",
+ "\n",
+ "For a deterministic linkage, the linkage methodology is based solely on these rules, so there is no need to define `comparisons` nor any other parameters required for model training in a probabilistic model.\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Prior to running the linkage, it's usually a good idea to check how many record comparisons will be generated by your deterministic rules:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:01.050336Z",
+ "iopub.status.busy": "2024-06-07T09:11:01.049679Z",
+ "iopub.status.idle": "2024-06-07T09:11:01.602823Z",
+ "shell.execute_reply": "2024-06-07T09:11:01.601902Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:01.050336Z",
- "iopub.status.busy": "2024-06-07T09:11:01.049679Z",
- "iopub.status.idle": "2024-06-07T09:11:01.602823Z",
- "shell.execute_reply": "2024-06-07T09:11:01.601902Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import DuckDBAPI, block_on\n",
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=df,\n",
- " blocking_rules=[\n",
- " block_on(\"first_name\", \"surname\", \"dob\"),\n",
- " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n",
- " block_on(\"first_name\", \"dob\", \"occupation\"),\n",
- " ],\n",
- " db_api=db_api,\n",
- " link_type=\"dedupe_only\",\n",
- ")"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:01.606853Z",
- "iopub.status.busy": "2024-06-07T09:11:01.606539Z",
- "iopub.status.idle": "2024-06-07T09:11:01.691839Z",
- "shell.execute_reply": "2024-06-07T09:11:01.690988Z"
- }
- },
- "outputs": [],
- "source": [
- "from splink import Linker, SettingsCreator\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"first_name\", \"surname\", \"dob\"),\n",
- " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n",
- " block_on(\"first_name\", \"dob\", \"occupation\"),\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")\n",
- "\n",
- "linker = Linker(df, settings, db_api=db_api)\n"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The results of the linkage can be viewed with the `deterministic_link` function.\n"
- ]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import DuckDBAPI, block_on\n",
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " df_sdf,\n",
+ " blocking_rules=[\n",
+ " block_on(\"first_name\", \"surname\", \"dob\"),\n",
+ " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n",
+ " block_on(\"first_name\", \"dob\", \"occupation\"),\n",
+ " ],\n",
+ " link_type=\"dedupe_only\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:01.606853Z",
+ "iopub.status.busy": "2024-06-07T09:11:01.606539Z",
+ "iopub.status.idle": "2024-06-07T09:11:01.691839Z",
+ "shell.execute_reply": "2024-06-07T09:11:01.690988Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import Linker, SettingsCreator\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"dedupe_only\",\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"first_name\", \"surname\", \"dob\"),\n",
+ " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n",
+ " block_on(\"first_name\", \"dob\", \"occupation\"),\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The results of the linkage can be viewed with the `deterministic_link` function.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:01.695906Z",
+ "iopub.status.busy": "2024-06-07T09:11:01.695600Z",
+ "iopub.status.idle": "2024-06-07T09:11:01.995020Z",
+ "shell.execute_reply": "2024-06-07T09:11:01.994289Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:01.695906Z",
- "iopub.status.busy": "2024-06-07T09:11:01.695600Z",
- "iopub.status.idle": "2024-06-07T09:11:01.995020Z",
- "shell.execute_reply": "2024-06-07T09:11:01.994289Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " occupation_l | \n",
- " occupation_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " dob_l | \n",
- " dob_r | \n",
- " surname_l | \n",
- " surname_r | \n",
- " postcode_fake_l | \n",
- " postcode_fake_r | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Q55455287-12 | \n",
- " Q55455287-2 | \n",
- " None | \n",
- " writer | \n",
- " jaido | \n",
- " jaido | \n",
- " 1836-01-01 | \n",
- " 1836-01-01 | \n",
- " morata | \n",
- " morata | \n",
- " ta4 2ug | \n",
- " ta4 2uu | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Q55455287-12 | \n",
- " Q55455287-3 | \n",
- " None | \n",
- " writer | \n",
- " jaido | \n",
- " jaido | \n",
- " 1836-01-01 | \n",
- " 1836-01-01 | \n",
- " morata | \n",
- " morata | \n",
- " ta4 2ug | \n",
- " ta4 2uu | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Q55455287-12 | \n",
- " Q55455287-4 | \n",
- " None | \n",
- " writer | \n",
- " jaido | \n",
- " jaido | \n",
- " 1836-01-01 | \n",
- " 1836-01-01 | \n",
- " morata | \n",
- " morata | \n",
- " ta4 2ug | \n",
- " ta4 2sz | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Q55455287-12 | \n",
- " Q55455287-5 | \n",
- " None | \n",
- " None | \n",
- " jaido | \n",
- " jaido | \n",
- " 1836-01-01 | \n",
- " 1836-01-01 | \n",
- " morata | \n",
- " morata | \n",
- " ta4 2ug | \n",
- " ta4 2ug | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Q55455287-12 | \n",
- " Q55455287-6 | \n",
- " None | \n",
- " writer | \n",
- " jaido | \n",
- " jaido | \n",
- " 1836-01-01 | \n",
- " 1836-01-01 | \n",
- " morata | \n",
- " morata | \n",
- " ta4 2ug | \n",
- " None | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id_l unique_id_r occupation_l occupation_r first_name_l \\\n",
- "0 Q55455287-12 Q55455287-2 None writer jaido \n",
- "1 Q55455287-12 Q55455287-3 None writer jaido \n",
- "2 Q55455287-12 Q55455287-4 None writer jaido \n",
- "3 Q55455287-12 Q55455287-5 None None jaido \n",
- "4 Q55455287-12 Q55455287-6 None writer jaido \n",
- "\n",
- " first_name_r dob_l dob_r surname_l surname_r postcode_fake_l \\\n",
- "0 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
- "1 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
- "2 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
- "3 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
- "4 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
- "\n",
- " postcode_fake_r match_key \n",
- "0 ta4 2uu 0 \n",
- "1 ta4 2uu 0 \n",
- "2 ta4 2sz 0 \n",
- "3 ta4 2ug 0 \n",
- "4 None 0 "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " occupation_l | \n",
+ " occupation_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " dob_l | \n",
+ " dob_r | \n",
+ " surname_l | \n",
+ " surname_r | \n",
+ " postcode_fake_l | \n",
+ " postcode_fake_r | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Q55455287-12 | \n",
+ " Q55455287-2 | \n",
+ " None | \n",
+ " writer | \n",
+ " jaido | \n",
+ " jaido | \n",
+ " 1836-01-01 | \n",
+ " 1836-01-01 | \n",
+ " morata | \n",
+ " morata | \n",
+ " ta4 2ug | \n",
+ " ta4 2uu | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Q55455287-12 | \n",
+ " Q55455287-3 | \n",
+ " None | \n",
+ " writer | \n",
+ " jaido | \n",
+ " jaido | \n",
+ " 1836-01-01 | \n",
+ " 1836-01-01 | \n",
+ " morata | \n",
+ " morata | \n",
+ " ta4 2ug | \n",
+ " ta4 2uu | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Q55455287-12 | \n",
+ " Q55455287-4 | \n",
+ " None | \n",
+ " writer | \n",
+ " jaido | \n",
+ " jaido | \n",
+ " 1836-01-01 | \n",
+ " 1836-01-01 | \n",
+ " morata | \n",
+ " morata | \n",
+ " ta4 2ug | \n",
+ " ta4 2sz | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Q55455287-12 | \n",
+ " Q55455287-5 | \n",
+ " None | \n",
+ " None | \n",
+ " jaido | \n",
+ " jaido | \n",
+ " 1836-01-01 | \n",
+ " 1836-01-01 | \n",
+ " morata | \n",
+ " morata | \n",
+ " ta4 2ug | \n",
+ " ta4 2ug | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Q55455287-12 | \n",
+ " Q55455287-6 | \n",
+ " None | \n",
+ " writer | \n",
+ " jaido | \n",
+ " jaido | \n",
+ " 1836-01-01 | \n",
+ " 1836-01-01 | \n",
+ " morata | \n",
+ " morata | \n",
+ " ta4 2ug | \n",
+ " None | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "df_predict = linker.inference.deterministic_link()\n",
- "df_predict.as_pandas_dataframe().head()"
+ "text/plain": [
+ " unique_id_l unique_id_r occupation_l occupation_r first_name_l \\\n",
+ "0 Q55455287-12 Q55455287-2 None writer jaido \n",
+ "1 Q55455287-12 Q55455287-3 None writer jaido \n",
+ "2 Q55455287-12 Q55455287-4 None writer jaido \n",
+ "3 Q55455287-12 Q55455287-5 None None jaido \n",
+ "4 Q55455287-12 Q55455287-6 None writer jaido \n",
+ "\n",
+ " first_name_r dob_l dob_r surname_l surname_r postcode_fake_l \\\n",
+ "0 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
+ "1 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
+ "2 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
+ "3 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
+ "4 jaido 1836-01-01 1836-01-01 morata morata ta4 2ug \n",
+ "\n",
+ " postcode_fake_r match_key \n",
+ "0 ta4 2uu 0 \n",
+ "1 ta4 2uu 0 \n",
+ "2 ta4 2sz 0 \n",
+ "3 ta4 2ug 0 \n",
+ "4 None 0 "
]
- },
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_predict = linker.inference.deterministic_link()\n",
+ "df_predict.as_pandas_dataframe().head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The resulting pairwise links can then be used to generate clusters.\n",
+ "\n",
+ "Note, for deterministic linkage, each comparison has been assigned a match probability of 1, so to generate clusters, set `threshold_match_probability=1` in the `cluster_pairwise_predictions_at_threshold` function.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:01.998965Z",
+ "iopub.status.busy": "2024-06-07T09:11:01.998665Z",
+ "iopub.status.idle": "2024-06-07T09:11:02.348788Z",
+ "shell.execute_reply": "2024-06-07T09:11:02.348039Z"
+ }
+ },
+ "outputs": [
{
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Which can be used to generate clusters.\n",
- "\n",
- "Note, for deterministic linkage, each comparison has been assigned a match probability of 1, so to generate clusters, set `threshold_match_probability=1` in the `cluster_pairwise_predictions_at_threshold` function.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 1, root rows count 94\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:01.998965Z",
- "iopub.status.busy": "2024-06-07T09:11:01.998665Z",
- "iopub.status.idle": "2024-06-07T09:11:02.348788Z",
- "shell.execute_reply": "2024-06-07T09:11:02.348039Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 1, root rows count 94\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 2, root rows count 10\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 3, root rows count 0\n"
- ]
- }
- ],
- "source": [
- "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " df_predict\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 2, root rows count 10\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:02.352872Z",
- "iopub.status.busy": "2024-06-07T09:11:02.352366Z",
- "iopub.status.idle": "2024-06-07T09:11:02.367858Z",
- "shell.execute_reply": "2024-06-07T09:11:02.367179Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cluster_id | \n",
- " unique_id | \n",
- " cluster | \n",
- " full_name | \n",
- " first_and_surname | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " birth_place | \n",
- " postcode_fake | \n",
- " gender | \n",
- " occupation | \n",
- " __splink_salt | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Q16025107-1 | \n",
- " Q5497940-9 | \n",
- " Q5497940 | \n",
- " frederick hall | \n",
- " frederick hall | \n",
- " frederick | \n",
- " hall | \n",
- " 1855-01-01 | \n",
- " bristol, city of | \n",
- " bs11 9pn | \n",
- " None | \n",
- " None | \n",
- " 0.002739 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Q1149445-1 | \n",
- " Q1149445-9 | \n",
- " Q1149445 | \n",
- " earl egerton | \n",
- " earl egerton | \n",
- " earl | \n",
- " egerton | \n",
- " 1800-01-01 | \n",
- " westminster | \n",
- " w1d 2hf | \n",
- " None | \n",
- " None | \n",
- " 0.991459 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Q20664532-1 | \n",
- " Q21466387-2 | \n",
- " Q21466387 | \n",
- " harry brooker | \n",
- " harry brooker | \n",
- " harry | \n",
- " brooker | \n",
- " 1848-01-01 | \n",
- " plymouth | \n",
- " pl4 9hx | \n",
- " male | \n",
- " painter | \n",
- " 0.506127 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Q1124636-1 | \n",
- " Q1124636-12 | \n",
- " Q1124636 | \n",
- " tom stapleton | \n",
- " tom stapleton | \n",
- " tom | \n",
- " stapleton | \n",
- " 1535-01-01 | \n",
- " None | \n",
- " bn6 9na | \n",
- " male | \n",
- " theologian | \n",
- " 0.612694 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Q18508292-1 | \n",
- " Q21466711-4 | \n",
- " Q21466711 | \n",
- " harry s0ence | \n",
- " harry s0ence | \n",
- " harry | \n",
- " s0ence | \n",
- " 1860-01-01 | \n",
- " london | \n",
- " se1 7pb | \n",
- " male | \n",
- " painter | \n",
- " 0.488917 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " cluster_id unique_id cluster full_name first_and_surname \\\n",
- "0 Q16025107-1 Q5497940-9 Q5497940 frederick hall frederick hall \n",
- "1 Q1149445-1 Q1149445-9 Q1149445 earl egerton earl egerton \n",
- "2 Q20664532-1 Q21466387-2 Q21466387 harry brooker harry brooker \n",
- "3 Q1124636-1 Q1124636-12 Q1124636 tom stapleton tom stapleton \n",
- "4 Q18508292-1 Q21466711-4 Q21466711 harry s0ence harry s0ence \n",
- "\n",
- " first_name surname dob birth_place postcode_fake gender \\\n",
- "0 frederick hall 1855-01-01 bristol, city of bs11 9pn None \n",
- "1 earl egerton 1800-01-01 westminster w1d 2hf None \n",
- "2 harry brooker 1848-01-01 plymouth pl4 9hx male \n",
- "3 tom stapleton 1535-01-01 None bn6 9na male \n",
- "4 harry s0ence 1860-01-01 london se1 7pb male \n",
- "\n",
- " occupation __splink_salt \n",
- "0 None 0.002739 \n",
- "1 None 0.991459 \n",
- "2 painter 0.506127 \n",
- "3 theologian 0.612694 \n",
- "4 painter 0.488917 "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "clusters.as_pandas_dataframe(limit=5)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 3, root rows count 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " df_predict\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:02.352872Z",
+ "iopub.status.busy": "2024-06-07T09:11:02.352366Z",
+ "iopub.status.idle": "2024-06-07T09:11:02.367858Z",
+ "shell.execute_reply": "2024-06-07T09:11:02.367179Z"
+ }
+ },
+ "outputs": [
{
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "These results can then be passed into the `Cluster Studio Dashboard`.\n"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cluster_id | \n",
+ " unique_id | \n",
+ " cluster | \n",
+ " full_name | \n",
+ " first_and_surname | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " birth_place | \n",
+ " postcode_fake | \n",
+ " gender | \n",
+ " occupation | \n",
+ " __splink_salt | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Q16025107-1 | \n",
+ " Q5497940-9 | \n",
+ " Q5497940 | \n",
+ " frederick hall | \n",
+ " frederick hall | \n",
+ " frederick | \n",
+ " hall | \n",
+ " 1855-01-01 | \n",
+ " bristol, city of | \n",
+ " bs11 9pn | \n",
+ " None | \n",
+ " None | \n",
+ " 0.002739 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Q1149445-1 | \n",
+ " Q1149445-9 | \n",
+ " Q1149445 | \n",
+ " earl egerton | \n",
+ " earl egerton | \n",
+ " earl | \n",
+ " egerton | \n",
+ " 1800-01-01 | \n",
+ " westminster | \n",
+ " w1d 2hf | \n",
+ " None | \n",
+ " None | \n",
+ " 0.991459 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Q20664532-1 | \n",
+ " Q21466387-2 | \n",
+ " Q21466387 | \n",
+ " harry brooker | \n",
+ " harry brooker | \n",
+ " harry | \n",
+ " brooker | \n",
+ " 1848-01-01 | \n",
+ " plymouth | \n",
+ " pl4 9hx | \n",
+ " male | \n",
+ " painter | \n",
+ " 0.506127 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Q1124636-1 | \n",
+ " Q1124636-12 | \n",
+ " Q1124636 | \n",
+ " tom stapleton | \n",
+ " tom stapleton | \n",
+ " tom | \n",
+ " stapleton | \n",
+ " 1535-01-01 | \n",
+ " None | \n",
+ " bn6 9na | \n",
+ " male | \n",
+ " theologian | \n",
+ " 0.612694 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Q18508292-1 | \n",
+ " Q21466711-4 | \n",
+ " Q21466711 | \n",
+ " harry s0ence | \n",
+ " harry s0ence | \n",
+ " harry | \n",
+ " s0ence | \n",
+ " 1860-01-01 | \n",
+ " london | \n",
+ " se1 7pb | \n",
+ " male | \n",
+ " painter | \n",
+ " 0.488917 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cluster_id unique_id cluster full_name first_and_surname \\\n",
+ "0 Q16025107-1 Q5497940-9 Q5497940 frederick hall frederick hall \n",
+ "1 Q1149445-1 Q1149445-9 Q1149445 earl egerton earl egerton \n",
+ "2 Q20664532-1 Q21466387-2 Q21466387 harry brooker harry brooker \n",
+ "3 Q1124636-1 Q1124636-12 Q1124636 tom stapleton tom stapleton \n",
+ "4 Q18508292-1 Q21466711-4 Q21466711 harry s0ence harry s0ence \n",
+ "\n",
+ " first_name surname dob birth_place postcode_fake gender \\\n",
+ "0 frederick hall 1855-01-01 bristol, city of bs11 9pn None \n",
+ "1 earl egerton 1800-01-01 westminster w1d 2hf None \n",
+ "2 harry brooker 1848-01-01 plymouth pl4 9hx male \n",
+ "3 tom stapleton 1535-01-01 None bn6 9na male \n",
+ "4 harry s0ence 1860-01-01 london se1 7pb male \n",
+ "\n",
+ " occupation __splink_salt \n",
+ "0 None 0.002739 \n",
+ "1 None 0.991459 \n",
+ "2 painter 0.506127 \n",
+ "3 theologian 0.612694 \n",
+ "4 painter 0.488917 "
]
- },
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clusters.as_pandas_dataframe(limit=5)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "These results can then be passed into the `Cluster Studio Dashboard`.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:02.371850Z",
+ "iopub.status.busy": "2024-06-07T09:11:02.371545Z",
+ "iopub.status.idle": "2024-06-07T09:11:02.462645Z",
+ "shell.execute_reply": "2024-06-07T09:11:02.461886Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:02.371850Z",
- "iopub.status.busy": "2024-06-07T09:11:02.371545Z",
- "iopub.status.idle": "2024-06-07T09:11:02.462645Z",
- "shell.execute_reply": "2024-06-07T09:11:02.461886Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
],
- "source": [
- "linker.visualisations.cluster_studio_dashboard(\n",
- " df_predict,\n",
- " clusters,\n",
- " \"dashboards/50k_deterministic_cluster.html\",\n",
- " sampling_method=\"by_cluster_size\",\n",
- " overwrite=True,\n",
- ")\n",
- "\n",
- "from IPython.display import IFrame\n",
- "\n",
- "IFrame(src=\"./dashboards/50k_deterministic_cluster.html\", width=\"100%\", height=1200)"
+ "text/plain": [
+ ""
]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "linker.visualisations.cluster_studio_dashboard(\n",
+ " df_predict,\n",
+ " clusters,\n",
+ " \"dashboards/50k_deterministic_cluster.html\",\n",
+ " sampling_method=\"by_cluster_size\",\n",
+ " overwrite=True,\n",
+ ")\n",
+ "\n",
+ "from IPython.display import IFrame\n",
+ "\n",
+ "IFrame(src=\"./dashboards/50k_deterministic_cluster.html\", width=\"100%\", height=1200)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/docs/demos/examples/duckdb/febrl3.ipynb b/docs/demos/examples/duckdb/febrl3.ipynb
index f10413ccc7..a0d7418b3c 100644
--- a/docs/demos/examples/duckdb/febrl3.ipynb
+++ b/docs/demos/examples/duckdb/febrl3.ipynb
@@ -1,1630 +1,1633 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Deduplicating the febrl3 dataset\n",
- "\n",
- "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Deduplicating the febrl3 dataset\n",
+ "\n",
+ "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:24.420657Z",
+ "iopub.status.busy": "2024-06-07T09:11:24.420336Z",
+ "iopub.status.idle": "2024-06-07T09:11:24.443364Z",
+ "shell.execute_reply": "2024-06-07T09:11:24.442120Z"
},
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:24.420657Z",
- "iopub.status.busy": "2024-06-07T09:11:24.420336Z",
- "iopub.status.idle": "2024-06-07T09:11:24.443364Z",
- "shell.execute_reply": "2024-06-07T09:11:24.442120Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:24.447798Z",
+ "iopub.status.busy": "2024-06-07T09:11:24.447495Z",
+ "iopub.status.idle": "2024-06-07T09:11:26.149918Z",
+ "shell.execute_reply": "2024-06-07T09:11:26.149230Z"
},
+ "tags": [
+ "hide_output"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "from splink.datasets import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.febrl3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:24.447798Z",
- "iopub.status.busy": "2024-06-07T09:11:24.447495Z",
- "iopub.status.idle": "2024-06-07T09:11:26.149918Z",
- "shell.execute_reply": "2024-06-07T09:11:26.149230Z"
- },
- "tags": [
- "hide_output"
- ]
- },
- "outputs": [],
- "source": [
- "from splink.datasets import splink_datasets\n",
- "\n",
- "df = splink_datasets.febrl3"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " rec_id | \n",
+ " given_name | \n",
+ " surname | \n",
+ " street_number | \n",
+ " address_1 | \n",
+ " address_2 | \n",
+ " suburb | \n",
+ " postcode | \n",
+ " state | \n",
+ " date_of_birth | \n",
+ " soc_sec_id | \n",
+ " cluster | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " rec-1496-org | \n",
+ " mitchell | \n",
+ " green | \n",
+ " 7 | \n",
+ " wallaby place | \n",
+ " delmar | \n",
+ " cleveland | \n",
+ " 2119 | \n",
+ " sa | \n",
+ " 19560409 | \n",
+ " 1804974 | \n",
+ " rec-1496 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " rec-552-dup-3 | \n",
+ " harley | \n",
+ " mccarthy | \n",
+ " 177 | \n",
+ " pridhamstreet | \n",
+ " milton | \n",
+ " marsden | \n",
+ " 3165 | \n",
+ " nsw | \n",
+ " 19080419 | \n",
+ " 6089216 | \n",
+ " rec-552 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " rec_id given_name surname street_number address_1 \\\n",
+ "0 rec-1496-org mitchell green 7 wallaby place \n",
+ "1 rec-552-dup-3 harley mccarthy 177 pridhamstreet \n",
+ "\n",
+ " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n",
+ "0 delmar cleveland 2119 sa 19560409 1804974 rec-1496 \n",
+ "1 milton marsden 3165 nsw 19080419 6089216 rec-552 "
]
- },
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df.rename(columns=lambda x: x.strip())\n",
+ "\n",
+ "df[\"cluster\"] = df[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n",
+ "\n",
+ "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n",
+ "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()\n",
+ "\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:26.153666Z",
+ "iopub.status.busy": "2024-06-07T09:11:26.153378Z",
+ "iopub.status.idle": "2024-06-07T09:11:26.160666Z",
+ "shell.execute_reply": "2024-06-07T09:11:26.159911Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n",
+ "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:26.164000Z",
+ "iopub.status.busy": "2024-06-07T09:11:26.163726Z",
+ "iopub.status.idle": "2024-06-07T09:11:26.170794Z",
+ "shell.execute_reply": "2024-06-07T09:11:26.170146Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n",
+ "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:26.174301Z",
+ "iopub.status.busy": "2024-06-07T09:11:26.174024Z",
+ "iopub.status.idle": "2024-06-07T09:11:26.331196Z",
+ "shell.execute_reply": "2024-06-07T09:11:26.330465Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import DuckDBAPI, Linker, SettingsCreator\n",
+ "\n",
+ "# TODO: Allow missingness to be analysed without a linker\n",
+ "settings = SettingsCreator(\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ " link_type=\"dedupe_only\",\n",
+ ")\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:26.334644Z",
+ "iopub.status.busy": "2024-06-07T09:11:26.334398Z",
+ "iopub.status.idle": "2024-06-07T09:11:26.630134Z",
+ "shell.execute_reply": "2024-06-07T09:11:26.629629Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rec_id | \n",
- " given_name | \n",
- " surname | \n",
- " street_number | \n",
- " address_1 | \n",
- " address_2 | \n",
- " suburb | \n",
- " postcode | \n",
- " state | \n",
- " date_of_birth | \n",
- " soc_sec_id | \n",
- " cluster | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " rec-1496-org | \n",
- " mitchell | \n",
- " green | \n",
- " 7 | \n",
- " wallaby place | \n",
- " delmar | \n",
- " cleveland | \n",
- " 2119 | \n",
- " sa | \n",
- " 19560409 | \n",
- " 1804974 | \n",
- " rec-1496 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " rec-552-dup-3 | \n",
- " harley | \n",
- " mccarthy | \n",
- " 177 | \n",
- " pridhamstreet | \n",
- " milton | \n",
- " marsden | \n",
- " 3165 | \n",
- " nsw | \n",
- " 19080419 | \n",
- " 6089216 | \n",
- " rec-552 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " rec_id given_name surname street_number address_1 \\\n",
- "0 rec-1496-org mitchell green 7 wallaby place \n",
- "1 rec-552-dup-3 harley mccarthy 177 pridhamstreet \n",
- "\n",
- " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n",
- "0 delmar cleveland 2119 sa 19560409 1804974 rec-1496 \n",
- "1 milton marsden 3165 nsw 19080419 6089216 rec-552 "
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "df = df.rename(columns=lambda x: x.strip())\n",
- "\n",
- "df[\"cluster\"] = df[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n",
- "\n",
- "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n",
- "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()\n",
- "\n",
- "df.head(2)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.exploratory import completeness_chart\n",
+ "\n",
+ "completeness_chart(df_sdf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:26.633200Z",
+ "iopub.status.busy": "2024-06-07T09:11:26.632979Z",
+ "iopub.status.idle": "2024-06-07T09:11:27.047469Z",
+ "shell.execute_reply": "2024-06-07T09:11:27.046951Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:26.153666Z",
- "iopub.status.busy": "2024-06-07T09:11:26.153378Z",
- "iopub.status.idle": "2024-06-07T09:11:26.160666Z",
- "shell.execute_reply": "2024-06-07T09:11:26.159911Z"
- }
- },
- "outputs": [],
- "source": [
- "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n",
- "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.exploratory import profile_columns\n",
+ "\n",
+ "profile_columns(df_sdf, column_expressions=[\"given_name\", \"surname\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:27.050491Z",
+ "iopub.status.busy": "2024-06-07T09:11:27.050266Z",
+ "iopub.status.idle": "2024-06-07T09:11:27.428593Z",
+ "shell.execute_reply": "2024-06-07T09:11:27.428055Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:26.164000Z",
- "iopub.status.busy": "2024-06-07T09:11:26.163726Z",
- "iopub.status.idle": "2024-06-07T09:11:26.170794Z",
- "shell.execute_reply": "2024-06-07T09:11:26.170146Z"
- }
- },
- "outputs": [],
- "source": [
- "df[\"date_of_birth\"] = df[\"date_of_birth\"].astype(str).str.strip()\n",
- "df[\"soc_sec_id\"] = df[\"soc_sec_id\"].astype(str).str.strip()"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import DuckDBAPI, block_on\n",
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "blocking_rules = [\n",
+ " block_on(\"soc_sec_id\"),\n",
+ " block_on(\"given_name\"),\n",
+ " block_on(\"surname\"),\n",
+ " block_on(\"date_of_birth\"),\n",
+ " block_on(\"postcode\"),\n",
+ "]\n",
+ "\n",
+ "\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " df_sdf,\n",
+ " blocking_rules=blocking_rules,\n",
+ " link_type=\"dedupe_only\",\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:27.431702Z",
+ "iopub.status.busy": "2024-06-07T09:11:27.431466Z",
+ "iopub.status.idle": "2024-06-07T09:11:27.591229Z",
+ "shell.execute_reply": "2024-06-07T09:11:27.590491Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "from splink import Linker\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ " link_type=\"dedupe_only\",\n",
+ " blocking_rules_to_generate_predictions=blocking_rules,\n",
+ " comparisons=[\n",
+ " cl.NameComparison(\"given_name\"),\n",
+ " cl.NameComparison(\"surname\"),\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"date_of_birth\",\n",
+ " input_is_string=True,\n",
+ " datetime_format=\"%Y%m%d\",\n",
+ " ),\n",
+ " cl.DamerauLevenshteinAtThresholds(\"soc_sec_id\", [2]),\n",
+ " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n",
+ " cl.ExactMatch(\"postcode\").configure(term_frequency_adjustments=True),\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:27.594493Z",
+ "iopub.status.busy": "2024-06-07T09:11:27.594264Z",
+ "iopub.status.idle": "2024-06-07T09:11:27.787352Z",
+ "shell.execute_reply": "2024-06-07T09:11:27.786769Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:26.174301Z",
- "iopub.status.busy": "2024-06-07T09:11:26.174024Z",
- "iopub.status.idle": "2024-06-07T09:11:26.331196Z",
- "shell.execute_reply": "2024-06-07T09:11:26.330465Z"
- }
- },
- "outputs": [],
- "source": [
- "from splink import DuckDBAPI, Linker, SettingsCreator\n",
- "\n",
- "# TODO: Allow missingness to be analysed without a linker\n",
- "settings = SettingsCreator(\n",
- " unique_id_column_name=\"rec_id\",\n",
- " link_type=\"dedupe_only\",\n",
- ")\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.000528.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 1,893.56 are expected to match. With 12,497,500 total possible comparisons, we expect a total of around 6,600.00 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "from splink import block_on\n",
+ "\n",
+ "deterministic_rules = [\n",
+ " block_on(\"soc_sec_id\"),\n",
+ " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n",
+ " \"l.given_name = r.surname and l.surname = r.given_name and l.date_of_birth = r.date_of_birth\",\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.9\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:27.790368Z",
+ "iopub.status.busy": "2024-06-07T09:11:27.790145Z",
+ "iopub.status.idle": "2024-06-07T09:11:35.433199Z",
+ "shell.execute_reply": "2024-06-07T09:11:35.431006Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing:\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:26.334644Z",
- "iopub.status.busy": "2024-06-07T09:11:26.334398Z",
- "iopub.status.idle": "2024-06-07T09:11:26.630134Z",
- "shell.execute_reply": "2024-06-07T09:11:26.629629Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "80ec855655d34fb49588ee24a928ae25",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink.exploratory import completeness_chart\n",
- "\n",
- "completeness_chart(df, db_api=DuckDBAPI())"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:26.633200Z",
- "iopub.status.busy": "2024-06-07T09:11:26.632979Z",
- "iopub.status.idle": "2024-06-07T09:11:27.047469Z",
- "shell.execute_reply": "2024-06-07T09:11:27.046951Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink.exploratory import profile_columns\n",
- "\n",
- "profile_columns(df, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - given_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n",
+ " - soc_sec_id (no m values are trained).\n",
+ " - street_number (no m values are trained).\n",
+ " - postcode (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:35.446472Z",
+ "iopub.status.busy": "2024-06-07T09:11:35.440198Z",
+ "iopub.status.idle": "2024-06-07T09:11:36.895235Z",
+ "shell.execute_reply": "2024-06-07T09:11:36.894603Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:27.050491Z",
- "iopub.status.busy": "2024-06-07T09:11:27.050266Z",
- "iopub.status.idle": "2024-06-07T09:11:27.428593Z",
- "shell.execute_reply": "2024-06-07T09:11:27.428055Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink import DuckDBAPI, block_on\n",
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "blocking_rules = [\n",
- " block_on(\"soc_sec_id\"),\n",
- " block_on(\"given_name\"),\n",
- " block_on(\"surname\"),\n",
- " block_on(\"date_of_birth\"),\n",
- " block_on(\"postcode\"),\n",
- "]\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=df,\n",
- " blocking_rules=blocking_rules,\n",
- " db_api=db_api,\n",
- " link_type=\"dedupe_only\",\n",
- " unique_id_column_name=\"rec_id\",\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"date_of_birth\" = r.\"date_of_birth\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - given_name\n",
+ " - surname\n",
+ " - soc_sec_id\n",
+ " - street_number\n",
+ " - postcode\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - date_of_birth\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.376 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 2: Largest change in params was 0.0156 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 0.000699 in the m_probability of postcode, level `All other comparisons`\n",
+ "Iteration 4: Largest change in params was -3.77e-05 in the m_probability of postcode, level `Exact match on postcode`\n",
+ "\n",
+ "EM converged after 4 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "em_blocking_rule_1 = block_on(\"date_of_birth\")\n",
+ "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " em_blocking_rule_1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:36.898638Z",
+ "iopub.status.busy": "2024-06-07T09:11:36.898156Z",
+ "iopub.status.idle": "2024-06-07T09:11:37.517318Z",
+ "shell.execute_reply": "2024-06-07T09:11:37.516459Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:27.431702Z",
- "iopub.status.busy": "2024-06-07T09:11:27.431466Z",
- "iopub.status.idle": "2024-06-07T09:11:27.591229Z",
- "shell.execute_reply": "2024-06-07T09:11:27.590491Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_library as cl\n",
- "\n",
- "from splink import Linker\n",
- "\n",
- "settings = SettingsCreator(\n",
- " unique_id_column_name=\"rec_id\",\n",
- " link_type=\"dedupe_only\",\n",
- " blocking_rules_to_generate_predictions=blocking_rules,\n",
- " comparisons=[\n",
- " cl.NameComparison(\"given_name\"),\n",
- " cl.NameComparison(\"surname\"),\n",
- " cl.DateOfBirthComparison(\n",
- " \"date_of_birth\",\n",
- " input_is_string=True,\n",
- " datetime_format=\"%Y%m%d\",\n",
- " ),\n",
- " cl.DamerauLevenshteinAtThresholds(\"soc_sec_id\", [2]),\n",
- " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n",
- " cl.ExactMatch(\"postcode\").configure(term_frequency_adjustments=True),\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"postcode\" = r.\"postcode\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - given_name\n",
+ " - surname\n",
+ " - date_of_birth\n",
+ " - soc_sec_id\n",
+ " - street_number\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - postcode\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.0681 in probability_two_random_records_match\n",
+ "Iteration 2: Largest change in params was -0.00185 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n",
+ "Iteration 3: Largest change in params was -5.7e-05 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n",
+ "\n",
+ "EM converged after 3 iterations\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - date_of_birth (some u values are not trained, some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "em_blocking_rule_2 = block_on(\"postcode\")\n",
+ "session_postcode = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " em_blocking_rule_2\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:37.523135Z",
+ "iopub.status.busy": "2024-06-07T09:11:37.522810Z",
+ "iopub.status.idle": "2024-06-07T09:11:37.957335Z",
+ "shell.execute_reply": "2024-06-07T09:11:37.956712Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:27.594493Z",
- "iopub.status.busy": "2024-06-07T09:11:27.594264Z",
- "iopub.status.idle": "2024-06-07T09:11:27.787352Z",
- "shell.execute_reply": "2024-06-07T09:11:27.786769Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.000528.\n",
- "This means that amongst all possible pairwise record comparisons, one in 1,893.56 are expected to match. With 12,497,500 total possible comparisons, we expect a total of around 6,600.00 matching pairs\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import block_on\n",
- "\n",
- "deterministic_rules = [\n",
- " block_on(\"soc_sec_id\"),\n",
- " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n",
- " \"l.given_name = r.surname and l.surname = r.given_name and l.date_of_birth = r.date_of_birth\",\n",
- "]\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(\n",
- " deterministic_rules, recall=0.9\n",
- ")"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:37.960629Z",
+ "iopub.status.busy": "2024-06-07T09:11:37.960358Z",
+ "iopub.status.idle": "2024-06-07T09:11:44.496784Z",
+ "shell.execute_reply": "2024-06-07T09:11:44.496254Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:27.790368Z",
- "iopub.status.busy": "2024-06-07T09:11:27.790145Z",
- "iopub.status.idle": "2024-06-07T09:11:35.433199Z",
- "shell.execute_reply": "2024-06-07T09:11:35.431006Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7317c56423e44b84abdfb32562eda774",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
- "----- Estimating u probabilities using random sampling -----\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "80ec855655d34fb49588ee24a928ae25",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - given_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - date_of_birth (some u values are not trained, no m values are trained).\n",
- " - soc_sec_id (no m values are trained).\n",
- " - street_number (no m values are trained).\n",
- " - postcode (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:35.446472Z",
- "iopub.status.busy": "2024-06-07T09:11:35.440198Z",
- "iopub.status.idle": "2024-06-07T09:11:36.895235Z",
- "shell.execute_reply": "2024-06-07T09:11:36.894603Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"date_of_birth\" = r.\"date_of_birth\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - given_name\n",
- " - surname\n",
- " - soc_sec_id\n",
- " - street_number\n",
- " - postcode\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - date_of_birth\n",
- "\n",
- "Iteration 1: Largest change in params was -0.376 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 2: Largest change in params was 0.0156 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 0.000699 in the m_probability of postcode, level `All other comparisons`\n",
- "Iteration 4: Largest change in params was -3.77e-05 in the m_probability of postcode, level `Exact match on postcode`\n",
- "\n",
- "EM converged after 4 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - date_of_birth (some u values are not trained, no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "em_blocking_rule_1 = block_on(\"date_of_birth\")\n",
- "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " em_blocking_rule_1\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = linker.inference.predict(threshold_match_probability=0.2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:44.499943Z",
+ "iopub.status.busy": "2024-06-07T09:11:44.499693Z",
+ "iopub.status.idle": "2024-06-07T09:11:47.310831Z",
+ "shell.execute_reply": "2024-06-07T09:11:47.310208Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:36.898638Z",
- "iopub.status.busy": "2024-06-07T09:11:36.898156Z",
- "iopub.status.idle": "2024-06-07T09:11:37.517318Z",
- "shell.execute_reply": "2024-06-07T09:11:37.516459Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1c4e16cfc8fc4df7bbdd87024c2d86cf",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"postcode\" = r.\"postcode\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - given_name\n",
- " - surname\n",
- " - date_of_birth\n",
- " - soc_sec_id\n",
- " - street_number\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - postcode\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was 0.0681 in probability_two_random_records_match\n",
- "Iteration 2: Largest change in params was -0.00185 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n",
- "Iteration 3: Largest change in params was -5.7e-05 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n",
- "\n",
- "EM converged after 3 iterations\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - date_of_birth (some u values are not trained, some m values are not trained).\n"
- ]
- }
- ],
- "source": [
- "em_blocking_rule_2 = block_on(\"postcode\")\n",
- "session_postcode = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " em_blocking_rule_2\n",
- ")"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:37.523135Z",
- "iopub.status.busy": "2024-06-07T09:11:37.522810Z",
- "iopub.status.idle": "2024-06-07T09:11:37.957335Z",
- "shell.execute_reply": "2024-06-07T09:11:37.956712Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.match_weights_chart()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:37.960629Z",
- "iopub.status.busy": "2024-06-07T09:11:37.960358Z",
- "iopub.status.idle": "2024-06-07T09:11:44.496784Z",
- "shell.execute_reply": "2024-06-07T09:11:44.496254Z"
- }
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "7317c56423e44b84abdfb32562eda774",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "results = linker.inference.predict(threshold_match_probability=0.2)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_column(\n",
+ " \"cluster\", match_weight_round_to_nearest=0.1, output_type=\"accuracy\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:47.319625Z",
+ "iopub.status.busy": "2024-06-07T09:11:47.319347Z",
+ "iopub.status.idle": "2024-06-07T09:11:47.588558Z",
+ "shell.execute_reply": "2024-06-07T09:11:47.587940Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:44.499943Z",
- "iopub.status.busy": "2024-06-07T09:11:44.499693Z",
- "iopub.status.idle": "2024-06-07T09:11:47.310831Z",
- "shell.execute_reply": "2024-06-07T09:11:47.310208Z"
- }
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "1c4e16cfc8fc4df7bbdd87024c2d86cf",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_column(\n",
- " \"cluster\", match_weight_round_to_nearest=0.1, output_type=\"accuracy\"\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:47.319625Z",
- "iopub.status.busy": "2024-06-07T09:11:47.319347Z",
- "iopub.status.idle": "2024-06-07T09:11:47.588558Z",
- "shell.execute_reply": "2024-06-07T09:11:47.587940Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " clerical_match_score | \n",
- " found_by_blocking_rules | \n",
- " match_weight | \n",
- " match_probability | \n",
- " rec_id_l | \n",
- " rec_id_r | \n",
- " given_name_l | \n",
- " given_name_r | \n",
- " gamma_given_name | \n",
- " tf_given_name_l | \n",
- " ... | \n",
- " postcode_l | \n",
- " postcode_r | \n",
- " gamma_postcode | \n",
- " tf_postcode_l | \n",
- " tf_postcode_r | \n",
- " bf_postcode | \n",
- " bf_tf_adj_postcode | \n",
- " cluster_l | \n",
- " cluster_r | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " False | \n",
- " -27.805731 | \n",
- " 4.262268e-09 | \n",
- " rec-993-dup-1 | \n",
- " rec-993-dup-3 | \n",
- " westbrook | \n",
- " jake | \n",
- " 0 | \n",
- " 0.0004 | \n",
- " ... | \n",
- " 2704 | \n",
- " 2074 | \n",
- " 0 | \n",
- " 0.0002 | \n",
- " 0.0014 | \n",
- " 0.230173 | \n",
- " 1.0 | \n",
- " rec-993 | \n",
- " rec-993 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " False | \n",
- " -27.805731 | \n",
- " 4.262268e-09 | \n",
- " rec-829-dup-0 | \n",
- " rec-829-dup-2 | \n",
- " wilde | \n",
- " kyra | \n",
- " 0 | \n",
- " 0.0002 | \n",
- " ... | \n",
- " 3859 | \n",
- " 3595 | \n",
- " 0 | \n",
- " 0.0004 | \n",
- " 0.0006 | \n",
- " 0.230173 | \n",
- " 1.0 | \n",
- " rec-829 | \n",
- " rec-829 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " False | \n",
- " -19.717877 | \n",
- " 1.159651e-06 | \n",
- " rec-829-dup-0 | \n",
- " rec-829-dup-1 | \n",
- " wilde | \n",
- " kyra | \n",
- " 0 | \n",
- " 0.0002 | \n",
- " ... | \n",
- " 3859 | \n",
- " 3889 | \n",
- " 0 | \n",
- " 0.0004 | \n",
- " 0.0002 | \n",
- " 0.230173 | \n",
- " 1.0 | \n",
- " rec-829 | \n",
- " rec-829 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " True | \n",
- " -15.453190 | \n",
- " 2.229034e-05 | \n",
- " rec-721-dup-0 | \n",
- " rec-721-dup-1 | \n",
- " mikhaili | \n",
- " elly | \n",
- " 0 | \n",
- " 0.0008 | \n",
- " ... | \n",
- " 4806 | \n",
- " 4860 | \n",
- " 0 | \n",
- " 0.0008 | \n",
- " 0.0014 | \n",
- " 0.230173 | \n",
- " 1.0 | \n",
- " rec-721 | \n",
- " rec-721 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " True | \n",
- " -12.931781 | \n",
- " 1.279648e-04 | \n",
- " rec-401-dup-1 | \n",
- " rec-401-dup-3 | \n",
- " whitbe | \n",
- " alexa-ose | \n",
- " 0 | \n",
- " 0.0002 | \n",
- " ... | \n",
- " 3040 | \n",
- " 3041 | \n",
- " 0 | \n",
- " 0.0020 | \n",
- " 0.0004 | \n",
- " 0.230173 | \n",
- " 1.0 | \n",
- " rec-401 | \n",
- " rec-401 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 45 columns
\n",
- "
"
- ],
- "text/plain": [
- " clerical_match_score found_by_blocking_rules match_weight \\\n",
- "0 1.0 False -27.805731 \n",
- "1 1.0 False -27.805731 \n",
- "2 1.0 False -19.717877 \n",
- "3 1.0 True -15.453190 \n",
- "4 1.0 True -12.931781 \n",
- "\n",
- " match_probability rec_id_l rec_id_r given_name_l given_name_r \\\n",
- "0 4.262268e-09 rec-993-dup-1 rec-993-dup-3 westbrook jake \n",
- "1 4.262268e-09 rec-829-dup-0 rec-829-dup-2 wilde kyra \n",
- "2 1.159651e-06 rec-829-dup-0 rec-829-dup-1 wilde kyra \n",
- "3 2.229034e-05 rec-721-dup-0 rec-721-dup-1 mikhaili elly \n",
- "4 1.279648e-04 rec-401-dup-1 rec-401-dup-3 whitbe alexa-ose \n",
- "\n",
- " gamma_given_name tf_given_name_l ... postcode_l postcode_r \\\n",
- "0 0 0.0004 ... 2704 2074 \n",
- "1 0 0.0002 ... 3859 3595 \n",
- "2 0 0.0002 ... 3859 3889 \n",
- "3 0 0.0008 ... 4806 4860 \n",
- "4 0 0.0002 ... 3040 3041 \n",
- "\n",
- " gamma_postcode tf_postcode_l tf_postcode_r bf_postcode \\\n",
- "0 0 0.0002 0.0014 0.230173 \n",
- "1 0 0.0004 0.0006 0.230173 \n",
- "2 0 0.0004 0.0002 0.230173 \n",
- "3 0 0.0008 0.0014 0.230173 \n",
- "4 0 0.0020 0.0004 0.230173 \n",
- "\n",
- " bf_tf_adj_postcode cluster_l cluster_r match_key \n",
- "0 1.0 rec-993 rec-993 5 \n",
- "1 1.0 rec-829 rec-829 5 \n",
- "2 1.0 rec-829 rec-829 5 \n",
- "3 1.0 rec-721 rec-721 2 \n",
- "4 1.0 rec-401 rec-401 0 \n",
- "\n",
- "[5 rows x 45 columns]"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " clerical_match_score | \n",
+ " found_by_blocking_rules | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " rec_id_l | \n",
+ " rec_id_r | \n",
+ " given_name_l | \n",
+ " given_name_r | \n",
+ " gamma_given_name | \n",
+ " tf_given_name_l | \n",
+ " ... | \n",
+ " postcode_l | \n",
+ " postcode_r | \n",
+ " gamma_postcode | \n",
+ " tf_postcode_l | \n",
+ " tf_postcode_r | \n",
+ " bf_postcode | \n",
+ " bf_tf_adj_postcode | \n",
+ " cluster_l | \n",
+ " cluster_r | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -27.805731 | \n",
+ " 4.262268e-09 | \n",
+ " rec-993-dup-1 | \n",
+ " rec-993-dup-3 | \n",
+ " westbrook | \n",
+ " jake | \n",
+ " 0 | \n",
+ " 0.0004 | \n",
+ " ... | \n",
+ " 2704 | \n",
+ " 2074 | \n",
+ " 0 | \n",
+ " 0.0002 | \n",
+ " 0.0014 | \n",
+ " 0.230173 | \n",
+ " 1.0 | \n",
+ " rec-993 | \n",
+ " rec-993 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -27.805731 | \n",
+ " 4.262268e-09 | \n",
+ " rec-829-dup-0 | \n",
+ " rec-829-dup-2 | \n",
+ " wilde | \n",
+ " kyra | \n",
+ " 0 | \n",
+ " 0.0002 | \n",
+ " ... | \n",
+ " 3859 | \n",
+ " 3595 | \n",
+ " 0 | \n",
+ " 0.0004 | \n",
+ " 0.0006 | \n",
+ " 0.230173 | \n",
+ " 1.0 | \n",
+ " rec-829 | \n",
+ " rec-829 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " -19.717877 | \n",
+ " 1.159651e-06 | \n",
+ " rec-829-dup-0 | \n",
+ " rec-829-dup-1 | \n",
+ " wilde | \n",
+ " kyra | \n",
+ " 0 | \n",
+ " 0.0002 | \n",
+ " ... | \n",
+ " 3859 | \n",
+ " 3889 | \n",
+ " 0 | \n",
+ " 0.0004 | \n",
+ " 0.0002 | \n",
+ " 0.230173 | \n",
+ " 1.0 | \n",
+ " rec-829 | \n",
+ " rec-829 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " True | \n",
+ " -15.453190 | \n",
+ " 2.229034e-05 | \n",
+ " rec-721-dup-0 | \n",
+ " rec-721-dup-1 | \n",
+ " mikhaili | \n",
+ " elly | \n",
+ " 0 | \n",
+ " 0.0008 | \n",
+ " ... | \n",
+ " 4806 | \n",
+ " 4860 | \n",
+ " 0 | \n",
+ " 0.0008 | \n",
+ " 0.0014 | \n",
+ " 0.230173 | \n",
+ " 1.0 | \n",
+ " rec-721 | \n",
+ " rec-721 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1.0 | \n",
+ " True | \n",
+ " -12.931781 | \n",
+ " 1.279648e-04 | \n",
+ " rec-401-dup-1 | \n",
+ " rec-401-dup-3 | \n",
+ " whitbe | \n",
+ " alexa-ose | \n",
+ " 0 | \n",
+ " 0.0002 | \n",
+ " ... | \n",
+ " 3040 | \n",
+ " 3041 | \n",
+ " 0 | \n",
+ " 0.0020 | \n",
+ " 0.0004 | \n",
+ " 0.230173 | \n",
+ " 1.0 | \n",
+ " rec-401 | \n",
+ " rec-401 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 45 columns
\n",
+ "
"
],
- "source": [
- "pred_errors_df = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\"\n",
- ").as_pandas_dataframe()\n",
- "len(pred_errors_df)\n",
- "pred_errors_df.head()"
+ "text/plain": [
+ " clerical_match_score found_by_blocking_rules match_weight \\\n",
+ "0 1.0 False -27.805731 \n",
+ "1 1.0 False -27.805731 \n",
+ "2 1.0 False -19.717877 \n",
+ "3 1.0 True -15.453190 \n",
+ "4 1.0 True -12.931781 \n",
+ "\n",
+ " match_probability rec_id_l rec_id_r given_name_l given_name_r \\\n",
+ "0 4.262268e-09 rec-993-dup-1 rec-993-dup-3 westbrook jake \n",
+ "1 4.262268e-09 rec-829-dup-0 rec-829-dup-2 wilde kyra \n",
+ "2 1.159651e-06 rec-829-dup-0 rec-829-dup-1 wilde kyra \n",
+ "3 2.229034e-05 rec-721-dup-0 rec-721-dup-1 mikhaili elly \n",
+ "4 1.279648e-04 rec-401-dup-1 rec-401-dup-3 whitbe alexa-ose \n",
+ "\n",
+ " gamma_given_name tf_given_name_l ... postcode_l postcode_r \\\n",
+ "0 0 0.0004 ... 2704 2074 \n",
+ "1 0 0.0002 ... 3859 3595 \n",
+ "2 0 0.0002 ... 3859 3889 \n",
+ "3 0 0.0008 ... 4806 4860 \n",
+ "4 0 0.0002 ... 3040 3041 \n",
+ "\n",
+ " gamma_postcode tf_postcode_l tf_postcode_r bf_postcode \\\n",
+ "0 0 0.0002 0.0014 0.230173 \n",
+ "1 0 0.0004 0.0006 0.230173 \n",
+ "2 0 0.0004 0.0002 0.230173 \n",
+ "3 0 0.0008 0.0014 0.230173 \n",
+ "4 0 0.0020 0.0004 0.230173 \n",
+ "\n",
+ " bf_tf_adj_postcode cluster_l cluster_r match_key \n",
+ "0 1.0 rec-993 rec-993 5 \n",
+ "1 1.0 rec-829 rec-829 5 \n",
+ "2 1.0 rec-829 rec-829 5 \n",
+ "3 1.0 rec-721 rec-721 2 \n",
+ "4 1.0 rec-401 rec-401 0 \n",
+ "\n",
+ "[5 rows x 45 columns]"
]
- },
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pred_errors_df = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\"\n",
+ ").as_pandas_dataframe()\n",
+ "len(pred_errors_df)\n",
+ "pred_errors_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following chart seems to suggest that, where the model is making errors, it's because the data is corrupted beyond recognition and no reasonable linkage model could find these matches."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:11:47.591674Z",
+ "iopub.status.busy": "2024-06-07T09:11:47.591437Z",
+ "iopub.status.idle": "2024-06-07T09:11:48.630581Z",
+ "shell.execute_reply": "2024-06-07T09:11:48.629955Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following chart seems to suggest that, where the model is making errors, it's because the data is corrupted beyond recognition and no reasonable linkage model could find these matches"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:11:47.591674Z",
- "iopub.status.busy": "2024-06-07T09:11:47.591437Z",
- "iopub.status.idle": "2024-06-07T09:11:48.630581Z",
- "shell.execute_reply": "2024-06-07T09:11:48.629955Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "records = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\"\n",
- ").as_record_dict(limit=10)\n",
- "linker.visualisations.waterfall_chart(records)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "94aaeff2f888492ea321d4e4492526ff": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_bdf3a462cd3d48bda4269ac1cc8ed9ef",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_e05a7090510949ac956ea05719a3b8c2",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "b179423ef9d24cb1ac973b4b55daa86c": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "bdf3a462cd3d48bda4269ac1cc8ed9ef": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "db3fd6bdb9884f5a88fd4cf5d39330d4": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "e05a7090510949ac956ea05719a3b8c2": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "e181cb7618b74e4bbf9f2e144b68b87e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_b179423ef9d24cb1ac973b4b55daa86c",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_db3fd6bdb9884f5a88fd4cf5d39330d4",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
- }
- }
+ ],
+ "source": [
+ "records = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\"\n",
+ ").as_record_dict(limit=10)\n",
+ "linker.visualisations.waterfall_chart(records)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "94aaeff2f888492ea321d4e4492526ff": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_bdf3a462cd3d48bda4269ac1cc8ed9ef",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_e05a7090510949ac956ea05719a3b8c2",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "b179423ef9d24cb1ac973b4b55daa86c": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "bdf3a462cd3d48bda4269ac1cc8ed9ef": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "db3fd6bdb9884f5a88fd4cf5d39330d4": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "e05a7090510949ac956ea05719a3b8c2": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "e181cb7618b74e4bbf9f2e144b68b87e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_b179423ef9d24cb1ac973b4b55daa86c",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_db3fd6bdb9884f5a88fd4cf5d39330d4",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/duckdb/febrl4.ipynb b/docs/demos/examples/duckdb/febrl4.ipynb
index f3e5a04cd6..3e67a92807 100644
--- a/docs/demos/examples/duckdb/febrl4.ipynb
+++ b/docs/demos/examples/duckdb/febrl4.ipynb
@@ -1,3493 +1,3503 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "b624dcc9-a6be-4996-8f78-1568568c2e6a",
- "metadata": {},
- "source": [
- "## Linking the febrl4 datasets\n",
- "\n",
- "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data.\n",
- "\n",
- "It consists of two datasets, A and B, of 5000 records each, with each record in dataset A having a corresponding record in dataset B. The aim will be to capture as many of those 5000 true links as possible, with minimal false linkages.\n",
- "\n",
- "It is worth noting that we should not necessarily expect to capture _all_ links. There are some links that although we know they _do_ correspond to the same person, the data is so mismatched between them that we would not reasonably expect a model to link them, and indeed should a model do so may indicate that we have overengineered things using our knowledge of true links, which will not be a helpful reference in situations where we attempt to link unlabelled data, as will usually be the case.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "32963faf",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "9c2be649",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:39.973571Z",
- "iopub.status.busy": "2024-06-07T09:16:39.973235Z",
- "iopub.status.idle": "2024-06-07T09:16:39.993885Z",
- "shell.execute_reply": "2024-06-07T09:16:39.992799Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3547f018-c884-4b9e-a042-3df09a576582",
- "metadata": {},
- "source": [
- "### Exploring data and defining model\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "05a3c2d4-6da8-48d5-89c8-db24702783c7",
- "metadata": {},
- "source": [
- "Firstly let's read in the data and have a little look at it\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b624dcc9-a6be-4996-8f78-1568568c2e6a",
+ "metadata": {},
+ "source": [
+ "## Linking the febrl4 datasets\n",
+ "\n",
+ "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data.\n",
+ "\n",
+ "It consists of two datasets, A and B, of 5000 records each, with each record in dataset A having a corresponding record in dataset B. The aim will be to capture as many of those 5000 true links as possible, with minimal false linkages.\n",
+ "\n",
+ "It is worth noting that we should not necessarily expect to capture _all_ links. For some links, although we know they _do_ correspond to the same person, the data is so mismatched between the two records that we would not reasonably expect a model to link them. Indeed, if a model does link them, this may indicate that we have overengineered things using our knowledge of the true links, which will not be a helpful reference in situations where we attempt to link unlabelled data, as will usually be the case.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32963faf",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "9c2be649",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:39.973571Z",
+ "iopub.status.busy": "2024-06-07T09:16:39.973235Z",
+ "iopub.status.idle": "2024-06-07T09:16:39.993885Z",
+ "shell.execute_reply": "2024-06-07T09:16:39.992799Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3547f018-c884-4b9e-a042-3df09a576582",
+ "metadata": {},
+ "source": [
+ "### Exploring data and defining model\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "05a3c2d4-6da8-48d5-89c8-db24702783c7",
+ "metadata": {},
+ "source": [
+ "Firstly let's read in the data and have a little look at it\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "832113c9-13b2-43b7-86d0-6051a9db79e8",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:39.999281Z",
+ "iopub.status.busy": "2024-06-07T09:16:39.998928Z",
+ "iopub.status.idle": "2024-06-07T09:16:41.957056Z",
+ "shell.execute_reply": "2024-06-07T09:16:41.956423Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 16,
- "id": "832113c9-13b2-43b7-86d0-6051a9db79e8",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:39.999281Z",
- "iopub.status.busy": "2024-06-07T09:16:39.998928Z",
- "iopub.status.idle": "2024-06-07T09:16:41.957056Z",
- "shell.execute_reply": "2024-06-07T09:16:41.956423Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rec_id | \n",
- " given_name | \n",
- " surname | \n",
- " street_number | \n",
- " address_1 | \n",
- " address_2 | \n",
- " suburb | \n",
- " postcode | \n",
- " state | \n",
- " date_of_birth | \n",
- " soc_sec_id | \n",
- " cluster | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " rec-1070-org | \n",
- " michaela | \n",
- " neumann | \n",
- " 8 | \n",
- " stanley street | \n",
- " miami | \n",
- " winston hills | \n",
- " 4223 | \n",
- " nsw | \n",
- " 19151111 | \n",
- " 5304218 | \n",
- " rec-1070 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " rec-1016-org | \n",
- " courtney | \n",
- " painter | \n",
- " 12 | \n",
- " pinkerton circuit | \n",
- " bega flats | \n",
- " richlands | \n",
- " 4560 | \n",
- " vic | \n",
- " 19161214 | \n",
- " 4066625 | \n",
- " rec-1016 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " rec_id given_name surname street_number address_1 \\\n",
- "0 rec-1070-org michaela neumann 8 stanley street \n",
- "1 rec-1016-org courtney painter 12 pinkerton circuit \n",
- "\n",
- " address_2 suburb postcode state date_of_birth soc_sec_id \\\n",
- "0 miami winston hills 4223 nsw 19151111 5304218 \n",
- "1 bega flats richlands 4560 vic 19161214 4066625 \n",
- "\n",
- " cluster \n",
- "0 rec-1070 \n",
- "1 rec-1016 "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rec_id | \n",
- " given_name | \n",
- " surname | \n",
- " street_number | \n",
- " address_1 | \n",
- " address_2 | \n",
- " suburb | \n",
- " postcode | \n",
- " state | \n",
- " date_of_birth | \n",
- " soc_sec_id | \n",
- " cluster | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " rec-561-dup-0 | \n",
- " elton | \n",
- " | \n",
- " 3 | \n",
- " light setreet | \n",
- " pinehill | \n",
- " windermere | \n",
- " 3212 | \n",
- " vic | \n",
- " 19651013 | \n",
- " 1551941 | \n",
- " rec-561 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " rec-2642-dup-0 | \n",
- " mitchell | \n",
- " maxon | \n",
- " 47 | \n",
- " edkins street | \n",
- " lochaoair | \n",
- " north ryde | \n",
- " 3355 | \n",
- " nsw | \n",
- " 19390212 | \n",
- " 8859999 | \n",
- " rec-2642 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " rec_id given_name surname street_number address_1 \\\n",
- "0 rec-561-dup-0 elton 3 light setreet \n",
- "1 rec-2642-dup-0 mitchell maxon 47 edkins street \n",
- "\n",
- " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n",
- "0 pinehill windermere 3212 vic 19651013 1551941 rec-561 \n",
- "1 lochaoair north ryde 3355 nsw 19390212 8859999 rec-2642 "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " rec_id | \n",
+ " given_name | \n",
+ " surname | \n",
+ " street_number | \n",
+ " address_1 | \n",
+ " address_2 | \n",
+ " suburb | \n",
+ " postcode | \n",
+ " state | \n",
+ " date_of_birth | \n",
+ " soc_sec_id | \n",
+ " cluster | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " rec-1070-org | \n",
+ " michaela | \n",
+ " neumann | \n",
+ " 8 | \n",
+ " stanley street | \n",
+ " miami | \n",
+ " winston hills | \n",
+ " 4223 | \n",
+ " nsw | \n",
+ " 19151111 | \n",
+ " 5304218 | \n",
+ " rec-1070 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " rec-1016-org | \n",
+ " courtney | \n",
+ " painter | \n",
+ " 12 | \n",
+ " pinkerton circuit | \n",
+ " bega flats | \n",
+ " richlands | \n",
+ " 4560 | \n",
+ " vic | \n",
+ " 19161214 | \n",
+ " 4066625 | \n",
+ " rec-1016 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink import splink_datasets\n",
- "\n",
- "df_a = splink_datasets.febrl4a\n",
- "df_b = splink_datasets.febrl4b\n",
- "\n",
- "\n",
- "def prepare_data(data):\n",
- " data = data.rename(columns=lambda x: x.strip())\n",
- " data[\"cluster\"] = data[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n",
- " data[\"date_of_birth\"] = data[\"date_of_birth\"].astype(str).str.strip()\n",
- " data[\"soc_sec_id\"] = data[\"soc_sec_id\"].astype(str).str.strip()\n",
- " data[\"postcode\"] = data[\"postcode\"].astype(str).str.strip()\n",
- " return data\n",
- "\n",
- "\n",
- "dfs = [prepare_data(dataset) for dataset in [df_a, df_b]]\n",
- "\n",
- "display(dfs[0].head(2))\n",
- "display(dfs[1].head(2))"
+ "text/plain": [
+ " rec_id given_name surname street_number address_1 \\\n",
+ "0 rec-1070-org michaela neumann 8 stanley street \n",
+ "1 rec-1016-org courtney painter 12 pinkerton circuit \n",
+ "\n",
+ " address_2 suburb postcode state date_of_birth soc_sec_id \\\n",
+ "0 miami winston hills 4223 nsw 19151111 5304218 \n",
+ "1 bega flats richlands 4560 vic 19161214 4066625 \n",
+ "\n",
+ " cluster \n",
+ "0 rec-1070 \n",
+ "1 rec-1016 "
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "markdown",
- "id": "8aebb0dd-28c1-44b8-9e12-e872b97f7583",
- "metadata": {},
- "source": [
- "Next, to better understand which variables will prove useful in linking, we have a look at how populated each column is, as well as the distribution of unique values within each\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "3233c3e1-3e6b-4abc-8bed-c26e8b463c2a",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:41.960684Z",
- "iopub.status.busy": "2024-06-07T09:16:41.960330Z",
- "iopub.status.idle": "2024-06-07T09:16:42.175342Z",
- "shell.execute_reply": "2024-06-07T09:16:42.174611Z"
- }
- },
- "outputs": [],
- "source": [
- "from splink import DuckDBAPI, Linker, SettingsCreator\n",
- "\n",
- "basic_settings = SettingsCreator(\n",
- " unique_id_column_name=\"rec_id\",\n",
- " link_type=\"link_only\",\n",
- " # NB as we are linking one-one, we know the probability that a random pair will be a match\n",
- " # hence we could set:\n",
- " # \"probability_two_random_records_match\": 1/5000,\n",
- " # however we will not specify this here, as we will use this as a check that\n",
- " # our estimation procedure returns something sensible\n",
- ")\n",
- "\n",
- "linker = Linker(dfs, basic_settings, db_api=DuckDBAPI())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c540f670",
- "metadata": {},
- "source": [
- "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "319ffdbc-7853-40a9-b331-e635d96b6fdc",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:42.178669Z",
- "iopub.status.busy": "2024-06-07T09:16:42.178397Z",
- "iopub.status.idle": "2024-06-07T09:16:42.558301Z",
- "shell.execute_reply": "2024-06-07T09:16:42.557736Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " rec_id | \n",
+ " given_name | \n",
+ " surname | \n",
+ " street_number | \n",
+ " address_1 | \n",
+ " address_2 | \n",
+ " suburb | \n",
+ " postcode | \n",
+ " state | \n",
+ " date_of_birth | \n",
+ " soc_sec_id | \n",
+ " cluster | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " rec-561-dup-0 | \n",
+ " elton | \n",
+ " | \n",
+ " 3 | \n",
+ " light setreet | \n",
+ " pinehill | \n",
+ " windermere | \n",
+ " 3212 | \n",
+ " vic | \n",
+ " 19651013 | \n",
+ " 1551941 | \n",
+ " rec-561 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " rec-2642-dup-0 | \n",
+ " mitchell | \n",
+ " maxon | \n",
+ " 47 | \n",
+ " edkins street | \n",
+ " lochaoair | \n",
+ " north ryde | \n",
+ " 3355 | \n",
+ " nsw | \n",
+ " 19390212 | \n",
+ " 8859999 | \n",
+ " rec-2642 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink.exploratory import completeness_chart\n",
- "\n",
- "completeness_chart(dfs, db_api=DuckDBAPI())"
+ "text/plain": [
+ " rec_id given_name surname street_number address_1 \\\n",
+ "0 rec-561-dup-0 elton 3 light setreet \n",
+ "1 rec-2642-dup-0 mitchell maxon 47 edkins street \n",
+ "\n",
+ " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n",
+ "0 pinehill windermere 3212 vic 19651013 1551941 rec-561 \n",
+ "1 lochaoair north ryde 3355 nsw 19390212 8859999 rec-2642 "
]
- },
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from splink import splink_datasets\n",
+ "\n",
+ "df_a = splink_datasets.febrl4a\n",
+ "df_b = splink_datasets.febrl4b\n",
+ "\n",
+ "\n",
+ "def prepare_data(data):\n",
+ " data = data.rename(columns=lambda x: x.strip())\n",
+ " data[\"cluster\"] = data[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n",
+ " data[\"date_of_birth\"] = data[\"date_of_birth\"].astype(str).str.strip()\n",
+ " data[\"soc_sec_id\"] = data[\"soc_sec_id\"].astype(str).str.strip()\n",
+ " data[\"postcode\"] = data[\"postcode\"].astype(str).str.strip()\n",
+ " return data\n",
+ "\n",
+ "\n",
+ "dfs = [prepare_data(dataset) for dataset in [df_a, df_b]]\n",
+ "\n",
+ "display(dfs[0].head(2))\n",
+ "display(dfs[1].head(2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8aebb0dd-28c1-44b8-9e12-e872b97f7583",
+ "metadata": {},
+ "source": [
+ "Next, to better understand which variables will prove useful in linking, we have a look at how populated each column is, as well as the distribution of unique values within each\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3233c3e1-3e6b-4abc-8bed-c26e8b463c2a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:41.960684Z",
+ "iopub.status.busy": "2024-06-07T09:16:41.960330Z",
+ "iopub.status.idle": "2024-06-07T09:16:42.175342Z",
+ "shell.execute_reply": "2024-06-07T09:16:42.174611Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import DuckDBAPI, Linker, SettingsCreator\n",
+ "\n",
+ "basic_settings = SettingsCreator(\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ " link_type=\"link_only\",\n",
+ " # NB as we are linking one-one, we know the probability that a random pair will be a match\n",
+ " # hence we could set:\n",
+ " # \"probability_two_random_records_match\": 1/5000,\n",
+ " # however we will not specify this here, as we will use this as a check that\n",
+ " # our estimation procedure returns something sensible\n",
+ ")\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "linker = Linker(dfs_sdf, basic_settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c540f670",
+ "metadata": {},
+ "source": [
+ "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "319ffdbc-7853-40a9-b331-e635d96b6fdc",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:42.178669Z",
+ "iopub.status.busy": "2024-06-07T09:16:42.178397Z",
+ "iopub.status.idle": "2024-06-07T09:16:42.558301Z",
+ "shell.execute_reply": "2024-06-07T09:16:42.557736Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 19,
- "id": "dff8dfca-57c8-42bf-878c-da9dd23d2682",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:42.561536Z",
- "iopub.status.busy": "2024-06-07T09:16:42.561314Z",
- "iopub.status.idle": "2024-06-07T09:16:43.066015Z",
- "shell.execute_reply": "2024-06-07T09:16:43.065469Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink.exploratory import profile_columns\n",
- "\n",
- "profile_columns(dfs, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
- {
- "cell_type": "markdown",
- "id": "935fc769-8678-494b-96d9-f499c34ae061",
- "metadata": {},
- "source": [
- "Next let's come up with some candidate blocking rules, which define which record comparisons are generated, and have a look at how many comparisons each will generate.\n",
- "\n",
- "For blocking rules that we use in prediction, our aim is to have the union of all rules cover all true matches, whilst avoiding generating so many comparisons that it becomes computationally intractable - i.e. each true match should have at least _one_ of the following conditions holding.\n"
- ]
- },
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.exploratory import completeness_chart\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "completeness_chart(dfs_sdf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dff8dfca-57c8-42bf-878c-da9dd23d2682",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:42.561536Z",
+ "iopub.status.busy": "2024-06-07T09:16:42.561314Z",
+ "iopub.status.idle": "2024-06-07T09:16:43.066015Z",
+ "shell.execute_reply": "2024-06-07T09:16:43.065469Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 20,
- "id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:43.069224Z",
- "iopub.status.busy": "2024-06-07T09:16:43.068982Z",
- "iopub.status.idle": "2024-06-07T09:16:43.684745Z",
- "shell.execute_reply": "2024-06-07T09:16:43.684041Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import DuckDBAPI, block_on\n",
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "blocking_rules = [\n",
- " block_on(\"given_name\", \"surname\"),\n",
- " # A blocking rule can also be an aribtrary SQL expression\n",
- " \"l.given_name = r.surname and l.surname = r.given_name\",\n",
- " block_on(\"date_of_birth\"),\n",
- " block_on(\"soc_sec_id\"),\n",
- " block_on(\"state\", \"address_1\"),\n",
- " block_on(\"street_number\", \"address_1\"),\n",
- " block_on(\"postcode\"),\n",
- "]\n",
- "\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=dfs,\n",
- " blocking_rules=blocking_rules,\n",
- " db_api=db_api,\n",
- " link_type=\"link_only\",\n",
- " unique_id_column_name=\"rec_id\",\n",
- " source_dataset_column_name=\"source_dataset\",\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c91c8946-94e3-4ee0-b43f-2d9675339ac9",
- "metadata": {},
- "source": [
- "The broadest rule, having a matching postcode, unsurpisingly gives the largest number of comparisons.\n",
- "For this small dataset we still have a very manageable number, but if it was larger we might have needed to include a further `AND` condition with it to break the number of comparisons further.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8fe64895-9292-4c86-983e-2ec3f140d12c",
- "metadata": {},
- "source": [
- "Now we get the full settings by including the blocking rules, as well as deciding the actual comparisons we will be including in our model.\n",
- "\n",
- "We will define two models, each with a separate linker with different settings, so that we can compare performance. One will be a very basic model, whilst the other will include a lot more detail.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "f6360b69-2d52-4f1a-9199-2edf2339ec63",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:43.687914Z",
- "iopub.status.busy": "2024-06-07T09:16:43.687640Z",
- "iopub.status.idle": "2024-06-07T09:16:44.021204Z",
- "shell.execute_reply": "2024-06-07T09:16:44.020435Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_level_library as cll\n",
- "import splink.comparison_library as cl\n",
- "\n",
- "\n",
- "# the simple model only considers a few columns, and only two comparison levels for each\n",
- "simple_model_settings = SettingsCreator(\n",
- " unique_id_column_name=\"rec_id\",\n",
- " link_type=\"link_only\",\n",
- " blocking_rules_to_generate_predictions=blocking_rules,\n",
- " comparisons=[\n",
- " cl.ExactMatch(\"given_name\").configure(term_frequency_adjustments=True),\n",
- " cl.ExactMatch(\"surname\").configure(term_frequency_adjustments=True),\n",
- " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")\n",
- "\n",
- "# the detailed model considers more columns, using the information we saw in the exploratory phase\n",
- "# we also include further comparison levels to account for typos and other differences\n",
- "detailed_model_settings = SettingsCreator(\n",
- " unique_id_column_name=\"rec_id\",\n",
- " link_type=\"link_only\",\n",
- " blocking_rules_to_generate_predictions=blocking_rules,\n",
- " comparisons=[\n",
- " cl.NameComparison(\"given_name\").configure(term_frequency_adjustments=True),\n",
- " cl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n",
- " cl.DateOfBirthComparison(\n",
- " \"date_of_birth\",\n",
- " input_is_string=True,\n",
- " datetime_format=\"%Y%m%d\",\n",
- " invalid_dates_as_null=True,\n",
- " ),\n",
- " cl.DamerauLevenshteinAtThresholds(\"soc_sec_id\", [1, 2]),\n",
- " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n",
- " cl.DamerauLevenshteinAtThresholds(\"postcode\", [1, 2]).configure(\n",
- " term_frequency_adjustments=True\n",
- " ),\n",
- " # we don't consider further location columns as they will be strongly correlated with postcode\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")\n",
- "\n",
- "\n",
- "linker_simple = Linker(dfs, simple_model_settings, db_api=DuckDBAPI())\n",
- "linker_detailed = Linker(dfs, detailed_model_settings, db_api=DuckDBAPI())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4b151420-f53b-4dab-9d80-238892cffd53",
- "metadata": {},
- "source": [
- "### Estimating model parameters\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "27f4d86a-3ec0-4d31-a8c7-eae2952e76a4",
- "metadata": {},
- "source": [
- "We need to furnish our models with parameter estimates so that we can generate results. We will focus on the detailed model, generating the values for the simple model at the end\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3684d83f-44ce-46af-b3bd-0725f001b8d4",
- "metadata": {},
- "source": [
- "We can instead estimate the probability two random records match, and compare with the known value of 1/5000 = 0.0002, to see how well our estimation procedure works.\n",
- "\n",
- "To do this we come up with some deterministic rules - the aim here is that we generate very few false positives (i.e. we expect that the majority of records with at least one of these conditions holding are true matches), whilst also capturing the majority of matches - our guess here is that these two rules should capture 80% of all matches.\n"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.exploratory import profile_columns\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "profile_columns(dfs_sdf, column_expressions=[\"given_name\", \"surname\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "935fc769-8678-494b-96d9-f499c34ae061",
+ "metadata": {},
+ "source": [
+ "Next let's come up with some candidate blocking rules, which define which record comparisons are generated, and have a look at how many comparisons each will generate.\n",
+ "\n",
+ "For blocking rules that we use in prediction, our aim is to have the union of all rules cover all true matches, whilst avoiding generating so many comparisons that it becomes computationally intractable - i.e. each true match should have at least _one_ of the following conditions holding.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:43.069224Z",
+ "iopub.status.busy": "2024-06-07T09:16:43.068982Z",
+ "iopub.status.idle": "2024-06-07T09:16:43.684745Z",
+ "shell.execute_reply": "2024-06-07T09:16:43.684041Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 22,
- "id": "7ad48419-4eda-4fe5-b00f-2ec9f798e0e8",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:44.024887Z",
- "iopub.status.busy": "2024-06-07T09:16:44.024650Z",
- "iopub.status.idle": "2024-06-07T09:16:44.225016Z",
- "shell.execute_reply": "2024-06-07T09:16:44.224395Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.000239.\n",
- "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "deterministic_rules = [\n",
- " block_on(\"soc_sec_id\"),\n",
- " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n",
- "]\n",
- "\n",
- "linker_detailed.training.estimate_probability_two_random_records_match(\n",
- " deterministic_rules, recall=0.8\n",
- ")"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import DuckDBAPI, block_on\n",
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "blocking_rules = [\n",
+ " block_on(\"given_name\", \"surname\"),\n",
+ " # A blocking rule can also be an arbitrary SQL expression\n",
+ " \"l.given_name = r.surname and l.surname = r.given_name\",\n",
+ " block_on(\"date_of_birth\"),\n",
+ " block_on(\"soc_sec_id\"),\n",
+ " block_on(\"state\", \"address_1\"),\n",
+ " block_on(\"street_number\", \"address_1\"),\n",
+ " block_on(\"postcode\"),\n",
+ "]\n",
+ "\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " dfs_sdf,\n",
+ " blocking_rules=blocking_rules,\n",
+ " link_type=\"link_only\",\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ " source_dataset_column_name=\"source_dataset\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c91c8946-94e3-4ee0-b43f-2d9675339ac9",
+ "metadata": {},
+ "source": [
+ "The broadest rule, having a matching postcode, unsurprisingly gives the largest number of comparisons.\n",
+ "For this small dataset we still have a very manageable number, but if it were larger we might have needed to include a further `AND` condition with it to reduce the number of comparisons.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fe64895-9292-4c86-983e-2ec3f140d12c",
+ "metadata": {},
+ "source": [
+ "Now we get the full settings by including the blocking rules, as well as deciding the actual comparisons we will be including in our model.\n",
+ "\n",
+ "We will define two models, each with a separate linker with different settings, so that we can compare performance. One will be a very basic model, whilst the other will include a lot more detail.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6360b69-2d52-4f1a-9199-2edf2339ec63",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:43.687914Z",
+ "iopub.status.busy": "2024-06-07T09:16:43.687640Z",
+ "iopub.status.idle": "2024-06-07T09:16:44.021204Z",
+ "shell.execute_reply": "2024-06-07T09:16:44.020435Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_level_library as cll\n",
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "\n",
+ "# the simple model only considers a few columns, and only two comparison levels for each\n",
+ "simple_model_settings = SettingsCreator(\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ " link_type=\"link_only\",\n",
+ " blocking_rules_to_generate_predictions=blocking_rules,\n",
+ " comparisons=[\n",
+ " cl.ExactMatch(\"given_name\").configure(term_frequency_adjustments=True),\n",
+ " cl.ExactMatch(\"surname\").configure(term_frequency_adjustments=True),\n",
+ " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "# the detailed model considers more columns, using the information we saw in the exploratory phase\n",
+ "# we also include further comparison levels to account for typos and other differences\n",
+ "detailed_model_settings = SettingsCreator(\n",
+ " unique_id_column_name=\"rec_id\",\n",
+ " link_type=\"link_only\",\n",
+ " blocking_rules_to_generate_predictions=blocking_rules,\n",
+ " comparisons=[\n",
+ " cl.NameComparison(\"given_name\").configure(term_frequency_adjustments=True),\n",
+ " cl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"date_of_birth\",\n",
+ " input_is_string=True,\n",
+ " datetime_format=\"%Y%m%d\",\n",
+ " invalid_dates_as_null=True,\n",
+ " ),\n",
+ " cl.DamerauLevenshteinAtThresholds(\"soc_sec_id\", [1, 2]),\n",
+ " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n",
+ " cl.DamerauLevenshteinAtThresholds(\"postcode\", [1, 2]).configure(\n",
+ " term_frequency_adjustments=True\n",
+ " ),\n",
+ " # we don't consider further location columns as they will be strongly correlated with postcode\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "linker_simple = Linker(dfs_sdf, simple_model_settings)\n",
+ "linker_detailed = Linker(dfs_sdf, detailed_model_settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b151420-f53b-4dab-9d80-238892cffd53",
+ "metadata": {},
+ "source": [
+ "### Estimating model parameters\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27f4d86a-3ec0-4d31-a8c7-eae2952e76a4",
+ "metadata": {},
+ "source": [
+ "We need to furnish our models with parameter estimates so that we can generate results. We will focus on the detailed model, generating the values for the simple model at the end\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3684d83f-44ce-46af-b3bd-0725f001b8d4",
+ "metadata": {},
+ "source": [
+ "We can instead estimate the probability two random records match, and compare with the known value of 1/5000 = 0.0002, to see how well our estimation procedure works.\n",
+ "\n",
+ "To do this we come up with some deterministic rules - the aim here is that we generate very few false positives (i.e. we expect that the majority of records with at least one of these conditions holding are true matches), whilst also capturing the majority of matches - our guess here is that these two rules should capture 80% of all matches.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7ad48419-4eda-4fe5-b00f-2ec9f798e0e8",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:44.024887Z",
+ "iopub.status.busy": "2024-06-07T09:16:44.024650Z",
+ "iopub.status.idle": "2024-06-07T09:16:44.225016Z",
+ "shell.execute_reply": "2024-06-07T09:16:44.224395Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "0e035592-b1bb-4e27-a5b9-e890810088fb",
- "metadata": {},
- "source": [
- "Even playing around with changing these deterministic rules, or the nominal recall leaves us with an answer which is pretty close to our known value\n"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.000239.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "deterministic_rules = [\n",
+ " block_on(\"soc_sec_id\"),\n",
+ " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n",
+ "]\n",
+ "\n",
+ "linker_detailed.training.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.8\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e035592-b1bb-4e27-a5b9-e890810088fb",
+ "metadata": {},
+ "source": [
+ "Even if we play around with these deterministic rules or the nominal recall, we are left with an answer which is pretty close to our known value\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bdaaa245-4bd9-476c-9ead-c5f28597aa7e",
+ "metadata": {},
+ "source": [
+ "Next we estimate `u` and `m` values for each comparison, so that we can move to generating predictions\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "e40ee288-0c42-4cda-aaf1-3ffb2ea02383",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:44.228813Z",
+ "iopub.status.busy": "2024-06-07T09:16:44.228526Z",
+ "iopub.status.idle": "2024-06-07T09:16:50.708588Z",
+ "shell.execute_reply": "2024-06-07T09:16:50.707955Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "bdaaa245-4bd9-476c-9ead-c5f28597aa7e",
- "metadata": {},
- "source": [
- "Next we estimate `u` and `m` values for each comparison, so that we can move to generating predictions\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 23,
- "id": "e40ee288-0c42-4cda-aaf1-3ffb2ea02383",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:44.228813Z",
- "iopub.status.busy": "2024-06-07T09:16:44.228526Z",
- "iopub.status.idle": "2024-06-07T09:16:50.708588Z",
- "shell.execute_reply": "2024-06-07T09:16:50.707955Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b260f117b61c4603b0563dd130acded6",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
- "----- Estimating u probabilities using random sampling -----\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "b260f117b61c4603b0563dd130acded6",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - given_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - date_of_birth (some u values are not trained, no m values are trained).\n",
- " - soc_sec_id (no m values are trained).\n",
- " - street_number (no m values are trained).\n",
- " - postcode (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "# We generally recommend setting max pairs higher (e.g. 1e7 or more)\n",
- "# But this will run faster for the purpose of this demo\n",
- "linker_detailed.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "markdown",
- "id": "614f6e19-14bb-4d40-9b95-36593b6de9ba",
- "metadata": {},
- "source": [
- "When training the `m` values using expectation maximisation, we need somre more blocking rules to reduce the total number of comparisons. For each rule, we want to ensure that we have neither proportionally too many matches, or too few.\n",
- "\n",
- "We must run this multiple times using different rules so that we can obtain estimates for all comparisons - if we block on e.g. `date_of_birth`, then we cannot compute the `m` values for the `date_of_birth` comparison, as we have only looked at records where these match.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "9ee0f49b-084c-45aa-8c6b-ec5da11c2cc4",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:50.712950Z",
- "iopub.status.busy": "2024-06-07T09:16:50.712681Z",
- "iopub.status.idle": "2024-06-07T09:16:52.276811Z",
- "shell.execute_reply": "2024-06-07T09:16:52.276216Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"date_of_birth\" = r.\"date_of_birth\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - given_name\n",
- " - surname\n",
- " - soc_sec_id\n",
- " - street_number\n",
- " - postcode\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - date_of_birth\n",
- "\n",
- "Iteration 1: Largest change in params was -0.331 in probability_two_random_records_match\n",
- "Iteration 2: Largest change in params was 0.00365 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 9.22e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n",
- "\n",
- "EM converged after 3 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - date_of_birth (some u values are not trained, no m values are trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"postcode\" = r.\"postcode\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - given_name\n",
- " - surname\n",
- " - date_of_birth\n",
- " - soc_sec_id\n",
- " - street_number\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - postcode\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n",
- "Iteration 2: Largest change in params was 0.000457 in the m_probability of date_of_birth, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 7.66e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n",
- "\n",
- "EM converged after 3 iterations\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - date_of_birth (some u values are not trained, some m values are not trained).\n"
- ]
- }
- ],
- "source": [
- "session_dob = (\n",
- " linker_detailed.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"date_of_birth\"), estimate_without_term_frequencies=True\n",
- " )\n",
- ")\n",
- "session_pc = (\n",
- " linker_detailed.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"postcode\"), estimate_without_term_frequencies=True\n",
- " )\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - given_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n",
+ " - soc_sec_id (no m values are trained).\n",
+ " - street_number (no m values are trained).\n",
+ " - postcode (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "# We generally recommend setting max pairs higher (e.g. 1e7 or more)\n",
+ "# But this will run faster for the purpose of this demo\n",
+ "linker_detailed.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "614f6e19-14bb-4d40-9b95-36593b6de9ba",
+ "metadata": {},
+ "source": [
+ "When training the `m` values using expectation maximisation, we need some more blocking rules to reduce the total number of comparisons. For each rule, we want to ensure that we have neither proportionally too many matches, nor too few.\n",
+ "\n",
+ "We must run this multiple times using different rules so that we can obtain estimates for all comparisons - if we block on e.g. `date_of_birth`, then we cannot compute the `m` values for the `date_of_birth` comparison, as we have only looked at records where these match.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "9ee0f49b-084c-45aa-8c6b-ec5da11c2cc4",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:50.712950Z",
+ "iopub.status.busy": "2024-06-07T09:16:50.712681Z",
+ "iopub.status.idle": "2024-06-07T09:16:52.276811Z",
+ "shell.execute_reply": "2024-06-07T09:16:52.276216Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "ba8ed5fa-7003-46a9-bc40-4ae7cfb40953",
- "metadata": {},
- "source": [
- "If we wish we can have a look at how our parameter estimates changes over these training sessions\n"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"date_of_birth\" = r.\"date_of_birth\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - given_name\n",
+ " - surname\n",
+ " - soc_sec_id\n",
+ " - street_number\n",
+ " - postcode\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - date_of_birth\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.331 in probability_two_random_records_match\n",
+ "Iteration 2: Largest change in params was 0.00365 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 9.22e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 3 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"postcode\" = r.\"postcode\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - given_name\n",
+ " - surname\n",
+ " - date_of_birth\n",
+ " - soc_sec_id\n",
+ " - street_number\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - postcode\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n",
+ "Iteration 2: Largest change in params was 0.000457 in the m_probability of date_of_birth, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 7.66e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 3 iterations\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - date_of_birth (some u values are not trained, some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "session_dob = (\n",
+ " linker_detailed.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"date_of_birth\"), estimate_without_term_frequencies=True\n",
+ " )\n",
+ ")\n",
+ "session_pc = (\n",
+ " linker_detailed.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"postcode\"), estimate_without_term_frequencies=True\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba8ed5fa-7003-46a9-bc40-4ae7cfb40953",
+ "metadata": {},
+ "source": [
+ "If we wish, we can have a look at how our parameter estimates change over these training sessions\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "31ef6844-6be8-4f01-9ff7-5dfebcf12ae1",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:52.281563Z",
+ "iopub.status.busy": "2024-06-07T09:16:52.281303Z",
+ "iopub.status.idle": "2024-06-07T09:16:52.513958Z",
+ "shell.execute_reply": "2024-06-07T09:16:52.513314Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 25,
- "id": "31ef6844-6be8-4f01-9ff7-5dfebcf12ae1",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:52.281563Z",
- "iopub.status.busy": "2024-06-07T09:16:52.281303Z",
- "iopub.status.idle": "2024-06-07T09:16:52.513958Z",
- "shell.execute_reply": "2024-06-07T09:16:52.513314Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "session_dob.m_u_values_interactive_history_chart()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cffd7f8f-6cea-4ef7-87c7-c6a9c1775cf2",
- "metadata": {},
- "source": [
- "For variables that aren't used in the `m`-training blocking rules, we have two estimates --- one from each of the training sessions (see for example `street_number`). We can have a look at how the values compare between them, to ensure that we don't have drastically different values, which may be indicative of an issue.\n"
+ "text/plain": [
+ "alt.HConcatChart(...)"
]
- },
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "session_dob.m_u_values_interactive_history_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cffd7f8f-6cea-4ef7-87c7-c6a9c1775cf2",
+ "metadata": {},
+ "source": [
+ "For variables that aren't used in the `m`-training blocking rules, we have two estimates --- one from each of the training sessions (see for example `street_number`). We can have a look at how the values compare between them, to ensure that we don't have drastically different values, which may be indicative of an issue.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "8d260a60-a4fa-4c0d-9853-8b8256a24257",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:52.517168Z",
+ "iopub.status.busy": "2024-06-07T09:16:52.516948Z",
+ "iopub.status.idle": "2024-06-07T09:16:52.637604Z",
+ "shell.execute_reply": "2024-06-07T09:16:52.636662Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 26,
- "id": "8d260a60-a4fa-4c0d-9853-8b8256a24257",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:52.517168Z",
- "iopub.status.busy": "2024-06-07T09:16:52.516948Z",
- "iopub.status.idle": "2024-06-07T09:16:52.637604Z",
- "shell.execute_reply": "2024-06-07T09:16:52.636662Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_detailed.visualisations.parameter_estimate_comparisons_chart()"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_detailed.visualisations.parameter_estimate_comparisons_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25e3e343-603a-4aed-a5ac-5de42af5f8ad",
+ "metadata": {},
+ "source": [
+ "We repeat our parameter estimations for the simple model in much the same fashion\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "71f2f166-05cd-4038-a289-a053a1f0b5c5",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:52.640970Z",
+ "iopub.status.busy": "2024-06-07T09:16:52.640725Z",
+ "iopub.status.idle": "2024-06-07T09:16:54.701590Z",
+ "shell.execute_reply": "2024-06-07T09:16:54.701058Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "25e3e343-603a-4aed-a5ac-5de42af5f8ad",
- "metadata": {},
- "source": [
- "We repeat our parameter estimations for the simple model in much the same fashion\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.000239.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n",
+ "----- Estimating u probabilities using random sampling -----\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - given_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - street_number (no m values are trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"given_name\" = r.\"given_name\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - surname\n",
+ " - street_number\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - given_name\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.0812 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 2: Largest change in params was -0.0261 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 3: Largest change in params was -0.0247 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 4: Largest change in params was 0.0227 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 5: Largest change in params was -0.0198 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 6: Largest change in params was 0.0164 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was -0.0131 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 8: Largest change in params was 0.0101 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 9: Largest change in params was -0.00769 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 10: Largest change in params was 0.00576 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 11: Largest change in params was -0.00428 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 12: Largest change in params was 0.00316 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 13: Largest change in params was -0.00234 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 14: Largest change in params was -0.00172 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 15: Largest change in params was 0.00127 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 16: Largest change in params was -0.000939 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 17: Largest change in params was -0.000694 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 18: Largest change in params was -0.000514 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 19: Largest change in params was -0.000381 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 20: Largest change in params was -0.000282 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 21: Largest change in params was 0.00021 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 22: Largest change in params was -0.000156 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 23: Largest change in params was 0.000116 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 24: Largest change in params was 8.59e-05 in the m_probability of surname, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 24 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - given_name (no m values are trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"street_number\" = r.\"street_number\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - given_name\n",
+ " - surname\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - street_number\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.0446 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 2: Largest change in params was -0.0285 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was -0.026 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 4: Largest change in params was 0.0252 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 5: Largest change in params was -0.0231 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 6: Largest change in params was -0.02 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 7: Largest change in params was -0.0164 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 8: Largest change in params was -0.013 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 9: Largest change in params was 0.01 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 10: Largest change in params was -0.00757 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 11: Largest change in params was 0.00564 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 12: Largest change in params was -0.00419 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 13: Largest change in params was 0.0031 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 14: Largest change in params was -0.00231 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 15: Largest change in params was -0.00173 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 16: Largest change in params was 0.0013 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 17: Largest change in params was 0.000988 in the m_probability of given_name, level `All other comparisons`\n",
+ "Iteration 18: Largest change in params was -0.000756 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 19: Largest change in params was -0.000584 in the m_probability of given_name, level `Exact match on given_name`\n",
+ "Iteration 20: Largest change in params was -0.000465 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 21: Largest change in params was -0.000388 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 22: Largest change in params was -0.000322 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 23: Largest change in params was 0.000266 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 24: Largest change in params was -0.000219 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 25: Largest change in params was -0.00018 in the m_probability of surname, level `Exact match on surname`\n",
+ "\n",
+ "EM converged after 25 iterations\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 27,
- "id": "71f2f166-05cd-4038-a289-a053a1f0b5c5",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:52.640970Z",
- "iopub.status.busy": "2024-06-07T09:16:52.640725Z",
- "iopub.status.idle": "2024-06-07T09:16:54.701590Z",
- "shell.execute_reply": "2024-06-07T09:16:54.701058Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.000239.\n",
- "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n",
- "----- Estimating u probabilities using random sampling -----\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - given_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - street_number (no m values are trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"given_name\" = r.\"given_name\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - surname\n",
- " - street_number\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - given_name\n",
- "\n",
- "Iteration 1: Largest change in params was 0.0812 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 2: Largest change in params was -0.0261 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 3: Largest change in params was -0.0247 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 4: Largest change in params was 0.0227 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 5: Largest change in params was -0.0198 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 6: Largest change in params was 0.0164 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 7: Largest change in params was -0.0131 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 8: Largest change in params was 0.0101 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 9: Largest change in params was -0.00769 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 10: Largest change in params was 0.00576 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 11: Largest change in params was -0.00428 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 12: Largest change in params was 0.00316 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 13: Largest change in params was -0.00234 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 14: Largest change in params was -0.00172 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 15: Largest change in params was 0.00127 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 16: Largest change in params was -0.000939 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 17: Largest change in params was -0.000694 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 18: Largest change in params was -0.000514 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 19: Largest change in params was -0.000381 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 20: Largest change in params was -0.000282 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 21: Largest change in params was 0.00021 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 22: Largest change in params was -0.000156 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 23: Largest change in params was 0.000116 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 24: Largest change in params was 8.59e-05 in the m_probability of surname, level `All other comparisons`\n",
- "\n",
- "EM converged after 24 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - given_name (no m values are trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"street_number\" = r.\"street_number\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - given_name\n",
- " - surname\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - street_number\n",
- "\n",
- "Iteration 1: Largest change in params was -0.0446 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 2: Largest change in params was -0.0285 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was -0.026 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 4: Largest change in params was 0.0252 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 5: Largest change in params was -0.0231 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 6: Largest change in params was -0.02 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 7: Largest change in params was -0.0164 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 8: Largest change in params was -0.013 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 9: Largest change in params was 0.01 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 10: Largest change in params was -0.00757 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 11: Largest change in params was 0.00564 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 12: Largest change in params was -0.00419 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 13: Largest change in params was 0.0031 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 14: Largest change in params was -0.00231 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 15: Largest change in params was -0.00173 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 16: Largest change in params was 0.0013 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 17: Largest change in params was 0.000988 in the m_probability of given_name, level `All other comparisons`\n",
- "Iteration 18: Largest change in params was -0.000756 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 19: Largest change in params was -0.000584 in the m_probability of given_name, level `Exact match on given_name`\n",
- "Iteration 20: Largest change in params was -0.000465 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 21: Largest change in params was -0.000388 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 22: Largest change in params was -0.000322 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 23: Largest change in params was 0.000266 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 24: Largest change in params was -0.000219 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 25: Largest change in params was -0.00018 in the m_probability of surname, level `Exact match on surname`\n",
- "\n",
- "EM converged after 25 iterations\n",
- "\n",
- "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_simple.training.estimate_probability_two_random_records_match(\n",
- " deterministic_rules, recall=0.8\n",
- ")\n",
- "linker_simple.training.estimate_u_using_random_sampling(max_pairs=1e7)\n",
- "session_ssid = (\n",
- " linker_simple.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"given_name\"), estimate_without_term_frequencies=True\n",
- " )\n",
- ")\n",
- "session_pc = linker_simple.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"street_number\"), estimate_without_term_frequencies=True\n",
- ")\n",
- "linker_simple.visualisations.parameter_estimate_comparisons_chart()"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "3a87cb78-0e97-40a3-b757-6c99bb19d7b1",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:54.704569Z",
- "iopub.status.busy": "2024-06-07T09:16:54.704327Z",
- "iopub.status.idle": "2024-06-07T09:16:54.707573Z",
- "shell.execute_reply": "2024-06-07T09:16:54.707000Z"
- }
- },
- "outputs": [],
- "source": [
- "# import json\n",
- "# we can have a look at the full settings if we wish, including the values of our estimated parameters:\n",
- "# print(json.dumps(linker_detailed._settings_obj.as_dict(), indent=2))\n",
- "# we can also get a handy summary of of the model in an easily readable format if we wish:\n",
- "# print(linker_detailed._settings_obj.human_readable_description)\n",
- "# (we suppress output here for brevity)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "76f453df-848b-4f06-bbb7-d88ee710ae64",
- "metadata": {},
- "source": [
- "We can now visualise some of the details of our models. We can look at the match weights, which tell us the relative importance for/against a match for each of our comparsion levels.\n",
- "\n",
- "Comparing the two models will show the added benefit we get in the more detailed model --- what in the simple model is classed as 'all other comparisons' is instead broken down further, and we can see that the detail of how this is broken down in fact gives us quite a bit of useful information about the likelihood of a match.\n"
- ]
- },
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_simple.training.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.8\n",
+ ")\n",
+ "linker_simple.training.estimate_u_using_random_sampling(max_pairs=1e7)\n",
+ "session_ssid = (\n",
+ " linker_simple.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"given_name\"), estimate_without_term_frequencies=True\n",
+ " )\n",
+ ")\n",
+ "session_pc = linker_simple.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"street_number\"), estimate_without_term_frequencies=True\n",
+ ")\n",
+ "linker_simple.visualisations.parameter_estimate_comparisons_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "3a87cb78-0e97-40a3-b757-6c99bb19d7b1",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:54.704569Z",
+ "iopub.status.busy": "2024-06-07T09:16:54.704327Z",
+ "iopub.status.idle": "2024-06-07T09:16:54.707573Z",
+ "shell.execute_reply": "2024-06-07T09:16:54.707000Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# import json\n",
+ "# we can have a look at the full settings if we wish, including the values of our estimated parameters:\n",
+ "# print(json.dumps(linker_detailed._settings_obj.as_dict(), indent=2))\n",
+    "# we can also get a handy summary of the model in an easily readable format if we wish:\n",
+ "# print(linker_detailed._settings_obj.human_readable_description)\n",
+ "# (we suppress output here for brevity)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76f453df-848b-4f06-bbb7-d88ee710ae64",
+ "metadata": {},
+ "source": [
+    "We can now visualise some of the details of our models. We can look at the match weights, which tell us the relative importance for/against a match for each of our comparison levels.\n",
+ "\n",
+ "Comparing the two models will show the added benefit we get in the more detailed model --- what in the simple model is classed as 'all other comparisons' is instead broken down further, and we can see that the detail of how this is broken down in fact gives us quite a bit of useful information about the likelihood of a match.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "b17b131c-c83e-4c32-bfad-c12021d2c3b7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:54.710434Z",
+ "iopub.status.busy": "2024-06-07T09:16:54.710226Z",
+ "iopub.status.idle": "2024-06-07T09:16:54.974408Z",
+ "shell.execute_reply": "2024-06-07T09:16:54.973855Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 29,
- "id": "b17b131c-c83e-4c32-bfad-c12021d2c3b7",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:54.710434Z",
- "iopub.status.busy": "2024-06-07T09:16:54.710226Z",
- "iopub.status.idle": "2024-06-07T09:16:54.974408Z",
- "shell.execute_reply": "2024-06-07T09:16:54.973855Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_simple.visualisations.match_weights_chart()"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_simple.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "c095ff2b-405b-427c-849f-1468f6ca98e0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:54.977562Z",
+ "iopub.status.busy": "2024-06-07T09:16:54.977352Z",
+ "iopub.status.idle": "2024-06-07T09:16:55.252915Z",
+ "shell.execute_reply": "2024-06-07T09:16:55.251950Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 30,
- "id": "c095ff2b-405b-427c-849f-1468f6ca98e0",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:54.977562Z",
- "iopub.status.busy": "2024-06-07T09:16:54.977352Z",
- "iopub.status.idle": "2024-06-07T09:16:55.252915Z",
- "shell.execute_reply": "2024-06-07T09:16:55.251950Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_detailed.visualisations.match_weights_chart()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e08287f4-711a-4960-b6e6-3f3d19ca8667",
- "metadata": {},
- "source": [
- "As well as the match weights, which give us an idea of the overall effect of each comparison level, we can also look at the individual `u` and `m` parameter estimates, which tells us about the prevalence of coincidences and mistakes (for further details/explanation about this see [this article](https://www.robinlinacre.com/maths_of_fellegi_sunter/)). We might want to revise aspects of our model based on the information we ascertain here.\n",
- "\n",
- "Note however that some of these values are very small, which is why the match weight chart is often more useful for getting a decent picture of things.\n"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_detailed.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e08287f4-711a-4960-b6e6-3f3d19ca8667",
+ "metadata": {},
+ "source": [
+    "As well as the match weights, which give us an idea of the overall effect of each comparison level, we can also look at the individual `u` and `m` parameter estimates, which tell us about the prevalence of coincidences and mistakes (for further details/explanation about this see [this article](https://www.robinlinacre.com/maths_of_fellegi_sunter/)). We might want to revise aspects of our model based on the information we ascertain here.\n",
+ "\n",
+ "Note however that some of these values are very small, which is why the match weight chart is often more useful for getting a decent picture of things.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "26e5dbe5-a621-44ab-bdb4-0bcd53b220b6",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:55.256437Z",
+ "iopub.status.busy": "2024-06-07T09:16:55.256148Z",
+ "iopub.status.idle": "2024-06-07T09:16:55.408274Z",
+ "shell.execute_reply": "2024-06-07T09:16:55.407631Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 31,
- "id": "26e5dbe5-a621-44ab-bdb4-0bcd53b220b6",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:55.256437Z",
- "iopub.status.busy": "2024-06-07T09:16:55.256148Z",
- "iopub.status.idle": "2024-06-07T09:16:55.408274Z",
- "shell.execute_reply": "2024-06-07T09:16:55.407631Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "# linker_simple.m_u_parameters_chart()\n",
- "linker_detailed.visualisations.m_u_parameters_chart()"
+ "text/plain": [
+ "alt.HConcatChart(...)"
]
- },
- {
- "cell_type": "markdown",
- "id": "67321657-d2a0-4f7c-b68a-a906a210547e",
- "metadata": {},
- "source": [
- "It is also useful to have a look at unlinkable records - these are records which do not contain enough information to be linked at some match probability threshold. We can figure this out be seeing whether records are able to be matched with themselves.\n",
- "\n",
- "This is of course relative to the information we have put into the model - we see that in our simple model, at a 99% match threshold nearly 10% of records are unlinkable, as we have not included enough information in the model for distinct records to be adequately distinguished; this is not an issue in our more detailed model.\n"
- ]
- },
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# linker_simple.m_u_parameters_chart()\n",
+ "linker_detailed.visualisations.m_u_parameters_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67321657-d2a0-4f7c-b68a-a906a210547e",
+ "metadata": {},
+ "source": [
+    "It is also useful to have a look at unlinkable records - these are records which do not contain enough information to be linked at some match probability threshold. We can figure this out by seeing whether records are able to be matched with themselves.\n",
+ "\n",
+ "This is of course relative to the information we have put into the model - we see that in our simple model, at a 99% match threshold nearly 10% of records are unlinkable, as we have not included enough information in the model for distinct records to be adequately distinguished; this is not an issue in our more detailed model.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "149962d6-a2ad-412f-aa05-8697beb12ed0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:55.411718Z",
+ "iopub.status.busy": "2024-06-07T09:16:55.411484Z",
+ "iopub.status.idle": "2024-06-07T09:16:57.179378Z",
+ "shell.execute_reply": "2024-06-07T09:16:57.178861Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 32,
- "id": "149962d6-a2ad-412f-aa05-8697beb12ed0",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:55.411718Z",
- "iopub.status.busy": "2024-06-07T09:16:55.411484Z",
- "iopub.status.idle": "2024-06-07T09:16:57.179378Z",
- "shell.execute_reply": "2024-06-07T09:16:57.178861Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_simple.evaluation.unlinkables_chart()"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_simple.evaluation.unlinkables_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "cac493dd-ea43-4550-8fd4-f758ae90ed75",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:57.182832Z",
+ "iopub.status.busy": "2024-06-07T09:16:57.182595Z",
+ "iopub.status.idle": "2024-06-07T09:16:57.517285Z",
+ "shell.execute_reply": "2024-06-07T09:16:57.516677Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 33,
- "id": "cac493dd-ea43-4550-8fd4-f758ae90ed75",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:57.182832Z",
- "iopub.status.busy": "2024-06-07T09:16:57.182595Z",
- "iopub.status.idle": "2024-06-07T09:16:57.517285Z",
- "shell.execute_reply": "2024-06-07T09:16:57.516677Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_detailed.evaluation.unlinkables_chart()"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
- {
- "cell_type": "markdown",
- "id": "66244ba3-7397-466a-889e-c85f90db1e82",
- "metadata": {},
- "source": [
- "Our simple model doesn't do _terribly_, but suffers if we want to have a high match probability --- to be 99% (match weight ~7) certain of matches we have ~10% of records that we will be unable to link.\n",
- "\n",
- "Our detailed model, however, has enough nuance that we can at least self-link records.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "061e1355-557a-457d-92b6-2589b32371da",
- "metadata": {},
- "source": [
- "### Predictions\n",
- "\n",
- "Now that we have had a look into the details of the models, we will focus on only our more detailed model, which should be able to capture more of the genuine links in our data\n"
- ]
- },
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_detailed.evaluation.unlinkables_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66244ba3-7397-466a-889e-c85f90db1e82",
+ "metadata": {},
+ "source": [
+ "Our simple model doesn't do _terribly_, but suffers if we want to have a high match probability --- to be 99% (match weight ~7) certain of matches we have ~10% of records that we will be unable to link.\n",
+ "\n",
+ "Our detailed model, however, has enough nuance that we can at least self-link records.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "061e1355-557a-457d-92b6-2589b32371da",
+ "metadata": {},
+ "source": [
+ "### Predictions\n",
+ "\n",
+ "Now that we have had a look into the details of the models, we will focus on only our more detailed model, which should be able to capture more of the genuine links in our data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "03348477-c3c1-42e7-a8af-8f678acc9d58",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:16:57.520557Z",
+ "iopub.status.busy": "2024-06-07T09:16:57.520288Z",
+ "iopub.status.idle": "2024-06-07T09:17:01.939499Z",
+ "shell.execute_reply": "2024-06-07T09:17:01.938793Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 34,
- "id": "03348477-c3c1-42e7-a8af-8f678acc9d58",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:16:57.520557Z",
- "iopub.status.busy": "2024-06-07T09:16:57.520288Z",
- "iopub.status.idle": "2024-06-07T09:17:01.939499Z",
- "shell.execute_reply": "2024-06-07T09:17:01.938793Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "dbdb93185654405cb9f38df6298ebb2c",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "dbdb93185654405cb9f38df6298ebb2c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " source_dataset_l | \n",
- " source_dataset_r | \n",
- " rec_id_l | \n",
- " rec_id_r | \n",
- " given_name_l | \n",
- " given_name_r | \n",
- " gamma_given_name | \n",
- " tf_given_name_l | \n",
- " ... | \n",
- " gamma_postcode | \n",
- " tf_postcode_l | \n",
- " tf_postcode_r | \n",
- " bf_postcode | \n",
- " bf_tf_adj_postcode | \n",
- " address_1_l | \n",
- " address_1_r | \n",
- " state_l | \n",
- " state_r | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " -1.830001 | \n",
- " 0.219521 | \n",
- " __splink__input_table_0 | \n",
- " __splink__input_table_1 | \n",
- " rec-760-org | \n",
- " rec-3951-dup-0 | \n",
- " lachlan | \n",
- " lachlan | \n",
- " 4 | \n",
- " 0.0113 | \n",
- " ... | \n",
- " 3 | \n",
- " 0.0007 | \n",
- " 0.0007 | \n",
- " 759.407155 | \n",
- " 1.583362 | \n",
- " bushby close | \n",
- " templestoew avenue | \n",
- " nsw | \n",
- " vic | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " -1.801736 | \n",
- " 0.222896 | \n",
- " __splink__input_table_0 | \n",
- " __splink__input_table_1 | \n",
- " rec-4980-org | \n",
- " rec-4980-dup-0 | \n",
- " isabella | \n",
- " ctercteko | \n",
- " 0 | \n",
- " 0.0069 | \n",
- " ... | \n",
- " 3 | \n",
- " 0.0004 | \n",
- " 0.0004 | \n",
- " 759.407155 | \n",
- " 2.770884 | \n",
- " sturt avenue | \n",
- " sturta venue | \n",
- " vic | \n",
- " vic | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " -1.271794 | \n",
- " 0.292859 | \n",
- " __splink__input_table_0 | \n",
- " __splink__input_table_1 | \n",
- " rec-585-org | \n",
- " rec-585-dup-0 | \n",
- " danny | \n",
- " stephenson | \n",
- " 0 | \n",
- " 0.0001 | \n",
- " ... | \n",
- " 2 | \n",
- " 0.0016 | \n",
- " 0.0012 | \n",
- " 11.264825 | \n",
- " 1.000000 | \n",
- " o'shanassy street | \n",
- " o'shanassy street | \n",
- " tas | \n",
- " tas | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " -1.213441 | \n",
- " 0.301305 | \n",
- " __splink__input_table_0 | \n",
- " __splink__input_table_1 | \n",
- " rec-1250-org | \n",
- " rec-1250-dup-0 | \n",
- " luke | \n",
- " gazzola | \n",
- " 0 | \n",
- " 0.0055 | \n",
- " ... | \n",
- " 2 | \n",
- " 0.0015 | \n",
- " 0.0002 | \n",
- " 11.264825 | \n",
- " 1.000000 | \n",
- " newman morris circuit | \n",
- " newman morr is circuit | \n",
- " nsw | \n",
- " nsw | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " -0.380336 | \n",
- " 0.434472 | \n",
- " __splink__input_table_0 | \n",
- " __splink__input_table_1 | \n",
- " rec-4763-org | \n",
- " rec-4763-dup-0 | \n",
- " max | \n",
- " alisha | \n",
- " 0 | \n",
- " 0.0021 | \n",
- " ... | \n",
- " 1 | \n",
- " 0.0004 | \n",
- " 0.0016 | \n",
- " 0.043565 | \n",
- " 1.000000 | \n",
- " duffy street | \n",
- " duffy s treet | \n",
- " nsw | \n",
- " nsw | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 47 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability source_dataset_l \\\n",
- "0 -1.830001 0.219521 __splink__input_table_0 \n",
- "1 -1.801736 0.222896 __splink__input_table_0 \n",
- "2 -1.271794 0.292859 __splink__input_table_0 \n",
- "3 -1.213441 0.301305 __splink__input_table_0 \n",
- "4 -0.380336 0.434472 __splink__input_table_0 \n",
- "\n",
- " source_dataset_r rec_id_l rec_id_r given_name_l \\\n",
- "0 __splink__input_table_1 rec-760-org rec-3951-dup-0 lachlan \n",
- "1 __splink__input_table_1 rec-4980-org rec-4980-dup-0 isabella \n",
- "2 __splink__input_table_1 rec-585-org rec-585-dup-0 danny \n",
- "3 __splink__input_table_1 rec-1250-org rec-1250-dup-0 luke \n",
- "4 __splink__input_table_1 rec-4763-org rec-4763-dup-0 max \n",
- "\n",
- " given_name_r gamma_given_name tf_given_name_l ... gamma_postcode \\\n",
- "0 lachlan 4 0.0113 ... 3 \n",
- "1 ctercteko 0 0.0069 ... 3 \n",
- "2 stephenson 0 0.0001 ... 2 \n",
- "3 gazzola 0 0.0055 ... 2 \n",
- "4 alisha 0 0.0021 ... 1 \n",
- "\n",
- " tf_postcode_l tf_postcode_r bf_postcode bf_tf_adj_postcode \\\n",
- "0 0.0007 0.0007 759.407155 1.583362 \n",
- "1 0.0004 0.0004 759.407155 2.770884 \n",
- "2 0.0016 0.0012 11.264825 1.000000 \n",
- "3 0.0015 0.0002 11.264825 1.000000 \n",
- "4 0.0004 0.0016 0.043565 1.000000 \n",
- "\n",
- " address_1_l address_1_r state_l state_r \\\n",
- "0 bushby close templestoew avenue nsw vic \n",
- "1 sturt avenue sturta venue vic vic \n",
- "2 o'shanassy street o'shanassy street tas tas \n",
- "3 newman morris circuit newman morr is circuit nsw nsw \n",
- "4 duffy street duffy s treet nsw nsw \n",
- "\n",
- " match_key \n",
- "0 0 \n",
- "1 2 \n",
- "2 1 \n",
- "3 1 \n",
- "4 2 \n",
- "\n",
- "[5 rows x 47 columns]"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "predictions = linker_detailed.inference.predict(threshold_match_probability=0.2)\n",
- "df_predictions = predictions.as_pandas_dataframe()\n",
- "df_predictions.head(5)"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "markdown",
- "id": "fd32d127-5012-42e8-9f69-89237992a793",
- "metadata": {},
- "source": [
- "We can see how our model performs at different probability thresholds, with a couple of options depending on the space we wish to view things\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 41,
- "id": "ce8d409c-7ef5-4485-9ec0-8b539fdecb1f",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:01.942896Z",
- "iopub.status.busy": "2024-06-07T09:17:01.942661Z",
- "iopub.status.idle": "2024-06-07T09:17:04.159161Z",
- "shell.execute_reply": "2024-06-07T09:17:04.158614Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " rec_id_l | \n",
+ " rec_id_r | \n",
+ " given_name_l | \n",
+ " given_name_r | \n",
+ " gamma_given_name | \n",
+ " tf_given_name_l | \n",
+ " ... | \n",
+ " gamma_postcode | \n",
+ " tf_postcode_l | \n",
+ " tf_postcode_r | \n",
+ " bf_postcode | \n",
+ " bf_tf_adj_postcode | \n",
+ " address_1_l | \n",
+ " address_1_r | \n",
+ " state_l | \n",
+ " state_r | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " -1.830001 | \n",
+ " 0.219521 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " rec-760-org | \n",
+ " rec-3951-dup-0 | \n",
+ " lachlan | \n",
+ " lachlan | \n",
+ " 4 | \n",
+ " 0.0113 | \n",
+ " ... | \n",
+ " 3 | \n",
+ " 0.0007 | \n",
+ " 0.0007 | \n",
+ " 759.407155 | \n",
+ " 1.583362 | \n",
+ " bushby close | \n",
+ " templestoew avenue | \n",
+ " nsw | \n",
+ " vic | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " -1.801736 | \n",
+ " 0.222896 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " rec-4980-org | \n",
+ " rec-4980-dup-0 | \n",
+ " isabella | \n",
+ " ctercteko | \n",
+ " 0 | \n",
+ " 0.0069 | \n",
+ " ... | \n",
+ " 3 | \n",
+ " 0.0004 | \n",
+ " 0.0004 | \n",
+ " 759.407155 | \n",
+ " 2.770884 | \n",
+ " sturt avenue | \n",
+ " sturta venue | \n",
+ " vic | \n",
+ " vic | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " -1.271794 | \n",
+ " 0.292859 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " rec-585-org | \n",
+ " rec-585-dup-0 | \n",
+ " danny | \n",
+ " stephenson | \n",
+ " 0 | \n",
+ " 0.0001 | \n",
+ " ... | \n",
+ " 2 | \n",
+ " 0.0016 | \n",
+ " 0.0012 | \n",
+ " 11.264825 | \n",
+ " 1.000000 | \n",
+ " o'shanassy street | \n",
+ " o'shanassy street | \n",
+ " tas | \n",
+ " tas | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " -1.213441 | \n",
+ " 0.301305 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " rec-1250-org | \n",
+ " rec-1250-dup-0 | \n",
+ " luke | \n",
+ " gazzola | \n",
+ " 0 | \n",
+ " 0.0055 | \n",
+ " ... | \n",
+ " 2 | \n",
+ " 0.0015 | \n",
+ " 0.0002 | \n",
+ " 11.264825 | \n",
+ " 1.000000 | \n",
+ " newman morris circuit | \n",
+ " newman morr is circuit | \n",
+ " nsw | \n",
+ " nsw | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " -0.380336 | \n",
+ " 0.434472 | \n",
+ " __splink__input_table_0 | \n",
+ " __splink__input_table_1 | \n",
+ " rec-4763-org | \n",
+ " rec-4763-dup-0 | \n",
+ " max | \n",
+ " alisha | \n",
+ " 0 | \n",
+ " 0.0021 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0.0004 | \n",
+ " 0.0016 | \n",
+ " 0.043565 | \n",
+ " 1.000000 | \n",
+ " duffy street | \n",
+ " duffy s treet | \n",
+ " nsw | \n",
+ " nsw | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 47 columns
\n",
+ "
"
],
- "source": [
- "linker_detailed.evaluation.accuracy_analysis_from_labels_column(\n",
- " \"cluster\", output_type=\"accuracy\"\n",
- ")"
+ "text/plain": [
+ " match_weight match_probability source_dataset_l \\\n",
+ "0 -1.830001 0.219521 __splink__input_table_0 \n",
+ "1 -1.801736 0.222896 __splink__input_table_0 \n",
+ "2 -1.271794 0.292859 __splink__input_table_0 \n",
+ "3 -1.213441 0.301305 __splink__input_table_0 \n",
+ "4 -0.380336 0.434472 __splink__input_table_0 \n",
+ "\n",
+ " source_dataset_r rec_id_l rec_id_r given_name_l \\\n",
+ "0 __splink__input_table_1 rec-760-org rec-3951-dup-0 lachlan \n",
+ "1 __splink__input_table_1 rec-4980-org rec-4980-dup-0 isabella \n",
+ "2 __splink__input_table_1 rec-585-org rec-585-dup-0 danny \n",
+ "3 __splink__input_table_1 rec-1250-org rec-1250-dup-0 luke \n",
+ "4 __splink__input_table_1 rec-4763-org rec-4763-dup-0 max \n",
+ "\n",
+ " given_name_r gamma_given_name tf_given_name_l ... gamma_postcode \\\n",
+ "0 lachlan 4 0.0113 ... 3 \n",
+ "1 ctercteko 0 0.0069 ... 3 \n",
+ "2 stephenson 0 0.0001 ... 2 \n",
+ "3 gazzola 0 0.0055 ... 2 \n",
+ "4 alisha 0 0.0021 ... 1 \n",
+ "\n",
+ " tf_postcode_l tf_postcode_r bf_postcode bf_tf_adj_postcode \\\n",
+ "0 0.0007 0.0007 759.407155 1.583362 \n",
+ "1 0.0004 0.0004 759.407155 2.770884 \n",
+ "2 0.0016 0.0012 11.264825 1.000000 \n",
+ "3 0.0015 0.0002 11.264825 1.000000 \n",
+ "4 0.0004 0.0016 0.043565 1.000000 \n",
+ "\n",
+ " address_1_l address_1_r state_l state_r \\\n",
+ "0 bushby close templestoew avenue nsw vic \n",
+ "1 sturt avenue sturta venue vic vic \n",
+ "2 o'shanassy street o'shanassy street tas tas \n",
+ "3 newman morris circuit newman morr is circuit nsw nsw \n",
+ "4 duffy street duffy s treet nsw nsw \n",
+ "\n",
+ " match_key \n",
+ "0 0 \n",
+ "1 2 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 2 \n",
+ "\n",
+ "[5 rows x 47 columns]"
]
- },
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predictions = linker_detailed.inference.predict(threshold_match_probability=0.2)\n",
+ "df_predictions = predictions.as_pandas_dataframe()\n",
+ "df_predictions.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd32d127-5012-42e8-9f69-89237992a793",
+ "metadata": {},
+ "source": [
+    "We can see how our model performs at different probability thresholds, with a couple of options depending on the space in which we wish to view things\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "ce8d409c-7ef5-4485-9ec0-8b539fdecb1f",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:01.942896Z",
+ "iopub.status.busy": "2024-06-07T09:17:01.942661Z",
+ "iopub.status.idle": "2024-06-07T09:17:04.159161Z",
+ "shell.execute_reply": "2024-06-07T09:17:04.158614Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "568b990e-982a-4adc-9629-06ba30f872b0",
- "metadata": {},
- "source": [
- "and we can easily see how many individuals we identify and link by looking at clusters generated at some threshold match probability of interest - in this example 99%\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 36,
- "id": "ade53248-212f-4776-8d7d-4632b1749425",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:04.165374Z",
- "iopub.status.busy": "2024-06-07T09:17:04.165099Z",
- "iopub.status.idle": "2024-06-07T09:17:04.301694Z",
- "shell.execute_reply": "2024-06-07T09:17:04.301045Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 1, root rows count 0\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "2 4959\n",
- "1 82\n",
- "Name: count, dtype: int64"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "clusters = linker_detailed.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " predictions, threshold_match_probability=0.99\n",
- ")\n",
- "df_clusters = clusters.as_pandas_dataframe().sort_values(\"cluster_id\")\n",
- "df_clusters.groupby(\"cluster_id\").size().value_counts()"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_detailed.evaluation.accuracy_analysis_from_labels_column(\n",
+ " \"cluster\", output_type=\"accuracy\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "568b990e-982a-4adc-9629-06ba30f872b0",
+ "metadata": {},
+ "source": [
+ "and we can easily see how many individuals we identify and link by looking at clusters generated at some threshold match probability of interest - in this example 99%\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "ade53248-212f-4776-8d7d-4632b1749425",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:04.165374Z",
+ "iopub.status.busy": "2024-06-07T09:17:04.165099Z",
+ "iopub.status.idle": "2024-06-07T09:17:04.301694Z",
+ "shell.execute_reply": "2024-06-07T09:17:04.301045Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "6f7dbde5-c588-4930-bace-21642f250395",
- "metadata": {},
- "source": [
- "In this case, we happen to know what the true links are, so we can manually inspect the ones that are doing worst to see what our model is not capturing - i.e. where we have false negatives.\n",
- "\n",
- "Similarly, we can look at the non-links which are performing the best, to see whether we have an issue with false positives.\n",
- "\n",
- "Ordinarily we would not have this luxury, and so would need to dig a bit deeper for clues as to how to improve our model, such as manually inspecting records across threshold probabilities,\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 1, root rows count 0\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 37,
- "id": "ef77a8b1-1119-4cb0-b299-343a4022d65e",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:04.305169Z",
- "iopub.status.busy": "2024-06-07T09:17:04.304886Z",
- "iopub.status.idle": "2024-06-07T09:17:04.322035Z",
- "shell.execute_reply": "2024-06-07T09:17:04.321351Z"
- }
- },
- "outputs": [],
- "source": [
- "df_predictions[\"cluster_l\"] = df_predictions[\"rec_id_l\"].apply(\n",
- " lambda x: \"-\".join(x.split(\"-\")[:2])\n",
- ")\n",
- "df_predictions[\"cluster_r\"] = df_predictions[\"rec_id_r\"].apply(\n",
- " lambda x: \"-\".join(x.split(\"-\")[:2])\n",
- ")\n",
- "df_true_links = df_predictions[\n",
- " df_predictions[\"cluster_l\"] == df_predictions[\"cluster_r\"]\n",
- "].sort_values(\"match_probability\")"
+ "data": {
+ "text/plain": [
+ "2 4959\n",
+ "1 82\n",
+ "Name: count, dtype: int64"
]
- },
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clusters = linker_detailed.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " predictions, threshold_match_probability=0.99\n",
+ ")\n",
+ "df_clusters = clusters.as_pandas_dataframe().sort_values(\"cluster_id\")\n",
+ "df_clusters.groupby(\"cluster_id\").size().value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f7dbde5-c588-4930-bace-21642f250395",
+ "metadata": {},
+ "source": [
+ "In this case, we happen to know what the true links are, so we can manually inspect the ones that are doing worst to see what our model is not capturing - i.e. where we have false negatives.\n",
+ "\n",
+ "Similarly, we can look at the non-links which are performing the best, to see whether we have an issue with false positives.\n",
+ "\n",
+ "Ordinarily we would not have this luxury, and so would need to dig a bit deeper for clues as to how to improve our model, such as manually inspecting records across threshold probabilities.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "ef77a8b1-1119-4cb0-b299-343a4022d65e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:04.305169Z",
+ "iopub.status.busy": "2024-06-07T09:17:04.304886Z",
+ "iopub.status.idle": "2024-06-07T09:17:04.322035Z",
+ "shell.execute_reply": "2024-06-07T09:17:04.321351Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "df_predictions[\"cluster_l\"] = df_predictions[\"rec_id_l\"].apply(\n",
+ " lambda x: \"-\".join(x.split(\"-\")[:2])\n",
+ ")\n",
+ "df_predictions[\"cluster_r\"] = df_predictions[\"rec_id_r\"].apply(\n",
+ " lambda x: \"-\".join(x.split(\"-\")[:2])\n",
+ ")\n",
+ "df_true_links = df_predictions[\n",
+ " df_predictions[\"cluster_l\"] == df_predictions[\"cluster_r\"]\n",
+ "].sort_values(\"match_probability\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "bc531ca3-fe0d-480d-b059-a7125474fb22",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:04.325739Z",
+ "iopub.status.busy": "2024-06-07T09:17:04.325483Z",
+ "iopub.status.idle": "2024-06-07T09:17:04.966790Z",
+ "shell.execute_reply": "2024-06-07T09:17:04.966182Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 38,
- "id": "bc531ca3-fe0d-480d-b059-a7125474fb22",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:04.325739Z",
- "iopub.status.busy": "2024-06-07T09:17:04.325483Z",
- "iopub.status.idle": "2024-06-07T09:17:04.966790Z",
- "shell.execute_reply": "2024-06-07T09:17:04.966182Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "records_to_view = 3\n",
- "linker_detailed.visualisations.waterfall_chart(\n",
- " df_true_links.head(records_to_view).to_dict(orient=\"records\")\n",
- ")"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "records_to_view = 3\n",
+ "linker_detailed.visualisations.waterfall_chart(\n",
+ " df_true_links.head(records_to_view).to_dict(orient=\"records\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "aacd9042-5672-4bc4-aa98-940d1f5fd28a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:04.969789Z",
+ "iopub.status.busy": "2024-06-07T09:17:04.969553Z",
+ "iopub.status.idle": "2024-06-07T09:17:05.445307Z",
+ "shell.execute_reply": "2024-06-07T09:17:05.444530Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 39,
- "id": "aacd9042-5672-4bc4-aa98-940d1f5fd28a",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:04.969789Z",
- "iopub.status.busy": "2024-06-07T09:17:04.969553Z",
- "iopub.status.idle": "2024-06-07T09:17:05.445307Z",
- "shell.execute_reply": "2024-06-07T09:17:05.444530Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "df_non_links = df_predictions[\n",
- " df_predictions[\"cluster_l\"] != df_predictions[\"cluster_r\"]\n",
- "].sort_values(\"match_probability\", ascending=False)\n",
- "linker_detailed.visualisations.waterfall_chart(\n",
- " df_non_links.head(records_to_view).to_dict(orient=\"records\")\n",
- ")"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_non_links = df_predictions[\n",
+ " df_predictions[\"cluster_l\"] != df_predictions[\"cluster_r\"]\n",
+ "].sort_values(\"match_probability\", ascending=False)\n",
+ "linker_detailed.visualisations.waterfall_chart(\n",
+ " df_non_links.head(records_to_view).to_dict(orient=\"records\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "99abfc68-61a0-4290-be22-7243680b5ee1",
+ "metadata": {},
+ "source": [
+ "## Further refinements\n",
+ "\n",
+ "Looking at the non-links we have done well in having no false positives at any substantial match probability --- however looking at some of the true links we can see that there are a few that we are not capturing with sufficient match probability.\n",
+ "\n",
+ "We can see that there are a few features that we are not capturing/weighting appropriately\n",
+ "\n",
+ "- single-character transpositions, particularly in postcode (which is being lumped in with more 'severe typos'/probable non-matches)\n",
+ "- given/sur-names being swapped with typos\n",
+ "- given/sur-names cross-matching on one pair only, with no match on the other pair\n",
+ "\n",
+ "We will quickly see if we can incorporate these features into a new model. As we are now going into more detail with the inter-relationship between given name and surname, it is probably no longer sensible to model them as independent comparisons, and so we will need to switch to a combined comparison on full name.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "2a7229da-9f79-4151-a6b1-018d17205f5f",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:05.448836Z",
+ "iopub.status.busy": "2024-06-07T09:17:05.448543Z",
+ "iopub.status.idle": "2024-06-07T09:17:05.460100Z",
+ "shell.execute_reply": "2024-06-07T09:17:05.459191Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# we need to append a full name column to our source data frames\n",
+ "# so that we can use it for term frequency adjustments\n",
+ "dfs[0][\"full_name\"] = dfs[0][\"given_name\"] + \"_\" + dfs[0][\"surname\"]\n",
+ "dfs[1][\"full_name\"] = dfs[1][\"given_name\"] + \"_\" + dfs[1][\"surname\"]\n",
+ "\n",
+ "\n",
+ "extended_model_settings = {\n",
+ " \"unique_id_column_name\": \"rec_id\",\n",
+ " \"link_type\": \"link_only\",\n",
+ " \"blocking_rules_to_generate_predictions\": blocking_rules,\n",
+ " \"comparisons\": [\n",
+ " {\n",
+ " \"output_column_name\": \"Full name\",\n",
+ " \"comparison_levels\": [\n",
+ " {\n",
+ " \"sql_condition\": \"(given_name_l IS NULL OR given_name_r IS NULL) and (surname_l IS NULL OR surname_r IS NULL)\",\n",
+ " \"label_for_charts\": \"Null\",\n",
+ " \"is_null_level\": True,\n",
+ " },\n",
+ " # full name match\n",
+ " cll.ExactMatchLevel(\"full_name\", term_frequency_adjustments=True),\n",
+ " # typos - keep levels across full name rather than scoring separately\n",
+ " cll.JaroWinklerLevel(\"full_name\", 0.9),\n",
+ " cll.JaroWinklerLevel(\"full_name\", 0.7),\n",
+ " # name switched\n",
+ " cll.ColumnsReversedLevel(\"given_name\", \"surname\"),\n",
+ " # name switched + typo\n",
+ " {\n",
+ " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.8\",\n",
+ " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.8\",\n",
+ " },\n",
+ " {\n",
+ " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.4\",\n",
+ " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.4\",\n",
+ " },\n",
+ " # single name match\n",
+ " cll.ExactMatchLevel(\"given_name\", term_frequency_adjustments=True),\n",
+ " cll.ExactMatchLevel(\"surname\", term_frequency_adjustments=True),\n",
+ " # single name cross-match\n",
+ " {\n",
+ " \"sql_condition\": \"given_name_l = surname_r OR surname_l = given_name_r\",\n",
+ " \"label_for_charts\": \"single name cross-matches\",\n",
+ " }, # single name typos\n",
+ " cll.JaroWinklerLevel(\"given_name\", 0.9),\n",
+ " cll.JaroWinklerLevel(\"surname\", 0.9),\n",
+ " # the rest\n",
+ " cll.ElseLevel(),\n",
+ " ],\n",
+ " },\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"date_of_birth\",\n",
+ " input_is_string=True,\n",
+ " datetime_format=\"%Y%m%d\",\n",
+ " invalid_dates_as_null=True,\n",
+ " ),\n",
+ " {\n",
+ " \"output_column_name\": \"Social security ID\",\n",
+ " \"comparison_levels\": [\n",
+ " cll.NullLevel(\"soc_sec_id\"),\n",
+ " cll.ExactMatchLevel(\"soc_sec_id\", term_frequency_adjustments=True),\n",
+ " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 1),\n",
+ " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 2),\n",
+ " cll.ElseLevel(),\n",
+ " ],\n",
+ " },\n",
+ " {\n",
+ " \"output_column_name\": \"Street number\",\n",
+ " \"comparison_levels\": [\n",
+ " cll.NullLevel(\"street_number\"),\n",
+ " cll.ExactMatchLevel(\"street_number\", term_frequency_adjustments=True),\n",
+ " cll.DamerauLevenshteinLevel(\"street_number\", 1),\n",
+ " cll.ElseLevel(),\n",
+ " ],\n",
+ " },\n",
+ " {\n",
+ " \"output_column_name\": \"Postcode\",\n",
+ " \"comparison_levels\": [\n",
+ " cll.NullLevel(\"postcode\"),\n",
+ " cll.ExactMatchLevel(\"postcode\", term_frequency_adjustments=True),\n",
+ " cll.DamerauLevenshteinLevel(\"postcode\", 1),\n",
+ " cll.DamerauLevenshteinLevel(\"postcode\", 2),\n",
+ " cll.ElseLevel(),\n",
+ " ],\n",
+ " },\n",
+ " # we don't consider further location columns as they will be strongly correlated with postcode\n",
+ " ],\n",
+ " \"retain_intermediate_calculation_columns\": True,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1581eeeb-246b-46de-be88-ba4dc821fce7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:17:05.463764Z",
+ "iopub.status.busy": "2024-06-07T09:17:05.463499Z",
+ "iopub.status.idle": "2024-06-07T09:18:25.606071Z",
+ "shell.execute_reply": "2024-06-07T09:18:25.605371Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "99abfc68-61a0-4290-be22-7243680b5ee1",
- "metadata": {},
- "source": [
- "## Further refinements\n",
- "\n",
- "Looking at the non-links we have done well in having no false positives at any substantial match probability --- however looking at some of the true links we can see that there are a few that we are not capturing with sufficient match probability.\n",
- "\n",
- "We can see that there are a few features that we are not capturing/weighting appropriately\n",
- "\n",
- "- single-character transpostions, particularly in postcode (which is being lumped in with more 'severe typos'/probable non-matches)\n",
- "- given/sur-names being swapped with typos\n",
- "- given/sur-names being cross-matches on one only, with no match on the other cross\n",
- "\n",
- "We will quickly see if we can incorporate these features into a new model. As we are now going into more detail with the inter-relationship between given name and surname, it is probably no longer sensible to model them as independent comparisons, and so we will need to switch to a combined comparison on full name.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.000239.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n",
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 42,
- "id": "2a7229da-9f79-4151-a6b1-018d17205f5f",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:05.448836Z",
- "iopub.status.busy": "2024-06-07T09:17:05.448543Z",
- "iopub.status.idle": "2024-06-07T09:17:05.460100Z",
- "shell.execute_reply": "2024-06-07T09:17:05.459191Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "80aa82bbc2884ddcb71df130ed5b4edc",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [],
- "source": [
- "# we need to append a full name column to our source data frames\n",
- "# so that we can use it for term frequency adjustments\n",
- "dfs[0][\"full_name\"] = dfs[0][\"given_name\"] + \"_\" + dfs[0][\"surname\"]\n",
- "dfs[1][\"full_name\"] = dfs[1][\"given_name\"] + \"_\" + dfs[1][\"surname\"]\n",
- "\n",
- "\n",
- "extended_model_settings = {\n",
- " \"unique_id_column_name\": \"rec_id\",\n",
- " \"link_type\": \"link_only\",\n",
- " \"blocking_rules_to_generate_predictions\": blocking_rules,\n",
- " \"comparisons\": [\n",
- " {\n",
- " \"output_column_name\": \"Full name\",\n",
- " \"comparison_levels\": [\n",
- " {\n",
- " \"sql_condition\": \"(given_name_l IS NULL OR given_name_r IS NULL) and (surname_l IS NULL OR surname_r IS NULL)\",\n",
- " \"label_for_charts\": \"Null\",\n",
- " \"is_null_level\": True,\n",
- " },\n",
- " # full name match\n",
- " cll.ExactMatchLevel(\"full_name\", term_frequency_adjustments=True),\n",
- " # typos - keep levels across full name rather than scoring separately\n",
- " cll.JaroWinklerLevel(\"full_name\", 0.9),\n",
- " cll.JaroWinklerLevel(\"full_name\", 0.7),\n",
- " # name switched\n",
- " cll.ColumnsReversedLevel(\"given_name\", \"surname\"),\n",
- " # name switched + typo\n",
- " {\n",
- " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.8\",\n",
- " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.8\",\n",
- " },\n",
- " {\n",
- " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.4\",\n",
- " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.4\",\n",
- " },\n",
- " # single name match\n",
- " cll.ExactMatchLevel(\"given_name\", term_frequency_adjustments=True),\n",
- " cll.ExactMatchLevel(\"surname\", term_frequency_adjustments=True),\n",
- " # single name cross-match\n",
- " {\n",
- " \"sql_condition\": \"given_name_l = surname_r OR surname_l = given_name_r\",\n",
- " \"label_for_charts\": \"single name cross-matches\",\n",
- " }, # single name typos\n",
- " cll.JaroWinklerLevel(\"given_name\", 0.9),\n",
- " cll.JaroWinklerLevel(\"surname\", 0.9),\n",
- " # the rest\n",
- " cll.ElseLevel(),\n",
- " ],\n",
- " },\n",
- " cl.DateOfBirthComparison(\n",
- " \"date_of_birth\",\n",
- " input_is_string=True,\n",
- " datetime_format=\"%Y%m%d\",\n",
- " invalid_dates_as_null=True,\n",
- " ),\n",
- " {\n",
- " \"output_column_name\": \"Social security ID\",\n",
- " \"comparison_levels\": [\n",
- " cll.NullLevel(\"soc_sec_id\"),\n",
- " cll.ExactMatchLevel(\"soc_sec_id\", term_frequency_adjustments=True),\n",
- " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 1),\n",
- " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 2),\n",
- " cll.ElseLevel(),\n",
- " ],\n",
- " },\n",
- " {\n",
- " \"output_column_name\": \"Street number\",\n",
- " \"comparison_levels\": [\n",
- " cll.NullLevel(\"street_number\"),\n",
- " cll.ExactMatchLevel(\"street_number\", term_frequency_adjustments=True),\n",
- " cll.DamerauLevenshteinLevel(\"street_number\", 1),\n",
- " cll.ElseLevel(),\n",
- " ],\n",
- " },\n",
- " {\n",
- " \"output_column_name\": \"Postcode\",\n",
- " \"comparison_levels\": [\n",
- " cll.NullLevel(\"postcode\"),\n",
- " cll.ExactMatchLevel(\"postcode\", term_frequency_adjustments=True),\n",
- " cll.DamerauLevenshteinLevel(\"postcode\", 1),\n",
- " cll.DamerauLevenshteinLevel(\"postcode\", 2),\n",
- " cll.ElseLevel(),\n",
- " ],\n",
- " },\n",
- " # we don't consider further location columns as they will be strongly correlated with postcode\n",
- " ],\n",
- " \"retain_intermediate_calculation_columns\": True,\n",
- "}"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 43,
- "id": "1581eeeb-246b-46de-be88-ba4dc821fce7",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:17:05.463764Z",
- "iopub.status.busy": "2024-06-07T09:17:05.463499Z",
- "iopub.status.idle": "2024-06-07T09:18:25.606071Z",
- "shell.execute_reply": "2024-06-07T09:18:25.605371Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.000239.\n",
- "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n",
- "----- Estimating u probabilities using random sampling -----\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "80aa82bbc2884ddcb71df130ed5b4edc",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - Full name (no m values are trained).\n",
- " - date_of_birth (some u values are not trained, no m values are trained).\n",
- " - Social security ID (no m values are trained).\n",
- " - Street number (no m values are trained).\n",
- " - Postcode (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "# train\n",
- "linker_advanced = Linker(dfs, extended_model_settings, db_api=DuckDBAPI())\n",
- "linker_advanced.training.estimate_probability_two_random_records_match(\n",
- " deterministic_rules, recall=0.8\n",
- ")\n",
- "# We recommend increasing target rows to 1e8 improve accuracy for u\n",
- "# values in full name comparison, as we have subdivided the data more finely\n",
- "\n",
- "# Here, 1e7 for speed\n",
- "linker_advanced.training.estimate_u_using_random_sampling(max_pairs=1e7)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - Full name (no m values are trained).\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n",
+ " - Social security ID (no m values are trained).\n",
+ " - Street number (no m values are trained).\n",
+ " - Postcode (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "# train\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "linker_advanced = Linker(dfs_sdf, extended_model_settings)\n",
+ "linker_advanced.training.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.8\n",
+ ")\n",
+ "# We recommend increasing max_pairs to 1e8 to improve accuracy for u\n",
+ "# values in full name comparison, as we have subdivided the data more finely\n",
+ "\n",
+ "# Here, 1e7 for speed\n",
+ "linker_advanced.training.estimate_u_using_random_sampling(max_pairs=1e7)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "265f0651",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:25.610698Z",
+ "iopub.status.busy": "2024-06-07T09:18:25.610416Z",
+ "iopub.status.idle": "2024-06-07T09:18:26.522700Z",
+ "shell.execute_reply": "2024-06-07T09:18:26.522017Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 44,
- "id": "265f0651",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:25.610698Z",
- "iopub.status.busy": "2024-06-07T09:18:25.610416Z",
- "iopub.status.idle": "2024-06-07T09:18:26.522700Z",
- "shell.execute_reply": "2024-06-07T09:18:26.522017Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.date_of_birth = r.date_of_birth\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - Full name\n",
- " - Social security ID\n",
- " - Street number\n",
- " - Postcode\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - date_of_birth\n",
- "\n",
- "WARNING:\n",
- "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was -0.465 in the m_probability of Full name, level `Exact match on full_name`\n",
- "Iteration 2: Largest change in params was 0.00252 in the m_probability of Social security ID, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 4.98e-05 in the m_probability of Social security ID, level `All other comparisons`\n",
- "\n",
- "EM converged after 3 iterations\n",
- "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - Full name (some m values are not trained).\n",
- " - date_of_birth (some u values are not trained, no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "session_dob = (\n",
- " linker_advanced.training.estimate_parameters_using_expectation_maximisation(\n",
- " \"l.date_of_birth = r.date_of_birth\", estimate_without_term_frequencies=True\n",
- " )\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.date_of_birth = r.date_of_birth\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - Full name\n",
+ " - Social security ID\n",
+ " - Street number\n",
+ " - Postcode\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - date_of_birth\n",
+ "\n",
+ "WARNING:\n",
+ "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.465 in the m_probability of Full name, level `Exact match on full_name`\n",
+ "Iteration 2: Largest change in params was 0.00252 in the m_probability of Social security ID, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 4.98e-05 in the m_probability of Social security ID, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 3 iterations\n",
+ "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - Full name (some m values are not trained).\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "session_dob = (\n",
+ " linker_advanced.training.estimate_parameters_using_expectation_maximisation(\n",
+ " \"l.date_of_birth = r.date_of_birth\", estimate_without_term_frequencies=True\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "ebcb15c8",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:26.526171Z",
+ "iopub.status.busy": "2024-06-07T09:18:26.525914Z",
+ "iopub.status.idle": "2024-06-07T09:18:27.518982Z",
+ "shell.execute_reply": "2024-06-07T09:18:27.518364Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 45,
- "id": "ebcb15c8",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:26.526171Z",
- "iopub.status.busy": "2024-06-07T09:18:26.525914Z",
- "iopub.status.idle": "2024-06-07T09:18:27.518982Z",
- "shell.execute_reply": "2024-06-07T09:18:27.518364Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.postcode = r.postcode\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - Full name\n",
- " - date_of_birth\n",
- " - Social security ID\n",
- " - Street number\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - Postcode\n",
- "\n",
- "WARNING:\n",
- "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n",
- "Iteration 2: Largest change in params was 0.000656 in the m_probability of date_of_birth, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 1.75e-05 in the m_probability of Social security ID, level `All other comparisons`\n",
- "\n",
- "EM converged after 3 iterations\n",
- "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - Full name (some m values are not trained).\n",
- " - date_of_birth (some u values are not trained, some m values are not trained).\n"
- ]
- }
- ],
- "source": [
- "session_pc = (\n",
- " linker_advanced.training.estimate_parameters_using_expectation_maximisation(\n",
- " \"l.postcode = r.postcode\", estimate_without_term_frequencies=True\n",
- " )\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.postcode = r.postcode\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - Full name\n",
+ " - date_of_birth\n",
+ " - Social security ID\n",
+ " - Street number\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - Postcode\n",
+ "\n",
+ "WARNING:\n",
+ "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n",
+ "Iteration 2: Largest change in params was 0.000656 in the m_probability of date_of_birth, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 1.75e-05 in the m_probability of Social security ID, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 3 iterations\n",
+ "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - Full name (some m values are not trained).\n",
+ " - date_of_birth (some u values are not trained, some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "session_pc = (\n",
+ " linker_advanced.training.estimate_parameters_using_expectation_maximisation(\n",
+ " \"l.postcode = r.postcode\", estimate_without_term_frequencies=True\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "d9d21e85-b89b-435a-8b75-142166ac3f31",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:27.523341Z",
+ "iopub.status.busy": "2024-06-07T09:18:27.523109Z",
+ "iopub.status.idle": "2024-06-07T09:18:27.711081Z",
+ "shell.execute_reply": "2024-06-07T09:18:27.710381Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 46,
- "id": "d9d21e85-b89b-435a-8b75-142166ac3f31",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:27.523341Z",
- "iopub.status.busy": "2024-06-07T09:18:27.523109Z",
- "iopub.status.idle": "2024-06-07T09:18:27.711081Z",
- "shell.execute_reply": "2024-06-07T09:18:27.710381Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_advanced.visualisations.parameter_estimate_comparisons_chart()"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_advanced.visualisations.parameter_estimate_comparisons_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "4a857c18-b0d5-48dc-b7f1-1f6389db5089",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:27.746299Z",
+ "iopub.status.busy": "2024-06-07T09:18:27.744495Z",
+ "iopub.status.idle": "2024-06-07T09:18:28.388134Z",
+ "shell.execute_reply": "2024-06-07T09:18:28.387392Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 47,
- "id": "4a857c18-b0d5-48dc-b7f1-1f6389db5089",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:27.746299Z",
- "iopub.status.busy": "2024-06-07T09:18:27.744495Z",
- "iopub.status.idle": "2024-06-07T09:18:28.388134Z",
- "shell.execute_reply": "2024-06-07T09:18:28.387392Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker_advanced.visualisations.match_weights_chart()"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker_advanced.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "e1ee24d9-1def-4b8d-bb85-1c63b595e75e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:28.392069Z",
+ "iopub.status.busy": "2024-06-07T09:18:28.391745Z",
+ "iopub.status.idle": "2024-06-07T09:18:30.289569Z",
+ "shell.execute_reply": "2024-06-07T09:18:30.288893Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 48,
- "id": "e1ee24d9-1def-4b8d-bb85-1c63b595e75e",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:28.392069Z",
- "iopub.status.busy": "2024-06-07T09:18:28.391745Z",
- "iopub.status.idle": "2024-06-07T09:18:30.289569Z",
- "shell.execute_reply": "2024-06-07T09:18:30.288893Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'Full name':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " m values not fully trained\n",
- "Comparison: 'date_of_birth':\n",
- " u values not fully trained\n",
- "Completed iteration 1, root rows count 0\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "2 4960\n",
- "1 80\n",
- "Name: count, dtype: int64"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "predictions_adv = linker_advanced.inference.predict()\n",
- "df_predictions_adv = predictions_adv.as_pandas_dataframe()\n",
- "clusters_adv = linker_advanced.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " predictions_adv, threshold_match_probability=0.99\n",
- ")\n",
- "df_clusters_adv = clusters_adv.as_pandas_dataframe().sort_values(\"cluster_id\")\n",
- "df_clusters_adv.groupby(\"cluster_id\").size().value_counts()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'Full name':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " m values not fully trained\n",
+ "Comparison: 'date_of_birth':\n",
+ " u values not fully trained\n",
+ "Completed iteration 1, root rows count 0\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "8db464e7-c57b-48e1-9e7b-c01ce9ccbad9",
- "metadata": {},
- "source": [
- "This is a pretty modest improvement on our previous model - however it is worth re-iterating that we should not necessarily expect to recover _all_ matches, as in several cases it may be unreasonable for a model to have reasonable confidence that two records refer to the same entity.\n",
- "\n",
- "If we wished to improve matters we could iterate on this process - investigating where our model is not performing as we would hope, and seeing how we can adjust these areas to address these shortcomings.\n"
+ "data": {
+ "text/plain": [
+ "2 4960\n",
+ "1 80\n",
+ "Name: count, dtype: int64"
]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "1cd1512b68bf43868e26a4c0fa908d4e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "3540e7572a2e497c8837e9038728b244": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "4eab2071171e419a8f9ddbd6a12f12e4": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_d9ed287ce1f146c09c8a0e89a7bd9855",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_9aae753cd7a54a2d94be6496b1812b3c",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "73555dbfc04c485fb9c6d09bc677f843": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "886192be2bdf4a88a1d8808f1db44fb2": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_73555dbfc04c485fb9c6d09bc677f843",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_3540e7572a2e497c8837e9038728b244",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "8906edf4488846fb908d17be3dc5440f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "9aae753cd7a54a2d94be6496b1812b3c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "cda32ebef57a4bcb9b8d2d531ac2b32a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_8906edf4488846fb908d17be3dc5440f",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_1cd1512b68bf43868e26a4c0fa908d4e",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "d9ed287ce1f146c09c8a0e89a7bd9855": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
- }
- }
+ ],
+ "source": [
+ "predictions_adv = linker_advanced.inference.predict()\n",
+ "df_predictions_adv = predictions_adv.as_pandas_dataframe()\n",
+ "clusters_adv = linker_advanced.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " predictions_adv, threshold_match_probability=0.99\n",
+ ")\n",
+ "df_clusters_adv = clusters_adv.as_pandas_dataframe().sort_values(\"cluster_id\")\n",
+ "df_clusters_adv.groupby(\"cluster_id\").size().value_counts()"
+ ]
},
- "nbformat": 4,
- "nbformat_minor": 5
-}
\ No newline at end of file
+ {
+ "cell_type": "markdown",
+ "id": "8db464e7-c57b-48e1-9e7b-c01ce9ccbad9",
+ "metadata": {},
+ "source": [
+ "This is a pretty modest improvement on our previous model - however, it is worth reiterating that we should not necessarily expect to recover _all_ matches, as in several cases it may be unreasonable to expect a model to be confident that two records refer to the same entity.\n",
+ "\n",
+ "If we wished to improve matters we could iterate on this process - investigating where our model is not performing as we would hope, and seeing how we can adjust these areas to address these shortcomings.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "1cd1512b68bf43868e26a4c0fa908d4e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "3540e7572a2e497c8837e9038728b244": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "4eab2071171e419a8f9ddbd6a12f12e4": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_d9ed287ce1f146c09c8a0e89a7bd9855",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_9aae753cd7a54a2d94be6496b1812b3c",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "73555dbfc04c485fb9c6d09bc677f843": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "886192be2bdf4a88a1d8808f1db44fb2": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_73555dbfc04c485fb9c6d09bc677f843",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_3540e7572a2e497c8837e9038728b244",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "8906edf4488846fb908d17be3dc5440f": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "9aae753cd7a54a2d94be6496b1812b3c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "cda32ebef57a4bcb9b8d2d531ac2b32a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_8906edf4488846fb908d17be3dc5440f",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_1cd1512b68bf43868e26a4c0fa908d4e",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "d9ed287ce1f146c09c8a0e89a7bd9855": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/demos/examples/duckdb/link_only.ipynb b/docs/demos/examples/duckdb/link_only.ipynb
index 89ac8fe9b4..f8fb66bb94 100644
--- a/docs/demos/examples/duckdb/link_only.ipynb
+++ b/docs/demos/examples/duckdb/link_only.ipynb
@@ -1,747 +1,747 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Linking without deduplication\n",
- "\n",
- "A simple record linkage model using the `link_only` [link type](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#link_type).\n",
- "\n",
- "With `link_only`, only between-dataset record comparisons are generated. No within-dataset record comparisons are created, meaning that the model does not attempt to find within-dataset duplicates.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:42.926356Z",
- "iopub.status.busy": "2024-06-07T09:18:42.925982Z",
- "iopub.status.idle": "2024-06-07T09:18:42.943456Z",
- "shell.execute_reply": "2024-06-07T09:18:42.942569Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:42.947959Z",
- "iopub.status.busy": "2024-06-07T09:18:42.947640Z",
- "iopub.status.idle": "2024-06-07T09:18:44.652788Z",
- "shell.execute_reply": "2024-06-07T09:18:44.652024Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " city | \n",
- " email | \n",
- " cluster | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 922 | \n",
- " 922 | \n",
- " Evie | \n",
- " Jones | \n",
- " 2002-07-22 | \n",
- " NaN | \n",
- " eviejones@brewer-sparks.org | \n",
- " 230 | \n",
- "
\n",
- " \n",
- " | 224 | \n",
- " 224 | \n",
- " Logn | \n",
- " Feeruson | \n",
- " 2013-10-15 | \n",
- " London | \n",
- " l.fergson46@shah.com | \n",
- " 58 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id first_name surname dob city \\\n",
- "922 922 Evie Jones 2002-07-22 NaN \n",
- "224 224 Logn Feeruson 2013-10-15 London \n",
- "\n",
- " email cluster \n",
- "922 eviejones@brewer-sparks.org 230 \n",
- "224 l.fergson46@shah.com 58 "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink import splink_datasets\n",
- "\n",
- "df = splink_datasets.fake_1000\n",
- "\n",
- "# Split a simple dataset into two, separate datasets which can be linked together.\n",
- "df_l = df.sample(frac=0.5)\n",
- "df_r = df.drop(df_l.index)\n",
- "\n",
- "df_l.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:44.695716Z",
- "iopub.status.busy": "2024-06-07T09:18:44.695390Z",
- "iopub.status.idle": "2024-06-07T09:18:44.942598Z",
- "shell.execute_reply": "2024-06-07T09:18:44.942052Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_library as cl\n",
- "\n",
- "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"link_only\",\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"first_name\"),\n",
- " block_on(\"surname\"),\n",
- " ],\n",
- " comparisons=[\n",
- " cl.NameComparison(\n",
- " \"first_name\",\n",
- " ),\n",
- " cl.NameComparison(\"surname\"),\n",
- " cl.DateOfBirthComparison(\n",
- " \"dob\",\n",
- " input_is_string=True,\n",
- " invalid_dates_as_null=True,\n",
- " ),\n",
- " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
- " cl.EmailComparison(\"email\"),\n",
- " ],\n",
- ")\n",
- "\n",
- "linker = Linker(\n",
- " [df_l, df_r],\n",
- " settings,\n",
- " db_api=DuckDBAPI(),\n",
- " input_table_aliases=[\"df_left\", \"df_right\"],\n",
- ")"
- ]
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linking without deduplication\n",
+ "\n",
+ "A simple record linkage model using the `link_only` [link type](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#link_type).\n",
+ "\n",
+ "With `link_only`, only between-dataset record comparisons are generated. No within-dataset record comparisons are created, meaning that the model does not attempt to find within-dataset duplicates.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:42.926356Z",
+ "iopub.status.busy": "2024-06-07T09:18:42.925982Z",
+ "iopub.status.idle": "2024-06-07T09:18:42.943456Z",
+ "shell.execute_reply": "2024-06-07T09:18:42.942569Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:42.947959Z",
+ "iopub.status.busy": "2024-06-07T09:18:42.947640Z",
+ "iopub.status.idle": "2024-06-07T09:18:44.652788Z",
+ "shell.execute_reply": "2024-06-07T09:18:44.652024Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:44.946395Z",
- "iopub.status.busy": "2024-06-07T09:18:44.946113Z",
- "iopub.status.idle": "2024-06-07T09:18:45.188705Z",
- "shell.execute_reply": "2024-06-07T09:18:45.188192Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " city | \n",
+ " email | \n",
+ " cluster | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 922 | \n",
+ " 922 | \n",
+ " Evie | \n",
+ " Jones | \n",
+ " 2002-07-22 | \n",
+ " NaN | \n",
+ " eviejones@brewer-sparks.org | \n",
+ " 230 | \n",
+ "
\n",
+ " \n",
+ " | 224 | \n",
+ " 224 | \n",
+ " Logn | \n",
+ " Feeruson | \n",
+ " 2013-10-15 | \n",
+ " London | \n",
+ " l.fergson46@shah.com | \n",
+ " 58 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink.exploratory import completeness_chart\n",
- "\n",
- "completeness_chart(\n",
- " [df_l, df_r],\n",
- " cols=[\"first_name\", \"surname\", \"dob\", \"city\", \"email\"],\n",
- " db_api=DuckDBAPI(),\n",
- " table_names_for_chart=[\"df_left\", \"df_right\"],\n",
- ")"
+ "text/plain": [
+ " unique_id first_name surname dob city \\\n",
+ "922 922 Evie Jones 2002-07-22 NaN \n",
+ "224 224 Logn Feeruson 2013-10-15 London \n",
+ "\n",
+ " email cluster \n",
+ "922 eviejones@brewer-sparks.org 230 \n",
+ "224 l.fergson46@shah.com 58 "
]
- },
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.fake_1000\n",
+ "\n",
+ "# Split a simple dataset into two, separate datasets which can be linked together.\n",
+ "df_l = df.sample(frac=0.5)\n",
+ "df_r = df.drop(df_l.index)\n",
+ "\n",
+ "df_l.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:44.695716Z",
+ "iopub.status.busy": "2024-06-07T09:18:44.695390Z",
+ "iopub.status.idle": "2024-06-07T09:18:44.942598Z",
+ "shell.execute_reply": "2024-06-07T09:18:44.942052Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"link_only\",\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"first_name\"),\n",
+ " block_on(\"surname\"),\n",
+ " ],\n",
+ " comparisons=[\n",
+ " cl.NameComparison(\n",
+ " \"first_name\",\n",
+ " ),\n",
+ " cl.NameComparison(\"surname\"),\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"dob\",\n",
+ " input_is_string=True,\n",
+ " invalid_dates_as_null=True,\n",
+ " ),\n",
+ " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
+ " cl.EmailComparison(\"email\"),\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_l_sdf = db_api.register(df_l, source_dataset_name=\"df_left\")\n",
+ "df_r_sdf = db_api.register(df_r, source_dataset_name=\"df_right\")\n",
+ "linker = Linker([df_l_sdf, df_r_sdf], settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:44.946395Z",
+ "iopub.status.busy": "2024-06-07T09:18:44.946113Z",
+ "iopub.status.idle": "2024-06-07T09:18:45.188705Z",
+ "shell.execute_reply": "2024-06-07T09:18:45.188192Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:45.192584Z",
- "iopub.status.busy": "2024-06-07T09:18:45.192253Z",
- "iopub.status.idle": "2024-06-07T09:18:45.341533Z",
- "shell.execute_reply": "2024-06-07T09:18:45.340965Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.00338.\n",
- "This means that amongst all possible pairwise record comparisons, one in 295.61 are expected to match. With 250,000 total possible comparisons, we expect a total of around 845.71 matching pairs\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "\n",
- "deterministic_rules = [\n",
- " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
- " block_on(\"email\"),\n",
- "]\n",
- "\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.exploratory import completeness_chart\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_l_sdf = db_api.register(df_l)\n",
+ "df_r_sdf = db_api.register(df_r)\n",
+ "completeness_chart(\n",
+ " [df_l_sdf, df_r_sdf],\n",
+ " cols=[\"first_name\", \"surname\", \"dob\", \"city\", \"email\"],\n",
+ " table_names_for_chart=[\"df_left\", \"df_right\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:45.192584Z",
+ "iopub.status.busy": "2024-06-07T09:18:45.192253Z",
+ "iopub.status.idle": "2024-06-07T09:18:45.341533Z",
+ "shell.execute_reply": "2024-06-07T09:18:45.340965Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:45.344512Z",
- "iopub.status.busy": "2024-06-07T09:18:45.344289Z",
- "iopub.status.idle": "2024-06-07T09:18:46.142225Z",
- "shell.execute_reply": "2024-06-07T09:18:46.141712Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
- "----- Estimating u probabilities using random sampling -----\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - dob (no m values are trained).\n",
- " - city (no m values are trained).\n",
- " - email (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=1)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.00338.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 295.61 are expected to match. With 250,000 total possible comparisons, we expect a total of around 845.71 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "deterministic_rules = [\n",
+ " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
+ " block_on(\"email\"),\n",
+ "]\n",
+ "\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:45.344512Z",
+ "iopub.status.busy": "2024-06-07T09:18:45.344289Z",
+ "iopub.status.idle": "2024-06-07T09:18:46.142225Z",
+ "shell.execute_reply": "2024-06-07T09:18:46.141712Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:46.145662Z",
- "iopub.status.busy": "2024-06-07T09:18:46.145393Z",
- "iopub.status.idle": "2024-06-07T09:18:47.814138Z",
- "shell.execute_reply": "2024-06-07T09:18:47.813573Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"dob\" = r.\"dob\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name\n",
- " - surname\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - dob\n",
- "\n",
- "WARNING:\n",
- "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was -0.418 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 2: Largest change in params was 0.104 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was 0.0711 in the m_probability of first_name, level `All other comparisons`\n",
- "Iteration 4: Largest change in params was 0.0237 in probability_two_random_records_match\n",
- "Iteration 5: Largest change in params was 0.0093 in probability_two_random_records_match\n",
- "Iteration 6: Largest change in params was 0.00407 in probability_two_random_records_match\n",
- "Iteration 7: Largest change in params was 0.0019 in probability_two_random_records_match\n",
- "Iteration 8: Largest change in params was 0.000916 in probability_two_random_records_match\n",
- "Iteration 9: Largest change in params was 0.000449 in probability_two_random_records_match\n",
- "Iteration 10: Largest change in params was 0.000222 in probability_two_random_records_match\n",
- "Iteration 11: Largest change in params was 0.00011 in probability_two_random_records_match\n",
- "Iteration 12: Largest change in params was 5.46e-05 in probability_two_random_records_match\n",
- "\n",
- "EM converged after 12 iterations\n",
- "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - dob (no m values are trained).\n",
- " - email (some m values are not trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"email\" = r.\"email\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name\n",
- " - surname\n",
- " - dob\n",
- " - city\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - email\n",
- "\n",
- "Iteration 1: Largest change in params was -0.483 in the m_probability of dob, level `Exact match on dob`\n",
- "Iteration 2: Largest change in params was 0.0905 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was 0.02 in probability_two_random_records_match\n",
- "Iteration 4: Largest change in params was 0.00718 in probability_two_random_records_match\n",
- "Iteration 5: Largest change in params was 0.0031 in probability_two_random_records_match\n",
- "Iteration 6: Largest change in params was 0.00148 in probability_two_random_records_match\n",
- "Iteration 7: Largest change in params was 0.000737 in probability_two_random_records_match\n",
- "Iteration 8: Largest change in params was 0.000377 in probability_two_random_records_match\n",
- "Iteration 9: Largest change in params was 0.000196 in probability_two_random_records_match\n",
- "Iteration 10: Largest change in params was 0.000102 in probability_two_random_records_match\n",
- "Iteration 11: Largest change in params was 5.37e-05 in probability_two_random_records_match\n",
- "\n",
- "EM converged after 11 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - email (some m values are not trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"first_name\" = r.\"first_name\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - surname\n",
- " - dob\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - first_name\n",
- "\n",
- "Iteration 1: Largest change in params was -0.169 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 2: Largest change in params was -0.0127 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was -0.00388 in the m_probability of surname, level `All other comparisons`\n",
- "Iteration 4: Largest change in params was -0.00164 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
- "Iteration 5: Largest change in params was -0.00089 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
- "Iteration 6: Largest change in params was -0.000454 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
- "Iteration 7: Largest change in params was -0.000225 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
- "Iteration 8: Largest change in params was -0.00011 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
- "Iteration 9: Largest change in params was -5.31e-05 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
- "\n",
- "EM converged after 9 iterations\n",
- "\n",
- "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
- ]
- }
- ],
- "source": [
- "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
- "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"email\")\n",
- ")\n",
- "session_first_name = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " block_on(\"first_name\")\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
+ "----- Estimating u probabilities using random sampling -----\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - dob (no m values are trained).\n",
+ " - city (no m values are trained).\n",
+ " - email (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:46.145662Z",
+ "iopub.status.busy": "2024-06-07T09:18:46.145393Z",
+ "iopub.status.idle": "2024-06-07T09:18:47.814138Z",
+ "shell.execute_reply": "2024-06-07T09:18:47.813573Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:47.817058Z",
- "iopub.status.busy": "2024-06-07T09:18:47.816828Z",
- "iopub.status.idle": "2024-06-07T09:18:48.064527Z",
- "shell.execute_reply": "2024-06-07T09:18:48.063844Z"
- }
- },
- "outputs": [],
- "source": [
- "results = linker.inference.predict(threshold_match_probability=0.9)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"dob\" = r.\"dob\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name\n",
+ " - surname\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - dob\n",
+ "\n",
+ "WARNING:\n",
+ "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.418 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 2: Largest change in params was 0.104 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.0711 in the m_probability of first_name, level `All other comparisons`\n",
+ "Iteration 4: Largest change in params was 0.0237 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 0.0093 in probability_two_random_records_match\n",
+ "Iteration 6: Largest change in params was 0.00407 in probability_two_random_records_match\n",
+ "Iteration 7: Largest change in params was 0.0019 in probability_two_random_records_match\n",
+ "Iteration 8: Largest change in params was 0.000916 in probability_two_random_records_match\n",
+ "Iteration 9: Largest change in params was 0.000449 in probability_two_random_records_match\n",
+ "Iteration 10: Largest change in params was 0.000222 in probability_two_random_records_match\n",
+ "Iteration 11: Largest change in params was 0.00011 in probability_two_random_records_match\n",
+ "Iteration 12: Largest change in params was 5.46e-05 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 12 iterations\n",
+ "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - dob (no m values are trained).\n",
+ " - email (some m values are not trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"email\" = r.\"email\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name\n",
+ " - surname\n",
+ " - dob\n",
+ " - city\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - email\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.483 in the m_probability of dob, level `Exact match on dob`\n",
+ "Iteration 2: Largest change in params was 0.0905 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.02 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.00718 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 0.0031 in probability_two_random_records_match\n",
+ "Iteration 6: Largest change in params was 0.00148 in probability_two_random_records_match\n",
+ "Iteration 7: Largest change in params was 0.000737 in probability_two_random_records_match\n",
+ "Iteration 8: Largest change in params was 0.000377 in probability_two_random_records_match\n",
+ "Iteration 9: Largest change in params was 0.000196 in probability_two_random_records_match\n",
+ "Iteration 10: Largest change in params was 0.000102 in probability_two_random_records_match\n",
+ "Iteration 11: Largest change in params was 5.37e-05 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 11 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - email (some m values are not trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"first_name\" = r.\"first_name\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - surname\n",
+ " - dob\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - first_name\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.169 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 2: Largest change in params was -0.0127 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was -0.00388 in the m_probability of surname, level `All other comparisons`\n",
+ "Iteration 4: Largest change in params was -0.00164 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
+ "Iteration 5: Largest change in params was -0.00089 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
+ "Iteration 6: Largest change in params was -0.000454 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
+ "Iteration 7: Largest change in params was -0.000225 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
+ "Iteration 8: Largest change in params was -0.00011 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
+ "Iteration 9: Largest change in params was -5.31e-05 in the m_probability of email, level `Jaro-Winkler >0.88 on username`\n",
+ "\n",
+ "EM converged after 9 iterations\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
+ }
+ ],
+ "source": [
+ "session_dob = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
+ "session_email = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"email\")\n",
+ ")\n",
+ "session_first_name = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " block_on(\"first_name\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:47.817058Z",
+ "iopub.status.busy": "2024-06-07T09:18:47.816828Z",
+ "iopub.status.idle": "2024-06-07T09:18:48.064527Z",
+ "shell.execute_reply": "2024-06-07T09:18:48.063844Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "results = linker.inference.predict(threshold_match_probability=0.9)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:18:48.067845Z",
+ "iopub.status.busy": "2024-06-07T09:18:48.067582Z",
+ "iopub.status.idle": "2024-06-07T09:18:48.084784Z",
+ "shell.execute_reply": "2024-06-07T09:18:48.084179Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:18:48.067845Z",
- "iopub.status.busy": "2024-06-07T09:18:48.067582Z",
- "iopub.status.idle": "2024-06-07T09:18:48.084784Z",
- "shell.execute_reply": "2024-06-07T09:18:48.084179Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " source_dataset_l | \n",
- " source_dataset_r | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " gamma_first_name | \n",
- " surname_l | \n",
- " ... | \n",
- " dob_l | \n",
- " dob_r | \n",
- " gamma_dob | \n",
- " city_l | \n",
- " city_r | \n",
- " gamma_city | \n",
- " email_l | \n",
- " email_r | \n",
- " gamma_email | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 3.180767 | \n",
- " 0.900674 | \n",
- " df_left | \n",
- " df_right | \n",
- " 242 | \n",
- " 240 | \n",
- " Freya | \n",
- " Freya | \n",
- " 4 | \n",
- " Shah | \n",
- " ... | \n",
- " 1970-12-17 | \n",
- " 1970-12-16 | \n",
- " 4 | \n",
- " Lonnod | \n",
- " noLdon | \n",
- " 0 | \n",
- " None | \n",
- " None | \n",
- " -1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 3.180767 | \n",
- " 0.900674 | \n",
- " df_left | \n",
- " df_right | \n",
- " 241 | \n",
- " 240 | \n",
- " Freya | \n",
- " Freya | \n",
- " 4 | \n",
- " None | \n",
- " ... | \n",
- " 1970-12-17 | \n",
- " 1970-12-16 | \n",
- " 4 | \n",
- " London | \n",
- " noLdon | \n",
- " 0 | \n",
- " f.s@flynn.com | \n",
- " None | \n",
- " -1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3.212523 | \n",
- " 0.902626 | \n",
- " df_left | \n",
- " df_right | \n",
- " 679 | \n",
- " 682 | \n",
- " Elizabeth | \n",
- " Elizabeth | \n",
- " 4 | \n",
- " Shaw | \n",
- " ... | \n",
- " 2006-04-21 | \n",
- " 2016-04-18 | \n",
- " 1 | \n",
- " Cardiff | \n",
- " Cardifrf | \n",
- " 0 | \n",
- " e.shaw@smith-hall.biz | \n",
- " e.shaw@smith-hall.lbiz | \n",
- " 3 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 3.224126 | \n",
- " 0.903331 | \n",
- " df_left | \n",
- " df_right | \n",
- " 576 | \n",
- " 580 | \n",
- " Jessica | \n",
- " Jessica | \n",
- " 4 | \n",
- " None | \n",
- " ... | \n",
- " 1974-11-17 | \n",
- " 1974-12-17 | \n",
- " 4 | \n",
- " None | \n",
- " Walsall | \n",
- " -1 | \n",
- " jesscac.owen@elliott.org | \n",
- " None | \n",
- " -1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 3.224126 | \n",
- " 0.903331 | \n",
- " df_left | \n",
- " df_right | \n",
- " 577 | \n",
- " 580 | \n",
- " Jessica | \n",
- " Jessica | \n",
- " 4 | \n",
- " None | \n",
- " ... | \n",
- " 1974-11-17 | \n",
- " 1974-12-17 | \n",
- " 4 | \n",
- " None | \n",
- " Walsall | \n",
- " -1 | \n",
- " jessica.owen@elliott.org | \n",
- " None | \n",
- " -1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 22 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability source_dataset_l source_dataset_r \\\n",
- "0 3.180767 0.900674 df_left df_right \n",
- "1 3.180767 0.900674 df_left df_right \n",
- "2 3.212523 0.902626 df_left df_right \n",
- "3 3.224126 0.903331 df_left df_right \n",
- "4 3.224126 0.903331 df_left df_right \n",
- "\n",
- " unique_id_l unique_id_r first_name_l first_name_r gamma_first_name \\\n",
- "0 242 240 Freya Freya 4 \n",
- "1 241 240 Freya Freya 4 \n",
- "2 679 682 Elizabeth Elizabeth 4 \n",
- "3 576 580 Jessica Jessica 4 \n",
- "4 577 580 Jessica Jessica 4 \n",
- "\n",
- " surname_l ... dob_l dob_r gamma_dob city_l city_r \\\n",
- "0 Shah ... 1970-12-17 1970-12-16 4 Lonnod noLdon \n",
- "1 None ... 1970-12-17 1970-12-16 4 London noLdon \n",
- "2 Shaw ... 2006-04-21 2016-04-18 1 Cardiff Cardifrf \n",
- "3 None ... 1974-11-17 1974-12-17 4 None Walsall \n",
- "4 None ... 1974-11-17 1974-12-17 4 None Walsall \n",
- "\n",
- " gamma_city email_l email_r gamma_email \\\n",
- "0 0 None None -1 \n",
- "1 0 f.s@flynn.com None -1 \n",
- "2 0 e.shaw@smith-hall.biz e.shaw@smith-hall.lbiz 3 \n",
- "3 -1 jesscac.owen@elliott.org None -1 \n",
- "4 -1 jessica.owen@elliott.org None -1 \n",
- "\n",
- " match_key \n",
- "0 0 \n",
- "1 0 \n",
- "2 0 \n",
- "3 0 \n",
- "4 0 \n",
- "\n",
- "[5 rows x 22 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " source_dataset_l | \n",
+ " source_dataset_r | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " gamma_first_name | \n",
+ " surname_l | \n",
+ " ... | \n",
+ " dob_l | \n",
+ " dob_r | \n",
+ " gamma_dob | \n",
+ " city_l | \n",
+ " city_r | \n",
+ " gamma_city | \n",
+ " email_l | \n",
+ " email_r | \n",
+ " gamma_email | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 3.180767 | \n",
+ " 0.900674 | \n",
+ " df_left | \n",
+ " df_right | \n",
+ " 242 | \n",
+ " 240 | \n",
+ " Freya | \n",
+ " Freya | \n",
+ " 4 | \n",
+ " Shah | \n",
+ " ... | \n",
+ " 1970-12-17 | \n",
+ " 1970-12-16 | \n",
+ " 4 | \n",
+ " Lonnod | \n",
+ " noLdon | \n",
+ " 0 | \n",
+ " None | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3.180767 | \n",
+ " 0.900674 | \n",
+ " df_left | \n",
+ " df_right | \n",
+ " 241 | \n",
+ " 240 | \n",
+ " Freya | \n",
+ " Freya | \n",
+ " 4 | \n",
+ " None | \n",
+ " ... | \n",
+ " 1970-12-17 | \n",
+ " 1970-12-16 | \n",
+ " 4 | \n",
+ " London | \n",
+ " noLdon | \n",
+ " 0 | \n",
+ " f.s@flynn.com | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3.212523 | \n",
+ " 0.902626 | \n",
+ " df_left | \n",
+ " df_right | \n",
+ " 679 | \n",
+ " 682 | \n",
+ " Elizabeth | \n",
+ " Elizabeth | \n",
+ " 4 | \n",
+ " Shaw | \n",
+ " ... | \n",
+ " 2006-04-21 | \n",
+ " 2016-04-18 | \n",
+ " 1 | \n",
+ " Cardiff | \n",
+ " Cardifrf | \n",
+ " 0 | \n",
+ " e.shaw@smith-hall.biz | \n",
+ " e.shaw@smith-hall.lbiz | \n",
+ " 3 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3.224126 | \n",
+ " 0.903331 | \n",
+ " df_left | \n",
+ " df_right | \n",
+ " 576 | \n",
+ " 580 | \n",
+ " Jessica | \n",
+ " Jessica | \n",
+ " 4 | \n",
+ " None | \n",
+ " ... | \n",
+ " 1974-11-17 | \n",
+ " 1974-12-17 | \n",
+ " 4 | \n",
+ " None | \n",
+ " Walsall | \n",
+ " -1 | \n",
+ " jesscac.owen@elliott.org | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 3.224126 | \n",
+ " 0.903331 | \n",
+ " df_left | \n",
+ " df_right | \n",
+ " 577 | \n",
+ " 580 | \n",
+ " Jessica | \n",
+ " Jessica | \n",
+ " 4 | \n",
+ " None | \n",
+ " ... | \n",
+ " 1974-11-17 | \n",
+ " 1974-12-17 | \n",
+ " 4 | \n",
+ " None | \n",
+ " Walsall | \n",
+ " -1 | \n",
+ " jessica.owen@elliott.org | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 22 columns
\n",
+ "
"
],
- "source": [
- "results.as_pandas_dataframe(limit=5)"
+ "text/plain": [
+ " match_weight match_probability source_dataset_l source_dataset_r \\\n",
+ "0 3.180767 0.900674 df_left df_right \n",
+ "1 3.180767 0.900674 df_left df_right \n",
+ "2 3.212523 0.902626 df_left df_right \n",
+ "3 3.224126 0.903331 df_left df_right \n",
+ "4 3.224126 0.903331 df_left df_right \n",
+ "\n",
+ " unique_id_l unique_id_r first_name_l first_name_r gamma_first_name \\\n",
+ "0 242 240 Freya Freya 4 \n",
+ "1 241 240 Freya Freya 4 \n",
+ "2 679 682 Elizabeth Elizabeth 4 \n",
+ "3 576 580 Jessica Jessica 4 \n",
+ "4 577 580 Jessica Jessica 4 \n",
+ "\n",
+ " surname_l ... dob_l dob_r gamma_dob city_l city_r \\\n",
+ "0 Shah ... 1970-12-17 1970-12-16 4 Lonnod noLdon \n",
+ "1 None ... 1970-12-17 1970-12-16 4 London noLdon \n",
+ "2 Shaw ... 2006-04-21 2016-04-18 1 Cardiff Cardifrf \n",
+ "3 None ... 1974-11-17 1974-12-17 4 None Walsall \n",
+ "4 None ... 1974-11-17 1974-12-17 4 None Walsall \n",
+ "\n",
+ " gamma_city email_l email_r gamma_email \\\n",
+ "0 0 None None -1 \n",
+ "1 0 f.s@flynn.com None -1 \n",
+ "2 0 e.shaw@smith-hall.biz e.shaw@smith-hall.lbiz 3 \n",
+ "3 -1 jesscac.owen@elliott.org None -1 \n",
+ "4 -1 jessica.owen@elliott.org None -1 \n",
+ "\n",
+ " match_key \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 22 columns]"
]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "results.as_pandas_dataframe(limit=5)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/duckdb/pairwise_labels.ipynb b/docs/demos/examples/duckdb/pairwise_labels.ipynb
index 1fb088d3eb..7c0d151eec 100644
--- a/docs/demos/examples/duckdb/pairwise_labels.ipynb
+++ b/docs/demos/examples/duckdb/pairwise_labels.ipynb
@@ -1,780 +1,782 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- ""
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Estimating m from a sample of pairwise labels\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "In this example, we estimate the m probabilities of the model from a table containing pairwise record comparisons which we know are 'true' matches. For example, these may be the result of work by a clerical team who have manually labelled a sample of matches.\n",
- "\n",
- "The table must be in the following format:\n",
- "\n",
- "| source_dataset_l | unique_id_l | source_dataset_r | unique_id_r |\n",
- "| ---------------- | ----------- | ---------------- | ----------- |\n",
- "| df_1 | 1 | df_2 | 2 |\n",
- "| df_1 | 1 | df_2 | 3 |\n",
- "\n",
- "It is assumed that every record in the table represents a certain match.\n",
- "\n",
- "Note that the column names above are the defaults. They should correspond to the values you've set for [`unique_id_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#unique_id_column_name) and [`source_dataset_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#source_dataset_column_name), if you've chosen custom values.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:22.461384Z",
- "iopub.status.busy": "2024-06-07T09:20:22.461075Z",
- "iopub.status.idle": "2024-06-07T09:20:22.466162Z",
- "shell.execute_reply": "2024-06-07T09:20:22.465529Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:22.470034Z",
- "iopub.status.busy": "2024-06-07T09:20:22.469740Z",
- "iopub.status.idle": "2024-06-07T09:20:24.546756Z",
- "shell.execute_reply": "2024-06-07T09:20:24.546033Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id_l | \n",
- " source_dataset_l | \n",
- " unique_id_r | \n",
- " source_dataset_r | \n",
- " clerical_match_score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 1 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 2 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 3 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 49 | \n",
- " 1 | \n",
- " fake_1000 | \n",
- " 2 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 50 | \n",
- " 1 | \n",
- " fake_1000 | \n",
- " 3 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 3171 | \n",
- " 994 | \n",
- " fake_1000 | \n",
- " 996 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3172 | \n",
- " 995 | \n",
- " fake_1000 | \n",
- " 996 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3173 | \n",
- " 997 | \n",
- " fake_1000 | \n",
- " 998 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3174 | \n",
- " 997 | \n",
- " fake_1000 | \n",
- " 999 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3175 | \n",
- " 998 | \n",
- " fake_1000 | \n",
- " 999 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
2031 rows × 5 columns
\n",
- "
"
- ],
- "text/plain": [
- " unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n",
- "0 0 fake_1000 1 fake_1000 \n",
- "1 0 fake_1000 2 fake_1000 \n",
- "2 0 fake_1000 3 fake_1000 \n",
- "49 1 fake_1000 2 fake_1000 \n",
- "50 1 fake_1000 3 fake_1000 \n",
- "... ... ... ... ... \n",
- "3171 994 fake_1000 996 fake_1000 \n",
- "3172 995 fake_1000 996 fake_1000 \n",
- "3173 997 fake_1000 998 fake_1000 \n",
- "3174 997 fake_1000 999 fake_1000 \n",
- "3175 998 fake_1000 999 fake_1000 \n",
- "\n",
- " clerical_match_score \n",
- "0 1.0 \n",
- "1 1.0 \n",
- "2 1.0 \n",
- "49 1.0 \n",
- "50 1.0 \n",
- "... ... \n",
- "3171 1.0 \n",
- "3172 1.0 \n",
- "3173 1.0 \n",
- "3174 1.0 \n",
- "3175 1.0 \n",
- "\n",
- "[2031 rows x 5 columns]"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink.datasets import splink_dataset_labels\n",
- "\n",
- "pairwise_labels = splink_dataset_labels.fake_1000_labels\n",
- "\n",
- "# Choose labels indicating a match\n",
- "pairwise_labels = pairwise_labels[pairwise_labels[\"clerical_match_score\"] == 1]\n",
- "pairwise_labels"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We now proceed to estimate the Fellegi Sunter model:\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ ""
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Estimating m from a sample of pairwise labels\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "In this example, we estimate the m probabilities of the model from a table containing pairwise record comparisons which we know are 'true' matches. For example, these may be the result of work by a clerical team who have manually labelled a sample of matches.\n",
+ "\n",
+ "The table must be in the following format:\n",
+ "\n",
+ "| source_dataset_l | unique_id_l | source_dataset_r | unique_id_r |\n",
+ "| ---------------- | ----------- | ---------------- | ----------- |\n",
+ "| df_1 | 1 | df_2 | 2 |\n",
+ "| df_1 | 1 | df_2 | 3 |\n",
+ "\n",
+ "It is assumed that every record in the table represents a certain match.\n",
+ "\n",
+ "Note that the column names above are the defaults. They should correspond to the values you've set for [`unique_id_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#unique_id_column_name) and [`source_dataset_column_name`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#source_dataset_column_name), if you've chosen custom values.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:22.461384Z",
+ "iopub.status.busy": "2024-06-07T09:20:22.461075Z",
+ "iopub.status.idle": "2024-06-07T09:20:22.466162Z",
+ "shell.execute_reply": "2024-06-07T09:20:22.465529Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:22.470034Z",
+ "iopub.status.busy": "2024-06-07T09:20:22.469740Z",
+ "iopub.status.idle": "2024-06-07T09:20:24.546756Z",
+ "shell.execute_reply": "2024-06-07T09:20:24.546033Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:24.588843Z",
- "iopub.status.busy": "2024-06-07T09:20:24.588530Z",
- "iopub.status.idle": "2024-06-07T09:20:24.602952Z",
- "shell.execute_reply": "2024-06-07T09:20:24.602047Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " city | \n",
- " email | \n",
- " cluster | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " Robert | \n",
- " Alan | \n",
- " 1971-06-24 | \n",
- " NaN | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " Robert | \n",
- " Allen | \n",
- " 1971-05-24 | \n",
- " NaN | \n",
- " roberta25@smith.net | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id first_name surname dob city email cluster\n",
- "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0\n",
- "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id_l | \n",
+ " source_dataset_l | \n",
+ " unique_id_r | \n",
+ " source_dataset_r | \n",
+ " clerical_match_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 1 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 2 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 3 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 49 | \n",
+ " 1 | \n",
+ " fake_1000 | \n",
+ " 2 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " 1 | \n",
+ " fake_1000 | \n",
+ " 3 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 3171 | \n",
+ " 994 | \n",
+ " fake_1000 | \n",
+ " 996 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3172 | \n",
+ " 995 | \n",
+ " fake_1000 | \n",
+ " 996 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3173 | \n",
+ " 997 | \n",
+ " fake_1000 | \n",
+ " 998 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3174 | \n",
+ " 997 | \n",
+ " fake_1000 | \n",
+ " 999 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3175 | \n",
+ " 998 | \n",
+ " fake_1000 | \n",
+ " 999 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2031 rows × 5 columns
\n",
+ "
"
],
- "source": [
- "from splink import splink_datasets\n",
- "\n",
- "df = splink_datasets.fake_1000\n",
- "df.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:24.607247Z",
- "iopub.status.busy": "2024-06-07T09:20:24.606935Z",
- "iopub.status.idle": "2024-06-07T09:20:24.711369Z",
- "shell.execute_reply": "2024-06-07T09:20:24.710531Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_library as cl\n",
- "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"first_name\"),\n",
- " block_on(\"surname\"),\n",
- " ],\n",
- " comparisons=[\n",
- " cl.NameComparison(\"first_name\"),\n",
- " cl.NameComparison(\"surname\"),\n",
- " cl.DateOfBirthComparison(\n",
- " \"dob\",\n",
- " input_is_string=True,\n",
- " ),\n",
- " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
- " cl.EmailComparison(\"email\"),\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")"
+ "text/plain": [
+ " unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n",
+ "0 0 fake_1000 1 fake_1000 \n",
+ "1 0 fake_1000 2 fake_1000 \n",
+ "2 0 fake_1000 3 fake_1000 \n",
+ "49 1 fake_1000 2 fake_1000 \n",
+ "50 1 fake_1000 3 fake_1000 \n",
+ "... ... ... ... ... \n",
+ "3171 994 fake_1000 996 fake_1000 \n",
+ "3172 995 fake_1000 996 fake_1000 \n",
+ "3173 997 fake_1000 998 fake_1000 \n",
+ "3174 997 fake_1000 999 fake_1000 \n",
+ "3175 998 fake_1000 999 fake_1000 \n",
+ "\n",
+ " clerical_match_score \n",
+ "0 1.0 \n",
+ "1 1.0 \n",
+ "2 1.0 \n",
+ "49 1.0 \n",
+ "50 1.0 \n",
+ "... ... \n",
+ "3171 1.0 \n",
+ "3172 1.0 \n",
+ "3173 1.0 \n",
+ "3174 1.0 \n",
+ "3175 1.0 \n",
+ "\n",
+ "[2031 rows x 5 columns]"
]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:24.715481Z",
- "iopub.status.busy": "2024-06-07T09:20:24.715162Z",
- "iopub.status.idle": "2024-06-07T09:20:25.100461Z",
- "shell.execute_reply": "2024-06-07T09:20:25.099741Z"
- }
- },
- "outputs": [],
- "source": [
- "linker = Linker(df, settings, db_api=DuckDBAPI(), set_up_basic_logging=False)\n",
- "deterministic_rules = [\n",
- " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
- " \"l.email = r.email\",\n",
- "]\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)"
- ]
- },
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.datasets import splink_dataset_labels\n",
+ "\n",
+ "pairwise_labels = splink_dataset_labels.fake_1000_labels\n",
+ "\n",
+ "# Choose labels indicating a match\n",
+ "pairwise_labels = pairwise_labels[pairwise_labels[\"clerical_match_score\"] == 1]\n",
+ "pairwise_labels"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We now proceed to estimate the Fellegi-Sunter model:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:24.588843Z",
+ "iopub.status.busy": "2024-06-07T09:20:24.588530Z",
+ "iopub.status.idle": "2024-06-07T09:20:24.602952Z",
+ "shell.execute_reply": "2024-06-07T09:20:24.602047Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:25.104541Z",
- "iopub.status.busy": "2024-06-07T09:20:25.104116Z",
- "iopub.status.idle": "2024-06-07T09:20:26.866642Z",
- "shell.execute_reply": "2024-06-07T09:20:26.866007Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " city | \n",
+ " email | \n",
+ " cluster | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " Robert | \n",
+ " Alan | \n",
+ " 1971-06-24 | \n",
+ " NaN | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " Robert | \n",
+ " Allen | \n",
+ " 1971-05-24 | \n",
+ " NaN | \n",
+ " roberta25@smith.net | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ "text/plain": [
+ " unique_id first_name surname dob city email cluster\n",
+ "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0\n",
+ "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0"
]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.fake_1000\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:24.607247Z",
+ "iopub.status.busy": "2024-06-07T09:20:24.606935Z",
+ "iopub.status.idle": "2024-06-07T09:20:24.711369Z",
+ "shell.execute_reply": "2024-06-07T09:20:24.710531Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"dedupe_only\",\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"first_name\"),\n",
+ " block_on(\"surname\"),\n",
+ " ],\n",
+ " comparisons=[\n",
+ " cl.NameComparison(\"first_name\"),\n",
+ " cl.NameComparison(\"surname\"),\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"dob\",\n",
+ " input_is_string=True,\n",
+ " ),\n",
+ " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
+ " cl.EmailComparison(\"email\"),\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:24.715481Z",
+ "iopub.status.busy": "2024-06-07T09:20:24.715162Z",
+ "iopub.status.idle": "2024-06-07T09:20:25.100461Z",
+ "shell.execute_reply": "2024-06-07T09:20:25.099741Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
+ "deterministic_rules = [\n",
+ " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
+ " \"l.email = r.email\",\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:25.104541Z",
+ "iopub.status.busy": "2024-06-07T09:20:25.104116Z",
+ "iopub.status.idle": "2024-06-07T09:20:26.866642Z",
+ "shell.execute_reply": "2024-06-07T09:20:26.866007Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:26.871363Z",
- "iopub.status.busy": "2024-06-07T09:20:26.871016Z",
- "iopub.status.idle": "2024-06-07T09:20:27.051023Z",
- "shell.execute_reply": "2024-06-07T09:20:27.050407Z"
- }
- },
- "outputs": [],
- "source": [
- "# Register the pairwise labels table with the database, and then use it to estimate the m values\n",
- "labels_df = linker.table_management.register_labels_table(pairwise_labels, overwrite=True)\n",
- "linker.training.estimate_m_from_pairwise_labels(labels_df)\n",
- "\n",
- "\n",
- "# If the labels table already existing in the dataset you could run\n",
- "# linker.training.estimate_m_from_pairwise_labels(\"labels_tablename_here\")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:26.871363Z",
+ "iopub.status.busy": "2024-06-07T09:20:26.871016Z",
+ "iopub.status.idle": "2024-06-07T09:20:27.051023Z",
+ "shell.execute_reply": "2024-06-07T09:20:27.050407Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Register the pairwise labels table with the database, and then use it to estimate the m values\n",
+ "labels_df = linker.table_management.register_labels_table(pairwise_labels, overwrite=True)\n",
+ "linker.training.estimate_m_from_pairwise_labels(labels_df)\n",
+ "\n",
+ "\n",
+ "# If the labels table already exists in the database, you could run\n",
+ "# linker.training.estimate_m_from_pairwise_labels(\"labels_tablename_here\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:27.054211Z",
+ "iopub.status.busy": "2024-06-07T09:20:27.053972Z",
+ "iopub.status.idle": "2024-06-07T09:20:27.489093Z",
+ "shell.execute_reply": "2024-06-07T09:20:27.488564Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:27.054211Z",
- "iopub.status.busy": "2024-06-07T09:20:27.053972Z",
- "iopub.status.idle": "2024-06-07T09:20:27.489093Z",
- "shell.execute_reply": "2024-06-07T09:20:27.488564Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "training_blocking_rule = block_on(\"first_name\")\n",
- "linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)"
+ "data": {
+ "text/plain": [
+ ""
]
- },
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "training_blocking_rule = block_on(\"first_name\")\n",
+ "linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:27.492742Z",
+ "iopub.status.busy": "2024-06-07T09:20:27.492510Z",
+ "iopub.status.idle": "2024-06-07T09:20:27.624619Z",
+ "shell.execute_reply": "2024-06-07T09:20:27.624114Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:27.492742Z",
- "iopub.status.busy": "2024-06-07T09:20:27.492510Z",
- "iopub.status.idle": "2024-06-07T09:20:27.624619Z",
- "shell.execute_reply": "2024-06-07T09:20:27.624114Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.visualisations.parameter_estimate_comparisons_chart()"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.parameter_estimate_comparisons_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:27.628602Z",
+ "iopub.status.busy": "2024-06-07T09:20:27.628256Z",
+ "iopub.status.idle": "2024-06-07T09:20:27.933374Z",
+ "shell.execute_reply": "2024-06-07T09:20:27.932702Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:27.628602Z",
- "iopub.status.busy": "2024-06-07T09:20:27.628256Z",
- "iopub.status.idle": "2024-06-07T09:20:27.933374Z",
- "shell.execute_reply": "2024-06-07T09:20:27.932702Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.visualisations.match_weights_chart()"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "54186cece08b4f6fa03f33cc282f36a6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_8863bef4905c44fc9705add5d5165a71",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_ce7ee37dfbeb4d26ae9171f7f3b857e7",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "8863bef4905c44fc9705add5d5165a71": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "ce7ee37dfbeb4d26ae9171f7f3b857e7": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
- }
- }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "54186cece08b4f6fa03f33cc282f36a6": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_8863bef4905c44fc9705add5d5165a71",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_ce7ee37dfbeb4d26ae9171f7f3b857e7",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "8863bef4905c44fc9705add5d5165a71": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "ce7ee37dfbeb4d26ae9171f7f3b857e7": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb b/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb
index 31a337c9fd..36a7336714 100644
--- a/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb
+++ b/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb
@@ -1,716 +1,718 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Historical people: Quick and dirty\n",
- "\n",
- "This example shows how to get some initial record linkage results as quickly as possible.\n",
- "\n",
- "There are many ways to improve the accuracy of this model. But this may be a good place to start if you just want to give Splink a try and see what it's capable of.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:37.624889Z",
- "iopub.status.busy": "2024-06-07T09:20:37.624517Z",
- "iopub.status.idle": "2024-06-07T09:20:37.644289Z",
- "shell.execute_reply": "2024-06-07T09:20:37.643404Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Historical people: Quick and dirty\n",
+ "\n",
+ "This example shows how to get some initial record linkage results as quickly as possible.\n",
+ "\n",
+ "There are many ways to improve the accuracy of this model. But this may be a good place to start if you just want to give Splink a try and see what it's capable of.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:37.624889Z",
+ "iopub.status.busy": "2024-06-07T09:20:37.624517Z",
+ "iopub.status.idle": "2024-06-07T09:20:37.644289Z",
+ "shell.execute_reply": "2024-06-07T09:20:37.643404Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:37.648712Z",
+ "iopub.status.busy": "2024-06-07T09:20:37.648404Z",
+ "iopub.status.idle": "2024-06-07T09:20:39.278642Z",
+ "shell.execute_reply": "2024-06-07T09:20:39.277984Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:37.648712Z",
- "iopub.status.busy": "2024-06-07T09:20:37.648404Z",
- "iopub.status.idle": "2024-06-07T09:20:39.278642Z",
- "shell.execute_reply": "2024-06-07T09:20:39.277984Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id | \n",
- " cluster | \n",
- " full_name | \n",
- " first_and_surname | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " birth_place | \n",
- " postcode_fake | \n",
- " gender | \n",
- " occupation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Q2296770-1 | \n",
- " Q2296770 | \n",
- " thomas clifford, 1st baron clifford of chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Q2296770-2 | \n",
- " Q2296770 | \n",
- " thomas of chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Q2296770-3 | \n",
- " Q2296770 | \n",
- " tom 1st baron clifford of chudleigh | \n",
- " tom chudleigh | \n",
- " tom | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " male | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Q2296770-4 | \n",
- " Q2296770 | \n",
- " thomas 1st chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8hu | \n",
- " None | \n",
- " politician | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Q2296770-5 | \n",
- " Q2296770 | \n",
- " thomas clifford, 1st baron chudleigh | \n",
- " thomas chudleigh | \n",
- " thomas | \n",
- " chudleigh | \n",
- " 1630-08-01 | \n",
- " devon | \n",
- " tq13 8df | \n",
- " None | \n",
- " politician | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id cluster full_name \\\n",
- "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n",
- "1 Q2296770-2 Q2296770 thomas of chudleigh \n",
- "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n",
- "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n",
- "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n",
- "\n",
- " first_and_surname first_name surname dob birth_place \\\n",
- "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "2 tom chudleigh tom chudleigh 1630-08-01 devon \n",
- "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
- "\n",
- " postcode_fake gender occupation \n",
- "0 tq13 8df male politician \n",
- "1 tq13 8df male politician \n",
- "2 tq13 8df male politician \n",
- "3 tq13 8hu None politician \n",
- "4 tq13 8df None politician "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id | \n",
+ " cluster | \n",
+ " full_name | \n",
+ " first_and_surname | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " birth_place | \n",
+ " postcode_fake | \n",
+ " gender | \n",
+ " occupation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Q2296770-1 | \n",
+ " Q2296770 | \n",
+ " thomas clifford, 1st baron clifford of chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Q2296770-2 | \n",
+ " Q2296770 | \n",
+ " thomas of chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Q2296770-3 | \n",
+ " Q2296770 | \n",
+ " tom 1st baron clifford of chudleigh | \n",
+ " tom chudleigh | \n",
+ " tom | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " male | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Q2296770-4 | \n",
+ " Q2296770 | \n",
+ " thomas 1st chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8hu | \n",
+ " None | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Q2296770-5 | \n",
+ " Q2296770 | \n",
+ " thomas clifford, 1st baron chudleigh | \n",
+ " thomas chudleigh | \n",
+ " thomas | \n",
+ " chudleigh | \n",
+ " 1630-08-01 | \n",
+ " devon | \n",
+ " tq13 8df | \n",
+ " None | \n",
+ " politician | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink.datasets import splink_datasets\n",
- "\n",
- "df = splink_datasets.historical_50k\n",
- "df.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:39.330739Z",
- "iopub.status.busy": "2024-06-07T09:20:39.330384Z",
- "iopub.status.idle": "2024-06-07T09:20:39.345331Z",
- "shell.execute_reply": "2024-06-07T09:20:39.344598Z"
- }
- },
- "outputs": [],
- "source": [
- "from splink import block_on, SettingsCreator\n",
- "import splink.comparison_library as cl\n",
- "\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"full_name\"),\n",
- " block_on(\"substr(full_name,1,6)\", \"dob\", \"birth_place\"),\n",
- " block_on(\"dob\", \"birth_place\"),\n",
- " block_on(\"postcode_fake\"),\n",
- " ],\n",
- " comparisons=[\n",
- " cl.ForenameSurnameComparison(\n",
- " \"first_name\",\n",
- " \"surname\",\n",
- " forename_surname_concat_col_name=\"first_and_surname\",\n",
- " ),\n",
- " cl.DateOfBirthComparison(\n",
- " \"dob\",\n",
- " input_is_string=True,\n",
- " ),\n",
- " cl.LevenshteinAtThresholds(\"postcode_fake\", 2),\n",
- " cl.JaroWinklerAtThresholds(\"birth_place\", 0.9).configure(\n",
- " term_frequency_adjustments=True\n",
- " ),\n",
- " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n",
- " ],\n",
- ")"
+ "text/plain": [
+ " unique_id cluster full_name \\\n",
+ "0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh \n",
+ "1 Q2296770-2 Q2296770 thomas of chudleigh \n",
+ "2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh \n",
+ "3 Q2296770-4 Q2296770 thomas 1st chudleigh \n",
+ "4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh \n",
+ "\n",
+ " first_and_surname first_name surname dob birth_place \\\n",
+ "0 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "1 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "2 tom chudleigh tom chudleigh 1630-08-01 devon \n",
+ "3 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "4 thomas chudleigh thomas chudleigh 1630-08-01 devon \n",
+ "\n",
+ " postcode_fake gender occupation \n",
+ "0 tq13 8df male politician \n",
+ "1 tq13 8df male politician \n",
+ "2 tq13 8df male politician \n",
+ "3 tq13 8hu None politician \n",
+ "4 tq13 8df None politician "
]
- },
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.datasets import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.historical_50k\n",
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:39.330739Z",
+ "iopub.status.busy": "2024-06-07T09:20:39.330384Z",
+ "iopub.status.idle": "2024-06-07T09:20:39.345331Z",
+ "shell.execute_reply": "2024-06-07T09:20:39.344598Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import block_on, SettingsCreator\n",
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"dedupe_only\",\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"full_name\"),\n",
+ " block_on(\"substr(full_name,1,6)\", \"dob\", \"birth_place\"),\n",
+ " block_on(\"dob\", \"birth_place\"),\n",
+ " block_on(\"postcode_fake\"),\n",
+ " ],\n",
+ " comparisons=[\n",
+ " cl.ForenameSurnameComparison(\n",
+ " \"first_name\",\n",
+ " \"surname\",\n",
+ " forename_surname_concat_col_name=\"first_and_surname\",\n",
+ " ),\n",
+ " cl.DateOfBirthComparison(\n",
+ " \"dob\",\n",
+ " input_is_string=True,\n",
+ " ),\n",
+ " cl.LevenshteinAtThresholds(\"postcode_fake\", 2),\n",
+ " cl.JaroWinklerAtThresholds(\"birth_place\", 0.9).configure(\n",
+ " term_frequency_adjustments=True\n",
+ " ),\n",
+ " cl.ExactMatch(\"occupation\").configure(term_frequency_adjustments=True),\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:39.349123Z",
+ "iopub.status.busy": "2024-06-07T09:20:39.348832Z",
+ "iopub.status.idle": "2024-06-07T09:20:39.807802Z",
+ "shell.execute_reply": "2024-06-07T09:20:39.807089Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import Linker, DuckDBAPI\n",
+ "\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
+ "deterministic_rules = [\n",
+ " \"l.full_name = r.full_name\",\n",
+ " \"l.postcode_fake = r.postcode_fake and l.dob = r.dob\",\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.6\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:39.811242Z",
+ "iopub.status.busy": "2024-06-07T09:20:39.810994Z",
+ "iopub.status.idle": "2024-06-07T09:20:42.328241Z",
+ "shell.execute_reply": "2024-06-07T09:20:42.327675Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:39.349123Z",
- "iopub.status.busy": "2024-06-07T09:20:39.348832Z",
- "iopub.status.idle": "2024-06-07T09:20:39.807802Z",
- "shell.execute_reply": "2024-06-07T09:20:39.807089Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c059aca16c8c48b0bb223e4350ba465f",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [],
- "source": [
- "from splink import Linker, DuckDBAPI\n",
- "\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI(), set_up_basic_logging=False)\n",
- "deterministic_rules = [\n",
- " \"l.full_name = r.full_name\",\n",
- " \"l.postcode_fake = r.postcode_fake and l.dob = r.dob\",\n",
- "]\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(\n",
- " deterministic_rules, recall=0.6\n",
- ")"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
- },
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=2e6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:42.331754Z",
+ "iopub.status.busy": "2024-06-07T09:20:42.331463Z",
+ "iopub.status.idle": "2024-06-07T09:20:44.521913Z",
+ "shell.execute_reply": "2024-06-07T09:20:44.521209Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:39.811242Z",
- "iopub.status.busy": "2024-06-07T09:20:39.810994Z",
- "iopub.status.idle": "2024-06-07T09:20:42.328241Z",
- "shell.execute_reply": "2024-06-07T09:20:42.327675Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "52b43c785e454da7b1dd85706a92c17c",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "c059aca16c8c48b0bb223e4350ba465f",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=2e6)"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:42.331754Z",
- "iopub.status.busy": "2024-06-07T09:20:42.331463Z",
- "iopub.status.idle": "2024-06-07T09:20:44.521913Z",
- "shell.execute_reply": "2024-06-07T09:20:44.521209Z"
- }
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "52b43c785e454da7b1dd85706a92c17c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'first_name_surname':\n",
- " m values not fully trained\n",
- "Comparison: 'first_name_surname':\n",
- " u values not fully trained\n",
- "Comparison: 'dob':\n",
- " m values not fully trained\n",
- "Comparison: 'postcode_fake':\n",
- " m values not fully trained\n",
- "Comparison: 'birth_place':\n",
- " m values not fully trained\n",
- "Comparison: 'occupation':\n",
- " m values not fully trained\n"
- ]
- }
- ],
- "source": [
- "results = linker.inference.predict(threshold_match_probability=0.9)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'first_name_surname':\n",
+ " m values not fully trained\n",
+ "Comparison: 'first_name_surname':\n",
+ " u values not fully trained\n",
+ "Comparison: 'dob':\n",
+ " m values not fully trained\n",
+ "Comparison: 'postcode_fake':\n",
+ " m values not fully trained\n",
+ "Comparison: 'birth_place':\n",
+ " m values not fully trained\n",
+ "Comparison: 'occupation':\n",
+ " m values not fully trained\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = linker.inference.predict(threshold_match_probability=0.9)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:20:44.525778Z",
+ "iopub.status.busy": "2024-06-07T09:20:44.525492Z",
+ "iopub.status.idle": "2024-06-07T09:20:44.543212Z",
+ "shell.execute_reply": "2024-06-07T09:20:44.542595Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:20:44.525778Z",
- "iopub.status.busy": "2024-06-07T09:20:44.525492Z",
- "iopub.status.idle": "2024-06-07T09:20:44.543212Z",
- "shell.execute_reply": "2024-06-07T09:20:44.542595Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " surname_l | \n",
- " surname_r | \n",
- " first_and_surname_l | \n",
- " first_and_surname_r | \n",
- " ... | \n",
- " gamma_postcode_fake | \n",
- " birth_place_l | \n",
- " birth_place_r | \n",
- " gamma_birth_place | \n",
- " occupation_l | \n",
- " occupation_r | \n",
- " gamma_occupation | \n",
- " full_name_l | \n",
- " full_name_r | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 3.170005 | \n",
- " 0.900005 | \n",
- " Q7412607-1 | \n",
- " Q7412607-3 | \n",
- " samuel | \n",
- " samuel | \n",
- " shelley | \n",
- " shelley | \n",
- " samuel shelley | \n",
- " samuel shelley | \n",
- " ... | \n",
- " 0 | \n",
- " whitechapel | \n",
- " city of london | \n",
- " 0 | \n",
- " illuminator | \n",
- " illuminator | \n",
- " 1 | \n",
- " samuel shelley | \n",
- " samuel shelley | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 3.170695 | \n",
- " 0.900048 | \n",
- " Q15997578-4 | \n",
- " Q15997578-7 | \n",
- " job | \n",
- " wilding | \n",
- " wilding | \n",
- " None | \n",
- " job wilding | \n",
- " wilding | \n",
- " ... | \n",
- " -1 | \n",
- " wrexham | \n",
- " wrexham | \n",
- " 2 | \n",
- " association football player | \n",
- " association football player | \n",
- " 1 | \n",
- " job wilding | \n",
- " wilding | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3.170695 | \n",
- " 0.900048 | \n",
- " Q15997578-2 | \n",
- " Q15997578-7 | \n",
- " job | \n",
- " wilding | \n",
- " wilding | \n",
- " None | \n",
- " job wilding | \n",
- " wilding | \n",
- " ... | \n",
- " -1 | \n",
- " wrexham | \n",
- " wrexham | \n",
- " 2 | \n",
- " association football player | \n",
- " association football player | \n",
- " 1 | \n",
- " job wilding | \n",
- " wilding | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 3.170695 | \n",
- " 0.900048 | \n",
- " Q15997578-1 | \n",
- " Q15997578-7 | \n",
- " job | \n",
- " wilding | \n",
- " wilding | \n",
- " None | \n",
- " job wilding | \n",
- " wilding | \n",
- " ... | \n",
- " -1 | \n",
- " wrexham | \n",
- " wrexham | \n",
- " 2 | \n",
- " association football player | \n",
- " association football player | \n",
- " 1 | \n",
- " job wilding | \n",
- " wilding | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 3.172553 | \n",
- " 0.900164 | \n",
- " Q5726641-11 | \n",
- " Q5726641-8 | \n",
- " henry | \n",
- " harry | \n",
- " page | \n",
- " paige | \n",
- " henry page | \n",
- " harry paige | \n",
- " ... | \n",
- " 2 | \n",
- " staffordshire moorlands | \n",
- " staffordshire moorlands | \n",
- " 2 | \n",
- " cricketer | \n",
- " cricketer | \n",
- " 1 | \n",
- " henry page | \n",
- " harry paige | \n",
- " 3 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 26 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
- "0 3.170005 0.900005 Q7412607-1 Q7412607-3 samuel \n",
- "1 3.170695 0.900048 Q15997578-4 Q15997578-7 job \n",
- "2 3.170695 0.900048 Q15997578-2 Q15997578-7 job \n",
- "3 3.170695 0.900048 Q15997578-1 Q15997578-7 job \n",
- "4 3.172553 0.900164 Q5726641-11 Q5726641-8 henry \n",
- "\n",
- " first_name_r surname_l surname_r first_and_surname_l first_and_surname_r \\\n",
- "0 samuel shelley shelley samuel shelley samuel shelley \n",
- "1 wilding wilding None job wilding wilding \n",
- "2 wilding wilding None job wilding wilding \n",
- "3 wilding wilding None job wilding wilding \n",
- "4 harry page paige henry page harry paige \n",
- "\n",
- " ... gamma_postcode_fake birth_place_l birth_place_r \\\n",
- "0 ... 0 whitechapel city of london \n",
- "1 ... -1 wrexham wrexham \n",
- "2 ... -1 wrexham wrexham \n",
- "3 ... -1 wrexham wrexham \n",
- "4 ... 2 staffordshire moorlands staffordshire moorlands \n",
- "\n",
- " gamma_birth_place occupation_l \\\n",
- "0 0 illuminator \n",
- "1 2 association football player \n",
- "2 2 association football player \n",
- "3 2 association football player \n",
- "4 2 cricketer \n",
- "\n",
- " occupation_r gamma_occupation full_name_l \\\n",
- "0 illuminator 1 samuel shelley \n",
- "1 association football player 1 job wilding \n",
- "2 association football player 1 job wilding \n",
- "3 association football player 1 job wilding \n",
- "4 cricketer 1 henry page \n",
- "\n",
- " full_name_r match_key \n",
- "0 samuel shelley 0 \n",
- "1 wilding 2 \n",
- "2 wilding 2 \n",
- "3 wilding 2 \n",
- "4 harry paige 3 \n",
- "\n",
- "[5 rows x 26 columns]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " surname_l | \n",
+ " surname_r | \n",
+ " first_and_surname_l | \n",
+ " first_and_surname_r | \n",
+ " ... | \n",
+ " gamma_postcode_fake | \n",
+ " birth_place_l | \n",
+ " birth_place_r | \n",
+ " gamma_birth_place | \n",
+ " occupation_l | \n",
+ " occupation_r | \n",
+ " gamma_occupation | \n",
+ " full_name_l | \n",
+ " full_name_r | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 3.170005 | \n",
+ " 0.900005 | \n",
+ " Q7412607-1 | \n",
+ " Q7412607-3 | \n",
+ " samuel | \n",
+ " samuel | \n",
+ " shelley | \n",
+ " shelley | \n",
+ " samuel shelley | \n",
+ " samuel shelley | \n",
+ " ... | \n",
+ " 0 | \n",
+ " whitechapel | \n",
+ " city of london | \n",
+ " 0 | \n",
+ " illuminator | \n",
+ " illuminator | \n",
+ " 1 | \n",
+ " samuel shelley | \n",
+ " samuel shelley | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3.170695 | \n",
+ " 0.900048 | \n",
+ " Q15997578-4 | \n",
+ " Q15997578-7 | \n",
+ " job | \n",
+ " wilding | \n",
+ " wilding | \n",
+ " None | \n",
+ " job wilding | \n",
+ " wilding | \n",
+ " ... | \n",
+ " -1 | \n",
+ " wrexham | \n",
+ " wrexham | \n",
+ " 2 | \n",
+ " association football player | \n",
+ " association football player | \n",
+ " 1 | \n",
+ " job wilding | \n",
+ " wilding | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3.170695 | \n",
+ " 0.900048 | \n",
+ " Q15997578-2 | \n",
+ " Q15997578-7 | \n",
+ " job | \n",
+ " wilding | \n",
+ " wilding | \n",
+ " None | \n",
+ " job wilding | \n",
+ " wilding | \n",
+ " ... | \n",
+ " -1 | \n",
+ " wrexham | \n",
+ " wrexham | \n",
+ " 2 | \n",
+ " association football player | \n",
+ " association football player | \n",
+ " 1 | \n",
+ " job wilding | \n",
+ " wilding | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3.170695 | \n",
+ " 0.900048 | \n",
+ " Q15997578-1 | \n",
+ " Q15997578-7 | \n",
+ " job | \n",
+ " wilding | \n",
+ " wilding | \n",
+ " None | \n",
+ " job wilding | \n",
+ " wilding | \n",
+ " ... | \n",
+ " -1 | \n",
+ " wrexham | \n",
+ " wrexham | \n",
+ " 2 | \n",
+ " association football player | \n",
+ " association football player | \n",
+ " 1 | \n",
+ " job wilding | \n",
+ " wilding | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 3.172553 | \n",
+ " 0.900164 | \n",
+ " Q5726641-11 | \n",
+ " Q5726641-8 | \n",
+ " henry | \n",
+ " harry | \n",
+ " page | \n",
+ " paige | \n",
+ " henry page | \n",
+ " harry paige | \n",
+ " ... | \n",
+ " 2 | \n",
+ " staffordshire moorlands | \n",
+ " staffordshire moorlands | \n",
+ " 2 | \n",
+ " cricketer | \n",
+ " cricketer | \n",
+ " 1 | \n",
+ " henry page | \n",
+ " harry paige | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 26 columns
\n",
+ "
"
],
- "source": [
- "results.as_pandas_dataframe(limit=5)"
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
+ "0 3.170005 0.900005 Q7412607-1 Q7412607-3 samuel \n",
+ "1 3.170695 0.900048 Q15997578-4 Q15997578-7 job \n",
+ "2 3.170695 0.900048 Q15997578-2 Q15997578-7 job \n",
+ "3 3.170695 0.900048 Q15997578-1 Q15997578-7 job \n",
+ "4 3.172553 0.900164 Q5726641-11 Q5726641-8 henry \n",
+ "\n",
+ " first_name_r surname_l surname_r first_and_surname_l first_and_surname_r \\\n",
+ "0 samuel shelley shelley samuel shelley samuel shelley \n",
+ "1 wilding wilding None job wilding wilding \n",
+ "2 wilding wilding None job wilding wilding \n",
+ "3 wilding wilding None job wilding wilding \n",
+ "4 harry page paige henry page harry paige \n",
+ "\n",
+ " ... gamma_postcode_fake birth_place_l birth_place_r \\\n",
+ "0 ... 0 whitechapel city of london \n",
+ "1 ... -1 wrexham wrexham \n",
+ "2 ... -1 wrexham wrexham \n",
+ "3 ... -1 wrexham wrexham \n",
+ "4 ... 2 staffordshire moorlands staffordshire moorlands \n",
+ "\n",
+ " gamma_birth_place occupation_l \\\n",
+ "0 0 illuminator \n",
+ "1 2 association football player \n",
+ "2 2 association football player \n",
+ "3 2 association football player \n",
+ "4 2 cricketer \n",
+ "\n",
+ " occupation_r gamma_occupation full_name_l \\\n",
+ "0 illuminator 1 samuel shelley \n",
+ "1 association football player 1 job wilding \n",
+ "2 association football player 1 job wilding \n",
+ "3 association football player 1 job wilding \n",
+ "4 cricketer 1 henry page \n",
+ "\n",
+ " full_name_r match_key \n",
+ "0 samuel shelley 0 \n",
+ "1 wilding 2 \n",
+ "2 wilding 2 \n",
+ "3 wilding 2 \n",
+ "4 harry paige 3 \n",
+ "\n",
+ "[5 rows x 26 columns]"
]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "6b84d4a42f1a479ca6d8e1b02ccd8eda": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "df640bcb35b2441a904ae87dc47249f9": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_6b84d4a42f1a479ca6d8e1b02ccd8eda",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_e2e4b97696234790991bc2a5ca2e731a",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "e2e4b97696234790991bc2a5ca2e731a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
- }
- }
+ ],
+ "source": [
+ "results.as_pandas_dataframe(limit=5)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "6b84d4a42f1a479ca6d8e1b02ccd8eda": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "df640bcb35b2441a904ae87dc47249f9": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_6b84d4a42f1a479ca6d8e1b02ccd8eda",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_e2e4b97696234790991bc2a5ca2e731a",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "e2e4b97696234790991bc2a5ca2e731a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/duckdb/real_time_record_linkage.ipynb b/docs/demos/examples/duckdb/real_time_record_linkage.ipynb
index b91260f051..691222c53e 100644
--- a/docs/demos/examples/duckdb/real_time_record_linkage.ipynb
+++ b/docs/demos/examples/duckdb/real_time_record_linkage.ipynb
@@ -1,3062 +1,3064 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Real time linkage\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this notebook, we demonstrate splink's incremental and real time linkage capabilities - specifically:\n",
- "\n",
- "- the `linker.inference.compare_two_records` function, that allows you to interactively explore the results of a linkage model; and\n",
- "- the `linker.find_matches_to_new_records` that allows you to incrementally find matches to a small number of new records\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Real time linkage\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this notebook, we demonstrate splink's incremental and real time linkage capabilities - specifically:\n",
+ "\n",
+ "- the `linker.inference.compare_two_records` function, that allows you to interactively explore the results of a linkage model; and\n",
+ "- the `linker.inference.find_matches_to_new_records` function, that allows you to incrementally find matches to a small number of new records\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:11.870063Z",
+ "iopub.status.busy": "2024-03-27T15:15:11.869757Z",
+ "iopub.status.idle": "2024-03-27T15:15:11.890661Z",
+ "shell.execute_reply": "2024-03-27T15:15:11.889929Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install ipywidgets\n",
+ "# !pip install splink\n",
+ "# !jupyter nbextension enable --py widgetsnbextension"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Step 1: Load a pre-trained linkage model\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:11.894528Z",
+ "iopub.status.busy": "2024-03-27T15:15:11.894247Z",
+ "iopub.status.idle": "2024-03-27T15:15:13.841789Z",
+ "shell.execute_reply": "2024-03-27T15:15:13.841226Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import urllib.request\n",
+ "import json\n",
+ "from pathlib import Path\n",
+ "from splink import Linker, DuckDBAPI, block_on, SettingsCreator, splink_datasets\n",
+ "\n",
+ "df = splink_datasets.fake_1000\n",
+ "\n",
+ "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink_demos/master/demo_settings/real_time_settings.json\"\n",
+ "\n",
+ "with urllib.request.urlopen(url) as u:\n",
+ " settings = json.loads(u.read().decode())\n",
+ "\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:13.845679Z",
+ "iopub.status.busy": "2024-03-27T15:15:13.845274Z",
+ "iopub.status.idle": "2024-03-27T15:15:14.721033Z",
+ "shell.execute_reply": "2024-03-27T15:15:14.720417Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:11.870063Z",
- "iopub.status.busy": "2024-03-27T15:15:11.869757Z",
- "iopub.status.idle": "2024-03-27T15:15:11.890661Z",
- "shell.execute_reply": "2024-03-27T15:15:11.889929Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install ipywidgets\n",
- "# !pip install splink\n",
- "# !jupyter nbextension enable --py widgetsnbextension"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.waterfall_chart(\n",
+ " linker.inference.predict().as_record_dict(limit=2)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Step 2: Comparing two records\n",
+ "\n",
+ "It's now possible to compute a match weight for any two records using `linker.inference.compare_two_records()`\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:14.724585Z",
+ "iopub.status.busy": "2024-03-27T15:15:14.724327Z",
+ "iopub.status.idle": "2024-03-27T15:15:14.962647Z",
+ "shell.execute_reply": "2024-03-27T15:15:14.961740Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Step 1: Load a pre-trained linkage model\n"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " gamma_first_name | \n",
+ " tf_first_name_l | \n",
+ " tf_first_name_r | \n",
+ " bf_first_name | \n",
+ " ... | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " email_l | \n",
+ " email_r | \n",
+ " gamma_email | \n",
+ " tf_email_l | \n",
+ " tf_email_r | \n",
+ " bf_email | \n",
+ " bf_tf_adj_email | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 13.161672 | \n",
+ " 0.999891 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Lucas | \n",
+ " Lucas | \n",
+ " 2 | \n",
+ " 0.001203 | \n",
+ " 0.001203 | \n",
+ " 87.571229 | \n",
+ " ... | \n",
+ " 0.446404 | \n",
+ " 1.0 | \n",
+ " lucas.smith@hotmail.com | \n",
+ " lucas.smith@hotmail.com | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 263.229168 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1 rows × 40 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
+ "0 13.161672 0.999891 1 2 Lucas \n",
+ "\n",
+ " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
+ "0 Lucas 2 0.001203 0.001203 \n",
+ "\n",
+ " bf_first_name ... bf_city bf_tf_adj_city email_l \\\n",
+ "0 87.571229 ... 0.446404 1.0 lucas.smith@hotmail.com \n",
+ "\n",
+ " email_r gamma_email tf_email_l tf_email_r bf_email \\\n",
+ "0 lucas.smith@hotmail.com 1 NaN NaN 263.229168 \n",
+ "\n",
+ " bf_tf_adj_email match_key \n",
+ "0 1.0 0 \n",
+ "\n",
+ "[1 rows x 40 columns]"
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "record_1 = {\n",
+ " \"unique_id\": 1,\n",
+ " \"first_name\": \"Lucas\",\n",
+ " \"surname\": \"Smith\",\n",
+ " \"dob\": \"1984-01-02\",\n",
+ " \"city\": \"London\",\n",
+ " \"email\": \"lucas.smith@hotmail.com\",\n",
+ "}\n",
+ "\n",
+ "record_2 = {\n",
+ " \"unique_id\": 2,\n",
+ " \"first_name\": \"Lucas\",\n",
+ " \"surname\": \"Smith\",\n",
+ " \"dob\": \"1983-02-12\",\n",
+ " \"city\": \"Machester\",\n",
+ " \"email\": \"lucas.smith@hotmail.com\",\n",
+ "}\n",
+ "\n",
+ "linker._settings_obj._retain_intermediate_calculation_columns = True\n",
+ "\n",
+ "\n",
+ "# To run `compare_two_records`, the linker needs term frequency tables to be available\n",
+ "# If you have precomputed tables, you can register them with linker.table_management.register_term_frequency_lookup() instead\n",
+ "linker.table_management.compute_tf_table(\"first_name\")\n",
+ "linker.table_management.compute_tf_table(\"surname\")\n",
+ "linker.table_management.compute_tf_table(\"dob\")\n",
+ "linker.table_management.compute_tf_table(\"city\")\n",
+ "linker.table_management.compute_tf_table(\"email\")\n",
+ "\n",
+ "\n",
+ "df_two = linker.inference.compare_two_records(record_1, record_2)\n",
+ "df_two.as_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Step 3: Interactive comparisons\n",
+ "\n",
+ "One interesting application of `compare_two_records` is to create a simple interface that allows the user to input two records interactively, and get real time feedback.\n",
+ "\n",
+ "In the following cell we use `ipywidgets` for this purpose. ✨✨ Change the values in the text boxes to see the waterfall chart update in real time. ✨✨\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:14.968237Z",
+ "iopub.status.busy": "2024-03-27T15:15:14.967899Z",
+ "iopub.status.idle": "2024-03-27T15:15:15.926984Z",
+ "shell.execute_reply": "2024-03-27T15:15:15.925656Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:11.894528Z",
- "iopub.status.busy": "2024-03-27T15:15:11.894247Z",
- "iopub.status.idle": "2024-03-27T15:15:13.841789Z",
- "shell.execute_reply": "2024-03-27T15:15:13.841226Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d3c8f243ce6848518e4fe3093cb9422a",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [],
- "source": [
- "import urllib.request\n",
- "import json\n",
- "from pathlib import Path\n",
- "from splink import Linker, DuckDBAPI, block_on, SettingsCreator, splink_datasets\n",
- "\n",
- "df = splink_datasets.fake_1000\n",
- "\n",
- "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink_demos/master/demo_settings/real_time_settings.json\"\n",
- "\n",
- "with urllib.request.urlopen(url) as u:\n",
- " settings = json.loads(u.read().decode())\n",
- "\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())"
+ "text/plain": [
+ "HBox(children=(VBox(children=(Text(value='1', description='unique_id'), Text(value='Lucas', description='first…"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:13.845679Z",
- "iopub.status.busy": "2024-03-27T15:15:13.845274Z",
- "iopub.status.idle": "2024-03-27T15:15:14.721033Z",
- "shell.execute_reply": "2024-03-27T15:15:14.720417Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f04362ad6bd648f8839b8e7048c9f6f6",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.waterfall_chart(\n",
- " linker.inference.predict().as_record_dict(limit=2)\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Step Comparing two records\n",
- "\n",
- "It's now possible to compute a match weight for any two records using `linker.inference.compare_two_records()`\n"
+ "text/plain": [
+ "Output()"
]
- },
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import ipywidgets as widgets\n",
+ "from IPython.display import display\n",
+ "\n",
+ "\n",
+ "fields = [\"unique_id\", \"first_name\", \"surname\", \"dob\", \"email\", \"city\"]\n",
+ "\n",
+ "left_text_boxes = []\n",
+ "right_text_boxes = []\n",
+ "\n",
+ "inputs_to_interactive_output = {}\n",
+ "\n",
+ "for f in fields:\n",
+ " wl = widgets.Text(description=f, value=str(record_1[f]))\n",
+ " left_text_boxes.append(wl)\n",
+ " inputs_to_interactive_output[f\"{f}_l\"] = wl\n",
+ " wr = widgets.Text(description=f, value=str(record_2[f]))\n",
+ " right_text_boxes.append(wr)\n",
+ " inputs_to_interactive_output[f\"{f}_r\"] = wr\n",
+ "\n",
+ "b1 = widgets.VBox(left_text_boxes)\n",
+ "b2 = widgets.VBox(right_text_boxes)\n",
+ "ui = widgets.HBox([b1, b2])\n",
+ "\n",
+ "\n",
+ "def myfn(**kwargs):\n",
+ " my_args = dict(kwargs)\n",
+ "\n",
+ " record_left = {}\n",
+ " record_right = {}\n",
+ "\n",
+ " for key, value in my_args.items():\n",
+ " if value == \"\":\n",
+ " value = None\n",
+ " if key.endswith(\"_l\"):\n",
+ " record_left[key[:-2]] = value\n",
+ " elif key.endswith(\"_r\"):\n",
+ " record_right[key[:-2]] = value\n",
+ "\n",
+ " # Assuming 'linker' is defined earlier in your code\n",
+ " linker._settings_obj._retain_intermediate_calculation_columns = True\n",
+ "\n",
+ " df_two = linker.inference.compare_two_records(record_left, record_right)\n",
+ "\n",
+ " recs = df_two.as_pandas_dataframe().to_dict(orient=\"records\")\n",
+ "\n",
+ " display(linker.visualisations.waterfall_chart(recs, filter_nulls=False))\n",
+ "\n",
+ "\n",
+ "out = widgets.interactive_output(myfn, inputs_to_interactive_output)\n",
+ "\n",
+ "display(ui, out)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Finding matching records interactively\n",
+ "\n",
+ "It is also possible to search the records in the input dataset rapidly using the `linker.inference.find_matches_to_new_records()` function\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:15.937800Z",
+ "iopub.status.busy": "2024-03-27T15:15:15.935943Z",
+ "iopub.status.idle": "2024-03-27T15:15:16.477834Z",
+ "shell.execute_reply": "2024-03-27T15:15:16.474896Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:14.724585Z",
- "iopub.status.busy": "2024-03-27T15:15:14.724327Z",
- "iopub.status.idle": "2024-03-27T15:15:14.962647Z",
- "shell.execute_reply": "2024-03-27T15:15:14.961740Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " gamma_first_name | \n",
- " tf_first_name_l | \n",
- " tf_first_name_r | \n",
- " bf_first_name | \n",
- " ... | \n",
- " bf_city | \n",
- " bf_tf_adj_city | \n",
- " email_l | \n",
- " email_r | \n",
- " gamma_email | \n",
- " tf_email_l | \n",
- " tf_email_r | \n",
- " bf_email | \n",
- " bf_tf_adj_email | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 13.161672 | \n",
- " 0.999891 | \n",
- " 1 | \n",
- " 2 | \n",
- " Lucas | \n",
- " Lucas | \n",
- " 2 | \n",
- " 0.001203 | \n",
- " 0.001203 | \n",
- " 87.571229 | \n",
- " ... | \n",
- " 0.446404 | \n",
- " 1.0 | \n",
- " lucas.smith@hotmail.com | \n",
- " lucas.smith@hotmail.com | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 263.229168 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
1 rows × 40 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
- "0 13.161672 0.999891 1 2 Lucas \n",
- "\n",
- " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
- "0 Lucas 2 0.001203 0.001203 \n",
- "\n",
- " bf_first_name ... bf_city bf_tf_adj_city email_l \\\n",
- "0 87.571229 ... 0.446404 1.0 lucas.smith@hotmail.com \n",
- "\n",
- " email_r gamma_email tf_email_l tf_email_r bf_email \\\n",
- "0 lucas.smith@hotmail.com 1 NaN NaN 263.229168 \n",
- "\n",
- " bf_tf_adj_email match_key \n",
- "0 1.0 0 \n",
- "\n",
- "[1 rows x 40 columns]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " gamma_first_name | \n",
+ " tf_first_name_l | \n",
+ " tf_first_name_r | \n",
+ " bf_first_name | \n",
+ " ... | \n",
+ " tf_city_r | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " email_l | \n",
+ " email_r | \n",
+ " gamma_email | \n",
+ " tf_email_l | \n",
+ " tf_email_r | \n",
+ " bf_email | \n",
+ " bf_tf_adj_email | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 6 | \n",
+ " 23.531793 | \n",
+ " 1.000000 | \n",
+ " 0 | \n",
+ " 123987 | \n",
+ " Robert | \n",
+ " Robert | \n",
+ " 2 | \n",
+ " 0.003610 | \n",
+ " 0.00361 | \n",
+ " 87.571229 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " robert255@smith.net | \n",
+ " robert255@smith.net | \n",
+ " 1 | \n",
+ " 0.001267 | \n",
+ " 0.001267 | \n",
+ " 263.229168 | \n",
+ " 1.730964 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 14.550320 | \n",
+ " 0.999958 | \n",
+ " 1 | \n",
+ " 123987 | \n",
+ " Robert | \n",
+ " Robert | \n",
+ " 2 | \n",
+ " 0.003610 | \n",
+ " 0.00361 | \n",
+ " 87.571229 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " roberta25@smith.net | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ " 0.002535 | \n",
+ " 0.001267 | \n",
+ " 0.423438 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 10.388623 | \n",
+ " 0.999255 | \n",
+ " 3 | \n",
+ " 123987 | \n",
+ " Robert | \n",
+ " Robert | \n",
+ " 2 | \n",
+ " 0.003610 | \n",
+ " 0.00361 | \n",
+ " 87.571229 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 0.446404 | \n",
+ " 1.000000 | \n",
+ " None | \n",
+ " robert255@smith.net | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.001267 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2.427256 | \n",
+ " 0.843228 | \n",
+ " 2 | \n",
+ " 123987 | \n",
+ " Rob | \n",
+ " Robert | \n",
+ " 0 | \n",
+ " 0.001203 | \n",
+ " 0.00361 | \n",
+ " 0.218767 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 10.484859 | \n",
+ " 0.259162 | \n",
+ " roberta25@smith.net | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ " 0.002535 | \n",
+ " 0.001267 | \n",
+ " 0.423438 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " -2.123090 | \n",
+ " 0.186697 | \n",
+ " 8 | \n",
+ " 123987 | \n",
+ " None | \n",
+ " Robert | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.00361 | \n",
+ " 1.000000 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " None | \n",
+ " robert255@smith.net | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.001267 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " -2.205894 | \n",
+ " 0.178139 | \n",
+ " 754 | \n",
+ " 123987 | \n",
+ " None | \n",
+ " Robert | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.00361 | \n",
+ " 1.000000 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " j.c@whige.wort | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ " 0.001267 | \n",
+ " 0.001267 | \n",
+ " 0.423438 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " -2.802309 | \n",
+ " 0.125383 | \n",
+ " 750 | \n",
+ " 123987 | \n",
+ " None | \n",
+ " Robert | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.00361 | \n",
+ " 1.000000 | \n",
+ " ... | \n",
+ " 0.212792 | \n",
+ " 10.484859 | \n",
+ " 0.259162 | \n",
+ " j.c@white.org | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ " 0.002535 | \n",
+ " 0.001267 | \n",
+ " 0.423438 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
7 rows × 39 columns
\n",
+ "
"
],
- "source": [
- "record_1 = {\n",
- " \"unique_id\": 1,\n",
- " \"first_name\": \"Lucas\",\n",
- " \"surname\": \"Smith\",\n",
- " \"dob\": \"1984-01-02\",\n",
- " \"city\": \"London\",\n",
- " \"email\": \"lucas.smith@hotmail.com\",\n",
- "}\n",
- "\n",
- "record_2 = {\n",
- " \"unique_id\": 2,\n",
- " \"first_name\": \"Lucas\",\n",
- " \"surname\": \"Smith\",\n",
- " \"dob\": \"1983-02-12\",\n",
- " \"city\": \"Machester\",\n",
- " \"email\": \"lucas.smith@hotmail.com\",\n",
- "}\n",
- "\n",
- "linker._settings_obj._retain_intermediate_calculation_columns = True\n",
- "\n",
- "\n",
- "# To `compare_two_records` the linker needs to compute term frequency tables\n",
- "# If you have precomputed tables, you can linker.table_management.register_term_frequency_lookup()\n",
- "linker.table_management.compute_tf_table(\"first_name\")\n",
- "linker.table_management.compute_tf_table(\"surname\")\n",
- "linker.table_management.compute_tf_table(\"dob\")\n",
- "linker.table_management.compute_tf_table(\"city\")\n",
- "linker.table_management.compute_tf_table(\"email\")\n",
- "\n",
- "\n",
- "df_two = linker.inference.compare_two_records(record_1, record_2)\n",
- "df_two.as_pandas_dataframe()"
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
+ "6 23.531793 1.000000 0 123987 Robert \n",
+ "5 14.550320 0.999958 1 123987 Robert \n",
+ "4 10.388623 0.999255 3 123987 Robert \n",
+ "3 2.427256 0.843228 2 123987 Rob \n",
+ "2 -2.123090 0.186697 8 123987 None \n",
+ "1 -2.205894 0.178139 754 123987 None \n",
+ "0 -2.802309 0.125383 750 123987 None \n",
+ "\n",
+ " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
+ "6 Robert 2 0.003610 0.00361 \n",
+ "5 Robert 2 0.003610 0.00361 \n",
+ "4 Robert 2 0.003610 0.00361 \n",
+ "3 Robert 0 0.001203 0.00361 \n",
+ "2 Robert -1 NaN 0.00361 \n",
+ "1 Robert -1 NaN 0.00361 \n",
+ "0 Robert -1 NaN 0.00361 \n",
+ "\n",
+ " bf_first_name ... tf_city_r bf_city bf_tf_adj_city \\\n",
+ "6 87.571229 ... 0.212792 1.000000 1.000000 \n",
+ "5 87.571229 ... 0.212792 1.000000 1.000000 \n",
+ "4 87.571229 ... 0.212792 0.446404 1.000000 \n",
+ "3 0.218767 ... 0.212792 10.484859 0.259162 \n",
+ "2 1.000000 ... 0.212792 1.000000 1.000000 \n",
+ "1 1.000000 ... 0.212792 1.000000 1.000000 \n",
+ "0 1.000000 ... 0.212792 10.484859 0.259162 \n",
+ "\n",
+ " email_l email_r gamma_email tf_email_l \\\n",
+ "6 robert255@smith.net robert255@smith.net 1 0.001267 \n",
+ "5 roberta25@smith.net robert255@smith.net 0 0.002535 \n",
+ "4 None robert255@smith.net -1 NaN \n",
+ "3 roberta25@smith.net robert255@smith.net 0 0.002535 \n",
+ "2 None robert255@smith.net -1 NaN \n",
+ "1 j.c@whige.wort robert255@smith.net 0 0.001267 \n",
+ "0 j.c@white.org robert255@smith.net 0 0.002535 \n",
+ "\n",
+ " tf_email_r bf_email bf_tf_adj_email \n",
+ "6 0.001267 263.229168 1.730964 \n",
+ "5 0.001267 0.423438 1.000000 \n",
+ "4 0.001267 1.000000 1.000000 \n",
+ "3 0.001267 0.423438 1.000000 \n",
+ "2 0.001267 1.000000 1.000000 \n",
+ "1 0.001267 0.423438 1.000000 \n",
+ "0 0.001267 0.423438 1.000000 \n",
+ "\n",
+ "[7 rows x 39 columns]"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Step 3: Interactive comparisons\n",
- "\n",
- "One interesting applicatin of `compare_two_records` is to create a simple interface that allows the user to input two records interactively, and get real time feedback.\n",
- "\n",
- "In the following cell we use `ipywidets` for this purpose. ✨✨ Change the values in the text boxes to see the waterfall chart update in real time. ✨✨\n"
- ]
- },
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "record = {\n",
+ " \"unique_id\": 123987,\n",
+ " \"first_name\": \"Robert\",\n",
+ " \"surname\": \"Alan\",\n",
+ " \"dob\": \"1971-05-24\",\n",
+ " \"city\": \"London\",\n",
+ " \"email\": \"robert255@smith.net\",\n",
+ "}\n",
+ "\n",
+ "\n",
+ "df_inc = linker.inference.find_matches_to_new_records(\n",
+ " [record], blocking_rules=[]\n",
+ ").as_pandas_dataframe()\n",
+ "df_inc.sort_values(\"match_weight\", ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Interactive interface for finding records\n",
+ "\n",
+ "Again, we can use `ipywidgets` to build an interactive interface for the `linker.inference.find_matches_to_new_records` function\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:16.486337Z",
+ "iopub.status.busy": "2024-03-27T15:15:16.484941Z",
+ "iopub.status.idle": "2024-03-27T15:15:17.549243Z",
+ "shell.execute_reply": "2024-03-27T15:15:17.548423Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:14.968237Z",
- "iopub.status.busy": "2024-03-27T15:15:14.967899Z",
- "iopub.status.idle": "2024-03-27T15:15:15.926984Z",
- "shell.execute_reply": "2024-03-27T15:15:15.925656Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4ae33c34076a42088ad5b52beb7a8112",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "d3c8f243ce6848518e4fe3093cb9422a",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(VBox(children=(Text(value='1', description='unique_id'), Text(value='Lucas', description='first…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "f04362ad6bd648f8839b8e7048c9f6f6",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Output()"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import ipywidgets as widgets\n",
- "from IPython.display import display\n",
- "\n",
- "\n",
- "fields = [\"unique_id\", \"first_name\", \"surname\", \"dob\", \"email\", \"city\"]\n",
- "\n",
- "left_text_boxes = []\n",
- "right_text_boxes = []\n",
- "\n",
- "inputs_to_interactive_output = {}\n",
- "\n",
- "for f in fields:\n",
- " wl = widgets.Text(description=f, value=str(record_1[f]))\n",
- " left_text_boxes.append(wl)\n",
- " inputs_to_interactive_output[f\"{f}_l\"] = wl\n",
- " wr = widgets.Text(description=f, value=str(record_2[f]))\n",
- " right_text_boxes.append(wr)\n",
- " inputs_to_interactive_output[f\"{f}_r\"] = wr\n",
- "\n",
- "b1 = widgets.VBox(left_text_boxes)\n",
- "b2 = widgets.VBox(right_text_boxes)\n",
- "ui = widgets.HBox([b1, b2])\n",
- "\n",
- "\n",
- "def myfn(**kwargs):\n",
- " my_args = dict(kwargs)\n",
- "\n",
- " record_left = {}\n",
- " record_right = {}\n",
- "\n",
- " for key, value in my_args.items():\n",
- " if value == \"\":\n",
- " value = None\n",
- " if key.endswith(\"_l\"):\n",
- " record_left[key[:-2]] = value\n",
- " elif key.endswith(\"_r\"):\n",
- " record_right[key[:-2]] = value\n",
- "\n",
- " # Assuming 'linker' is defined earlier in your code\n",
- " linker._settings_obj._retain_intermediate_calculation_columns = True\n",
- "\n",
- " df_two = linker.inference.compare_two_records(record_left, record_right)\n",
- "\n",
- " recs = df_two.as_pandas_dataframe().to_dict(orient=\"records\")\n",
- "\n",
- " display(linker.visualisations.waterfall_chart(recs, filter_nulls=False))\n",
- "\n",
- "\n",
- "out = widgets.interactive_output(myfn, inputs_to_interactive_output)\n",
- "\n",
- "display(ui, out)"
+ "text/plain": [
+ "interactive(children=(Text(value='Robert', description='first_name'), Text(value='Alan', description='surname'…"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Finding matching records interactively\n",
- "\n",
- "It is also possible to search the records in the input dataset rapidly using the `linker.find_matches_to_new_records()` function\n"
- ]
- },
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "@widgets.interact(\n",
+ " first_name=\"Robert\",\n",
+ " surname=\"Alan\",\n",
+ " dob=\"1971-05-24\",\n",
+ " city=\"London\",\n",
+ " email=\"robert255@smith.net\",\n",
+ ")\n",
+ "def interactive_link(first_name, surname, dob, city, email):\n",
+ " record = {\n",
+ " \"unique_id\": 123987,\n",
+ " \"first_name\": first_name,\n",
+ " \"surname\": surname,\n",
+ " \"dob\": dob,\n",
+ " \"city\": city,\n",
+ " \"email\": email,\n",
+ " \"group\": 0,\n",
+ " }\n",
+ "\n",
+ " for key in record.keys():\n",
+ " if type(record[key]) == str:\n",
+ " if record[key].strip() == \"\":\n",
+ " record[key] = None\n",
+ "\n",
+ " df_inc = linker.inference.find_matches_to_new_records(\n",
+ " [record], blocking_rules=[f\"(true)\"]\n",
+ " ).as_pandas_dataframe()\n",
+ " df_inc = df_inc.sort_values(\"match_weight\", ascending=False)\n",
+ " recs = df_inc.to_dict(orient=\"records\")\n",
+ "\n",
+ " display(linker.visualisations.waterfall_chart(recs, filter_nulls=False))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-27T15:15:17.555875Z",
+ "iopub.status.busy": "2024-03-27T15:15:17.555576Z",
+ "iopub.status.idle": "2024-03-27T15:15:17.884897Z",
+ "shell.execute_reply": "2024-03-27T15:15:17.884033Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:15.937800Z",
- "iopub.status.busy": "2024-03-27T15:15:15.935943Z",
- "iopub.status.idle": "2024-03-27T15:15:16.477834Z",
- "shell.execute_reply": "2024-03-27T15:15:16.474896Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " gamma_first_name | \n",
- " tf_first_name_l | \n",
- " tf_first_name_r | \n",
- " bf_first_name | \n",
- " ... | \n",
- " tf_city_r | \n",
- " bf_city | \n",
- " bf_tf_adj_city | \n",
- " email_l | \n",
- " email_r | \n",
- " gamma_email | \n",
- " tf_email_l | \n",
- " tf_email_r | \n",
- " bf_email | \n",
- " bf_tf_adj_email | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 6 | \n",
- " 23.531793 | \n",
- " 1.000000 | \n",
- " 0 | \n",
- " 123987 | \n",
- " Robert | \n",
- " Robert | \n",
- " 2 | \n",
- " 0.003610 | \n",
- " 0.00361 | \n",
- " 87.571229 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " robert255@smith.net | \n",
- " robert255@smith.net | \n",
- " 1 | \n",
- " 0.001267 | \n",
- " 0.001267 | \n",
- " 263.229168 | \n",
- " 1.730964 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 14.550320 | \n",
- " 0.999958 | \n",
- " 1 | \n",
- " 123987 | \n",
- " Robert | \n",
- " Robert | \n",
- " 2 | \n",
- " 0.003610 | \n",
- " 0.00361 | \n",
- " 87.571229 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " roberta25@smith.net | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- " 0.002535 | \n",
- " 0.001267 | \n",
- " 0.423438 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 10.388623 | \n",
- " 0.999255 | \n",
- " 3 | \n",
- " 123987 | \n",
- " Robert | \n",
- " Robert | \n",
- " 2 | \n",
- " 0.003610 | \n",
- " 0.00361 | \n",
- " 87.571229 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 0.446404 | \n",
- " 1.000000 | \n",
- " None | \n",
- " robert255@smith.net | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.001267 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 2.427256 | \n",
- " 0.843228 | \n",
- " 2 | \n",
- " 123987 | \n",
- " Rob | \n",
- " Robert | \n",
- " 0 | \n",
- " 0.001203 | \n",
- " 0.00361 | \n",
- " 0.218767 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 10.484859 | \n",
- " 0.259162 | \n",
- " roberta25@smith.net | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- " 0.002535 | \n",
- " 0.001267 | \n",
- " 0.423438 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " -2.123090 | \n",
- " 0.186697 | \n",
- " 8 | \n",
- " 123987 | \n",
- " None | \n",
- " Robert | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.00361 | \n",
- " 1.000000 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " None | \n",
- " robert255@smith.net | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.001267 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " -2.205894 | \n",
- " 0.178139 | \n",
- " 754 | \n",
- " 123987 | \n",
- " None | \n",
- " Robert | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.00361 | \n",
- " 1.000000 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " j.c@whige.wort | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- " 0.001267 | \n",
- " 0.001267 | \n",
- " 0.423438 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- " | 0 | \n",
- " -2.802309 | \n",
- " 0.125383 | \n",
- " 750 | \n",
- " 123987 | \n",
- " None | \n",
- " Robert | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.00361 | \n",
- " 1.000000 | \n",
- " ... | \n",
- " 0.212792 | \n",
- " 10.484859 | \n",
- " 0.259162 | \n",
- " j.c@white.org | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- " 0.002535 | \n",
- " 0.001267 | \n",
- " 0.423438 | \n",
- " 1.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
7 rows × 39 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
- "6 23.531793 1.000000 0 123987 Robert \n",
- "5 14.550320 0.999958 1 123987 Robert \n",
- "4 10.388623 0.999255 3 123987 Robert \n",
- "3 2.427256 0.843228 2 123987 Rob \n",
- "2 -2.123090 0.186697 8 123987 None \n",
- "1 -2.205894 0.178139 754 123987 None \n",
- "0 -2.802309 0.125383 750 123987 None \n",
- "\n",
- " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
- "6 Robert 2 0.003610 0.00361 \n",
- "5 Robert 2 0.003610 0.00361 \n",
- "4 Robert 2 0.003610 0.00361 \n",
- "3 Robert 0 0.001203 0.00361 \n",
- "2 Robert -1 NaN 0.00361 \n",
- "1 Robert -1 NaN 0.00361 \n",
- "0 Robert -1 NaN 0.00361 \n",
- "\n",
- " bf_first_name ... tf_city_r bf_city bf_tf_adj_city \\\n",
- "6 87.571229 ... 0.212792 1.000000 1.000000 \n",
- "5 87.571229 ... 0.212792 1.000000 1.000000 \n",
- "4 87.571229 ... 0.212792 0.446404 1.000000 \n",
- "3 0.218767 ... 0.212792 10.484859 0.259162 \n",
- "2 1.000000 ... 0.212792 1.000000 1.000000 \n",
- "1 1.000000 ... 0.212792 1.000000 1.000000 \n",
- "0 1.000000 ... 0.212792 10.484859 0.259162 \n",
- "\n",
- " email_l email_r gamma_email tf_email_l \\\n",
- "6 robert255@smith.net robert255@smith.net 1 0.001267 \n",
- "5 roberta25@smith.net robert255@smith.net 0 0.002535 \n",
- "4 None robert255@smith.net -1 NaN \n",
- "3 roberta25@smith.net robert255@smith.net 0 0.002535 \n",
- "2 None robert255@smith.net -1 NaN \n",
- "1 j.c@whige.wort robert255@smith.net 0 0.001267 \n",
- "0 j.c@white.org robert255@smith.net 0 0.002535 \n",
- "\n",
- " tf_email_r bf_email bf_tf_adj_email \n",
- "6 0.001267 263.229168 1.730964 \n",
- "5 0.001267 0.423438 1.000000 \n",
- "4 0.001267 1.000000 1.000000 \n",
- "3 0.001267 0.423438 1.000000 \n",
- "2 0.001267 1.000000 1.000000 \n",
- "1 0.001267 0.423438 1.000000 \n",
- "0 0.001267 0.423438 1.000000 \n",
- "\n",
- "[7 rows x 39 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "record = {\n",
- " \"unique_id\": 123987,\n",
- " \"first_name\": \"Robert\",\n",
- " \"surname\": \"Alan\",\n",
- " \"dob\": \"1971-05-24\",\n",
- " \"city\": \"London\",\n",
- " \"email\": \"robert255@smith.net\",\n",
- "}\n",
- "\n",
- "\n",
- "df_inc = linker.inference.find_matches_to_new_records(\n",
- " [record], blocking_rules=[]\n",
- ").as_pandas_dataframe()\n",
- "df_inc.sort_values(\"match_weight\", ascending=False)"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Interactive interface for finding records\n",
- "\n",
- "Again, we can use `ipywidgets` to build an interactive interface for the `linker.find_matches_to_new_records` function\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:16.486337Z",
- "iopub.status.busy": "2024-03-27T15:15:16.484941Z",
- "iopub.status.idle": "2024-03-27T15:15:17.549243Z",
- "shell.execute_reply": "2024-03-27T15:15:17.548423Z"
- }
- },
- "outputs": [
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "04ba5b27fff046cdbcff86aeb938daf6": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "first_name",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_5d4a7365a34746fe9ed2319c19ad0d58",
+ "placeholder": "",
+ "style": "IPY_MODEL_2fe5902f30274f95a26e4c0897b2a011",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Lucas"
+ }
+ },
+ "086fdaefbddf4915be8385c98d46c358": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "unique_id",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_658f48614d3d4c63bbdc946d44b9f734",
+ "placeholder": "",
+ "style": "IPY_MODEL_80683606961c4187965cc00e4944fbe1",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "2"
+ }
+ },
+ "0908dba9a7fa4e22b818b1bb31bd415a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "11bbe083b862406d814c35e47d773ade": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "1b024ddb36284e4b85b5054092f531bc": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "1fa9c4a8c62846f591a1fb4e4d91d3ed": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "dob",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_cb0b167a052745fd9204729f961b7ad6",
+ "placeholder": "",
+ "style": "IPY_MODEL_ca1c754b37a640a69908cb2b34cb7b3c",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "1984-01-02"
+ }
+ },
+ "2191036959a444558a7e6663f51f60d4": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "21ae049d073b458b902a65f9d4b1e0f1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_968249064f3446ee806ef5e750085fe4",
+ "IPY_MODEL_272d325448674ae4882758a7af3d6354"
+ ],
+ "layout": "IPY_MODEL_ecb562219597492fae7b4bf881ea113f",
+ "tabbable": null,
+ "tooltip": null
+ }
+ },
+ "23d4adc30abf4b8c845e3a22a7929ada": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "24f94cd7e9f84e2098d97f75d6f01a8d": {
+ "model_module": "@jupyter-widgets/output",
+ "model_module_version": "1.0.0",
+ "model_name": "OutputModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/output",
+ "_model_module_version": "1.0.0",
+ "_model_name": "OutputModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/output",
+ "_view_module_version": "1.0.0",
+ "_view_name": "OutputView",
+ "layout": "IPY_MODEL_f3de223015854925895cbd794a7ee8c2",
+ "msg_id": "",
+ "outputs": [
{
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "4ae33c34076a42088ad5b52beb7a8112",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "interactive(children=(Text(value='Robert', description='first_name'), Text(value='Alan', description='surname'…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
+ "data": {
+ "text/html": "\n\n\n",
+ "text/plain": "alt.LayerChart(...)"
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
- ],
- "source": [
- "@widgets.interact(\n",
- " first_name=\"Robert\",\n",
- " surname=\"Alan\",\n",
- " dob=\"1971-05-24\",\n",
- " city=\"London\",\n",
- " email=\"robert255@smith.net\",\n",
- ")\n",
- "def interactive_link(first_name, surname, dob, city, email):\n",
- " record = {\n",
- " \"unique_id\": 123987,\n",
- " \"first_name\": first_name,\n",
- " \"surname\": surname,\n",
- " \"dob\": dob,\n",
- " \"city\": city,\n",
- " \"email\": email,\n",
- " \"group\": 0,\n",
- " }\n",
- "\n",
- " for key in record.keys():\n",
- " if type(record[key]) == str:\n",
- " if record[key].strip() == \"\":\n",
- " record[key] = None\n",
- "\n",
- " df_inc = linker.inference.find_matches_to_new_records(\n",
- " [record], blocking_rules=[f\"(true)\"]\n",
- " ).as_pandas_dataframe()\n",
- " df_inc = df_inc.sort_values(\"match_weight\", ascending=False)\n",
- " recs = df_inc.to_dict(orient=\"records\")\n",
- "\n",
- " display(linker.visualisations.waterfall_chart(recs, filter_nulls=False))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-27T15:15:17.555875Z",
- "iopub.status.busy": "2024-03-27T15:15:17.555576Z",
- "iopub.status.idle": "2024-03-27T15:15:17.884897Z",
- "shell.execute_reply": "2024-03-27T15:15:17.884033Z"
- }
- },
- "outputs": [
+ ],
+ "tabbable": null,
+ "tooltip": null
+ }
+ },
+ "250e8abe119446478ca7513778aed39a": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "272d325448674ae4882758a7af3d6354": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "VBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "VBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "VBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_086fdaefbddf4915be8385c98d46c358",
+ "IPY_MODEL_04ba5b27fff046cdbcff86aeb938daf6",
+ "IPY_MODEL_3ce34f18f65c440f94f803b75b882788",
+ "IPY_MODEL_4c51a67113a44ddfaff9108a85c37ec9",
+ "IPY_MODEL_42f888b0586648a3b2cfc3394f083e26",
+ "IPY_MODEL_c651dbf1a60b4f219d31355ec3d9d1cd"
+ ],
+ "layout": "IPY_MODEL_ffdf7e9a14d04f6b9a448df89e8ffbec",
+ "tabbable": null,
+ "tooltip": null
+ }
+ },
+ "27e26efe54ab4971bbc5da3a355071b4": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "2a595765a0184d0db511726d83d67110": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "2fe5902f30274f95a26e4c0897b2a011": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "329729744d0c4e839c0503ab0259b18c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "email",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_c71b8b6aa86343a7b0093f15313677f7",
+ "placeholder": "",
+ "style": "IPY_MODEL_8456ddacbc5542c2a9d668d042a6abe6",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "lucas.smith@hotmail.com"
+ }
+ },
+ "3b9b6f02a8f7491585ed99976548eb03": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "VBoxModel",
+ "state": {
+ "_dom_classes": [
+ "widget-interact"
+ ],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "VBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "VBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_73f2384489da4b748209c462724a3961",
+ "IPY_MODEL_d27918510cb3494c930c545563fed3ca",
+ "IPY_MODEL_b571fbb3f6a14611974779e927914d23",
+ "IPY_MODEL_cc0ddc91bbfb4c6d891c202e279d4c74",
+ "IPY_MODEL_b50d412867114f2eb000fc969b677c9c",
+ "IPY_MODEL_24f94cd7e9f84e2098d97f75d6f01a8d"
+ ],
+ "layout": "IPY_MODEL_43db6d5bd49c46da8203977961927fed",
+ "tabbable": null,
+ "tooltip": null
+ }
+ },
+ "3ce34f18f65c440f94f803b75b882788": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "surname",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_de296f36c45e4161a6c3d091dfea2892",
+ "placeholder": "",
+ "style": "IPY_MODEL_a2c309f444944797824589147aede89d",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Smith"
+ }
+ },
+ "3e98708a97934bb095b2be73187fcec0": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "42f888b0586648a3b2cfc3394f083e26": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "email",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_2191036959a444558a7e6663f51f60d4",
+ "placeholder": "",
+ "style": "IPY_MODEL_0908dba9a7fa4e22b818b1bb31bd415a",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "lucas.smith@hotmail.com"
+ }
+ },
+ "43db6d5bd49c46da8203977961927fed": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "440256de4ff2405e95431f2fd9490436": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "4baf8bf6cf784f5ca98feee93978b3af": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "4c51a67113a44ddfaff9108a85c37ec9": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "dob",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_27e26efe54ab4971bbc5da3a355071b4",
+ "placeholder": "",
+ "style": "IPY_MODEL_7c52d046223c4607b35759a148cc8515",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "1983-02-12"
+ }
+ },
+ "4ccb8090b4a549829a1ac4a82a39ceac": {
+ "model_module": "@jupyter-widgets/output",
+ "model_module_version": "1.0.0",
+ "model_name": "OutputModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/output",
+ "_model_module_version": "1.0.0",
+ "_model_name": "OutputModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/output",
+ "_view_module_version": "1.0.0",
+ "_view_name": "OutputView",
+ "layout": "IPY_MODEL_2a595765a0184d0db511726d83d67110",
+ "msg_id": "",
+ "outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
+ "data": {
+ "text/html": "\n\n\n",
+ "text/plain": "alt.LayerChart(...)"
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
- ],
- "source": [
- "linker.visualisations.match_weights_chart()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "04ba5b27fff046cdbcff86aeb938daf6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "first_name",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_5d4a7365a34746fe9ed2319c19ad0d58",
- "placeholder": "",
- "style": "IPY_MODEL_2fe5902f30274f95a26e4c0897b2a011",
- "tabbable": null,
- "tooltip": null,
- "value": "Lucas"
- }
- },
- "086fdaefbddf4915be8385c98d46c358": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "unique_id",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_658f48614d3d4c63bbdc946d44b9f734",
- "placeholder": "",
- "style": "IPY_MODEL_80683606961c4187965cc00e4944fbe1",
- "tabbable": null,
- "tooltip": null,
- "value": "2"
- }
- },
- "0908dba9a7fa4e22b818b1bb31bd415a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "11bbe083b862406d814c35e47d773ade": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "1b024ddb36284e4b85b5054092f531bc": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "1fa9c4a8c62846f591a1fb4e4d91d3ed": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "dob",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_cb0b167a052745fd9204729f961b7ad6",
- "placeholder": "",
- "style": "IPY_MODEL_ca1c754b37a640a69908cb2b34cb7b3c",
- "tabbable": null,
- "tooltip": null,
- "value": "1984-01-02"
- }
- },
- "2191036959a444558a7e6663f51f60d4": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "21ae049d073b458b902a65f9d4b1e0f1": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_968249064f3446ee806ef5e750085fe4",
- "IPY_MODEL_272d325448674ae4882758a7af3d6354"
- ],
- "layout": "IPY_MODEL_ecb562219597492fae7b4bf881ea113f",
- "tabbable": null,
- "tooltip": null
- }
- },
- "23d4adc30abf4b8c845e3a22a7929ada": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "24f94cd7e9f84e2098d97f75d6f01a8d": {
- "model_module": "@jupyter-widgets/output",
- "model_module_version": "1.0.0",
- "model_name": "OutputModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/output",
- "_model_module_version": "1.0.0",
- "_model_name": "OutputModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/output",
- "_view_module_version": "1.0.0",
- "_view_name": "OutputView",
- "layout": "IPY_MODEL_f3de223015854925895cbd794a7ee8c2",
- "msg_id": "",
- "outputs": [
- {
- "data": {
- "text/html": "\n\n\n",
- "text/plain": "alt.LayerChart(...)"
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "tabbable": null,
- "tooltip": null
- }
- },
- "250e8abe119446478ca7513778aed39a": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "272d325448674ae4882758a7af3d6354": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "VBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "VBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "VBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_086fdaefbddf4915be8385c98d46c358",
- "IPY_MODEL_04ba5b27fff046cdbcff86aeb938daf6",
- "IPY_MODEL_3ce34f18f65c440f94f803b75b882788",
- "IPY_MODEL_4c51a67113a44ddfaff9108a85c37ec9",
- "IPY_MODEL_42f888b0586648a3b2cfc3394f083e26",
- "IPY_MODEL_c651dbf1a60b4f219d31355ec3d9d1cd"
- ],
- "layout": "IPY_MODEL_ffdf7e9a14d04f6b9a448df89e8ffbec",
- "tabbable": null,
- "tooltip": null
- }
- },
- "27e26efe54ab4971bbc5da3a355071b4": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "2a595765a0184d0db511726d83d67110": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "2fe5902f30274f95a26e4c0897b2a011": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "329729744d0c4e839c0503ab0259b18c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "email",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_c71b8b6aa86343a7b0093f15313677f7",
- "placeholder": "",
- "style": "IPY_MODEL_8456ddacbc5542c2a9d668d042a6abe6",
- "tabbable": null,
- "tooltip": null,
- "value": "lucas.smith@hotmail.com"
- }
- },
- "3b9b6f02a8f7491585ed99976548eb03": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "VBoxModel",
- "state": {
- "_dom_classes": [
- "widget-interact"
- ],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "VBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "VBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_73f2384489da4b748209c462724a3961",
- "IPY_MODEL_d27918510cb3494c930c545563fed3ca",
- "IPY_MODEL_b571fbb3f6a14611974779e927914d23",
- "IPY_MODEL_cc0ddc91bbfb4c6d891c202e279d4c74",
- "IPY_MODEL_b50d412867114f2eb000fc969b677c9c",
- "IPY_MODEL_24f94cd7e9f84e2098d97f75d6f01a8d"
- ],
- "layout": "IPY_MODEL_43db6d5bd49c46da8203977961927fed",
- "tabbable": null,
- "tooltip": null
- }
- },
- "3ce34f18f65c440f94f803b75b882788": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "surname",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_de296f36c45e4161a6c3d091dfea2892",
- "placeholder": "",
- "style": "IPY_MODEL_a2c309f444944797824589147aede89d",
- "tabbable": null,
- "tooltip": null,
- "value": "Smith"
- }
- },
- "3e98708a97934bb095b2be73187fcec0": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "42f888b0586648a3b2cfc3394f083e26": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "email",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_2191036959a444558a7e6663f51f60d4",
- "placeholder": "",
- "style": "IPY_MODEL_0908dba9a7fa4e22b818b1bb31bd415a",
- "tabbable": null,
- "tooltip": null,
- "value": "lucas.smith@hotmail.com"
- }
- },
- "43db6d5bd49c46da8203977961927fed": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "440256de4ff2405e95431f2fd9490436": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4baf8bf6cf784f5ca98feee93978b3af": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4c51a67113a44ddfaff9108a85c37ec9": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "dob",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_27e26efe54ab4971bbc5da3a355071b4",
- "placeholder": "",
- "style": "IPY_MODEL_7c52d046223c4607b35759a148cc8515",
- "tabbable": null,
- "tooltip": null,
- "value": "1983-02-12"
- }
- },
- "4ccb8090b4a549829a1ac4a82a39ceac": {
- "model_module": "@jupyter-widgets/output",
- "model_module_version": "1.0.0",
- "model_name": "OutputModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/output",
- "_model_module_version": "1.0.0",
- "_model_name": "OutputModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/output",
- "_view_module_version": "1.0.0",
- "_view_name": "OutputView",
- "layout": "IPY_MODEL_2a595765a0184d0db511726d83d67110",
- "msg_id": "",
- "outputs": [
- {
- "data": {
- "text/html": "\n\n\n",
- "text/plain": "alt.LayerChart(...)"
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "tabbable": null,
- "tooltip": null
- }
- },
- "4f0444bd007f4fb1a2e35bd299cf1989": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "4f4d5437f9574de4a3b98769f2f94d13": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "5831fd1d9a4a4699a1ac8b3f35c67bc3": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "5d4a7365a34746fe9ed2319c19ad0d58": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "658f48614d3d4c63bbdc946d44b9f734": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "68972985dae04505b33533980fc0b970": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "71ef6ceee3034d95a7a718dda4457629": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "first_name",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_4baf8bf6cf784f5ca98feee93978b3af",
- "placeholder": "",
- "style": "IPY_MODEL_9fc84fac4de14d3abe25a8eeb06fadea",
- "tabbable": null,
- "tooltip": null,
- "value": "Lucas"
- }
- },
- "73f2384489da4b748209c462724a3961": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "first_name",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_773e45ebabda4dc0aafd521d3aa3e6a8",
- "placeholder": "",
- "style": "IPY_MODEL_c8a47b61ac8e41c3b0c85fbbf31a4c24",
- "tabbable": null,
- "tooltip": null,
- "value": "Robert"
- }
- },
- "773e45ebabda4dc0aafd521d3aa3e6a8": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "7c52d046223c4607b35759a148cc8515": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "80683606961c4187965cc00e4944fbe1": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "8456ddacbc5542c2a9d668d042a6abe6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "968249064f3446ee806ef5e750085fe4": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "VBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "VBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "VBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_e7d7652b464b413283dd5a057dd68aaf",
- "IPY_MODEL_71ef6ceee3034d95a7a718dda4457629",
- "IPY_MODEL_c1d56f96db3a492b88776c47754d6cf1",
- "IPY_MODEL_1fa9c4a8c62846f591a1fb4e4d91d3ed",
- "IPY_MODEL_329729744d0c4e839c0503ab0259b18c",
- "IPY_MODEL_e9b46eb9fbcb4aa8a569749fa50fcff3"
- ],
- "layout": "IPY_MODEL_e3c8bdbb08184f979824e4993cec1f63",
- "tabbable": null,
- "tooltip": null
- }
- },
- "9fc84fac4de14d3abe25a8eeb06fadea": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "a2c309f444944797824589147aede89d": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "aa3e943234ac46ca8d4530ab22443b0e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "ab457eac76f942508d6f75ff109bbd61": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "b50d412867114f2eb000fc969b677c9c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "email",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_440256de4ff2405e95431f2fd9490436",
- "placeholder": "",
- "style": "IPY_MODEL_aa3e943234ac46ca8d4530ab22443b0e",
- "tabbable": null,
- "tooltip": null,
- "value": "robert255@smith.net"
- }
- },
- "b571fbb3f6a14611974779e927914d23": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "dob",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_3e98708a97934bb095b2be73187fcec0",
- "placeholder": "",
- "style": "IPY_MODEL_fdd25b8bb828465e83f7a3c452ccbfc1",
- "tabbable": null,
- "tooltip": null,
- "value": "1971-05-24"
- }
- },
- "b7e0ec811eba4e59804eca1852402423": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "c1d56f96db3a492b88776c47754d6cf1": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "surname",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_68972985dae04505b33533980fc0b970",
- "placeholder": "",
- "style": "IPY_MODEL_11bbe083b862406d814c35e47d773ade",
- "tabbable": null,
- "tooltip": null,
- "value": "Smith"
- }
- },
- "c2e6efd78b2148f3a026c6179fdbf683": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "c651dbf1a60b4f219d31355ec3d9d1cd": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "city",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_5831fd1d9a4a4699a1ac8b3f35c67bc3",
- "placeholder": "",
- "style": "IPY_MODEL_b7e0ec811eba4e59804eca1852402423",
- "tabbable": null,
- "tooltip": null,
- "value": "Machester"
- }
- },
- "c71b8b6aa86343a7b0093f15313677f7": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "c8a47b61ac8e41c3b0c85fbbf31a4c24": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "ca1c754b37a640a69908cb2b34cb7b3c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "cb0b167a052745fd9204729f961b7ad6": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "cc0ddc91bbfb4c6d891c202e279d4c74": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "city",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_e460aba5b4c34695a30ea7008b5b7cfd",
- "placeholder": "",
- "style": "IPY_MODEL_ab457eac76f942508d6f75ff109bbd61",
- "tabbable": null,
- "tooltip": null,
- "value": "London"
- }
- },
- "d27918510cb3494c930c545563fed3ca": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "surname",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_23d4adc30abf4b8c845e3a22a7929ada",
- "placeholder": "",
- "style": "IPY_MODEL_c2e6efd78b2148f3a026c6179fdbf683",
- "tabbable": null,
- "tooltip": null,
- "value": "Alan"
- }
- },
- "de296f36c45e4161a6c3d091dfea2892": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e3c8bdbb08184f979824e4993cec1f63": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e460aba5b4c34695a30ea7008b5b7cfd": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e7d7652b464b413283dd5a057dd68aaf": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "unique_id",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_4f4d5437f9574de4a3b98769f2f94d13",
- "placeholder": "",
- "style": "IPY_MODEL_1b024ddb36284e4b85b5054092f531bc",
- "tabbable": null,
- "tooltip": null,
- "value": "1"
- }
- },
- "e9b46eb9fbcb4aa8a569749fa50fcff3": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "TextView",
- "continuous_update": true,
- "description": "city",
- "description_allow_html": false,
- "disabled": false,
- "layout": "IPY_MODEL_250e8abe119446478ca7513778aed39a",
- "placeholder": "",
- "style": "IPY_MODEL_4f0444bd007f4fb1a2e35bd299cf1989",
- "tabbable": null,
- "tooltip": null,
- "value": "London"
- }
- },
- "ecb562219597492fae7b4bf881ea113f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "f3de223015854925895cbd794a7ee8c2": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "fdd25b8bb828465e83f7a3c452ccbfc1": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "TextStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "TextStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "background": null,
- "description_width": "",
- "font_size": null,
- "text_color": null
- }
- },
- "ffdf7e9a14d04f6b9a448df89e8ffbec": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
+ ],
+ "tabbable": null,
+ "tooltip": null
}
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
+ },
+ "4f0444bd007f4fb1a2e35bd299cf1989": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "4f4d5437f9574de4a3b98769f2f94d13": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "5831fd1d9a4a4699a1ac8b3f35c67bc3": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "5d4a7365a34746fe9ed2319c19ad0d58": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "658f48614d3d4c63bbdc946d44b9f734": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "68972985dae04505b33533980fc0b970": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "71ef6ceee3034d95a7a718dda4457629": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "first_name",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_4baf8bf6cf784f5ca98feee93978b3af",
+ "placeholder": "",
+ "style": "IPY_MODEL_9fc84fac4de14d3abe25a8eeb06fadea",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Lucas"
+ }
+ },
+ "73f2384489da4b748209c462724a3961": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "first_name",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_773e45ebabda4dc0aafd521d3aa3e6a8",
+ "placeholder": "",
+ "style": "IPY_MODEL_c8a47b61ac8e41c3b0c85fbbf31a4c24",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Robert"
+ }
+ },
+ "773e45ebabda4dc0aafd521d3aa3e6a8": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "7c52d046223c4607b35759a148cc8515": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "80683606961c4187965cc00e4944fbe1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "8456ddacbc5542c2a9d668d042a6abe6": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "968249064f3446ee806ef5e750085fe4": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "VBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "VBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "VBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_e7d7652b464b413283dd5a057dd68aaf",
+ "IPY_MODEL_71ef6ceee3034d95a7a718dda4457629",
+ "IPY_MODEL_c1d56f96db3a492b88776c47754d6cf1",
+ "IPY_MODEL_1fa9c4a8c62846f591a1fb4e4d91d3ed",
+ "IPY_MODEL_329729744d0c4e839c0503ab0259b18c",
+ "IPY_MODEL_e9b46eb9fbcb4aa8a569749fa50fcff3"
+ ],
+ "layout": "IPY_MODEL_e3c8bdbb08184f979824e4993cec1f63",
+ "tabbable": null,
+ "tooltip": null
+ }
+ },
+ "9fc84fac4de14d3abe25a8eeb06fadea": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "a2c309f444944797824589147aede89d": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "aa3e943234ac46ca8d4530ab22443b0e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "ab457eac76f942508d6f75ff109bbd61": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "b50d412867114f2eb000fc969b677c9c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "email",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_440256de4ff2405e95431f2fd9490436",
+ "placeholder": "",
+ "style": "IPY_MODEL_aa3e943234ac46ca8d4530ab22443b0e",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "robert255@smith.net"
+ }
+ },
+ "b571fbb3f6a14611974779e927914d23": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "dob",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_3e98708a97934bb095b2be73187fcec0",
+ "placeholder": "",
+ "style": "IPY_MODEL_fdd25b8bb828465e83f7a3c452ccbfc1",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "1971-05-24"
+ }
+ },
+ "b7e0ec811eba4e59804eca1852402423": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "c1d56f96db3a492b88776c47754d6cf1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "surname",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_68972985dae04505b33533980fc0b970",
+ "placeholder": "",
+ "style": "IPY_MODEL_11bbe083b862406d814c35e47d773ade",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Smith"
+ }
+ },
+ "c2e6efd78b2148f3a026c6179fdbf683": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "c651dbf1a60b4f219d31355ec3d9d1cd": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "city",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_5831fd1d9a4a4699a1ac8b3f35c67bc3",
+ "placeholder": "",
+ "style": "IPY_MODEL_b7e0ec811eba4e59804eca1852402423",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Machester"
+ }
+ },
+ "c71b8b6aa86343a7b0093f15313677f7": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "c8a47b61ac8e41c3b0c85fbbf31a4c24": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "ca1c754b37a640a69908cb2b34cb7b3c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "cb0b167a052745fd9204729f961b7ad6": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "cc0ddc91bbfb4c6d891c202e279d4c74": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "city",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_e460aba5b4c34695a30ea7008b5b7cfd",
+ "placeholder": "",
+ "style": "IPY_MODEL_ab457eac76f942508d6f75ff109bbd61",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "London"
+ }
+ },
+ "d27918510cb3494c930c545563fed3ca": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "surname",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_23d4adc30abf4b8c845e3a22a7929ada",
+ "placeholder": "",
+ "style": "IPY_MODEL_c2e6efd78b2148f3a026c6179fdbf683",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "Alan"
+ }
+ },
+ "de296f36c45e4161a6c3d091dfea2892": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "e3c8bdbb08184f979824e4993cec1f63": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "e460aba5b4c34695a30ea7008b5b7cfd": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "e7d7652b464b413283dd5a057dd68aaf": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "unique_id",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_4f4d5437f9574de4a3b98769f2f94d13",
+ "placeholder": "",
+ "style": "IPY_MODEL_1b024ddb36284e4b85b5054092f531bc",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "1"
+ }
+ },
+ "e9b46eb9fbcb4aa8a569749fa50fcff3": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "TextView",
+ "continuous_update": true,
+ "description": "city",
+ "description_allow_html": false,
+ "disabled": false,
+ "layout": "IPY_MODEL_250e8abe119446478ca7513778aed39a",
+ "placeholder": "",
+ "style": "IPY_MODEL_4f0444bd007f4fb1a2e35bd299cf1989",
+ "tabbable": null,
+ "tooltip": null,
+ "value": "London"
+ }
+ },
+ "ecb562219597492fae7b4bf881ea113f": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "f3de223015854925895cbd794a7ee8c2": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "fdd25b8bb828465e83f7a3c452ccbfc1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "TextStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "TextStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "background": null,
+ "description_width": "",
+ "font_size": null,
+ "text_color": null
+ }
+ },
+ "ffdf7e9a14d04f6b9a448df89e8ffbec": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/docs/demos/examples/duckdb/transactions.ipynb b/docs/demos/examples/duckdb/transactions.ipynb
index 2acc462ef4..347c125a12 100644
--- a/docs/demos/examples/duckdb/transactions.ipynb
+++ b/docs/demos/examples/duckdb/transactions.ipynb
@@ -1,1412 +1,1412 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Linking banking transactions\n",
- "\n",
- "This example shows how to perform a one-to-one link on banking transactions.\n",
- "\n",
- "The data is fake data, and was generated has the following features:\n",
- "\n",
- "- Money shows up in the destination account with some time delay\n",
- "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n",
- "- The memo is sometimes truncated and content is sometimes missing\n",
- "\n",
- "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:27.648457Z",
- "iopub.status.busy": "2024-06-07T09:22:27.648128Z",
- "iopub.status.idle": "2024-06-07T09:22:27.653498Z",
- "shell.execute_reply": "2024-06-07T09:22:27.652626Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linking banking transactions\n",
+ "\n",
+ "This example shows how to perform a one-to-one link on banking transactions.\n",
+ "\n",
+ "The data is fake, and was generated to have the following features:\n",
+ "\n",
+ "- Money shows up in the destination account with some time delay\n",
+ "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n",
+ "- The memo is sometimes truncated and content is sometimes missing\n",
+ "\n",
+ "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:27.648457Z",
+ "iopub.status.busy": "2024-06-07T09:22:27.648128Z",
+ "iopub.status.idle": "2024-06-07T09:22:27.653498Z",
+ "shell.execute_reply": "2024-06-07T09:22:27.652626Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:27.657230Z",
+ "iopub.status.busy": "2024-06-07T09:22:27.656926Z",
+ "iopub.status.idle": "2024-06-07T09:22:31.983888Z",
+ "shell.execute_reply": "2024-06-07T09:22:31.983040Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:27.657230Z",
- "iopub.status.busy": "2024-06-07T09:22:27.656926Z",
- "iopub.status.idle": "2024-06-07T09:22:31.983888Z",
- "shell.execute_reply": "2024-06-07T09:22:31.983040Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ground_truth | \n",
- " memo | \n",
- " transaction_date | \n",
- " amount | \n",
- " unique_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " MATTHIAS C paym | \n",
- " 2022-03-28 | \n",
- " 36.36 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " M CORVINUS dona | \n",
- " 2022-02-14 | \n",
- " 221.91 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " ground_truth memo transaction_date amount unique_id\n",
- "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n",
- "1 1 M CORVINUS dona 2022-02-14 221.91 1"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ground_truth | \n",
- " memo | \n",
- " transaction_date | \n",
- " amount | \n",
- " unique_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " MATTHIAS C payment BGC | \n",
- " 2022-03-29 | \n",
- " 36.36 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " M CORVINUS BGC | \n",
- " 2022-02-16 | \n",
- " 221.91 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " ground_truth memo transaction_date amount unique_id\n",
- "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n",
- "1 1 M CORVINUS BGC 2022-02-16 221.91 1"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ground_truth | \n",
+ " memo | \n",
+ " transaction_date | \n",
+ " amount | \n",
+ " unique_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " MATTHIAS C paym | \n",
+ " 2022-03-28 | \n",
+ " 36.36 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " M CORVINUS dona | \n",
+ " 2022-02-14 | \n",
+ " 221.91 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n",
- "\n",
- "df_origin = splink_datasets.transactions_origin\n",
- "df_destination = splink_datasets.transactions_destination\n",
- "\n",
- "display(df_origin.head(2))\n",
- "display(df_destination.head(2))"
+ "text/plain": [
+ " ground_truth memo transaction_date amount unique_id\n",
+ "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n",
+ "1 1 M CORVINUS dona 2022-02-14 221.91 1"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the following chart, we can see this is a challenging dataset to link:\n",
- "\n",
- "- There are only 151 distinct transaction dates, with strong skew\n",
- "- Some 'memos' are used multiple times (up to 48 times)\n",
- "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ground_truth | \n",
+ " memo | \n",
+ " transaction_date | \n",
+ " amount | \n",
+ " unique_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " MATTHIAS C payment BGC | \n",
+ " 2022-03-29 | \n",
+ " 36.36 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " M CORVINUS BGC | \n",
+ " 2022-02-16 | \n",
+ " 221.91 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ground_truth memo transaction_date amount unique_id\n",
+ "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n",
+ "1 1 M CORVINUS BGC 2022-02-16 221.91 1"
]
- },
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n",
+ "\n",
+ "df_origin = splink_datasets.transactions_origin\n",
+ "df_destination = splink_datasets.transactions_destination\n",
+ "\n",
+ "display(df_origin.head(2))\n",
+ "display(df_destination.head(2))"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the following chart, we can see this is a challenging dataset to link:\n",
+ "\n",
+ "- There are only 151 distinct transaction dates, with strong skew\n",
+ "- Some 'memos' are used multiple times (up to 48 times)\n",
+ "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:31.987843Z",
+ "iopub.status.busy": "2024-06-07T09:22:31.987459Z",
+ "iopub.status.idle": "2024-06-07T09:22:32.720064Z",
+ "shell.execute_reply": "2024-06-07T09:22:32.719389Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:31.987843Z",
- "iopub.status.busy": "2024-06-07T09:22:31.987459Z",
- "iopub.status.idle": "2024-06-07T09:22:32.720064Z",
- "shell.execute_reply": "2024-06-07T09:22:32.719389Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink.exploratory import profile_columns\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "profile_columns(\n",
- " [df_origin, df_destination],\n",
- " db_api=db_api,\n",
- " column_expressions=[\n",
- " \"memo\",\n",
- " \"transaction_date\",\n",
- " \"amount\",\n",
- " ],\n",
- ")"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.exploratory import profile_columns\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_origin_sdf = db_api.register(df_origin)\n",
+ "df_destination_sdf = db_api.register(df_destination)\n",
+ "profile_columns(\n",
+ " [df_origin_sdf, df_destination_sdf],\n",
+ " column_expressions=[\n",
+ " \"memo\",\n",
+ " \"transaction_date\",\n",
+ " \"amount\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:32.724189Z",
+ "iopub.status.busy": "2024-06-07T09:22:32.723901Z",
+ "iopub.status.idle": "2024-06-07T09:22:33.500975Z",
+ "shell.execute_reply": "2024-06-07T09:22:33.500399Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:32.724189Z",
- "iopub.status.busy": "2024-06-07T09:22:32.723901Z",
- "iopub.status.idle": "2024-06-07T09:22:33.500975Z",
- "shell.execute_reply": "2024-06-07T09:22:33.500399Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import DuckDBAPI, block_on\n",
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "# Design blocking rules that allow for differences in transaction date and amounts\n",
- "blocking_rule_date_1 = \"\"\"\n",
- " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n",
- " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n",
- " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n",
- "\"\"\"\n",
- "\n",
- "# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb\n",
- "blocking_rule_date_2 = \"\"\"\n",
- " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n",
- " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n",
- " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n",
- "\"\"\"\n",
- "\n",
- "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n",
- "\n",
- "blocking_rule_amount_1 = \"\"\"\n",
- "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n",
- "\"\"\"\n",
- "\n",
- "blocking_rule_amount_2 = \"\"\"\n",
- "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n",
- "\"\"\"\n",
- "\n",
- "blocking_rule_cheat = block_on(\"unique_id\")\n",
- "\n",
- "\n",
- "brs = [\n",
- " blocking_rule_date_1,\n",
- " blocking_rule_date_2,\n",
- " blocking_rule_memo,\n",
- " blocking_rule_amount_1,\n",
- " blocking_rule_amount_2,\n",
- " blocking_rule_cheat,\n",
- "]\n",
- "\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=[df_origin, df_destination],\n",
- " blocking_rules=brs,\n",
- " db_api=db_api,\n",
- " link_type=\"link_only\"\n",
- ")"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import DuckDBAPI, block_on\n",
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "# Design blocking rules that allow for differences in transaction date and amounts\n",
+ "blocking_rule_date_1 = \"\"\"\n",
+ " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n",
+ " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n",
+ " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n",
+ "\"\"\"\n",
+ "\n",
+ "# Offset by half a month to ensure we capture the case where the dates are e.g. 31st Jan and 1st Feb\n",
+ "blocking_rule_date_2 = \"\"\"\n",
+ " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n",
+ " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n",
+ " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n",
+ "\"\"\"\n",
+ "\n",
+ "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n",
+ "\n",
+ "blocking_rule_amount_1 = \"\"\"\n",
+ "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n",
+ "\"\"\"\n",
+ "\n",
+ "blocking_rule_amount_2 = \"\"\"\n",
+ "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n",
+ "\"\"\"\n",
+ "\n",
+ "blocking_rule_cheat = block_on(\"unique_id\")\n",
+ "\n",
+ "\n",
+ "brs = [\n",
+ " blocking_rule_date_1,\n",
+ " blocking_rule_date_2,\n",
+ " blocking_rule_memo,\n",
+ " blocking_rule_amount_1,\n",
+ " blocking_rule_amount_2,\n",
+ " blocking_rule_cheat,\n",
+ "]\n",
+ "\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_origin_sdf = db_api.register(df_origin)\n",
+ "df_destination_sdf = db_api.register(df_destination)\n",
+ "\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " [df_origin_sdf, df_destination_sdf],\n",
+ " blocking_rules=brs,\n",
+ " link_type=\"link_only\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:33.504001Z",
+ "iopub.status.busy": "2024-06-07T09:22:33.503779Z",
+ "iopub.status.idle": "2024-06-07T09:22:33.511675Z",
+ "shell.execute_reply": "2024-06-07T09:22:33.511212Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Full settings for linking model\n",
+ "import splink.comparison_level_library as cll\n",
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "comparison_amount = {\n",
+ " \"output_column_name\": \"amount\",\n",
+ " \"comparison_levels\": [\n",
+ " cll.NullLevel(\"amount\"),\n",
+ " cll.ExactMatchLevel(\"amount\"),\n",
+ " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n",
+ " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n",
+ " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n",
+ " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n",
+ " cll.ElseLevel(),\n",
+ " ],\n",
+ " \"comparison_description\": \"Amount percentage difference\",\n",
+ "}\n",
+ "\n",
+ "# The date distance is one-sided because transactions should only arrive after they've left\n",
+ "# As a result, the comparison_template_library date difference functions are not appropriate\n",
+ "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n",
+ "\n",
+ "comparison_date = {\n",
+ " \"output_column_name\": \"transaction_date\",\n",
+ " \"comparison_levels\": [\n",
+ " cll.NullLevel(\"transaction_date\"),\n",
+ " {\n",
+ " \"sql_condition\": within_n_days_template.format(n=1),\n",
+ " \"label_for_charts\": \"1 day\",\n",
+ " },\n",
+ " {\n",
+ " \"sql_condition\": within_n_days_template.format(n=4),\n",
+ " \"label_for_charts\": \"<=4 days\",\n",
+ " },\n",
+ " {\n",
+ " \"sql_condition\": within_n_days_template.format(n=10),\n",
+ " \"label_for_charts\": \"<=10 days\",\n",
+ " },\n",
+ " {\n",
+ " \"sql_condition\": within_n_days_template.format(n=30),\n",
+ " \"label_for_charts\": \"<=30 days\",\n",
+ " },\n",
+ " cll.ElseLevel(),\n",
+ " ],\n",
+ " \"comparison_description\": \"Transaction date days apart\",\n",
+ "}\n",
+ "\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"link_only\",\n",
+ " probability_two_random_records_match=1 / len(df_origin),\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " blocking_rule_date_1,\n",
+ " blocking_rule_date_2,\n",
+ " blocking_rule_memo,\n",
+ " blocking_rule_amount_1,\n",
+ " blocking_rule_amount_2,\n",
+ " blocking_rule_cheat,\n",
+ " ],\n",
+ " comparisons=[\n",
+ " comparison_amount,\n",
+ " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n",
+ " comparison_date,\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:33.514381Z",
+ "iopub.status.busy": "2024-06-07T09:22:33.514150Z",
+ "iopub.status.idle": "2024-06-07T09:22:33.621746Z",
+ "shell.execute_reply": "2024-06-07T09:22:33.621038Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "db_api = DuckDBAPI()\n",
+ "df_origin_sdf = db_api.register(df_origin, source_dataset_name=\"__ori\")\n",
+ "df_destination_sdf = db_api.register(df_destination, source_dataset_name=\"_dest\")\n",
+ "linker = Linker([df_origin_sdf, df_destination_sdf], settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:33.625044Z",
+ "iopub.status.busy": "2024-06-07T09:22:33.624807Z",
+ "iopub.status.idle": "2024-06-07T09:22:35.145751Z",
+ "shell.execute_reply": "2024-06-07T09:22:35.145280Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:33.504001Z",
- "iopub.status.busy": "2024-06-07T09:22:33.503779Z",
- "iopub.status.idle": "2024-06-07T09:22:33.511675Z",
- "shell.execute_reply": "2024-06-07T09:22:33.511212Z"
- }
- },
- "outputs": [],
- "source": [
- "# Full settings for linking model\n",
- "import splink.comparison_level_library as cll\n",
- "import splink.comparison_library as cl\n",
- "\n",
- "comparison_amount = {\n",
- " \"output_column_name\": \"amount\",\n",
- " \"comparison_levels\": [\n",
- " cll.NullLevel(\"amount\"),\n",
- " cll.ExactMatchLevel(\"amount\"),\n",
- " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n",
- " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n",
- " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n",
- " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n",
- " cll.ElseLevel(),\n",
- " ],\n",
- " \"comparison_description\": \"Amount percentage difference\",\n",
- "}\n",
- "\n",
- "# The date distance is one sided becaause transactions should only arrive after they've left\n",
- "# As a result, the comparison_template_library date difference functions are not appropriate\n",
- "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n",
- "\n",
- "comparison_date = {\n",
- " \"output_column_name\": \"transaction_date\",\n",
- " \"comparison_levels\": [\n",
- " cll.NullLevel(\"transaction_date\"),\n",
- " {\n",
- " \"sql_condition\": within_n_days_template.format(n=1),\n",
- " \"label_for_charts\": \"1 day\",\n",
- " },\n",
- " {\n",
- " \"sql_condition\": within_n_days_template.format(n=4),\n",
- " \"label_for_charts\": \"<=4 days\",\n",
- " },\n",
- " {\n",
- " \"sql_condition\": within_n_days_template.format(n=10),\n",
- " \"label_for_charts\": \"<=10 days\",\n",
- " },\n",
- " {\n",
- " \"sql_condition\": within_n_days_template.format(n=30),\n",
- " \"label_for_charts\": \"<=30 days\",\n",
- " },\n",
- " cll.ElseLevel(),\n",
- " ],\n",
- " \"comparison_description\": \"Transaction date days apart\",\n",
- "}\n",
- "\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"link_only\",\n",
- " probability_two_random_records_match=1 / len(df_origin),\n",
- " blocking_rules_to_generate_predictions=[\n",
- " blocking_rule_date_1,\n",
- " blocking_rule_date_2,\n",
- " blocking_rule_memo,\n",
- " blocking_rule_amount_1,\n",
- " blocking_rule_amount_2,\n",
- " blocking_rule_cheat,\n",
- " ],\n",
- " comparisons=[\n",
- " comparison_amount,\n",
- " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n",
- " comparison_date,\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
+ "----- Estimating u probabilities using random sampling -----\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - amount (no m values are trained).\n",
+ " - memo (no m values are trained).\n",
+ " - transaction_date (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:35.148614Z",
+ "iopub.status.busy": "2024-06-07T09:22:35.148331Z",
+ "iopub.status.idle": "2024-06-07T09:22:36.323460Z",
+ "shell.execute_reply": "2024-06-07T09:22:36.322736Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:33.514381Z",
- "iopub.status.busy": "2024-06-07T09:22:33.514150Z",
- "iopub.status.idle": "2024-06-07T09:22:33.621746Z",
- "shell.execute_reply": "2024-06-07T09:22:33.621038Z"
- }
- },
- "outputs": [],
- "source": [
- "linker = Linker(\n",
- " [df_origin, df_destination],\n",
- " settings,\n",
- " input_table_aliases=[\"__ori\", \"_dest\"],\n",
- " db_api=db_api,\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"memo\" = r.\"memo\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - amount\n",
+ " - transaction_date\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - memo\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.588 in the m_probability of amount, level `Exact match on amount`\n",
+ "Iteration 2: Largest change in params was -0.176 in the m_probability of transaction_date, level `1 day`\n",
+ "Iteration 3: Largest change in params was 0.00996 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n",
+ "Iteration 4: Largest change in params was 0.0022 in the m_probability of transaction_date, level `<=30 days`\n",
+ "Iteration 5: Largest change in params was 0.000385 in the m_probability of transaction_date, level `<=30 days`\n",
+ "Iteration 6: Largest change in params was -0.000255 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was -0.000229 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 8: Largest change in params was -0.000208 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 9: Largest change in params was -0.00019 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 10: Largest change in params was -0.000173 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 11: Largest change in params was -0.000159 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 12: Largest change in params was -0.000146 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 13: Largest change in params was -0.000135 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 14: Largest change in params was -0.000124 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 15: Largest change in params was -0.000115 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 16: Largest change in params was -0.000107 in the m_probability of amount, level `All other comparisons`\n",
+ "Iteration 17: Largest change in params was -9.92e-05 in the m_probability of amount, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 17 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - memo (no m values are trained).\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:33.625044Z",
- "iopub.status.busy": "2024-06-07T09:22:33.624807Z",
- "iopub.status.idle": "2024-06-07T09:22:35.145751Z",
- "shell.execute_reply": "2024-06-07T09:22:35.145280Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
- "----- Estimating u probabilities using random sampling -----\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - amount (no m values are trained).\n",
- " - memo (no m values are trained).\n",
- " - transaction_date (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ "data": {
+ "text/plain": [
+ ""
]
- },
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:36.326561Z",
+ "iopub.status.busy": "2024-06-07T09:22:36.326344Z",
+ "iopub.status.idle": "2024-06-07T09:22:37.563023Z",
+ "shell.execute_reply": "2024-06-07T09:22:37.562461Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:35.148614Z",
- "iopub.status.busy": "2024-06-07T09:22:35.148331Z",
- "iopub.status.idle": "2024-06-07T09:22:36.323460Z",
- "shell.execute_reply": "2024-06-07T09:22:36.322736Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"memo\" = r.\"memo\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - amount\n",
- " - transaction_date\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - memo\n",
- "\n",
- "Iteration 1: Largest change in params was -0.588 in the m_probability of amount, level `Exact match on amount`\n",
- "Iteration 2: Largest change in params was -0.176 in the m_probability of transaction_date, level `1 day`\n",
- "Iteration 3: Largest change in params was 0.00996 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n",
- "Iteration 4: Largest change in params was 0.0022 in the m_probability of transaction_date, level `<=30 days`\n",
- "Iteration 5: Largest change in params was 0.000385 in the m_probability of transaction_date, level `<=30 days`\n",
- "Iteration 6: Largest change in params was -0.000255 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 7: Largest change in params was -0.000229 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 8: Largest change in params was -0.000208 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 9: Largest change in params was -0.00019 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 10: Largest change in params was -0.000173 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 11: Largest change in params was -0.000159 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 12: Largest change in params was -0.000146 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 13: Largest change in params was -0.000135 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 14: Largest change in params was -0.000124 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 15: Largest change in params was -0.000115 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 16: Largest change in params was -0.000107 in the m_probability of amount, level `All other comparisons`\n",
- "Iteration 17: Largest change in params was -9.92e-05 in the m_probability of amount, level `All other comparisons`\n",
- "\n",
- "EM converged after 17 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - memo (no m values are trained).\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"amount\" = r.\"amount\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - memo\n",
+ " - transaction_date\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - amount\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.373 in the m_probability of memo, level `Exact match on memo`\n",
+ "Iteration 2: Largest change in params was -0.108 in the m_probability of memo, level `Exact match on memo`\n",
+ "Iteration 3: Largest change in params was 0.0202 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n",
+ "Iteration 4: Largest change in params was -0.00538 in the m_probability of memo, level `Exact match on memo`\n",
+ "Iteration 5: Largest change in params was 0.00482 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 6: Largest change in params was 0.00508 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 7: Largest change in params was 0.00502 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 8: Largest change in params was 0.00466 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 9: Largest change in params was 0.00409 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 10: Largest change in params was 0.00343 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 11: Largest change in params was 0.00276 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 12: Largest change in params was 0.00216 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 13: Largest change in params was 0.00165 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 14: Largest change in params was 0.00124 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 15: Largest change in params was 0.000915 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 16: Largest change in params was 0.000671 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 17: Largest change in params was 0.000488 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 18: Largest change in params was 0.000353 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 19: Largest change in params was 0.000255 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 20: Largest change in params was 0.000183 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 21: Largest change in params was 0.000132 in the m_probability of memo, level `All other comparisons`\n",
+ "Iteration 22: Largest change in params was 9.45e-05 in the m_probability of memo, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 22 iterations\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
+ }
+ ],
+ "source": [
+ "session = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:37.565956Z",
+ "iopub.status.busy": "2024-06-07T09:22:37.565738Z",
+ "iopub.status.idle": "2024-06-07T09:22:37.832159Z",
+ "shell.execute_reply": "2024-06-07T09:22:37.831506Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:36.326561Z",
- "iopub.status.busy": "2024-06-07T09:22:36.326344Z",
- "iopub.status.idle": "2024-06-07T09:22:37.563023Z",
- "shell.execute_reply": "2024-06-07T09:22:37.562461Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"amount\" = r.\"amount\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - memo\n",
- " - transaction_date\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - amount\n",
- "\n",
- "Iteration 1: Largest change in params was -0.373 in the m_probability of memo, level `Exact match on memo`\n",
- "Iteration 2: Largest change in params was -0.108 in the m_probability of memo, level `Exact match on memo`\n",
- "Iteration 3: Largest change in params was 0.0202 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n",
- "Iteration 4: Largest change in params was -0.00538 in the m_probability of memo, level `Exact match on memo`\n",
- "Iteration 5: Largest change in params was 0.00482 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 6: Largest change in params was 0.00508 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 7: Largest change in params was 0.00502 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 8: Largest change in params was 0.00466 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 9: Largest change in params was 0.00409 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 10: Largest change in params was 0.00343 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 11: Largest change in params was 0.00276 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 12: Largest change in params was 0.00216 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 13: Largest change in params was 0.00165 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 14: Largest change in params was 0.00124 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 15: Largest change in params was 0.000915 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 16: Largest change in params was 0.000671 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 17: Largest change in params was 0.000488 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 18: Largest change in params was 0.000353 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 19: Largest change in params was 0.000255 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 20: Largest change in params was 0.000183 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 21: Largest change in params was 0.000132 in the m_probability of memo, level `All other comparisons`\n",
- "Iteration 22: Largest change in params was 9.45e-05 in the m_probability of memo, level `All other comparisons`\n",
- "\n",
- "EM converged after 22 iterations\n",
- "\n",
- "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "session = linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:37.835082Z",
+ "iopub.status.busy": "2024-06-07T09:22:37.834871Z",
+ "iopub.status.idle": "2024-06-07T09:22:58.616771Z",
+ "shell.execute_reply": "2024-06-07T09:22:58.615862Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:37.565956Z",
- "iopub.status.busy": "2024-06-07T09:22:37.565738Z",
- "iopub.status.idle": "2024-06-07T09:22:37.832159Z",
- "shell.execute_reply": "2024-06-07T09:22:37.831506Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "862ba86b3fa649ddb3c14eee78c00fed",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.match_weights_chart()"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
- },
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_predict = linker.inference.predict(threshold_match_probability=0.001)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:58.620828Z",
+ "iopub.status.busy": "2024-06-07T09:22:58.620523Z",
+ "iopub.status.idle": "2024-06-07T09:22:59.018555Z",
+ "shell.execute_reply": "2024-06-07T09:22:59.017917Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:37.835082Z",
- "iopub.status.busy": "2024-06-07T09:22:37.834871Z",
- "iopub.status.idle": "2024-06-07T09:22:58.616771Z",
- "shell.execute_reply": "2024-06-07T09:22:58.615862Z"
- }
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "862ba86b3fa649ddb3c14eee78c00fed",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
],
- "source": [
- "df_predict = linker.inference.predict(threshold_match_probability=0.001)"
+ "text/plain": [
+ ""
]
- },
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.comparison_viewer_dashboard(\n",
+ " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n",
+ ")\n",
+ "from IPython.display import IFrame\n",
+ "\n",
+ "IFrame(\n",
+ " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:22:59.022067Z",
+ "iopub.status.busy": "2024-06-07T09:22:59.021794Z",
+ "iopub.status.idle": "2024-06-07T09:23:04.254280Z",
+ "shell.execute_reply": "2024-06-07T09:23:04.253648Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:58.620828Z",
- "iopub.status.busy": "2024-06-07T09:22:58.620523Z",
- "iopub.status.idle": "2024-06-07T09:22:59.018555Z",
- "shell.execute_reply": "2024-06-07T09:22:59.017917Z"
- }
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "cdc0840392db4f8da99156e19a89599e",
+ "version_major": 2,
+ "version_minor": 0
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.comparison_viewer_dashboard(\n",
- " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n",
- ")\n",
- "from IPython.display import IFrame\n",
- "\n",
- "IFrame(\n",
- " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n",
- ")"
+ "text/plain": [
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:22:59.022067Z",
- "iopub.status.busy": "2024-06-07T09:22:59.021794Z",
- "iopub.status.idle": "2024-06-07T09:23:04.254280Z",
- "shell.execute_reply": "2024-06-07T09:23:04.253648Z"
- }
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "cdc0840392db4f8da99156e19a89599e",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "pred_errors = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n",
- ")\n",
- "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pred_errors = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n",
+ ")\n",
+ "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-07T09:23:04.257242Z",
+ "iopub.status.busy": "2024-06-07T09:23:04.257017Z",
+ "iopub.status.idle": "2024-06-07T09:23:05.029715Z",
+ "shell.execute_reply": "2024-06-07T09:23:05.029153Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-07T09:23:04.257242Z",
- "iopub.status.busy": "2024-06-07T09:23:04.257017Z",
- "iopub.status.idle": "2024-06-07T09:23:05.029715Z",
- "shell.execute_reply": "2024-06-07T09:23:05.029153Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "pred_errors = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n",
- ")\n",
- "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {
- "0cb4a943a08a42c7841ca32d466f9eed": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "2bae68755fc34e38ac69e792f314ba8e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "4430006dcc174ff092d96adf68c301ff": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "5c32bb2a7a714bd79accac15915b17e5": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "6222247c7cbe45b19cfeb9b182147a18": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "63719efff46e49ecba53edb438f35c3f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "2.0.0",
- "_view_name": "ProgressView",
- "bar_style": "",
- "description": "",
- "description_allow_html": false,
- "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57",
- "max": 100,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e",
- "tabbable": null,
- "tooltip": null,
- "value": 100
- }
- },
- "921bb606e07743f7a252c05830098a57": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- },
- "ed234594aea94bf98ffb67a51d3811f4": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "2.0.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "2.0.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "StyleView",
- "bar_color": "black",
- "description_width": ""
- }
- },
- "fd157120a2ca488496c737cec882713d": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "2.0.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "2.0.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "2.0.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border_bottom": null,
- "border_left": null,
- "border_right": null,
- "border_top": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": "auto"
- }
- }
- },
- "version_major": 2,
- "version_minor": 0
- }
- }
+ ],
+ "source": [
+ "pred_errors = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n",
+ ")\n",
+ "linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {
+ "0cb4a943a08a42c7841ca32d466f9eed": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "2bae68755fc34e38ac69e792f314ba8e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "4430006dcc174ff092d96adf68c301ff": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "5c32bb2a7a714bd79accac15915b17e5": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "6222247c7cbe45b19cfeb9b182147a18": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "63719efff46e49ecba53edb438f35c3f": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "2.0.0",
+ "_view_name": "ProgressView",
+ "bar_style": "",
+ "description": "",
+ "description_allow_html": false,
+ "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57",
+ "max": 100,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e",
+ "tabbable": null,
+ "tooltip": null,
+ "value": 100
+ }
+ },
+ "921bb606e07743f7a252c05830098a57": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ },
+ "ed234594aea94bf98ffb67a51d3811f4": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "2.0.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "StyleView",
+ "bar_color": "black",
+ "description_width": ""
+ }
+ },
+ "fd157120a2ca488496c737cec882713d": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "2.0.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "2.0.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border_bottom": null,
+ "border_left": null,
+ "border_right": null,
+ "border_top": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": "auto"
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/duckdb_no_test/bias_eval.ipynb b/docs/demos/examples/duckdb_no_test/bias_eval.ipynb
index ead3525be9..d2afdaada1 100644
--- a/docs/demos/examples/duckdb_no_test/bias_eval.ipynb
+++ b/docs/demos/examples/duckdb_no_test/bias_eval.ipynb
@@ -290,11 +290,13 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "linker = Linker(production_df, settings='../../demo_settings/model_h50k.json', db_api=db_api)"
+ "db_api = DuckDBAPI()\n",
+ "production_df_sdf = db_api.register(production_df)\n",
+ "linker = Linker(production_df_sdf, settings='../../demo_settings/model_h50k.json')"
]
},
{
diff --git a/docs/demos/examples/duckdb_no_test/business_rates_match.ipynb b/docs/demos/examples/duckdb_no_test/business_rates_match.ipynb
index 1cdbd4b805..198d064b92 100644
--- a/docs/demos/examples/duckdb_no_test/business_rates_match.ipynb
+++ b/docs/demos/examples/duckdb_no_test/business_rates_match.ipynb
@@ -435,7 +435,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -477,8 +477,7 @@
" company_name,\n",
" company_number,\n",
" COALESCE(\n",
- " REGEXP_EXTRACT(address_concat, '(\\\\d+[A-Z]?)'),\n",
- " REGEXP_EXTRACT(address_concat, '(\\\\S+)(?=\\\\s+HOUSE)')\n",
+ " REGEXP_EXTRACT(address_concat, '(\\\\d+[A-Z]?)')\n",
" ) AS first_num_in_address,\n",
" postcode,\n",
" name_tokens_with_freq,\n",
@@ -540,7 +539,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -612,7 +611,9 @@
" retain_matching_columns=True,\n",
")\n",
"\n",
- "linker = Linker([df_stockport, df_all_companies], settings, db_api)"
+ "df_stockport_sdf = db_api.register(df_stockport)\n",
+ "df_all_companies_sdf = db_api.register(df_all_companies)\n",
+ "linker = Linker([df_stockport_sdf, df_all_companies_sdf], settings)"
]
},
{
@@ -960,7 +961,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "splink (3.11.11)",
"language": "python",
"name": "python3"
},
@@ -974,7 +975,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.8"
+ "version": "3.11.11"
}
},
"nbformat": 4,
diff --git a/docs/demos/examples/duckdb_no_test/cookbook.ipynb b/docs/demos/examples/duckdb_no_test/cookbook.ipynb
index 0f6d0fed0f..1c78f10f5e 100644
--- a/docs/demos/examples/duckdb_no_test/cookbook.ipynb
+++ b/docs/demos/examples/duckdb_no_test/cookbook.ipynb
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -282,7 +282,9 @@
")\n",
"\n",
"\n",
- "linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.inference.predict().as_pandas_dataframe()"
]
@@ -298,7 +300,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -393,9 +395,11 @@
")\n",
"\n",
"\n",
- "linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
- "linker.inference.predict().as_pandas_dataframe()\n"
+ "linker.inference.predict().as_pandas_dataframe()"
]
},
{
@@ -416,7 +420,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -477,6 +481,7 @@
"duckdb_df = duckdb.read_parquet(temp_file_path)\n",
"\n",
"db_api = DuckDBAPI(\":default:\")\n",
+ "df_sdf = db_api.register(df)\n",
"settings = SettingsCreator(\n",
" link_type=\"dedupe_only\",\n",
" comparisons=[\n",
@@ -489,7 +494,7 @@
" ],\n",
")\n",
"\n",
- "linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"result = linker.inference.predict().as_duckdbpyrelation()\n",
"\n",
@@ -498,7 +503,7 @@
"\n",
"# For example, we can use the `sort` function to sort the results,\n",
"# or could use result.to_parquet() to write to a parquet file.\n",
- "result.sort(\"match_weight\")\n"
+ "result.sort(\"match_weight\")"
]
},
{
@@ -510,7 +515,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -628,7 +633,8 @@
")\n",
"\n",
"df = splink_datasets.fake_1000\n",
- "linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)\n",
"linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
@@ -647,7 +653,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -754,7 +760,8 @@
" ],\n",
")\n",
"df = splink_datasets.fake_1000\n",
- "linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)\n",
"linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
@@ -781,7 +788,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -808,7 +815,8 @@
" max_iterations=2,\n",
")\n",
"\n",
- "linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.training.estimate_probability_two_random_records_match(\n",
" [block_on(\"first_name\", \"surname\")], recall=0.7\n",
@@ -859,7 +867,8 @@
" ]\n",
")\n",
"\n",
- "linker = Linker(df, settings, db_api)\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)\n",
"\n",
"\n",
"linker.misc.save_model_to_json(\"mod.json\", overwrite=True)\n",
@@ -869,8 +878,10 @@
"new_settings.retain_intermediate_calculation_columns = True\n",
"new_settings.blocking_rules_to_generate_predictions = [\"1=1\"]\n",
"new_settings.additional_columns_to_retain = [\"cluster\"]\n",
+ "db_api_new = DuckDBAPI()\n",
+ "df_sdf_new = db_api_new.register(df)\n",
+ "linker = Linker(df_sdf_new, new_settings)\n",
"\n",
- "linker = Linker(df, new_settings, DuckDBAPI())\n",
"\n",
"linker.inference.predict().as_duckdbpyrelation().show()"
]
@@ -891,6 +902,7 @@
"import difflib\n",
"\n",
"import duckdb\n",
+ "from duckdb.sqltypes import VARCHAR, DOUBLE\n",
"\n",
"import splink.comparison_level_library as cll\n",
"import splink.comparison_library as cl\n",
@@ -910,8 +922,8 @@
"con.create_function(\n",
" \"custom_partial_ratio\",\n",
" custom_partial_ratio,\n",
- " [duckdb.typing.VARCHAR, duckdb.typing.VARCHAR],\n",
- " duckdb.typing.DOUBLE,\n",
+ " [VARCHAR, VARCHAR],\n",
+ " DOUBLE,\n",
")\n",
"db_api = DuckDBAPI(connection=con)\n",
"\n",
@@ -945,7 +957,8 @@
" max_iterations=2,\n",
")\n",
"\n",
- "linker = Linker(df, settings, db_api)\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)\n",
"\n",
"linker.training.estimate_probability_two_random_records_match(\n",
" [block_on(\"first_name\", \"surname\")], recall=0.7\n",
@@ -1092,7 +1105,8 @@
")\n",
"\n",
"db_api = DuckDBAPI(connection=con)\n",
- "company_linker = Linker(\"company_person_records\", company_settings, db_api)\n",
+ "company_records_sdf = db_api.register(\"company_person_records\")\n",
+ "company_linker = Linker(company_records_sdf, company_settings)\n",
"company_predictions = company_linker.inference.predict(threshold_match_probability=0.5)\n",
"\n",
"print(\"\\nCompany pairwise matches:\")\n",
@@ -1176,8 +1190,8 @@
" retain_matching_columns=True,\n",
")\n",
"\n",
- "# Link persons within company clusters\n",
- "person_linker = Linker(\"records_with_company_cluster\", person_settings, db_api2)\n",
+ "person_records_sdf = db_api2.register(\"records_with_company_cluster\")\n",
+ "person_linker = Linker(person_records_sdf, person_settings)\n",
"person_predictions = person_linker.inference.predict(threshold_match_probability=0.5)\n",
"\n",
"print(\"\\nPerson pairwise matches (within company clusters):\")\n",
@@ -1187,7 +1201,8 @@
" person_predictions, threshold_match_probability=0.5\n",
")\n",
"\n",
- "person_clusters.as_duckdbpyrelation().sort(\"cluster_id\").show(max_width=1000)\n"
+ "person_clusters.as_duckdbpyrelation().sort(\"cluster_id\").show(max_width=1000)\n",
+ "\n"
]
},
{
@@ -1296,16 +1311,19 @@
" retain_intermediate_calculation_columns=True,\n",
" retain_matching_columns=True,\n",
")\n",
+ "db_api_linker = DuckDBAPI(con)\n",
+ "df_left_sdf = db_api_linker.register(\"df_left\")\n",
+ "df_right_sdf = db_api_linker.register(\"df_right\")\n",
"linker = Linker(\n",
- " [\"df_left\", \"df_right\"],\n",
+ " [df_left_sdf, df_right_sdf],\n",
" settings,\n",
- " db_api=DuckDBAPI(con),\n",
")\n",
"\n",
"# Skip training for demo purposes, just demonstrate that predict() works\n",
"\n",
"df_predict = linker.inference.predict()\n",
- "df_predict.as_duckdbpyrelation()\n"
+ "\n",
+ "df_predict.as_duckdbpyrelation()"
]
}
],
diff --git a/docs/demos/examples/duckdb_no_test/pseudopeople-acs.ipynb b/docs/demos/examples/duckdb_no_test/pseudopeople-acs.ipynb
index 5fce2b909b..74dcb50145 100755
--- a/docs/demos/examples/duckdb_no_test/pseudopeople-acs.ipynb
+++ b/docs/demos/examples/duckdb_no_test/pseudopeople-acs.ipynb
@@ -744,7 +744,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": null,
"id": "319ffdbc-7853-40a9-b331-e635d96b6fdc",
"metadata": {
"execution": {
@@ -840,9 +840,10 @@
"from splink import DuckDBAPI\n",
"from splink.exploratory import completeness_chart\n",
"\n",
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
"completeness_chart(\n",
- " dfs,\n",
- " db_api=DuckDBAPI(),\n",
+ " dfs_sdf,\n",
" table_names_for_chart=[\"census\", \"acs\"],\n",
" cols=[\n",
" \"age_in_2020\",\n",
@@ -865,7 +866,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": null,
"id": "dff8dfca-57c8-42bf-878c-da9dd23d2682",
"metadata": {
"execution": {
@@ -960,7 +961,9 @@
"source": [
"from splink.exploratory import profile_columns\n",
"\n",
- "profile_columns(dfs, db_api=DuckDBAPI())"
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
+ "profile_columns(dfs_sdf)"
]
},
{
@@ -1032,7 +1035,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": null,
"id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de",
"metadata": {
"execution": {
@@ -1144,10 +1147,10 @@
"\n",
"\n",
"db_api = DuckDBAPI()\n",
+ "dfs_sdf = [db_api.register(df) for df in dfs]\n",
"cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=dfs,\n",
+ " dfs_sdf,\n",
" blocking_rules=blocking_rules,\n",
- " db_api=db_api,\n",
" link_type=\"link_only\",\n",
" unique_id_column_name=\"id\",\n",
" source_dataset_column_name=\"source_dataset\",\n",
@@ -1174,7 +1177,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": null,
"id": "f6360b69-2d52-4f1a-9199-2edf2339ec63",
"metadata": {
"execution": {
@@ -1213,9 +1216,10 @@
" probability_two_random_records_match=len(dfs[1]) / (len(dfs[0]) * len(dfs[1])),\n",
")\n",
"\n",
- "linker = Linker(\n",
- " dfs, settings, db_api=DuckDBAPI(), input_table_aliases=[\"census\", \"acs\"]\n",
- ")"
+ "db_api = DuckDBAPI()\n",
+ "dfs_sdf_0 = db_api.register(dfs[0], source_dataset_name=\"census\")\n",
+ "dfs_sdf_1 = db_api.register(dfs[1], source_dataset_name=\"acs\")\n",
+ "linker = Linker([dfs_sdf_0, dfs_sdf_1], settings)"
]
},
{
@@ -3238,7 +3242,7 @@
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": null,
"id": "e0f74783",
"metadata": {},
"outputs": [
@@ -3326,11 +3330,11 @@
"source": [
"linker.visualisations.waterfall_chart(\n",
" # choose comparisons that have a term frequency adjustment for address\n",
- " df_predictions[df_predictions.bf_tf_adj_address != 1]\n",
+ " df_predictions[df_predictions.mw_tf_adj_address != 0]\n",
" .head(10) # only display some of the first such comparisons\n",
" .sort_values(\n",
- " \"bf_tf_adj_address\"\n",
- " ) # sort by lowest adjustment (common addresses) first\n",
+ " \"mw_tf_adj_address\"\n",
+ " ) # sort by lowest match weight (common addresses) first\n",
" .to_dict(orient=\"records\")\n",
")"
]
diff --git a/docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb b/docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb
index 0bf9f88d2b..f496ef5c3b 100644
--- a/docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb
+++ b/docs/demos/examples/spark/deduplicate_1k_synthetic.ipynb
@@ -1,420 +1,422 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Linking in Spark\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:29:57.518197Z",
- "iopub.status.busy": "2024-03-13T12:29:57.517750Z",
- "iopub.status.idle": "2024-03-13T12:29:57.523242Z",
- "shell.execute_reply": "2024-03-13T12:29:57.522525Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink\n",
- "# !pip install pyspark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:29:57.527366Z",
- "iopub.status.busy": "2024-03-13T12:29:57.527045Z",
- "iopub.status.idle": "2024-03-13T12:30:42.348824Z",
- "shell.execute_reply": "2024-03-13T12:30:42.347900Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "24/07/13 19:50:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
- "Setting default log level to \"WARN\".\n",
- "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
- ]
- }
- ],
- "source": [
- "from pyspark import SparkConf, SparkContext\n",
- "from pyspark.sql import SparkSession\n",
- "\n",
- "from splink.backends.spark import similarity_jar_location\n",
- "\n",
- "conf = SparkConf()\n",
- "# This parallelism setting is only suitable for a small toy example\n",
- "conf.set(\"spark.driver.memory\", \"12g\")\n",
- "conf.set(\"spark.default.parallelism\", \"8\")\n",
- "conf.set(\"spark.sql.codegen.wholeStage\", \"false\")\n",
- "\n",
- "\n",
- "# Add custom similarity functions, which are bundled with Splink\n",
- "# documented here: https://github.com/moj-analytical-services/splink_scalaudfs\n",
- "path = similarity_jar_location()\n",
- "conf.set(\"spark.jars\", path)\n",
- "\n",
- "sc = SparkContext.getOrCreate(conf=conf)\n",
- "\n",
- "spark = SparkSession(sc)\n",
- "spark.sparkContext.setCheckpointDir(\"./tmp_checkpoints\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:30:42.353970Z",
- "iopub.status.busy": "2024-03-13T12:30:42.353260Z",
- "iopub.status.idle": "2024-03-13T12:30:42.358982Z",
- "shell.execute_reply": "2024-03-13T12:30:42.358209Z"
- },
- "tags": [
- "hide_input",
- "hide_output"
- ]
- },
- "outputs": [],
- "source": [
- "# Disable warnings for pyspark - you don't need to include this\n",
- "import warnings\n",
- "\n",
- "spark.sparkContext.setLogLevel(\"ERROR\")\n",
- "warnings.simplefilter(\"ignore\", UserWarning)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:30:42.363648Z",
- "iopub.status.busy": "2024-03-13T12:30:42.363227Z",
- "iopub.status.idle": "2024-03-13T12:30:45.734688Z",
- "shell.execute_reply": "2024-03-13T12:30:45.733419Z"
- }
- },
- "outputs": [],
- "source": [
- "from splink import splink_datasets\n",
- "\n",
- "pandas_df = splink_datasets.fake_1000\n",
- "\n",
- "df = spark.createDataFrame(pandas_df)"
- ]
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linking in Spark\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:29:57.518197Z",
+ "iopub.status.busy": "2024-03-13T12:29:57.517750Z",
+ "iopub.status.idle": "2024-03-13T12:29:57.523242Z",
+ "shell.execute_reply": "2024-03-13T12:29:57.522525Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink\n",
+ "# !pip install pyspark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:29:57.527366Z",
+ "iopub.status.busy": "2024-03-13T12:29:57.527045Z",
+ "iopub.status.idle": "2024-03-13T12:30:42.348824Z",
+ "shell.execute_reply": "2024-03-13T12:30:42.347900Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:30:45.740685Z",
- "iopub.status.busy": "2024-03-13T12:30:45.740314Z",
- "iopub.status.idle": "2024-03-13T12:30:45.773778Z",
- "shell.execute_reply": "2024-03-13T12:30:45.772855Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_library as cl\n",
- "from splink import Linker, SettingsCreator, SparkAPI, block_on\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " comparisons=[\n",
- " cl.NameComparison(\"first_name\"),\n",
- " cl.NameComparison(\"surname\"),\n",
- " cl.LevenshteinAtThresholds(\n",
- " \"dob\"\n",
- " ),\n",
- " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
- " cl.EmailComparison(\"email\"),\n",
- " ],\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"first_name\"),\n",
- " \"l.surname = r.surname\", # alternatively, you can write BRs in their SQL form\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- " em_convergence=0.01,\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "24/07/13 19:50:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+ "Setting default log level to \"WARN\".\n",
+ "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
+ ]
+ }
+ ],
+ "source": [
+ "from pyspark import SparkConf, SparkContext\n",
+ "from pyspark.sql import SparkSession\n",
+ "\n",
+ "from splink.backends.spark import similarity_jar_location\n",
+ "\n",
+ "conf = SparkConf()\n",
+ "# The parallelism setting below is only suitable for a small toy example\n",
+ "conf.set(\"spark.driver.memory\", \"12g\")\n",
+ "conf.set(\"spark.default.parallelism\", \"8\")\n",
+ "conf.set(\"spark.sql.codegen.wholeStage\", \"false\")\n",
+ "\n",
+ "\n",
+ "# Add custom similarity functions, which are bundled with Splink\n",
+ "# documented here: https://github.com/moj-analytical-services/splink_scalaudfs\n",
+ "path = similarity_jar_location()\n",
+ "conf.set(\"spark.jars\", path)\n",
+ "\n",
+ "sc = SparkContext.getOrCreate(conf=conf)\n",
+ "\n",
+ "spark = SparkSession(sc)\n",
+ "spark.sparkContext.setCheckpointDir(\"./tmp_checkpoints\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:30:42.353970Z",
+ "iopub.status.busy": "2024-03-13T12:30:42.353260Z",
+ "iopub.status.idle": "2024-03-13T12:30:42.358982Z",
+ "shell.execute_reply": "2024-03-13T12:30:42.358209Z"
},
+ "tags": [
+ "hide_input",
+ "hide_output"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Disable warnings for pyspark - you don't need to include this\n",
+ "import warnings\n",
+ "\n",
+ "spark.sparkContext.setLogLevel(\"ERROR\")\n",
+ "warnings.simplefilter(\"ignore\", UserWarning)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:30:42.363648Z",
+ "iopub.status.busy": "2024-03-13T12:30:42.363227Z",
+ "iopub.status.idle": "2024-03-13T12:30:45.734688Z",
+ "shell.execute_reply": "2024-03-13T12:30:45.733419Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from splink import splink_datasets\n",
+ "\n",
+ "pandas_df = splink_datasets.fake_1000\n",
+ "\n",
+ "df = spark.createDataFrame(pandas_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:30:45.740685Z",
+ "iopub.status.busy": "2024-03-13T12:30:45.740314Z",
+ "iopub.status.idle": "2024-03-13T12:30:45.773778Z",
+ "shell.execute_reply": "2024-03-13T12:30:45.772855Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "from splink import Linker, SettingsCreator, SparkAPI, block_on\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"dedupe_only\",\n",
+ " comparisons=[\n",
+ " cl.NameComparison(\"first_name\"),\n",
+ " cl.NameComparison(\"surname\"),\n",
+ " cl.LevenshteinAtThresholds(\n",
+ " \"dob\"\n",
+ " ),\n",
+ " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
+ " cl.EmailComparison(\"email\"),\n",
+ " ],\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"first_name\"),\n",
+ " \"l.surname = r.surname\", # alternatively, you can write blocking rules in their SQL form\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ " em_convergence=0.01,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:30:45.779194Z",
+ "iopub.status.busy": "2024-03-13T12:30:45.778688Z",
+ "iopub.status.idle": "2024-03-13T12:30:57.746806Z",
+ "shell.execute_reply": "2024-03-13T12:30:57.744480Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:30:45.779194Z",
- "iopub.status.busy": "2024-03-13T12:30:45.778688Z",
- "iopub.status.idle": "2024-03-13T12:30:57.746806Z",
- "shell.execute_reply": "2024-03-13T12:30:57.744480Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.0806. \n",
- "This means that amongst all possible pairwise record comparisons, one in 12.41 are expected to match. With 499,500 total possible comparisons, we expect a total of around 40,246.67 matching pairs\n"
- ]
- }
- ],
- "source": [
- "linker = Linker(df, settings, db_api=SparkAPI(spark_session=spark))\n",
- "deterministic_rules = [\n",
- " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
- " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
- " \"l.email = r.email\",\n",
- "]\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.0806. \n",
+ "This means that amongst all possible pairwise record comparisons, one in 12.41 are expected to match. With 499,500 total possible comparisons, we expect a total of around 40,246.67 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_api = SparkAPI(spark_session=spark)\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)\n",
+ "deterministic_rules = [\n",
+ " \"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1\",\n",
+ " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
+ " \"l.email = r.email\",\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:30:57.757986Z",
+ "iopub.status.busy": "2024-03-13T12:30:57.757315Z",
+ "iopub.status.idle": "2024-03-13T12:31:17.080600Z",
+ "shell.execute_reply": "2024-03-13T12:31:17.079503Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:30:57.757986Z",
- "iopub.status.busy": "2024-03-13T12:30:57.757315Z",
- "iopub.status.idle": "2024-03-13T12:31:17.080600Z",
- "shell.execute_reply": "2024-03-13T12:31:17.079503Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "----- Estimating u probabilities using random sampling -----\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " \n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - dob (no m values are trained).\n",
- " - city (no m values are trained).\n",
- " - email (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=5e5)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:31:17.085610Z",
- "iopub.status.busy": "2024-03-13T12:31:17.085246Z",
- "iopub.status.idle": "2024-03-13T12:31:36.217869Z",
- "shell.execute_reply": "2024-03-13T12:31:36.217063Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " \n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.first_name = r.first_name and l.surname = r.surname\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - dob\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - first_name\n",
- " - surname\n",
- " \n",
- "Iteration 1: Largest change in params was -0.709 in probability_two_random_records_match\n",
- "Iteration 2: Largest change in params was 0.0573 in the m_probability of email, level `All other comparisons`\n",
- "Iteration 3: Largest change in params was 0.0215 in the m_probability of email, level `All other comparisons`\n",
- "Iteration 4: Largest change in params was 0.00888 in the m_probability of email, level `All other comparisons`\n",
- "\n",
- "EM converged after 4 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.dob = r.dob\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name\n",
- " - surname\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - dob\n",
- " \n",
- "WARNING: \n",
- "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was -0.548 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 2: Largest change in params was 0.129 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was 0.0313 in probability_two_random_records_match\n",
- "Iteration 4: Largest change in params was 0.0128 in probability_two_random_records_match\n",
- "Iteration 5: Largest change in params was 0.00651 in probability_two_random_records_match\n",
- "\n",
- "EM converged after 5 iterations\n",
- "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
- ]
- }
- ],
- "source": [
- "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n",
- "training_session_fname_sname = (\n",
- " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
- ")\n",
- "\n",
- "training_blocking_rule = \"l.dob = r.dob\"\n",
- "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " training_blocking_rule\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - dob (no m values are trained).\n",
+ " - city (no m values are trained).\n",
+ " - email (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=5e5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:31:17.085610Z",
+ "iopub.status.busy": "2024-03-13T12:31:17.085246Z",
+ "iopub.status.idle": "2024-03-13T12:31:36.217869Z",
+ "shell.execute_reply": "2024-03-13T12:31:36.217063Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:31:36.223120Z",
- "iopub.status.busy": "2024-03-13T12:31:36.222561Z",
- "iopub.status.idle": "2024-03-13T12:31:44.599133Z",
- "shell.execute_reply": "2024-03-13T12:31:44.597894Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Blocking time: 4.65 seconds \n",
- "Predict time: 82.92 seconds \n"
- ]
- }
- ],
- "source": [
- "results = linker.inference.predict(threshold_match_probability=0.9)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.first_name = r.first_name and l.surname = r.surname\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - dob\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - first_name\n",
+ " - surname\n",
+ " \n",
+ "Iteration 1: Largest change in params was -0.709 in probability_two_random_records_match\n",
+ "Iteration 2: Largest change in params was 0.0573 in the m_probability of email, level `All other comparisons`\n",
+ "Iteration 3: Largest change in params was 0.0215 in the m_probability of email, level `All other comparisons`\n",
+ "Iteration 4: Largest change in params was 0.00888 in the m_probability of email, level `All other comparisons`\n",
+ "\n",
+ "EM converged after 4 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.dob = r.dob\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name\n",
+ " - surname\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - dob\n",
+ " \n",
+ "WARNING: \n",
+ "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.548 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 2: Largest change in params was 0.129 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was 0.0313 in probability_two_random_records_match\n",
+ "Iteration 4: Largest change in params was 0.0128 in probability_two_random_records_match\n",
+ "Iteration 5: Largest change in params was 0.00651 in probability_two_random_records_match\n",
+ "\n",
+ "EM converged after 5 iterations\n",
+ "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n",
+ "training_session_fname_sname = (\n",
+ " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+ ")\n",
+ "\n",
+ "training_blocking_rule = \"l.dob = r.dob\"\n",
+ "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " training_blocking_rule\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:31:36.223120Z",
+ "iopub.status.busy": "2024-03-13T12:31:36.222561Z",
+ "iopub.status.idle": "2024-03-13T12:31:44.599133Z",
+ "shell.execute_reply": "2024-03-13T12:31:44.597894Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-03-13T12:31:44.605970Z",
- "iopub.status.busy": "2024-03-13T12:31:44.605505Z",
- "iopub.status.idle": "2024-03-13T12:31:44.750590Z",
- "shell.execute_reply": "2024-03-13T12:31:44.749429Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+------------------+------------------+-----------+-----------+------------+------------+----------------+---------------+---------------+------------------+--------------------+---------+---------+-------------+------------+------------+-------------------+------------------+----------+----------+---------+------------------+----------+----------+----------+---------+---------+------------------+------------------+--------------------+--------------------+-----------+----------+----------+-------------------+-------------------+---------+\n",
- "| match_weight| match_probability|unique_id_l|unique_id_r|first_name_l|first_name_r|gamma_first_name|tf_first_name_l|tf_first_name_r| bf_first_name|bf_tf_adj_first_name|surname_l|surname_r|gamma_surname|tf_surname_l|tf_surname_r| bf_surname| bf_tf_adj_surname| dob_l| dob_r|gamma_dob| bf_dob| city_l| city_r|gamma_city|tf_city_l|tf_city_r| bf_city| bf_tf_adj_city| email_l| email_r|gamma_email|tf_email_l|tf_email_r| bf_email| bf_tf_adj_email|match_key|\n",
- "+------------------+------------------+-----------+-----------+------------+------------+----------------+---------------+---------------+------------------+--------------------+---------+---------+-------------+------------+------------+-------------------+------------------+----------+----------+---------+------------------+----------+----------+----------+---------+---------+------------------+------------------+--------------------+--------------------+-----------+----------+----------+-------------------+-------------------+---------+\n",
- "|15.131885475840011|0.9999721492762709| 51| 56| Jayden| Jayden| 4| 0.008| 0.008|11.371009132404957| 4.0525525525525525| Bennett| Bennett| 4| 0.006| 0.006| 9.113630950205666| 5.981981981981981|2017-01-11|2017-02-10| 1|14.373012181955707| Swansea| Swansea| 1| 0.013| 0.013|5.8704874944935215| 5.481481481481482| NaN| jb88@king.com| 0| 0.211| 0.004|0.35260600559686806| 1.0| 0|\n",
- "| 7.86514930254232|0.9957293356289956| 575| 577| Jessica| Jessica| 4| 0.011| 0.011|11.371009132404957| 2.9473109473109473| Owen| NaN| 0| 0.006| 0.181|0.45554364195240765| 1.0|1974-11-17|1974-11-17| 3|220.92747883214062| NaN| NaN| 1| 0.187| 0.187|5.8704874944935215|0.3810655575361458| NaN|jessica.owen@elli...| 0| 0.211| 0.002|0.35260600559686806| 1.0| 0|\n",
- "| 5.951711022429932|0.9841000517299358| 171| 174| NaN| Leah| 0| 0.169| 0.002|0.4452000905514796| 1.0| Russell| Russell| 4| 0.01| 0.01| 9.113630950205666| 3.589189189189189|2011-06-08|2012-07-09| 0|0.2607755750325071| London| London| 1| 0.173| 0.173|5.8704874944935215|0.4119032327124813|leahrussell@charl...|leahrussell@charl...| 4| 0.005| 0.005| 8.411105418567649| 9.143943943943944| 1|\n",
- "|21.650093935297473|0.9999996961409438| 518| 519| Amelia| Amlelia| 2| 0.009| 0.001| 47.10808446952784| 1.0| Morgan| Morgan| 4| 0.012| 0.012| 9.113630950205666|2.9909909909909906|2011-05-26|2011-05-26| 3|220.92747883214062| Swindno| Swindon| 0| 0.001| 0.01|0.6263033203299755| 1.0|amelia.morgan92@d...|amelia.morgan92@d...| 3| 0.004| 0.001| 211.35554441198767| 1.0| 1|\n",
- "|11.456207518049865|0.9996442185022277| 752| 754| Jaes| NaN| 0| 0.001| 0.169|0.4452000905514796| 1.0| NaN| NaN| 4| 0.181| 0.181| 9.113630950205666|0.1982977452590712|1972-07-20|1971-07-20| 2| 84.28155355946456| NaN| NaN| 1| 0.187| 0.187|5.8704874944935215|0.3810655575361458| j.c@white.org| j.c@whige.wort| 3| 0.002| 0.001| 211.35554441198767| 1.0| 1|\n",
- "|24.387299048327478|0.9999999544286963| 760| 761| Henry| Henry| 4| 0.009| 0.009|11.371009132404957| 3.602268935602269| Day| Day| 4| 0.004| 0.004| 9.113630950205666| 8.972972972972972|2002-09-15|2002-08-18| 1|14.373012181955707| Leeds| Leeds| 1| 0.017| 0.017|5.8704874944935215| 4.191721132897603|hday48@thomas-car...|hday48@thomas-car...| 3| 0.003| 0.001| 211.35554441198767| 1.0| 0|\n",
- "|12.076660303346712|0.9997685471829967| 920| 922| Evi| Evie| 3| 0.001| 0.007| 61.79623639995749| 1.0| Jones| Jones| 4| 0.023| 0.023| 9.113630950205666|1.5605170387779081|2012-06-19|2002-07-22| 0|0.2607755750325071| NaN| NaN| 1| 0.187| 0.187|5.8704874944935215|0.3810655575361458|eviejones@brewer-...|eviejones@brewer-...| 4| 0.004| 0.004| 8.411105418567649| 11.42992992992993| 1|\n",
- "| 4.002786788974079|0.9412833223288347| 171| 175| NaN| Lheah| 0| 0.169| 0.001|0.4452000905514796| 1.0| Russell| Russell| 4| 0.01| 0.01| 9.113630950205666| 3.589189189189189|2011-06-08|2011-07-10| 0|0.2607755750325071| London| Londoon| 0| 0.173| 0.002|0.6263033203299755| 1.0|leahrussell@charl...|leahrussell@charl...| 4| 0.005| 0.005| 8.411105418567649| 9.143943943943944| 1|\n",
- "|19.936162812706836|0.9999990031804153| 851| 853| Mhichael| Michael| 2| 0.001| 0.006| 47.10808446952784| 1.0| NaN| NaN| 4| 0.181| 0.181| 9.113630950205666|0.1982977452590712|2000-04-03|2000-04-03| 3|220.92747883214062| London| London| 1| 0.173| 0.173|5.8704874944935215|0.4119032327124813| m.w@cannon.com| m@w.cannon.com| 2| 0.002| 0.001| 251.69908796212906| 1.0| 1|\n",
- "| 21.33290823458872|0.9999996214227064| 400| 402| James| James| 4| 0.013| 0.013|11.371009132404957| 2.4938784938784937| Dixon| Dixon| 4| 0.009| 0.009| 9.113630950205666| 3.987987987987988|1991-04-12|1991-04-12| 3|220.92747883214062| NaN| Loodnon| 0| 0.187| 0.001|0.6263033203299755| 1.0|james.d@merritot-...|james.d@merritt-s...| 3| 0.001| 0.005| 211.35554441198767| 1.0| 0|\n",
- "|22.169132705637786|0.9999997879560012| 81| 84| Ryan| Ryan| 4| 0.005| 0.005|11.371009132404957| 6.484084084084084| Cole| Cole| 4| 0.005| 0.005| 9.113630950205666| 7.178378378378378|1987-05-27|1988-05-27| 2| 84.28155355946456| NaN| Bristol| 0| 0.187| 0.016|0.6263033203299755| 1.0|r.cole1@ramirez-a...|r.cole1@ramtrez-a...| 3| 0.005| 0.001| 211.35554441198767| 1.0| 0|\n",
- "|6.1486678498977065|0.9861008615160808| 652| 654| NaN| NaN| 4| 0.169| 0.169|11.371009132404957| 0.19183680722142257| Roberts| NaN| 0| 0.006| 0.181|0.45554364195240765| 1.0|1990-10-26|1990-10-26| 3|220.92747883214062|Birmingham|Birmingham| 1| 0.04| 0.04|5.8704874944935215|1.7814814814814814| NaN|droberts73@taylor...| 0| 0.211| 0.003|0.35260600559686806| 1.0| 0|\n",
- "|17.935398542824068|0.9999960106207738| 582| 584| ilivOa| Olivia| 1| 0.001| 0.014| 3.944098136204933| 1.0| Edwards| Edwards| 4| 0.008| 0.008| 9.113630950205666| 4.486486486486486|1988-12-27|1988-12-27| 3|220.92747883214062| Dudley| Duudley| 0| 0.006| 0.001|0.6263033203299755| 1.0| oe56@lopez.net| oe56@lopez.net| 4| 0.003| 0.003| 8.411105418567649| 15.239906573239907| 1|\n",
- "|21.036204363210302|0.9999995349803662| 978| 981| Jessica| Jessica| 4| 0.011| 0.011|11.371009132404957| 2.9473109473109473| Miller| Miiller| 3| 0.004| 0.001| 82.56312210691897| 1.0|2001-05-23|2001-05-23| 3|220.92747883214062| NaN| Coventry| 0| 0.187| 0.021|0.6263033203299755| 1.0|jessica.miller@jo...|jessica.miller@jo...| 4| 0.006| 0.006| 8.411105418567649| 7.619953286619953| 0|\n",
- "|13.095432674729635|0.9998857562788657| 684| 686| Rosie| Rosie| 4| 0.005| 0.005|11.371009132404957| 6.484084084084084| Johnstn| Johnston| 3| 0.001| 0.002| 82.56312210691897| 1.0|1979-12-23|1978-11-23| 1|14.373012181955707| NaN| Sheffield| 0| 0.187| 0.007|0.6263033203299755| 1.0| NaN| NaN| 4| 0.211| 0.211| 8.411105418567649|0.21668113611241574| 0|\n",
- "|25.252698357543103|0.9999999749861632| 279| 280| Lola| Lola| 4| 0.008| 0.008|11.371009132404957| 4.0525525525525525| Taylor| Taylor| 4| 0.014| 0.014| 9.113630950205666|2.5637065637065635|2017-11-20|2016-11-20| 2| 84.28155355946456| Aberdeen| Aberdeen| 1| 0.016| 0.016|5.8704874944935215| 4.453703703703703|lolat86@bishop-gi...|lolat86@bishop-gi...| 4| 0.002| 0.002| 8.411105418567649| 22.85985985985986| 0|\n",
- "| 9.711807138722323|0.9988089303569408| 42| 43| Theodore| Theodore| 4| 0.01| 0.01|11.371009132404957| 3.242042042042042| Morris| Morris| 4| 0.004| 0.004| 9.113630950205666| 8.972972972972972|1978-09-18|1978-08-19| 1|14.373012181955707|Birgmhniam|Birmingham| 0| 0.001| 0.04|0.6263033203299755| 1.0| NaN|t.m39@brooks-sawy...| 0| 0.211| 0.005|0.35260600559686806| 1.0| 0|\n",
- "| 5.951711022429932|0.9841000517299358| 173| 174| NaN| Leah| 0| 0.169| 0.002|0.4452000905514796| 1.0| Russell| Russell| 4| 0.01| 0.01| 9.113630950205666| 3.589189189189189|2011-06-08|2012-07-09| 0|0.2607755750325071| London| London| 1| 0.173| 0.173|5.8704874944935215|0.4119032327124813|leahrussell@charl...|leahrussell@charl...| 4| 0.005| 0.005| 8.411105418567649| 9.143943943943944| 1|\n",
- "| 23.43211696288854|0.9999999116452517| 88| 89| Lexi| Lexi| 4| 0.003| 0.003|11.371009132404957| 10.806806806806806| NaN| NaN| 4| 0.181| 0.181| 9.113630950205666|0.1982977452590712|1994-09-02|1994-09-02| 3|220.92747883214062|Birmingham|Birmingham| 1| 0.04| 0.04|5.8704874944935215|1.7814814814814814|l.gordon34cfren@h...|l.gordon34@french...| 2| 0.001| 0.002| 251.69908796212906| 1.0| 0|\n",
- "|7.1659948250873144|0.9930847652376709| 391| 393| Isaac| Isaac| 4| 0.005| 0.005|11.371009132404957| 6.484084084084084| NaN| James| 0| 0.181| 0.007|0.45554364195240765| 1.0|1991-05-06|1991-05-06| 3|220.92747883214062| Lodon| London| 0| 0.008| 0.173|0.6263033203299755| 1.0|isaac.james@smich...| NaN| 0| 0.001| 0.211|0.35260600559686806| 1.0| 0|\n",
- "+------------------+------------------+-----------+-----------+------------+------------+----------------+---------------+---------------+------------------+--------------------+---------+---------+-------------+------------+------------+-------------------+------------------+----------+----------+---------+------------------+----------+----------+----------+---------+---------+------------------+------------------+--------------------+--------------------+-----------+----------+----------+-------------------+-------------------+---------+\n",
- "only showing top 20 rows\n",
- "\n"
- ]
- }
- ],
- "source": [
- "spark_df = results.as_spark_dataframe().show()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 4.65 seconds \n",
+ "Predict time: 82.92 seconds \n"
+ ]
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
+ ],
+ "source": [
+ "results = linker.inference.predict(threshold_match_probability=0.9)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-13T12:31:44.605970Z",
+ "iopub.status.busy": "2024-03-13T12:31:44.605505Z",
+ "iopub.status.idle": "2024-03-13T12:31:44.750590Z",
+ "shell.execute_reply": "2024-03-13T12:31:44.749429Z"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------------------+------------------+-----------+-----------+------------+------------+----------------+---------------+---------------+------------------+--------------------+---------+---------+-------------+------------+------------+-------------------+------------------+----------+----------+---------+------------------+----------+----------+----------+---------+---------+------------------+------------------+--------------------+--------------------+-----------+----------+----------+-------------------+-------------------+---------+\n",
+ "| match_weight| match_probability|unique_id_l|unique_id_r|first_name_l|first_name_r|gamma_first_name|tf_first_name_l|tf_first_name_r| bf_first_name|bf_tf_adj_first_name|surname_l|surname_r|gamma_surname|tf_surname_l|tf_surname_r| bf_surname| bf_tf_adj_surname| dob_l| dob_r|gamma_dob| bf_dob| city_l| city_r|gamma_city|tf_city_l|tf_city_r| bf_city| bf_tf_adj_city| email_l| email_r|gamma_email|tf_email_l|tf_email_r| bf_email| bf_tf_adj_email|match_key|\n",
+ "+------------------+------------------+-----------+-----------+------------+------------+----------------+---------------+---------------+------------------+--------------------+---------+---------+-------------+------------+------------+-------------------+------------------+----------+----------+---------+------------------+----------+----------+----------+---------+---------+------------------+------------------+--------------------+--------------------+-----------+----------+----------+-------------------+-------------------+---------+\n",
+ "|15.131885475840011|0.9999721492762709| 51| 56| Jayden| Jayden| 4| 0.008| 0.008|11.371009132404957| 4.0525525525525525| Bennett| Bennett| 4| 0.006| 0.006| 9.113630950205666| 5.981981981981981|2017-01-11|2017-02-10| 1|14.373012181955707| Swansea| Swansea| 1| 0.013| 0.013|5.8704874944935215| 5.481481481481482| NaN| jb88@king.com| 0| 0.211| 0.004|0.35260600559686806| 1.0| 0|\n",
+ "| 7.86514930254232|0.9957293356289956| 575| 577| Jessica| Jessica| 4| 0.011| 0.011|11.371009132404957| 2.9473109473109473| Owen| NaN| 0| 0.006| 0.181|0.45554364195240765| 1.0|1974-11-17|1974-11-17| 3|220.92747883214062| NaN| NaN| 1| 0.187| 0.187|5.8704874944935215|0.3810655575361458| NaN|jessica.owen@elli...| 0| 0.211| 0.002|0.35260600559686806| 1.0| 0|\n",
+ "| 5.951711022429932|0.9841000517299358| 171| 174| NaN| Leah| 0| 0.169| 0.002|0.4452000905514796| 1.0| Russell| Russell| 4| 0.01| 0.01| 9.113630950205666| 3.589189189189189|2011-06-08|2012-07-09| 0|0.2607755750325071| London| London| 1| 0.173| 0.173|5.8704874944935215|0.4119032327124813|leahrussell@charl...|leahrussell@charl...| 4| 0.005| 0.005| 8.411105418567649| 9.143943943943944| 1|\n",
+ "|21.650093935297473|0.9999996961409438| 518| 519| Amelia| Amlelia| 2| 0.009| 0.001| 47.10808446952784| 1.0| Morgan| Morgan| 4| 0.012| 0.012| 9.113630950205666|2.9909909909909906|2011-05-26|2011-05-26| 3|220.92747883214062| Swindno| Swindon| 0| 0.001| 0.01|0.6263033203299755| 1.0|amelia.morgan92@d...|amelia.morgan92@d...| 3| 0.004| 0.001| 211.35554441198767| 1.0| 1|\n",
+ "|11.456207518049865|0.9996442185022277| 752| 754| Jaes| NaN| 0| 0.001| 0.169|0.4452000905514796| 1.0| NaN| NaN| 4| 0.181| 0.181| 9.113630950205666|0.1982977452590712|1972-07-20|1971-07-20| 2| 84.28155355946456| NaN| NaN| 1| 0.187| 0.187|5.8704874944935215|0.3810655575361458| j.c@white.org| j.c@whige.wort| 3| 0.002| 0.001| 211.35554441198767| 1.0| 1|\n",
+ "|24.387299048327478|0.9999999544286963| 760| 761| Henry| Henry| 4| 0.009| 0.009|11.371009132404957| 3.602268935602269| Day| Day| 4| 0.004| 0.004| 9.113630950205666| 8.972972972972972|2002-09-15|2002-08-18| 1|14.373012181955707| Leeds| Leeds| 1| 0.017| 0.017|5.8704874944935215| 4.191721132897603|hday48@thomas-car...|hday48@thomas-car...| 3| 0.003| 0.001| 211.35554441198767| 1.0| 0|\n",
+ "|12.076660303346712|0.9997685471829967| 920| 922| Evi| Evie| 3| 0.001| 0.007| 61.79623639995749| 1.0| Jones| Jones| 4| 0.023| 0.023| 9.113630950205666|1.5605170387779081|2012-06-19|2002-07-22| 0|0.2607755750325071| NaN| NaN| 1| 0.187| 0.187|5.8704874944935215|0.3810655575361458|eviejones@brewer-...|eviejones@brewer-...| 4| 0.004| 0.004| 8.411105418567649| 11.42992992992993| 1|\n",
+ "| 4.002786788974079|0.9412833223288347| 171| 175| NaN| Lheah| 0| 0.169| 0.001|0.4452000905514796| 1.0| Russell| Russell| 4| 0.01| 0.01| 9.113630950205666| 3.589189189189189|2011-06-08|2011-07-10| 0|0.2607755750325071| London| Londoon| 0| 0.173| 0.002|0.6263033203299755| 1.0|leahrussell@charl...|leahrussell@charl...| 4| 0.005| 0.005| 8.411105418567649| 9.143943943943944| 1|\n",
+ "|19.936162812706836|0.9999990031804153| 851| 853| Mhichael| Michael| 2| 0.001| 0.006| 47.10808446952784| 1.0| NaN| NaN| 4| 0.181| 0.181| 9.113630950205666|0.1982977452590712|2000-04-03|2000-04-03| 3|220.92747883214062| London| London| 1| 0.173| 0.173|5.8704874944935215|0.4119032327124813| m.w@cannon.com| m@w.cannon.com| 2| 0.002| 0.001| 251.69908796212906| 1.0| 1|\n",
+ "| 21.33290823458872|0.9999996214227064| 400| 402| James| James| 4| 0.013| 0.013|11.371009132404957| 2.4938784938784937| Dixon| Dixon| 4| 0.009| 0.009| 9.113630950205666| 3.987987987987988|1991-04-12|1991-04-12| 3|220.92747883214062| NaN| Loodnon| 0| 0.187| 0.001|0.6263033203299755| 1.0|james.d@merritot-...|james.d@merritt-s...| 3| 0.001| 0.005| 211.35554441198767| 1.0| 0|\n",
+ "|22.169132705637786|0.9999997879560012| 81| 84| Ryan| Ryan| 4| 0.005| 0.005|11.371009132404957| 6.484084084084084| Cole| Cole| 4| 0.005| 0.005| 9.113630950205666| 7.178378378378378|1987-05-27|1988-05-27| 2| 84.28155355946456| NaN| Bristol| 0| 0.187| 0.016|0.6263033203299755| 1.0|r.cole1@ramirez-a...|r.cole1@ramtrez-a...| 3| 0.005| 0.001| 211.35554441198767| 1.0| 0|\n",
+ "|6.1486678498977065|0.9861008615160808| 652| 654| NaN| NaN| 4| 0.169| 0.169|11.371009132404957| 0.19183680722142257| Roberts| NaN| 0| 0.006| 0.181|0.45554364195240765| 1.0|1990-10-26|1990-10-26| 3|220.92747883214062|Birmingham|Birmingham| 1| 0.04| 0.04|5.8704874944935215|1.7814814814814814| NaN|droberts73@taylor...| 0| 0.211| 0.003|0.35260600559686806| 1.0| 0|\n",
+ "|17.935398542824068|0.9999960106207738| 582| 584| ilivOa| Olivia| 1| 0.001| 0.014| 3.944098136204933| 1.0| Edwards| Edwards| 4| 0.008| 0.008| 9.113630950205666| 4.486486486486486|1988-12-27|1988-12-27| 3|220.92747883214062| Dudley| Duudley| 0| 0.006| 0.001|0.6263033203299755| 1.0| oe56@lopez.net| oe56@lopez.net| 4| 0.003| 0.003| 8.411105418567649| 15.239906573239907| 1|\n",
+ "|21.036204363210302|0.9999995349803662| 978| 981| Jessica| Jessica| 4| 0.011| 0.011|11.371009132404957| 2.9473109473109473| Miller| Miiller| 3| 0.004| 0.001| 82.56312210691897| 1.0|2001-05-23|2001-05-23| 3|220.92747883214062| NaN| Coventry| 0| 0.187| 0.021|0.6263033203299755| 1.0|jessica.miller@jo...|jessica.miller@jo...| 4| 0.006| 0.006| 8.411105418567649| 7.619953286619953| 0|\n",
+ "|13.095432674729635|0.9998857562788657| 684| 686| Rosie| Rosie| 4| 0.005| 0.005|11.371009132404957| 6.484084084084084| Johnstn| Johnston| 3| 0.001| 0.002| 82.56312210691897| 1.0|1979-12-23|1978-11-23| 1|14.373012181955707| NaN| Sheffield| 0| 0.187| 0.007|0.6263033203299755| 1.0| NaN| NaN| 4| 0.211| 0.211| 8.411105418567649|0.21668113611241574| 0|\n",
+ "|25.252698357543103|0.9999999749861632| 279| 280| Lola| Lola| 4| 0.008| 0.008|11.371009132404957| 4.0525525525525525| Taylor| Taylor| 4| 0.014| 0.014| 9.113630950205666|2.5637065637065635|2017-11-20|2016-11-20| 2| 84.28155355946456| Aberdeen| Aberdeen| 1| 0.016| 0.016|5.8704874944935215| 4.453703703703703|lolat86@bishop-gi...|lolat86@bishop-gi...| 4| 0.002| 0.002| 8.411105418567649| 22.85985985985986| 0|\n",
+ "| 9.711807138722323|0.9988089303569408| 42| 43| Theodore| Theodore| 4| 0.01| 0.01|11.371009132404957| 3.242042042042042| Morris| Morris| 4| 0.004| 0.004| 9.113630950205666| 8.972972972972972|1978-09-18|1978-08-19| 1|14.373012181955707|Birgmhniam|Birmingham| 0| 0.001| 0.04|0.6263033203299755| 1.0| NaN|t.m39@brooks-sawy...| 0| 0.211| 0.005|0.35260600559686806| 1.0| 0|\n",
+ "| 5.951711022429932|0.9841000517299358| 173| 174| NaN| Leah| 0| 0.169| 0.002|0.4452000905514796| 1.0| Russell| Russell| 4| 0.01| 0.01| 9.113630950205666| 3.589189189189189|2011-06-08|2012-07-09| 0|0.2607755750325071| London| London| 1| 0.173| 0.173|5.8704874944935215|0.4119032327124813|leahrussell@charl...|leahrussell@charl...| 4| 0.005| 0.005| 8.411105418567649| 9.143943943943944| 1|\n",
+ "| 23.43211696288854|0.9999999116452517| 88| 89| Lexi| Lexi| 4| 0.003| 0.003|11.371009132404957| 10.806806806806806| NaN| NaN| 4| 0.181| 0.181| 9.113630950205666|0.1982977452590712|1994-09-02|1994-09-02| 3|220.92747883214062|Birmingham|Birmingham| 1| 0.04| 0.04|5.8704874944935215|1.7814814814814814|l.gordon34cfren@h...|l.gordon34@french...| 2| 0.001| 0.002| 251.69908796212906| 1.0| 0|\n",
+ "|7.1659948250873144|0.9930847652376709| 391| 393| Isaac| Isaac| 4| 0.005| 0.005|11.371009132404957| 6.484084084084084| NaN| James| 0| 0.181| 0.007|0.45554364195240765| 1.0|1991-05-06|1991-05-06| 3|220.92747883214062| Lodon| London| 0| 0.008| 0.173|0.6263033203299755| 1.0|isaac.james@smich...| NaN| 0| 0.001| 0.211|0.35260600559686806| 1.0| 0|\n",
+ "+------------------+------------------+-----------+-----------+------------+------------+----------------+---------------+---------------+------------------+--------------------+---------+---------+-------------+------------+------------+-------------------+------------------+----------+----------+---------+------------------+----------+----------+----------+---------+---------+------------------+------------------+--------------------+--------------------+-----------+----------+----------+-------------------+-------------------+---------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
}
+ ],
+ "source": [
+ "spark_df = results.as_spark_dataframe().show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb
index 2986bea0b9..97d914fd71 100644
--- a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb
+++ b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb
@@ -1,1506 +1,1506 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Linking a dataset of real historical persons\n",
- "\n",
- "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced.\n",
- "\n",
- "Note, as explained in the [backends topic guide](../../../topic_guides/splink_fundamentals/backends/backends.md#sqlite), SQLite does not natively support string fuzzy matching functions such as `damareau-levenshtein` and `jaro-winkler` (as used in this example). Instead, these have been imported as python User Defined Functions (UDFs). One drawback of python UDFs is that they are considerably slower than native-SQL comparisons. As such, if you are hitting issues with large run times, consider switching to DuckDB (or some other backend).\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:30.610213Z",
- "iopub.status.busy": "2024-05-15T18:41:30.609846Z",
- "iopub.status.idle": "2024-05-15T18:41:30.615335Z",
- "shell.execute_reply": "2024-05-15T18:41:30.614566Z"
- }
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink\n",
- "# !pip install rapidfuzz"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:30.619046Z",
- "iopub.status.busy": "2024-05-15T18:41:30.618760Z",
- "iopub.status.idle": "2024-05-15T18:41:31.933775Z",
- "shell.execute_reply": "2024-05-15T18:41:31.932989Z"
- }
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "from splink import splink_datasets\n",
- "\n",
- "pd.options.display.max_rows = 1000\n",
- "# reduce size of dataset to make things run faster\n",
- "df = splink_datasets.historical_50k.sample(5000)"
- ]
- },
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linking a dataset of real historical persons\n",
+ "\n",
+ "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from Wikidata. Duplicate records are introduced with a variety of errors.\n",
+ "\n",
+ "Note, as explained in the [backends topic guide](../../../topic_guides/splink_fundamentals/backends/backends.md#sqlite), SQLite does not natively support fuzzy string matching functions such as `damerau-levenshtein` and `jaro-winkler` (as used in this example). Instead, these have been imported as Python User Defined Functions (UDFs). One drawback of Python UDFs is that they are considerably slower than native-SQL comparisons. As such, if you are hitting issues with large run times, consider switching to DuckDB (or some other backend).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:30.610213Z",
+ "iopub.status.busy": "2024-05-15T18:41:30.609846Z",
+ "iopub.status.idle": "2024-05-15T18:41:30.615335Z",
+ "shell.execute_reply": "2024-05-15T18:41:30.614566Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink\n",
+ "# !pip install rapidfuzz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:30.619046Z",
+ "iopub.status.busy": "2024-05-15T18:41:30.618760Z",
+ "iopub.status.idle": "2024-05-15T18:41:31.933775Z",
+ "shell.execute_reply": "2024-05-15T18:41:31.932989Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from splink import splink_datasets\n",
+ "\n",
+ "pd.options.display.max_rows = 1000\n",
+ "# reduce size of dataset to make things run faster\n",
+ "df = splink_datasets.historical_50k.sample(5000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:31.938051Z",
+ "iopub.status.busy": "2024-05-15T18:41:31.937677Z",
+ "iopub.status.idle": "2024-05-15T18:41:32.856954Z",
+ "shell.execute_reply": "2024-05-15T18:41:32.856284Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:31.938051Z",
- "iopub.status.busy": "2024-05-15T18:41:31.937677Z",
- "iopub.status.idle": "2024-05-15T18:41:32.856954Z",
- "shell.execute_reply": "2024-05-15T18:41:32.856284Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink.backends.sqlite import SQLiteAPI\n",
- "from splink.exploratory import profile_columns\n",
- "\n",
- "db_api = SQLiteAPI()\n",
- "profile_columns(\n",
- " df, db_api, column_expressions=[\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"]\n",
- ")"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.backends.sqlite import SQLiteAPI\n",
+ "from splink.exploratory import profile_columns\n",
+ "\n",
+ "db_api = SQLiteAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "profile_columns(\n",
+ " df_sdf, column_expressions=[\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:32.900620Z",
+ "iopub.status.busy": "2024-05-15T18:41:32.900280Z",
+ "iopub.status.idle": "2024-05-15T18:41:33.193607Z",
+ "shell.execute_reply": "2024-05-15T18:41:33.192963Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:32.900620Z",
- "iopub.status.busy": "2024-05-15T18:41:32.900280Z",
- "iopub.status.idle": "2024-05-15T18:41:33.193607Z",
- "shell.execute_reply": "2024-05-15T18:41:33.192963Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink import block_on\n",
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "blocking_rules = [block_on(\"first_name\", \"surname\"),\n",
- " block_on(\"surname\", \"dob\"),\n",
- " block_on(\"first_name\", \"dob\"),\n",
- " block_on(\"postcode_fake\", \"first_name\")]\n",
- "\n",
- "db_api = SQLiteAPI()\n",
- "\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=df,\n",
- " blocking_rules=blocking_rules,\n",
- " db_api=db_api,\n",
- " link_type=\"dedupe_only\"\n",
- ")"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink import block_on\n",
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "blocking_rules = [block_on(\"first_name\", \"surname\"),\n",
+ " block_on(\"surname\", \"dob\"),\n",
+ " block_on(\"first_name\", \"dob\"),\n",
+ " block_on(\"postcode_fake\", \"first_name\")]\n",
+ "\n",
+ "\n",
+ "\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " df_sdf,\n",
+ " blocking_rules=blocking_rules,\n",
+ " link_type=\"dedupe_only\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:33.197015Z",
+ "iopub.status.busy": "2024-05-15T18:41:33.196743Z",
+ "iopub.status.idle": "2024-05-15T18:41:33.330331Z",
+ "shell.execute_reply": "2024-05-15T18:41:33.329671Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "from splink import Linker\n",
+ "\n",
+ "settings = {\n",
+ " \"link_type\": \"dedupe_only\",\n",
+ " \"blocking_rules_to_generate_predictions\": [\n",
+ " block_on(\"first_name\", \"surname\"),\n",
+ " block_on(\"surname\", \"dob\"),\n",
+ " block_on(\"first_name\", \"dob\"),\n",
+ " block_on(\"postcode_fake\", \"first_name\"),\n",
+ "\n",
+ " ],\n",
+ " \"comparisons\": [\n",
+ " cl.NameComparison(\"first_name\"),\n",
+ " cl.NameComparison(\"surname\"),\n",
+ " cl.DamerauLevenshteinAtThresholds(\"dob\", [1, 2]).configure(\n",
+ " term_frequency_adjustments=True\n",
+ " ),\n",
+ " cl.DamerauLevenshteinAtThresholds(\"postcode_fake\", [1, 2]),\n",
+ " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n",
+ " cl.ExactMatch(\n",
+ " \"occupation\",\n",
+ " ).configure(term_frequency_adjustments=True),\n",
+ " ],\n",
+ " \"retain_matching_columns\": True,\n",
+ " \"retain_intermediate_calculation_columns\": True,\n",
+ " \"max_iterations\": 10,\n",
+ " \"em_convergence\": 0.01,\n",
+ "}\n",
+ "\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:33.334300Z",
+ "iopub.status.busy": "2024-05-15T18:41:33.333988Z",
+ "iopub.status.idle": "2024-05-15T18:41:33.488238Z",
+ "shell.execute_reply": "2024-05-15T18:41:33.487555Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:33.197015Z",
- "iopub.status.busy": "2024-05-15T18:41:33.196743Z",
- "iopub.status.idle": "2024-05-15T18:41:33.330331Z",
- "shell.execute_reply": "2024-05-15T18:41:33.329671Z"
- }
- },
- "outputs": [],
- "source": [
- "import splink.comparison_library as cl\n",
- "from splink import Linker\n",
- "\n",
- "settings = {\n",
- " \"link_type\": \"dedupe_only\",\n",
- " \"blocking_rules_to_generate_predictions\": [\n",
- " block_on(\"first_name\", \"surname\"),\n",
- " block_on(\"surname\", \"dob\"),\n",
- " block_on(\"first_name\", \"dob\"),\n",
- " block_on(\"postcode_fake\", \"first_name\"),\n",
- "\n",
- " ],\n",
- " \"comparisons\": [\n",
- " cl.NameComparison(\"first_name\"),\n",
- " cl.NameComparison(\"surname\"),\n",
- " cl.DamerauLevenshteinAtThresholds(\"dob\", [1, 2]).configure(\n",
- " term_frequency_adjustments=True\n",
- " ),\n",
- " cl.DamerauLevenshteinAtThresholds(\"postcode_fake\", [1, 2]),\n",
- " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n",
- " cl.ExactMatch(\n",
- " \"occupation\",\n",
- " ).configure(term_frequency_adjustments=True),\n",
- " ],\n",
- " \"retain_matching_columns\": True,\n",
- " \"retain_intermediate_calculation_columns\": True,\n",
- " \"max_iterations\": 10,\n",
- " \"em_convergence\": 0.01,\n",
- "}\n",
- "\n",
- "linker = Linker(df, settings, db_api=db_api)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.000125.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 7,985.62 are expected to match. With 12,497,500 total possible comparisons, we expect a total of around 1,565.00 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_probability_two_random_records_match(\n",
+ " [\n",
+ " block_on(\"first_name\", \"surname\", \"dob\"),\n",
+ " block_on(\"substr(first_name,1,2)\", \"surname\", \"substr(postcode_fake,1,2)\"),\n",
+ " block_on(\"dob\", \"postcode_fake\"),\n",
+ " ],\n",
+ " recall=0.6,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:33.491551Z",
+ "iopub.status.busy": "2024-05-15T18:41:33.491328Z",
+ "iopub.status.idle": "2024-05-15T18:41:41.469753Z",
+ "shell.execute_reply": "2024-05-15T18:41:41.469157Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:33.334300Z",
- "iopub.status.busy": "2024-05-15T18:41:33.333988Z",
- "iopub.status.idle": "2024-05-15T18:41:33.488238Z",
- "shell.execute_reply": "2024-05-15T18:41:33.487555Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.000125.\n",
- "This means that amongst all possible pairwise record comparisons, one in 7,985.62 are expected to match. With 12,497,500 total possible comparisons, we expect a total of around 1,565.00 matching pairs\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_probability_two_random_records_match(\n",
- " [\n",
- " block_on(\"first_name\", \"surname\", \"dob\"),\n",
- " block_on(\"substr(first_name,1,2)\", \"surname\", \"substr(postcode_fake,1,2)\"),\n",
- " block_on(\"dob\", \"postcode_fake\"),\n",
- " ],\n",
- " recall=0.6,\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
+ "----- Estimating u probabilities using random sampling -----\n",
+ "u probability not trained for first_name - Jaro-Winkler distance of first_name >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "u probability not trained for surname - Jaro-Winkler distance of surname >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Estimated u probabilities using random sampling\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (some u values are not trained, no m values are trained).\n",
+ " - surname (some u values are not trained, no m values are trained).\n",
+ " - dob (no m values are trained).\n",
+ " - postcode_fake (no m values are trained).\n",
+ " - birth_place (no m values are trained).\n",
+ " - occupation (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:41.473301Z",
+ "iopub.status.busy": "2024-05-15T18:41:41.473009Z",
+ "iopub.status.idle": "2024-05-15T18:41:41.683463Z",
+ "shell.execute_reply": "2024-05-15T18:41:41.682843Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:33.491551Z",
- "iopub.status.busy": "2024-05-15T18:41:33.491328Z",
- "iopub.status.idle": "2024-05-15T18:41:41.469753Z",
- "shell.execute_reply": "2024-05-15T18:41:41.469157Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n",
- "----- Estimating u probabilities using random sampling -----\n",
- "u probability not trained for first_name - Jaro-Winkler distance of first_name >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "u probability not trained for surname - Jaro-Winkler distance of surname >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Estimated u probabilities using random sampling\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (some u values are not trained, no m values are trained).\n",
- " - surname (some u values are not trained, no m values are trained).\n",
- " - dob (no m values are trained).\n",
- " - postcode_fake (no m values are trained).\n",
- " - birth_place (no m values are trained).\n",
- " - occupation (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.first_name = r.first_name and l.surname = r.surname\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - dob\n",
+ " - postcode_fake\n",
+ " - birth_place\n",
+ " - occupation\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - first_name\n",
+ " - surname\n",
+ "\n",
+ "Iteration 1: Largest change in params was -0.438 in probability_two_random_records_match\n",
+ "Iteration 2: Largest change in params was -0.0347 in probability_two_random_records_match\n",
+ "Iteration 3: Largest change in params was -0.0126 in the m_probability of birth_place, level `All other comparisons`\n",
+ "Iteration 4: Largest change in params was 0.00644 in the m_probability of birth_place, level `Exact match on birth_place`\n",
+ "\n",
+ "EM converged after 4 iterations\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (some u values are not trained, no m values are trained).\n",
+ " - surname (some u values are not trained, no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n",
+ "training_session_names = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " training_blocking_rule, estimate_without_term_frequencies=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:41.686951Z",
+ "iopub.status.busy": "2024-05-15T18:41:41.686683Z",
+ "iopub.status.idle": "2024-05-15T18:41:41.926273Z",
+ "shell.execute_reply": "2024-05-15T18:41:41.925689Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:41.473301Z",
- "iopub.status.busy": "2024-05-15T18:41:41.473009Z",
- "iopub.status.idle": "2024-05-15T18:41:41.683463Z",
- "shell.execute_reply": "2024-05-15T18:41:41.682843Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.first_name = r.first_name and l.surname = r.surname\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - dob\n",
- " - postcode_fake\n",
- " - birth_place\n",
- " - occupation\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - first_name\n",
- " - surname\n",
- "\n",
- "Iteration 1: Largest change in params was -0.438 in probability_two_random_records_match\n",
- "Iteration 2: Largest change in params was -0.0347 in probability_two_random_records_match\n",
- "Iteration 3: Largest change in params was -0.0126 in the m_probability of birth_place, level `All other comparisons`\n",
- "Iteration 4: Largest change in params was 0.00644 in the m_probability of birth_place, level `Exact match on birth_place`\n",
- "\n",
- "EM converged after 4 iterations\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (some u values are not trained, no m values are trained).\n",
- " - surname (some u values are not trained, no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n",
- "training_session_names = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " training_blocking_rule, estimate_without_term_frequencies=True\n",
- ")"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n",
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.dob = r.dob\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name\n",
+ " - surname\n",
+ " - postcode_fake\n",
+ " - birth_place\n",
+ " - occupation\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - dob\n",
+ "\n",
+ "WARNING:\n",
+ "Level Jaro-Winkler distance of first_name >= 0.88 on comparison first_name not observed in dataset, unable to train m value\n",
+ "\n",
+ "WARNING:\n",
+ "Level Jaro-Winkler distance of surname >= 0.88 on comparison surname not observed in dataset, unable to train m value\n",
+ "\n",
+ "Iteration 1: Largest change in params was 0.327 in the m_probability of first_name, level `All other comparisons`\n",
+ "Iteration 2: Largest change in params was -0.0566 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 3: Largest change in params was -0.0184 in the m_probability of surname, level `Exact match on surname`\n",
+ "Iteration 4: Largest change in params was -0.006 in the m_probability of surname, level `Exact match on surname`\n",
+ "\n",
+ "EM converged after 4 iterations\n",
+ "m probability not trained for first_name - Jaro-Winkler distance of first_name >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "m probability not trained for surname - Jaro-Winkler distance of surname >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (some u values are not trained, some m values are not trained).\n",
+ " - surname (some u values are not trained, some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = \"l.dob = r.dob\"\n",
+ "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " training_blocking_rule, estimate_without_term_frequencies=True\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The final match weights can be viewed in the match weights chart:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:41.929306Z",
+ "iopub.status.busy": "2024-05-15T18:41:41.929078Z",
+ "iopub.status.idle": "2024-05-15T18:41:42.230106Z",
+ "shell.execute_reply": "2024-05-15T18:41:42.229484Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:41.686951Z",
- "iopub.status.busy": "2024-05-15T18:41:41.686683Z",
- "iopub.status.idle": "2024-05-15T18:41:41.926273Z",
- "shell.execute_reply": "2024-05-15T18:41:41.925689Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n",
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.dob = r.dob\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name\n",
- " - surname\n",
- " - postcode_fake\n",
- " - birth_place\n",
- " - occupation\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - dob\n",
- "\n",
- "WARNING:\n",
- "Level Jaro-Winkler distance of first_name >= 0.88 on comparison first_name not observed in dataset, unable to train m value\n",
- "\n",
- "WARNING:\n",
- "Level Jaro-Winkler distance of surname >= 0.88 on comparison surname not observed in dataset, unable to train m value\n",
- "\n",
- "Iteration 1: Largest change in params was 0.327 in the m_probability of first_name, level `All other comparisons`\n",
- "Iteration 2: Largest change in params was -0.0566 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 3: Largest change in params was -0.0184 in the m_probability of surname, level `Exact match on surname`\n",
- "Iteration 4: Largest change in params was -0.006 in the m_probability of surname, level `Exact match on surname`\n",
- "\n",
- "EM converged after 4 iterations\n",
- "m probability not trained for first_name - Jaro-Winkler distance of first_name >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "m probability not trained for surname - Jaro-Winkler distance of surname >= 0.88 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n",
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (some u values are not trained, some m values are not trained).\n",
- " - surname (some u values are not trained, some m values are not trained).\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "training_blocking_rule = \"l.dob = r.dob\"\n",
- "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " training_blocking_rule, estimate_without_term_frequencies=True\n",
- ")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The final match weights can be viewed in the match weights chart:\n"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
- },
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:42.233172Z",
+ "iopub.status.busy": "2024-05-15T18:41:42.232933Z",
+ "iopub.status.idle": "2024-05-15T18:41:42.813828Z",
+ "shell.execute_reply": "2024-05-15T18:41:42.813043Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:41.929306Z",
- "iopub.status.busy": "2024-05-15T18:41:41.929078Z",
- "iopub.status.idle": "2024-05-15T18:41:42.230106Z",
- "shell.execute_reply": "2024-05-15T18:41:42.229484Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.visualisations.match_weights_chart()"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.unlinkables_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:42.817975Z",
+ "iopub.status.busy": "2024-05-15T18:41:42.817397Z",
+ "iopub.status.idle": "2024-05-15T18:41:43.292311Z",
+ "shell.execute_reply": "2024-05-15T18:41:43.291620Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'first_name':\n",
+ " m values not fully trained\n",
+ "Comparison: 'first_name':\n",
+ " u values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " m values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:42.233172Z",
- "iopub.status.busy": "2024-05-15T18:41:42.232933Z",
- "iopub.status.idle": "2024-05-15T18:41:42.813828Z",
- "shell.execute_reply": "2024-05-15T18:41:42.813043Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " gamma_first_name | \n",
+ " tf_first_name_l | \n",
+ " tf_first_name_r | \n",
+ " bf_first_name | \n",
+ " ... | \n",
+ " bf_birth_place | \n",
+ " bf_tf_adj_birth_place | \n",
+ " occupation_l | \n",
+ " occupation_r | \n",
+ " gamma_occupation | \n",
+ " tf_occupation_l | \n",
+ " tf_occupation_r | \n",
+ " bf_occupation | \n",
+ " bf_tf_adj_occupation | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 26.932083 | \n",
+ " 1.000000 | \n",
+ " Q446382-1 | \n",
+ " Q446382-3 | \n",
+ " marianne | \n",
+ " marianne | \n",
+ " 4 | \n",
+ " 0.000801 | \n",
+ " 0.000801 | \n",
+ " 51.871289 | \n",
+ " ... | \n",
+ " 0.162366 | \n",
+ " 1.000000 | \n",
+ " None | \n",
+ " None | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 30.788800 | \n",
+ " 1.000000 | \n",
+ " Q2835078-1 | \n",
+ " Q2835078-2 | \n",
+ " alfred | \n",
+ " alfred | \n",
+ " 4 | \n",
+ " 0.013622 | \n",
+ " 0.013622 | \n",
+ " 51.871289 | \n",
+ " ... | \n",
+ " 197.452526 | \n",
+ " 0.607559 | \n",
+ " None | \n",
+ " None | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 23.882340 | \n",
+ " 1.000000 | \n",
+ " Q2835078-1 | \n",
+ " Q2835078-5 | \n",
+ " alfred | \n",
+ " alfred | \n",
+ " 4 | \n",
+ " 0.013622 | \n",
+ " 0.013622 | \n",
+ " 51.871289 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " None | \n",
+ " None | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 39.932187 | \n",
+ " 1.000000 | \n",
+ " Q80158702-1 | \n",
+ " Q80158702-4 | \n",
+ " john | \n",
+ " john | \n",
+ " 4 | \n",
+ " 0.053085 | \n",
+ " 0.053085 | \n",
+ " 51.871289 | \n",
+ " ... | \n",
+ " 197.452526 | \n",
+ " 2.025198 | \n",
+ " sculptor | \n",
+ " sculptor | \n",
+ " 1 | \n",
+ " 0.002769 | \n",
+ " 0.002769 | \n",
+ " 23.836781 | \n",
+ " 13.868019 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 17.042339 | \n",
+ " 0.999993 | \n",
+ " Q18810722-3 | \n",
+ " Q18810722-6 | \n",
+ " frederick | \n",
+ " frederick | \n",
+ " 4 | \n",
+ " 0.012220 | \n",
+ " 0.012220 | \n",
+ " 51.871289 | \n",
+ " ... | \n",
+ " 197.452526 | \n",
+ " 0.607559 | \n",
+ " printer | \n",
+ " printer | \n",
+ " 1 | \n",
+ " 0.000791 | \n",
+ " 0.000791 | \n",
+ " 23.836781 | \n",
+ " 48.538067 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 44 columns
\n",
+ "
"
],
- "source": [
- "linker.evaluation.unlinkables_chart()"
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
+ "0 26.932083 1.000000 Q446382-1 Q446382-3 marianne \n",
+ "1 30.788800 1.000000 Q2835078-1 Q2835078-2 alfred \n",
+ "2 23.882340 1.000000 Q2835078-1 Q2835078-5 alfred \n",
+ "3 39.932187 1.000000 Q80158702-1 Q80158702-4 john \n",
+ "4 17.042339 0.999993 Q18810722-3 Q18810722-6 frederick \n",
+ "\n",
+ " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
+ "0 marianne 4 0.000801 0.000801 \n",
+ "1 alfred 4 0.013622 0.013622 \n",
+ "2 alfred 4 0.013622 0.013622 \n",
+ "3 john 4 0.053085 0.053085 \n",
+ "4 frederick 4 0.012220 0.012220 \n",
+ "\n",
+ " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n",
+ "0 51.871289 ... 0.162366 1.000000 None \n",
+ "1 51.871289 ... 197.452526 0.607559 None \n",
+ "2 51.871289 ... 1.000000 1.000000 None \n",
+ "3 51.871289 ... 197.452526 2.025198 sculptor \n",
+ "4 51.871289 ... 197.452526 0.607559 printer \n",
+ "\n",
+ " occupation_r gamma_occupation tf_occupation_l tf_occupation_r \\\n",
+ "0 None -1 NaN NaN \n",
+ "1 None -1 NaN NaN \n",
+ "2 None -1 NaN NaN \n",
+ "3 sculptor 1 0.002769 0.002769 \n",
+ "4 printer 1 0.000791 0.000791 \n",
+ "\n",
+ " bf_occupation bf_tf_adj_occupation match_key \n",
+ "0 1.000000 1.000000 0 \n",
+ "1 1.000000 1.000000 0 \n",
+ "2 1.000000 1.000000 0 \n",
+ "3 23.836781 13.868019 0 \n",
+ "4 23.836781 48.538067 0 \n",
+ "\n",
+ "[5 rows x 44 columns]"
]
- },
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_predict = linker.inference.predict()\n",
+ "df_e = df_predict.as_pandas_dataframe(limit=5)\n",
+ "df_e"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also view rows in this dataset as a waterfall chart as follows:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:43.296030Z",
+ "iopub.status.busy": "2024-05-15T18:41:43.295753Z",
+ "iopub.status.idle": "2024-05-15T18:41:43.969119Z",
+ "shell.execute_reply": "2024-05-15T18:41:43.968521Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:42.817975Z",
- "iopub.status.busy": "2024-05-15T18:41:42.817397Z",
- "iopub.status.idle": "2024-05-15T18:41:43.292311Z",
- "shell.execute_reply": "2024-05-15T18:41:43.291620Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'first_name':\n",
- " m values not fully trained\n",
- "Comparison: 'first_name':\n",
- " u values not fully trained\n",
- "Comparison: 'surname':\n",
- " m values not fully trained\n",
- "Comparison: 'surname':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " gamma_first_name | \n",
- " tf_first_name_l | \n",
- " tf_first_name_r | \n",
- " bf_first_name | \n",
- " ... | \n",
- " bf_birth_place | \n",
- " bf_tf_adj_birth_place | \n",
- " occupation_l | \n",
- " occupation_r | \n",
- " gamma_occupation | \n",
- " tf_occupation_l | \n",
- " tf_occupation_r | \n",
- " bf_occupation | \n",
- " bf_tf_adj_occupation | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 26.932083 | \n",
- " 1.000000 | \n",
- " Q446382-1 | \n",
- " Q446382-3 | \n",
- " marianne | \n",
- " marianne | \n",
- " 4 | \n",
- " 0.000801 | \n",
- " 0.000801 | \n",
- " 51.871289 | \n",
- " ... | \n",
- " 0.162366 | \n",
- " 1.000000 | \n",
- " None | \n",
- " None | \n",
- " -1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 30.788800 | \n",
- " 1.000000 | \n",
- " Q2835078-1 | \n",
- " Q2835078-2 | \n",
- " alfred | \n",
- " alfred | \n",
- " 4 | \n",
- " 0.013622 | \n",
- " 0.013622 | \n",
- " 51.871289 | \n",
- " ... | \n",
- " 197.452526 | \n",
- " 0.607559 | \n",
- " None | \n",
- " None | \n",
- " -1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 23.882340 | \n",
- " 1.000000 | \n",
- " Q2835078-1 | \n",
- " Q2835078-5 | \n",
- " alfred | \n",
- " alfred | \n",
- " 4 | \n",
- " 0.013622 | \n",
- " 0.013622 | \n",
- " 51.871289 | \n",
- " ... | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " None | \n",
- " None | \n",
- " -1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 39.932187 | \n",
- " 1.000000 | \n",
- " Q80158702-1 | \n",
- " Q80158702-4 | \n",
- " john | \n",
- " john | \n",
- " 4 | \n",
- " 0.053085 | \n",
- " 0.053085 | \n",
- " 51.871289 | \n",
- " ... | \n",
- " 197.452526 | \n",
- " 2.025198 | \n",
- " sculptor | \n",
- " sculptor | \n",
- " 1 | \n",
- " 0.002769 | \n",
- " 0.002769 | \n",
- " 23.836781 | \n",
- " 13.868019 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 17.042339 | \n",
- " 0.999993 | \n",
- " Q18810722-3 | \n",
- " Q18810722-6 | \n",
- " frederick | \n",
- " frederick | \n",
- " 4 | \n",
- " 0.012220 | \n",
- " 0.012220 | \n",
- " 51.871289 | \n",
- " ... | \n",
- " 197.452526 | \n",
- " 0.607559 | \n",
- " printer | \n",
- " printer | \n",
- " 1 | \n",
- " 0.000791 | \n",
- " 0.000791 | \n",
- " 23.836781 | \n",
- " 48.538067 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 44 columns
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
- "0 26.932083 1.000000 Q446382-1 Q446382-3 marianne \n",
- "1 30.788800 1.000000 Q2835078-1 Q2835078-2 alfred \n",
- "2 23.882340 1.000000 Q2835078-1 Q2835078-5 alfred \n",
- "3 39.932187 1.000000 Q80158702-1 Q80158702-4 john \n",
- "4 17.042339 0.999993 Q18810722-3 Q18810722-6 frederick \n",
- "\n",
- " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
- "0 marianne 4 0.000801 0.000801 \n",
- "1 alfred 4 0.013622 0.013622 \n",
- "2 alfred 4 0.013622 0.013622 \n",
- "3 john 4 0.053085 0.053085 \n",
- "4 frederick 4 0.012220 0.012220 \n",
- "\n",
- " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n",
- "0 51.871289 ... 0.162366 1.000000 None \n",
- "1 51.871289 ... 197.452526 0.607559 None \n",
- "2 51.871289 ... 1.000000 1.000000 None \n",
- "3 51.871289 ... 197.452526 2.025198 sculptor \n",
- "4 51.871289 ... 197.452526 0.607559 printer \n",
- "\n",
- " occupation_r gamma_occupation tf_occupation_l tf_occupation_r \\\n",
- "0 None -1 NaN NaN \n",
- "1 None -1 NaN NaN \n",
- "2 None -1 NaN NaN \n",
- "3 sculptor 1 0.002769 0.002769 \n",
- "4 printer 1 0.000791 0.000791 \n",
- "\n",
- " bf_occupation bf_tf_adj_occupation match_key \n",
- "0 1.000000 1.000000 0 \n",
- "1 1.000000 1.000000 0 \n",
- "2 1.000000 1.000000 0 \n",
- "3 23.836781 13.868019 0 \n",
- "4 23.836781 48.538067 0 \n",
- "\n",
- "[5 rows x 44 columns]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "df_predict = linker.inference.predict()\n",
- "df_e = df_predict.as_pandas_dataframe(limit=5)\n",
- "df_e"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "records_to_plot = df_e.to_dict(orient=\"records\")\n",
+ "linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:43.972219Z",
+ "iopub.status.busy": "2024-05-15T18:41:43.971787Z",
+ "iopub.status.idle": "2024-05-15T18:41:44.116709Z",
+ "shell.execute_reply": "2024-05-15T18:41:44.115993Z"
+ }
+ },
+ "outputs": [
{
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can also view rows in this dataset as a waterfall chart as follows:\n"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 1, root rows count 5\n",
+ "Completed iteration 2, root rows count 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " df_predict, threshold_match_probability=0.95\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:44.120162Z",
+ "iopub.status.busy": "2024-05-15T18:41:44.119922Z",
+ "iopub.status.idle": "2024-05-15T18:41:44.180152Z",
+ "shell.execute_reply": "2024-05-15T18:41:44.179445Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:43.296030Z",
- "iopub.status.busy": "2024-05-15T18:41:43.295753Z",
- "iopub.status.idle": "2024-05-15T18:41:43.969119Z",
- "shell.execute_reply": "2024-05-15T18:41:43.968521Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
],
- "source": [
- "\n",
- "records_to_plot = df_e.to_dict(orient=\"records\")\n",
- "linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)"
+ "text/plain": [
+ ""
]
- },
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.cluster_studio_dashboard(\n",
+ " df_predict,\n",
+ " clusters,\n",
+ " \"dashboards/50k_cluster.html\",\n",
+ " sampling_method=\"by_cluster_size\",\n",
+ " overwrite=True,\n",
+ ")\n",
+ "\n",
+ "from IPython.display import IFrame\n",
+ "\n",
+ "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:44.184020Z",
+ "iopub.status.busy": "2024-05-15T18:41:44.183710Z",
+ "iopub.status.idle": "2024-05-15T18:41:46.543532Z",
+ "shell.execute_reply": "2024-05-15T18:41:46.542614Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:43.972219Z",
- "iopub.status.busy": "2024-05-15T18:41:43.971787Z",
- "iopub.status.idle": "2024-05-15T18:41:44.116709Z",
- "shell.execute_reply": "2024-05-15T18:41:44.115993Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 1, root rows count 5\n",
- "Completed iteration 2, root rows count 0\n"
- ]
- }
- ],
- "source": [
- "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " df_predict, threshold_match_probability=0.95\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'first_name':\n",
+ " m values not fully trained\n",
+ "Comparison: 'first_name':\n",
+ " u values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " m values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:44.120162Z",
- "iopub.status.busy": "2024-05-15T18:41:44.119922Z",
- "iopub.status.idle": "2024-05-15T18:41:44.180152Z",
- "shell.execute_reply": "2024-05-15T18:41:44.179445Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.visualisations.cluster_studio_dashboard(\n",
- " df_predict,\n",
- " clusters,\n",
- " \"dashboards/50k_cluster.html\",\n",
- " sampling_method=\"by_cluster_size\",\n",
- " overwrite=True,\n",
- ")\n",
- "\n",
- "from IPython.display import IFrame\n",
- "\n",
- "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_column(\n",
+ " \"cluster\", output_type=\"roc\", match_weight_round_to_nearest=0.02\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:46.557696Z",
+ "iopub.status.busy": "2024-05-15T18:41:46.557395Z",
+ "iopub.status.idle": "2024-05-15T18:41:47.295019Z",
+ "shell.execute_reply": "2024-05-15T18:41:47.294474Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:44.184020Z",
- "iopub.status.busy": "2024-05-15T18:41:44.183710Z",
- "iopub.status.idle": "2024-05-15T18:41:46.543532Z",
- "shell.execute_reply": "2024-05-15T18:41:46.542614Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'first_name':\n",
- " m values not fully trained\n",
- "Comparison: 'first_name':\n",
- " u values not fully trained\n",
- "Comparison: 'surname':\n",
- " m values not fully trained\n",
- "Comparison: 'surname':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_column(\n",
- " \"cluster\", output_type=\"roc\", match_weight_round_to_nearest=0.02\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'first_name':\n",
+ " m values not fully trained\n",
+ "Comparison: 'first_name':\n",
+ " u values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " m values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:46.557696Z",
- "iopub.status.busy": "2024-05-15T18:41:46.557395Z",
- "iopub.status.idle": "2024-05-15T18:41:47.295019Z",
- "shell.execute_reply": "2024-05-15T18:41:47.294474Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'first_name':\n",
- " m values not fully trained\n",
- "Comparison: 'first_name':\n",
- " u values not fully trained\n",
- "Comparison: 'surname':\n",
- " m values not fully trained\n",
- "Comparison: 'surname':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "records = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\",\n",
- " threshold_match_probability=0.999,\n",
- " include_false_negatives=False,\n",
- " include_false_positives=True,\n",
- ").as_record_dict()\n",
- "linker.visualisations.waterfall_chart(records)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "records = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\",\n",
+ " threshold_match_probability=0.999,\n",
+ " include_false_negatives=False,\n",
+ " include_false_positives=True,\n",
+ ").as_record_dict()\n",
+ "linker.visualisations.waterfall_chart(records)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T18:41:47.298555Z",
+ "iopub.status.busy": "2024-05-15T18:41:47.298310Z",
+ "iopub.status.idle": "2024-05-15T18:41:50.039196Z",
+ "shell.execute_reply": "2024-05-15T18:41:50.038400Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'first_name':\n",
+ " m values not fully trained\n",
+ "Comparison: 'first_name':\n",
+ " u values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " m values not fully trained\n",
+ "Comparison: 'surname':\n",
+ " u values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-05-15T18:41:47.298555Z",
- "iopub.status.busy": "2024-05-15T18:41:47.298310Z",
- "iopub.status.idle": "2024-05-15T18:41:50.039196Z",
- "shell.execute_reply": "2024-05-15T18:41:50.038400Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'first_name':\n",
- " m values not fully trained\n",
- "Comparison: 'first_name':\n",
- " u values not fully trained\n",
- "Comparison: 'surname':\n",
- " m values not fully trained\n",
- "Comparison: 'surname':\n",
- " u values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "# Some of the false negatives will be because they weren't detected by the blocking rules\n",
- "records = linker.evaluation.prediction_errors_from_labels_column(\n",
- " \"cluster\",\n",
- " threshold_match_probability=0.5,\n",
- " include_false_negatives=True,\n",
- " include_false_positives=False,\n",
- ").as_record_dict(limit=50)\n",
- "\n",
- "linker.visualisations.waterfall_chart(records)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "# Some of the false negatives will be because they weren't detected by the blocking rules\n",
+ "records = linker.evaluation.prediction_errors_from_labels_column(\n",
+ " \"cluster\",\n",
+ " threshold_match_probability=0.5,\n",
+ " include_false_negatives=True,\n",
+ " include_false_positives=False,\n",
+ ").as_record_dict(limit=50)\n",
+ "\n",
+ "linker.visualisations.waterfall_chart(records)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/docs/demos/tutorials/02_Exploratory_analysis.ipynb b/docs/demos/tutorials/02_Exploratory_analysis.ipynb
index cc28672b0f..00947f834a 100644
--- a/docs/demos/tutorials/02_Exploratory_analysis.ipynb
+++ b/docs/demos/tutorials/02_Exploratory_analysis.ipynb
@@ -277,7 +277,8 @@
"from splink.exploratory import completeness_chart\n",
"from splink import DuckDBAPI\n",
"db_api = DuckDBAPI()\n",
- "completeness_chart(df, db_api=db_api)"
+ "df_sdf = db_api.register(df)\n",
+ "completeness_chart(df_sdf)"
]
},
{
@@ -314,7 +315,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"id": "897d183c",
"metadata": {
"execution": {
@@ -407,7 +408,7 @@
"source": [
"from splink.exploratory import profile_columns\n",
"\n",
- "profile_columns(df, db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+ "profile_columns(df_sdf, top_n=10, bottom_n=5)"
]
},
{
diff --git a/docs/demos/tutorials/03_Blocking.ipynb b/docs/demos/tutorials/03_Blocking.ipynb
index 446b3badbd..64a0052eab 100644
--- a/docs/demos/tutorials/03_Blocking.ipynb
+++ b/docs/demos/tutorials/03_Blocking.ipynb
@@ -1,664 +1,662 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Choosing blocking rules to optimise runtime\n",
- "\n",
- "\n",
- "
\n",
- "\n",
- "\n",
- "To link records, we need to compare pairs of records and decide which pairs are matches.\n",
- "\n",
- "For example consider the following two records:\n",
- "\n",
- "| first_name | surname | dob | city | email |\n",
- "| ---------- | ------- | ---------- | ------ | ------------------- |\n",
- "| Robert | Allen | 1971-05-24 | nan | roberta25@smith.net |\n",
- "| Rob | Allen | 1971-06-24 | London | roberta25@smith.net |\n",
- "\n",
- "These can be represented as a pairwise comparison as follows:\n",
- "\n",
- "| first_name_l | first_name_r | surname_l | surname_r | dob_l | dob_r | city_l | city_r | email_l | email_r |\n",
- "| ------------ | ------------ | --------- | --------- | ---------- | ---------- | ------ | ------ | ------------------- | ------------------- |\n",
- "| Robert | Rob | Allen | Allen | 1971-05-24 | 1971-06-24 | nan | London | roberta25@smith.net | roberta25@smith.net |\n",
- "\n",
- "For most large datasets, it is computationally intractable to compare every row with every other row, since the number of comparisons rises quadratically with the number of records.\n",
- "\n",
- "Instead we rely on blocking rules, which specify which pairwise comparisons to generate. For example, we could generate the subset of pairwise comparisons where either first name or surname matches.\n",
- "\n",
- "This is part of a two step process to link data:\n",
- "\n",
- "1. Use blocking rules to generate candidate pairwise record comparisons\n",
- "\n",
- "2. Use a probabilistic linkage model to score these candidate pairs, to determine which ones should be linked\n",
- "\n",
- "**Blocking rules are the most important determinant of the performance of your linkage job**.\n",
- "\n",
- "When deciding on your blocking rules, you're trading off accuracy for performance:\n",
- "\n",
- "- If your rules are too loose, your linkage job may fail.\n",
- "- If they're too tight, you may miss some valid links.\n",
- "\n",
- "This tutorial clarifies what blocking rules are, and how to choose good rules.\n",
- "\n",
- "## Blocking rules in Splink\n",
- "\n",
- "In Splink, blocking rules are specified as SQL expressions.\n",
- "\n",
- "For example, to generate the subset of record comparisons where the first name and surname matches, we can specify the following blocking rule:\n",
- "\n",
- "```python\n",
- "from splink import block_on\n",
- "block_on(\"first_name\", \"surname\")\n",
- "```\n",
- "\n",
- "When executed, this blocking rule will be converted to a SQL statement with the following form:\n",
- "\n",
- "```sql\n",
- "SELECT ...\n",
- "FROM input_tables as l\n",
- "INNER JOIN input_tables as r\n",
- "ON l.first_name = r.first_name AND l.surname = r.surname\n",
- "```\n",
- "\n",
- "Since blocking rules are SQL expressions, they can be arbitrarily complex. For example, you could create record comparisons where the initial of the first name and the surname match with the following rule:\n",
- "\n",
- "```python\n",
- "from splink import block_on\n",
- "block_on(\"substr(first_name, 1, 2)\", \"surname\")\n",
- "```\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Devising effective blocking rules for prediction\n",
- "\n",
- "The aims of your blocking rules are twofold:\n",
- "\n",
- "1. Eliminate enough non-matching comparison pairs so your record linkage job is small enough to compute\n",
- "2. Eliminate as few truly matching pairs as possible (ideally none)\n",
- "\n",
- "It is usually impossible to find a single blocking rule which achieves both aims, so we recommend using multiple blocking rules.\n",
- "\n",
- "When we specify multiple blocking rules, Splink will generate all comparison pairs that meet any one of the rules.\n",
- "\n",
- "For example, consider the following blocking rule:\n",
- "\n",
- "`block_on(\"first_name\", \"dob\")`\n",
- "\n",
- "This rule is likely to be effective in reducing the number of comparison pairs. It will retain all truly matching pairs, except those with errors or nulls in either the `first_name` or `dob` fields.\n",
- "\n",
- "Now consider a second blocking rule:\n",
- "\n",
- "`block_on(\"email\")`.\n",
- "\n",
- "This will retain all truly matching pairs, except those with errors or nulls in the `email` column.\n",
- "\n",
- "Individually, these blocking rules are problematic because they exclude true matches where the records contain typos of certain types. But between them, they might do quite a good job.\n",
- "\n",
- "For a true match to be eliminated by the use of these two blocking rules, it would have to have an error in _both_ `email` AND (`first_name` or `dob`).\n",
- "\n",
- "This is not completely implausible, but it is significantly less likely than if we'd used a single rule.\n",
- "\n",
- "More generally, we can often specify multiple blocking rules such that it becomes highly implausible that a true match would not meet at least one of these blocking criteria. This is the recommended approach in Splink. Generally we would recommend between about 3 and 10, though even more is possible.\n",
- "\n",
- "The question then becomes how to choose what to put in this list.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Splink tools to help choose your blocking rules\n",
- "\n",
- "Splink contains a number of tools to help you choose effective blocking rules. Let's try them out, using our small test dataset:\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:01.481605Z",
- "iopub.status.busy": "2024-07-17T08:01:01.481304Z",
- "iopub.status.idle": "2024-07-17T08:01:01.500325Z",
- "shell.execute_reply": "2024-07-17T08:01:01.499540Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:01.507471Z",
- "iopub.status.busy": "2024-07-17T08:01:01.507076Z",
- "iopub.status.idle": "2024-07-17T08:01:04.027392Z",
- "shell.execute_reply": "2024-07-17T08:01:04.026677Z"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "from splink import DuckDBAPI, block_on, splink_datasets\n",
- "\n",
- "df = splink_datasets.fake_1000"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Counting the number of comparisons created by a single blocking rule\n",
- "\n",
- "On large datasets, some blocking rules imply the creation of trillions of record comparisons, which would cause a linkage job to fail.\n",
- "\n",
- "Before using a blocking rule in a linkage job, it's therefore a good idea to count the number of records it generates to ensure it is not too loose:\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Choosing blocking rules to optimise runtime\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
+ "To link records, we need to compare pairs of records and decide which pairs are matches.\n",
+ "\n",
+ "For example consider the following two records:\n",
+ "\n",
+ "| first_name | surname | dob | city | email |\n",
+ "| ---------- | ------- | ---------- | ------ | ------------------- |\n",
+ "| Robert | Allen | 1971-05-24 | nan | roberta25@smith.net |\n",
+ "| Rob | Allen | 1971-06-24 | London | roberta25@smith.net |\n",
+ "\n",
+ "These can be represented as a pairwise comparison as follows:\n",
+ "\n",
+ "| first_name_l | first_name_r | surname_l | surname_r | dob_l | dob_r | city_l | city_r | email_l | email_r |\n",
+ "| ------------ | ------------ | --------- | --------- | ---------- | ---------- | ------ | ------ | ------------------- | ------------------- |\n",
+ "| Robert | Rob | Allen | Allen | 1971-05-24 | 1971-06-24 | nan | London | roberta25@smith.net | roberta25@smith.net |\n",
+ "\n",
+ "For most large datasets, it is computationally intractable to compare every row with every other row, since the number of comparisons rises quadratically with the number of records.\n",
+ "\n",
+ "Instead we rely on blocking rules, which specify which pairwise comparisons to generate. For example, we could generate the subset of pairwise comparisons where either first name or surname matches.\n",
+ "\n",
+ "This is part of a two step process to link data:\n",
+ "\n",
+ "1. Use blocking rules to generate candidate pairwise record comparisons\n",
+ "\n",
+ "2. Use a probabilistic linkage model to score these candidate pairs, to determine which ones should be linked\n",
+ "\n",
+ "**Blocking rules are the most important determinant of the performance of your linkage job**.\n",
+ "\n",
+ "When deciding on your blocking rules, you're trading off accuracy for performance:\n",
+ "\n",
+ "- If your rules are too loose, your linkage job may fail.\n",
+ "- If they're too tight, you may miss some valid links.\n",
+ "\n",
+ "This tutorial clarifies what blocking rules are, and how to choose good rules.\n",
+ "\n",
+ "## Blocking rules in Splink\n",
+ "\n",
+ "In Splink, blocking rules are specified as SQL expressions.\n",
+ "\n",
+ "For example, to generate the subset of record comparisons where the first name and surname matches, we can specify the following blocking rule:\n",
+ "\n",
+ "```python\n",
+ "from splink import block_on\n",
+ "block_on(\"first_name\", \"surname\")\n",
+ "```\n",
+ "\n",
+ "When executed, this blocking rule will be converted to a SQL statement with the following form:\n",
+ "\n",
+ "```sql\n",
+ "SELECT ...\n",
+ "FROM input_tables as l\n",
+ "INNER JOIN input_tables as r\n",
+ "ON l.first_name = r.first_name AND l.surname = r.surname\n",
+ "```\n",
+ "\n",
+ "Since blocking rules are SQL expressions, they can be arbitrarily complex. For example, you could create record comparisons where the initial of the first name and the surname match with the following rule:\n",
+ "\n",
+ "```python\n",
+ "from splink import block_on\n",
+ "block_on(\"substr(first_name, 1, 1)\", \"surname\")\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Devising effective blocking rules for prediction\n",
+ "\n",
+ "The aims of your blocking rules are twofold:\n",
+ "\n",
+ "1. Eliminate enough non-matching comparison pairs so your record linkage job is small enough to compute\n",
+ "2. Eliminate as few truly matching pairs as possible (ideally none)\n",
+ "\n",
+ "It is usually impossible to find a single blocking rule which achieves both aims, so we recommend using multiple blocking rules.\n",
+ "\n",
+ "When we specify multiple blocking rules, Splink will generate all comparison pairs that meet any one of the rules.\n",
+ "\n",
+ "For example, consider the following blocking rule:\n",
+ "\n",
+ "`block_on(\"first_name\", \"dob\")`\n",
+ "\n",
+ "This rule is likely to be effective in reducing the number of comparison pairs. It will retain all truly matching pairs, except those with errors or nulls in either the `first_name` or `dob` fields.\n",
+ "\n",
+ "Now consider a second blocking rule:\n",
+ "\n",
+ "`block_on(\"email\")`.\n",
+ "\n",
+ "This will retain all truly matching pairs, except those with errors or nulls in the `email` column.\n",
+ "\n",
+ "Individually, these blocking rules are problematic because they exclude true matches where the records contain typos of certain types. But between them, they might do quite a good job.\n",
+ "\n",
+ "For a true match to be eliminated by the use of these two blocking rules, it would have to have an error in _both_ `email` AND (`first_name` or `dob`).\n",
+ "\n",
+ "This is not completely implausible, but it is significantly less likely than if we'd used a single rule.\n",
+ "\n",
+ "More generally, we can often specify multiple blocking rules such that it becomes highly implausible that a true match would not meet at least one of these blocking criteria. This is the recommended approach in Splink. Generally we would recommend between about 3 and 10, though even more is possible.\n",
+ "\n",
+ "The question then becomes how to choose what to put in this list.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Splink tools to help choose your blocking rules\n",
+ "\n",
+ "Splink contains a number of tools to help you choose effective blocking rules. Let's try them out, using our small test dataset:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:01.481605Z",
+ "iopub.status.busy": "2024-07-17T08:01:01.481304Z",
+ "iopub.status.idle": "2024-07-17T08:01:01.500325Z",
+ "shell.execute_reply": "2024-07-17T08:01:01.499540Z"
},
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:04.031221Z",
- "iopub.status.busy": "2024-07-17T08:01:04.030942Z",
- "iopub.status.idle": "2024-07-17T08:01:04.225615Z",
- "shell.execute_reply": "2024-07-17T08:01:04.224859Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'number_of_comparisons_generated_pre_filter_conditions': 1632,\n",
- " 'number_of_comparisons_to_be_scored_post_filter_conditions': 473,\n",
- " 'filter_conditions_identified': '',\n",
- " 'equi_join_conditions_identified': 'SUBSTR(l.first_name, 1, 1) = SUBSTR(r.first_name, 1, 1) AND l.\"surname\" = r.\"surname\"',\n",
- " 'link_type_join_condition': 'where l.\"unique_id\" < r.\"unique_id\"'}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from splink.blocking_analysis import count_comparisons_from_blocking_rule\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "\n",
- "br = block_on(\"substr(first_name, 1,1)\", \"surname\")\n",
- "\n",
- "counts = count_comparisons_from_blocking_rule(\n",
- " table_or_tables=df,\n",
- " blocking_rule=br,\n",
- " link_type=\"dedupe_only\",\n",
- " db_api=db_api,\n",
- ")\n",
- "\n",
- "counts"
- ]
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:01.507471Z",
+ "iopub.status.busy": "2024-07-17T08:01:01.507076Z",
+ "iopub.status.idle": "2024-07-17T08:01:04.027392Z",
+ "shell.execute_reply": "2024-07-17T08:01:04.026677Z"
},
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:04.261760Z",
- "iopub.status.busy": "2024-07-17T08:01:04.261491Z",
- "iopub.status.idle": "2024-07-17T08:01:04.314843Z",
- "shell.execute_reply": "2024-07-17T08:01:04.314299Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'number_of_comparisons_generated_pre_filter_conditions': 4827,\n",
- " 'number_of_comparisons_to_be_scored_post_filter_conditions': 372,\n",
- " 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2',\n",
- " 'equi_join_conditions_identified': 'l.first_name = r.first_name',\n",
- " 'link_type_join_condition': 'where l.\"unique_id\" < r.\"unique_id\"'}"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "br = \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\"\n",
- "\n",
- "counts = count_comparisons_from_blocking_rule(\n",
- " table_or_tables=df,\n",
- " blocking_rule= br,\n",
- " link_type=\"dedupe_only\",\n",
- " db_api=db_api,\n",
- ")\n",
- "counts"
- ]
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from splink import DuckDBAPI, block_on, splink_datasets\n",
+ "\n",
+ "df = splink_datasets.fake_1000"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Counting the number of comparisons created by a single blocking rule\n",
+ "\n",
+ "On large datasets, some blocking rules imply the creation of trillions of record comparisons, which would cause a linkage job to fail.\n",
+ "\n",
+ "Before using a blocking rule in a linkage job, it's therefore a good idea to count the number of comparisons it generates to ensure it is not too loose:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:04.031221Z",
+ "iopub.status.busy": "2024-07-17T08:01:04.030942Z",
+ "iopub.status.idle": "2024-07-17T08:01:04.225615Z",
+ "shell.execute_reply": "2024-07-17T08:01:04.224859Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The maximum number of comparisons that you can compute will be affected by your choice of SQL backend, and how powerful your computer is.\n",
- "\n",
- "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons. For Spark and Athena, try starting with fewer than 100 million comparisons, before scaling up.\n"
+ "data": {
+ "text/plain": [
+ "{'number_of_comparisons_generated_pre_filter_conditions': 1632,\n",
+ " 'number_of_comparisons_to_be_scored_post_filter_conditions': 473,\n",
+ " 'filter_conditions_identified': '',\n",
+ " 'equi_join_conditions_identified': 'SUBSTR(l.first_name, 1, 1) = SUBSTR(r.first_name, 1, 1) AND l.\"surname\" = r.\"surname\"',\n",
+ " 'link_type_join_condition': 'where l.\"unique_id\" < r.\"unique_id\"'}"
]
- },
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.blocking_analysis import count_comparisons_from_blocking_rule\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "\n",
+ "br = block_on(\"substr(first_name, 1,1)\", \"surname\")\n",
+ "\n",
+ "counts = count_comparisons_from_blocking_rule(\n",
+ " df_sdf,\n",
+ " blocking_rule=br,\n",
+ " link_type=\"dedupe_only\",\n",
+ ")\n",
+ "\n",
+ "counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:04.261760Z",
+ "iopub.status.busy": "2024-07-17T08:01:04.261491Z",
+ "iopub.status.idle": "2024-07-17T08:01:04.314843Z",
+ "shell.execute_reply": "2024-07-17T08:01:04.314299Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Finding 'worst offending' values for your blocking rule\n",
- "\n",
- "Blocking rules can be affected by skew: some values of a field may be much more common than others, and this can lead to a disproportionate number of comparisons being generated.\n",
- "\n",
- "It can be useful to identify whether your data is afflicted by this problem. "
+ "data": {
+ "text/plain": [
+ "{'number_of_comparisons_generated_pre_filter_conditions': 4827,\n",
+ " 'number_of_comparisons_to_be_scored_post_filter_conditions': 372,\n",
+ " 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2',\n",
+ " 'equi_join_conditions_identified': 'l.first_name = r.first_name',\n",
+ " 'link_type_join_condition': 'where l.\"unique_id\" < r.\"unique_id\"'}"
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "br = \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\"\n",
+ "\n",
+ "counts = count_comparisons_from_blocking_rule(\n",
+ " df_sdf,\n",
+ " blocking_rule=br,\n",
+ " link_type=\"dedupe_only\",\n",
+ ")\n",
+ "counts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The maximum number of comparisons that you can compute will be affected by your choice of SQL backend, and how powerful your computer is.\n",
+ "\n",
+ "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons. For Spark and Athena, try starting with fewer than 100 million comparisons, before scaling up.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Finding 'worst offending' values for your blocking rule\n",
+ "\n",
+ "Blocking rules can be affected by skew: some values of a field may be much more common than others, and this can lead to a disproportionate number of comparisons being generated.\n",
+ "\n",
+ "It can be useful to identify whether your data is afflicted by this problem. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:04.318472Z",
+ "iopub.status.busy": "2024-07-17T08:01:04.318176Z",
+ "iopub.status.idle": "2024-07-17T08:01:04.364407Z",
+ "shell.execute_reply": "2024-07-17T08:01:04.363784Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:04.318472Z",
- "iopub.status.busy": "2024-07-17T08:01:04.318176Z",
- "iopub.status.idle": "2024-07-17T08:01:04.364407Z",
- "shell.execute_reply": "2024-07-17T08:01:04.363784Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " key_0 | \n",
- " key_1 | \n",
- " count_l | \n",
- " count_r | \n",
- " block_count | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Birmingham | \n",
- " Theodore | \n",
- " 7 | \n",
- " 7 | \n",
- " 49 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " London | \n",
- " Oliver | \n",
- " 7 | \n",
- " 7 | \n",
- " 49 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " London | \n",
- " James | \n",
- " 6 | \n",
- " 6 | \n",
- " 36 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " key_0 key_1 count_l count_r block_count\n",
- "0 Birmingham Theodore 7 7 49\n",
- "1 London Oliver 7 7 49\n",
- "2 London James 6 6 36"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key_0 | \n",
+ " key_1 | \n",
+ " count_l | \n",
+ " count_r | \n",
+ " block_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Birmingham | \n",
+ " Theodore | \n",
+ " 7 | \n",
+ " 7 | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " London | \n",
+ " Oliver | \n",
+ " 7 | \n",
+ " 7 | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " London | \n",
+ " James | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink.blocking_analysis import n_largest_blocks\n",
- "\n",
- "result = n_largest_blocks( table_or_tables=df,\n",
- " blocking_rule= block_on(\"city\", \"first_name\"),\n",
- " link_type=\"dedupe_only\",\n",
- " db_api=db_api,\n",
- " n_largest=3\n",
- " )\n",
- "\n",
- "result.as_pandas_dataframe()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this case, we can see that `Oliver`s in `London` will result in 49 comparisons being generated. This is acceptable on this small dataset, but on a larger dataset, `Oliver`s in `London` could be responsible for many million comparisons."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Counting the number of comparisons created by a list of blocking rules\n",
- "\n",
- "As noted above, it's usually a good idea to use multiple blocking rules. It's therefore useful to know how many record comparisons will be generated when these rules are applied.\n",
- "\n",
- "Since the same record comparison may be created by several blocking rules, and Splink automatically deduplicates these comparisons, we cannot simply total the number of comparisons generated by each rule individually.\n",
- "\n",
- "Splink provides a chart that shows the marginal (additional) comparisons generated by each blocking rule, after deduplication:\n"
+ "text/plain": [
+ " key_0 key_1 count_l count_r block_count\n",
+ "0 Birmingham Theodore 7 7 49\n",
+ "1 London Oliver 7 7 49\n",
+ "2 London James 6 6 36"
]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.blocking_analysis import n_largest_blocks\n",
+ "\n",
+ "result = n_largest_blocks(\n",
+ " df_sdf,\n",
+ " blocking_rule=block_on(\"city\", \"first_name\"),\n",
+ " link_type=\"dedupe_only\",\n",
+ " n_largest=3\n",
+ ")\n",
+ "\n",
+ "result.as_pandas_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this case, we can see that `Oliver`s in `London` will result in 49 comparisons being generated. This is acceptable on this small dataset, but on a larger dataset, `Oliver`s in `London` could be responsible for many million comparisons."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Counting the number of comparisons created by a list of blocking rules\n",
+ "\n",
+ "As noted above, it's usually a good idea to use multiple blocking rules. It's therefore useful to know how many record comparisons will be generated when these rules are applied.\n",
+ "\n",
+ "Since the same record comparison may be created by several blocking rules, and Splink automatically deduplicates these comparisons, we cannot simply total the number of comparisons generated by each rule individually.\n",
+ "\n",
+ "Splink provides a chart that shows the marginal (additional) comparisons generated by each blocking rule, after deduplication:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:04.368088Z",
+ "iopub.status.busy": "2024-07-17T08:01:04.367810Z",
+ "iopub.status.idle": "2024-07-17T08:01:04.642204Z",
+ "shell.execute_reply": "2024-07-17T08:01:04.640945Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:04.368088Z",
- "iopub.status.busy": "2024-07-17T08:01:04.367810Z",
- "iopub.status.idle": "2024-07-17T08:01:04.642204Z",
- "shell.execute_reply": "2024-07-17T08:01:04.640945Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink.blocking_analysis import (\n",
- " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
- ")\n",
- "\n",
- "blocking_rules_for_analysis = [\n",
- " block_on(\"substr(first_name, 1,1)\", \"surname\"),\n",
- " block_on(\"surname\"),\n",
- " block_on(\"email\"),\n",
- " block_on(\"city\", \"first_name\"),\n",
- " \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\",\n",
- "]\n",
- "\n",
- "\n",
- "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
- " table_or_tables=df,\n",
- " blocking_rules=blocking_rules_for_analysis,\n",
- " db_api=db_api,\n",
- " link_type=\"dedupe_only\",\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Digging deeper: Understanding why certain blocking rules create large numbers of comparisons\n",
- "\n",
- "Finally, we can use the `profile_columns` function we saw in the previous tutorial to understand a specific blocking rule in more depth.\n",
- "\n",
- "Suppose we're interested in blocking on city and first initial.\n",
- "\n",
- "Within each distinct value of `(city, first initial)`, all possible pairwise comparisons will be generated.\n",
- "\n",
- "So for instance, if there are 15 distinct records with `London,J` then these records will result in `n(n-1)/2 = 105` pairwise comparisons being generated.\n",
- "\n",
- "In a larger dataset, we might observe 10,000 `London,J` records, which would then be responsible for `49,995,000` comparisons.\n",
- "\n",
- "These high-frequency values therefore have a disproportionate influence on the overall number of pairwise comparisons, and so it can be useful to analyse skew, as follows:\n"
+ "text/plain": [
+ "alt.Chart(...)"
]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.blocking_analysis import (\n",
+ " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+ ")\n",
+ "\n",
+ "blocking_rules_for_analysis = [\n",
+ " block_on(\"substr(first_name, 1,1)\", \"surname\"),\n",
+ " block_on(\"surname\"),\n",
+ " block_on(\"email\"),\n",
+ " block_on(\"city\", \"first_name\"),\n",
+ " \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\",\n",
+ "]\n",
+ "\n",
+ "\n",
+ "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+ " df_sdf,\n",
+ " blocking_rules=blocking_rules_for_analysis,\n",
+ " link_type=\"dedupe_only\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Digging deeper: Understanding why certain blocking rules create large numbers of comparisons\n",
+ "\n",
+ "Finally, we can use the `profile_columns` function we saw in the previous tutorial to understand a specific blocking rule in more depth.\n",
+ "\n",
+ "Suppose we're interested in blocking on city and first initial.\n",
+ "\n",
+ "Within each distinct value of `(city, first initial)`, all possible pairwise comparisons will be generated.\n",
+ "\n",
+ "So for instance, if there are 15 distinct records with `London,J` then these records will result in `n(n-1)/2 = 105` pairwise comparisons being generated.\n",
+ "\n",
+ "In a larger dataset, we might observe 10,000 `London,J` records, which would then be responsible for `49,995,000` comparisons.\n",
+ "\n",
+ "These high-frequency values therefore have a disproportionate influence on the overall number of pairwise comparisons, and so it can be useful to analyse skew, as follows:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-17T08:01:04.645665Z",
+ "iopub.status.busy": "2024-07-17T08:01:04.645388Z",
+ "iopub.status.idle": "2024-07-17T08:01:04.857248Z",
+ "shell.execute_reply": "2024-07-17T08:01:04.856730Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-17T08:01:04.645665Z",
- "iopub.status.busy": "2024-07-17T08:01:04.645388Z",
- "iopub.status.idle": "2024-07-17T08:01:04.857248Z",
- "shell.execute_reply": "2024-07-17T08:01:04.856730Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "from splink.exploratory import profile_columns\n",
- "\n",
- "profile_columns(df, column_expressions=[\"city || left(first_name,1)\"], db_api=db_api)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "!!! note \"Further Reading\"\n",
- " :simple-readme: For a deeper dive on blocking, please refer to the [Blocking Topic Guides](../../topic_guides/blocking/blocking_rules.md).\n",
- "\n",
- " :material-tools: For more on the blocking tools in Splink, please refer to the [Blocking API documentation](../../api_docs/blocking.md).\n",
- "\n",
- " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#blocking).\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Next steps\n",
- "\n",
- "Now we have chosen which records to compare, we can use those records to train a linkage model.\n"
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "from splink.exploratory import profile_columns\n",
+ "\n",
+ "profile_columns(df_sdf, column_expressions=[\"city || left(first_name,1)\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "!!! note \"Further Reading\"\n",
+ " :simple-readme: For a deeper dive on blocking, please refer to the [Blocking Topic Guides](../../topic_guides/blocking/blocking_rules.md).\n",
+ "\n",
+ " :material-tools: For more on the blocking tools in Splink, please refer to the [Blocking API documentation](../../api_docs/blocking.md).\n",
+ "\n",
+ " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#blocking).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "Now we have chosen which records to compare, we can use those records to train a linkage model.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/demos/tutorials/04_Estimating_model_parameters.ipynb b/docs/demos/tutorials/04_Estimating_model_parameters.ipynb
index 3cc83bd09e..30c9c5406d 100644
--- a/docs/demos/tutorials/04_Estimating_model_parameters.ipynb
+++ b/docs/demos/tutorials/04_Estimating_model_parameters.ipynb
@@ -1,1342 +1,1344 @@
{
- "cells": [
+ "cells": [
+ {
+ "cell_type": "raw",
+ "id": "98e133ed-7f91-4e11-9f6b-abde9a67f980",
+ "metadata": {},
+ "source": [
+ "# Specifying and estimating a linkage model\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
+ "In the last tutorial we looked at how we can use blocking rules to generate pairwise record comparisons.\n",
+ "\n",
+ "Now it's time to estimate a probabilistic linkage model to score each of these comparisons. The resultant match score is a prediction of whether the two records represent the same entity (e.g. are the same person).\n",
+ "\n",
+ "The purpose of estimating the model is to learn the relative importance of different parts of your data for the purpose of data linking.\n",
+ "\n",
+ "For example, a match on date of birth is a much stronger indicator that two records refer to the same entity than a match on gender. A mismatch on gender may be a stronger indication against two records referring to the same entity than a mismatch on name, since names are more likely to be entered differently.\n",
+ "\n",
+ "The relative importance of different information is captured in the (partial) 'match weights', which can be learned from your data. These match weights are then added up to compute the overall match score.\n",
+ "\n",
+ "The match weights are derived from the `m` and `u` parameters of the underlying Fellegi-Sunter model. Splink uses various statistical routines to estimate these parameters. Further details of the underlying theory can be found [here](https://www.robinlinacre.com/intro_to_probabilistic_linkage/), which will help you understand this part of the tutorial.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f104340",
+ "metadata": {},
+ "source": [
+ "## Specifying a linkage model\n",
+ "\n",
+ "To build a linkage model, the user defines the partial match weights that `splink` needs to estimate. This is done by defining how the information in the input records should be compared.\n",
+ "\n",
+ "To be concrete, here is an example comparison:\n",
+ "\n",
+ "| first_name_l | first_name_r | surname_l | surname_r | dob_l | dob_r | city_l | city_r | email_l | email_r |\n",
+ "| ------------ | ------------ | --------- | --------- | ---------- | ---------- | ------ | ------ | ------------------- | ------------------- |\n",
+ "| Robert | Rob | Allen | Allen | 1971-05-24 | 1971-06-24 | nan | London | roberta25@smith.net | roberta25@smith.net |\n",
+ "\n",
+ "What functions should we use to assess the similarity of `Rob` vs. `Robert` in the `first_name` field?\n",
+ "\n",
+ "Should similarity in the `dob` field be computed in the same way, or a different way?\n",
+ "\n",
+ "Your job as the developer of a linkage model is to decide what comparisons are most appropriate for the types of data you have.\n",
+ "\n",
+ "Splink can then estimate how much weight to place on a fuzzy match of `Rob` vs. `Robert`, relative to an exact match on `Robert`, or a non-match.\n",
+ "\n",
+ "Defining these scenarios is done using `Comparison`s.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8a520392",
+ "metadata": {},
+ "source": [
+ "### Comparisons\n",
+ "\n",
+ "The concept of a `Comparison` has a specific definition within Splink: it defines how data from one or more input columns is compared.\n",
+ "\n",
+ "For example, one `Comparison` may represent how similarity is assessed for a person's date of birth.\n",
+ "\n",
+ "Another `Comparison` may represent the comparison of a person's name or location.\n",
+ "\n",
+ "A model is composed of many `Comparison`s, which between them assess the similarity of all of the columns being used for data linking.\n",
+ "\n",
+ "Each `Comparison` contains two or more `ComparisonLevels` which define _n_ discrete gradations of similarity between the input columns within the Comparison.\n",
+ "\n",
+ "As such, `ComparisonLevels` are nested within `Comparisons` as follows:\n",
+ "\n",
+ "```\n",
+ "Data Linking Model\n",
+ "├─-- Comparison: Date of birth\n",
+ "│ ├─-- ComparisonLevel: Exact match\n",
+ "│ ├─-- ComparisonLevel: One character difference\n",
+ "│ ├─-- ComparisonLevel: All other\n",
+ "├─-- Comparison: Surname\n",
+ "│ ├─-- ComparisonLevel: Exact match on surname\n",
+ "│ ├─-- ComparisonLevel: All other\n",
+ "│ etc.\n",
+ "```\n",
+ "\n",
+ "Our example data would therefore result in the following comparisons, for `dob` and `surname`:\n",
+ "\n",
+ "| dob_l | dob_r | comparison_level | interpretation |\n",
+ "| ---------- | ---------- | ------------------------ | -------------- |\n",
+ "| 1971-05-24 | 1971-05-24 | Exact match | great match |\n",
+ "| 1971-05-24 | 1971-06-24 | One character difference | fuzzy match |\n",
+ "| 1971-05-24 | 2000-01-02 | All other | bad match |\n",
+ "\n",
+ "
\n",
+ "\n",
+ "| surname_l | surname_r | comparison_level | interpretation |\n",
+ "| --------- | --------- | ---------------- | ----------------------------------------------------- |\n",
+ "| Rob | Rob | Exact match | great match |\n",
+ "| Rob | Jane | All other | bad match |\n",
+ "| Rob | Robert | All other | bad match, this comparison has no notion of nicknames |\n",
+ "\n",
+ "More information about specifying comparisons can be found [here](../../topic_guides/comparisons/customising_comparisons.ipynb) and [here](../../topic_guides/comparisons/comparisons_and_comparison_levels.md).\n",
+ "\n",
+ "We will now use these concepts to build a data linking model.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9ceef6f1",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:21.934832Z",
+ "iopub.status.busy": "2024-07-11T13:34:21.934445Z",
+ "iopub.status.idle": "2024-07-11T13:34:21.956453Z",
+ "shell.execute_reply": "2024-07-11T13:34:21.955605Z"
+ },
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "aa6a9e30",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:21.961236Z",
+ "iopub.status.busy": "2024-07-11T13:34:21.960890Z",
+ "iopub.status.idle": "2024-07-11T13:34:23.786243Z",
+ "shell.execute_reply": "2024-07-11T13:34:23.785015Z"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Begin by reading in the tutorial data again\n",
+ "from splink import splink_datasets\n",
+ "\n",
+ "df = splink_datasets.fake_1000"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02000a24",
+ "metadata": {},
+ "source": [
+ "### Specifying the model using comparisons\n",
+ "\n",
+ "Splink includes a library of comparison functions at `splink.comparison_library` to make it simple to get started. These are split into two categories:\n",
+ "\n",
+ "1. Generic `Comparison` functions which apply a particular fuzzy matching function. For example, levenshtein distance.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4b7159fb",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:23.790520Z",
+ "iopub.status.busy": "2024-07-11T13:34:23.790213Z",
+ "iopub.status.idle": "2024-07-11T13:34:23.818960Z",
+ "shell.execute_reply": "2024-07-11T13:34:23.818252Z"
+ },
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "raw",
- "id": "98e133ed-7f91-4e11-9f6b-abde9a67f980",
- "metadata": {},
- "source": [
- "# Specifying and estimating a linkage model\n",
- "\n",
- "\n",
- "
\n",
- "\n",
- "\n",
- "In the last tutorial we looked at how we can use blocking rules to generate pairwise record comparisons.\n",
- "\n",
- "Now it's time to estimate a probabilistic linkage model to score each of these comparisons. The resultant match score is a prediction of whether the two records represent the same entity (e.g. are the same person).\n",
- "\n",
- "The purpose of estimating the model is to learn the relative importance of different parts of your data for the purpose of data linking.\n",
- "\n",
- "For example, a match on date of birth is a much stronger indicator that two records refer to the same entity than a match on gender. A mismatch on gender may be a stronger indicate against two records referring than a mismatch on name, since names are more likely to be entered differently.\n",
- "\n",
- "The relative importance of different information is captured in the (partial) 'match weights', which can be learned from your data. These match weights are then added up to compute the overall match score.\n",
- "\n",
- "The match weights are are derived from the `m` and `u` parameters of the underlying Fellegi Sunter model. Splink uses various statistical routines to estimate these parameters. Further details of the underlying theory can be found [here](https://www.robinlinacre.com/intro_to_probabilistic_linkage/), which will help you understand this part of the tutorial.\n"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'LevenshteinAtThresholds' of \"city\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'city is NULL' with SQL rule: \"city_l\" IS NULL OR \"city_r\" IS NULL\n",
+ " - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n",
+ " - 'Levenshtein distance of city <= 2' with SQL rule: levenshtein(\"city_l\", \"city_r\") <= 2\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import splink.comparison_library as cl\n",
+ "\n",
+ "city_comparison = cl.LevenshteinAtThresholds(\"city\", 2)\n",
+ "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0a6cc8b",
+ "metadata": {},
+ "source": [
+ "2. `Comparison` functions tailored for specific data types. For example, email."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bd6143e7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:23.822972Z",
+ "iopub.status.busy": "2024-07-11T13:34:23.822670Z",
+ "iopub.status.idle": "2024-07-11T13:34:23.844781Z",
+ "shell.execute_reply": "2024-07-11T13:34:23.844138Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "0f104340",
- "metadata": {},
- "source": [
- "## Specifying a linkage model\n",
- "\n",
- "To build a linkage model, the user defines the partial match weights that `splink` needs to estimate. This is done by defining how the information in the input records should be compared.\n",
- "\n",
- "To be concrete, here is an example comparison:\n",
- "\n",
- "| first_name_l | first_name_r | surname_l | surname_r | dob_l | dob_r | city_l | city_r | email_l | email_r |\n",
- "| ------------ | ------------ | --------- | --------- | ---------- | ---------- | ------ | ------ | ------------------- | ------------------- |\n",
- "| Robert | Rob | Allen | Allen | 1971-05-24 | 1971-06-24 | nan | London | roberta25@smith.net | roberta25@smith.net |\n",
- "\n",
- "What functions should we use to assess the similarity of `Rob` vs. `Robert` in the the `first_name` field?\n",
- "\n",
- "Should similarity in the `dob` field be computed in the same way, or a different way?\n",
- "\n",
- "Your job as the developer of a linkage model is to decide what comparisons are most appropriate for the types of data you have.\n",
- "\n",
- "Splink can then estimate how much weight to place on a fuzzy match of `Rob` vs. `Robert`, relative to an exact match on `Robert`, or a non-match.\n",
- "\n",
- "Defining these scenarios is done using `Comparison`s.\n"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Comparison 'EmailComparison' of \"email\".\n",
+ "Similarity is assessed using the following ComparisonLevels:\n",
+ " - 'email is NULL' with SQL rule: \"email_l\" IS NULL OR \"email_r\" IS NULL\n",
+ " - 'Exact match on email' with SQL rule: \"email_l\" = \"email_r\"\n",
+ " - 'Exact match on username' with SQL rule: NULLIF(regexp_extract(\"email_l\", '^[^@]+', 0), '') = NULLIF(regexp_extract(\"email_r\", '^[^@]+', 0), '')\n",
+ " - 'Jaro-Winkler distance of email >= 0.88' with SQL rule: jaro_winkler_similarity(\"email_l\", \"email_r\") >= 0.88\n",
+ " - 'Jaro-Winkler >0.88 on username' with SQL rule: jaro_winkler_similarity(NULLIF(regexp_extract(\"email_l\", '^[^@]+', 0), ''), NULLIF(regexp_extract(\"email_r\", '^[^@]+', 0), '')) >= 0.88\n",
+ " - 'All other comparisons' with SQL rule: ELSE\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "email_comparison = cl.EmailComparison(\"email\")\n",
+ "print(email_comparison.get_comparison(\"duckdb\").human_readable_description)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47b7677a",
+ "metadata": {},
+ "source": [
+ "## Specifying the full settings dictionary\n",
+ "\n",
+ "`Comparisons` are specified as part of the Splink `settings`, a Python dictionary which controls all of the configuration of a Splink model:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0fa0611a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:23.848567Z",
+ "iopub.status.busy": "2024-07-11T13:34:23.848319Z",
+ "iopub.status.idle": "2024-07-11T13:34:24.152927Z",
+ "shell.execute_reply": "2024-07-11T13:34:24.152375Z"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from splink import Linker, SettingsCreator, block_on, DuckDBAPI\n",
+ "\n",
+ "settings = SettingsCreator(\n",
+ " link_type=\"dedupe_only\",\n",
+ " comparisons=[\n",
+ " cl.NameComparison(\"first_name\"),\n",
+ " cl.NameComparison(\"surname\"),\n",
+ " cl.LevenshteinAtThresholds(\"dob\", 1),\n",
+ " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
+ " cl.EmailComparison(\"email\"),\n",
+ " ],\n",
+ " blocking_rules_to_generate_predictions=[\n",
+ " block_on(\"first_name\", \"city\"),\n",
+ " block_on(\"surname\"),\n",
+ "\n",
+ " ],\n",
+ " retain_intermediate_calculation_columns=True,\n",
+ ")\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df_sdf = db_api.register(df)\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "657a1fb8",
+ "metadata": {},
+ "source": [
+ "In words, this settings dictionary says:\n",
+ "\n",
+ "- We are performing a `dedupe_only` (the other options are `link_only`, or `link_and_dedupe`, which may be used if there are multiple input datasets).\n",
+ "- When comparing records, we will use information from the `first_name`, `surname`, `dob`, `city` and `email` columns to compute a match score.\n",
+ "- The `blocking_rules_to_generate_predictions` states that we will only check for duplicates amongst records where either the `first_name AND city` or `surname` is identical.\n",
+ "- We have enabled [term frequency adjustments](https://moj-analytical-services.github.io/splink/topic_guides/comparisons/term-frequency.html) for the 'city' column, because some values (e.g. `London`) appear much more frequently than others.\n",
+ "- We have set `retain_intermediate_calculation_columns` to `True` so that Splink outputs additional information that helps the user understand the calculations. If it were `False`, the computations would run faster.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afa31386",
+ "metadata": {},
+ "source": [
+ "## Estimate the parameters of the model\n",
+ "\n",
+ "Now that we have specified our linkage model, we need to estimate the [`probability_two_random_records_match`](../../api_docs/settings_dict_guide.md#probability_two_random_records_match), `u`, and `m` parameters.\n",
+ "\n",
+ "- The `probability_two_random_records_match` parameter is the probability that two records taken at random from your input data represent a match (typically a very small number).\n",
+ "\n",
+ "- The `u` values are the proportion of records falling into each `ComparisonLevel` amongst truly _non-matching_ records.\n",
+ "\n",
+ "- The `m` values are the proportion of records falling into each `ComparisonLevel` amongst truly _matching_ records.\n",
+ "\n",
+ "You can read more about [the theory of what these mean](https://www.robinlinacre.com/m_and_u_values/).\n",
+ "\n",
+ "We can estimate these parameters using unlabeled data. If we have labels, then we can estimate them even more accurately.\n",
+ "\n",
+ "The rationale for the approach recommended in this tutorial is documented [here](../../topic_guides/training/training_rationale.md).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2871ac6",
+ "metadata": {},
+ "source": [
+ "### Estimation of `probability_two_random_records_match`\n",
+ "\n",
+ "In some cases, the `probability_two_random_records_match` will be known. For example, if you are linking two tables of 10,000 records and expect a one-to-one match, then you should set this value to `1/10_000` in your settings instead of estimating it.\n",
+ "\n",
+ "More generally, this parameter is unknown and needs to be estimated.\n",
+ "\n",
+ "It can be estimated accurately enough for most purposes by combining a series of deterministic matching rules and a guess of the recall corresponding to those rules. For further details of the rationale behind this approach see [here](https://github.com/moj-analytical-services/splink/issues/462#issuecomment-1227027995).\n",
+ "\n",
+ "In this example, I guess that the following deterministic matching rules have a recall of about 70%. That means, between them, the rules recover 70% of all true matches.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "cbf92120",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:24.156645Z",
+ "iopub.status.busy": "2024-07-11T13:34:24.156410Z",
+ "iopub.status.idle": "2024-07-11T13:34:24.279603Z",
+ "shell.execute_reply": "2024-07-11T13:34:24.279002Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "8a520392",
- "metadata": {},
- "source": [
- "### Comparisons\n",
- "\n",
- "The concept of a `Comparison` has a specific definition within Splink: it defines how data from one or more input columns is compared.\n",
- "\n",
- "For example, one `Comparison` may represent how similarity is assessed for a person's date of birth.\n",
- "\n",
- "Another `Comparison` may represent the comparison of a person's name or location.\n",
- "\n",
- "A model is composed of many `Comparison`s, which between them assess the similarity of all of the columns being used for data linking.\n",
- "\n",
- "Each `Comparison` contains two or more `ComparisonLevels` which define _n_ discrete gradations of similarity between the input columns within the Comparison.\n",
- "\n",
- "As such `ComparisonLevels`are nested within `Comparisons` as follows:\n",
- "\n",
- "```\n",
- "Data Linking Model\n",
- "├─-- Comparison: Date of birth\n",
- "│ ├─-- ComparisonLevel: Exact match\n",
- "│ ├─-- ComparisonLevel: One character difference\n",
- "│ ├─-- ComparisonLevel: All other\n",
- "├─-- Comparison: Surname\n",
- "│ ├─-- ComparisonLevel: Exact match on surname\n",
- "│ ├─-- ComparisonLevel: All other\n",
- "│ etc.\n",
- "```\n",
- "\n",
- "Our example data would therefore result in the following comparisons, for `dob` and `surname`:\n",
- "\n",
- "| dob_l | dob_r | comparison_level | interpretation |\n",
- "| ---------- | ---------- | ------------------------ | -------------- |\n",
- "| 1971-05-24 | 1971-05-24 | Exact match | great match |\n",
- "| 1971-05-24 | 1971-06-24 | One character difference | fuzzy match |\n",
- "| 1971-05-24 | 2000-01-02 | All other | bad match |\n",
- "\n",
- "
\n",
- "\n",
- "| surname_l | surname_r | comparison_level | interpretation |\n",
- "| --------- | --------- | ---------------- | ----------------------------------------------------- |\n",
- "| Rob | Rob | Exact match | great match |\n",
- "| Rob | Jane | All other | bad match |\n",
- "| Rob | Robert | All other | bad match, this comparison has no notion of nicknames |\n",
- "\n",
- "More information about specifying comparisons can be found [here](../../topic_guides/comparisons/customising_comparisons.ipynb) and [here](../../topic_guides/comparisons/comparisons_and_comparison_levels.md).\n",
- "\n",
- "We will now use these concepts to build a data linking model.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Probability two random records match is estimated to be 0.00298.\n",
+ "This means that amongst all possible pairwise record comparisons, one in 335.56 are expected to match. With 499,500 total possible comparisons, we expect a total of around 1,488.57 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "deterministic_rules = [\n",
+ " block_on(\"first_name\", \"dob\"),\n",
+ " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
+ " block_on(\"email\")\n",
+ "]\n",
+ "\n",
+ "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7712860b",
+ "metadata": {},
+ "source": [
+ "### Estimation of `u` probabilities\n",
+ "\n",
+ "Once we have the `probability_two_random_records_match` parameter, we can estimate the `u` probabilities.\n",
+ "\n",
+ "We estimate `u` using the `estimate_u_using_random_sampling` method, which doesn't require any labels.\n",
+ "\n",
+ "It works by sampling random pairs of records, since most of these pairs are going to be non-matches. Over these non-matches we compute the distribution of `ComparisonLevel`s for each `Comparison`.\n",
+ "\n",
+ "For instance, for `gender`, we would find that the gender matches 50% of the time, and mismatches 50% of the time.\n",
+ "\n",
+ "For `dob` on the other hand, we would find that the `dob` matches 1% of the time, has a \"one character difference\" 3% of the time, and everything else happens 96% of the time.\n",
+ "\n",
+ "The larger the random sample, the more accurate the predictions. You control this using the `max_pairs` parameter. For large datasets, we recommend using at least 10 million - but the higher the better and 1 billion is often appropriate for larger datasets.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b8d49e7a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:24.282770Z",
+ "iopub.status.busy": "2024-07-11T13:34:24.282515Z",
+ "iopub.status.idle": "2024-07-11T13:34:24.793159Z",
+ "shell.execute_reply": "2024-07-11T13:34:24.792198Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 1,
- "id": "9ceef6f1",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:21.934832Z",
- "iopub.status.busy": "2024-07-11T13:34:21.934445Z",
- "iopub.status.idle": "2024-07-11T13:34:21.956453Z",
- "shell.execute_reply": "2024-07-11T13:34:21.955605Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 2,
- "id": "aa6a9e30",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:21.961236Z",
- "iopub.status.busy": "2024-07-11T13:34:21.960890Z",
- "iopub.status.idle": "2024-07-11T13:34:23.786243Z",
- "shell.execute_reply": "2024-07-11T13:34:23.785015Z"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Begin by reading in the tutorial data again\n",
- "from splink import splink_datasets\n",
- "\n",
- "df = splink_datasets.fake_1000"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Estimated u probabilities using random sampling\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "02000a24",
- "metadata": {},
- "source": [
- "### Specifying the model using comparisons\n",
- "\n",
- "Splink includes a library of comparison functions at `splink.comparison_library` to make it simple to get started. These are split into two categories:\n",
- "\n",
- "1. Generic `Comparison` functions which apply a particular fuzzy matching function. For example, levenshtein distance.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - dob (no m values are trained).\n",
+ " - city (no m values are trained).\n",
+ " - email (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a73921b7",
+ "metadata": {},
+ "source": [
+ "### Estimation of `m` probabilities\n",
+ "\n",
+ "`m` is the trickiest of the parameters to estimate, because we have to have some idea of what the true matches are.\n",
+ "\n",
+ "If we have labels, we can directly estimate it.\n",
+ "\n",
+ "If we do not have labelled data, the `m` parameters can be estimated using an iterative maximum likelihood approach called Expectation Maximisation.\n",
+ "\n",
+ "#### Estimating directly\n",
+ "\n",
+ "If we have labels, we can estimate `m` directly using the `estimate_m_from_label_column` method of the linker.\n",
+ "\n",
+ "For example, if the entity being matched is persons, and your input dataset(s) contain social security number, this could be used to estimate the m values for the model.\n",
+ "\n",
+ "Note that this column does not need to be fully populated. A common case is where a unique identifier such as social security number is only partially populated.\n",
+ "\n",
+ "For example (in this tutorial we don't have labels, so we're not actually going to use this):\n",
+ "\n",
+ "```python\n",
+ "linker.training.estimate_m_from_label_column(\"social_security_number\")\n",
+ "```\n",
+ "\n",
+ "#### Estimating with Expectation Maximisation\n",
+ "\n",
+ "This algorithm estimates the `m` values by generating pairwise record comparisons, and using them to maximise a likelihood function.\n",
+ "\n",
+ "Each estimation pass requires the user to configure an estimation blocking rule to reduce the number of record comparisons generated to a manageable level.\n",
+ "\n",
+ "In our first estimation pass, we block on `first_name` and `surname`, meaning we will generate all record comparisons that have `first_name` and `surname` exactly equal.\n",
+ "\n",
+ "Recall we are trying to estimate the `m` values of the model, i.e. proportion of records falling into each `ComparisonLevel` amongst truly matching records.\n",
+ "\n",
+ "This means that, in this training session, we cannot produce parameter estimates for the `first_name` or `surname` columns, since they will be equal for all the comparisons we do.\n",
+ "\n",
+ "We can, however, produce parameter estimates for all of the other columns. The output messages produced by Splink confirm this.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "098f0a40",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:24.796381Z",
+ "iopub.status.busy": "2024-07-11T13:34:24.796116Z",
+ "iopub.status.idle": "2024-07-11T13:34:25.542726Z",
+ "shell.execute_reply": "2024-07-11T13:34:25.541946Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 3,
- "id": "4b7159fb",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:23.790520Z",
- "iopub.status.busy": "2024-07-11T13:34:23.790213Z",
- "iopub.status.idle": "2024-07-11T13:34:23.818960Z",
- "shell.execute_reply": "2024-07-11T13:34:23.818252Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Comparison 'LevenshteinAtThresholds' of \"city\".\n",
- "Similarity is assessed using the following ComparisonLevels:\n",
- " - 'city is NULL' with SQL rule: \"city_l\" IS NULL OR \"city_r\" IS NULL\n",
- " - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n",
- " - 'Levenshtein distance of city <= 2' with SQL rule: levenshtein(\"city_l\", \"city_r\") <= 2\n",
- " - 'All other comparisons' with SQL rule: ELSE\n",
- "\n"
- ]
- }
- ],
- "source": [
- "import splink.comparison_library as cl\n",
- "\n",
- "city_comparison = cl.LevenshteinAtThresholds(\"city\", 2)\n",
- "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - dob\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - first_name\n",
+ " - surname\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "f0a6cc8b",
- "metadata": {},
- "source": [
- "2. `Comparison` functions tailored for specific data types. For example, email."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 4,
- "id": "bd6143e7",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:23.822972Z",
- "iopub.status.busy": "2024-07-11T13:34:23.822670Z",
- "iopub.status.idle": "2024-07-11T13:34:23.844781Z",
- "shell.execute_reply": "2024-07-11T13:34:23.844138Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Comparison 'EmailComparison' of \"email\".\n",
- "Similarity is assessed using the following ComparisonLevels:\n",
- " - 'email is NULL' with SQL rule: \"email_l\" IS NULL OR \"email_r\" IS NULL\n",
- " - 'Exact match on email' with SQL rule: \"email_l\" = \"email_r\"\n",
- " - 'Exact match on username' with SQL rule: NULLIF(regexp_extract(\"email_l\", '^[^@]+', 0), '') = NULLIF(regexp_extract(\"email_r\", '^[^@]+', 0), '')\n",
- " - 'Jaro-Winkler distance of email >= 0.88' with SQL rule: jaro_winkler_similarity(\"email_l\", \"email_r\") >= 0.88\n",
- " - 'Jaro-Winkler >0.88 on username' with SQL rule: jaro_winkler_similarity(NULLIF(regexp_extract(\"email_l\", '^[^@]+', 0), ''), NULLIF(regexp_extract(\"email_r\", '^[^@]+', 0), '')) >= 0.88\n",
- " - 'All other comparisons' with SQL rule: ELSE\n",
- "\n"
- ]
- }
- ],
- "source": [
- "email_comparison = cl.EmailComparison(\"email\")\n",
- "print(email_comparison.get_comparison(\"duckdb\").human_readable_description)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:\n",
+ "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
+ "\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "47b7677a",
- "metadata": {},
- "source": [
- "## Specifying the full settings dictionary\n",
- "\n",
- "`Comparisons` are specified as part of the Splink `settings`, a Python dictionary which controls all of the configuration of a Splink model:\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 1: Largest change in params was -0.521 in the m_probability of dob, level `Exact match on dob`\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 5,
- "id": "0fa0611a",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:23.848567Z",
- "iopub.status.busy": "2024-07-11T13:34:23.848319Z",
- "iopub.status.idle": "2024-07-11T13:34:24.152927Z",
- "shell.execute_reply": "2024-07-11T13:34:24.152375Z"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "from splink import Linker, SettingsCreator, block_on, DuckDBAPI\n",
- "\n",
- "settings = SettingsCreator(\n",
- " link_type=\"dedupe_only\",\n",
- " comparisons=[\n",
- " cl.NameComparison(\"first_name\"),\n",
- " cl.NameComparison(\"surname\"),\n",
- " cl.LevenshteinAtThresholds(\"dob\", 1),\n",
- " cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
- " cl.EmailComparison(\"email\"),\n",
- " ],\n",
- " blocking_rules_to_generate_predictions=[\n",
- " block_on(\"first_name\", \"city\"),\n",
- " block_on(\"surname\"),\n",
- "\n",
- " ],\n",
- " retain_intermediate_calculation_columns=True,\n",
- ")\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 2: Largest change in params was 0.0516 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "657a1fb8",
- "metadata": {},
- "source": [
- "In words, this setting dictionary says:\n",
- "\n",
- "- We are performing a `dedupe_only` (the other options are `link_only`, or `link_and_dedupe`, which may be used if there are multiple input datasets).\n",
- "- When comparing records, we will use information from the `first_name`, `surname`, `dob`, `city` and `email` columns to compute a match score.\n",
- "- The `blocking_rules_to_generate_predictions` states that we will only check for duplicates amongst records where either the `first_name AND city` or `surname` is identical.\n",
- "- We have enabled [term frequency adjustments](https://moj-analytical-services.github.io/splink/topic_guides/comparisons/term-frequency.html) for the 'city' column, because some values (e.g. `London`) appear much more frequently than others.\n",
- "- We have set `retain_intermediate_calculation_columns` and `additional_columns_to_retain` to `True` so that Splink outputs additional information that helps the user understand the calculations. If they were `False`, the computations would run faster.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 3: Largest change in params was 0.0183 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "afa31386",
- "metadata": {},
- "source": [
- "## Estimate the parameters of the model\n",
- "\n",
- "Now that we have specified our linkage model, we need to estimate the [`probability_two_random_records_match`](../../api_docs//settings_dict_guide.md#probability_two_random_records_match), `u`, and `m` parameters.\n",
- "\n",
- "- The `probability_two_random_records_match` parameter is the probability that two records taken at random from your input data represent a match (typically a very small number).\n",
- "\n",
- "- The `u` values are the proportion of records falling into each `ComparisonLevel` amongst truly _non-matching_ records.\n",
- "\n",
- "- The `m` values are the proportion of records falling into each `ComparisonLevel` amongst truly _matching_ records\n",
- "\n",
- "You can read more about [the theory of what these mean](https://www.robinlinacre.com/m_and_u_values/).\n",
- "\n",
- "We can estimate these parameters using unlabeled data. If we have labels, then we can estimate them even more accurately.\n",
- "\n",
- "The rationale for the approach recommended in this tutorial is documented [here](../../topic_guides/training/training_rationale.md).\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 4: Largest change in params was 0.00744 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "c2871ac6",
- "metadata": {},
- "source": [
- "### Estimation of `probability_two_random_records_match`\n",
- "\n",
- "In some cases, the `probability_two_random_records_match` will be known. For example, if you are linking two tables of 10,000 records and expect a one-to-one match, then you should set this value to `1/10_000` in your settings instead of estimating it.\n",
- "\n",
- "More generally, this parameter is unknown and needs to be estimated.\n",
- "\n",
- "It can be estimated accurately enough for most purposes by combining a series of deterministic matching rules and a guess of the recall corresponding to those rules. For further details of the rationale behind this appraoch see [here](https://github.com/moj-analytical-services/splink/issues/462#issuecomment-1227027995).\n",
- "\n",
- "In this example, I guess that the following deterministic matching rules have a recall of about 70%. That means, between them, the rules recover 70% of all true matches.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 5: Largest change in params was 0.00349 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 6,
- "id": "cbf92120",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:24.156645Z",
- "iopub.status.busy": "2024-07-11T13:34:24.156410Z",
- "iopub.status.idle": "2024-07-11T13:34:24.279603Z",
- "shell.execute_reply": "2024-07-11T13:34:24.279002Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Probability two random records match is estimated to be 0.00298.\n",
- "This means that amongst all possible pairwise record comparisons, one in 335.56 are expected to match. With 499,500 total possible comparisons, we expect a total of around 1,488.57 matching pairs\n"
- ]
- }
- ],
- "source": [
- "deterministic_rules = [\n",
- " block_on(\"first_name\", \"dob\"),\n",
- " \"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2\",\n",
- " block_on(\"email\")\n",
- "]\n",
- "\n",
- "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 6: Largest change in params was 0.00183 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "7712860b",
- "metadata": {},
- "source": [
- "### Estimation of `u` probabilities\n",
- "\n",
- "Once we have the `probability_two_random_records_match` parameter, we can estimate the `u` probabilities.\n",
- "\n",
- "We estimate `u` using the `estimate_u_using_random_sampling` method, which doesn't require any labels.\n",
- "\n",
- "It works by sampling random pairs of records, since most of these pairs are going to be non-matches. Over these non-matches we compute the distribution of `ComparisonLevel`s for each `Comparison`.\n",
- "\n",
- "For instance, for `gender`, we would find that the the gender matches 50% of the time, and mismatches 50% of the time.\n",
- "\n",
- "For `dob` on the other hand, we would find that the `dob` matches 1% of the time, has a \"one character difference\" 3% of the time, and everything else happens 96% of the time.\n",
- "\n",
- "The larger the random sample, the more accurate the predictions. You control this using the `max_pairs` parameter. For large datasets, we recommend using at least 10 million - but the higher the better and 1 billion is often appropriate for larger datasets.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 7: Largest change in params was 0.00103 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 7,
- "id": "b8d49e7a",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:24.282770Z",
- "iopub.status.busy": "2024-07-11T13:34:24.282515Z",
- "iopub.status.idle": "2024-07-11T13:34:24.793159Z",
- "shell.execute_reply": "2024-07-11T13:34:24.792198Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "----- Estimating u probabilities using random sampling -----\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "Estimated u probabilities using random sampling\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - dob (no m values are trained).\n",
- " - city (no m values are trained).\n",
- " - email (no m values are trained).\n"
- ]
- }
- ],
- "source": [
- "linker.training.estimate_u_using_random_sampling(max_pairs=1e6)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 8: Largest change in params was 0.000607 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "a73921b7",
- "metadata": {},
- "source": [
- "### Estimation of `m` probabilities\n",
- "\n",
- "`m` is the trickiest of the parameters to estimate, because we have to have some idea of what the true matches are.\n",
- "\n",
- "If we have labels, we can directly estimate it.\n",
- "\n",
- "If we do not have labelled data, the `m` parameters can be estimated using an iterative maximum likelihood approach called Expectation Maximisation.\n",
- "\n",
- "#### Estimating directly\n",
- "\n",
- "If we have labels, we can estimate `m` directly using the `estimate_m_from_label_column` method of the linker.\n",
- "\n",
- "For example, if the entity being matched is persons, and your input dataset(s) contain social security number, this could be used to estimate the m values for the model.\n",
- "\n",
- "Note that this column does not need to be fully populated. A common case is where a unique identifier such as social security number is only partially populated.\n",
- "\n",
- "For example (in this tutorial we don't have labels, so we're not actually going to use this):\n",
- "\n",
- "```python\n",
- "linker.training.estimate_m_from_label_column(\"social_security_number\")\n",
- "```\n",
- "\n",
- "#### Estimating with Expectation Maximisation\n",
- "\n",
- "This algorithm estimates the `m` values by generating pairwise record comparisons, and using them to maximise a likelihood function.\n",
- "\n",
- "Each estimation pass requires the user to configure an estimation blocking rule to reduce the number of record comparisons generated to a manageable level.\n",
- "\n",
- "In our first estimation pass, we block on `first_name` and `surname`, meaning we will generate all record comparisons that have `first_name` and `surname` exactly equal.\n",
- "\n",
- "Recall we are trying to estimate the `m` values of the model, i.e. proportion of records falling into each `ComparisonLevel` amongst truly matching records.\n",
- "\n",
- "This means that, in this training session, we cannot estimate parameter estimates for the `first_name` or `surname` columns, since they will be equal for all the comparisons we do.\n",
- "\n",
- "We can, however, estimate parameter estimates for all of the other columns. The output messages produced by Splink confirm this.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 9: Largest change in params was 0.000367 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 8,
- "id": "098f0a40",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:24.796381Z",
- "iopub.status.busy": "2024-07-11T13:34:24.796116Z",
- "iopub.status.idle": "2024-07-11T13:34:25.542726Z",
- "shell.execute_reply": "2024-07-11T13:34:25.541946Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Estimating the m probabilities of the model by blocking on:\n",
- "(l.\"first_name\" = r.\"first_name\") AND (l.\"surname\" = r.\"surname\")\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - dob\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - first_name\n",
- " - surname\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING:\n",
- "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 1: Largest change in params was -0.521 in the m_probability of dob, level `Exact match on dob`\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 2: Largest change in params was 0.0516 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 3: Largest change in params was 0.0183 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 4: Largest change in params was 0.00744 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 5: Largest change in params was 0.00349 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 6: Largest change in params was 0.00183 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 7: Largest change in params was 0.00103 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 8: Largest change in params was 0.000607 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 9: Largest change in params was 0.000367 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 10: Largest change in params was 0.000226 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 11: Largest change in params was 0.00014 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 12: Largest change in params was 8.73e-05 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "EM converged after 12 iterations\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - first_name (no m values are trained).\n",
- " - surname (no m values are trained).\n",
- " - email (some m values are not trained).\n"
- ]
- }
- ],
- "source": [
- "training_blocking_rule = block_on(\"first_name\", \"surname\")\n",
- "training_session_fname_sname = (\n",
- " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 10: Largest change in params was 0.000226 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "92bd4a31",
- "metadata": {},
- "source": [
- "In a second estimation pass, we block on dob. This allows us to estimate parameters for the `first_name` and `surname` comparisons.\n",
- "\n",
- "Between the two estimation passes, we now have parameter estimates for all comparisons.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 11: Largest change in params was 0.00014 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 9,
- "id": "ac8d3264",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:25.546223Z",
- "iopub.status.busy": "2024-07-11T13:34:25.545981Z",
- "iopub.status.idle": "2024-07-11T13:34:26.550128Z",
- "shell.execute_reply": "2024-07-11T13:34:26.549700Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "----- Starting EM training session -----\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Estimating the m probabilities of the model by blocking on:\n",
- "l.\"dob\" = r.\"dob\"\n",
- "\n",
- "Parameter estimates will be made for the following comparison(s):\n",
- " - first_name\n",
- " - surname\n",
- " - city\n",
- " - email\n",
- "\n",
- "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
- " - dob\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING:\n",
- "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 1: Largest change in params was -0.407 in the m_probability of surname, level `Exact match on surname`\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 2: Largest change in params was 0.0929 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 3: Largest change in params was 0.0548 in the m_probability of first_name, level `All other comparisons`\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 4: Largest change in params was 0.0186 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 5: Largest change in params was 0.00758 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 6: Largest change in params was 0.00339 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 7: Largest change in params was 0.0016 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 8: Largest change in params was 0.000773 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 9: Largest change in params was 0.000379 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 10: Largest change in params was 0.000189 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Iteration 11: Largest change in params was 9.68e-05 in probability_two_random_records_match\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "EM converged after 11 iterations\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "Your model is not yet fully trained. Missing estimates for:\n",
- " - email (some m values are not trained).\n"
- ]
- }
- ],
- "source": [
- "training_blocking_rule = block_on(\"dob\")\n",
- "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
- " training_blocking_rule\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 12: Largest change in params was 8.73e-05 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "efdb0c5f",
- "metadata": {},
- "source": [
- "Note that Splink includes other algorithms for estimating m and u values, which are documented [here](https://moj-analytical-services.github.io/splink/api_docs/training.html).\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "EM converged after 12 iterations\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "38355535",
- "metadata": {},
- "source": [
- "## Visualising model parameters\n",
- "\n",
- "Splink can generate a number of charts to help you understand your model. For an introduction to these charts and how to interpret them, please see [this](https://www.youtube.com/watch?v=msz3T741KQI&t=507s) video.\n",
- "\n",
- "The final estimated match weights can be viewed in the match weights chart:\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 10,
- "id": "3a1e15cc",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:26.554693Z",
- "iopub.status.busy": "2024-07-11T13:34:26.554427Z",
- "iopub.status.idle": "2024-07-11T13:34:26.997140Z",
- "shell.execute_reply": "2024-07-11T13:34:26.996455Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.VConcatChart(...)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.match_weights_chart()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - first_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - email (some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = block_on(\"first_name\", \"surname\")\n",
+ "training_session_fname_sname = (\n",
+ " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "92bd4a31",
+ "metadata": {},
+ "source": [
+ "In a second estimation pass, we block on `dob`. This allows us to estimate parameters for the `first_name` and `surname` comparisons.\n",
+ "\n",
+ "Between the two estimation passes, we now have parameter estimates for all comparisons.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ac8d3264",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:25.546223Z",
+ "iopub.status.busy": "2024-07-11T13:34:25.545981Z",
+ "iopub.status.idle": "2024-07-11T13:34:26.550128Z",
+ "shell.execute_reply": "2024-07-11T13:34:26.549700Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----- Starting EM training session -----\n",
+ "\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 11,
- "id": "8576c042",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:27.001281Z",
- "iopub.status.busy": "2024-07-11T13:34:27.000968Z",
- "iopub.status.idle": "2024-07-11T13:34:27.207585Z",
- "shell.execute_reply": "2024-07-11T13:34:27.206879Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.m_u_parameters_chart()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Estimating the m probabilities of the model by blocking on:\n",
+ "l.\"dob\" = r.\"dob\"\n",
+ "\n",
+ "Parameter estimates will be made for the following comparison(s):\n",
+ " - first_name\n",
+ " - surname\n",
+ " - city\n",
+ " - email\n",
+ "\n",
+ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+ " - dob\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "1f58a85a",
- "metadata": {},
- "source": [
- "We can also compare the estimates that were produced by the different EM training sessions"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 12,
- "id": "e267dc50",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:27.211362Z",
- "iopub.status.busy": "2024-07-11T13:34:27.211072Z",
- "iopub.status.idle": "2024-07-11T13:34:27.378196Z",
- "shell.execute_reply": "2024-07-11T13:34:27.377546Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.parameter_estimate_comparisons_chart()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:\n",
+ "Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value\n",
+ "\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "c44fcc26",
- "metadata": {},
- "source": [
- "### Saving the model\n",
- "\n",
- "We can save the model, including our estimated parameters, to a `.json` file, so we can use it in the next tutorial.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 1: Largest change in params was -0.407 in the m_probability of surname, level `Exact match on surname`\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 13,
- "id": "992703a7",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:27.382784Z",
- "iopub.status.busy": "2024-07-11T13:34:27.382468Z",
- "iopub.status.idle": "2024-07-11T13:34:27.405586Z",
- "shell.execute_reply": "2024-07-11T13:34:27.404875Z"
- }
- },
- "outputs": [],
- "source": [
- "settings = linker.misc.save_model_to_json(\n",
- " \"../demo_settings/saved_model_from_demo.json\", overwrite=True\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 2: Largest change in params was 0.0929 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "d07e6901-110f-449e-9534-8e24bbf1d5fb",
- "metadata": {},
- "source": [
- "## Detecting unlinkable records\n",
- "\n",
- "An interesting application of our trained model that is useful to explore before making any predictions is to detect 'unlinkable' records.\n",
- "\n",
- "Unlinkable records are those which do not contain enough information to be linked. A simple example would be a record containing only 'John Smith', and null in all other fields. This record may link to other records, but we'll never know because there's not enough information to disambiguate any potential links. Unlinkable records can be found by linking records to themselves - if, even when matched to themselves, they don't meet the match threshold score, we can be sure they will never link to anything.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 3: Largest change in params was 0.0548 in the m_probability of first_name, level `All other comparisons`\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 14,
- "id": "b0f17f7c-fa83-41b5-b2da-25ae18e11d81",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-11T13:34:27.409528Z",
- "iopub.status.busy": "2024-07-11T13:34:27.409234Z",
- "iopub.status.idle": "2024-07-11T13:34:28.142003Z",
- "shell.execute_reply": "2024-07-11T13:34:28.140889Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.evaluation.unlinkables_chart()"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 4: Largest change in params was 0.0186 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "ca2570b7-9f68-4fb2-b7a9-527233a8fcd4",
- "metadata": {},
- "source": [
- "In the above chart, we can see that about 1.3% of records in the input dataset are unlinkable at a threshold match weight of 6.11 (correponding to a match probability of around 98.6%)\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 5: Largest change in params was 0.00758 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "83fd8e7f",
- "metadata": {},
- "source": [
- "!!! note \"Further Reading\"\n",
- "\n",
- " :material-tools: For more on the model estimation tools in Splink, please refer to the [Model Training API documentation](../../api_docs/training.md).\n",
- "\n",
- " :simple-readme: For a deeper dive on:\n",
- "\n",
- " * choosing comparisons, please refer to the [Comparisons Topic Guides](../../topic_guides/comparisons/customising_comparisons.ipynb)\n",
- " * the underlying model theory, please refer to the [Fellegi Sunter Topic Guide](../../topic_guides/theory/fellegi_sunter.md)\n",
- " * model training, please refer to the Model Training Topic Guides (Coming Soon).\n",
- "\n",
- " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md).\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 6: Largest change in params was 0.00339 in probability_two_random_records_match\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "fd531cd2",
- "metadata": {},
- "source": [
- "## Next steps\n",
- "\n",
- "Now we have trained a model, we can move on to using it predict matching records.\n"
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 7: Largest change in params was 0.0016 in probability_two_random_records_match\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 8: Largest change in params was 0.000773 in probability_two_random_records_match\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 9: Largest change in params was 0.000379 in probability_two_random_records_match\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 10: Largest change in params was 0.000189 in probability_two_random_records_match\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 11: Largest change in params was 9.68e-05 in probability_two_random_records_match\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "EM converged after 11 iterations\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "m probability not trained for email - Jaro-Winkler >0.88 on username (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - email (some m values are not trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "training_blocking_rule = block_on(\"dob\")\n",
+ "training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(\n",
+ " training_blocking_rule\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "efdb0c5f",
+ "metadata": {},
+ "source": [
+ "Note that Splink includes other algorithms for estimating m and u values, which are documented [here](https://moj-analytical-services.github.io/splink/api_docs/training.html).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38355535",
+ "metadata": {},
+ "source": [
+ "## Visualising model parameters\n",
+ "\n",
+ "Splink can generate a number of charts to help you understand your model. For an introduction to these charts and how to interpret them, please see [this](https://www.youtube.com/watch?v=msz3T741KQI&t=507s) video.\n",
+ "\n",
+ "The final estimated match weights can be viewed in the match weights chart:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "3a1e15cc",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:26.554693Z",
+ "iopub.status.busy": "2024-07-11T13:34:26.554427Z",
+ "iopub.status.idle": "2024-07-11T13:34:26.997140Z",
+ "shell.execute_reply": "2024-07-11T13:34:26.996455Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
+ ],
+ "source": [
+ "linker.visualisations.match_weights_chart()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "8576c042",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:27.001281Z",
+ "iopub.status.busy": "2024-07-11T13:34:27.000968Z",
+ "iopub.status.idle": "2024-07-11T13:34:27.207585Z",
+ "shell.execute_reply": "2024-07-11T13:34:27.206879Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.HConcatChart(...)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.m_u_parameters_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f58a85a",
+ "metadata": {},
+ "source": [
+ "We can also compare the estimates that were produced by the different EM training sessions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "e267dc50",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:27.211362Z",
+ "iopub.status.busy": "2024-07-11T13:34:27.211072Z",
+ "iopub.status.idle": "2024-07-11T13:34:27.378196Z",
+ "shell.execute_reply": "2024-07-11T13:34:27.377546Z"
}
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.parameter_estimate_comparisons_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c44fcc26",
+ "metadata": {},
+ "source": [
+ "### Saving the model\n",
+ "\n",
+ "We can save the model, including our estimated parameters, to a `.json` file, so we can use it in the next tutorial.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "992703a7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:27.382784Z",
+ "iopub.status.busy": "2024-07-11T13:34:27.382468Z",
+ "iopub.status.idle": "2024-07-11T13:34:27.405586Z",
+ "shell.execute_reply": "2024-07-11T13:34:27.404875Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "settings = linker.misc.save_model_to_json(\n",
+ " \"../demo_settings/saved_model_from_demo.json\", overwrite=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d07e6901-110f-449e-9534-8e24bbf1d5fb",
+ "metadata": {},
+ "source": [
+ "## Detecting unlinkable records\n",
+ "\n",
+ "An interesting application of our trained model that is useful to explore before making any predictions is to detect 'unlinkable' records.\n",
+ "\n",
+ "Unlinkable records are those which do not contain enough information to be linked. A simple example would be a record containing only 'John Smith', and null in all other fields. This record may link to other records, but we'll never know because there's not enough information to disambiguate any potential links. Unlinkable records can be found by linking records to themselves - if, even when matched to themselves, they don't meet the match threshold score, we can be sure they will never link to anything.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "b0f17f7c-fa83-41b5-b2da-25ae18e11d81",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-11T13:34:27.409528Z",
+ "iopub.status.busy": "2024-07-11T13:34:27.409234Z",
+ "iopub.status.idle": "2024-07-11T13:34:28.142003Z",
+ "shell.execute_reply": "2024-07-11T13:34:28.140889Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.unlinkables_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca2570b7-9f68-4fb2-b7a9-527233a8fcd4",
+ "metadata": {},
+ "source": [
+ "In the above chart, we can see that about 1.3% of records in the input dataset are unlinkable at a threshold match weight of 6.11 (corresponding to a match probability of around 98.6%)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "83fd8e7f",
+ "metadata": {},
+ "source": [
+ "!!! note \"Further Reading\"\n",
+ "\n",
+ " :material-tools: For more on the model estimation tools in Splink, please refer to the [Model Training API documentation](../../api_docs/training.md).\n",
+ "\n",
+ " :simple-readme: For a deeper dive on:\n",
+ "\n",
+ " * choosing comparisons, please refer to the [Comparisons Topic Guides](../../topic_guides/comparisons/customising_comparisons.ipynb)\n",
+ " * the underlying model theory, please refer to the [Fellegi Sunter Topic Guide](../../topic_guides/theory/fellegi_sunter.md)\n",
+ " * model training, please refer to the Model Training Topic Guides (Coming Soon).\n",
+ "\n",
+ " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd531cd2",
+ "metadata": {},
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "Now we have trained a model, we can move on to using it to predict matching records.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 5
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/docs/demos/tutorials/05_Predicting_results.ipynb b/docs/demos/tutorials/05_Predicting_results.ipynb
index 619549fda8..95b68c153b 100644
--- a/docs/demos/tutorials/05_Predicting_results.ipynb
+++ b/docs/demos/tutorials/05_Predicting_results.ipynb
@@ -1,1031 +1,1032 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "84cca40c",
- "metadata": {},
- "source": [
- "# Predicting which records match\n",
- "\n",
- "\n",
- "
\n",
- "\n",
- "\n",
- "In the previous tutorial, we built and estimated a linkage model.\n",
- "\n",
- "In this tutorial, we will load the estimated model and use it to make predictions of which pairwise record comparisons match.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "9a445f52",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:08:04.047347Z",
- "iopub.status.busy": "2024-06-22T08:08:04.047019Z",
- "iopub.status.idle": "2024-06-22T08:08:04.053987Z",
- "shell.execute_reply": "2024-06-22T08:08:04.053286Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "84cca40c",
+ "metadata": {},
+ "source": [
+ "# Predicting which records match\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
+ "In the previous tutorial, we built and estimated a linkage model.\n",
+ "\n",
+ "In this tutorial, we will load the estimated model and use it to make predictions of which pairwise record comparisons match.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9a445f52",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:08:04.047347Z",
+ "iopub.status.busy": "2024-06-22T08:08:04.047019Z",
+ "iopub.status.idle": "2024-06-22T08:08:04.053987Z",
+ "shell.execute_reply": "2024-06-22T08:08:04.053286Z"
},
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "48f57034",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:08:04.057741Z",
- "iopub.status.busy": "2024-06-22T08:08:04.057465Z",
- "iopub.status.idle": "2024-06-22T08:08:06.111293Z",
- "shell.execute_reply": "2024-06-22T08:08:06.110501Z"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "from splink import Linker, DuckDBAPI, splink_datasets\n",
- "\n",
- "import pandas as pd\n",
- "\n",
- "pd.options.display.max_columns = 1000\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "df = splink_datasets.fake_1000"
- ]
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "48f57034",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:08:04.057741Z",
+ "iopub.status.busy": "2024-06-22T08:08:04.057465Z",
+ "iopub.status.idle": "2024-06-22T08:08:06.111293Z",
+ "shell.execute_reply": "2024-06-22T08:08:06.110501Z"
},
- {
- "cell_type": "markdown",
- "id": "d77b6eb8",
- "metadata": {},
- "source": [
- "## Load estimated model from previous tutorial\n"
- ]
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from splink import Linker, DuckDBAPI, splink_datasets\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "pd.options.display.max_columns = 1000\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df = splink_datasets.fake_1000\n",
+ "df_sdf = db_api.register(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d77b6eb8",
+ "metadata": {},
+ "source": [
+ "## Load estimated model from previous tutorial\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "619553a5",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:08:06.115468Z",
+ "iopub.status.busy": "2024-06-22T08:08:06.115158Z",
+ "iopub.status.idle": "2024-06-22T08:08:06.295408Z",
+ "shell.execute_reply": "2024-06-22T08:08:06.294871Z"
},
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "619553a5",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:08:06.115468Z",
- "iopub.status.busy": "2024-06-22T08:08:06.115158Z",
- "iopub.status.idle": "2024-06-22T08:08:06.295408Z",
- "shell.execute_reply": "2024-06-22T08:08:06.294871Z"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "import json\n",
- "import urllib\n",
- "\n",
- "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink/847e32508b1a9cdd7bcd2ca6c0a74e547fb69865/docs/demos/demo_settings/saved_model_from_demo.json\"\n",
- "\n",
- "with urllib.request.urlopen(url) as u:\n",
- " settings = json.loads(u.read().decode())\n",
- "\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())"
- ]
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import urllib\n",
+ "\n",
+ "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink/847e32508b1a9cdd7bcd2ca6c0a74e547fb69865/docs/demos/demo_settings/saved_model_from_demo.json\"\n",
+ "\n",
+ "with urllib.request.urlopen(url) as u:\n",
+ " settings = json.loads(u.read().decode())\n",
+ "\n",
+ "\n",
+ "linker = Linker(df_sdf, settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c1d97518",
+ "metadata": {},
+ "source": [
+ "# Predicting match weights using the trained model\n",
+ "\n",
+ "We use `linker.inference.predict()` to run the model.\n",
+ "\n",
+ "Under the hood this will:\n",
+ "\n",
+ "- Generate all pairwise record comparisons that match at least one of the `blocking_rules_to_generate_predictions`\n",
+ "\n",
+ "- Use the rules specified in the `Comparisons` to evaluate the similarity of the input data\n",
+ "\n",
+ "- Use the estimated match weights, applying term frequency adjustments where requested to produce the final `match_weight` and `match_probability` scores\n",
+ "\n",
+ "Optionally, a `threshold_match_probability` or `threshold_match_weight` can be provided, which will drop any row where the predicted score is below the threshold.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "ead23f3e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:08:06.298723Z",
+ "iopub.status.busy": "2024-06-22T08:08:06.298474Z",
+ "iopub.status.idle": "2024-06-22T08:08:06.707778Z",
+ "shell.execute_reply": "2024-06-22T08:08:06.707043Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "c1d97518",
- "metadata": {},
- "source": [
- "# Predicting match weights using the trained model\n",
- "\n",
- "We use `linker.inference.predict()` to run the model.\n",
- "\n",
- "Under the hood this will:\n",
- "\n",
- "- Generate all pairwise record comparisons that match at least one of the `blocking_rules_to_generate_predictions`\n",
- "\n",
- "- Use the rules specified in the `Comparisons` to evaluate the similarity of the input data\n",
- "\n",
- "- Use the estimated match weights, applying term frequency adjustments where requested to produce the final `match_weight` and `match_probability` scores\n",
- "\n",
- "Optionally, a `threshold_match_probability` or `threshold_match_weight` can be provided, which will drop any row where the predicted score is below the threshold.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 4,
- "id": "ead23f3e",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:08:06.298723Z",
- "iopub.status.busy": "2024-06-22T08:08:06.298474Z",
- "iopub.status.idle": "2024-06-22T08:08:06.707778Z",
- "shell.execute_reply": "2024-06-22T08:08:06.707043Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " gamma_first_name | \n",
- " tf_first_name_l | \n",
- " tf_first_name_r | \n",
- " bf_first_name | \n",
- " bf_tf_adj_first_name | \n",
- " surname_l | \n",
- " surname_r | \n",
- " gamma_surname | \n",
- " tf_surname_l | \n",
- " tf_surname_r | \n",
- " bf_surname | \n",
- " bf_tf_adj_surname | \n",
- " dob_l | \n",
- " dob_r | \n",
- " gamma_dob | \n",
- " bf_dob | \n",
- " city_l | \n",
- " city_r | \n",
- " gamma_city | \n",
- " tf_city_l | \n",
- " tf_city_r | \n",
- " bf_city | \n",
- " bf_tf_adj_city | \n",
- " email_l | \n",
- " email_r | \n",
- " gamma_email | \n",
- " tf_email_l | \n",
- " tf_email_r | \n",
- " bf_email | \n",
- " bf_tf_adj_email | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " -1.749664 | \n",
- " 0.229211 | \n",
- " 324 | \n",
- " 326 | \n",
- " Kai | \n",
- " Kai | \n",
- " 4 | \n",
- " 0.006017 | \n",
- " 0.006017 | \n",
- " 84.821765 | \n",
- " 0.962892 | \n",
- " None | \n",
- " Turner | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.007326 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2018-12-31 | \n",
- " 2009-11-03 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " London | \n",
- " London | \n",
- " 1 | \n",
- " 0.212792 | \n",
- " 0.212792 | \n",
- " 10.20126 | \n",
- " 0.259162 | \n",
- " k.t50eherand@z.ncom | \n",
- " None | \n",
- " -1 | \n",
- " 0.001267 | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " -1.626076 | \n",
- " 0.244695 | \n",
- " 25 | \n",
- " 27 | \n",
- " Gabriel | \n",
- " None | \n",
- " -1 | \n",
- " 0.001203 | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " Thomas | \n",
- " Thomas | \n",
- " 4 | \n",
- " 0.004884 | \n",
- " 0.004884 | \n",
- " 88.870507 | \n",
- " 1.001222 | \n",
- " 1977-09-13 | \n",
- " 1977-10-17 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " London | \n",
- " London | \n",
- " 1 | \n",
- " 0.212792 | \n",
- " 0.212792 | \n",
- " 10.20126 | \n",
- " 0.259162 | \n",
- " gabriel.t54@nichols.info | \n",
- " None | \n",
- " -1 | \n",
- " 0.002535 | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " -1.551265 | \n",
- " 0.254405 | \n",
- " 626 | \n",
- " 629 | \n",
- " geeorGe | \n",
- " George | \n",
- " 1 | \n",
- " 0.001203 | \n",
- " 0.014440 | \n",
- " 4.176727 | \n",
- " 1.000000 | \n",
- " Davidson | \n",
- " Davidson | \n",
- " 4 | \n",
- " 0.007326 | \n",
- " 0.007326 | \n",
- " 88.870507 | \n",
- " 0.667482 | \n",
- " 1999-05-07 | \n",
- " 2000-05-06 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " Southamptn | \n",
- " None | \n",
- " -1 | \n",
- " 0.001230 | \n",
- " NaN | \n",
- " 1.00000 | \n",
- " 1.000000 | \n",
- " None | \n",
- " gdavidson@johnson-brown.com | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.00507 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " -1.427735 | \n",
- " 0.270985 | \n",
- " 600 | \n",
- " 602 | \n",
- " Toby | \n",
- " Toby | \n",
- " 4 | \n",
- " 0.004813 | \n",
- " 0.004813 | \n",
- " 84.821765 | \n",
- " 1.203614 | \n",
- " None | \n",
- " None | \n",
- " -1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2003-04-23 | \n",
- " 2013-03-21 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " London | \n",
- " London | \n",
- " 1 | \n",
- " 0.212792 | \n",
- " 0.212792 | \n",
- " 10.20126 | \n",
- " 0.259162 | \n",
- " toby.d@menhez.com | \n",
- " None | \n",
- " -1 | \n",
- " 0.001267 | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " -1.427735 | \n",
- " 0.270985 | \n",
- " 599 | \n",
- " 602 | \n",
- " Toby | \n",
- " Toby | \n",
- " 4 | \n",
- " 0.004813 | \n",
- " 0.004813 | \n",
- " 84.821765 | \n",
- " 1.203614 | \n",
- " Haall | \n",
- " None | \n",
- " -1 | \n",
- " 0.001221 | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2003-04-23 | \n",
- " 2013-03-21 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " London | \n",
- " London | \n",
- " 1 | \n",
- " 0.212792 | \n",
- " 0.212792 | \n",
- " 10.20126 | \n",
- " 0.259162 | \n",
- " None | \n",
- " None | \n",
- " -1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
- "0 -1.749664 0.229211 324 326 Kai \n",
- "1 -1.626076 0.244695 25 27 Gabriel \n",
- "2 -1.551265 0.254405 626 629 geeorGe \n",
- "3 -1.427735 0.270985 600 602 Toby \n",
- "4 -1.427735 0.270985 599 602 Toby \n",
- "\n",
- " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
- "0 Kai 4 0.006017 0.006017 \n",
- "1 None -1 0.001203 NaN \n",
- "2 George 1 0.001203 0.014440 \n",
- "3 Toby 4 0.004813 0.004813 \n",
- "4 Toby 4 0.004813 0.004813 \n",
- "\n",
- " bf_first_name bf_tf_adj_first_name surname_l surname_r gamma_surname \\\n",
- "0 84.821765 0.962892 None Turner -1 \n",
- "1 1.000000 1.000000 Thomas Thomas 4 \n",
- "2 4.176727 1.000000 Davidson Davidson 4 \n",
- "3 84.821765 1.203614 None None -1 \n",
- "4 84.821765 1.203614 Haall None -1 \n",
- "\n",
- " tf_surname_l tf_surname_r bf_surname bf_tf_adj_surname dob_l \\\n",
- "0 NaN 0.007326 1.000000 1.000000 2018-12-31 \n",
- "1 0.004884 0.004884 88.870507 1.001222 1977-09-13 \n",
- "2 0.007326 0.007326 88.870507 0.667482 1999-05-07 \n",
- "3 NaN NaN 1.000000 1.000000 2003-04-23 \n",
- "4 0.001221 NaN 1.000000 1.000000 2003-04-23 \n",
- "\n",
- " dob_r gamma_dob bf_dob city_l city_r gamma_city tf_city_l \\\n",
- "0 2009-11-03 0 0.460743 London London 1 0.212792 \n",
- "1 1977-10-17 0 0.460743 London London 1 0.212792 \n",
- "2 2000-05-06 0 0.460743 Southamptn None -1 0.001230 \n",
- "3 2013-03-21 0 0.460743 London London 1 0.212792 \n",
- "4 2013-03-21 0 0.460743 London London 1 0.212792 \n",
- "\n",
- " tf_city_r bf_city bf_tf_adj_city email_l \\\n",
- "0 0.212792 10.20126 0.259162 k.t50eherand@z.ncom \n",
- "1 0.212792 10.20126 0.259162 gabriel.t54@nichols.info \n",
- "2 NaN 1.00000 1.000000 None \n",
- "3 0.212792 10.20126 0.259162 toby.d@menhez.com \n",
- "4 0.212792 10.20126 0.259162 None \n",
- "\n",
- " email_r gamma_email tf_email_l tf_email_r bf_email \\\n",
- "0 None -1 0.001267 NaN 1.0 \n",
- "1 None -1 0.002535 NaN 1.0 \n",
- "2 gdavidson@johnson-brown.com -1 NaN 0.00507 1.0 \n",
- "3 None -1 0.001267 NaN 1.0 \n",
- "4 None -1 NaN NaN 1.0 \n",
- "\n",
- " bf_tf_adj_email match_key \n",
- "0 1.0 0 \n",
- "1 1.0 1 \n",
- "2 1.0 1 \n",
- "3 1.0 0 \n",
- "4 1.0 0 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " gamma_first_name | \n",
+ " tf_first_name_l | \n",
+ " tf_first_name_r | \n",
+ " bf_first_name | \n",
+ " bf_tf_adj_first_name | \n",
+ " surname_l | \n",
+ " surname_r | \n",
+ " gamma_surname | \n",
+ " tf_surname_l | \n",
+ " tf_surname_r | \n",
+ " bf_surname | \n",
+ " bf_tf_adj_surname | \n",
+ " dob_l | \n",
+ " dob_r | \n",
+ " gamma_dob | \n",
+ " bf_dob | \n",
+ " city_l | \n",
+ " city_r | \n",
+ " gamma_city | \n",
+ " tf_city_l | \n",
+ " tf_city_r | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " email_l | \n",
+ " email_r | \n",
+ " gamma_email | \n",
+ " tf_email_l | \n",
+ " tf_email_r | \n",
+ " bf_email | \n",
+ " bf_tf_adj_email | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " -1.749664 | \n",
+ " 0.229211 | \n",
+ " 324 | \n",
+ " 326 | \n",
+ " Kai | \n",
+ " Kai | \n",
+ " 4 | \n",
+ " 0.006017 | \n",
+ " 0.006017 | \n",
+ " 84.821765 | \n",
+ " 0.962892 | \n",
+ " None | \n",
+ " Turner | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.007326 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 2018-12-31 | \n",
+ " 2009-11-03 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " London | \n",
+ " London | \n",
+ " 1 | \n",
+ " 0.212792 | \n",
+ " 0.212792 | \n",
+ " 10.20126 | \n",
+ " 0.259162 | \n",
+ " k.t50eherand@z.ncom | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001267 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " -1.626076 | \n",
+ " 0.244695 | \n",
+ " 25 | \n",
+ " 27 | \n",
+ " Gabriel | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001203 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " Thomas | \n",
+ " Thomas | \n",
+ " 4 | \n",
+ " 0.004884 | \n",
+ " 0.004884 | \n",
+ " 88.870507 | \n",
+ " 1.001222 | \n",
+ " 1977-09-13 | \n",
+ " 1977-10-17 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " London | \n",
+ " London | \n",
+ " 1 | \n",
+ " 0.212792 | \n",
+ " 0.212792 | \n",
+ " 10.20126 | \n",
+ " 0.259162 | \n",
+ " gabriel.t54@nichols.info | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.002535 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " -1.551265 | \n",
+ " 0.254405 | \n",
+ " 626 | \n",
+ " 629 | \n",
+ " geeorGe | \n",
+ " George | \n",
+ " 1 | \n",
+ " 0.001203 | \n",
+ " 0.014440 | \n",
+ " 4.176727 | \n",
+ " 1.000000 | \n",
+ " Davidson | \n",
+ " Davidson | \n",
+ " 4 | \n",
+ " 0.007326 | \n",
+ " 0.007326 | \n",
+ " 88.870507 | \n",
+ " 0.667482 | \n",
+ " 1999-05-07 | \n",
+ " 2000-05-06 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " Southamptn | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001230 | \n",
+ " NaN | \n",
+ " 1.00000 | \n",
+ " 1.000000 | \n",
+ " None | \n",
+ " gdavidson@johnson-brown.com | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.00507 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " -1.427735 | \n",
+ " 0.270985 | \n",
+ " 600 | \n",
+ " 602 | \n",
+ " Toby | \n",
+ " Toby | \n",
+ " 4 | \n",
+ " 0.004813 | \n",
+ " 0.004813 | \n",
+ " 84.821765 | \n",
+ " 1.203614 | \n",
+ " None | \n",
+ " None | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 2003-04-23 | \n",
+ " 2013-03-21 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " London | \n",
+ " London | \n",
+ " 1 | \n",
+ " 0.212792 | \n",
+ " 0.212792 | \n",
+ " 10.20126 | \n",
+ " 0.259162 | \n",
+ " toby.d@menhez.com | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001267 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " -1.427735 | \n",
+ " 0.270985 | \n",
+ " 599 | \n",
+ " 602 | \n",
+ " Toby | \n",
+ " Toby | \n",
+ " 4 | \n",
+ " 0.004813 | \n",
+ " 0.004813 | \n",
+ " 84.821765 | \n",
+ " 1.203614 | \n",
+ " Haall | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001221 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 2003-04-23 | \n",
+ " 2013-03-21 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " London | \n",
+ " London | \n",
+ " 1 | \n",
+ " 0.212792 | \n",
+ " 0.212792 | \n",
+ " 10.20126 | \n",
+ " 0.259162 | \n",
+ " None | \n",
+ " None | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "df_predictions = linker.inference.predict(threshold_match_probability=0.2)\n",
- "df_predictions.as_pandas_dataframe(limit=5)"
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
+ "0 -1.749664 0.229211 324 326 Kai \n",
+ "1 -1.626076 0.244695 25 27 Gabriel \n",
+ "2 -1.551265 0.254405 626 629 geeorGe \n",
+ "3 -1.427735 0.270985 600 602 Toby \n",
+ "4 -1.427735 0.270985 599 602 Toby \n",
+ "\n",
+ " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
+ "0 Kai 4 0.006017 0.006017 \n",
+ "1 None -1 0.001203 NaN \n",
+ "2 George 1 0.001203 0.014440 \n",
+ "3 Toby 4 0.004813 0.004813 \n",
+ "4 Toby 4 0.004813 0.004813 \n",
+ "\n",
+ " bf_first_name bf_tf_adj_first_name surname_l surname_r gamma_surname \\\n",
+ "0 84.821765 0.962892 None Turner -1 \n",
+ "1 1.000000 1.000000 Thomas Thomas 4 \n",
+ "2 4.176727 1.000000 Davidson Davidson 4 \n",
+ "3 84.821765 1.203614 None None -1 \n",
+ "4 84.821765 1.203614 Haall None -1 \n",
+ "\n",
+ " tf_surname_l tf_surname_r bf_surname bf_tf_adj_surname dob_l \\\n",
+ "0 NaN 0.007326 1.000000 1.000000 2018-12-31 \n",
+ "1 0.004884 0.004884 88.870507 1.001222 1977-09-13 \n",
+ "2 0.007326 0.007326 88.870507 0.667482 1999-05-07 \n",
+ "3 NaN NaN 1.000000 1.000000 2003-04-23 \n",
+ "4 0.001221 NaN 1.000000 1.000000 2003-04-23 \n",
+ "\n",
+ " dob_r gamma_dob bf_dob city_l city_r gamma_city tf_city_l \\\n",
+ "0 2009-11-03 0 0.460743 London London 1 0.212792 \n",
+ "1 1977-10-17 0 0.460743 London London 1 0.212792 \n",
+ "2 2000-05-06 0 0.460743 Southamptn None -1 0.001230 \n",
+ "3 2013-03-21 0 0.460743 London London 1 0.212792 \n",
+ "4 2013-03-21 0 0.460743 London London 1 0.212792 \n",
+ "\n",
+ " tf_city_r bf_city bf_tf_adj_city email_l \\\n",
+ "0 0.212792 10.20126 0.259162 k.t50eherand@z.ncom \n",
+ "1 0.212792 10.20126 0.259162 gabriel.t54@nichols.info \n",
+ "2 NaN 1.00000 1.000000 None \n",
+ "3 0.212792 10.20126 0.259162 toby.d@menhez.com \n",
+ "4 0.212792 10.20126 0.259162 None \n",
+ "\n",
+ " email_r gamma_email tf_email_l tf_email_r bf_email \\\n",
+ "0 None -1 0.001267 NaN 1.0 \n",
+ "1 None -1 0.002535 NaN 1.0 \n",
+ "2 gdavidson@johnson-brown.com -1 NaN 0.00507 1.0 \n",
+ "3 None -1 0.001267 NaN 1.0 \n",
+ "4 None -1 NaN NaN 1.0 \n",
+ "\n",
+ " bf_tf_adj_email match_key \n",
+ "0 1.0 0 \n",
+ "1 1.0 1 \n",
+ "2 1.0 1 \n",
+ "3 1.0 0 \n",
+ "4 1.0 0 "
]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_predictions = linker.inference.predict(threshold_match_probability=0.2)\n",
+ "df_predictions.as_pandas_dataframe(limit=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f00370bb",
+ "metadata": {},
+ "source": [
+ "## Clustering\n",
+ "\n",
+ "The result of `linker.inference.predict()` is a list of pairwise record comparisons and their associated scores. For instance, if we have input records A, B, C, D and E, it could be represented conceptually as:\n",
+ "\n",
+ "```\n",
+ "A -> B with score 0.9\n",
+ "B -> C with score 0.95\n",
+ "C -> D with score 0.1\n",
+ "D -> E with score 0.99\n",
+ "```\n",
+ "\n",
+ "Often, an alternative representation of this result is more useful, where each row is an input record, and where records link, they are assigned to the same cluster.\n",
+ "\n",
+ "With a score threshold of 0.5, the above data could be represented conceptually as:\n",
+ "\n",
+ "```\n",
+ "ID, Cluster ID\n",
+ "A, 1\n",
+ "B, 1\n",
+ "C, 1\n",
+ "D, 2\n",
+ "E, 2\n",
+ "```\n",
+ "\n",
+ "The algorithm that converts between the pairwise results and the clusters is called connected components, and it is included in Splink. You can use it as follows:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "257ae717",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:08:06.711722Z",
+ "iopub.status.busy": "2024-06-22T08:08:06.711425Z",
+ "iopub.status.idle": "2024-06-22T08:08:06.756664Z",
+ "shell.execute_reply": "2024-06-22T08:08:06.755985Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "f00370bb",
- "metadata": {},
- "source": [
- "## Clustering\n",
- "\n",
- "The result of `linker.inference.predict()` is a list of pairwise record comparisons and their associated scores. For instance, if we have input records A, B, C and D, it could be represented conceptually as:\n",
- "\n",
- "```\n",
- "A -> B with score 0.9\n",
- "B -> C with score 0.95\n",
- "C -> D with score 0.1\n",
- "D -> E with score 0.99\n",
- "```\n",
- "\n",
- "Often, an alternative representation of this result is more useful, where each row is an input record, and where records link, they are assigned to the same cluster.\n",
- "\n",
- "With a score threshold of 0.5, the above data could be represented conceptually as:\n",
- "\n",
- "```\n",
- "ID, Cluster ID\n",
- "A, 1\n",
- "B, 1\n",
- "C, 1\n",
- "D, 2\n",
- "E, 2\n",
- "```\n",
- "\n",
- "The algorithm that converts between the pairwise results and the clusters is called connected components, and it is included in Splink. You can use it as follows:\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 1, root rows count 2\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 5,
- "id": "257ae717",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:08:06.711722Z",
- "iopub.status.busy": "2024-06-22T08:08:06.711425Z",
- "iopub.status.idle": "2024-06-22T08:08:06.756664Z",
- "shell.execute_reply": "2024-06-22T08:08:06.755985Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 1, root rows count 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 2, root rows count 0\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cluster_id | \n",
- " unique_id | \n",
- " first_name | \n",
- " surname | \n",
- " dob | \n",
- " city | \n",
- " email | \n",
- " cluster | \n",
- " __splink_salt | \n",
- " tf_surname | \n",
- " tf_email | \n",
- " tf_city | \n",
- " tf_first_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " Robert | \n",
- " Alan | \n",
- " 1971-06-24 | \n",
- " None | \n",
- " robert255@smith.net | \n",
- " 0 | \n",
- " 0.012924 | \n",
- " 0.001221 | \n",
- " 0.001267 | \n",
- " NaN | \n",
- " 0.003610 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " Robert | \n",
- " Allen | \n",
- " 1971-05-24 | \n",
- " None | \n",
- " roberta25@smith.net | \n",
- " 0 | \n",
- " 0.478756 | \n",
- " 0.002442 | \n",
- " 0.002535 | \n",
- " NaN | \n",
- " 0.003610 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1 | \n",
- " 2 | \n",
- " Rob | \n",
- " Allen | \n",
- " 1971-06-24 | \n",
- " London | \n",
- " roberta25@smith.net | \n",
- " 0 | \n",
- " 0.409662 | \n",
- " 0.002442 | \n",
- " 0.002535 | \n",
- " 0.212792 | \n",
- " 0.001203 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " Robert | \n",
- " Alen | \n",
- " 1971-06-24 | \n",
- " Lonon | \n",
- " None | \n",
- " 0 | \n",
- " 0.311029 | \n",
- " 0.001221 | \n",
- " NaN | \n",
- " 0.007380 | \n",
- " 0.003610 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " Grace | \n",
- " None | \n",
- " 1997-04-26 | \n",
- " Hull | \n",
- " grace.kelly52@jones.com | \n",
- " 1 | \n",
- " 0.486141 | \n",
- " NaN | \n",
- " 0.002535 | \n",
- " 0.001230 | \n",
- " 0.006017 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " Grace | \n",
- " Kelly | \n",
- " 1991-04-26 | \n",
- " None | \n",
- " grace.kelly52@jones.com | \n",
- " 1 | \n",
- " 0.434566 | \n",
- " 0.002442 | \n",
- " 0.002535 | \n",
- " NaN | \n",
- " 0.006017 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 6 | \n",
- " 6 | \n",
- " Logan | \n",
- " pMurphy | \n",
- " 1973-08-01 | \n",
- " None | \n",
- " None | \n",
- " 2 | \n",
- " 0.423760 | \n",
- " 0.001221 | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.012034 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 7 | \n",
- " 7 | \n",
- " None | \n",
- " None | \n",
- " 2015-03-03 | \n",
- " Portsmouth | \n",
- " evied56@harris-bailey.net | \n",
- " 3 | \n",
- " 0.683689 | \n",
- " NaN | \n",
- " 0.002535 | \n",
- " 0.017220 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 8 | \n",
- " 8 | \n",
- " None | \n",
- " Dean | \n",
- " 2015-03-03 | \n",
- " None | \n",
- " None | \n",
- " 3 | \n",
- " 0.553086 | \n",
- " 0.003663 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 8 | \n",
- " 9 | \n",
- " Evie | \n",
- " Dean | \n",
- " 2015-03-03 | \n",
- " Pootsmruth | \n",
- " evihd56@earris-bailey.net | \n",
- " 3 | \n",
- " 0.753070 | \n",
- " 0.003663 | \n",
- " 0.001267 | \n",
- " 0.001230 | \n",
- " 0.008424 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " cluster_id unique_id first_name surname dob city \\\n",
- "0 0 0 Robert Alan 1971-06-24 None \n",
- "1 1 1 Robert Allen 1971-05-24 None \n",
- "2 1 2 Rob Allen 1971-06-24 London \n",
- "3 3 3 Robert Alen 1971-06-24 Lonon \n",
- "4 4 4 Grace None 1997-04-26 Hull \n",
- "5 5 5 Grace Kelly 1991-04-26 None \n",
- "6 6 6 Logan pMurphy 1973-08-01 None \n",
- "7 7 7 None None 2015-03-03 Portsmouth \n",
- "8 8 8 None Dean 2015-03-03 None \n",
- "9 8 9 Evie Dean 2015-03-03 Pootsmruth \n",
- "\n",
- " email cluster __splink_salt tf_surname tf_email \\\n",
- "0 robert255@smith.net 0 0.012924 0.001221 0.001267 \n",
- "1 roberta25@smith.net 0 0.478756 0.002442 0.002535 \n",
- "2 roberta25@smith.net 0 0.409662 0.002442 0.002535 \n",
- "3 None 0 0.311029 0.001221 NaN \n",
- "4 grace.kelly52@jones.com 1 0.486141 NaN 0.002535 \n",
- "5 grace.kelly52@jones.com 1 0.434566 0.002442 0.002535 \n",
- "6 None 2 0.423760 0.001221 NaN \n",
- "7 evied56@harris-bailey.net 3 0.683689 NaN 0.002535 \n",
- "8 None 3 0.553086 0.003663 NaN \n",
- "9 evihd56@earris-bailey.net 3 0.753070 0.003663 0.001267 \n",
- "\n",
- " tf_city tf_first_name \n",
- "0 NaN 0.003610 \n",
- "1 NaN 0.003610 \n",
- "2 0.212792 0.001203 \n",
- "3 0.007380 0.003610 \n",
- "4 0.001230 0.006017 \n",
- "5 NaN 0.006017 \n",
- "6 NaN 0.012034 \n",
- "7 0.017220 NaN \n",
- "8 NaN NaN \n",
- "9 0.001230 0.008424 "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " df_predictions, threshold_match_probability=0.5\n",
- ")\n",
- "clusters.as_pandas_dataframe(limit=10)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 2, root rows count 0\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 6,
- "id": "b973f53f-6d57-4c79-a87d-fbad40f303f1",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:08:06.760329Z",
- "iopub.status.busy": "2024-06-22T08:08:06.760043Z",
- "iopub.status.idle": "2024-06-22T08:08:06.788279Z",
- "shell.execute_reply": "2024-06-22T08:08:06.787675Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " match_weight | \n",
- " match_probability | \n",
- " unique_id_l | \n",
- " unique_id_r | \n",
- " first_name_l | \n",
- " first_name_r | \n",
- " gamma_first_name | \n",
- " tf_first_name_l | \n",
- " tf_first_name_r | \n",
- " bf_first_name | \n",
- " bf_tf_adj_first_name | \n",
- " surname_l | \n",
- " surname_r | \n",
- " gamma_surname | \n",
- " tf_surname_l | \n",
- " tf_surname_r | \n",
- " bf_surname | \n",
- " bf_tf_adj_surname | \n",
- " dob_l | \n",
- " dob_r | \n",
- " gamma_dob | \n",
- " bf_dob | \n",
- " city_l | \n",
- " city_r | \n",
- " gamma_city | \n",
- " tf_city_l | \n",
- " tf_city_r | \n",
- " bf_city | \n",
- " bf_tf_adj_city | \n",
- " email_l | \n",
- " email_r | \n",
- " gamma_email | \n",
- " tf_email_l | \n",
- " tf_email_r | \n",
- " bf_email | \n",
- " bf_tf_adj_email | \n",
- " match_key | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " -1.749664 | \n",
- " 0.229211 | \n",
- " 324 | \n",
- " 326 | \n",
- " Kai | \n",
- " Kai | \n",
- " 4 | \n",
- " 0.006017 | \n",
- " 0.006017 | \n",
- " 84.821765 | \n",
- " 0.962892 | \n",
- " None | \n",
- " Turner | \n",
- " -1 | \n",
- " NaN | \n",
- " 0.007326 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 2018-12-31 | \n",
- " 2009-11-03 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " London | \n",
- " London | \n",
- " 1 | \n",
- " 0.212792 | \n",
- " 0.212792 | \n",
- " 10.20126 | \n",
- " 0.259162 | \n",
- " k.t50eherand@z.ncom | \n",
- " None | \n",
- " -1 | \n",
- " 0.001267 | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " -1.626076 | \n",
- " 0.244695 | \n",
- " 25 | \n",
- " 27 | \n",
- " Gabriel | \n",
- " None | \n",
- " -1 | \n",
- " 0.001203 | \n",
- " NaN | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " Thomas | \n",
- " Thomas | \n",
- " 4 | \n",
- " 0.004884 | \n",
- " 0.004884 | \n",
- " 88.870507 | \n",
- " 1.001222 | \n",
- " 1977-09-13 | \n",
- " 1977-10-17 | \n",
- " 0 | \n",
- " 0.460743 | \n",
- " London | \n",
- " London | \n",
- " 1 | \n",
- " 0.212792 | \n",
- " 0.212792 | \n",
- " 10.20126 | \n",
- " 0.259162 | \n",
- " gabriel.t54@nichols.info | \n",
- " None | \n",
- " -1 | \n",
- " 0.002535 | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
- "0 -1.749664 0.229211 324 326 Kai \n",
- "1 -1.626076 0.244695 25 27 Gabriel \n",
- "\n",
- " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
- "0 Kai 4 0.006017 0.006017 \n",
- "1 None -1 0.001203 NaN \n",
- "\n",
- " bf_first_name bf_tf_adj_first_name surname_l surname_r gamma_surname \\\n",
- "0 84.821765 0.962892 None Turner -1 \n",
- "1 1.000000 1.000000 Thomas Thomas 4 \n",
- "\n",
- " tf_surname_l tf_surname_r bf_surname bf_tf_adj_surname dob_l \\\n",
- "0 NaN 0.007326 1.000000 1.000000 2018-12-31 \n",
- "1 0.004884 0.004884 88.870507 1.001222 1977-09-13 \n",
- "\n",
- " dob_r gamma_dob bf_dob city_l city_r gamma_city tf_city_l \\\n",
- "0 2009-11-03 0 0.460743 London London 1 0.212792 \n",
- "1 1977-10-17 0 0.460743 London London 1 0.212792 \n",
- "\n",
- " tf_city_r bf_city bf_tf_adj_city email_l email_r \\\n",
- "0 0.212792 10.20126 0.259162 k.t50eherand@z.ncom None \n",
- "1 0.212792 10.20126 0.259162 gabriel.t54@nichols.info None \n",
- "\n",
- " gamma_email tf_email_l tf_email_r bf_email bf_tf_adj_email match_key \n",
- "0 -1 0.001267 NaN 1.0 1.0 0 \n",
- "1 -1 0.002535 NaN 1.0 1.0 1 "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cluster_id | \n",
+ " unique_id | \n",
+ " first_name | \n",
+ " surname | \n",
+ " dob | \n",
+ " city | \n",
+ " email | \n",
+ " cluster | \n",
+ " __splink_salt | \n",
+ " tf_surname | \n",
+ " tf_email | \n",
+ " tf_city | \n",
+ " tf_first_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Robert | \n",
+ " Alan | \n",
+ " 1971-06-24 | \n",
+ " None | \n",
+ " robert255@smith.net | \n",
+ " 0 | \n",
+ " 0.012924 | \n",
+ " 0.001221 | \n",
+ " 0.001267 | \n",
+ " NaN | \n",
+ " 0.003610 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Robert | \n",
+ " Allen | \n",
+ " 1971-05-24 | \n",
+ " None | \n",
+ " roberta25@smith.net | \n",
+ " 0 | \n",
+ " 0.478756 | \n",
+ " 0.002442 | \n",
+ " 0.002535 | \n",
+ " NaN | \n",
+ " 0.003610 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Rob | \n",
+ " Allen | \n",
+ " 1971-06-24 | \n",
+ " London | \n",
+ " roberta25@smith.net | \n",
+ " 0 | \n",
+ " 0.409662 | \n",
+ " 0.002442 | \n",
+ " 0.002535 | \n",
+ " 0.212792 | \n",
+ " 0.001203 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " Robert | \n",
+ " Alen | \n",
+ " 1971-06-24 | \n",
+ " Lonon | \n",
+ " None | \n",
+ " 0 | \n",
+ " 0.311029 | \n",
+ " 0.001221 | \n",
+ " NaN | \n",
+ " 0.007380 | \n",
+ " 0.003610 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " Grace | \n",
+ " None | \n",
+ " 1997-04-26 | \n",
+ " Hull | \n",
+ " grace.kelly52@jones.com | \n",
+ " 1 | \n",
+ " 0.486141 | \n",
+ " NaN | \n",
+ " 0.002535 | \n",
+ " 0.001230 | \n",
+ " 0.006017 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " Grace | \n",
+ " Kelly | \n",
+ " 1991-04-26 | \n",
+ " None | \n",
+ " grace.kelly52@jones.com | \n",
+ " 1 | \n",
+ " 0.434566 | \n",
+ " 0.002442 | \n",
+ " 0.002535 | \n",
+ " NaN | \n",
+ " 0.006017 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " Logan | \n",
+ " pMurphy | \n",
+ " 1973-08-01 | \n",
+ " None | \n",
+ " None | \n",
+ " 2 | \n",
+ " 0.423760 | \n",
+ " 0.001221 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.012034 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7 | \n",
+ " 7 | \n",
+ " None | \n",
+ " None | \n",
+ " 2015-03-03 | \n",
+ " Portsmouth | \n",
+ " evied56@harris-bailey.net | \n",
+ " 3 | \n",
+ " 0.683689 | \n",
+ " NaN | \n",
+ " 0.002535 | \n",
+ " 0.017220 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " None | \n",
+ " Dean | \n",
+ " 2015-03-03 | \n",
+ " None | \n",
+ " None | \n",
+ " 3 | \n",
+ " 0.553086 | \n",
+ " 0.003663 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " Evie | \n",
+ " Dean | \n",
+ " 2015-03-03 | \n",
+ " Pootsmruth | \n",
+ " evihd56@earris-bailey.net | \n",
+ " 3 | \n",
+ " 0.753070 | \n",
+ " 0.003663 | \n",
+ " 0.001267 | \n",
+ " 0.001230 | \n",
+ " 0.008424 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "sql = f\"\"\"\n",
- "select *\n",
- "from {df_predictions.physical_name}\n",
- "limit 2\n",
- "\"\"\"\n",
- "linker.misc.query_sql(sql)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "177c5013",
- "metadata": {},
- "source": [
- "!!! note \"Further Reading\"\n",
- ":material-tools: For more on the prediction tools in Splink, please refer to the [Prediction API documentation](../../api_docs/inference.md).\n"
+ "text/plain": [
+ " cluster_id unique_id first_name surname dob city \\\n",
+ "0 0 0 Robert Alan 1971-06-24 None \n",
+ "1 1 1 Robert Allen 1971-05-24 None \n",
+ "2 1 2 Rob Allen 1971-06-24 London \n",
+ "3 3 3 Robert Alen 1971-06-24 Lonon \n",
+ "4 4 4 Grace None 1997-04-26 Hull \n",
+ "5 5 5 Grace Kelly 1991-04-26 None \n",
+ "6 6 6 Logan pMurphy 1973-08-01 None \n",
+ "7 7 7 None None 2015-03-03 Portsmouth \n",
+ "8 8 8 None Dean 2015-03-03 None \n",
+ "9 8 9 Evie Dean 2015-03-03 Pootsmruth \n",
+ "\n",
+ " email cluster __splink_salt tf_surname tf_email \\\n",
+ "0 robert255@smith.net 0 0.012924 0.001221 0.001267 \n",
+ "1 roberta25@smith.net 0 0.478756 0.002442 0.002535 \n",
+ "2 roberta25@smith.net 0 0.409662 0.002442 0.002535 \n",
+ "3 None 0 0.311029 0.001221 NaN \n",
+ "4 grace.kelly52@jones.com 1 0.486141 NaN 0.002535 \n",
+ "5 grace.kelly52@jones.com 1 0.434566 0.002442 0.002535 \n",
+ "6 None 2 0.423760 0.001221 NaN \n",
+ "7 evied56@harris-bailey.net 3 0.683689 NaN 0.002535 \n",
+ "8 None 3 0.553086 0.003663 NaN \n",
+ "9 evihd56@earris-bailey.net 3 0.753070 0.003663 0.001267 \n",
+ "\n",
+ " tf_city tf_first_name \n",
+ "0 NaN 0.003610 \n",
+ "1 NaN 0.003610 \n",
+ "2 0.212792 0.001203 \n",
+ "3 0.007380 0.003610 \n",
+ "4 0.001230 0.006017 \n",
+ "5 NaN 0.006017 \n",
+ "6 NaN 0.012034 \n",
+ "7 0.017220 NaN \n",
+ "8 NaN NaN \n",
+ "9 0.001230 0.008424 "
]
- },
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " df_predictions, threshold_match_probability=0.5\n",
+ ")\n",
+ "clusters.as_pandas_dataframe(limit=10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "b973f53f-6d57-4c79-a87d-fbad40f303f1",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:08:06.760329Z",
+ "iopub.status.busy": "2024-06-22T08:08:06.760043Z",
+ "iopub.status.idle": "2024-06-22T08:08:06.788279Z",
+ "shell.execute_reply": "2024-06-22T08:08:06.787675Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "id": "b7cae5d7",
- "metadata": {},
- "source": [
- "## Next steps\n",
- "\n",
- "Now we have made predictions with a model, we can move on to visualising it to understand how it is working.\n"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " match_weight | \n",
+ " match_probability | \n",
+ " unique_id_l | \n",
+ " unique_id_r | \n",
+ " first_name_l | \n",
+ " first_name_r | \n",
+ " gamma_first_name | \n",
+ " tf_first_name_l | \n",
+ " tf_first_name_r | \n",
+ " bf_first_name | \n",
+ " bf_tf_adj_first_name | \n",
+ " surname_l | \n",
+ " surname_r | \n",
+ " gamma_surname | \n",
+ " tf_surname_l | \n",
+ " tf_surname_r | \n",
+ " bf_surname | \n",
+ " bf_tf_adj_surname | \n",
+ " dob_l | \n",
+ " dob_r | \n",
+ " gamma_dob | \n",
+ " bf_dob | \n",
+ " city_l | \n",
+ " city_r | \n",
+ " gamma_city | \n",
+ " tf_city_l | \n",
+ " tf_city_r | \n",
+ " bf_city | \n",
+ " bf_tf_adj_city | \n",
+ " email_l | \n",
+ " email_r | \n",
+ " gamma_email | \n",
+ " tf_email_l | \n",
+ " tf_email_r | \n",
+ " bf_email | \n",
+ " bf_tf_adj_email | \n",
+ " match_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " -1.749664 | \n",
+ " 0.229211 | \n",
+ " 324 | \n",
+ " 326 | \n",
+ " Kai | \n",
+ " Kai | \n",
+ " 4 | \n",
+ " 0.006017 | \n",
+ " 0.006017 | \n",
+ " 84.821765 | \n",
+ " 0.962892 | \n",
+ " None | \n",
+ " Turner | \n",
+ " -1 | \n",
+ " NaN | \n",
+ " 0.007326 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 2018-12-31 | \n",
+ " 2009-11-03 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " London | \n",
+ " London | \n",
+ " 1 | \n",
+ " 0.212792 | \n",
+ " 0.212792 | \n",
+ " 10.20126 | \n",
+ " 0.259162 | \n",
+ " k.t50eherand@z.ncom | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001267 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " -1.626076 | \n",
+ " 0.244695 | \n",
+ " 25 | \n",
+ " 27 | \n",
+ " Gabriel | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.001203 | \n",
+ " NaN | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " Thomas | \n",
+ " Thomas | \n",
+ " 4 | \n",
+ " 0.004884 | \n",
+ " 0.004884 | \n",
+ " 88.870507 | \n",
+ " 1.001222 | \n",
+ " 1977-09-13 | \n",
+ " 1977-10-17 | \n",
+ " 0 | \n",
+ " 0.460743 | \n",
+ " London | \n",
+ " London | \n",
+ " 1 | \n",
+ " 0.212792 | \n",
+ " 0.212792 | \n",
+ " 10.20126 | \n",
+ " 0.259162 | \n",
+ " gabriel.t54@nichols.info | \n",
+ " None | \n",
+ " -1 | \n",
+ " 0.002535 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n",
+ "0 -1.749664 0.229211 324 326 Kai \n",
+ "1 -1.626076 0.244695 25 27 Gabriel \n",
+ "\n",
+ " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n",
+ "0 Kai 4 0.006017 0.006017 \n",
+ "1 None -1 0.001203 NaN \n",
+ "\n",
+ " bf_first_name bf_tf_adj_first_name surname_l surname_r gamma_surname \\\n",
+ "0 84.821765 0.962892 None Turner -1 \n",
+ "1 1.000000 1.000000 Thomas Thomas 4 \n",
+ "\n",
+ " tf_surname_l tf_surname_r bf_surname bf_tf_adj_surname dob_l \\\n",
+ "0 NaN 0.007326 1.000000 1.000000 2018-12-31 \n",
+ "1 0.004884 0.004884 88.870507 1.001222 1977-09-13 \n",
+ "\n",
+ " dob_r gamma_dob bf_dob city_l city_r gamma_city tf_city_l \\\n",
+ "0 2009-11-03 0 0.460743 London London 1 0.212792 \n",
+ "1 1977-10-17 0 0.460743 London London 1 0.212792 \n",
+ "\n",
+ " tf_city_r bf_city bf_tf_adj_city email_l email_r \\\n",
+ "0 0.212792 10.20126 0.259162 k.t50eherand@z.ncom None \n",
+ "1 0.212792 10.20126 0.259162 gabriel.t54@nichols.info None \n",
+ "\n",
+ " gamma_email tf_email_l tf_email_r bf_email bf_tf_adj_email match_key \n",
+ "0 -1 0.001267 NaN 1.0 1.0 0 \n",
+ "1 -1 0.002535 NaN 1.0 1.0 1 "
]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "sql = f\"\"\"\n",
+ "select *\n",
+ "from {df_predictions.physical_name}\n",
+ "limit 2\n",
+ "\"\"\"\n",
+ "linker.misc.query_sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "177c5013",
+ "metadata": {},
+ "source": [
+ "!!! note \"Further Reading\"\n",
+ ":material-tools: For more on the prediction tools in Splink, please refer to the [Prediction API documentation](../../api_docs/inference.md).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7cae5d7",
+ "metadata": {},
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "Now we have made predictions with a model, we can move on to visualising it to understand how it is working.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 5
-}
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/docs/demos/tutorials/06_Visualising_predictions.ipynb b/docs/demos/tutorials/06_Visualising_predictions.ipynb
index 7b5e96118b..babd07d201 100644
--- a/docs/demos/tutorials/06_Visualising_predictions.ipynb
+++ b/docs/demos/tutorials/06_Visualising_predictions.ipynb
@@ -1,401 +1,402 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "f110e018",
- "metadata": {},
- "source": [
- "# Visualising predictions\n",
- "\n",
- "\n",
- "
\n",
- "\n",
- "\n",
- "Splink contains a variety of tools to help you visualise your predictions.\n",
- "\n",
- "The idea is that, by developing an understanding of how your model works, you can gain confidence that the predictions it makes are sensible, or alternatively find examples of where your model isn't working, which may help you improve the model specification and fix these problems.\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "f110e018",
+ "metadata": {},
+ "source": [
+ "# Visualising predictions\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
+ "Splink contains a variety of tools to help you visualise your predictions.\n",
+ "\n",
+ "The idea is that, by developing an understanding of how your model works, you can gain confidence that the predictions it makes are sensible, or alternatively find examples of where your model isn't working, which may help you improve the model specification and fix these problems.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "1e3eed24",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:35:35.067665Z",
+ "iopub.status.busy": "2024-06-22T08:35:35.067291Z",
+ "iopub.status.idle": "2024-06-22T08:35:35.087061Z",
+ "shell.execute_reply": "2024-06-22T08:35:35.085914Z"
},
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "1e3eed24",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:35:35.067665Z",
- "iopub.status.busy": "2024-06-22T08:35:35.067291Z",
- "iopub.status.idle": "2024-06-22T08:35:35.087061Z",
- "shell.execute_reply": "2024-06-22T08:35:35.085914Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "fb29d421",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:35:35.091703Z",
+ "iopub.status.busy": "2024-06-22T08:35:35.091361Z",
+ "iopub.status.idle": "2024-06-22T08:35:37.305178Z",
+ "shell.execute_reply": "2024-06-22T08:35:37.304151Z"
},
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Rerun our predictions so we're ready to view the charts\n",
+ "from splink import Linker, DuckDBAPI, splink_datasets\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "pd.options.display.max_columns = 1000\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df = splink_datasets.fake_1000\n",
+ "df_sdf = db_api.register(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "2733ac16",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:35:37.309398Z",
+ "iopub.status.busy": "2024-06-22T08:35:37.309063Z",
+ "iopub.status.idle": "2024-06-22T08:35:37.869364Z",
+ "shell.execute_reply": "2024-06-22T08:35:37.868626Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 2,
- "id": "fb29d421",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:35:35.091703Z",
- "iopub.status.busy": "2024-06-22T08:35:35.091361Z",
- "iopub.status.idle": "2024-06-22T08:35:37.305178Z",
- "shell.execute_reply": "2024-06-22T08:35:37.304151Z"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Rerun our predictions to we're ready to view the charts\n",
- "from splink import Linker, DuckDBAPI, splink_datasets\n",
- "\n",
- "import pandas as pd\n",
- "\n",
- "pd.options.display.max_columns = 1000\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "df = splink_datasets.fake_1000"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "import urllib\n",
+ "\n",
+ "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink/847e32508b1a9cdd7bcd2ca6c0a74e547fb69865/docs/demos/demo_settings/saved_model_from_demo.json\"\n",
+ "\n",
+ "with urllib.request.urlopen(url) as u:\n",
+ " settings = json.loads(u.read().decode())\n",
+ "\n",
+ "\n",
+ "linker = Linker(df_sdf, settings)\n",
+ "df_predictions = linker.inference.predict(threshold_match_probability=0.2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b0dedd9",
+ "metadata": {},
+ "source": [
+ "## Waterfall chart\n",
+ "\n",
+ "The waterfall chart provides a means of visualising individual predictions to understand how Splink computed the final matchweight for a particular pairwise record comparison.\n",
+ "\n",
+ "To plot a waterfall chart, the user chooses one or more records from the results of `linker.inference.predict()`, and provides these records to the [`linker.visualisations.waterfall_chart()`](../../api_docs/visualisations.md#splink.internals.linker_components.visualisations.LinkerVisualisations.waterfall_chart) function.\n",
+ "\n",
+ "For an introduction to waterfall charts and how to interpret them, please see [this](https://www.youtube.com/watch?v=msz3T741KQI&t=507s) video.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bbfdc70c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:35:37.873246Z",
+ "iopub.status.busy": "2024-06-22T08:35:37.872959Z",
+ "iopub.status.idle": "2024-06-22T08:35:38.614805Z",
+ "shell.execute_reply": "2024-06-22T08:35:38.614251Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 3,
- "id": "2733ac16",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:35:37.309398Z",
- "iopub.status.busy": "2024-06-22T08:35:37.309063Z",
- "iopub.status.idle": "2024-06-22T08:35:37.869364Z",
- "shell.execute_reply": "2024-06-22T08:35:37.868626Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "import json\n",
- "import urllib\n",
- "\n",
- "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink/847e32508b1a9cdd7bcd2ca6c0a74e547fb69865/docs/demos/demo_settings/saved_model_from_demo.json\"\n",
- "\n",
- "with urllib.request.urlopen(url) as u:\n",
- " settings = json.loads(u.read().decode())\n",
- "\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())\n",
- "df_predictions = linker.inference.predict(threshold_match_probability=0.2)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7b0dedd9",
- "metadata": {},
- "source": [
- "## Waterfall chart\n",
- "\n",
- "The waterfall chart provides a means of visualising individual predictions to understand how Splink computed the final matchweight for a particular pairwise record comparison.\n",
- "\n",
- "To plot a waterfall chart, the user chooses one or more records from the results of `linker.inference.predict()`, and provides these records to the [`linker.visualisations.waterfall_chart()`](../../api_docs/visualisations.md#splink.internals.linker_components.visualisations.LinkerVisualisations.waterfall_chart) function.\n",
- "\n",
- "For an introduction to waterfall charts and how to interpret them, please see [this](https://www.youtube.com/watch?v=msz3T741KQI&t=507s) video.\n"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "records_to_view = df_predictions.as_record_dict(limit=5)\n",
+ "linker.visualisations.waterfall_chart(records_to_view, filter_nulls=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "48b76176",
+ "metadata": {},
+ "source": [
+ "## Comparison viewer dashboard\n",
+ "\n",
+ "The [comparison viewer dashboard](../../api_docs/visualisations.md#splink.internals.linker_components.visualisations.LinkerVisualisations.comparison_viewer_dashboard) takes this one step further by producing an interactive dashboard that contains example predictions from across the spectrum of match scores.\n",
+ "\n",
+ "An in-depth video describing how to interpret the dashboard can be found [here](https://www.youtube.com/watch?v=DNvCMqjipis).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "da85169c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:35:38.618191Z",
+ "iopub.status.busy": "2024-06-22T08:35:38.617967Z",
+ "iopub.status.idle": "2024-06-22T08:35:38.717561Z",
+ "shell.execute_reply": "2024-06-22T08:35:38.716858Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "id": "bbfdc70c",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:35:37.873246Z",
- "iopub.status.busy": "2024-06-22T08:35:37.872959Z",
- "iopub.status.idle": "2024-06-22T08:35:38.614805Z",
- "shell.execute_reply": "2024-06-22T08:35:38.614251Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
],
- "source": [
- "records_to_view = df_predictions.as_record_dict(limit=5)\n",
- "linker.visualisations.waterfall_chart(records_to_view, filter_nulls=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "48b76176",
- "metadata": {},
- "source": [
- "## Comparison viewer dashboard\n",
- "\n",
- "The [comparison viewer dashboard](../../api_docs/visualisations.md#splink.internals.linker_components.visualisations.LinkerVisualisations.comparison_viewer_dashboard) takes this one step further by producing an interactive dashboard that contains example predictions from across the spectrum of match scores.\n",
- "\n",
- "An in-depth video describing how to interpret the dashboard can be found [here](https://www.youtube.com/watch?v=DNvCMqjipis).\n"
+ "text/plain": [
+ ""
]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.visualisations.comparison_viewer_dashboard(df_predictions, \"scv.html\", overwrite=True)\n",
+ "\n",
+ "# You can view the scv.html file in your browser, or inline in a notebook as follows\n",
+ "from IPython.display import IFrame\n",
+ "\n",
+ "IFrame(src=\"./scv.html\", width=\"100%\", height=1200)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d34df82c",
+ "metadata": {},
+ "source": [
+ "## Cluster studio dashboard\n",
+ "\n",
+ "Cluster studio is an interactive dashboard that visualises the results of clustering your predictions.\n",
+ "\n",
+ "It provides examples of clusters of different sizes. The shape and size of clusters can be indicative of problems with record linkage, so it provides a tool to help you find potential false positive and negative links.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e2153d91",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-06-22T08:35:38.721856Z",
+ "iopub.status.busy": "2024-06-22T08:35:38.721546Z",
+ "iopub.status.idle": "2024-06-22T08:35:38.845614Z",
+ "shell.execute_reply": "2024-06-22T08:35:38.844884Z"
},
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "id": "da85169c",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:35:38.618191Z",
- "iopub.status.busy": "2024-06-22T08:35:38.617967Z",
- "iopub.status.idle": "2024-06-22T08:35:38.717561Z",
- "shell.execute_reply": "2024-06-22T08:35:38.716858Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "linker.visualisations.comparison_viewer_dashboard(df_predictions, \"scv.html\", overwrite=True)\n",
- "\n",
- "# You can view the scv.html file in your browser, or inline in a notbook as follows\n",
- "from IPython.display import IFrame\n",
- "\n",
- "IFrame(src=\"./scv.html\", width=\"100%\", height=1200)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 1, root rows count 2\n"
+ ]
},
{
- "cell_type": "markdown",
- "id": "d34df82c",
- "metadata": {},
- "source": [
- "## Cluster studio dashboard\n",
- "\n",
- "Cluster studio is an interactive dashboards that visualises the results of clustering your predictions.\n",
- "\n",
- "It provides examples of clusters of different sizes. The shape and size of clusters can be indicative of problems with record linkage, so it provides a tool to help you find potential false positive and negative links.\n"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Completed iteration 2, root rows count 0\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 6,
- "id": "e2153d91",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-06-22T08:35:38.721856Z",
- "iopub.status.busy": "2024-06-22T08:35:38.721546Z",
- "iopub.status.idle": "2024-06-22T08:35:38.845614Z",
- "shell.execute_reply": "2024-06-22T08:35:38.844884Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 1, root rows count 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Completed iteration 2, root rows count 0\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
],
- "source": [
- "df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
- " df_predictions, threshold_match_probability=0.5\n",
- ")\n",
- "\n",
- "linker.visualisations.cluster_studio_dashboard(\n",
- " df_predictions,\n",
- " df_clusters,\n",
- " \"cluster_studio.html\",\n",
- " sampling_method=\"by_cluster_size\",\n",
- " overwrite=True,\n",
- ")\n",
- "\n",
- "# You can view the scv.html file in your browser, or inline in a notbook as follows\n",
- "from IPython.display import IFrame\n",
- "\n",
- "IFrame(src=\"./cluster_studio.html\", width=\"100%\", height=1000)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20ede1e9",
- "metadata": {},
- "source": [
- "!!! note \"Further Reading\"\n",
- "\n",
- " :material-tools: For more on the visualisation tools in Splink, please refer to the [Visualisation API documentation](../../api_docs/visualisations.md).\n",
- "\n",
- " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#model-evaluation).\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7cc780cb",
- "metadata": {},
- "source": [
- "## Next steps\n",
- "\n",
- "Now we have visualised the results of a model, we can move on to some more formal Quality Assurance procedures using labelled data.\n"
+ "text/plain": [
+ ""
]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(\n",
+ " df_predictions, threshold_match_probability=0.5\n",
+ ")\n",
+ "\n",
+ "linker.visualisations.cluster_studio_dashboard(\n",
+ " df_predictions,\n",
+ " df_clusters,\n",
+ " \"cluster_studio.html\",\n",
+ " sampling_method=\"by_cluster_size\",\n",
+ " overwrite=True,\n",
+ ")\n",
+ "\n",
+ "# You can view the cluster_studio.html file in your browser, or inline in a notebook as follows\n",
+ "from IPython.display import IFrame\n",
+ "\n",
+ "IFrame(src=\"./cluster_studio.html\", width=\"100%\", height=1000)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20ede1e9",
+ "metadata": {},
+ "source": [
+ "!!! note \"Further Reading\"\n",
+ "\n",
+ " :material-tools: For more on the visualisation tools in Splink, please refer to the [Visualisation API documentation](../../api_docs/visualisations.md).\n",
+ "\n",
+ " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#model-evaluation).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cc780cb",
+ "metadata": {},
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "Now we have visualised the results of a model, we can move on to some more formal Quality Assurance procedures using labelled data.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 5
-}
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/docs/demos/tutorials/07_Evaluation.ipynb b/docs/demos/tutorials/07_Evaluation.ipynb
index 6cd8c2f0cb..104dc6d3b3 100644
--- a/docs/demos/tutorials/07_Evaluation.ipynb
+++ b/docs/demos/tutorials/07_Evaluation.ipynb
@@ -1,1120 +1,1121 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "57b58c35",
- "metadata": {},
- "source": [
- "## Evaluation of prediction results\n",
- "\n",
- " \n",
- "
\n",
- "\n",
- "\n",
- "In the previous tutorial, we looked at various ways to visualise the results of our model.\n",
- "These are useful for evaluating a linkage pipeline because they allow us to understand how our model works and verify that it is doing something sensible. They can also be useful to identify examples where the model is not performing as expected.\n",
- "\n",
- "In addition to these spot checks, Splink also has functions to perform more formal accuracy analysis. These functions allow you to understand the likely prevalence of false positives and false negatives in your linkage models.\n",
- "\n",
- "They rely on the existence of a sample of labelled (ground truth) matches, which may have been produced (for example) by human beings. For the accuracy analysis to be unbiased, the sample should be representative of the overall dataset.\n"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "57b58c35",
+ "metadata": {},
+ "source": [
+ "## Evaluation of prediction results\n",
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "\n",
+ "In the previous tutorial, we looked at various ways to visualise the results of our model.\n",
+ "These are useful for evaluating a linkage pipeline because they allow us to understand how our model works and verify that it is doing something sensible. They can also be useful to identify examples where the model is not performing as expected.\n",
+ "\n",
+ "In addition to these spot checks, Splink also has functions to perform more formal accuracy analysis. These functions allow you to understand the likely prevalence of false positives and false negatives in your linkage models.\n",
+ "\n",
+ "They rely on the existence of a sample of labelled (ground truth) matches, which may have been produced (for example) by human beings. For the accuracy analysis to be unbiased, the sample should be representative of the overall dataset.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e08e61e5",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:15.075066Z",
+ "iopub.status.busy": "2024-07-18T13:59:15.074751Z",
+ "iopub.status.idle": "2024-07-18T13:59:15.095735Z",
+ "shell.execute_reply": "2024-07-18T13:59:15.094736Z"
},
+ "tags": [
+ "hide_input"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Uncomment and run this cell if you're running in Google Colab.\n",
+ "# !pip install splink"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb29d421",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:15.100802Z",
+ "iopub.status.busy": "2024-07-18T13:59:15.100475Z",
+ "iopub.status.idle": "2024-07-18T13:59:17.210056Z",
+ "shell.execute_reply": "2024-07-18T13:59:17.209284Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Rerun our predictions so we're ready to view the charts\n",
+ "import pandas as pd\n",
+ "\n",
+ "from splink import DuckDBAPI, Linker, splink_datasets\n",
+ "\n",
+ "pd.options.display.max_columns = 1000\n",
+ "\n",
+ "db_api = DuckDBAPI()\n",
+ "df = splink_datasets.fake_1000\n",
+ "df_sdf = db_api.register(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f88cc1c1",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:17.214467Z",
+ "iopub.status.busy": "2024-07-18T13:59:17.214127Z",
+ "iopub.status.idle": "2024-07-18T13:59:18.511128Z",
+ "shell.execute_reply": "2024-07-18T13:59:18.510248Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 1,
- "id": "e08e61e5",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:15.075066Z",
- "iopub.status.busy": "2024-07-18T13:59:15.074751Z",
- "iopub.status.idle": "2024-07-18T13:59:15.095735Z",
- "shell.execute_reply": "2024-07-18T13:59:15.094736Z"
- },
- "tags": [
- "hide_input"
- ]
- },
- "outputs": [],
- "source": [
- "# Uncomment and run this cell if you're running in Google Colab.\n",
- "# !pip install splink"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Blocking time: 0.02 seconds\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 2,
- "id": "fb29d421",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:15.100802Z",
- "iopub.status.busy": "2024-07-18T13:59:15.100475Z",
- "iopub.status.idle": "2024-07-18T13:59:17.210056Z",
- "shell.execute_reply": "2024-07-18T13:59:17.209284Z"
- }
- },
- "outputs": [],
- "source": [
- "# Rerun our predictions to we're ready to view the charts\n",
- "import pandas as pd\n",
- "\n",
- "from splink import DuckDBAPI, Linker, splink_datasets\n",
- "\n",
- "pd.options.display.max_columns = 1000\n",
- "\n",
- "db_api = DuckDBAPI()\n",
- "df = splink_datasets.fake_1000"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Predict time: 0.80 seconds\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 3,
- "id": "f88cc1c1",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:17.214467Z",
- "iopub.status.busy": "2024-07-18T13:59:17.214127Z",
- "iopub.status.idle": "2024-07-18T13:59:18.511128Z",
- "shell.execute_reply": "2024-07-18T13:59:18.510248Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Blocking time: 0.02 seconds\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Predict time: 0.80 seconds\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- " -- WARNING --\n",
- "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
- "Comparison: 'email':\n",
- " m values not fully trained\n"
- ]
- }
- ],
- "source": [
- "import json\n",
- "import urllib\n",
- "\n",
- "from splink import block_on\n",
- "\n",
- "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink/847e32508b1a9cdd7bcd2ca6c0a74e547fb69865/docs/demos/demo_settings/saved_model_from_demo.json\"\n",
- "\n",
- "with urllib.request.urlopen(url) as u:\n",
- " settings = json.loads(u.read().decode())\n",
- "\n",
- "# The data quality is very poor in this dataset, so we need looser blocking rules\n",
- "# to achieve decent recall\n",
- "settings[\"blocking_rules_to_generate_predictions\"] = [\n",
- " block_on(\"first_name\"),\n",
- " block_on(\"city\"),\n",
- " block_on(\"email\"),\n",
- " block_on(\"dob\"),\n",
- "]\n",
- "\n",
- "linker = Linker(df, settings, db_api=DuckDBAPI())\n",
- "df_predictions = linker.inference.predict(threshold_match_probability=0.01)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7b0dedd9",
- "metadata": {},
- "source": [
- "## Load in labels\n",
- "\n",
- "The labels file contains a list of pairwise comparisons which represent matches and non-matches.\n",
- "\n",
- "The required format of the labels file is described [here](https://moj-analytical-services.github.io/splink/api_docs/evaluation.html#splink.internals.linker_components.evaluation.LinkerEvalution.prediction_errors_from_labels_table).\n"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " -- WARNING --\n",
+ "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n",
+ "Comparison: 'email':\n",
+ " m values not fully trained\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "import urllib\n",
+ "\n",
+ "from splink import block_on\n",
+ "\n",
+ "url = \"https://raw.githubusercontent.com/moj-analytical-services/splink/847e32508b1a9cdd7bcd2ca6c0a74e547fb69865/docs/demos/demo_settings/saved_model_from_demo.json\"\n",
+ "\n",
+ "with urllib.request.urlopen(url) as u:\n",
+ " settings = json.loads(u.read().decode())\n",
+ "\n",
+ "# The data quality is very poor in this dataset, so we need looser blocking rules\n",
+ "# to achieve decent recall\n",
+ "settings[\"blocking_rules_to_generate_predictions\"] = [\n",
+ " block_on(\"first_name\"),\n",
+ " block_on(\"city\"),\n",
+ " block_on(\"email\"),\n",
+ " block_on(\"dob\"),\n",
+ "]\n",
+ "\n",
+ "linker = Linker(df_sdf, settings)\n",
+ "df_predictions = linker.inference.predict(threshold_match_probability=0.01)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b0dedd9",
+ "metadata": {},
+ "source": [
+ "## Load in labels\n",
+ "\n",
+ "The labels file contains a list of pairwise comparisons which represent matches and non-matches.\n",
+ "\n",
+ "The required format of the labels file is described [here](https://moj-analytical-services.github.io/splink/api_docs/evaluation.html#splink.internals.linker_components.evaluation.LinkerEvalution.prediction_errors_from_labels_table).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bbfdc70c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:18.515644Z",
+ "iopub.status.busy": "2024-07-18T13:59:18.515122Z",
+ "iopub.status.idle": "2024-07-18T13:59:18.552541Z",
+ "shell.execute_reply": "2024-07-18T13:59:18.551821Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 4,
- "id": "bbfdc70c",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:18.515644Z",
- "iopub.status.busy": "2024-07-18T13:59:18.515122Z",
- "iopub.status.idle": "2024-07-18T13:59:18.552541Z",
- "shell.execute_reply": "2024-07-18T13:59:18.551821Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " unique_id_l | \n",
- " source_dataset_l | \n",
- " unique_id_r | \n",
- " source_dataset_r | \n",
- " clerical_match_score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 1 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 2 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 3 | \n",
- " fake_1000 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 4 | \n",
- " fake_1000 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 0 | \n",
- " fake_1000 | \n",
- " 5 | \n",
- " fake_1000 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n",
- "0 0 fake_1000 1 fake_1000 \n",
- "1 0 fake_1000 2 fake_1000 \n",
- "2 0 fake_1000 3 fake_1000 \n",
- "3 0 fake_1000 4 fake_1000 \n",
- "4 0 fake_1000 5 fake_1000 \n",
- "\n",
- " clerical_match_score \n",
- "0 1.0 \n",
- "1 1.0 \n",
- "2 1.0 \n",
- "3 0.0 \n",
- "4 0.0 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unique_id_l | \n",
+ " source_dataset_l | \n",
+ " unique_id_r | \n",
+ " source_dataset_r | \n",
+ " clerical_match_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 1 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 2 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 3 | \n",
+ " fake_1000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 4 | \n",
+ " fake_1000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " fake_1000 | \n",
+ " 5 | \n",
+ " fake_1000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "from splink.datasets import splink_dataset_labels\n",
- "\n",
- "df_labels = splink_dataset_labels.fake_1000_labels\n",
- "labels_table = linker.table_management.register_labels_table(df_labels)\n",
- "df_labels.head(5)"
+ "text/plain": [
+ " unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n",
+ "0 0 fake_1000 1 fake_1000 \n",
+ "1 0 fake_1000 2 fake_1000 \n",
+ "2 0 fake_1000 3 fake_1000 \n",
+ "3 0 fake_1000 4 fake_1000 \n",
+ "4 0 fake_1000 5 fake_1000 \n",
+ "\n",
+ " clerical_match_score \n",
+ "0 1.0 \n",
+ "1 1.0 \n",
+ "2 1.0 \n",
+ "3 0.0 \n",
+ "4 0.0 "
]
- },
- {
- "cell_type": "markdown",
- "id": "ff86458e",
- "metadata": {},
- "source": [
- "## View examples of false positives and false negatives"
- ]
- },
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from splink.datasets import splink_dataset_labels\n",
+ "\n",
+ "df_labels = splink_dataset_labels.fake_1000_labels\n",
+ "labels_table = linker.table_management.register_labels_table(df_labels)\n",
+ "df_labels.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff86458e",
+ "metadata": {},
+ "source": [
+ "## View examples of false positives and false negatives"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c5b3deb6",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:18.556625Z",
+ "iopub.status.busy": "2024-07-18T13:59:18.556304Z",
+ "iopub.status.idle": "2024-07-18T13:59:19.797703Z",
+ "shell.execute_reply": "2024-07-18T13:59:19.797008Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "id": "c5b3deb6",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:18.556625Z",
- "iopub.status.busy": "2024-07-18T13:59:18.556304Z",
- "iopub.status.idle": "2024-07-18T13:59:19.797703Z",
- "shell.execute_reply": "2024-07-18T13:59:19.797008Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "splink_df = linker.evaluation.prediction_errors_from_labels_table(\n",
- " labels_table, include_false_negatives=True, include_false_positives=False\n",
- ")\n",
- "false_negatives = splink_df.as_record_dict(limit=5)\n",
- "linker.visualisations.waterfall_chart(false_negatives)"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
- {
- "cell_type": "markdown",
- "id": "c4fe30d3",
- "metadata": {},
- "source": [
- "### False positives"
- ]
- },
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "splink_df = linker.evaluation.prediction_errors_from_labels_table(\n",
+ " labels_table, include_false_negatives=True, include_false_positives=False\n",
+ ")\n",
+ "false_negatives = splink_df.as_record_dict(limit=5)\n",
+ "linker.visualisations.waterfall_chart(false_negatives)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c4fe30d3",
+ "metadata": {},
+ "source": [
+ "### False positives"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "f8f816e2",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:19.801020Z",
+ "iopub.status.busy": "2024-07-18T13:59:19.800643Z",
+ "iopub.status.idle": "2024-07-18T13:59:20.908287Z",
+ "shell.execute_reply": "2024-07-18T13:59:20.907746Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 6,
- "id": "f8f816e2",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:19.801020Z",
- "iopub.status.busy": "2024-07-18T13:59:19.800643Z",
- "iopub.status.idle": "2024-07-18T13:59:20.908287Z",
- "shell.execute_reply": "2024-07-18T13:59:20.907746Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "# Note I've picked a threshold match probability of 0.01 here because otherwise\n",
- "# in this simple example there are no false positives\n",
- "splink_df = linker.evaluation.prediction_errors_from_labels_table(\n",
- " labels_table, include_false_negatives=False, include_false_positives=True, threshold_match_probability=0.01\n",
- ")\n",
- "false_postives = splink_df.as_record_dict(limit=5)\n",
- "linker.visualisations.waterfall_chart(false_postives)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "030c4cde",
- "metadata": {},
- "source": [
- "## Threshold Selection chart\n",
- "\n",
- "Splink includes an interactive dashboard that shows key accuracy statistics:\n"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
- },
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Note I've picked a threshold match probability of 0.01 here because otherwise\n",
+ "# in this simple example there are no false positives\n",
+ "splink_df = linker.evaluation.prediction_errors_from_labels_table(\n",
+ " labels_table, include_false_negatives=False, include_false_positives=True, threshold_match_probability=0.01\n",
+ ")\n",
+ "false_postives = splink_df.as_record_dict(limit=5)\n",
+ "linker.visualisations.waterfall_chart(false_postives)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "030c4cde",
+ "metadata": {},
+ "source": [
+ "## Threshold Selection chart\n",
+ "\n",
+ "Splink includes an interactive dashboard that shows key accuracy statistics:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "e83d9645",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:20.911371Z",
+ "iopub.status.busy": "2024-07-18T13:59:20.911132Z",
+ "iopub.status.idle": "2024-07-18T13:59:22.696520Z",
+ "shell.execute_reply": "2024-07-18T13:59:22.695643Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 7,
- "id": "e83d9645",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:20.911371Z",
- "iopub.status.busy": "2024-07-18T13:59:20.911132Z",
- "iopub.status.idle": "2024-07-18T13:59:22.696520Z",
- "shell.execute_reply": "2024-07-18T13:59:22.695643Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_table(\n",
- " labels_table, output_type=\"threshold_selection\", add_metrics=[\"f1\"]\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "81e4396d",
- "metadata": {},
- "source": [
- "## Receiver operating characteristic curve\n",
- "\n",
- "A [ROC chart](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) shows how the number of false positives and false negatives varies depending on the match threshold chosen. The match threshold is the match weight chosen as a cutoff for which pairwise comparisons to accept as matches.\n"
+ "text/plain": [
+ "alt.HConcatChart(...)"
]
- },
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_table(\n",
+ " labels_table, output_type=\"threshold_selection\", add_metrics=[\"f1\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81e4396d",
+ "metadata": {},
+ "source": [
+ "## Receiver operating characteristic curve\n",
+ "\n",
+ "A [ROC chart](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) shows how the number of false positives and false negatives varies depending on the match threshold chosen. The match threshold is the match weight chosen as a cutoff for which pairwise comparisons to accept as matches.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "01dd7eec",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:22.701493Z",
+ "iopub.status.busy": "2024-07-18T13:59:22.701163Z",
+ "iopub.status.idle": "2024-07-18T13:59:23.282190Z",
+ "shell.execute_reply": "2024-07-18T13:59:23.281409Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "id": "01dd7eec",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:22.701493Z",
- "iopub.status.busy": "2024-07-18T13:59:22.701163Z",
- "iopub.status.idle": "2024-07-18T13:59:23.282190Z",
- "shell.execute_reply": "2024-07-18T13:59:23.281409Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.evaluation.accuracy_analysis_from_labels_table(labels_table, output_type=\"roc\")"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
- {
- "cell_type": "markdown",
- "id": "12e6ba74",
- "metadata": {},
- "source": [
- "## Truth table\n",
- "\n",
- "Finally, Splink can also report the underlying table used to construct the ROC and precision recall curves.\n"
- ]
- },
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "linker.evaluation.accuracy_analysis_from_labels_table(labels_table, output_type=\"roc\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "12e6ba74",
+ "metadata": {},
+ "source": [
+ "## Truth table\n",
+ "\n",
+ "Finally, Splink can also report the underlying table used to construct the ROC and precision recall curves.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f7c283ba",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:23.286740Z",
+ "iopub.status.busy": "2024-07-18T13:59:23.286467Z",
+ "iopub.status.idle": "2024-07-18T13:59:23.494911Z",
+ "shell.execute_reply": "2024-07-18T13:59:23.494348Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "id": "f7c283ba",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:23.286740Z",
- "iopub.status.busy": "2024-07-18T13:59:23.286467Z",
- "iopub.status.idle": "2024-07-18T13:59:23.494911Z",
- "shell.execute_reply": "2024-07-18T13:59:23.494348Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " truth_threshold | \n",
- " match_probability | \n",
- " total_clerical_labels | \n",
- " p | \n",
- " n | \n",
- " tp | \n",
- " tn | \n",
- " fp | \n",
- " fn | \n",
- " P_rate | \n",
- " N_rate | \n",
- " tp_rate | \n",
- " tn_rate | \n",
- " fp_rate | \n",
- " fn_rate | \n",
- " precision | \n",
- " recall | \n",
- " specificity | \n",
- " npv | \n",
- " accuracy | \n",
- " f1 | \n",
- " f2 | \n",
- " f0_5 | \n",
- " p4 | \n",
- " phi | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " -18.9 | \n",
- " 0.000002 | \n",
- " 3176.0 | \n",
- " 2031.0 | \n",
- " 1145.0 | \n",
- " 1709.0 | \n",
- " 1103.0 | \n",
- " 42.0 | \n",
- " 322.0 | \n",
- " 0.639484 | \n",
- " 0.360516 | \n",
- " 0.841457 | \n",
- " 0.963319 | \n",
- " 0.036681 | \n",
- " 0.158543 | \n",
- " 0.976014 | \n",
- " 0.841457 | \n",
- " 0.963319 | \n",
- " 0.774035 | \n",
- " 0.885390 | \n",
- " 0.903755 | \n",
- " 0.865316 | \n",
- " 0.945766 | \n",
- " 0.880476 | \n",
- " 0.776931 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " -16.7 | \n",
- " 0.000009 | \n",
- " 3176.0 | \n",
- " 2031.0 | \n",
- " 1145.0 | \n",
- " 1709.0 | \n",
- " 1119.0 | \n",
- " 26.0 | \n",
- " 322.0 | \n",
- " 0.639484 | \n",
- " 0.360516 | \n",
- " 0.841457 | \n",
- " 0.977293 | \n",
- " 0.022707 | \n",
- " 0.158543 | \n",
- " 0.985014 | \n",
- " 0.841457 | \n",
- " 0.977293 | \n",
- " 0.776544 | \n",
- " 0.890428 | \n",
- " 0.907594 | \n",
- " 0.866721 | \n",
- " 0.952514 | \n",
- " 0.886010 | \n",
- " 0.789637 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " -12.8 | \n",
- " 0.000140 | \n",
- " 3176.0 | \n",
- " 2031.0 | \n",
- " 1145.0 | \n",
- " 1709.0 | \n",
- " 1125.0 | \n",
- " 20.0 | \n",
- " 322.0 | \n",
- " 0.639484 | \n",
- " 0.360516 | \n",
- " 0.841457 | \n",
- " 0.982533 | \n",
- " 0.017467 | \n",
- " 0.158543 | \n",
- " 0.988433 | \n",
- " 0.841457 | \n",
- " 0.982533 | \n",
- " 0.777471 | \n",
- " 0.892317 | \n",
- " 0.909043 | \n",
- " 0.867249 | \n",
- " 0.955069 | \n",
- " 0.888076 | \n",
- " 0.794416 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " -12.5 | \n",
- " 0.000173 | \n",
- " 3176.0 | \n",
- " 2031.0 | \n",
- " 1145.0 | \n",
- " 1708.0 | \n",
- " 1125.0 | \n",
- " 20.0 | \n",
- " 323.0 | \n",
- " 0.639484 | \n",
- " 0.360516 | \n",
- " 0.840965 | \n",
- " 0.982533 | \n",
- " 0.017467 | \n",
- " 0.159035 | \n",
- " 0.988426 | \n",
- " 0.840965 | \n",
- " 0.982533 | \n",
- " 0.776934 | \n",
- " 0.892003 | \n",
- " 0.908752 | \n",
- " 0.866829 | \n",
- " 0.954937 | \n",
- " 0.887763 | \n",
- " 0.793897 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " -12.4 | \n",
- " 0.000185 | \n",
- " 3176.0 | \n",
- " 2031.0 | \n",
- " 1145.0 | \n",
- " 1705.0 | \n",
- " 1132.0 | \n",
- " 13.0 | \n",
- " 326.0 | \n",
- " 0.639484 | \n",
- " 0.360516 | \n",
- " 0.839488 | \n",
- " 0.988646 | \n",
- " 0.011354 | \n",
- " 0.160512 | \n",
- " 0.992433 | \n",
- " 0.839488 | \n",
- " 0.988646 | \n",
- " 0.776406 | \n",
- " 0.893262 | \n",
- " 0.909576 | \n",
- " 0.866186 | \n",
- " 0.957542 | \n",
- " 0.889225 | \n",
- " 0.797936 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " truth_threshold match_probability total_clerical_labels p n \\\n",
- "0 -18.9 0.000002 3176.0 2031.0 1145.0 \n",
- "1 -16.7 0.000009 3176.0 2031.0 1145.0 \n",
- "2 -12.8 0.000140 3176.0 2031.0 1145.0 \n",
- "3 -12.5 0.000173 3176.0 2031.0 1145.0 \n",
- "4 -12.4 0.000185 3176.0 2031.0 1145.0 \n",
- "\n",
- " tp tn fp fn P_rate N_rate tp_rate tn_rate \\\n",
- "0 1709.0 1103.0 42.0 322.0 0.639484 0.360516 0.841457 0.963319 \n",
- "1 1709.0 1119.0 26.0 322.0 0.639484 0.360516 0.841457 0.977293 \n",
- "2 1709.0 1125.0 20.0 322.0 0.639484 0.360516 0.841457 0.982533 \n",
- "3 1708.0 1125.0 20.0 323.0 0.639484 0.360516 0.840965 0.982533 \n",
- "4 1705.0 1132.0 13.0 326.0 0.639484 0.360516 0.839488 0.988646 \n",
- "\n",
- " fp_rate fn_rate precision recall specificity npv accuracy \\\n",
- "0 0.036681 0.158543 0.976014 0.841457 0.963319 0.774035 0.885390 \n",
- "1 0.022707 0.158543 0.985014 0.841457 0.977293 0.776544 0.890428 \n",
- "2 0.017467 0.158543 0.988433 0.841457 0.982533 0.777471 0.892317 \n",
- "3 0.017467 0.159035 0.988426 0.840965 0.982533 0.776934 0.892003 \n",
- "4 0.011354 0.160512 0.992433 0.839488 0.988646 0.776406 0.893262 \n",
- "\n",
- " f1 f2 f0_5 p4 phi \n",
- "0 0.903755 0.865316 0.945766 0.880476 0.776931 \n",
- "1 0.907594 0.866721 0.952514 0.886010 0.789637 \n",
- "2 0.909043 0.867249 0.955069 0.888076 0.794416 \n",
- "3 0.908752 0.866829 0.954937 0.887763 0.793897 \n",
- "4 0.909576 0.866186 0.957542 0.889225 0.797936 "
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " truth_threshold | \n",
+ " match_probability | \n",
+ " total_clerical_labels | \n",
+ " p | \n",
+ " n | \n",
+ " tp | \n",
+ " tn | \n",
+ " fp | \n",
+ " fn | \n",
+ " P_rate | \n",
+ " N_rate | \n",
+ " tp_rate | \n",
+ " tn_rate | \n",
+ " fp_rate | \n",
+ " fn_rate | \n",
+ " precision | \n",
+ " recall | \n",
+ " specificity | \n",
+ " npv | \n",
+ " accuracy | \n",
+ " f1 | \n",
+ " f2 | \n",
+ " f0_5 | \n",
+ " p4 | \n",
+ " phi | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " -18.9 | \n",
+ " 0.000002 | \n",
+ " 3176.0 | \n",
+ " 2031.0 | \n",
+ " 1145.0 | \n",
+ " 1709.0 | \n",
+ " 1103.0 | \n",
+ " 42.0 | \n",
+ " 322.0 | \n",
+ " 0.639484 | \n",
+ " 0.360516 | \n",
+ " 0.841457 | \n",
+ " 0.963319 | \n",
+ " 0.036681 | \n",
+ " 0.158543 | \n",
+ " 0.976014 | \n",
+ " 0.841457 | \n",
+ " 0.963319 | \n",
+ " 0.774035 | \n",
+ " 0.885390 | \n",
+ " 0.903755 | \n",
+ " 0.865316 | \n",
+ " 0.945766 | \n",
+ " 0.880476 | \n",
+ " 0.776931 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " -16.7 | \n",
+ " 0.000009 | \n",
+ " 3176.0 | \n",
+ " 2031.0 | \n",
+ " 1145.0 | \n",
+ " 1709.0 | \n",
+ " 1119.0 | \n",
+ " 26.0 | \n",
+ " 322.0 | \n",
+ " 0.639484 | \n",
+ " 0.360516 | \n",
+ " 0.841457 | \n",
+ " 0.977293 | \n",
+ " 0.022707 | \n",
+ " 0.158543 | \n",
+ " 0.985014 | \n",
+ " 0.841457 | \n",
+ " 0.977293 | \n",
+ " 0.776544 | \n",
+ " 0.890428 | \n",
+ " 0.907594 | \n",
+ " 0.866721 | \n",
+ " 0.952514 | \n",
+ " 0.886010 | \n",
+ " 0.789637 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " -12.8 | \n",
+ " 0.000140 | \n",
+ " 3176.0 | \n",
+ " 2031.0 | \n",
+ " 1145.0 | \n",
+ " 1709.0 | \n",
+ " 1125.0 | \n",
+ " 20.0 | \n",
+ " 322.0 | \n",
+ " 0.639484 | \n",
+ " 0.360516 | \n",
+ " 0.841457 | \n",
+ " 0.982533 | \n",
+ " 0.017467 | \n",
+ " 0.158543 | \n",
+ " 0.988433 | \n",
+ " 0.841457 | \n",
+ " 0.982533 | \n",
+ " 0.777471 | \n",
+ " 0.892317 | \n",
+ " 0.909043 | \n",
+ " 0.867249 | \n",
+ " 0.955069 | \n",
+ " 0.888076 | \n",
+ " 0.794416 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " -12.5 | \n",
+ " 0.000173 | \n",
+ " 3176.0 | \n",
+ " 2031.0 | \n",
+ " 1145.0 | \n",
+ " 1708.0 | \n",
+ " 1125.0 | \n",
+ " 20.0 | \n",
+ " 323.0 | \n",
+ " 0.639484 | \n",
+ " 0.360516 | \n",
+ " 0.840965 | \n",
+ " 0.982533 | \n",
+ " 0.017467 | \n",
+ " 0.159035 | \n",
+ " 0.988426 | \n",
+ " 0.840965 | \n",
+ " 0.982533 | \n",
+ " 0.776934 | \n",
+ " 0.892003 | \n",
+ " 0.908752 | \n",
+ " 0.866829 | \n",
+ " 0.954937 | \n",
+ " 0.887763 | \n",
+ " 0.793897 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " -12.4 | \n",
+ " 0.000185 | \n",
+ " 3176.0 | \n",
+ " 2031.0 | \n",
+ " 1145.0 | \n",
+ " 1705.0 | \n",
+ " 1132.0 | \n",
+ " 13.0 | \n",
+ " 326.0 | \n",
+ " 0.639484 | \n",
+ " 0.360516 | \n",
+ " 0.839488 | \n",
+ " 0.988646 | \n",
+ " 0.011354 | \n",
+ " 0.160512 | \n",
+ " 0.992433 | \n",
+ " 0.839488 | \n",
+ " 0.988646 | \n",
+ " 0.776406 | \n",
+ " 0.893262 | \n",
+ " 0.909576 | \n",
+ " 0.866186 | \n",
+ " 0.957542 | \n",
+ " 0.889225 | \n",
+ " 0.797936 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "roc_table = linker.evaluation.accuracy_analysis_from_labels_table(\n",
- " labels_table, output_type=\"table\"\n",
- ")\n",
- "roc_table.as_pandas_dataframe(limit=5)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a0d6e855",
- "metadata": {},
- "source": [
- "## Unlinkables chart\n",
- "\n",
- "Finally, it can be interesting to analyse whether your dataset contains any 'unlinkable' records.\n",
- "\n",
- "'Unlinkable records' are records with such poor data quality they don't even link to themselves at a high enough probability to be accepted as matches\n",
- "\n",
- "For example, in a typical linkage problem, a 'John Smith' record with nulls for their address and postcode may be unlinkable. By 'unlinkable' we don't mean there are no matches; rather, we mean it is not possible to determine whether there are matches.UnicodeTranslateError\n",
- "\n",
- "A high proportion of unlinkable records is an indication of poor quality in the input dataset"
+ "text/plain": [
+ " truth_threshold match_probability total_clerical_labels p n \\\n",
+ "0 -18.9 0.000002 3176.0 2031.0 1145.0 \n",
+ "1 -16.7 0.000009 3176.0 2031.0 1145.0 \n",
+ "2 -12.8 0.000140 3176.0 2031.0 1145.0 \n",
+ "3 -12.5 0.000173 3176.0 2031.0 1145.0 \n",
+ "4 -12.4 0.000185 3176.0 2031.0 1145.0 \n",
+ "\n",
+ " tp tn fp fn P_rate N_rate tp_rate tn_rate \\\n",
+ "0 1709.0 1103.0 42.0 322.0 0.639484 0.360516 0.841457 0.963319 \n",
+ "1 1709.0 1119.0 26.0 322.0 0.639484 0.360516 0.841457 0.977293 \n",
+ "2 1709.0 1125.0 20.0 322.0 0.639484 0.360516 0.841457 0.982533 \n",
+ "3 1708.0 1125.0 20.0 323.0 0.639484 0.360516 0.840965 0.982533 \n",
+ "4 1705.0 1132.0 13.0 326.0 0.639484 0.360516 0.839488 0.988646 \n",
+ "\n",
+ " fp_rate fn_rate precision recall specificity npv accuracy \\\n",
+ "0 0.036681 0.158543 0.976014 0.841457 0.963319 0.774035 0.885390 \n",
+ "1 0.022707 0.158543 0.985014 0.841457 0.977293 0.776544 0.890428 \n",
+ "2 0.017467 0.158543 0.988433 0.841457 0.982533 0.777471 0.892317 \n",
+ "3 0.017467 0.159035 0.988426 0.840965 0.982533 0.776934 0.892003 \n",
+ "4 0.011354 0.160512 0.992433 0.839488 0.988646 0.776406 0.893262 \n",
+ "\n",
+ " f1 f2 f0_5 p4 phi \n",
+ "0 0.903755 0.865316 0.945766 0.880476 0.776931 \n",
+ "1 0.907594 0.866721 0.952514 0.886010 0.789637 \n",
+ "2 0.909043 0.867249 0.955069 0.888076 0.794416 \n",
+ "3 0.908752 0.866829 0.954937 0.887763 0.793897 \n",
+ "4 0.909576 0.866186 0.957542 0.889225 0.797936 "
]
- },
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roc_table = linker.evaluation.accuracy_analysis_from_labels_table(\n",
+ " labels_table, output_type=\"table\"\n",
+ ")\n",
+ "roc_table.as_pandas_dataframe(limit=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0d6e855",
+ "metadata": {},
+ "source": [
+ "## Unlinkables chart\n",
+ "\n",
+ "Finally, it can be interesting to analyse whether your dataset contains any 'unlinkable' records.\n",
+ "\n",
+ "'Unlinkable records' are records with such poor data quality they don't even link to themselves at a high enough probability to be accepted as matches\n",
+ "\n",
+ "For example, in a typical linkage problem, a 'John Smith' record with nulls for their address and postcode may be unlinkable. By 'unlinkable' we don't mean there are no matches; rather, we mean it is not possible to determine whether there are matches.\n",
+ "\n",
+ "A high proportion of unlinkable records is an indication of poor quality in the input dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "421013d0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-07-18T13:59:23.498601Z",
+ "iopub.status.busy": "2024-07-18T13:59:23.498116Z",
+ "iopub.status.idle": "2024-07-18T13:59:23.867219Z",
+ "shell.execute_reply": "2024-07-18T13:59:23.866532Z"
+ }
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "id": "421013d0",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2024-07-18T13:59:23.498601Z",
- "iopub.status.busy": "2024-07-18T13:59:23.498116Z",
- "iopub.status.idle": "2024-07-18T13:59:23.867219Z",
- "shell.execute_reply": "2024-07-18T13:59:23.866532Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.LayerChart(...)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "linker.evaluation.unlinkables_chart()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "08199ce7",
- "metadata": {},
- "source": [
- "For this dataset and this trained model, we can see that most records are (theoretically) linkable: At a match weight 6, around around 99% of records could be linked to themselves."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f2d6d5f4",
- "metadata": {},
- "source": [
- "!!! note \"Further Reading\"\n",
- "\n",
- " :material-tools: For more on the quality assurance tools in Splink, please refer to the [Evaluation API documentation](../../api_docs/evaluation.md).\n",
- "\n",
- " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#model-evaluation).\n",
- "\n",
- " :material-thumbs-up-down: For more on the Evaluation Metrics used in this tutorial, please refer to the [Edge Metrics guide.](../../topic_guides/evaluation/edge_metrics.md)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b7ee8f2d",
- "metadata": {},
- "source": [
- "## :material-flag-checkered: That's it!\n",
- "\n",
- "That wraps up the Splink tutorial! Don't worry, there are still plenty of resources to help on the next steps of your Splink journey:\n",
- "\n",
- ":octicons-link-16: For some end-to-end notebooks of Splink pipelines, check out our [Examples](../examples/examples_index.md)\n",
- "\n",
- ":simple-readme: For more deepdives into the different aspects of Splink, and record linkage more generally, check out our [Topic Guides](../../topic_guides/topic_guides_index.md)\n",
- "\n",
- ":material-tools: For a reference on all the functionality avalable in Splink, see our [Documentation](../../api_docs/api_docs_index.md)\n"
+ "text/plain": [
+ "alt.LayerChart(...)"
]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ ],
+ "source": [
+ "linker.evaluation.unlinkables_chart()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08199ce7",
+ "metadata": {},
+ "source": [
+ "For this dataset and this trained model, we can see that most records are (theoretically) linkable: At a match weight of 6, around 99% of records could be linked to themselves."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f2d6d5f4",
+ "metadata": {},
+ "source": [
+ "!!! note \"Further Reading\"\n",
+ "\n",
+ " :material-tools: For more on the quality assurance tools in Splink, please refer to the [Evaluation API documentation](../../api_docs/evaluation.md).\n",
+ "\n",
+ " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#model-evaluation).\n",
+ "\n",
+ " :material-thumbs-up-down: For more on the Evaluation Metrics used in this tutorial, please refer to the [Edge Metrics guide.](../../topic_guides/evaluation/edge_metrics.md)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7ee8f2d",
+ "metadata": {},
+ "source": [
+ "## :material-flag-checkered: That's it!\n",
+ "\n",
+ "That wraps up the Splink tutorial! Don't worry, there are still plenty of resources to help on the next steps of your Splink journey:\n",
+ "\n",
+ ":octicons-link-16: For some end-to-end notebooks of Splink pipelines, check out our [Examples](../examples/examples_index.md)\n",
+ "\n",
+ ":simple-readme: For more deepdives into the different aspects of Splink, and record linkage more generally, check out our [Topic Guides](../../topic_guides/topic_guides_index.md)\n",
+ "\n",
+ ":material-tools: For a reference on all the functionality available in Splink, see our [Documentation](../../api_docs/api_docs_index.md)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 5
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py
index cdfb9cc9c6..9891ab1a45 100644
--- a/splink/internals/blocking_analysis.py
+++ b/splink/internals/blocking_analysis.py
@@ -21,11 +21,15 @@
ChartReturnType,
cumulative_blocking_rule_comparisons_generated,
)
-from splink.internals.database_api import AcceptableInputTableType, DatabaseAPISubClass
+from splink.internals.database_api import DatabaseAPISubClass
from splink.internals.input_column import InputColumn
from splink.internals.misc import calculate_cartesian, ensure_is_iterable
from splink.internals.pipeline import CTEPipeline
from splink.internals.splink_dataframe import SplinkDataFrame
+from splink.internals.splinkdataframe_utils import (
+ get_db_api_from_inputs,
+ splink_dataframes_to_dict,
+)
from splink.internals.vertically_concatenate import (
split_df_concat_with_tf_into_two_tables_sqls,
vertically_concatenate_sql,
@@ -502,11 +506,10 @@ def add_l_r(sql, table_name):
def count_comparisons_from_blocking_rule(
+ splink_dataframe_or_dataframes: SplinkDataFrame | Sequence[SplinkDataFrame],
*,
- table_or_tables: Sequence[AcceptableInputTableType],
blocking_rule: Union[BlockingRuleCreator, str, Dict[str, Any]],
link_type: user_input_link_type_options,
- db_api: DatabaseAPISubClass,
unique_id_column_name: str = "unique_id",
source_dataset_column_name: Optional[str] = None,
compute_post_filter_count: bool = True,
@@ -518,12 +521,12 @@ def count_comparisons_from_blocking_rule(
[here]("https://moj-analytical-services.github.io/splink/topic_guides/blocking/performance.html?h=filter+cond#filter-conditions")
Args:
- table_or_tables (dataframe, str): Input data
+ splink_dataframe_or_dataframes (SplinkDataFrame | Sequence[SplinkDataFrame]):
+ Input data
blocking_rule (Union[BlockingRuleCreator, str, Dict[str, Any]]): The blocking
rule to analyse
link_type (user_input_link_type_options): The link type - "link_only",
"dedupe_only" or "link_and_dedupe"
- db_api (DatabaseAPISubClass): Database API
unique_id_column_name (str, optional): Defaults to "unique_id".
source_dataset_column_name (Optional[str], optional): Defaults to None.
compute_post_filter_count (bool, optional): Whether to use a slower methodology
@@ -536,13 +539,14 @@ def count_comparisons_from_blocking_rule(
Returns:
dict[str, Union[int, str]]: A dictionary containing the results
"""
+ db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
# Ensure what's been passed in is a BlockingRuleCreator
blocking_rule_creator = to_blocking_rule_creator(blocking_rule).get_blocking_rule(
db_api.sql_dialect.sql_dialect_str
)
- splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+ splink_df_dict = splink_dataframes_to_dict(splink_dataframe_or_dataframes)
source_dataset_input_column, unique_id_input_column = _process_unique_id_columns(
unique_id_column_name,
@@ -565,17 +569,17 @@ def count_comparisons_from_blocking_rule(
def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
+ splink_dataframe_or_dataframes: SplinkDataFrame | Sequence[SplinkDataFrame],
*,
- table_or_tables: Sequence[AcceptableInputTableType],
blocking_rules: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
link_type: user_input_link_type_options,
- db_api: DatabaseAPISubClass,
unique_id_column_name: str = "unique_id",
max_rows_limit: int = int(1e9),
source_dataset_column_name: Optional[str] = None,
) -> pd.DataFrame:
"""TODO: Add docstring here"""
- splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+ db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
+ splink_df_dict = splink_dataframes_to_dict(splink_dataframe_or_dataframes)
# whilst they're named blocking_rules, this is actually a list of
# BlockingRuleCreators. The followign code turns them into BlockingRule objects
@@ -609,17 +613,17 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
def cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+ splink_dataframe_or_dataframes: SplinkDataFrame | Sequence[SplinkDataFrame],
*,
- table_or_tables: Sequence[AcceptableInputTableType],
blocking_rules: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
link_type: user_input_link_type_options,
- db_api: DatabaseAPISubClass,
unique_id_column_name: str = "unique_id",
max_rows_limit: int = int(1e9),
source_dataset_column_name: Optional[str] = None,
) -> ChartReturnType:
"""TODO: Add docstring here"""
- splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+ db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
+ splink_df_dict = splink_dataframes_to_dict(splink_dataframe_or_dataframes)
# whilst they're named blocking_rules, this is actually a list of
# BlockingRuleCreators. The followign code turns them into BlockingRule objects
@@ -657,11 +661,10 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
def n_largest_blocks(
+ splink_dataframe_or_dataframes: SplinkDataFrame | Sequence[SplinkDataFrame],
*,
- table_or_tables: Sequence[AcceptableInputTableType],
blocking_rule: Union[BlockingRuleCreator, str, Dict[str, Any]],
link_type: user_input_link_type_options,
- db_api: DatabaseAPISubClass,
n_largest: int = 5,
) -> "SplinkDataFrame":
"""Find the values responsible for creating the largest blocks of records.
@@ -675,22 +678,24 @@ def n_largest_blocks(
[here]("https://moj-analytical-services.github.io/splink/topic_guides/blocking/performance.html?h=filter+cond#filter-conditions")
Args:
- table_or_tables (dataframe, str): Input data
+ splink_dataframe_or_dataframes (SplinkDataFrame | Sequence[SplinkDataFrame]):
+ Input data
blocking_rule (Union[BlockingRuleCreator, str, Dict[str, Any]]): The blocking
rule to analyse
link_type (user_input_link_type_options): The link type - "link_only",
"dedupe_only" or "link_and_dedupe"
- db_api (DatabaseAPISubClass): Database API
n_largest (int, optional): How many rows to return. Defaults to 5.
Returns:
SplinkDataFrame: A dataframe containing the n_largest blocks
"""
+ db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
+
blocking_rule_as_br = to_blocking_rule_creator(blocking_rule).get_blocking_rule(
db_api.sql_dialect.sql_dialect_str
)
- splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+ splink_df_dict = splink_dataframes_to_dict(splink_dataframe_or_dataframes)
sqls = _count_comparisons_from_blocking_rule_pre_filter_conditions_sqls(
splink_df_dict, blocking_rule_as_br, link_type, db_api
diff --git a/splink/internals/clustering.py b/splink/internals/clustering.py
index 3b88d3dfa6..ae926ad36b 100644
--- a/splink/internals/clustering.py
+++ b/splink/internals/clustering.py
@@ -123,12 +123,12 @@ def cluster_pairwise_predictions_at_threshold(
if isinstance(nodes, SplinkDataFrame):
nodes_sdf = nodes
else:
- nodes_sdf = db_api.register_table(nodes, f"__splink__df_nodes_{uid}")
+ nodes_sdf = db_api._create_backend_table(nodes, f"__splink__df_nodes_{uid}")
if isinstance(edges, SplinkDataFrame):
edges_sdf = edges
else:
- edges_sdf = db_api.register_table(edges, f"__splink__df_edges_{uid}")
+ edges_sdf = db_api._create_backend_table(edges, f"__splink__df_edges_{uid}")
edge_id_column_name_left, edge_id_column_name_right = _get_edge_id_column_names(
node_id_column_name,
@@ -438,14 +438,14 @@ def cluster_pairwise_predictions_at_multiple_thresholds(
# Input could either be user data, or a SplinkDataFrame
tid = ascii_uid(8)
if not isinstance(nodes, SplinkDataFrame):
- nodes_sdf = db_api.register_table(
+ nodes_sdf = db_api._create_backend_table(
nodes, f"__splink__df_nodes_{tid}", overwrite=True
)
else:
nodes_sdf = nodes
if not isinstance(edges, SplinkDataFrame):
- edges_sdf = db_api.register_table(
+ edges_sdf = db_api._create_backend_table(
edges, f"__splink__df_edges_{tid}", overwrite=True
)
else:
diff --git a/splink/internals/completeness.py b/splink/internals/completeness.py
index 3a89237832..111d9999b6 100644
--- a/splink/internals/completeness.py
+++ b/splink/internals/completeness.py
@@ -8,10 +8,14 @@
from splink.internals.charts import (
completeness_chart as records_to_completeness_chart,
)
-from splink.internals.database_api import AcceptableInputTableType, DatabaseAPISubClass
+from splink.internals.database_api import DatabaseAPISubClass
from splink.internals.input_column import InputColumn
from splink.internals.pipeline import CTEPipeline
from splink.internals.splink_dataframe import SplinkDataFrame
+from splink.internals.splinkdataframe_utils import (
+ get_db_api_from_inputs,
+ splink_dataframes_to_dict,
+)
from splink.internals.vertically_concatenate import vertically_concatenate_sql
@@ -108,8 +112,7 @@ def completeness_data(
def completeness_chart(
- table_or_tables: Sequence[AcceptableInputTableType],
- db_api: DatabaseAPISubClass,
+ splink_dataframe_or_dataframes: SplinkDataFrame | Sequence[SplinkDataFrame],
cols: List[str] = None,
table_names_for_chart: List[str] = None,
) -> ChartReturnType:
@@ -118,14 +121,14 @@ def completeness_chart(
for all columns in the input data.
Args:
- table_or_tables: A single table or a list of tables of data
- db_api (DatabaseAPISubClass): The backend database API to use
+ splink_dataframe_or_dataframes (SplinkDataFrame | Sequence[SplinkDataFrame]):
+ A single SplinkDataFrame or a sequence of SplinkDataFrames
cols (List[str], optional): List of column names to calculate completeness. If
none, all columns will be computed. Default to None.
table_names_for_chart: A list of names. Must be the same length as
- table_or_tables.
+ splink_dataframe_or_dataframes.
"""
-
- splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+ db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
+ splink_df_dict = splink_dataframes_to_dict(splink_dataframe_or_dataframes)
records = completeness_data(splink_df_dict, db_api, cols, table_names_for_chart)
return records_to_completeness_chart(records)
diff --git a/splink/internals/database_api.py b/splink/internals/database_api.py
index 351d8dd9ea..99bf66232e 100644
--- a/splink/internals/database_api.py
+++ b/splink/internals/database_api.py
@@ -48,7 +48,21 @@ class DatabaseAPI(ABC, Generic[TablishType]):
def __init__(self) -> None:
self._intermediate_table_cache: CacheDictWithLogging = CacheDictWithLogging()
self._cache_uid: str = ascii_uid(8)
+ self._id: str = ascii_uid(8)
self._created_tables: set[str] = set()
+ self._input_table_counter: int = 0
+ self._registered_source_dataset_names: set[str] = set()
+
+ @property
+ @final
+ def id(self) -> str:
+ """Useful for debugging when multiple database API instances exist."""
+ return self._id
+
+ def _new_input_table_name(self) -> str:
+ name = f"__splink__input_table_{self._input_table_counter}"
+ self._input_table_counter += 1
+ return name
@final
def _log_and_run_sql_execution(
@@ -211,59 +225,65 @@ def sql_pipeline_to_splink_dataframe(
return splink_dataframe
- @final
- def register_multiple_tables(
+ # See https://github.com/moj-analytical-services/splink/pull/2863#issue-3738534958
+ # for notes on this code
+ def register(
self,
- input_tables: Sequence[AcceptableInputTableType],
- input_aliases: Optional[List[str]] = None,
- overwrite: bool = False,
- ) -> Dict[str, SplinkDataFrame]:
- input_tables = self.process_input_tables(input_tables)
-
- tables_as_splink_dataframes = {}
- existing_tables = []
-
- if not input_aliases:
- input_aliases = [f"__splink__{ascii_uid(8)}" for table in input_tables]
-
- for table, alias in zip(input_tables, input_aliases):
- if isinstance(table, str):
- # already registered - this should be a table name
- continue
- exists = self.table_exists_in_database(alias)
- # if table exists, and we are not overwriting, we have a problem!
- if exists:
- if not overwrite:
- existing_tables.append(alias)
- else:
- self.delete_table_from_database(alias)
-
- if existing_tables:
- existing_tables_str = ", ".join(existing_tables)
- msg = (
- f"Table(s): {existing_tables_str} already exists in database. "
- "Please remove or rename before retrying"
- )
- raise ValueError(msg)
- for table, alias in zip(input_tables, input_aliases):
- if not isinstance(table, str):
- self._table_registration(table, alias)
- table = alias
- sdf = self.table_to_splink_dataframe(alias, table)
- tables_as_splink_dataframes[alias] = sdf
- return tables_as_splink_dataframes
+ table: AcceptableInputTableType | str,
+ source_dataset_name: Optional[str] = None,
+ ) -> SplinkDataFrame:
+ if source_dataset_name is not None:
+ if source_dataset_name in self._registered_source_dataset_names:
+ raise ValueError(
+ f"A table has already been registered with "
+ f"source_dataset_name='{source_dataset_name}'. "
+ f"Each registered table must have a unique source_dataset_name."
+ )
+ self._registered_source_dataset_names.add(source_dataset_name)
+
+ templated_name = source_dataset_name or self._new_input_table_name()
+
+ # String inputs represent already-registered physical tables.
+ # If `source_dataset_name` is not provided, we still generate a fresh internal
+ # templated name so that the same physical table can be used multiple times as
+ # distinct inputs (e.g. linking a table to itself).
+ if isinstance(table, str):
+ physical_name = table
+ sdf = self.table_to_splink_dataframe(templated_name, physical_name)
+ else:
+ # Allow overwrite of table only if Splink is assigning the name
+ # i.e. allow overwrites of tables of the form __splink__input_table_n
+ overwrite = source_dataset_name is None
+ sdf = self._create_backend_table(table, templated_name, overwrite=overwrite)
+
+ sdf.source_dataset_name = templated_name
+ return sdf
@final
- def register_table(
+ def _create_backend_table(
self,
input_table: AcceptableInputTableType,
- table_name: str,
+ templated_name: str,
overwrite: bool = False,
) -> SplinkDataFrame:
- tables_dict = self.register_multiple_tables(
- [input_table], [table_name], overwrite=overwrite
- )
- return tables_dict[table_name]
+ # If input is a string, it's already a registered table name in the database.
+ # Just create a SplinkDataFrame wrapper pointing to it using the templated
+ # name
+ if isinstance(input_table, str):
+ return self.table_to_splink_dataframe(templated_name, input_table)
+
+ exists = self.table_exists_in_database(templated_name)
+ if exists:
+ if not overwrite:
+ raise ValueError(
+ f"Table '{templated_name}' already exists in database. "
+ "Please remove or rename before retrying"
+ )
+ else:
+ self.delete_table_from_database(templated_name)
+
+ self._table_registration(input_table, templated_name)
+ return self.table_to_splink_dataframe(templated_name, templated_name)
def _setup_for_execute_sql(self, sql: str, physical_name: str) -> str:
# returns sql
diff --git a/splink/internals/linker.py b/splink/internals/linker.py
index efdf4b621c..0fa90b9095 100644
--- a/splink/internals/linker.py
+++ b/splink/internals/linker.py
@@ -4,7 +4,7 @@
from copy import copy, deepcopy
from pathlib import Path
from statistics import median
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Sequence
from splink.internals.blocking import (
BlockingRule,
@@ -14,7 +14,6 @@
from splink.internals.comparison_vector_values import (
compute_comparison_vector_values_from_id_pairs_sqls,
)
-from splink.internals.database_api import AcceptableInputTableType, DatabaseAPISubClass
from splink.internals.dialects import SplinkDialect
from splink.internals.em_training_session import EMTrainingSession
from splink.internals.exceptions import SplinkException
@@ -32,7 +31,6 @@
from splink.internals.misc import (
ascii_uid,
bayes_factor_to_prob,
- ensure_is_list,
prob_to_bayes_factor,
)
from splink.internals.optimise_cost_of_brs import suggest_blocking_rules
@@ -49,6 +47,10 @@
_validate_dialect,
)
from splink.internals.splink_dataframe import SplinkDataFrame
+from splink.internals.splinkdataframe_utils import (
+ get_db_api_from_inputs,
+ splink_dataframes_to_dict,
+)
from splink.internals.unique_id_concat import (
_composite_unique_id_from_edges_sql,
)
@@ -73,11 +75,9 @@ class Linker:
def __init__(
self,
- input_table_or_tables: str | list[str],
+ splink_dataframe_or_dataframes: SplinkDataFrame | Sequence[SplinkDataFrame],
settings: SettingsCreator | dict[str, Any] | Path | str,
- db_api: DatabaseAPISubClass,
set_up_basic_logging: bool = True,
- input_table_aliases: str | list[str] | None = None,
validate_settings: bool = True,
):
"""
@@ -88,46 +88,37 @@ def __init__(
Dedupe
```py
- linker = Linker(df, settings_dict, db_api)
+ linker = Linker(df, settings_dict)
```
Link
```py
- df_1 = pd.read_parquet("table_1/")
- df_2 = pd.read_parquet("table_2/")
+ df_1 = db_api.register(pd.read_parquet("table_1/"))
+ df_2 = db_api.register(pd.read_parquet("table_2/"))
linker = Linker(
[df_1, df_2],
settings_dict,
- input_table_aliases=["customers", "contact_center_callers"]
)
```
Dedupe with a pre-trained model read from a json file
```py
- df = pd.read_csv("data_to_dedupe.csv")
+ df = db_api.register(pd.read_csv("data_to_dedupe.csv"), "my_data")
linker = Linker(df, "model.json")
```
Args:
- input_table_or_tables (Union[str, list]): Input data into the linkage model.
- Either a single string (the name of a table in a database) for
- deduplication jobs, or a list of strings (the name of tables in a
- database) for link_only or link_and_dedupe. For some linkers, such as
- the DuckDBLinker and the SparkLinker, it's also possible to pass in
- dataframes (Pandas and Spark respectively) rather than strings.
+ splink_dataframe_or_dataframes (SplinkDataFrame | Sequence[SplinkDataFrame]):
+ Input data into the linkage model. Either a single SplinkDataFrame for
+ deduplication jobs, or a sequence of SplinkDataFrames for link_only
+ or link_and_dedupe. Tables should be registered using db_api.register()
+ before being passed to the Linker.
settings_dict (dict | Path | str): A Splink settings dictionary,
or a path (either as a pathlib.Path object, or a string) to a json file
defining a settings dictionary or pre-trained model.
- db_api (DatabaseAPI): A `DatabaseAPI` object, which manages interactions
- with the database. You can import these for use from
- `splink.backends.{your_backend}`
set_up_basic_logging (bool, optional): If true, sets ups up basic logging
so that Splink sends messages at INFO level to stdout. Defaults to True.
- input_table_aliases (Union[str, list], optional): Labels assigned to
- input tables in Splink outputs. If the names of the tables in the
- input database are long or unspecific, this argument can be used
- to attach more easily readable/interpretable names. Defaults to None.
validate_settings (bool, optional): When True, check your settings
dictionary for any potential errors that may cause splink to fail.
- """
+ """ # noqa: E501
self._db_schema = "splink"
if set_up_basic_logging:
logging.basicConfig(
@@ -136,7 +127,7 @@ def __init__(
splink_logger = logging.getLogger("splink")
splink_logger.setLevel(logging.INFO)
- self._db_api = db_api
+ self._db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
# TODO: temp hack for compat
self._intermediate_table_cache: CacheDictWithLogging = (
@@ -158,15 +149,14 @@ def __init__(
# Maybe overwrite it here and incompatibilities have to be dealt with
# by comparisons/ blocking rules etc??
self._settings_obj = settings_creator.get_settings(
- db_api.sql_dialect.sql_dialect_str
+ self._db_api.sql_dialect.sql_dialect_str
)
# TODO: Add test of what happens if the db_api is for a different backend
# to the sql_dialect set in the settings dict
- self._input_tables_dict = self._register_input_tables(
- input_table_or_tables,
- input_table_aliases,
+ self._input_tables_dict = splink_dataframes_to_dict(
+ splink_dataframe_or_dataframes
)
self._validate_input_dfs()
@@ -298,26 +288,6 @@ def _random_sample_sql(
proportion, sample_size, seed=seed, table=table, unique_id=unique_id
)
- def _register_input_tables(
- self,
- input_tables: Sequence[AcceptableInputTableType],
- input_aliases: Optional[str | List[str]],
- ) -> Dict[str, SplinkDataFrame]:
- input_tables_list = ensure_is_list(input_tables)
-
- if input_aliases is None:
- input_table_aliases = [
- f"__splink__input_table_{i}" for i, _ in enumerate(input_tables_list)
- ]
- overwrite = True
- else:
- input_table_aliases = ensure_is_list(input_aliases)
- overwrite = False
-
- return self._db_api.register_multiple_tables(
- input_tables, input_table_aliases, overwrite
- )
-
def _check_for_valid_settings(self):
# raw tables don't yet exist in db
return hasattr(self, "_input_tables_dict")
diff --git a/splink/internals/linker_components/table_management.py b/splink/internals/linker_components/table_management.py
index 96387fa3bb..1744ffb375 100644
--- a/splink/internals/linker_components/table_management.py
+++ b/splink/internals/linker_components/table_management.py
@@ -341,4 +341,6 @@ def register_table(
pipeline
"""
- return self._linker._db_api.register_table(input_table, table_name, overwrite)
+ return self._linker._db_api._create_backend_table(
+ input_table, table_name, overwrite
+ )
diff --git a/splink/internals/profile_data.py b/splink/internals/profile_data.py
index a20f18522f..529b7821c0 100644
--- a/splink/internals/profile_data.py
+++ b/splink/internals/profile_data.py
@@ -9,9 +9,13 @@
load_chart_definition,
)
from splink.internals.column_expression import ColumnExpression
-from splink.internals.database_api import AcceptableInputTableType, DatabaseAPISubClass
from splink.internals.misc import ensure_is_list
from splink.internals.pipeline import CTEPipeline
+from splink.internals.splink_dataframe import SplinkDataFrame
+from splink.internals.splinkdataframe_utils import (
+ get_db_api_from_inputs,
+ splink_dataframes_to_dict,
+)
from splink.internals.vertically_concatenate import vertically_concatenate_sql
logger = logging.getLogger(__name__)
@@ -203,8 +207,7 @@ def _add_100_percentile_to_df_percentiles(percentile_rows):
def profile_columns(
- table_or_tables: Sequence[AcceptableInputTableType],
- db_api: DatabaseAPISubClass,
+ splink_dataframe_or_dataframes: Union[SplinkDataFrame, Sequence[SplinkDataFrame]],
column_expressions: Optional[List[Union[str, ColumnExpression]]] = None,
top_n: int = 10,
bottom_n: int = 10,
@@ -227,7 +230,10 @@ def profile_columns(
identify the need for standardisation within a given column.
Args:
-
+ splink_dataframe_or_dataframes (
+ Union[SplinkDataFrame, Sequence[SplinkDataFrame]]
+ ):
+ A single SplinkDataFrame or a sequence of SplinkDataFrames to
+ profile. All inputs must be registered against the same db_api.
column_expressions (list, optional): A list of strings containing the
specified column names.
If left empty this will default to all columns.
@@ -245,8 +251,8 @@ def profile_columns(
- The `top_n` and `bottom_n` parameters determine the number of top and bottom
values to display in the respective charts.
"""
-
- splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+ db_api = get_db_api_from_inputs(splink_dataframe_or_dataframes)
+ splink_df_dict = splink_dataframes_to_dict(splink_dataframe_or_dataframes)
pipeline = CTEPipeline()
sql = vertically_concatenate_sql(splink_df_dict, source_dataset_input_column=None)
diff --git a/splink/internals/realtime.py b/splink/internals/realtime.py
index 6b7caedc72..810406e195 100644
--- a/splink/internals/realtime.py
+++ b/splink/internals/realtime.py
@@ -86,14 +86,14 @@ def compare_records(
else:
to_register_right = record_2
- df_records_left = db_api.register_table(
+ df_records_left = db_api._create_backend_table(
to_register_left,
f"__splink__compare_records_left_{uid}",
overwrite=True,
)
df_records_left.templated_name = "__splink__compare_records_left"
- df_records_right = db_api.register_table(
+ df_records_right = db_api._create_backend_table(
to_register_right,
f"__splink__compare_records_right_{uid}",
overwrite=True,
diff --git a/splink/internals/splink_dataframe.py b/splink/internals/splink_dataframe.py
index 47abbb55a2..5ed6dfd574 100644
--- a/splink/internals/splink_dataframe.py
+++ b/splink/internals/splink_dataframe.py
@@ -48,6 +48,17 @@ def columns_escaped(self):
cols = self.columns
return [c.name for c in cols]
+ # source_dataset_name is just a human-readable name for the dataset that appears
+ # in the source_dataset column of match results. It's only relevant for
+ # input dataframes
+ @property
+ def source_dataset_name(self) -> str:
+ # Returns the "source_dataset" entry of self.metadata when one has been
+ # stored; otherwise falls back to this dataframe's templated_name.
+ return self.metadata.get("source_dataset", self.templated_name)
+
+ @source_dataset_name.setter
+ def source_dataset_name(self, value: str) -> None:
+ # Stores the name under the "source_dataset" key of self.metadata, which is
+ # the key the getter reads back.
+ self.metadata["source_dataset"] = value
+
@abstractmethod
def validate(self):
pass
diff --git a/splink/internals/splinkdataframe_utils.py b/splink/internals/splinkdataframe_utils.py
new file mode 100644
index 0000000000..57b9d93cd2
--- /dev/null
+++ b/splink/internals/splinkdataframe_utils.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Iterable, Sequence
+
+from splink.internals.exceptions import SplinkException
+from splink.internals.splink_dataframe import SplinkDataFrame
+
+if TYPE_CHECKING:
+ from splink.internals.database_api import DatabaseAPI
+ from splink.internals.splink_dataframe import SplinkDataFrame
+
+
+def get_db_api_from_inputs(
+    table_or_tables: SplinkDataFrame | Sequence[SplinkDataFrame],
+) -> DatabaseAPI[Any]:
+    """Return the single database API shared by all input SplinkDataFrames.
+
+    Args:
+        table_or_tables: One SplinkDataFrame, or a sequence of them.
+
+    Returns:
+        The ``db_api`` attribute of the first input.
+
+    Raises:
+        SplinkException: If no input is supplied, if any input is not a
+            SplinkDataFrame, or if any input's ``db_api`` is not the very
+            same object as the first input's ``db_api``.
+    """
+    # Materialise to a list: the inputs are indexed and sliced below, which a
+    # bare Iterable does not support.
+    tables: list[SplinkDataFrame]
+    if isinstance(table_or_tables, SplinkDataFrame):
+        tables = [table_or_tables]
+    else:
+        tables = list(table_or_tables)
+
+    if not tables:
+        raise SplinkException("At least one SplinkDataFrame must be provided.")
+
+    # Fail with an actionable message instead of an AttributeError on
+    # `.db_api` when something other than a SplinkDataFrame is passed in
+    # (e.g. a table name or an unregistered dataframe).
+    for candidate in tables:
+        if not isinstance(candidate, SplinkDataFrame):
+            raise SplinkException(
+                "Inputs must be SplinkDataFrames, but received a value of type "
+                f"'{type(candidate).__name__}' (from an input of type "
+                f"'{type(table_or_tables).__name__}').\n"
+                "Register your data with db_api.register() before passing it in."
+            )
+
+    first = tables[0]
+    first_db_api = first.db_api
+
+    for sdf in tables[1:]:
+        # Identity comparison: every input must hold the same db_api object,
+        # not merely an equivalent one.
+        if sdf.db_api is not first_db_api:
+            raise SplinkException(
+                "All input SplinkDataFrames must be registered against the same "
+                "database API.\n"
+                f"Table '{first.templated_name}' is registered with a "
+                f"{type(first_db_api).__name__} with id='{first_db_api.id}', "
+                f"but table '{sdf.templated_name}' is registered with a "
+                f"{type(sdf.db_api).__name__} with id='{sdf.db_api.id}'.\n"
+                "Please ensure all tables are registered using the same db_api. "
+                "You can check the id of your db_api using db_api.id"
+            )
+
+    return first_db_api
+
+
+def splink_dataframes_to_dict(
+    table_or_tables: SplinkDataFrame | Sequence[SplinkDataFrame],
+) -> dict[str, SplinkDataFrame]:
+    """Key the input SplinkDataFrames by their ``templated_name``.
+
+    Args:
+        table_or_tables: One SplinkDataFrame, or a sequence of them.
+
+    Returns:
+        A dict mapping each input's ``templated_name`` to the input itself,
+        in the order the inputs were supplied.
+
+    Raises:
+        SplinkException: If two inputs share the same ``templated_name``;
+            a plain dict would otherwise keep only the last one and silently
+            drop the other input table.
+    """
+    tables: Iterable[SplinkDataFrame]
+    if isinstance(table_or_tables, SplinkDataFrame):
+        tables = [table_or_tables]
+    else:
+        tables = table_or_tables
+
+    tables_by_name: dict[str, SplinkDataFrame] = {}
+    for sdf in tables:
+        name = sdf.templated_name
+        if name in tables_by_name:
+            raise SplinkException(
+                f"More than one input SplinkDataFrame has the name '{name}'. "
+                "Each input must have a distinct name, otherwise one input "
+                "would silently replace another."
+            )
+        tables_by_name[name] = sdf
+
+    return tables_by_name
diff --git a/splink/internals/vertically_concatenate.py b/splink/internals/vertically_concatenate.py
index ebd08d9405..a3b21bd3a8 100644
--- a/splink/internals/vertically_concatenate.py
+++ b/splink/internals/vertically_concatenate.py
@@ -54,7 +54,9 @@ def vertically_concatenate_sql(
if source_dataset_column_already_exists:
create_sds_if_needed = ""
else:
- create_sds_if_needed = f"'{df_obj.templated_name}' as source_dataset,"
+ create_sds_if_needed = (
+ f"'{df_obj.source_dataset_name}' as source_dataset,"
+ )
sql = f"""
select
diff --git a/tests/cc_testing_utils.py b/tests/cc_testing_utils.py
index 9de4b79a9b..7d52cce05e 100644
--- a/tests/cc_testing_utils.py
+++ b/tests/cc_testing_utils.py
@@ -28,9 +28,11 @@ def nodes_and_edges_from_graph(G):
def run_cc_implementation(nodes, edges):
# finally, run our connected components algorithm
db_api = DuckDBAPI()
+ nodes_sdf = db_api.register(nodes)
+ edges_sdf = db_api.register(edges)
cc = cluster_pairwise_predictions_at_threshold(
- nodes=nodes,
- edges=edges,
+ nodes=nodes_sdf,
+ edges=edges_sdf,
db_api=db_api,
node_id_column_name="unique_id",
edge_id_column_name_left="unique_id_l",
diff --git a/tests/helpers.py b/tests/helpers.py
index 63d6443b1c..b044d5725f 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -22,14 +22,17 @@ def DatabaseAPI(self):
def db_api_args(self):
return {}
- def extra_linker_args(self):
- # create fresh api each time
- return {"db_api": self.DatabaseAPI(**self.db_api_args())}
+ # def extra_linker_args(self):
+ # # create fresh api each time
+ # return {"db_api": self.DatabaseAPI(**self.db_api_args())}
@property
def date_format(self):
return "yyyy-mm-dd"
+ def db_api(self):
+ # Calls self.DatabaseAPI with the keyword arguments from self.db_api_args()
+ # and returns the result; the call is made again on every invocation.
+ return self.DatabaseAPI(**self.db_api_args())
+
@abstractmethod
def convert_frame(self, df):
pass
@@ -44,6 +47,25 @@ def load_frame_from_parquet(self, path):
def arrays_from(self) -> int:
return 1
+    def linker_with_registration(
+        self, data, settings, input_table_aliases=None, **kwargs
+    ):
+        """Register ``data`` with a db_api from this helper and build a Linker.
+
+        Args:
+            data: A single input table, or a list/tuple of input tables.
+            settings: Passed straight through to ``Linker``.
+            input_table_aliases: ``None``, a single alias string, or an
+                iterable of aliases (one per table). Each alias is passed as
+                the second argument to ``db_api.register``; ``None`` is
+                passed for every table when no aliases are given.
+            **kwargs: Extra keyword arguments forwarded to ``Linker``.
+
+        Returns:
+            A ``Linker`` built from the registered table (when there is exactly
+            one) or from the list of registered tables.
+
+        Raises:
+            ValueError: If aliases are given but their count differs from the
+                number of input tables.
+        """
+        db_api = self.db_api()
+
+        data_list = list(data) if isinstance(data, (list, tuple)) else [data]
+
+        if input_table_aliases is None:
+            aliases = [None] * len(data_list)
+        elif isinstance(input_table_aliases, str):
+            aliases = [input_table_aliases]
+        else:
+            aliases = list(input_table_aliases)
+
+        # zip() stops at the shorter input, so a mismatch would silently leave
+        # some tables unregistered (or some aliases unused). Fail loudly.
+        if len(aliases) != len(data_list):
+            raise ValueError(
+                f"Got {len(aliases)} input table alias(es) for "
+                f"{len(data_list)} input table(s); provide one alias per table."
+            )
+
+        sdfs = [db_api.register(d, alias) for d, alias in zip(data_list, aliases)]
+
+        # A lone table is passed on its own rather than as a one-item list.
+        input_frames = sdfs[0] if len(sdfs) == 1 else sdfs
+        return Linker(input_frames, settings, **kwargs)
+
class DuckDBTestHelper(TestHelper):
@property
diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py
index 8df48dfad8..9f281b13fe 100644
--- a/tests/test_accuracy.py
+++ b/tests/test_accuracy.py
@@ -49,8 +49,9 @@ def test_scored_labels_table():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
pipeline = CTEPipeline()
concat_with_tf = compute_df_concat_with_tf(linker, pipeline)
@@ -111,8 +112,9 @@ def test_truth_space_table():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
labels_with_predictions = [
{
@@ -194,8 +196,9 @@ def test_roc_chart_dedupe_only():
)
settings_dict = get_settings_dict()
db_api = DuckDBAPI(connection=":memory:")
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
labels_sdf = linker.table_management.register_table(df_labels, "labels")
@@ -225,8 +228,9 @@ def test_roc_chart_link_and_dedupe():
settings_dict = get_settings_dict()
settings_dict["link_type"] = "link_and_dedupe"
db_api = DuckDBAPI(connection=":memory:")
+ df_sdf = db_api.register(df, source_dataset_name="fake_data_1")
- linker = Linker(df, settings_dict, input_table_aliases="fake_data_1", db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
labels_sdf = linker.table_management.register_table(df_labels, "labels")
@@ -287,8 +291,9 @@ def test_prediction_errors_from_labels_table():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker.table_management.register_table(df_labels, "labels")
@@ -308,8 +313,9 @@ def test_prediction_errors_from_labels_table():
assert (1, 2) not in records # tp
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker.table_management.register_table(df_labels, "labels")
@@ -329,8 +335,9 @@ def test_prediction_errors_from_labels_table():
assert (1, 2) not in records # tp
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker.table_management.register_table(df_labels, "labels")
pipeline = CTEPipeline()
@@ -392,8 +399,9 @@ def test_prediction_errors_from_labels_column():
#
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
df_res = linker.evaluation.prediction_errors_from_labels_column(
"cluster"
@@ -409,8 +417,9 @@ def test_prediction_errors_from_labels_column():
assert (1, 5) not in records # TN
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
df_res = linker.evaluation.prediction_errors_from_labels_column(
"cluster", include_false_positives=False
@@ -426,8 +435,9 @@ def test_prediction_errors_from_labels_column():
assert (1, 5) not in records # TN
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
df_res = linker.evaluation.prediction_errors_from_labels_column(
"cluster", include_false_negatives=False
@@ -488,8 +498,9 @@ def test_truth_space_table_from_labels_column_dedupe_only():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
tt = linker.evaluation.accuracy_analysis_from_labels_column(
"cluster", output_type="table"
@@ -559,8 +570,10 @@ def test_truth_space_table_from_labels_column_link_only():
}
db_api = DuckDBAPI()
+ df_left_sdf = db_api.register(df_left)
+ df_right_sdf = db_api.register(df_right)
- linker = Linker([df_left, df_right], settings, db_api=db_api)
+ linker = Linker([df_left_sdf, df_right_sdf], settings)
tt = linker.evaluation.accuracy_analysis_from_labels_column(
"ground_truth", output_type="table"
@@ -610,7 +623,9 @@ def test_truth_space_table_from_column_vs_pandas_implementaiton_inc_unblocked():
additional_columns_to_retain=["cluster"],
)
- linker_for_predictions = Linker(df, settings, db_api=DuckDBAPI())
+ db_api_pred = DuckDBAPI()
+ df_sdf = db_api_pred.register(df)
+ linker_for_predictions = Linker(df_sdf, settings)
df_predictions_raw = linker_for_predictions.inference.predict()
# Score all of the positive labels even if not captured by the blocking rules
@@ -645,7 +660,9 @@ def test_truth_space_table_from_column_vs_pandas_implementaiton_inc_unblocked():
additional_columns_to_retain=["cluster"],
)
- linker_for_splink_answer = Linker(df, settings, db_api=DuckDBAPI())
+ db_api_answer = DuckDBAPI()
+ df_sdf_answer = db_api_answer.register(df)
+ linker_for_splink_answer = Linker(df_sdf_answer, settings)
df_from_splink = (
linker_for_splink_answer.evaluation.accuracy_analysis_from_labels_column(
"cluster",
@@ -686,7 +703,10 @@ def test_truth_space_table_from_column_vs_pandas_implementaiton_ex_unblocked():
additional_columns_to_retain=["cluster"],
)
- linker_for_predictions = Linker([df_1, df_2], settings, db_api=DuckDBAPI())
+ db_api_pred = DuckDBAPI()
+ df_1_sdf = db_api_pred.register(df_1)
+ df_2_sdf = db_api_pred.register(df_2)
+ linker_for_predictions = Linker([df_1_sdf, df_2_sdf], settings)
df_predictions_raw = linker_for_predictions.inference.predict()
# When match_key = 1, the record is not really recovered by the blocking rules
@@ -715,7 +735,10 @@ def test_truth_space_table_from_column_vs_pandas_implementaiton_ex_unblocked():
blocking_rules_to_generate_predictions=[block_on("first_name")],
additional_columns_to_retain=["cluster"],
)
- linker_for_splink_answer = Linker([df_1, df_2], settings, db_api=DuckDBAPI())
+ db_api_answer = DuckDBAPI()
+ df_1_sdf_answer = db_api_answer.register(df_1)
+ df_2_sdf_answer = db_api_answer.register(df_2)
+ linker_for_splink_answer = Linker([df_1_sdf_answer, df_2_sdf_answer], settings)
df_from_splink = (
linker_for_splink_answer.evaluation.accuracy_analysis_from_labels_column(
@@ -764,10 +787,14 @@ def test_truth_space_table_from_table_vs_pandas_cartesian():
additional_columns_to_retain=["cluster"],
)
- linker_for_predictions = Linker(df_first_50, settings, db_api=DuckDBAPI())
+ db_api_pred = DuckDBAPI()
+ df_first_50_sdf = db_api_pred.register(df_first_50)
+ linker_for_predictions = Linker(df_first_50_sdf, settings)
df_predictions = linker_for_predictions.inference.predict().as_pandas_dataframe()
- linker_for_splink_answer = Linker(df, settings, db_api=DuckDBAPI())
+ db_api_answer = DuckDBAPI()
+ df_sdf_answer = db_api_answer.register(df)
+ linker_for_splink_answer = Linker(df_sdf_answer, settings)
labels_input = linker_for_splink_answer.table_management.register_labels_table(
labels_table
)
@@ -823,9 +850,10 @@ def test_truth_space_table_from_table_vs_pandas_with_blocking():
additional_columns_to_retain=["cluster"],
)
- linker_for_predictions = Linker(
- [df_1_first_50, df_2_first_50], settings, db_api=DuckDBAPI()
- )
+ db_api_pred = DuckDBAPI()
+ df_1_first_50_sdf = db_api_pred.register(df_1_first_50)
+ df_2_first_50_sdf = db_api_pred.register(df_2_first_50)
+ linker_for_predictions = Linker([df_1_first_50_sdf, df_2_first_50_sdf], settings)
df_predictions_raw = linker_for_predictions.inference.predict()
df_predictions_raw.as_pandas_dataframe()
sql = f"""
@@ -853,7 +881,10 @@ def test_truth_space_table_from_table_vs_pandas_with_blocking():
additional_columns_to_retain=["cluster"],
)
- linker_for_splink_answer = Linker([df_1, df_2], settings, db_api=DuckDBAPI())
+ db_api_answer = DuckDBAPI()
+ df_1_sdf_answer = db_api_answer.register(df_1)
+ df_2_sdf_answer = db_api_answer.register(df_2)
+ linker_for_splink_answer = Linker([df_1_sdf_answer, df_2_sdf_answer], settings)
labels_input = linker_for_splink_answer.table_management.register_labels_table(
labels_table
)
diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py
index 513c60014c..185ad2a336 100644
--- a/tests/test_analyse_blocking.py
+++ b/tests/test_analyse_blocking.py
@@ -45,21 +45,24 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect):
]
)
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
+ df1_sdf = db_api.register(df_1)
+ df2_sdf = db_api.register(df_2)
+ df3_sdf = db_api.register(df_3)
+
args = {
"link_type": "dedupe_only",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
}
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=df_1, blocking_rule="1=1", **args
+ df1_sdf, blocking_rule="1=1", **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 4 * 3 / 2
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=df_1, blocking_rule=block_on("first_name"), **args
+ df1_sdf, blocking_rule=block_on("first_name"), **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
@@ -67,20 +70,20 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect):
args["link_type"] = "link_only"
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule="1=1", **args
+ [df1_sdf, df2_sdf], blocking_rule="1=1", **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 4 * 3
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule=block_on("surname"), **args
+ [df1_sdf, df2_sdf], blocking_rule=block_on("surname"), **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 1
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2],
+ [df1_sdf, df2_sdf],
blocking_rule=block_on("first_name"),
**args,
)
@@ -88,14 +91,14 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect):
assert res == 3
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2, df_3], blocking_rule="1=1", **args
+ [df1_sdf, df2_sdf, df3_sdf], blocking_rule="1=1", **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 4 * 3 + 4 * 2 + 2 * 3
args["link_type"] = "link_and_dedupe"
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule="1=1", **args
+ [df1_sdf, df2_sdf], blocking_rule="1=1", **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
expected = 4 * 3 + (4 * 3 / 2) + (3 * 2 / 2)
@@ -103,14 +106,14 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect):
rule = "l.first_name = r.first_name and l.surname = r.surname"
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule=rule, **args
+ [df1_sdf, df2_sdf], blocking_rule=rule, **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 1
rule = block_on("first_name", "surname")
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule=rule, **args
+ [df1_sdf, df2_sdf], blocking_rule=rule, **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 1
@@ -136,30 +139,30 @@ def test_blocking_analysis_slow_methodology_exploding(test_helpers, dialect):
{"unique_id": 3, "first_name": "Jayne", "postcode": [1003]},
]
)
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
+ df_1_sdf = db_api.register(df_1)
+ df_2_sdf = db_api.register(df_2)
args = {
"link_type": "link_only",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
}
rule = block_on("postcode", arrays_to_explode=["postcode"])
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule=rule, **args
+ [df_1_sdf, df_2_sdf], blocking_rule=rule, **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 6
args = {
"link_type": "link_and_dedupe",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
}
rule = block_on("postcode", arrays_to_explode=["postcode"])
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule=rule, **args
+ [df_1_sdf, df_2_sdf], blocking_rule=rule, **args
)
res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
assert res == 3 + 6 + 2
@@ -170,7 +173,7 @@ def test_blocking_analysis_slow_methodology_exploding(test_helpers, dialect):
def test_blocking_analysis_slow_methodology_exploding_2(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
cols = ("unique_id", "sds", "first_name", "postcode", "age", "amount")
@@ -191,9 +194,11 @@ def test_blocking_analysis_slow_methodology_exploding_2(test_helpers, dialect):
]
df_2 = pd.DataFrame(rows_2, columns=cols)
+ df_1_sdf = db_api.register(df_1)
+ df_2_sdf = db_api.register(df_2)
+
args = {
"link_type": "link_only",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
"source_dataset_column_name": "sds",
}
@@ -210,7 +215,7 @@ def test_blocking_analysis_slow_methodology_exploding_2(test_helpers, dialect):
}
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1, df_2], blocking_rule=rule, **args
+ [df_1_sdf, df_2_sdf], blocking_rule=rule, **args
)
sql = """
@@ -270,21 +275,22 @@ def test_source_dataset_works_as_expected(test_helpers, dialect):
df_1.drop(columns=["src_dataset"], inplace=True)
df_2.drop(columns=["src_dataset"], inplace=True)
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
+ df_concat_sdf = db_api.register(df_concat)
+ df_1_sdf = db_api.register(df_1)
+ df_2_sdf = db_api.register(df_2)
r1 = cumulative_comparisons_to_be_scored_from_blocking_rules_data(
- table_or_tables=df_concat,
+ df_concat_sdf,
blocking_rules=[block_on("first_name")],
- db_api=db_api,
unique_id_column_name="unique_id",
source_dataset_column_name="src_dataset",
link_type="link_only",
)
r2 = cumulative_comparisons_to_be_scored_from_blocking_rules_data(
- table_or_tables=[df_1, df_2],
+ [df_1_sdf, df_2_sdf],
blocking_rules=[block_on("first_name")],
- db_api=db_api,
unique_id_column_name="unique_id",
link_type="link_only",
source_dataset_column_name="source_dataset",
@@ -306,28 +312,31 @@ def test_source_dataset_works_as_expected(test_helpers, dialect):
df_2_no_sds = df[df["unique_id"] % 3 == 1].copy()
df_3_no_sds = df[df["unique_id"] % 3 == 2].copy()
+ df_concat_2_sdf = db_api.register(df_concat_2)
+ df_concat_3_sdf = db_api.register(df_concat_3)
+ df_1_no_sds_sdf = db_api.register(df_1_no_sds)
+ df_2_no_sds_sdf = db_api.register(df_2_no_sds)
+ df_3_no_sds_sdf = db_api.register(df_3_no_sds)
+
count_comparisons_from_blocking_rule(
- table_or_tables=df_concat_3,
+ df_concat_3_sdf,
blocking_rule=block_on("first_name"),
link_type="dedupe_only",
unique_id_column_name="unique_id",
- db_api=db_api,
)
r1 = count_comparisons_from_blocking_rule(
- table_or_tables=df_concat_3,
+ df_concat_3_sdf,
blocking_rule=block_on("first_name"),
link_type="link_only",
- db_api=db_api,
unique_id_column_name="unique_id",
source_dataset_column_name="sds",
)
r2 = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1_no_sds, df_2_no_sds, df_3_no_sds],
+ [df_1_no_sds_sdf, df_2_no_sds_sdf, df_3_no_sds_sdf],
blocking_rule=block_on("first_name"),
link_type="link_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
# Both of the above use the vertical concat of the two datasets so should
@@ -340,19 +349,17 @@ def test_source_dataset_works_as_expected(test_helpers, dialect):
assert r1[k] == r2[k]
r1 = count_comparisons_from_blocking_rule(
- table_or_tables=df_concat_2,
+ df_concat_2_sdf,
blocking_rule=block_on("first_name"),
link_type="link_only",
- db_api=db_api,
unique_id_column_name="unique_id",
source_dataset_column_name="sds",
)
r2 = count_comparisons_from_blocking_rule(
- table_or_tables=[df_1_no_sds, df_2_no_sds],
+ [df_1_no_sds_sdf, df_2_no_sds_sdf],
blocking_rule=block_on("first_name"),
link_type="link_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
# There's an optimisation in the case of two input dataframes only
@@ -374,7 +381,7 @@ def test_blocking_records_accuracy(test_helpers, dialect):
from numpy import nan
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
# resolve an issue w/ pyspark nulls
@@ -385,12 +392,12 @@ def test_blocking_records_accuracy(test_helpers, dialect):
{"unique_id": 4, "first_name": "Kim", "surname": "Lee", "dob": None},
]
df = pd.DataFrame(df).fillna(nan).replace([nan], [None])
+ df_sdf = db_api.register(df)
comparison_count_args = {
- "table_or_tables": df,
+ "splink_dataframe_or_dataframes": df_sdf,
"blocking_rules": [block_on("first_name")],
"link_type": "dedupe_only",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
}
@@ -456,6 +463,9 @@ def test_blocking_records_accuracy(test_helpers, dialect):
df_r = pd.DataFrame(df_r).fillna(nan).replace([nan], [None])
+ df_l_sdf = db_api.register(df_l)
+ df_r_sdf = db_api.register(df_r)
+
blocking_rules = [
"l.surname = r.surname", # 2l:2r,
Or(
@@ -465,9 +475,8 @@ def test_blocking_records_accuracy(test_helpers, dialect):
]
comparison_count_args = {
- "table_or_tables": [df_l, df_r],
+ "splink_dataframe_or_dataframes": [df_l_sdf, df_r_sdf],
"link_type": "link_and_dedupe",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
"blocking_rules": blocking_rules,
"source_dataset_column_name": "source_dataset",
@@ -524,10 +533,13 @@ def test_blocking_records_accuracy(test_helpers, dialect):
df_3 = pd.DataFrame(df_3)
+ df_1_sdf = db_api.register(df_1)
+ df_2_sdf = db_api.register(df_2)
+ df_3_sdf = db_api.register(df_3)
+
comparison_count_args = {
- "table_or_tables": [df_1, df_2, df_3],
+ "splink_dataframe_or_dataframes": [df_1_sdf, df_2_sdf, df_3_sdf],
"link_type": "link_and_dedupe",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
"blocking_rules": [
block_on("surname"),
@@ -581,11 +593,12 @@ def test_analyse_blocking_fast_methodology():
)
db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1)
+ df_2_sdf = db_api.register(df_2)
args = {
- "table_or_tables": df_1,
+ "splink_dataframe_or_dataframes": df_1_sdf,
"link_type": "dedupe_only",
- "db_api": db_api,
"unique_id_column_name": "unique_id",
"compute_post_filter_count": False,
}
@@ -614,7 +627,7 @@ def test_analyse_blocking_fast_methodology():
res = res_dict["number_of_comparisons_generated_pre_filter_conditions"]
assert res == 3 * 3 + 1 * 1 + 1 * 1
- args["table_or_tables"] = [df_1, df_2]
+ args["splink_dataframe_or_dataframes"] = [df_1_sdf, df_2_sdf]
args["link_type"] = "link_and_dedupe"
args["blocking_rule"] = block_on("first_name")
@@ -661,13 +674,13 @@ def test_analyse_blocking_fast_methodology_edge_cases():
results[br] = {"count_from_join_dedupe_only": res.iloc[0].iloc[0]}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
for br in blocking_rules:
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=df,
+ df_sdf,
blocking_rule=br,
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
c = res_dict["number_of_comparisons_generated_pre_filter_conditions"]
@@ -684,6 +697,9 @@ def test_analyse_blocking_fast_methodology_edge_cases():
df_l = df.iloc[::2].copy() # even-indexed rows (starting from 0)
df_r = df.iloc[1::2].copy() # odd-indexed rows (starting from 1)
+ df_l_sdf = db_api.register(df_l)
+ df_r_sdf = db_api.register(df_r)
+
sql_template = """
select count(*)
from df_l as l
@@ -697,14 +713,11 @@ def test_analyse_blocking_fast_methodology_edge_cases():
res = duckdb.sql(sql).df()
results[br] = {"count_from_join_link_only": res.iloc[0].iloc[0]}
- db_api = DuckDBAPI()
-
for br in blocking_rules:
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=[df_l, df_r],
+ [df_l_sdf, df_r_sdf],
blocking_rule=br,
link_type="link_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
c = res_dict["number_of_comparisons_generated_pre_filter_conditions"]
@@ -732,15 +745,14 @@ def test_blocking_rule_accepts_different_dialects():
@mark_with_dialects_excluding()
def test_chart(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
-
+ df_sdf = db_api.register(df)
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
- table_or_tables=df,
+ df_sdf,
blocking_rules=[block_on("first_name"), "l.surname = r.surname"],
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
@@ -748,7 +760,7 @@ def test_chart(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_n_largest_blocks(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
df_1 = pd.DataFrame(
[
@@ -777,13 +789,14 @@ def test_n_largest_blocks(test_helpers, dialect):
]
)
- db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1)
+ df_2_sdf = db_api.register(df_2)
+ df_3_sdf = db_api.register(df_3)
n_largest_dedupe_only = n_largest_blocks(
- table_or_tables=df_1,
+ df_1_sdf,
blocking_rule=block_on("name1", "substr(name2,1,1)"),
link_type="dedupe_only",
- db_api=db_api,
).as_pandas_dataframe()
sql = """
@@ -809,10 +822,9 @@ def test_n_largest_blocks(test_helpers, dialect):
pd.testing.assert_frame_equal(n_largest_dedupe_only, n_largest_manual_dedupe_only)
n_largest_link_and_dedupe = n_largest_blocks(
- table_or_tables=[df_1, df_2],
+ [df_1_sdf, df_2_sdf],
blocking_rule=block_on("name1", "substr(name2,1,1)"),
link_type="link_and_dedupe",
- db_api=db_api,
).as_pandas_dataframe()
sql = """
@@ -840,10 +852,9 @@ def test_n_largest_blocks(test_helpers, dialect):
)
n_largest_link_only = n_largest_blocks(
- table_or_tables=[df_1, df_2],
+ [df_1_sdf, df_2_sdf],
blocking_rule=block_on("name1", "substr(name2,1,1)"),
link_type="link_only",
- db_api=db_api,
).as_pandas_dataframe()
sql = """
@@ -869,10 +880,9 @@ def test_n_largest_blocks(test_helpers, dialect):
pd.testing.assert_frame_equal(n_largest_link_only, n_largest_manual_link_only)
n_largest_link_only_3 = n_largest_blocks(
- table_or_tables=[df_1, df_2, df_3],
+ [df_1_sdf, df_2_sdf, df_3_sdf],
blocking_rule=block_on("name1", "substr(name2,1,1)"),
link_type="link_only",
- db_api=db_api,
).as_pandas_dataframe()
sql = """
@@ -898,10 +908,9 @@ def test_n_largest_blocks(test_helpers, dialect):
pd.testing.assert_frame_equal(n_largest_link_only_3, n_largest_manual_link_only_3)
n_largest_link_and_dedupe_inverted = n_largest_blocks(
- table_or_tables=[df_1, df_2],
+ [df_1_sdf, df_2_sdf],
blocking_rule="l.name1 = r.name2 and l.name2 = r.name1",
link_type="link_and_dedupe",
- db_api=db_api,
).as_pandas_dataframe()
sql = """
@@ -972,6 +981,8 @@ def test_blocking_rule_parentheses_equivalence():
df = pd.DataFrame(data)
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
+
# Test three variations of the same blocking rule
br_with_brl = brl.And(
brl.block_on("forename1_std", "forename2_std", "dob_std"),
@@ -994,24 +1005,21 @@ def test_blocking_rule_parentheses_equivalence():
# Get results for each variation
result_brl = count_comparisons_from_blocking_rule(
- table_or_tables=df,
+ df_sdf,
blocking_rule=br_with_brl,
link_type="dedupe_only",
- db_api=db_api,
)
result_with_parens = count_comparisons_from_blocking_rule(
- table_or_tables=df,
+ df_sdf,
blocking_rule=br_with_parens,
link_type="dedupe_only",
- db_api=db_api,
)
result_without_parens = count_comparisons_from_blocking_rule(
- table_or_tables=df,
+ df_sdf,
blocking_rule=br_without_parens,
link_type="dedupe_only",
- db_api=db_api,
)
# Check specific values
diff --git a/tests/test_array_based_blocking.py b/tests/test_array_based_blocking.py
index d22fa1d28b..db82403d70 100644
--- a/tests/test_array_based_blocking.py
+++ b/tests/test_array_based_blocking.py
@@ -36,7 +36,7 @@ def test_simple_example_link_only(test_helpers, dialect):
}
## the pairs returned by the first blocking rule are (1,6),(2,4),(2,6)
## the additional pairs returned by the second blocking rule are (1,4),(3,5)
- linker = helper.Linker([data_l, data_r], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([data_l, data_r], settings)
linker.debug_mode = False
returned_triples = linker.inference.predict().as_pandas_dataframe()[
["unique_id_l", "unique_id_r", "match_key"]
@@ -108,7 +108,7 @@ def test_array_based_blocking_with_random_data_dedupe(test_helpers, dialect):
"additional_columns_to_retain": ["cluster"],
"comparisons": [cl.ArrayIntersectAtSizes("array_column_1", [1])],
}
- linker = helper.Linker(input_data, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(input_data, settings)
linker.debug_mode = False
df_predict = linker.inference.predict().as_pandas_dataframe()
## check that there are no duplicates in the output
@@ -155,9 +155,7 @@ def test_array_based_blocking_with_random_data_link_only(test_helpers, dialect):
"additional_columns_to_retain": ["cluster"],
"comparisons": [cl.ArrayIntersectAtSizes("array_column_1", [1])],
}
- linker = helper.Linker(
- [input_data_l, input_data_r], settings, **helper.extra_linker_args()
- )
+ linker = helper.linker_with_registration([input_data_l, input_data_r], settings)
linker.debug_mode = False
df_predict = linker.inference.predict().as_pandas_dataframe()
@@ -226,11 +224,10 @@ def test_link_only_unique_id_ambiguity(test_helpers, dialect):
"retain_intermediate_calculation_columns": True,
}
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
[df_1, df_2, df_3],
settings,
input_table_aliases=["a_", "b_", "c_"],
- **helper.extra_linker_args(),
)
returned_triples = linker.inference.predict().as_pandas_dataframe()[
[
diff --git a/tests/test_array_columns.py b/tests/test_array_columns.py
index d0ce69c328..9bff2bf3c1 100644
--- a/tests/test_array_columns.py
+++ b/tests/test_array_columns.py
@@ -9,7 +9,7 @@
@mark_with_dialects_excluding("sqlite", "spark")
def test_array_comparison_1(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -84,7 +84,7 @@ def test_array_comparison_1(test_helpers, dialect):
@mark_with_dialects_excluding("sqlite", "postgres")
def test_array_subset(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
diff --git a/tests/test_caching.py b/tests/test_caching.py
index b71115ef9c..260c0141bb 100644
--- a/tests/test_caching.py
+++ b/tests/test_caching.py
@@ -54,8 +54,9 @@ def register_and_return_dummy_frame(
def test_cache_id(tmp_path):
# Test saving and loading a model
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, get_settings_dict(), db_api=db_api)
+ linker = Linker(df_sdf, get_settings_dict())
prior = linker._settings_obj._cache_uid
@@ -63,8 +64,9 @@ def test_cache_id(tmp_path):
linker.misc.save_model_to_json(path, overwrite=True)
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker_2 = Linker(df, path, db_api=db_api)
+ linker_2 = Linker(df_sdf, path)
assert linker_2._settings_obj._cache_uid == prior
@@ -73,8 +75,9 @@ def test_cache_id(tmp_path):
settings = get_settings_dict()
settings["linker_uid"] = random_uid
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker_uid = linker._cache_uid
assert linker_uid == random_uid
@@ -83,8 +86,9 @@ def test_cache_only_splink_dataframes():
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._intermediate_table_cache["new_table"] = DuckDBDataFrame(
"template", "__splink__dummy_frame", linker
)
@@ -103,8 +107,9 @@ def test_cache_access_df_concat(debug_mode):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = debug_mode
with patch.object(
db_api, "_sql_to_splink_dataframe", new=make_mock_execute(db_api)
@@ -135,8 +140,9 @@ def test_cache_access_compute_tf_table(debug_mode):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = debug_mode
with patch.object(
db_api, "_sql_to_splink_dataframe", new=make_mock_execute(db_api)
@@ -155,8 +161,9 @@ def test_invalidate_cache(debug_mode):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = debug_mode
with patch.object(
@@ -205,8 +212,9 @@ def test_cache_invalidates_with_new_linker(debug_mode):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = debug_mode
with patch.object(
db_api, "_sql_to_splink_dataframe", new=make_mock_execute(db_api)
@@ -222,8 +230,9 @@ def test_cache_invalidates_with_new_linker(debug_mode):
mockexecute_sql_pipeline.assert_not_called()
db_api = DuckDBAPI()
+ df_sdf_new = db_api.register(df)
- new_linker = Linker(df, settings, db_api=db_api)
+ new_linker = Linker(df_sdf_new, settings)
new_linker._debug_mode = debug_mode
with patch.object(
db_api, "_sql_to_splink_dataframe", new=make_mock_execute(db_api)
@@ -253,8 +262,9 @@ def test_cache_register_compute_concat_with_tf_table(debug_mode):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = debug_mode
with patch.object(
@@ -274,8 +284,9 @@ def test_cache_register_compute_tf_table(debug_mode):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = debug_mode
with patch.object(
diff --git a/tests/test_caching_tables.py b/tests/test_caching_tables.py
index f01970213c..76b215e585 100644
--- a/tests/test_caching_tables.py
+++ b/tests/test_caching_tables.py
@@ -27,8 +27,9 @@ def test_cache_tracking_works():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
assert cache.is_in_executed_queries("__splink__df_concat_with_tf") is False
@@ -92,8 +93,9 @@ def test_cache_used_when_registering_nodes_table():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
linker.table_management.register_df_concat_with_tf(splink__df_concat_with_tf)
linker.inference.predict()
@@ -143,8 +145,9 @@ def test_cache_used_when_registering_tf_tables():
# First test do not register any tf tables
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
linker.inference.predict()
@@ -154,8 +157,9 @@ def test_cache_used_when_registering_tf_tables():
# Then try the same after registering surname tf table
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
linker.table_management.register_term_frequency_lookup(surname_tf_table, "surname")
linker.inference.predict()
@@ -165,8 +169,9 @@ def test_cache_used_when_registering_tf_tables():
# Then try the same after registering both
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
linker.table_management.register_term_frequency_lookup(surname_tf_table, "surname")
linker.table_management.register_term_frequency_lookup(
@@ -193,8 +198,9 @@ def test_cache_invalidation():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
linker.table_management.compute_tf_table("name")
@@ -207,8 +213,9 @@ def test_cache_invalidation():
assert cache.is_in_queries_retrieved_from_cache("__splink__df_tf_name")
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
linker.table_management.compute_tf_table("name")
@@ -240,8 +247,9 @@ def test_table_deletions():
}
db_api = DuckDBAPI(connection=con)
+ table_sdf = db_api.register("my_table")
- linker = Linker("my_table", settings, db_api=db_api)
+ linker = Linker(table_sdf, settings)
table_names_before = set(get_duckdb_table_names_as_list(db_api._con))
@@ -290,8 +298,9 @@ def test_table_deletions_with_preregistered():
}
db_api = DuckDBAPI(connection=con)
+ table_sdf = db_api.register("my_data_table")
- linker = Linker("my_data_table", settings, db_api=db_api)
+ linker = Linker(table_sdf, settings)
linker.table_management.register_df_concat_with_tf("my_nodes_with_tf_table")
table_names_before = set(get_duckdb_table_names_as_list(db_api._con))
@@ -325,8 +334,9 @@ def test_single_deletion():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
cache = linker._intermediate_table_cache
tf_table = linker.table_management.compute_tf_table("name")
diff --git a/tests/test_charts.py b/tests/test_charts.py
index 1187ccd15d..455b6ac837 100644
--- a/tests/test_charts.py
+++ b/tests/test_charts.py
@@ -3,7 +3,6 @@
import splink.internals.comparison_library as cl
from splink.internals.charts import save_offline_chart
-from splink.internals.linker import Linker
from tests.decorator import mark_with_dialects_excluding
# ground truth:
@@ -133,9 +132,8 @@ def test_m_u_charts(dialect, test_helpers):
],
}
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- linker = Linker(df, settings, db_api=db_api)
+ linker = helper.linker_with_registration(df, settings)
linker.training.estimate_probability_two_random_records_match(
["l.true_match_id = r.true_match_id"], recall=1.0
@@ -155,9 +153,8 @@ def test_parameter_estimate_charts(dialect, test_helpers):
],
}
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- linker = Linker(df, settings, db_api=db_api)
+ linker = helper.linker_with_registration(df, settings)
linker.training.estimate_probability_two_random_records_match(
["l.true_match_id = r.true_match_id"], recall=1.0
@@ -192,9 +189,7 @@ def test_parameter_estimate_charts(dialect, test_helpers):
cl.LevenshteinAtThresholds("first_name", [1]),
],
}
- db_api = helper.DatabaseAPI(**helper.db_api_args())
-
- linker = Linker(df, settings, db_api=db_api)
+ linker = helper.linker_with_registration(df, settings)
linker.training.estimate_u_using_random_sampling(1e6)
linker.visualisations.parameter_estimate_comparisons_chart()
@@ -213,9 +208,8 @@ def test_tf_adjustment_chart(dialect, test_helpers):
],
}
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- linker = Linker(df, settings, db_api=db_api)
+ linker = helper.linker_with_registration(df, settings)
linker.visualisations.tf_adjustment_chart("gender")
linker.visualisations.tf_adjustment_chart("first_name")
@@ -235,8 +229,7 @@ def test_save_offline_chart(tmp_path, test_helpers):
],
}
helper = test_helpers["duckdb"]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- linker = Linker(df, settings, db_api=db_api)
+ linker = helper.linker_with_registration(df, settings)
ch = linker.visualisations.tf_adjustment_chart("gender", as_dict=True)
save_offline_chart(ch, tmp_path / "test_chart.html")
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index a7a4142d60..bbfabfb122 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -32,12 +32,11 @@ def _sort_predictions(df):
def test_chunked_predict_matches_non_chunked(test_helpers, dialect):
"""Test that chunked predictions produce identical results to non-chunked."""
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
# Get non-chunked predictions
predictions_no_chunk = linker.inference.predict(threshold_match_weight=-10)
@@ -70,12 +69,11 @@ def test_chunked_predict_matches_non_chunked(test_helpers, dialect):
def test_chunked_predict_with_different_chunk_sizes(test_helpers, dialect):
"""Test various chunk size combinations produce consistent results."""
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
# Get baseline predictions
predictions_baseline = linker.inference.predict(threshold_match_weight=-10)
@@ -113,19 +111,18 @@ def test_chunked_predict_with_different_chunk_sizes(test_helpers, dialect):
def test_precached_blocked_pairs_same_result(test_helpers, dialect):
"""Test that pre-caching blocked pairs produces same result as no pre-caching."""
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
# First: run without pre-caching
- linker1 = Linker(df, settings, **helper.extra_linker_args())
+ linker1 = helper.linker_with_registration(df, settings)
predictions_no_cache = linker1.inference.predict(threshold_match_weight=-10)
df_no_cache = _sort_predictions(predictions_no_cache.as_pandas_dataframe())
# Second: run with pre-caching
- linker2 = Linker(df, settings, **helper.extra_linker_args())
+ linker2 = helper.linker_with_registration(df, settings)
linker2.table_management.compute_df_concat_with_tf()
linker2.table_management.compute_blocked_pairs_for_predict()
predictions_with_cache = linker2.inference.predict(threshold_match_weight=-10)
@@ -143,14 +140,13 @@ def test_precached_blocked_pairs_same_result(test_helpers, dialect):
def test_precached_chunked_blocked_pairs_same_result(test_helpers, dialect):
"""Test that pre-caching chunked blocked pairs produces same result."""
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
# First: run chunked without pre-caching
- linker1 = Linker(df, settings, **helper.extra_linker_args())
+ linker1 = helper.linker_with_registration(df, settings)
predictions_no_cache = linker1.inference.predict(
threshold_match_weight=-10,
num_chunks_left=2,
@@ -159,7 +155,7 @@ def test_precached_chunked_blocked_pairs_same_result(test_helpers, dialect):
df_no_cache = _sort_predictions(predictions_no_cache.as_pandas_dataframe())
# Second: run chunked with pre-caching of all chunks
- linker2 = Linker(df, settings, **helper.extra_linker_args())
+ linker2 = helper.linker_with_registration(df, settings)
linker2.table_management.compute_df_concat_with_tf()
# Pre-compute all 4 chunk combinations (2x2)
@@ -195,8 +191,9 @@ def test_cache_is_hit_for_blocked_pairs():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
# Pre-compute blocked pairs (populates cache)
linker.table_management.compute_df_concat_with_tf()
@@ -221,8 +218,9 @@ def test_cache_is_hit_for_chunked_blocked_pairs():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
# Pre-compute blocked pairs for specific chunk
linker.table_management.compute_df_concat_with_tf()
@@ -254,8 +252,9 @@ def test_cache_key_normalization_1_1():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
# Pre-compute with (1,1) x (1,1) - should normalize to base key
linker.table_management.compute_df_concat_with_tf()
@@ -275,8 +274,9 @@ def test_blocked_pairs_not_deleted_when_from_cache():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
# Pre-compute blocked pairs
linker.table_management.compute_df_concat_with_tf()
@@ -294,8 +294,9 @@ def test_blocked_pairs_deleted_when_not_from_cache():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
# Pre-compute df_concat_with_tf but NOT blocked pairs
linker.table_management.compute_df_concat_with_tf()
@@ -316,7 +317,6 @@ def test_blocked_pairs_deleted_when_not_from_cache():
def test_chunked_predict_link_only(test_helpers, dialect):
"""Test chunked predictions work correctly with link_only (two datasets)."""
helper = test_helpers[dialect]
- Linker = helper.Linker
settings = get_settings_dict()
settings["link_type"] = "link_only"
@@ -329,7 +329,7 @@ def test_chunked_predict_link_only(test_helpers, dialect):
df1 = helper.convert_frame(df1_pd)
df2 = helper.convert_frame(df2_pd)
- linker = Linker([df1, df2], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df1, df2], settings)
# Get baseline predictions
predictions_baseline = linker.inference.predict(threshold_match_weight=-10)
@@ -370,7 +370,6 @@ def test_chunked_predict_link_only_three_datasets(test_helpers, dialect):
Two datasets is a special case, so we test with three datasets as well.
"""
helper = test_helpers[dialect]
- Linker = helper.Linker
settings = get_settings_dict()
settings["link_type"] = "link_only"
@@ -385,7 +384,7 @@ def test_chunked_predict_link_only_three_datasets(test_helpers, dialect):
df2 = helper.convert_frame(df2_pd)
df3 = helper.convert_frame(df3_pd)
- linker = Linker([df1, df2, df3], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df1, df2, df3], settings)
# Get baseline predictions
predictions_baseline = linker.inference.predict(threshold_match_weight=-10)
@@ -423,7 +422,6 @@ def test_chunked_predict_link_only_three_datasets(test_helpers, dialect):
def test_chunked_predict_link_and_dedupe(test_helpers, dialect):
"""Test chunked predictions work correctly with link_and_dedupe (two datasets)."""
helper = test_helpers[dialect]
- Linker = helper.Linker
settings = get_settings_dict()
settings["link_type"] = "link_and_dedupe"
@@ -436,7 +434,7 @@ def test_chunked_predict_link_and_dedupe(test_helpers, dialect):
df1 = helper.convert_frame(df1_pd)
df2 = helper.convert_frame(df2_pd)
- linker = Linker([df1, df2], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df1, df2], settings)
# Get baseline predictions
predictions_baseline = linker.inference.predict(threshold_match_weight=-10)
@@ -477,7 +475,6 @@ def test_chunked_predict_link_and_dedupe_three_datasets(test_helpers, dialect):
Two datasets is a special case, so we test with three datasets as well.
"""
helper = test_helpers[dialect]
- Linker = helper.Linker
settings = get_settings_dict()
settings["link_type"] = "link_and_dedupe"
@@ -492,7 +489,7 @@ def test_chunked_predict_link_and_dedupe_three_datasets(test_helpers, dialect):
df2 = helper.convert_frame(df2_pd)
df3 = helper.convert_frame(df3_pd)
- linker = Linker([df1, df2, df3], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df1, df2, df3], settings)
# Get baseline predictions
predictions_baseline = linker.inference.predict(threshold_match_weight=-10)
diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py
index 2f310dadea..cc29016e5b 100644
--- a/tests/test_cluster_studio.py
+++ b/tests/test_cluster_studio.py
@@ -14,7 +14,9 @@ def test_density_sample():
"link_type": "dedupe_only",
"unique_id_column_name": "person_id",
}
- linker = Linker(df, settings, db_api=DuckDBAPI())
+ db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
+ linker = Linker(df_sdf, settings)
# Dummy cluster metrics table
cluster = ["A", "B", "C", "D", "E", "F"]
diff --git a/tests/test_cluster_using_single_best_links.py b/tests/test_cluster_using_single_best_links.py
index db85dbbfe1..f09eca25e8 100644
--- a/tests/test_cluster_using_single_best_links.py
+++ b/tests/test_cluster_using_single_best_links.py
@@ -1,7 +1,7 @@
import pandas as pd
import splink.comparison_library as cl
-from splink import Linker, SettingsCreator, block_on
+from splink import SettingsCreator, block_on
from .decorator import mark_with_dialects_excluding
@@ -35,7 +35,7 @@ def test_single_best_links_correctness_example_1(test_helpers, dialect):
blocking_rules_to_generate_predictions=[],
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.table_management.register_table_predict(
predictions, overwrite=True
@@ -102,7 +102,7 @@ def test_single_best_links_example_2(test_helpers, dialect):
blocking_rules_to_generate_predictions=[],
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.table_management.register_table_predict(
predictions, overwrite=True
@@ -167,7 +167,7 @@ def test_single_best_links_example_3(test_helpers, dialect):
blocking_rules_to_generate_predictions=[],
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.table_management.register_table_predict(
predictions, overwrite=True
@@ -230,7 +230,7 @@ def test_single_best_links_ties(test_helpers, dialect):
blocking_rules_to_generate_predictions=[],
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.table_management.register_table_predict(
predictions, overwrite=True
@@ -275,7 +275,7 @@ def test_single_best_links_ties_method(test_helpers, dialect):
blocking_rules_to_generate_predictions=[],
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.table_management.register_table_predict(
predictions, overwrite=True
@@ -369,7 +369,7 @@ def test_single_best_links_one_to_many(test_helpers, dialect):
blocking_rules_to_generate_predictions=[],
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.table_management.register_table_predict(
predictions, overwrite=True
@@ -427,7 +427,7 @@ def test_single_best_links_one_to_one(test_helpers, dialect):
],
)
- linker = Linker([df_l, df_r], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df_l, df_r], settings)
linker.training.estimate_u_using_random_sampling(1e6)
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
index a15f6dc1d3..6d974bcd36 100644
--- a/tests/test_clustering.py
+++ b/tests/test_clustering.py
@@ -61,7 +61,7 @@ def test_clustering(test_helpers, dialect, link_type, input_pd_tables):
],
)
linker_input = list(map(helper.convert_frame, input_pd_tables))
- linker = Linker(linker_input, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(linker_input, settings)
df_predict = linker.inference.predict()
linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.95)
@@ -70,8 +70,9 @@ def test_clustering(test_helpers, dialect, link_type, input_pd_tables):
def test_clustering_mw_prob_equivalence():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
settings_dict = get_settings_dict()
- linker = Linker(df, settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
df_predict = linker.inference.predict()
@@ -121,7 +122,7 @@ def test_clustering_no_edges(test_helpers, dialect):
unique_id_column_name="id",
)
linker_input = helper.convert_frame(df)
- linker = Linker(linker_input, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(linker_input, settings)
# due to blocking rules, df_predict will be empty
df_predict = linker.inference.predict()
diff --git a/tests/test_column_expression.py b/tests/test_column_expression.py
index 74574fe6ab..3ce582ee29 100644
--- a/tests/test_column_expression.py
+++ b/tests/test_column_expression.py
@@ -22,7 +22,7 @@ def test_access_extreme_array_element(test_helpers, dialect):
table_name = "arr_tab"
table = helper.convert_frame(df_arr)
db_api = helper.DatabaseAPI(**helper.db_api_args())
- arr_tab = db_api.register_table(table, table_name)
+ arr_tab = db_api.register(table, table_name)
# construct a SQL query from ColumnExpressions and run it against backend
splink_dialect = SplinkDialect.from_string(dialect)
@@ -66,7 +66,7 @@ def test_nullif(test_helpers, dialect):
table_name = "nully_name_table"
table = helper.convert_frame(df_arr)
db_api = helper.DatabaseAPI(**helper.db_api_args())
- nully_table = db_api.register_table(table, table_name)
+ nully_table = db_api.register(table, table_name)
# construct a SQL query from ColumnExpressions and run it against backend
splink_dialect = SplinkDialect.from_string(dialect)
diff --git a/tests/test_columns_selected.py b/tests/test_columns_selected.py
index c18db8f89d..9b0da37827 100644
--- a/tests/test_columns_selected.py
+++ b/tests/test_columns_selected.py
@@ -61,8 +61,10 @@ def test_regression(tmp_path):
connection=os.path.join(tmp_path, "duckdb.db"),
output_schema="splink_in_duckdb",
)
+ df_copy = df.copy()
+ df_sdf = db_api.register(df_copy)
- linker = Linker(df.copy(), settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
linker.inference.predict()
@@ -122,7 +124,9 @@ def test_discussion_example(tmp_path):
}
db_api = DuckDBAPI()
+ df_copy = df.copy()
+ df_sdf = db_api.register(df_copy)
- linker = Linker(df.copy(), settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
linker.inference.predict()
diff --git a/tests/test_compare_splink2.py b/tests/test_compare_splink2.py
index 7616efc392..606e4d40a7 100644
--- a/tests/test_compare_splink2.py
+++ b/tests/test_compare_splink2.py
@@ -14,8 +14,9 @@ def test_splink_2_predict():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings_dict = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
expected_record = pd.read_csv("tests/datasets/splink2_479_vs_481.csv")
@@ -35,7 +36,8 @@ def test_splink_2_predict():
@mark_with_dialects_including("spark")
def test_splink_2_predict_spark(df_spark, spark_api):
settings_dict = get_settings_dict()
- linker = Linker(df_spark, settings_dict, spark_api)
+ df_sdf = spark_api.register(df_spark)
+ linker = Linker(df_sdf, settings_dict)
df_e = linker.inference.predict().as_pandas_dataframe()
f1 = df_e["unique_id_l"] == "479"
@@ -62,7 +64,8 @@ def test_splink_2_predict_sqlite():
settings_dict = get_settings_dict()
db_api = SQLiteAPI(con)
- linker = Linker("fake_data_1", settings_dict, db_api=db_api)
+ df_sdf = db_api.register("fake_data_1")
+ linker = Linker(df_sdf, settings_dict)
df_e = linker.inference.predict().as_pandas_dataframe()
@@ -83,8 +86,9 @@ def test_splink_2_em_fixed_u():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings_dict = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
# Check lambda history is the same
expected_prop_history = pd.read_csv(
@@ -131,8 +135,9 @@ def test_splink_2_em_no_fix():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings_dict = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
# Check lambda history is the same
expected_prop_history = pd.read_csv(
@@ -189,8 +194,9 @@ def test_lambda():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings_dict, db_api=db_api)
+ linker = Linker(df_sdf, settings_dict)
ma = linker.inference.predict().as_pandas_dataframe()
f1 = ma["unique_id_l"] == 924
diff --git a/tests/test_compare_two_records.py b/tests/test_compare_two_records.py
index 3b0ee04de5..cae7381409 100644
--- a/tests/test_compare_two_records.py
+++ b/tests/test_compare_two_records.py
@@ -19,7 +19,6 @@ def test_compare_two_records_1(test_helpers, dialect):
# - User provides a city tf tble
# - But first_name tf table derived from input data
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_parquet(
"./tests/datasets/fake_1000_from_splink_demos_strip_datetypes.parquet"
@@ -43,7 +42,7 @@ def test_compare_two_records_1(test_helpers, dialect):
retain_matching_columns=True,
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
city_tf = pd.DataFrame(
[
@@ -96,7 +95,6 @@ def test_compare_two_records_2(test_helpers, dialect):
# - But specific values provided in input data, which take precedence
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_parquet(
"./tests/datasets/fake_1000_from_splink_demos_strip_datetypes.parquet"
@@ -120,7 +118,7 @@ def test_compare_two_records_2(test_helpers, dialect):
retain_matching_columns=True,
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
city_tf = pd.DataFrame(
[
diff --git a/tests/test_comparison_level_composition.py b/tests/test_comparison_level_composition.py
index 10dfcce978..8f0c424894 100644
--- a/tests/test_comparison_level_composition.py
+++ b/tests/test_comparison_level_composition.py
@@ -188,7 +188,7 @@ def test_composition_outputs(test_helpers, dialect):
"comparisons": [full_name],
}
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
pred = linker.inference.predict()
out = pred.as_pandas_dataframe().sort_values(by=["unique_id_l", "unique_id_r"])
diff --git a/tests/test_comparison_level_lib.py b/tests/test_comparison_level_lib.py
index 1703b3fb91..d922cce3a7 100644
--- a/tests/test_comparison_level_lib.py
+++ b/tests/test_comparison_level_lib.py
@@ -9,7 +9,7 @@
@mark_with_dialects_excluding()
def test_columns_reversed_level(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -104,7 +104,7 @@ def test_columns_reversed_level(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_perc_difference(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
perc_comparison = cl.CustomComparison(
comparison_description="amount",
@@ -162,7 +162,7 @@ def test_perc_difference(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_levenshtein_level(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
levenshtein_comparison = cl.CustomComparison(
comparison_description="name",
@@ -227,7 +227,7 @@ def test_levenshtein_level(test_helpers, dialect):
@mark_with_dialects_excluding("postgres")
def test_damerau_levenshtein_level(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
damerau_levenshtein_comparison = cl.CustomComparison(
comparison_description="name",
@@ -297,7 +297,7 @@ def test_damerau_levenshtein_level(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_absolute_difference(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
abs_comparison = cl.CustomComparison(
comparison_description="amount",
@@ -370,7 +370,7 @@ def test_cosine_similarity_level(test_helpers, dialect):
import pyarrow as pa
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
EMBEDDING_DIMENSION = 4
diff --git a/tests/test_comparison_lib.py b/tests/test_comparison_lib.py
index 1c9405b82e..d8279aa6e5 100644
--- a/tests/test_comparison_lib.py
+++ b/tests/test_comparison_lib.py
@@ -32,8 +32,9 @@ def test_distance_function_comparison():
],
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
df_pred = linker.inference.predict().as_pandas_dataframe()
@@ -68,7 +69,7 @@ def test_distance_function_comparison():
@mark_with_dialects_excluding("sqlite", "postgres")
def test_pairwise_stringdistance_function_comparison(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -124,7 +125,7 @@ def test_pairwise_stringdistance_function_comparison(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_set_to_lowercase(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py
index 105085e7b6..cc83b4ad65 100644
--- a/tests/test_comparison_template_lib.py
+++ b/tests/test_comparison_template_lib.py
@@ -6,7 +6,7 @@
@mark_with_dialects_excluding("postgres", "sqlite")
def test_email_comparison(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -52,7 +52,7 @@ def test_email_comparison(dialect, test_helpers):
@mark_with_dialects_excluding("sqlite")
def test_date_of_birth_comparison_levels(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -108,7 +108,7 @@ def test_date_of_birth_comparison_levels(dialect, test_helpers):
@mark_with_dialects_excluding("postgres", "sqlite")
def test_postcode_comparison(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -156,7 +156,7 @@ def test_postcode_comparison(dialect, test_helpers):
@mark_with_dialects_excluding("postgres", "sqlite")
def test_name_comparison(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -202,7 +202,7 @@ def test_name_comparison(dialect, test_helpers):
@mark_with_dialects_excluding("postgres", "sqlite")
def test_forename_surname_comparison(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
diff --git a/tests/test_comparison_viewer_dashboard.py b/tests/test_comparison_viewer_dashboard.py
index ed20dd58f8..a6c8a481cb 100644
--- a/tests/test_comparison_viewer_dashboard.py
+++ b/tests/test_comparison_viewer_dashboard.py
@@ -21,10 +21,12 @@ def test_comparison_viewer_dashboard(tmp_path):
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings_dict = get_settings_dict()
+ db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
+
linker = Linker(
- df,
+ df_sdf,
settings=settings_dict,
- db_api=DuckDBAPI(),
)
df_predict = linker.inference.predict()
@@ -68,10 +70,12 @@ def test_comparison_viewer_table():
"retain_intermediate_calculation_columns": True,
}
+ db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
+
linker = Linker(
- df,
+ df_sdf,
settings=settings_dict,
- db_api=DuckDBAPI(),
)
df_predict = linker.inference.predict()
diff --git a/tests/test_completeness.py b/tests/test_completeness.py
index ddacb4e421..f41d857afa 100644
--- a/tests/test_completeness.py
+++ b/tests/test_completeness.py
@@ -11,17 +11,18 @@
@mark_with_dialects_excluding("sqlite")
def test_completeness_chart(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- completeness_chart(df, db_api)
- completeness_chart(df, db_api, cols=["first_name", "surname"])
- completeness_chart(df, db_api, cols=["first_name"], table_names_for_chart=["t1"])
+ df_sdf = db_api.register(df)
+ completeness_chart(df_sdf)
+ completeness_chart(df_sdf, cols=["first_name", "surname"])
+ completeness_chart(df_sdf, cols=["first_name"], table_names_for_chart=["t1"])
@mark_with_dialects_excluding("sqlite")
def test_completeness_chart_mismatched_columns(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
df_l = helper.load_frame_from_csv(
"./tests/datasets/fake_1000_from_splink_demos.csv"
@@ -30,14 +31,17 @@ def test_completeness_chart_mismatched_columns(dialect, test_helpers):
df_r.rename(columns={"surname": "surname_2"}, inplace=True)
df_r = helper.convert_frame(df_r)
+ df_l_sdf = db_api.register(df_l)
+ df_r_sdf = db_api.register(df_r)
+
with raises(SplinkException):
- completeness_chart([df_l, df_r], db_api)
+ completeness_chart([df_l_sdf, df_r_sdf])
@mark_with_dialects_excluding("sqlite")
def test_completeness_chart_complex_columns(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
df = pd.DataFrame(
{
"id": [1, 2, 3, 4, 5],
@@ -53,17 +57,19 @@ def test_completeness_chart_complex_columns(dialect, test_helpers):
}
)
df = helper.convert_frame(df)
+ df_sdf = db_api.register(df)
first = helper.arrays_from
# check completeness when we have more complicated column constructs, such as
# indexing into array columns
- completeness_chart(df, db_api, cols=["first_name", "surname", f"city_arr[{first}]"])
+ completeness_chart(df_sdf, cols=["first_name", "surname", f"city_arr[{first}]"])
@mark_with_dialects_excluding("sqlite")
def test_completeness_chart_source_dataset(dialect, test_helpers):
helper = test_helpers[dialect]
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
df_pd = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
df_pd["source_dataset"] = "fake_1000"
df = helper.convert_frame(df_pd)
- completeness_chart(df, db_api)
+ df_sdf = db_api.register(df)
+ completeness_chart(df_sdf)
diff --git a/tests/test_compound_comparison_levels.py b/tests/test_compound_comparison_levels.py
index 186f248695..6a278a0d50 100644
--- a/tests/test_compound_comparison_levels.py
+++ b/tests/test_compound_comparison_levels.py
@@ -119,8 +119,9 @@ def col_is_null(col):
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
all_cols_match_level = linker._settings_obj.comparisons[1].comparison_levels[1]
assert all_cols_match_level._is_exact_match
assert set(all_cols_match_level._exact_match_colnames) == {
@@ -217,7 +218,8 @@ def test_complex_compound_comparison_level():
],
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker.training.estimate_parameters_using_expectation_maximisation("1=1")
diff --git a/tests/test_correctness_of_convergence.py b/tests/test_correctness_of_convergence.py
index ae5d1e00b5..0abfebd08f 100644
--- a/tests/test_correctness_of_convergence.py
+++ b/tests/test_correctness_of_convergence.py
@@ -75,8 +75,9 @@ def test_splink_converges_to_known_params():
)
db_api = DuckDBAPI()
+ in_df_sdf = db_api.register(in_df)
- linker = Linker(in_df, settings, db_api=db_api)
+ linker = Linker(in_df_sdf, settings)
settings_obj = linker._settings_obj
@@ -105,7 +106,7 @@ def test_splink_converges_to_known_params():
cvv_hashed_tablename = re.search(pattern, str(e)).group()
- cvv_table = db_api.register_table(df, cvv_hashed_tablename)
+ cvv_table = db_api._create_backend_table(df, cvv_hashed_tablename)
cvv_table.templated_name = "__splink__df_comparison_vectors"
core_model_settings = em_training_session._train(cvv_table)
diff --git a/tests/test_date_levels_and_comparisons.py b/tests/test_date_levels_and_comparisons.py
index 2fd46efd88..2523ce31f2 100644
--- a/tests/test_date_levels_and_comparisons.py
+++ b/tests/test_date_levels_and_comparisons.py
@@ -14,7 +14,7 @@
@mark_with_dialects_excluding("sqlite")
def test_absolute_date_difference_level(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -65,7 +65,7 @@ def test_absolute_date_difference_level(test_helpers, dialect):
@mark_with_dialects_excluding("sqlite")
def test_absolute_time_difference_levels(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -116,7 +116,7 @@ def test_absolute_time_difference_levels(test_helpers, dialect):
@mark_with_dialects_excluding("sqlite")
def test_absolute_date_difference_at_thresholds(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
@@ -155,7 +155,7 @@ def test_absolute_date_difference_at_thresholds(test_helpers, dialect):
@mark_with_dialects_including("duckdb", pass_dialect=True)
def test_alternative_date_format(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
test_cases = [
{
diff --git a/tests/test_disable_tf_exact_match_detection.py b/tests/test_disable_tf_exact_match_detection.py
index e5689b033b..97218a6a99 100644
--- a/tests/test_disable_tf_exact_match_detection.py
+++ b/tests/test_disable_tf_exact_match_detection.py
@@ -158,7 +158,10 @@ def get_settings(disable_tf_exact_match_detection, tf_minimum_u_value=None):
settings_normal = get_settings(disable_tf_exact_match_detection=False)
- linker = Linker(df, settings_normal, DuckDBAPI())
+ db_api_1 = DuckDBAPI()
+ df_sdf = db_api_1.register(df)
+
+ linker = Linker(df_sdf, settings_normal)
tf_lookup = [
{"surname": "Taylor", "tf_surname": 0.4},
@@ -200,7 +203,9 @@ def get_settings(disable_tf_exact_match_detection, tf_minimum_u_value=None):
)
settings_disabled = get_settings(disable_tf_exact_match_detection=True)
- linker = Linker(df, settings_disabled, DuckDBAPI())
+ db_api_2 = DuckDBAPI()
+ df_sdf_2 = db_api_2.register(df)
+ linker = Linker(df_sdf_2, settings_disabled)
tf_lookup = [
{"surname": "Taylor", "tf_surname": 0.4},
@@ -244,10 +249,16 @@ def get_settings(disable_tf_exact_match_detection, tf_minimum_u_value=None):
disable_tf_exact_match_detection=True, tf_minimum_u_value=0.1
)
- linker_base = Linker(df, settings_disabled_with_min_tf, DuckDBAPI())
+ db_api_3 = DuckDBAPI()
+ df_sdf_3 = db_api_3.register(df)
+ linker_base = Linker(df_sdf_3, settings_disabled_with_min_tf)
+
+ # Create fresh db_api for second linker to avoid table conflicts
+ db_api_4 = DuckDBAPI()
+ df_sdf_4 = db_api_4.register(df)
linkers = [
linker_base,
- Linker(df, linker_base.misc.save_model_to_json(), DuckDBAPI()),
+ Linker(df_sdf_4, linker_base.misc.save_model_to_json()),
]
# This ensures we're checking that serialisation and deserialisation
diff --git a/tests/test_estimate_prob_two_rr_match.py b/tests/test_estimate_prob_two_rr_match.py
index 4faa104b88..c9bdbc3695 100644
--- a/tests/test_estimate_prob_two_rr_match.py
+++ b/tests/test_estimate_prob_two_rr_match.py
@@ -33,7 +33,7 @@ def test_prob_rr_match_dedupe(test_helpers, dialect):
deterministic_rules = ["l.first_name = r.first_name", "l.surname = r.surname"]
# Test dedupe only
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
linker.training.estimate_probability_two_random_records_match(
deterministic_rules, recall=1.0
)
@@ -86,7 +86,7 @@ def test_prob_rr_match_link_only(test_helpers, dialect):
deterministic_rules = ["l.first_name = r.first_name", "l.surname = r.surname"]
# Test dedupe only
- linker = helper.Linker([df_1, df_2], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df_1, df_2], settings)
linker.training.estimate_probability_two_random_records_match(
deterministic_rules, recall=1.0
)
@@ -126,7 +126,7 @@ def test_prob_rr_match_link_and_dedupe(test_helpers, dialect):
deterministic_rules = ["l.first_name = r.first_name", "l.surname = r.surname"]
# Test dedupe only
- linker = helper.Linker([df_1, df_2], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df_1, df_2], settings)
linker.training.estimate_probability_two_random_records_match(
deterministic_rules, recall=1.0
)
@@ -195,7 +195,7 @@ def test_prob_rr_match_link_only_multitable(test_helpers, dialect):
deterministic_rules = ["l.first_name = r.first_name", "l.surname = r.surname"]
- linker = helper.Linker(dfs, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(dfs, settings)
linker.training.estimate_probability_two_random_records_match(
deterministic_rules, recall=1.0
)
@@ -206,7 +206,7 @@ def test_prob_rr_match_link_only_multitable(test_helpers, dialect):
assert pytest.approx(prob) == 6 / 131
# if we define all record pairs to be a match, then the probability should be 1
- linker = helper.Linker(dfs, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(dfs, settings)
linker.training.estimate_probability_two_random_records_match(
["l.city = r.city"], recall=1.0
)
@@ -273,7 +273,7 @@ def test_prob_rr_match_link_and_dedupe_multitable(test_helpers, dialect):
deterministic_rules = ["l.first_name = r.first_name", "l.surname = r.surname"]
- linker = helper.Linker(dfs, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(dfs, settings)
linker.training.estimate_probability_two_random_records_match(
deterministic_rules, recall=1.0
)
@@ -284,7 +284,7 @@ def test_prob_rr_match_link_and_dedupe_multitable(test_helpers, dialect):
# (3 + 4 + 5 + 7)(3 + 4 + 5 + 7 - 1)/2 = 171 comparisons
assert pytest.approx(prob) == 10 / 171
- linker = helper.Linker(dfs, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(dfs, settings)
linker.training.estimate_probability_two_random_records_match(
["l.city = r.city"], recall=1.0
)
@@ -348,7 +348,7 @@ def check_range(p):
}
# Test dedupe only
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
with pytest.raises(ValueError):
# all comparisons matches using this rule, so we must have perfect recall
# using recall = 80% is inconsistent, so should get an error
diff --git a/tests/test_expectation_maximisation.py b/tests/test_expectation_maximisation.py
index aa4c0d2432..5452ec783d 100644
--- a/tests/test_expectation_maximisation.py
+++ b/tests/test_expectation_maximisation.py
@@ -29,8 +29,9 @@ def test_clear_error_when_empty_block():
}
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- linker = Linker(df, settings, db_api=db_api)
+ linker = Linker(df_sdf, settings)
linker._debug_mode = True
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
linker.training.estimate_parameters_using_expectation_maximisation(
@@ -55,13 +56,15 @@ def test_estimate_without_term_frequencies():
],
}
- db_api = DuckDBAPI()
+ db_api_1 = DuckDBAPI()
+ df_sdf_1 = db_api_1.register(df)
- linker_0 = Linker(df, settings, db_api=db_api)
+ linker_0 = Linker(df_sdf_1, settings)
- db_api = DuckDBAPI()
+ db_api_2 = DuckDBAPI()
+ df_sdf_2 = db_api_2.register(df)
- linker_1 = Linker(df, settings, db_api=db_api)
+ linker_1 = Linker(df_sdf_2, settings)
session_fast = linker_0.training.estimate_parameters_using_expectation_maximisation(
blocking_rule="l.email = r.email",
@@ -149,7 +152,10 @@ def test_fix_probabilities():
additional_columns_to_retain=["cluster"],
)
- linker = Linker(df, settings, db_api=DuckDBAPI())
+ db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
+
+ linker = Linker(df_sdf, settings)
linker.training.estimate_u_using_random_sampling(max_pairs=1e4)
diff --git a/tests/test_extreme_match_weights.py b/tests/test_extreme_match_weights.py
index 34387d07c4..8f8ad1caa8 100644
--- a/tests/test_extreme_match_weights.py
+++ b/tests/test_extreme_match_weights.py
@@ -47,7 +47,8 @@ def test_extreme_match_weights_high_similarity():
)
db_api = DuckDBAPI()
- linker = Linker(df, settings, db_api=db_api)
+ df_sdf = db_api.register(df)
+ linker = Linker(df_sdf, settings)
predictions = linker.inference.predict().as_pandas_dataframe()
# Should get exactly one prediction (comparing record 1 with record 2)
@@ -85,7 +86,8 @@ def test_extreme_match_weights_low_similarity():
)
db_api = DuckDBAPI()
- linker = Linker(df, settings, db_api=db_api)
+ df_sdf = db_api.register(df)
+ linker = Linker(df_sdf, settings)
predictions = linker.inference.predict().as_pandas_dataframe()
match_prob = predictions["match_probability"].iloc[0]
diff --git a/tests/test_find_new_matches.py b/tests/test_find_new_matches.py
index 648c9e165c..14c4d6506e 100644
--- a/tests/test_find_new_matches.py
+++ b/tests/test_find_new_matches.py
@@ -45,13 +45,11 @@ def get_different_settings_dicts():
@mark_with_dialects_excluding()
def test_tf_tables_init_works(test_helpers, dialect):
helper = test_helpers[dialect]
- Linker = helper.Linker
for idx, s in enumerate(get_different_settings_dicts()):
- linker = Linker(
+ linker = helper.linker_with_registration(
df,
s,
- **helper.extra_linker_args(),
input_table_aliases=f"test_tf_table_alias_{idx}",
)
@@ -81,11 +79,10 @@ def test_tf_tables_init_works(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_matches_work(test_helpers, dialect):
helper = test_helpers[dialect]
- Linker = helper.Linker
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- linker = Linker(df, get_settings_dict(), **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, get_settings_dict())
# Train our model to get more reasonable outputs...
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
diff --git a/tests/test_full_example_deterministic_link.py b/tests/test_full_example_deterministic_link.py
index 7ee4e056f2..28bbe812fc 100644
--- a/tests/test_full_example_deterministic_link.py
+++ b/tests/test_full_example_deterministic_link.py
@@ -5,7 +5,6 @@
from splink.blocking_analysis import (
cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)
-from splink.internals.linker import Linker
from .decorator import mark_with_dialects_excluding
@@ -28,17 +27,17 @@ def test_deterministic_link_full_example(dialect, tmp_path, test_helpers):
"retain_matching_columns": True,
"retain_intermediate_calculation_columns": True,
}
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ helper_db_api = helper.db_api()
+ df_sdf = helper_db_api.register(df)
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
- table_or_tables=df,
+ df_sdf,
blocking_rules=br_for_predict,
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
- linker = Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_predict = linker.inference.deterministic_link()
diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py
index 7f6476c73e..a25dcf20d5 100644
--- a/tests/test_full_example_duckdb.py
+++ b/tests/test_full_example_duckdb.py
@@ -41,25 +41,23 @@ def test_full_example_duckdb(tmp_path):
]
db_api = DuckDBAPI(connection=os.path.join(tmp_path, "duckdb.db"))
+ df_sdf = db_api.register(df)
count_comparisons_from_blocking_rule(
- table_or_tables=df,
+ df_sdf,
blocking_rule='l.first_name = r.first_name and l."SUR name" = r."SUR name"', # noqa: E501
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
linker = Linker(
- df,
+ df_sdf,
settings=settings_dict,
- db_api=db_api,
# output_schema="splink_in_duckdb",
)
profile_columns(
- df,
- db_api,
+ df_sdf,
[
"first_name",
'"SUR name"',
@@ -67,7 +65,7 @@ def test_full_example_duckdb(tmp_path):
"concat(city, first_name)",
],
)
- completeness_chart(df, db_api)
+ completeness_chart(df_sdf)
linker.table_management.compute_tf_table("city")
linker.table_management.compute_tf_table("first_name")
@@ -131,9 +129,10 @@ def test_full_example_duckdb(tmp_path):
linker.misc.save_model_to_json(path)
db_api = DuckDBAPI()
- linker_2 = Linker(df, settings=simple_settings, db_api=db_api)
+ df_sdf2 = db_api.register(df)
+ linker_2 = Linker(df_sdf2, settings=simple_settings)
- linker_2 = Linker(df, db_api=db_api, settings=path)
+ linker_2 = Linker(df_sdf2, settings=path)
# Test that writing to files works as expected
_test_write_functionality(linker_2, pd.read_csv)
@@ -182,7 +181,12 @@ def test_link_only(input, source_l, source_r):
settings["source_dataset_column_name"] = "source_dataset"
db_api = DuckDBAPI()
- linker = Linker(input, settings, db_api=db_api)
+ if isinstance(input, list):
+ input_sdf = [db_api.register(inp) for inp in input]
+ else:
+ input_sdf = db_api.register(input)
+
+ linker = Linker(input_sdf, settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == 7257
@@ -219,10 +223,10 @@ def test_duckdb_load_different_tablish_types(df):
settings = get_settings_dict()
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
linker = Linker(
- df,
+ df_sdf,
settings,
- db_api=db_api,
)
assert len(linker.inference.predict().as_pandas_dataframe()) == 3167
@@ -230,11 +234,11 @@ def test_duckdb_load_different_tablish_types(df):
settings["link_type"] = "link_only"
db_api = DuckDBAPI()
+ df_sdf1 = db_api.register(df, source_dataset_name="testing1")
+ df_sdf2 = db_api.register(df, source_dataset_name="testing2")
linker = Linker(
- [df, df],
+ [df_sdf1, df_sdf2],
settings,
- db_api=db_api,
- input_table_aliases=["testing1", "testing2"],
)
assert len(linker.inference.predict().as_pandas_dataframe()) == 7257
@@ -257,15 +261,15 @@ def test_duckdb_arrow_array():
# ]
db_api = DuckDBAPI()
+ array_data_sdf = db_api.register(array_data)
linker = Linker(
- array_data,
+ array_data_sdf,
{
"link_type": "dedupe_only",
"unique_id_column_name": "uid",
"comparisons": [cl.ExactMatch("b")],
"blocking_rules_to_generate_predictions": ["l.a[1] = r.a[1]"],
},
- db_api=db_api,
)
df = linker.inference.deterministic_link().as_pandas_dataframe()
assert len(df) == 2
@@ -310,7 +314,8 @@ def test_small_example_duckdb(tmp_path):
}
db_api = DuckDBAPI()
- linker = Linker(df, settings_dict, db_api=db_api)
+ df_sdf = db_api.register(df)
+ linker = Linker(df_sdf, settings_dict)
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
blocking_rule = "l.full_name = r.full_name"
@@ -333,5 +338,7 @@ def test_duckdb_input_is_duckdbpyrelation():
blocking_rules_to_generate_predictions=[block_on("first_name", "surname")],
)
db_api = DuckDBAPI(connection=":default:")
- linker = Linker([df1, df2], settings, db_api)
+ df1_sdf = db_api.register(df1)
+ df2_sdf = db_api.register(df2)
+ linker = Linker([df1_sdf, df2_sdf], settings)
linker.inference.predict()
diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py
index 9bdf934c64..aea6b333b0 100644
--- a/tests/test_full_example_postgres.py
+++ b/tests/test_full_example_postgres.py
@@ -26,35 +26,33 @@ def test_full_example_postgres(tmp_path, pg_engine):
settings_dict = get_settings_dict()
db_api = PostgresAPI(engine=pg_engine)
+ df_sdf = db_api.register(df)
+
linker = Linker(
- df,
+ df_sdf,
settings_dict,
- db_api=db_api,
)
count_comparisons_from_blocking_rule(
- table_or_tables=df,
+ df_sdf,
blocking_rule='l.first_name = r.first_name and l."surname" = r."surname"', # noqa: E501
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
- table_or_tables=df,
+ df_sdf,
blocking_rules=[
"l.first_name = r.first_name",
"l.surname = r.surname",
"l.city = r.city",
],
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
profile_columns(
- df,
- db_api,
+ df_sdf,
[
"first_name",
'"surname"',
@@ -63,7 +61,7 @@ def test_full_example_postgres(tmp_path, pg_engine):
],
)
- completeness_chart(df, db_api=db_api)
+ completeness_chart(df_sdf)
linker.table_management.compute_tf_table("city")
linker.table_management.compute_tf_table("first_name")
@@ -126,7 +124,7 @@ def test_full_example_postgres(tmp_path, pg_engine):
path = os.path.join(tmp_path, "model.json")
linker.misc.save_model_to_json(path)
- Linker(df, path, db_api=db_api)
+ Linker(df_sdf, path)
@mark_with_dialects_including("postgres")
@@ -139,9 +137,10 @@ def test_postgres_use_existing_table(tmp_path, pg_engine):
settings_dict = get_settings_dict()
db_api = PostgresAPI(engine=pg_engine)
+ df_sdf = db_api.register(table_name)
+
linker = Linker(
- table_name,
- db_api=db_api,
- settings=settings_dict,
+ df_sdf,
+ settings_dict,
)
linker.inference.predict()
diff --git a/tests/test_full_example_spark.py b/tests/test_full_example_spark.py
index 94f14f16e5..62a25bbd29 100644
--- a/tests/test_full_example_spark.py
+++ b/tests/test_full_example_spark.py
@@ -36,7 +36,8 @@ def test_full_example_spark(spark, df_spark, tmp_path, spark_api, break_lineage_
spark.sql("CREATE DATABASE IF NOT EXISTS `1111`")
# Annoyingly, this needs an independent linker as csv doesn't
# accept arrays as inputs, which we are adding to df_spark below
- linker = Linker(df_spark, get_settings_dict(), spark_api)
+ df_spark_sdf = spark_api.register(df_spark)
+ linker = Linker(df_spark_sdf, get_settings_dict())
# Test that writing to files works as expected
def spark_csv_read(x):
@@ -79,24 +80,22 @@ def spark_csv_read(x):
"max_iterations": 2,
}
+ df_spark_sdf_profile = spark_api.register(df_spark)
profile_columns(
- df_spark,
- spark_api,
+ df_spark_sdf_profile,
["first_name", "surname", "first_name || surname", "concat(city, first_name)"],
)
- completeness_chart(df_spark, spark_api)
+ completeness_chart(df_spark_sdf_profile)
spark.sql("USE DATABASE `1111`")
- linker = Linker(
- df_spark,
- settings,
- SparkAPI(
- spark_session=spark,
- break_lineage_method=break_lineage_method,
- num_partitions_on_repartition=2,
- ),
+ spark_api_2 = SparkAPI(
+ spark_session=spark,
+ break_lineage_method=break_lineage_method,
+ num_partitions_on_repartition=2,
)
+ df_spark_sdf_2 = spark_api_2.register(df_spark)
+ linker = Linker(df_spark_sdf_2, settings)
linker.table_management.compute_tf_table("city")
linker.table_management.compute_tf_table("first_name")
@@ -163,21 +162,21 @@ def spark_csv_read(x):
# Test differing inputs are accepted
settings["link_type"] = "link_only"
- linker = Linker(
- [df_spark, df_spark.toPandas()],
- settings,
- SparkAPI(
- spark_session=spark,
- break_lineage_method="checkpoint",
- num_partitions_on_repartition=2,
- ),
+ spark_api_3 = SparkAPI(
+ spark_session=spark,
+ break_lineage_method="checkpoint",
+ num_partitions_on_repartition=2,
)
+ df_spark_sdf_3 = spark_api_3.register(df_spark)
+ df_pandas_sdf_3 = spark_api_3.register(df_spark.toPandas())
+ linker = Linker([df_spark_sdf_3, df_pandas_sdf_3], settings)
# Test saving and loading
path = os.path.join(tmp_path, "model.json")
linker.misc.save_model_to_json(path)
- Linker(df_spark, settings=path, db_api=spark_api)
+ df_spark_sdf_final = spark_api.register(df_spark)
+ Linker(df_spark_sdf_final, settings=path)
@mark_with_dialects_including("spark")
@@ -189,15 +188,14 @@ def test_link_only(spark, df_spark, spark_api):
df_spark_a = df_spark.withColumn("source_dataset", f.lit("my_left_ds"))
df_spark_b = df_spark.withColumn("source_dataset", f.lit("my_right_ds"))
- linker = Linker(
- [df_spark_a, df_spark_b],
- settings,
- SparkAPI(
- spark_session=spark,
- break_lineage_method="checkpoint",
- num_partitions_on_repartition=2,
- ),
+ spark_api_link = SparkAPI(
+ spark_session=spark,
+ break_lineage_method="checkpoint",
+ num_partitions_on_repartition=2,
)
+ df_spark_a_sdf = spark_api_link.register(df_spark_a)
+ df_spark_b_sdf = spark_api_link.register(df_spark_b)
+ linker = Linker([df_spark_a_sdf, df_spark_b_sdf], settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == 7257
@@ -218,10 +216,7 @@ def test_link_only(spark, df_spark, spark_api):
def test_spark_load_from_file(df, spark, spark_api):
settings = get_settings_dict()
- linker = Linker(
- df,
- settings,
- spark_api,
- )
+ df_sdf = spark_api.register(df)
+ linker = Linker(df_sdf, settings)
assert len(linker.inference.predict().as_pandas_dataframe()) == 3167
diff --git a/tests/test_full_example_sqlite.py b/tests/test_full_example_sqlite.py
index 08bd13ee71..2b49495726 100644
--- a/tests/test_full_example_sqlite.py
+++ b/tests/test_full_example_sqlite.py
@@ -20,18 +20,15 @@ def test_full_example_sqlite(tmp_path):
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- df.to_sql("input_df_tablename", con)
-
settings_dict = get_settings_dict()
db_api = SQLiteAPI(con)
+ df_sdf = db_api.register(df, source_dataset_name="fake_data_1")
linker = Linker(
- "input_df_tablename",
+ df_sdf,
settings_dict,
- db_api=db_api,
- input_table_aliases="fake_data_1",
)
- profile_columns(df, db_api, ["first_name", "surname", "first_name || surname"])
+ profile_columns(df_sdf, ["first_name", "surname", "first_name || surname"])
linker.table_management.compute_tf_table("city")
linker.table_management.compute_tf_table("first_name")
@@ -74,14 +71,12 @@ def test_small_link_example_sqlite():
settings_dict["link_type"] = "link_only"
- df.to_sql("input_df_tablename", con)
-
db_api = SQLiteAPI(con)
+ df_1_sdf = db_api.register(df, source_dataset_name="fake_data_1")
+ df_2_sdf = db_api.register(df, source_dataset_name="fake_data_2")
linker = Linker(
- ["input_df_tablename", "input_df_tablename"],
+ [df_1_sdf, df_2_sdf],
settings_dict,
- db_api,
- input_table_aliases=["fake_data_1", "fake_data_2"],
)
linker.inference.predict()
@@ -94,6 +89,7 @@ def test_default_conn_sqlite(tmp_path):
settings_dict = get_settings_dict()
db_api = SQLiteAPI()
- linker = Linker(df, settings_dict, db_api)
+ df_sdf = db_api.register(df)
+ linker = Linker(df_sdf, settings_dict)
linker.inference.predict()
diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py
index b0f1ad8b33..fbfc64a8a3 100644
--- a/tests/test_graph_metrics.py
+++ b/tests/test_graph_metrics.py
@@ -36,8 +36,9 @@ def test_size_density_dedupe():
],
}
db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1)
- linker = Linker(df_1, settings, db_api=db_api)
+ linker = Linker(df_1_sdf, settings)
df_predict = linker.inference.predict()
df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
@@ -70,12 +71,12 @@ def test_size_density_link():
],
}
db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1, source_dataset_name="df_left")
+ df_2_sdf = db_api.register(df_2, source_dataset_name="df_right")
linker = Linker(
- [df_1, df_2],
+ [df_1_sdf, df_2_sdf],
settings,
- input_table_aliases=["df_left", "df_right"],
- db_api=db_api,
)
df_predict = linker.inference.predict()
@@ -229,10 +230,9 @@ def test_metrics(dialect, test_helpers):
]
# pass in dummy frame to linker
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
helper.convert_frame(df_1),
{"link_type": "dedupe_only"},
- **helper.extra_linker_args(),
)
df_predict = linker.table_management.register_table(
helper.convert_frame(df_e), "predict"
@@ -350,10 +350,9 @@ def test_is_bridge(dialect, test_helpers):
+ [{"cluster_id": 2, "unique_id": i} for i in range(5, 10 + 1)]
+ [{"cluster_id": 3, "unique_id": i} for i in range(11, 18 + 1)]
)
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
helper.convert_frame(df_1),
{"link_type": "dedupe_only"},
- **helper.extra_linker_args(),
)
df_predict = linker.table_management.register_table(
helper.convert_frame(df_e), "br_predict"
@@ -407,7 +406,9 @@ def test_edges_without_igraph():
ExactMatch("dob"),
],
}
- linker = Linker(df_1, settings, DuckDBAPI())
+ db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1)
+ linker = Linker(df_1_sdf, settings)
df_predict = linker.inference.predict()
df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
@@ -445,7 +446,9 @@ def test_no_threshold_provided():
)
settings = {"link_type": "dedupe_only"}
- linker = Linker(df_1, settings, DuckDBAPI())
+ db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1)
+ linker = Linker(df_1_sdf, settings)
df_predict = linker.table_management.register_table(df_e, "predict")
df_clustered = linker.table_management.register_table(df_c, "clusters")
@@ -467,7 +470,9 @@ def test_override_metadata_threshold():
)
df_c = pd.DataFrame([{"cluster_id": 1, "unique_id": i} for i in range(1, 3 + 1)])
settings = {"link_type": "dedupe_only"}
- linker = Linker(df_1, settings, DuckDBAPI())
+ db_api = DuckDBAPI()
+ df_1_sdf = db_api.register(df_1)
+ linker = Linker(df_1_sdf, settings)
# linker.debug_mode = True
df_predict = linker.table_management.register_table(df_e, "predict")
df_clustered = linker.table_management.register_table(df_c, "clusters")
diff --git a/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py b/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py
index 4cfab91412..736d5ff389 100644
--- a/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py
+++ b/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py
@@ -114,10 +114,10 @@ def test_dedupe_only():
],
}
db_api = DuckDBAPI()
+ df_one_sdf = db_api.register(df_one)
linker = Linker(
- df_one,
+ df_one_sdf,
settings,
- db_api=db_api,
set_up_basic_logging=False,
)
logging.getLogger("splink").setLevel(1)
@@ -167,11 +167,11 @@ def test_link_and_dedupe():
],
}
db_api = DuckDBAPI()
+ df_one_sdf = db_api.register(df_one, source_dataset_name="df_one")
+ df_two_sdf = db_api.register(df_two, source_dataset_name="df_two")
linker = Linker(
- [df_one, df_two],
+ [df_one_sdf, df_two_sdf],
settings,
- db_api=db_api,
- input_table_aliases=["df_one", "df_two"],
set_up_basic_logging=False,
)
@@ -223,11 +223,11 @@ def test_link_only_two():
],
}
db_api = DuckDBAPI()
+ df_one_sdf = db_api.register(df_one, source_dataset_name="df_one")
+ df_two_sdf = db_api.register(df_two, source_dataset_name="df_two")
linker = Linker(
- [df_one, df_two],
+ [df_one_sdf, df_two_sdf],
settings,
- db_api=db_api,
- input_table_aliases=["df_one", "df_two"],
set_up_basic_logging=False,
)
@@ -280,11 +280,12 @@ def test_link_only_three():
],
}
db_api = DuckDBAPI()
+ df_one_sdf = db_api.register(df_one, source_dataset_name="df_one")
+ df_two_sdf = db_api.register(df_two, source_dataset_name="df_two")
+ df_three_sdf = db_api.register(df_three, source_dataset_name="df_three")
linker = Linker(
- [df_one, df_two, df_three],
+ [df_one_sdf, df_two_sdf, df_three_sdf],
settings,
- db_api=db_api,
- input_table_aliases=["df_one", "df_two", "df_three"],
set_up_basic_logging=False,
)
diff --git a/tests/test_km_distance_level.py b/tests/test_km_distance_level.py
index 52708eec25..7e02f01446 100644
--- a/tests/test_km_distance_level.py
+++ b/tests/test_km_distance_level.py
@@ -124,9 +124,9 @@ def test_km_distance_levels(dialect, test_helpers):
df = helper.convert_frame(df)
- linker = helper.Linker(df, settings_cl, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings_cl)
cl_df_e = linker.inference.predict().as_pandas_dataframe()
- linker = helper.Linker(df, settings_cll, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings_cll)
cll_df_e = linker.inference.predict().as_pandas_dataframe()
linker_outputs = {
@@ -226,7 +226,8 @@ def test_haversine_level():
db_api = DuckDBAPI()
- linker = Linker(df, settings, input_table_aliases="test", db_api=db_api)
+ df_sdf = db_api.register(df, source_dataset_name="test")
+ linker = Linker(df_sdf, settings)
df_e = linker.inference.predict().as_pandas_dataframe()
row = dict(df_e.query("id_l == 1 and id_r == 2").iloc[0])
diff --git a/tests/test_linker_variants.py b/tests/test_linker_variants.py
index d54c5b3c02..60fd986925 100644
--- a/tests/test_linker_variants.py
+++ b/tests/test_linker_variants.py
@@ -63,7 +63,8 @@ def test_dedupe_only_join_condition():
settings["link_type"] = "dedupe_only"
db_api = DuckDBAPI()
- linker = Linker(df.copy(), settings, db_api=db_api)
+ df_sdf = db_api.register(df.copy())
+ linker = Linker(df_sdf, settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == (6 * 5) / 2
@@ -77,7 +78,9 @@ def test_link_only_two_join_condition():
settings["link_type"] = "link_only"
db_api = DuckDBAPI()
- linker = Linker([sds_d_only, sds_b_only], settings, db_api=db_api)
+ sds_d_sdf = db_api.register(sds_d_only)
+ sds_b_sdf = db_api.register(sds_b_only)
+ linker = Linker([sds_d_sdf, sds_b_sdf], settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == 4
@@ -97,7 +100,10 @@ def test_link_only_three_join_condition():
settings["link_type"] = "link_only"
db_api = DuckDBAPI()
- linker = Linker([sds_d_only, sds_b_only, sds_c_only], settings, db_api=db_api)
+ sds_d_sdf = db_api.register(sds_d_only)
+ sds_b_sdf = db_api.register(sds_b_only)
+ sds_c_sdf = db_api.register(sds_c_only)
+ linker = Linker([sds_d_sdf, sds_b_sdf, sds_c_sdf], settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == 12
@@ -117,7 +123,9 @@ def test_link_and_dedupe_two_join_condition():
settings["link_type"] = "link_and_dedupe"
db_api = DuckDBAPI()
- linker = Linker([sds_d_only, sds_b_only], settings, db_api=db_api)
+ sds_d_sdf = db_api.register(sds_d_only)
+ sds_b_sdf = db_api.register(sds_b_only)
+ linker = Linker([sds_d_sdf, sds_b_sdf], settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == (4 * 3) / 2
@@ -137,7 +145,10 @@ def test_link_and_dedupe_three_join_condition():
settings["link_type"] = "link_and_dedupe"
db_api = DuckDBAPI()
- linker = Linker([sds_d_only, sds_b_only, sds_c_only], settings, db_api=db_api)
+ sds_d_sdf = db_api.register(sds_d_only)
+ sds_b_sdf = db_api.register(sds_b_only)
+ sds_c_sdf = db_api.register(sds_c_only)
+ linker = Linker([sds_d_sdf, sds_b_sdf, sds_c_sdf], settings)
df_predict = linker.inference.predict().as_pandas_dataframe()
assert len(df_predict) == (6 * 5) / 2
diff --git a/tests/test_m_train.py b/tests/test_m_train.py
index 0589232392..5c36fff012 100644
--- a/tests/test_m_train.py
+++ b/tests/test_m_train.py
@@ -24,7 +24,8 @@ def test_m_train():
# Train from label column
db_api = DuckDBAPI()
- linker = Linker(df, settings, db_api=db_api)
+ df_sdf = db_api.register(df)
+ linker = Linker(df_sdf, settings)
linker.training.estimate_m_from_label_column("cluster")
cc_name = linker._settings_obj.comparisons[0]
@@ -55,9 +56,10 @@ def test_m_train():
db_api = DuckDBAPI()
- linker_pairwise = Linker(df, settings, db_api=db_api)
+ df_sdf = db_api.register(df)
+ linker_pairwise = Linker(df_sdf, settings)
- linker_pairwise.table_management.register_table(df_labels, "labels")
+ db_api.register(df_labels, "labels")
linker_pairwise.training.estimate_m_from_pairwise_labels("labels")
cc_name = linker_pairwise._settings_obj.comparisons[0]
diff --git a/tests/test_new_comparison_levels.py b/tests/test_new_comparison_levels.py
index af0865e1eb..e4bc711e2b 100644
--- a/tests/test_new_comparison_levels.py
+++ b/tests/test_new_comparison_levels.py
@@ -76,7 +76,7 @@ def test_cll_creators_run_predict(dialect, test_helpers):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- linker = helper.Linker(df, cll_settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, cll_settings)
linker.inference.predict()
@@ -171,7 +171,7 @@ def test_cl_creators_run_predict(dialect, test_helpers):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- linker = helper.Linker(df, cl_settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, cl_settings)
linker.inference.predict()
@@ -201,7 +201,7 @@ def test_regex_fall_through(dialect, test_helpers):
],
}
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_e = linker.inference.predict().as_pandas_dataframe()
# only entry should be in Else level
@@ -231,7 +231,7 @@ def test_null_pattern_match(dialect, test_helpers):
],
}
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
df_e = linker.inference.predict().as_pandas_dataframe()
# only entry should be in Null level
@@ -275,7 +275,7 @@ def test_ctl_creators_run_predict(dialect, test_helpers):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- linker = helper.Linker(df, cl_settings_2, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, cl_settings_2)
linker.inference.predict()
diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py
index c88a2bc2ef..ead9025ac3 100644
--- a/tests/test_new_db_api.py
+++ b/tests/test_new_db_api.py
@@ -63,12 +63,7 @@ def test_run_predict(dialect, test_helpers):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- linker = Linker(
- df,
- cl_settings,
- db_api,
- )
+ linker = helper.linker_with_registration(df, cl_settings)
linker.inference.predict()
@@ -77,12 +72,7 @@ def test_full_run(dialect, test_helpers, tmp_path):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- linker = Linker(
- df,
- cl_settings,
- db_api,
- )
+ linker = helper.linker_with_registration(df, cl_settings)
linker.training.estimate_probability_two_random_records_match(
["l.first_name = r.first_name AND l.surname = r.surname"],
0.6,
@@ -116,17 +106,17 @@ def test_charts(dialect, test_helpers, tmp_path):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- db_api = helper.DatabaseAPI(**helper.db_api_args())
+ db_api = helper.db_api()
+ df_sdf = db_api.register(df)
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
- table_or_tables=df,
+ df_sdf,
blocking_rules=[block_on("dob"), block_on("first_name")],
link_type="dedupe_only",
- db_api=db_api,
unique_id_column_name="unique_id",
)
- linker = Linker(df, cl_settings, db_api)
+ linker = Linker(df_sdf, cl_settings)
linker.training.estimate_probability_two_random_records_match(
["l.first_name = r.first_name AND l.surname = r.surname"],
@@ -149,5 +139,6 @@ def test_exploratory_charts(dialect, test_helpers):
helper = test_helpers[dialect]
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
- db_api = helper.DatabaseAPI(**helper.db_api_args())
- profile_columns(df, db_api, "first_name")
+ db_api = helper.db_api()
+ df_sdf = db_api.register(df)
+ profile_columns(df_sdf, "first_name")
diff --git a/tests/test_postgres_udfs.py b/tests/test_postgres_udfs.py
index 90ed55c7c3..875d7d40ed 100644
--- a/tests/test_postgres_udfs.py
+++ b/tests/test_postgres_udfs.py
@@ -17,7 +17,7 @@ def test_log2(pg_engine):
db_api = PostgresAPI(engine=pg_engine)
df = pd.DataFrame({"x": [2, 8, 0.5, 1]})
expected_log2_vals = [1, 3, -1, 0]
- db_api.register_table(df, "log_values")
+ db_api._create_backend_table(df, "log_values")
sql = """SELECT log2("x") AS logs FROM log_values"""
frame = db_api._execute_sql(sql, "test_log_table").as_pandas_dataframe()
diff --git a/tests/test_profile_data.py b/tests/test_profile_data.py
index d0a65cef9d..d46c28283c 100644
--- a/tests/test_profile_data.py
+++ b/tests/test_profile_data.py
@@ -17,7 +17,7 @@
def generate_raw_profile_dataset(table, columns_to_profile, db_api):
input_alias = "__splink__profile_data"
- _splink_df = db_api.register_table(table, input_alias, overwrite=True)
+ _splink_df = db_api._create_backend_table(table, input_alias, overwrite=True)
pipeline = CTEPipeline()
@@ -34,11 +34,9 @@ def generate_raw_profile_dataset(table, columns_to_profile, db_api):
def test_profile_default_cols_duckdb():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
db_api = DuckDBAPI()
+ df_sdf = db_api.register(df)
- profile_columns(
- df,
- db_api,
- )
+ profile_columns(df_sdf)
@mark_with_dialects_including("duckdb")
@@ -46,17 +44,16 @@ def test_profile_using_duckdb():
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
df["blank"] = None
db_api = DuckDBAPI(connection=":memory:")
+ df_sdf = db_api.register(df)
profile_columns(
- df,
- db_api,
+ df_sdf,
["first_name", "surname", "first_name || surname", "concat(city, first_name)"],
top_n=15,
bottom_n=15,
)
profile_columns(
- df,
- db_api,
+ df_sdf,
[
"first_name",
["surname"],
@@ -118,12 +115,12 @@ def test_profile_with_arrays_duckdb():
df = pd.DataFrame(dic)
db_api = DuckDBAPI(connection=":memory:")
+ df_sdf = db_api.register(df)
column_expressions = ["forename", "surname", "offence_code_arr", "lat_long"]
profile_columns(
- df,
- db_api,
+ df_sdf,
column_expressions,
top_n=3,
bottom_n=3,
@@ -134,12 +131,12 @@ def test_profile_with_arrays_duckdb():
def test_profile_with_arrays_spark(spark, spark_api):
spark_df = spark.read.parquet("tests/datasets/arrays_df.parquet")
spark_df.persist()
+ spark_df_sdf = spark_api.register(spark_df)
column_expressions = ["forename", "surname", "offence_code_arr", "lat_long"]
profile_columns(
- spark_df,
- spark_api,
+ spark_df_sdf,
column_expressions,
top_n=3,
bottom_n=3,
@@ -155,10 +152,10 @@ def test_profile_using_sqlite():
df.to_sql("fake_data_1", con, if_exists="replace")
db_api = SQLiteAPI(con)
+ df_sdf = db_api.register(df)
profile_columns(
- df,
- db_api,
+ df_sdf,
["first_name", "surname", "first_name || surname"],
)
@@ -170,17 +167,16 @@ def test_profile_using_spark(df_spark, spark_api):
from pyspark.sql.types import StringType
df_spark = df_spark.withColumn("blank", lit(None).cast(StringType()))
+ df_spark_sdf = spark_api.register(df_spark)
profile_columns(
- df_spark,
- spark_api,
+ df_spark_sdf,
["first_name", "surname", "first_name || surname", "concat(city, first_name)"],
top_n=15,
bottom_n=15,
)
profile_columns(
- df_spark,
- spark_api,
+ df_spark_sdf,
[
"first_name",
["surname"],
@@ -209,8 +205,9 @@ def test_profile_null_columns(caplog):
)
db_api = DuckDBAPI(connection=":memory:")
+ df_sdf = db_api.register(df)
- profile_columns(df, db_api, ["test_1", "test_2"])
+ profile_columns(df_sdf, ["test_1", "test_2"])
captured_logs = caplog.text
assert (
diff --git a/tests/test_realtime.py b/tests/test_realtime.py
index e004cd3c82..b7c125004d 100644
--- a/tests/test_realtime.py
+++ b/tests/test_realtime.py
@@ -17,7 +17,7 @@ def test_realtime_cache_two_records(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
df1 = pd.DataFrame(
[
@@ -117,7 +117,7 @@ def test_realtime_cache_multiple_records(test_helpers, dialect):
# or not with multiple records in each DataFrame
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
df1 = pd.DataFrame(
[
@@ -298,7 +298,7 @@ def test_realtime_cache_multiple_records(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_realtime_cache_different_settings(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
df1 = pd.DataFrame(
[
@@ -363,7 +363,7 @@ def test_realtime_cache_different_settings(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_realtime_cache_different_settings_dict(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
df1 = pd.DataFrame(
[
@@ -433,7 +433,7 @@ def test_realtime_cache_different_settings_dict(test_helpers, dialect):
@mark_with_dialects_excluding()
def test_realtime_custom_join(test_helpers, dialect):
helper = test_helpers[dialect]
- db_api = helper.extra_linker_args()["db_api"]
+ db_api = helper.db_api()
df = pd.DataFrame(
[
diff --git a/tests/test_regex_param.py b/tests/test_regex_param.py
index 774fe5a5fd..5160a6148c 100644
--- a/tests/test_regex_param.py
+++ b/tests/test_regex_param.py
@@ -129,7 +129,7 @@ def test_regex(dialect, test_helpers, level_set, record_pairs_gamma):
comparison_name = level_set["output_column_name"]
df = helper.convert_frame(df_pandas)
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
linker_output = linker.inference.predict().as_pandas_dataframe()
diff --git a/tests/test_score_missing_edges.py b/tests/test_score_missing_edges.py
index 402c1c51c8..15d434d0a5 100644
--- a/tests/test_score_missing_edges.py
+++ b/tests/test_score_missing_edges.py
@@ -2,7 +2,7 @@
from pytest import mark
import splink.comparison_library as cl
-from splink import Linker, SettingsCreator, block_on
+from splink import SettingsCreator, block_on
from .decorator import mark_with_dialects_excluding
@@ -22,6 +22,7 @@ def test_score_missing_edges(test_helpers, dialect, link_type, copies_of_df):
helper = test_helpers[dialect]
df = helper.convert_frame(df_pd)
+
settings = SettingsCreator(
link_type=link_type,
comparisons=[
@@ -36,8 +37,9 @@ def test_score_missing_edges(test_helpers, dialect, link_type, copies_of_df):
],
retain_intermediate_calculation_columns=True,
)
+
linker_input = df if copies_of_df == 1 else [df for _ in range(copies_of_df)]
- linker = Linker(linker_input, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(linker_input, settings)
df_predict = linker.inference.predict()
df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
@@ -63,6 +65,7 @@ def test_score_missing_edges_all_edges(test_helpers, dialect, link_type, copies_
helper = test_helpers[dialect]
df = helper.convert_frame(df_pd)
+
settings = SettingsCreator(
link_type=link_type,
comparisons=[
@@ -77,8 +80,9 @@ def test_score_missing_edges_all_edges(test_helpers, dialect, link_type, copies_
],
retain_intermediate_calculation_columns=True,
)
+
linker_input = df if copies_of_df == 1 else [df for _ in range(copies_of_df)]
- linker = Linker(linker_input, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(linker_input, settings)
df_predict = linker.inference.predict()
df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
@@ -129,7 +133,7 @@ def test_score_missing_edges_changed_column_names(test_helpers, dialect, link_ty
df_2 = df.copy()
df_2["sds"] = "frame_2"
linker_input = [helper.convert_frame(df), helper.convert_frame(df_2)]
- linker = Linker(linker_input, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(linker_input, settings)
df_predict = linker.inference.predict()
df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
diff --git a/tests/test_settings_options.py b/tests/test_settings_options.py
index 1051ea7644..3c875f55b2 100644
--- a/tests/test_settings_options.py
+++ b/tests/test_settings_options.py
@@ -4,7 +4,6 @@
import splink.internals.comparison_library as cl
from splink import block_on
-from splink.internals.linker import Linker
from .decorator import mark_with_dialects_excluding
@@ -58,7 +57,7 @@ def test_model_heavily_customised_settings(test_helpers, dialect, tmp_path):
"term_frequency_adjustment_column_prefix": "term_freq__",
"comparison_vector_value_column_prefix": "cvv__",
}
- linker = Linker([df_l, df_r], settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration([df_l, df_r], settings)
# run through a few common operations to check functioning
linker.training.estimate_probability_two_random_records_match(
["l.dob = r.dob"], 0.5
diff --git a/tests/test_settings_validation.py b/tests/test_settings_validation.py
index ac6539eba3..ec0bbc836c 100644
--- a/tests/test_settings_validation.py
+++ b/tests/test_settings_validation.py
@@ -243,8 +243,9 @@ def test_settings_validation_logs(caplog):
# Execute the DuckDBLinker to generate logs
with caplog.at_level(logging.WARNING):
db_api = DuckDBAPI()
+ df_sdf = db_api.register(DF)
- Linker(DF, settings, validate_settings=True, db_api=db_api)
+ Linker(df_sdf, settings, validate_settings=True)
# Define expected log segments
expected_log_segments = [
diff --git a/tests/test_spark_udfs.py b/tests/test_spark_udfs.py
index a65a82457a..f4d93a6190 100644
--- a/tests/test_spark_udfs.py
+++ b/tests/test_spark_udfs.py
@@ -60,11 +60,8 @@ def test_udf_registration(spark_api):
"tests/datasets/fake_1000_from_splink_demos.csv", header=True
)
- linker = Linker(
- df_spark,
- settings,
- spark_api,
- )
+ df_sdf = spark_api.register(df_spark)
+ linker = Linker(df_sdf, settings)
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
blocking_rule = "l.first_name = r.first_name"
linker.training.estimate_parameters_using_expectation_maximisation(blocking_rule)
@@ -82,12 +79,8 @@ def test_damerau_levenshtein(spark_api):
df["id"] = df.index
df_spark_dam_lev = spark.createDataFrame(df)
- linker = Linker(
- df_spark_dam_lev,
- settings,
- spark_api,
- input_table_aliases="test_dl_df",
- )
+ df_sdf = spark_api.register(df_spark_dam_lev, source_dataset_name="test_dl_df")
+ linker = Linker(df_sdf, settings)
sql = """
select
@@ -169,12 +162,8 @@ def test_jaro(spark_api):
df["id"] = df.index
df_spark_jaro = spark.createDataFrame(df)
- linker = Linker(
- df_spark_jaro,
- settings,
- spark_api,
- input_table_aliases="test_jaro_df",
- )
+ df_sdf = spark_api.register(df_spark_jaro, source_dataset_name="test_jaro_df")
+ linker = Linker(df_sdf, settings)
sql = """
select
@@ -251,12 +240,8 @@ def test_jaro_winkler(spark_api):
df["id"] = df.index
df_spark_jaro_winkler = spark.createDataFrame(df)
- linker = Linker(
- df_spark_jaro_winkler,
- settings,
- spark_api,
- input_table_aliases="test_jw_df",
- )
+ df_sdf = spark_api.register(df_spark_jaro_winkler, source_dataset_name="test_jw_df")
+ linker = Linker(df_sdf, settings)
sql = """
select
diff --git a/tests/test_splink_datasets.py b/tests/test_splink_datasets.py
index 0f9007dd9e..3bf9c8ceca 100644
--- a/tests/test_splink_datasets.py
+++ b/tests/test_splink_datasets.py
@@ -20,9 +20,5 @@ def test_datasets_basic_link(test_helpers):
block_on("surname"),
],
)
- linker = helper.Linker(
- df,
- settings,
- **helper.extra_linker_args(),
- )
+ linker = helper.linker_with_registration(df, settings)
linker.inference.predict()
diff --git a/tests/test_term_frequencies.py b/tests/test_term_frequencies.py
index 76a9b81fb1..eb3bd0c2f4 100644
--- a/tests/test_term_frequencies.py
+++ b/tests/test_term_frequencies.py
@@ -82,7 +82,8 @@ def test_tf_basic():
}
db_api = DuckDBAPI(connection=":memory:")
- linker = Linker(data, settings, db_api=db_api)
+ data_sdf = db_api.register(data)
+ linker = Linker(data_sdf, settings)
df_predict = linker.inference.predict()
results = filter_results(df_predict)
@@ -119,7 +120,8 @@ def test_tf_clamp():
}
db_api = DuckDBAPI(connection=":memory:")
- linker = Linker(data, settings, db_api=db_api)
+ data_sdf = db_api.register(data)
+ linker = Linker(data_sdf, settings)
df_predict = linker.inference.predict()
results = filter_results(df_predict)
@@ -157,7 +159,8 @@ def test_weight():
db_api = DuckDBAPI(connection=":memory:")
- linker = Linker(data, settings, db_api=db_api)
+ data_sdf = db_api.register(data)
+ linker = Linker(data_sdf, settings)
df_predict = linker.inference.predict()
results = filter_results(df_predict)
@@ -208,7 +211,8 @@ def test_weightand_clamp():
db_api = DuckDBAPI(connection=":memory:")
- linker = Linker(data, settings, db_api=db_api)
+ data_sdf = db_api.register(data)
+ linker = Linker(data_sdf, settings)
df_predict = linker.inference.predict()
results = filter_results(df_predict)
@@ -251,7 +255,8 @@ def test_tf_missing_values_in_lookup():
}
db_api = DuckDBAPI(connection=":memory:")
- linker = Linker(data, settings, db_api=db_api)
+ data_sdf = db_api.register(data)
+ linker = Linker(data_sdf, settings)
# Register only London in the TF table - Paris is intentionally missing
# u_base = 0.2, tf_london = 0.1 (half u_base), so adj mw = log2(0.2/0.1) = 1.0
diff --git a/tests/test_total_comparison_count.py b/tests/test_total_comparison_count.py
index a7d852565d..c0a5d64727 100644
--- a/tests/test_total_comparison_count.py
+++ b/tests/test_total_comparison_count.py
@@ -85,12 +85,12 @@ def make_dummy_frame(row_count):
dfs = list(map(make_dummy_frame, frame_sizes))
db_api = DuckDBAPI()
+ dfs_sdf = [db_api.register(df) for df in dfs]
res_dict = count_comparisons_from_blocking_rule(
- table_or_tables=dfs,
+ dfs_sdf,
blocking_rule="1=1",
link_type=link_type,
- db_api=db_api,
unique_id_column_name="unique_id",
)
@@ -98,8 +98,9 @@ def make_dummy_frame(row_count):
# compare with count from each frame
pipeline = CTEPipeline()
+ dfs_sdf_dict = {df.templated_name: df for df in dfs_sdf}
sql = vertically_concatenate_sql(
- input_tables=db_api.register_multiple_tables(dfs),
+ input_tables=dfs_sdf_dict,
source_dataset_input_column=InputColumn(
"source_dataset", sqlglot_dialect_str="duckdb"
),
diff --git a/tests/test_train_vs_predict.py b/tests/test_train_vs_predict.py
index 779a20750a..948c1031fb 100644
--- a/tests/test_train_vs_predict.py
+++ b/tests/test_train_vs_predict.py
@@ -20,7 +20,7 @@ def test_train_vs_predict(test_helpers, dialect):
df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
settings_dict = get_settings_dict()
settings_dict["blocking_rules_to_generate_predictions"] = ["l.surname = r.surname"]
- linker = helper.Linker(df, settings_dict, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings_dict)
training_session = (
linker.training.estimate_parameters_using_expectation_maximisation(
diff --git a/tests/test_u_train.py b/tests/test_u_train.py
index 4365a91317..ef0f21d91b 100644
--- a/tests/test_u_train.py
+++ b/tests/test_u_train.py
@@ -29,7 +29,7 @@ def test_u_train(test_helpers, dialect):
}
df_linker = helper.convert_frame(df)
- linker = helper.Linker(df_linker, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df_linker, settings)
linker._debug_mode = True
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
cc_name = linker._settings_obj.comparisons[0]
@@ -78,11 +78,10 @@ def test_u_train_link_only(test_helpers, dialect):
df_l = helper.convert_frame(df_l)
df_r = helper.convert_frame(df_r)
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
[df_l, df_r],
settings,
input_table_aliases=["l", "r"],
- **helper.extra_linker_args(),
)
linker._debug_mode = True
linker._db_api.debug_keep_temp_views = True
@@ -144,11 +143,10 @@ def test_u_train_link_only_sample(test_helpers, dialect):
df_l = helper.convert_frame(df_l)
df_r = helper.convert_frame(df_r)
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
[df_l, df_r],
settings,
input_table_aliases=["_a", "_b"],
- **helper.extra_linker_args(),
)
linker._debug_mode = True
linker._db_api.debug_keep_temp_views = True
@@ -275,11 +273,10 @@ def test_u_train_multilink(test_helpers, dialect):
"blocking_rules_to_generate_predictions": [],
}
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
dfs,
settings,
input_table_aliases=["a", "b", "c", "d"],
- **helper.extra_linker_args(),
)
linker._debug_mode = True
linker._db_api.debug_keep_temp_views = True
@@ -314,11 +311,10 @@ def test_u_train_multilink(test_helpers, dialect):
# also check the numbers on a link + dedupe with same inputs
settings["link_type"] = "link_and_dedupe"
- linker = helper.Linker(
+ linker = helper.linker_with_registration(
dfs,
settings,
input_table_aliases=["e", "f", "g", "h"],
- **helper.extra_linker_args(),
)
linker._debug_mode = True
linker._db_api.debug_keep_temp_views = True
@@ -363,9 +359,9 @@ def test_seed_u_outputs(test_helpers, dialect):
"comparisons": [cl.LevenshteinAtThresholds("first_name", 2)],
}
- linker_1 = helper.Linker(df, settings, **helper.extra_linker_args())
- linker_2 = helper.Linker(df, settings, **helper.extra_linker_args())
- linker_3 = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker_1 = helper.linker_with_registration(df, settings)
+ linker_2 = helper.linker_with_registration(df, settings)
+ linker_3 = helper.linker_with_registration(df, settings)
linker_1.training.estimate_u_using_random_sampling(max_pairs=1e3, seed=1)
linker_2.training.estimate_u_using_random_sampling(max_pairs=1e3, seed=1)
@@ -423,7 +419,7 @@ def test_seed_u_outputs_different_order(test_helpers, dialect):
df_pd = input_frame.sample(frac=1, random_state=i)
df = helper.convert_frame(df_pd)
- linker = helper.Linker(df, settings, **helper.extra_linker_args())
+ linker = helper.linker_with_registration(df, settings)
linker.training.estimate_u_using_random_sampling(67, 5330)
u_prob = (
linker._settings_obj.comparisons[0]