Merged

Commits (69)
a39656e
source dataset name
RobinL Dec 17, 2025
944be27
add utils
RobinL Dec 17, 2025
812fdc0
vertical concat and blocking analysis
RobinL Dec 17, 2025
c573c24
linker
RobinL Dec 17, 2025
78af7f9
refactor: rename parameter 'table_or_tables' to 'splink_dataframe_or_…
RobinL Dec 17, 2025
fe3f948
remove dbapi from other public non linker fns
RobinL Dec 17, 2025
c1ace52
harden
RobinL Dec 17, 2025
cd25028
refactor: improve handling of source_dataset_name in register method
RobinL Dec 17, 2025
7798192
update first test
RobinL Dec 18, 2025
905e807
update analyse blocking test
RobinL Dec 18, 2025
d404187
fix bit i missed
RobinL Dec 19, 2025
6435bf1
fixes
RobinL Dec 19, 2025
de2faff
update test_accuracy
RobinL Dec 19, 2025
d843d03
refactor: update db_api retrieval in array comparison tests
RobinL Dec 19, 2025
34cebb7
update spark
RobinL Dec 19, 2025
8231154
refactor: update Linker initialization to use registered DataFrame
RobinL Dec 19, 2025
6479bd6
refactor: update Linker initialization to use registered DataFrames i…
RobinL Dec 19, 2025
88bc475
refactor: replace Linker initialization with helper method for regist…
RobinL Dec 19, 2025
f3f6a23
refactor: update Linker initialization to use registered DataFrames i…
RobinL Dec 19, 2025
c437d28
refactor: replace Linker initialization with helper method for regist…
RobinL Dec 19, 2025
dca0234
refactor: update Linker initialization to use helper methods for Data…
RobinL Dec 19, 2025
26f1ac6
refactor: update tests to use db_api method for database API retrieval
RobinL Dec 19, 2025
3197a60
refactor: update tests to use registered DataFrames with DuckDBAPI
RobinL Dec 19, 2025
bb62d83
refactor: update tests to use db_api method for DataFrame registration
RobinL Dec 19, 2025
cafae1e
refactor: update tests to use helper methods for DataFrame registrati…
RobinL Dec 19, 2025
826ec12
refactor: update tests to register DataFrames with DuckDBAPI and use …
RobinL Dec 19, 2025
ed734f7
refactor: update tests to register DataFrames with DuckDBAPI and use …
RobinL Dec 19, 2025
dcb1e3f
refactor: update tests to register DataFrames with DuckDBAPI and use …
RobinL Dec 19, 2025
8f1d6a3
refactor: update tests to use linker_with_registration for DataFrame …
RobinL Dec 19, 2025
3352463
refactor: update tests to use registered DataFrames with DuckDBAPI
RobinL Dec 19, 2025
d97c61c
refactor: update tests to use db_api method for database access and l…
RobinL Dec 19, 2025
ec7e5f8
refactor: update tests to use linker_with_registration for creating L…
RobinL Dec 19, 2025
c7f196c
refactor: update tests to use helper.linker_with_registration for Lin…
RobinL Dec 19, 2025
a156fa2
refactor: update tests to use db_api method for registering data with…
RobinL Dec 19, 2025
ee18959
refactor: update tests to use linker_with_registration for Linker ins…
RobinL Dec 19, 2025
451bc0b
refactor: standardize type hint for splink_dataframe_or_dataframes pa…
RobinL Dec 19, 2025
14abf82
refactor: update linker_with_registration method and adjust test case…
RobinL Dec 19, 2025
e0ca0e4
fix postgres tests
RobinL Dec 19, 2025
92b3d07
refactor: streamline dataframe conversion in test_score_missing_edges…
RobinL Dec 19, 2025
143651e
refactor: enhance register method to support overwriting of internal …
RobinL Dec 19, 2025
0e3f42e
refactor: update Linker initialization to use registered DataFrames i…
RobinL Dec 20, 2025
67bf85f
refactor: update SQLite tests to use registered DataFrames instead of…
RobinL Dec 20, 2025
7644ef4
exploratory
RobinL Dec 20, 2025
a7af437
exploratory
RobinL Dec 20, 2025
7dc3b9f
Implement feature X to enhance user experience and optimize performance
RobinL Dec 20, 2025
b3e3859
update tutorials
RobinL Dec 20, 2025
37d58a2
accuracy analysis
RobinL Dec 20, 2025
8c507eb
add notes
RobinL Dec 20, 2025
30e0c3d
update docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb
RobinL Dec 20, 2025
8974ed3
update docs/demos/examples/duckdb/deterministic_dedupe.ipynb
RobinL Dec 20, 2025
95e546d
docs/demos/examples/duckdb/pairwise_labels.ipynb
RobinL Dec 20, 2025
95ff490
bias eval
RobinL Dec 20, 2025
9c999b7
business rates
RobinL Dec 20, 2025
12adb0c
cookbook progress
RobinL Dec 20, 2025
1d6b319
cookbook
RobinL Dec 20, 2025
f234836
pseudopeople
RobinL Dec 20, 2025
22dd13a
git commit -m "final two"
RobinL Dec 20, 2025
41f5ec5
lint and mypy
RobinL Dec 20, 2025
6e3445c
format: improve docstring formatting for profile_columns function
RobinL Dec 20, 2025
5153e72
mypy
RobinL Dec 20, 2025
4537d2d
Merge pull request #2866 from moj-analytical-services/helper_and_upda…
RobinL Dec 20, 2025
a025690
refactor method names in database api
RobinL Dec 20, 2025
3398b01
fix tests
RobinL Dec 20, 2025
276efef
harden
RobinL Dec 20, 2025
06b17cf
harden
RobinL Dec 20, 2025
76fb924
remove redundant file
RobinL Jan 12, 2026
2629dae
address first comment
RobinL Jan 12, 2026
f9f494b
address comment about error message
RobinL Jan 12, 2026
b94bd20
add final decorator
RobinL Jan 12, 2026
2,331 changes: 1,166 additions & 1,165 deletions docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb

Large diffs are not rendered by default.

3,365 changes: 1,683 additions & 1,682 deletions docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb

Large diffs are not rendered by default.

1,606 changes: 803 additions & 803 deletions docs/demos/examples/duckdb/deterministic_dedupe.ipynb

Large diffs are not rendered by default.

3,161 changes: 1,582 additions & 1,579 deletions docs/demos/examples/duckdb/febrl3.ipynb

Large diffs are not rendered by default.

6,800 changes: 3,405 additions & 3,395 deletions docs/demos/examples/duckdb/febrl4.ipynb

Large diffs are not rendered by default.

1,462 changes: 731 additions & 731 deletions docs/demos/examples/duckdb/link_only.ipynb

Large diffs are not rendered by default.

1,524 changes: 763 additions & 761 deletions docs/demos/examples/duckdb/pairwise_labels.ipynb

Large diffs are not rendered by default.

1,398 changes: 700 additions & 698 deletions docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb

Large diffs are not rendered by default.

6,070 changes: 3,036 additions & 3,034 deletions docs/demos/examples/duckdb/real_time_record_linkage.ipynb

Large diffs are not rendered by default.

2,742 changes: 1,371 additions & 1,371 deletions docs/demos/examples/duckdb/transactions.ipynb

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions docs/demos/examples/duckdb_no_test/bias_eval.ipynb
@@ -290,11 +290,13 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"linker = Linker(production_df, settings='../../demo_settings/model_h50k.json', db_api=db_api)"
"db_api = DuckDBAPI()\n",
"production_df_sdf = db_api.register(production_df)\n",
"linker = Linker(production_df_sdf, settings='../../demo_settings/model_h50k.json')"
]
},
{
15 changes: 8 additions & 7 deletions docs/demos/examples/duckdb_no_test/business_rates_match.ipynb
@@ -435,7 +435,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -477,8 +477,7 @@
" company_name,\n",
" company_number,\n",
" COALESCE(\n",
" REGEXP_EXTRACT(address_concat, '(\\\\d+[A-Z]?)'),\n",
" REGEXP_EXTRACT(address_concat, '(\\\\S+)(?=\\\\s+HOUSE)')\n",
" REGEXP_EXTRACT(address_concat, '(\\\\d+[A-Z]?)')\n",
" ) AS first_num_in_address,\n",
" postcode,\n",
" name_tokens_with_freq,\n",
@@ -540,7 +539,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -612,7 +611,9 @@
" retain_matching_columns=True,\n",
")\n",
"\n",
"linker = Linker([df_stockport, df_all_companies], settings, db_api)"
"df_stockport_sdf = db_api.register(df_stockport)\n",
"df_all_companies_sdf = db_api.register(df_all_companies)\n",
"linker = Linker([df_stockport_sdf, df_all_companies_sdf], settings)"
]
},
{
@@ -960,7 +961,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "splink (3.11.11)",
"language": "python",
"name": "python3"
},
@@ -974,7 +975,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.11.11"
}
},
"nbformat": 4,
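The business_rates_match diff above shows the two-table form of the same migration: for link_only jobs, each input is registered separately and the Linker receives a list of SplinkDataFrames. A sketch, assuming df_stockport and df_all_companies are the pandas DataFrames prepared earlier in that notebook, with an illustrative comparison:

```python
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

settings = SettingsCreator(
    link_type="link_only",
    blocking_rules_to_generate_predictions=[block_on("postcode")],
    comparisons=[cl.ExactMatch("company_name")],
    retain_matching_columns=True,
)

# Each source is registered on the same db_api; the Linker then takes the
# list of registered frames and no longer takes a db_api argument
db_api = DuckDBAPI()
df_stockport_sdf = db_api.register(df_stockport)
df_all_companies_sdf = db_api.register(df_all_companies)
linker = Linker([df_stockport_sdf, df_all_companies_sdf], settings)
```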
70 changes: 44 additions & 26 deletions docs/demos/examples/duckdb_no_test/cookbook.ipynb
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -282,7 +282,9 @@
")\n",
"\n",
"\n",
"linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)\n",
"db_api = DuckDBAPI()\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.inference.predict().as_pandas_dataframe()"
]
@@ -298,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -393,9 +395,11 @@
")\n",
"\n",
"\n",
"linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)\n",
"db_api = DuckDBAPI()\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.inference.predict().as_pandas_dataframe()\n"
"linker.inference.predict().as_pandas_dataframe()"
]
},
{
@@ -416,7 +420,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -477,6 +481,7 @@
"duckdb_df = duckdb.read_parquet(temp_file_path)\n",
"\n",
"db_api = DuckDBAPI(\":default:\")\n",
"df_sdf = db_api.register(df)\n",
"settings = SettingsCreator(\n",
" link_type=\"dedupe_only\",\n",
" comparisons=[\n",
@@ -489,7 +494,7 @@
" ],\n",
")\n",
"\n",
"linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
"linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"result = linker.inference.predict().as_duckdbpyrelation()\n",
"\n",
@@ -498,7 +503,7 @@
"\n",
"# For example, we can use the `sort` function to sort the results,\n",
"# or could use result.to_parquet() to write to a parquet file.\n",
"result.sort(\"match_weight\")\n"
"result.sort(\"match_weight\")"
]
},
{
@@ -510,7 +515,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -628,7 +633,8 @@
")\n",
"\n",
"df = splink_datasets.fake_1000\n",
"linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)\n",
"linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
@@ -647,7 +653,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -754,7 +760,8 @@
" ],\n",
")\n",
"df = splink_datasets.fake_1000\n",
"linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.training.estimate_u_using_random_sampling(max_pairs=1e6)\n",
"linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
@@ -781,7 +788,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -808,7 +815,8 @@
" max_iterations=2,\n",
")\n",
"\n",
"linker = Linker(df, settings, db_api, set_up_basic_logging=False)\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings, set_up_basic_logging=False)\n",
"\n",
"linker.training.estimate_probability_two_random_records_match(\n",
" [block_on(\"first_name\", \"surname\")], recall=0.7\n",
@@ -859,7 +867,8 @@
" ]\n",
")\n",
"\n",
"linker = Linker(df, settings, db_api)\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings)\n",
"\n",
"\n",
"linker.misc.save_model_to_json(\"mod.json\", overwrite=True)\n",
@@ -869,8 +878,10 @@
"new_settings.retain_intermediate_calculation_columns = True\n",
"new_settings.blocking_rules_to_generate_predictions = [\"1=1\"]\n",
"new_settings.additional_columns_to_retain = [\"cluster\"]\n",
"db_api_new = DuckDBAPI()\n",
"df_sdf_new = db_api_new.register(df)\n",
"linker = Linker(df_sdf_new, new_settings)\n",
"\n",
"linker = Linker(df, new_settings, DuckDBAPI())\n",
"\n",
"linker.inference.predict().as_duckdbpyrelation().show()"
]
@@ -891,6 +902,7 @@
"import difflib\n",
"\n",
"import duckdb\n",
"from duckdb.sqltypes import VARCHAR, DOUBLE\n",
"\n",
"import splink.comparison_level_library as cll\n",
"import splink.comparison_library as cl\n",
@@ -910,8 +922,8 @@
"con.create_function(\n",
" \"custom_partial_ratio\",\n",
" custom_partial_ratio,\n",
" [duckdb.typing.VARCHAR, duckdb.typing.VARCHAR],\n",
" duckdb.typing.DOUBLE,\n",
" [VARCHAR, VARCHAR],\n",
" DOUBLE,\n",
")\n",
"db_api = DuckDBAPI(connection=con)\n",
"\n",
@@ -945,7 +957,8 @@
" max_iterations=2,\n",
")\n",
"\n",
"linker = Linker(df, settings, db_api)\n",
"df_sdf = db_api.register(df)\n",
"linker = Linker(df_sdf, settings)\n",
"\n",
"linker.training.estimate_probability_two_random_records_match(\n",
" [block_on(\"first_name\", \"surname\")], recall=0.7\n",
@@ -1092,7 +1105,8 @@
")\n",
"\n",
"db_api = DuckDBAPI(connection=con)\n",
"company_linker = Linker(\"company_person_records\", company_settings, db_api)\n",
"company_records_sdf = db_api.register(\"company_person_records\")\n",
"company_linker = Linker(company_records_sdf, company_settings)\n",
"company_predictions = company_linker.inference.predict(threshold_match_probability=0.5)\n",
"\n",
"print(\"\\nCompany pairwise matches:\")\n",
@@ -1176,8 +1190,8 @@
" retain_matching_columns=True,\n",
")\n",
"\n",
"# Link persons within company clusters\n",
"person_linker = Linker(\"records_with_company_cluster\", person_settings, db_api2)\n",
"person_records_sdf = db_api2.register(\"records_with_company_cluster\")\n",
"person_linker = Linker(person_records_sdf, person_settings)\n",
"person_predictions = person_linker.inference.predict(threshold_match_probability=0.5)\n",
"\n",
"print(\"\\nPerson pairwise matches (within company clusters):\")\n",
@@ -1187,7 +1201,8 @@
" person_predictions, threshold_match_probability=0.5\n",
")\n",
"\n",
"person_clusters.as_duckdbpyrelation().sort(\"cluster_id\").show(max_width=1000)\n"
"person_clusters.as_duckdbpyrelation().sort(\"cluster_id\").show(max_width=1000)\n",
"\n"
]
},
{
@@ -1296,16 +1311,19 @@
" retain_intermediate_calculation_columns=True,\n",
" retain_matching_columns=True,\n",
")\n",
"db_api_linker = DuckDBAPI(con)\n",
"df_left_sdf = db_api_linker.register(\"df_left\")\n",
"df_right_sdf = db_api_linker.register(\"df_right\")\n",
"linker = Linker(\n",
" [\"df_left\", \"df_right\"],\n",
" [df_left_sdf, df_right_sdf],\n",
" settings,\n",
" db_api=DuckDBAPI(con),\n",
")\n",
"\n",
"# Skip training for demo purposes, just demonstrate that predict() works\n",
"\n",
"df_predict = linker.inference.predict()\n",
"df_predict.as_duckdbpyrelation()\n"
"\n",
"df_predict.as_duckdbpyrelation()"
]
}
],
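A further detail from the cookbook hunks above: register() also accepts the name of a table that already exists in the backing connection (for example db_api.register("company_person_records")), not only an in-memory dataframe. A self-contained sketch; the table, columns and settings here are stand-ins, not taken from the notebook:

```python
import duckdb

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

con = duckdb.connect()
# Stand-in table; any table already present in the connection would do
con.execute("""
    CREATE TABLE company_person_records AS
    SELECT * FROM (VALUES
        (1, 'ACME LTD',     'Alice'),
        (2, 'ACME LIMITED', 'Alice'),
        (3, 'ZENITH PLC',   'Bob')
    ) AS t(unique_id, company_name, person_name)
""")

settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[block_on("person_name")],
    comparisons=[cl.ExactMatch("company_name")],
)

# Registering by table name returns a SplinkDataFrame, exactly as it does
# for a pandas DataFrame, so the rest of the workflow is unchanged
db_api = DuckDBAPI(connection=con)
records_sdf = db_api.register("company_person_records")
linker = Linker(records_sdf, settings)

predictions = linker.inference.predict(threshold_match_probability=0.5)
```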