From 00aa2e630b4ea1119cee7d353952b1a8cfb01166 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 6 Mar 2025 18:58:47 -0800 Subject: [PATCH 1/3] Simplified referential integrity checking. Fixed correct model for tiktoken, fixes #41 --- docs/how-to/Check-Referential-Integrity.ipynb | 569 +++++++++++++++--- .../Index-Bioinformatics-Databases.ipynb | 290 +++++++++ src/linkml_store/api/database.py | 30 +- .../api/stores/mongodb/mongodb_database.py | 2 + src/linkml_store/cli.py | 16 +- .../index/implementations/llm_indexer.py | 17 +- src/linkml_store/utils/format_utils.py | 4 + src/linkml_store/utils/llm_utils.py | 1 + tests/test_index/test_index.py | 1 + 9 files changed, 838 insertions(+), 92 deletions(-) create mode 100644 docs/how-to/Index-Bioinformatics-Databases.ipynb diff --git a/docs/how-to/Check-Referential-Integrity.ipynb b/docs/how-to/Check-Referential-Integrity.ipynb index da826db..730b1aa 100644 --- a/docs/how-to/Check-Referential-Integrity.ipynb +++ b/docs/how-to/Check-Referential-Integrity.ipynb @@ -14,8 +14,6 @@ }, { "cell_type": "code", - "execution_count": 1, - "outputs": [], "source": [ "from linkml_store import Client\n", "\n", @@ -24,36 +22,50 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.760981Z", - "start_time": "2024-05-04T19:51:08.378243Z" + "end_time": "2025-03-07T02:08:28.986444Z", + "start_time": "2025-03-07T02:08:27.758566Z" } }, - "id": "initial_id" + "id": "initial_id", + "outputs": [], + "execution_count": 1 }, { "cell_type": "code", - "execution_count": 2, - "outputs": [], "source": [ - "db = client.attach_database(\"mongodb://localhost:27017\", \"test\")\n", + "db = client.attach_database(\"mongodb://localhost:27017\", \"test-ri\")\n", "db.metadata.ensure_referential_integrity = True\n", - "db.set_schema_view(\"../../tests/input/countries/countries.linkml.yaml\")\n", - "countries_coll = db.create_collection(\"Country\", alias=\"countries\", recreate_if_exists=True)\n", - "routes_coll = db.create_collection(\"Route\", alias=\"routes\", recreate_if_exists=True)" + "db.set_schema_view(\"../../tests/input/countries/countries.linkml.yaml\")\n" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.788932Z", - "start_time": "2024-05-04T19:51:09.771112Z" + "end_time": "2025-03-07T02:08:29.030994Z", + "start_time": "2025-03-07T02:08:28.989892Z" } }, - "id": "cc164c0acbe4c39d" + "id": "cc164c0acbe4c39d", + "outputs": [], + "execution_count": 2 }, { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:08:29.335618Z", + "start_time": "2025-03-07T02:08:29.318131Z" + } + }, "cell_type": "code", - "execution_count": 3, + "source": [ + "countries_coll = db.create_collection(\"Country\", alias=\"countries\", recreate_if_exists=True)\n", + "routes_coll = db.create_collection(\"Route\", alias=\"routes\", recreate_if_exists=True)" + ], + "id": "cec53323f880da30", "outputs": [], + "execution_count": 5 + }, + { + "cell_type": "code", "source": [ "COUNTRIES = \"../../tests/input/countries/countries.jsonl\"\n", "ROUTES = \"../../tests/input/countries/routes.csv\"" @@ -61,25 +73,16 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.789681Z", - "start_time": "2024-05-04T19:51:09.786454Z" + "end_time": "2025-03-07T02:08:29.343921Z", + "start_time": "2025-03-07T02:08:29.341972Z" } }, - "id": "5286ef4e9dd0f316" + "id": "5286ef4e9dd0f316", + "outputs": [], + "execution_count": 6 }, { "cell_type": "code", - "execution_count": 4, - "outputs": [ - { - "data": { - 
"text/plain": "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from linkml_store.utils.format_utils import load_objects\n", "\n", @@ -90,16 +93,27 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.795894Z", - "start_time": "2024-05-04T19:51:09.790413Z" + "end_time": "2025-03-07T02:08:29.353362Z", + "start_time": "2025-03-07T02:08:29.349890Z" } }, - "id": "2e21988e4fc13f58" + "id": "2e21988e4fc13f58", + "outputs": [ + { + "data": { + "text/plain": [ + "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 }, { "cell_type": "code", - "execution_count": 5, - "outputs": [], "source": [ "countries_coll.insert(countries)\n", "routes_coll.insert(routes)" @@ -107,41 +121,43 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.803272Z", - "start_time": "2024-05-04T19:51:09.798758Z" + "end_time": "2025-03-07T02:08:29.583920Z", + "start_time": "2025-03-07T02:08:29.359788Z" } }, - "id": "668e59a8f28e7bfe" + "id": "668e59a8f28e7bfe", + "outputs": [], + "execution_count": 8 }, { "cell_type": "code", - "execution_count": 6, - "outputs": [ - { - "data": { - "text/plain": "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "routes_coll.find().rows" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.810617Z", - "start_time": "2024-05-04T19:51:09.804004Z" + "end_time": "2025-03-07T02:08:29.596327Z", + "start_time": "2025-03-07T02:08:29.591085Z" } }, - "id": "995e63f873ea9353" + "id": "995e63f873ea9353", + "outputs": [ + { + "data": { + "text/plain": [ + "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 9 }, { "cell_type": "code", - "execution_count": 7, - "outputs": [], "source": [ "for result in db.iter_validate_database():\n", " print(result)" @@ -149,11 +165,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.956191Z", - "start_time": "2024-05-04T19:51:09.809082Z" + "end_time": "2025-03-07T02:08:29.737342Z", + "start_time": "2025-03-07T02:08:29.602408Z" } }, - "id": "a8ef16a3fbc6bfe6" + "id": "a8ef16a3fbc6bfe6", + "outputs": [], + "execution_count": 10 }, { "cell_type": "markdown", @@ -169,72 +187,463 @@ }, { "cell_type": "code", - "execution_count": 8, - "outputs": [], "source": [ "routes_coll.insert({\"origin\": \"ZZZ\", \"destination\": \"YYY\", \"method\": \"rail\"})" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.961815Z", - "start_time": "2024-05-04T19:51:09.956721Z" + "end_time": "2025-03-07T02:08:29.747005Z", + "start_time": "2025-03-07T02:08:29.743644Z" } }, - "id": "f712a82be775f413" + "id": "f712a82be775f413", + "outputs": [], + "execution_count": 11 }, { "cell_type": "code", - "execution_count": 9, + "source": [ + "routes_coll.find().rows_dataframe" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-07T02:08:29.766855Z", + "start_time": "2025-03-07T02:08:29.753525Z" + } + }, + "id": "18ffa996e3893b96", "outputs": [ { "data": { - "text/plain": " origin destination method\n0 DE FR rail\n1 ZZZ YYY rail", - 
"text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
origindestinationmethod
0DEFRrail
1ZZZYYYrail
\n
" + "text/plain": [ + " origin destination method\n", + "0 DE FR rail\n", + "1 ZZZ YYY rail" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
origindestinationmethod
0DEFRrail
1ZZZYYYrail
\n", + "
" + ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], + "execution_count": 12 + }, + { + "cell_type": "code", + "source": "results = list(db.iter_validate_database())", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-07T02:08:29.880295Z", + "start_time": "2025-03-07T02:08:29.792681Z" + } + }, + "id": "c67517aece5d47c5", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='ZZZ' instance_index=None instantiates='Country' context=[] source=None\n", + "type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='YYY' instance_index=None instantiates='Country' context=[] source=None\n" + ] + } + ], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:09:42.929682Z", + "start_time": "2025-03-07T02:09:42.926860Z" + } + }, + "cell_type": "code", + "source": "assert any(r for r in results if \"Referential integrity\" in r.message)", + "id": "ab65fa35df1319fa", + "outputs": [], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:09:59.275684Z", + "start_time": "2025-03-07T02:09:59.273035Z" + } + }, + "cell_type": "code", "source": [ - "routes_coll.find().rows_dataframe" + "for result in results:\n", + " print(\"Expected error: \", result)" + ], + "id": "755df23ea86fb8fe", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Expected error: type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='ZZZ' instance_index=None instantiates='Country' context=[] source=None\n", + "Expected error: type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='YYY' instance_index=None instantiates='Country' context=[] source=None\n" + ] + } + ], + "execution_count": 16 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Command Line Example using DuckDB\n", + "\n", + "We'll next show a command line example; we will use DuckDB here and CSVs, but the same principles apply to other databases and formats.\n", + "\n", + "First we'll make two CSVs, one for patients and one for samples. 
The samples will refer to patients.\n" ], + "id": "cbfa9918f43120bb" + }, + { "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.974226Z", - "start_time": "2024-05-04T19:51:09.961675Z" + "end_time": "2025-03-07T02:53:43.124840Z", + "start_time": "2025-03-07T02:53:43.120716Z" } }, - "id": "18ffa996e3893b96" + "cell_type": "code", + "source": [ + "PATIENTS = \"\"\"id,name,age\n", + "p1,John Doe,34\n", + "p2,Jane Doe,65\n", + "\"\"\"\n", + "with open(\"output/patients.csv\", \"w\") as stream:\n", + " stream.write(PATIENTS)" + ], + "id": "c5180f555f0d8532", + "outputs": [], + "execution_count": 81 }, { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:53:44.072343Z", + "start_time": "2025-03-07T02:53:44.069087Z" + } + }, "cell_type": "code", - "execution_count": 16, + "source": [ + "SAMPLES = \"\"\"id,patient\n", + "s1,p1\n", + "s2,p2\n", + "s3,p2\n", + "\"\"\"\n", + "with open(\"output/samples.csv\", \"w\") as stream:\n", + " stream.write(SAMPLES)" + ], + "id": "b98c49c121875d2c", + "outputs": [], + "execution_count": 82 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "Note this dataset is well-behaved, every sample refers to a patient.\n", + "\n", + "There is one issue with the data though, and that is that the default loader doesn't perform ptype inference, so the ages will\n", + "be treated as strings.\n", + "\n", + "Next we'll add a schema file" + ], + "id": "e59cea007cb4677a" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:03.648738Z", + "start_time": "2025-03-07T02:57:03.642949Z" + } + }, + "cell_type": "code", + "source": [ + "SCHEMA = \"\"\"\n", + "id: http://example.org/patients\n", + "name: patients\n", + "description: Patients and samples\n", + "prefixes:\n", + " linkml: http://w3id.org/linkml/\n", + " ex: http://example.org/\n", + "default_prefix: ex \n", + "imports:\n", + " - linkml:types\n", + "classes:\n", + " Sample:\n", + " attributes:\n", + " id:\n", + " identifier: true\n", + " patient:\n", + " range: Patient\n", + "\n", + " Patient:\n", + " attributes:\n", + " id:\n", + " identifier: true\n", + " name:\n", + " required: true\n", + " age:\n", + " range: integer\n", + "\"\"\"\n", + "with open(\"output/patients.linkml.yaml\", \"w\") as stream:\n", + " stream.write(SCHEMA)" + ], + "id": "bce56a2623bda439", + "outputs": [], + "execution_count": 86 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Load data into DuckDB\n", + "\n", + "We'll first clear any older databases we may have created" + ], + "id": "89949c3688a654d2" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:04.596382Z", + "start_time": "2025-03-07T02:57:04.593728Z" + } + }, + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "\n", + "Path(\"output/patient_samples.ddb\").unlink(missing_ok=True)" + ], + "id": "d1688d7868c91f51", + "outputs": [], + "execution_count": 87 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Then we'll load the data", + "id": "d137280f635ffdaf" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:09.101026Z", + "start_time": "2025-03-07T02:57:05.936337Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store \\\n", + " -d output/patient_samples.ddb \\\n", + " -c Patient \\\n", + " insert output/patients.csv\n" + ], + "id": "3fb54173c9dc7ef6", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "type='ReferentialIntegrity' severity= 
message='Referential integrity error: Country not found' instance='ZZZ' instance_index=None instantiates='Country'\n", - "type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='YYY' instance_index=None instantiates='Country'\n" + "Inserted 2 objects from output/patients.csv into collection 'Patient'.\n" ] } ], + "execution_count": 88 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:11.542085Z", + "start_time": "2025-03-07T02:57:09.108440Z" + } + }, + "cell_type": "code", "source": [ - "results = list(db.iter_validate_database())\n", - "for result in results:\n", - " print(result)" + "%%bash\n", + "linkml-store \\\n", + " -d output/patient_samples.ddb \\\n", + " -c Sample \\\n", + " insert output/samples.csv" + ], + "id": "b02ecd6e707d8c4", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 3 objects from output/samples.csv into collection 'Sample'.\n" + ] + } ], + "execution_count": 89 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Check Referential Integrity (no RI)\n", + "\n", + "We don't expect any referential integrity issues here\n", + "\n" + ], + "id": "beb5290779c89866" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:27.170889Z", + "start_time": "2025-03-07T02:57:24.627460Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store --schema output/patients.linkml.yaml -d output/patient_samples.ddb validate -O csv" + ], + "id": "1e2fac4b84ac1188", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "type,severity,message,instance,instance_index,instantiates,context\r\n", + "jsonschema validation,ERROR,\"'34' is not of type 'integer', 'null' in /age\",\"{'id': 'p1', 'name': 'John Doe', 'age': '34'}\",0,Patient,[]\r\n", + "jsonschema validation,ERROR,\"'65' is not of type 'integer', 'null' in /age\",\"{'id': 'p2', 'name': 'Jane Doe', 'age': '65'}\",0,Patient,[]\r\n", + "\n" + ] + } + ], + "execution_count": 90 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Adding dangling references\n", + "\n", + "We'll deliberately add a sample that refers to a non-existent patient" + ], + "id": "fcfe323a8374efe7" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:51.753795Z", + "start_time": "2025-03-07T02:57:48.526129Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store \\\n", + " -d output/patient_samples.ddb \\\n", + " -c Sample \\\n", + " insert --object '{\"id\": \"s4\", \"patient\": \"p3\"}'" + ], + "id": "fbd2644bdba7b35", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 1 objects from {\"id\": \"s4\", \"patient\": \"p3\"} into collection 'Sample'.\n" + ] + } + ], + "execution_count": 91 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "And then re-validate", + "id": "6632297dfd6934d6" + }, + { "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:52:20.044928Z", - "start_time": "2024-05-04T19:52:19.996008Z" + "end_time": "2025-03-07T02:58:06.960138Z", + "start_time": "2025-03-07T02:58:03.546955Z" } }, - "id": "c67517aece5d47c5" + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store --schema output/patients.linkml.yaml --set ensure_referential_integrity=true -d output/patient_samples.ddb validate -O csv" + ], + "id": "9c572e7e68343dee", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "type,severity,message,instance,instance_index,instantiates,context\r\n", + "jsonschema validation,ERROR,\"'34' is not of type 'integer', 'null' in /age\",\"{'id': 'p1', 'name': 'John Doe', 'age': '34'}\",0,Patient,[]\r\n", + "jsonschema validation,ERROR,\"'65' is not of type 'integer', 'null' in /age\",\"{'id': 'p2', 'name': 'Jane Doe', 'age': '65'}\",0,Patient,[]\r\n", + "ReferentialIntegrity,ERROR,Referential integrity error: Patient not found,p3,,Patient,[]\r\n", + "\n" + ] + } + ], + "execution_count": 92 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "edd5d9b201dbfa5f" } ], "metadata": { diff --git a/docs/how-to/Index-Bioinformatics-Databases.ipynb b/docs/how-to/Index-Bioinformatics-Databases.ipynb new file mode 100644 index 0000000..7cf7db1 --- /dev/null +++ b/docs/how-to/Index-Bioinformatics-Databases.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Indexing Bioinformatics Databases\n", + "\n" + ], + "id": "eb43a476bbbf18d1" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## SIB Expasy Enzyme Database", + "id": "5e03abfb81d962cc" + }, + { + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2025-03-06T03:04:45.923151Z", + "start_time": "2025-03-06T03:04:42.738146Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store -d mongodb://localhost:27017/bioinf -c enzyme insert ftp://ftp.expasy.org/databases/enzyme/enzyme.dat" + ], + "id": "initial_id", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 8371 objects from ftp://ftp.expasy.org/databases/enzyme/enzyme.dat into collection 'enzyme'.\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-06T17:50:12.539044Z", + "start_time": "2025-03-06T17:50:05.567015Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \"degradation pathways\" -l 10 " + ], + "id": "eecde4757986f082", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: phenopackets, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gaf_mgi, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gaf_pombase, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gaf_gcrp, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: nmdc, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: amigo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gocams, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: cadsr, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: npatlas, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: obo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: metabolights, base_dir: /Users/cjm\n", + "2025-03-06 
09:50:11,450 - linkml_store.api.client - INFO - Initializing database: mibig, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: mixs, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: mondo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: hpoa, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: hpoa_mongo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: hpoa_kg, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: maxoa, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: refmet, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: neo4j, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gold, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: nmdc_duckdb, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Creating/attaching database: mongodb://localhost:27017/bioinf\n", + "2025-03-06 09:50:11,490 - linkml_store.api.client - INFO - Initializing databases\n", + "2025-03-06 09:50:11,490 - linkml_store.api.client - INFO - Attaching mongodb://localhost:27017/bioinf\n", + "2025-03-06 09:50:11,490 - linkml_store.api.database - DEBUG - Initializing collections\n", + "2025-03-06 09:50:11,494 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Starting topology monitoring\"}\n", + "2025-03-06 09:50:11,494 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"previousDescription\": \"\", \"newDescription\": \"]>\", \"message\": \"Topology description changed\"}\n", + "2025-03-06 09:50:11,494 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"message\": \"Starting server monitoring\"}\n", + "2025-03-06 09:50:11,494 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection pool created\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,494 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"listCollections\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,495 - pymongo.serverSelection - DEBUG - {\"message\": \"Waiting for suitable server to become available\", \"selector\": \"Primary()\", \"operation\": \"listCollections\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"remainingTimeMS\": 29}\n", + "2025-03-06 09:50:11,497 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"driverConnectionId\": 1, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": false, \"message\": \"Server heartbeat started\"}\n", + "2025-03-06 09:50:11,500 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"driverConnectionId\": 1, \"serverConnectionId\": 1165, 
\"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": false, \"durationMS\": 2.4912499357014894, \"reply\": \"{\\\"helloOk\\\": true, \\\"ismaster\\\": true, \\\"topologyVersion\\\": {\\\"processId\\\": {\\\"$oid\\\": \\\"67bd405a9a90e0cc87eb813e\\\"}}, \\\"maxBsonObjectSize\\\": 16777216, \\\"maxMessageSizeBytes\\\": 48000000, \\\"maxWriteBatchSize\\\": 100000, \\\"localTime\\\": {\\\"$date\\\": \\\"2025-03-06T17:50:11.499Z\\\"}, \\\"logicalSessionTimeoutMinutes\\\": 30, \\\"connectionId\\\": 1165, \\\"maxWireVersion\\\": 21, \\\"ok\\\": 1.0}\", \"message\": \"Server heartbeat succeeded\"}\n", + "2025-03-06 09:50:11,500 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection pool ready\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,500 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"previousDescription\": \"]>\", \"newDescription\": \"]>\", \"message\": \"Topology description changed\"}\n", + "2025-03-06 09:50:11,500 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"listCollections\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,500 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,500 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection created\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,500 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"driverConnectionId\": 1, \"serverConnectionId\": 1165, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": true, \"message\": \"Server heartbeat started\"}\n", + "2025-03-06 09:50:11,503 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection ready\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 0.0017190839862450957}\n", + "2025-03-06 09:50:11,503 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 0.0026320830220356584}\n", + "2025-03-06 09:50:11,503 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"listCollections\\\": 1, \\\"nameOnly\\\": true, \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"listCollections\", \"databaseName\": \"bioinf\", \"requestId\": 1144108930, \"operationId\": 1144108930, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,507 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 4.3759999999999994, \"reply\": \"{\\\"cursor\\\": {\\\"ns\\\": \\\"bioinf.$cmd.listCollections\\\", 
\\\"firstBatch\\\": [{\\\"name\\\": \\\"enzyme\\\", \\\"type\\\": \\\"collection\\\"}]}, \\\"ok\\\": 1.0}\", \"commandName\": \"listCollections\", \"databaseName\": \"bioinf\", \"requestId\": 1144108930, \"operationId\": 1144108930, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,507 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,507 - linkml_store.cli - INFO - Attaching index to collection enzyme: {'name': 'llm', 'index_type': 'llm', 'index_function': None, 'distance_function': None, 'index_attributes': None, 'text_template': None, 'text_template_syntax': None, 'filter_nulls': True, 'vector_default_length': 1000, 'index_field': '__index__', 'embedding_model_name': 'ada-002', 'cached_embeddings_database': None, 'cached_embeddings_collection': None, 'cache_queries': False, 'truncation_method': None}\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - DEBUG - Pre-query hook (state: None; Q= None\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - INFO - No metadata for enzyme; no derivations\n", + "2025-03-06 09:50:11,507 - linkml_store.api.database - DEBUG - Creating new collection: internal__index__enzyme__llm kwargs: {}\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - DEBUG - Using indexer with name llm\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - DEBUG - Pre-query hook (state: None; Q= from_table='internal__index__enzyme__llm' select_cols=None where_clause={} sort_by=None limit=None offset=None include_facet_counts=False facet_slots=None\n", + "2025-03-06 09:50:11,508 - linkml_store.api.collection - INFO - No metadata for internal__index__enzyme__llm; no derivations\n", + "2025-03-06 09:50:11,508 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,508 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,508 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,508 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 3.2374984584748745e-05}\n", + "2025-03-06 09:50:11,508 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"find\\\": \\\"internal__index__enzyme__llm\\\", \\\"limit\\\": 1, \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 470211272, \"operationId\": 470211272, \"driverConnectionId\": 1, \"serverConnectionId\": 
1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 0.9810000000000001, \"reply\": \"{\\\"cursor\\\": {\\\"ns\\\": \\\"bioinf.internal__index__enzyme__llm\\\"}, \\\"ok\\\": 1.0}\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 470211272, \"operationId\": 470211272, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,509 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,509 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 2.9582995921373367e-05}\n", + "2025-03-06 09:50:11,509 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"aggregate\\\": \\\"internal__index__enzyme__llm\\\", \\\"pipeline\\\": [{\\\"$group\\\": {\\\"_id\\\": 1, \\\"n\\\": {\\\"$sum\\\": 1}}}], \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 101027544, \"operationId\": 101027544, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 0.8019999999999999, \"reply\": \"{\\\"cursor\\\": {\\\"ns\\\": \\\"bioinf.internal__index__enzyme__llm\\\"}, \\\"ok\\\": 1.0}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 101027544, \"operationId\": 101027544, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,510 - linkml_store.api.collection - INFO - Index llm is empty; indexing all objects\n", + "2025-03-06 09:50:11,510 - linkml_store.api.collection - DEBUG - Pre-query hook (state: 
True; Q= from_table='enzyme' select_cols=None where_clause=None sort_by=None limit=None offset=None include_facet_counts=False facet_slots=None\n", + "2025-03-06 09:50:11,510 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,510 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 0.0001430839765816927}\n", + "2025-03-06 09:50:11,510 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"find\\\": \\\"enzyme\\\", \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 1457850878, \"operationId\": 1457850878, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,512 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 1.246, \"reply\": \"{\\\"cursor\\\": {\\\"firstBatch\\\": [{\\\"_id\\\": {\\\"$oid\\\": \\\"67c910cdad347dd8ce68a02d\\\"}, \\\"ID\\\": \\\"1.1.1.1\\\", \\\"DE\\\": \\\"alcohol dehydrogenase\\\", \\\"AN\\\": [\\\"aldehyde reductase\\\"], \\\"CA\\\": [\\\"(1) a primary alcohol + NAD(+) = an aldehyde + NADH + H(+)\\\", \\\"(2) a secondary alcohol + NAD(+) = a ketone + NADH + H(+)\\\"], \\\"CC\\\": [\\\"-!- Acts on primary or secondary alcohols or hemi-acetals with very broad specificity; however the enzyme oxidizes methanol much more poorly than ethanol.\\\", \\\"-!- The animal, but not the yeast, enzyme acts also on cyclic secondary alcohols.\\\", \\\"-!- Formerly EC 1.1.1.32.\\\"], \\\"DR\\\": \\\"P07327, ADH1A_HUMAN; P28469, ADH1A_MACMU; Q5RBP7, ADH1A_PONAB;P25405, ADH1A_SAAHA; P25406, ADH1B_SAAHA; P00327, ADH1E_HORSE;P00326, ADH1G_HUMAN; O97959, ADH1G_PAPHA; P00328, ADH1S_HORSE;P80222, ADH1_ALLMI ; P30350, ADH1_ANAPL ; P49645, ADH1_APTAU ;P06525, ADH1_ARATH ; P41747, ADH1_ASPFN ; Q17334, ADH1_CAEEL ;P43067, ADH1_CANAX ; P85440, ADH1_CATRO ; P14219, ADH1_CENAM ;P48814, ADH1_CERCA ; Q70UN9, ADH1_CERCO ; P23...\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 1457850878, \"operationId\": 1457850878, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,513 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + 
"2025-03-06 09:50:11,513 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"\", \"operation\": \"getMore\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,513 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"\", \"operation\": \"getMore\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,513 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,513 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 3.4708064049482346e-05}\n", + "2025-03-06 09:50:11,513 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"getMore\\\": 3344471647564691207, \\\"collection\\\": \\\"enzyme\\\", \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"getMore\", \"databaseName\": \"bioinf\", \"requestId\": 1458777923, \"operationId\": 1458777923, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,561 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 47.666, \"reply\": \"{\\\"cursor\\\": {\\\"nextBatch\\\": [{\\\"_id\\\": {\\\"$oid\\\": \\\"67c910cdad347dd8ce68a092\\\"}, \\\"ID\\\": \\\"1.1.1.102\\\", \\\"DE\\\": \\\"3-dehydrosphinganine reductase\\\", \\\"AN\\\": [\\\"3-ketosphinganine reductase\\\", \\\"3-oxosphinganine:NADPH oxidoreductase\\\", \\\"3-oxosphinganine reductase\\\", \\\"D-3-dehydrosphinganine reductase\\\", \\\"D-3-oxosphinganine:B-NADPH oxidoreductase\\\", \\\"D-3-oxosphinganine reductase\\\", \\\"DSR\\\", \\\"KTS reductase\\\"], \\\"CA\\\": [\\\"sphinganine + NADP(+) = 3-oxosphinganine + NADPH + H(+)\\\"], \\\"DR\\\": \\\"Q9Y7P2, GPI11_SCHPO; Q0WRJ2, KDSRA_ARATH; F4JZN6, KDSRB_ARATH;Q4WSZ0, KDSR_ASPFU ; Q8A945, KDSR_BACTN ; Q2KIJ5, KDSR_BOVIN ;Q59RQ2, KDSR_CANAL ; Q6FQ42, KDSR_CANGA ; P0CR37, KDSR_CRYNB ;P0CR36, KDSR_CRYNJ ; F1QWW8, KDSR_DANRE ; Q6BQK1, KDSR_DEBHA ;Q556J2, KDSR_DICDI ; Q5BE65, KDSR_EMENI ; Q758B6, KDSR_EREGS ;Q06136, KDSR_HUMAN ; Q6CLN0, KDSR_KLULA ; Q6GV12, KDSR_MOUSE ;Q7RZR2, KDSR_NEUCR ; Q6CE86, KDSR_YARLI ; P38342, KDSR_YEAST ;\\\"}, {\\\"_id\\\": {\\\"$oid\\\": \\\"67c910cdad347dd8ce68a093\\\"}, \\\"ID\\\": \\\"1.1.1.103\\\", \\\"DE\\\": \\\"L-threonine 3-dehydrogen...\", \"commandName\": \"getMore\", \"databaseName\": \"bioinf\", \"requestId\": 1458777923, \"operationId\": 1458777923, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,561 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,566 - pymongo.serverSelection - 
DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,567 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,567 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,567 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 4.2749918065965176e-05}\n", + "2025-03-06 09:50:11,567 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"aggregate\\\": \\\"enzyme\\\", \\\"pipeline\\\": [{\\\"$group\\\": {\\\"_id\\\": 1, \\\"n\\\": {\\\"$sum\\\": 1}}}], \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 2007237709, \"operationId\": 2007237709, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,573 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 5.842, \"reply\": \"{\\\"cursor\\\": {\\\"firstBatch\\\": [{\\\"_id\\\": 1, \\\"n\\\": 8371}], \\\"ns\\\": \\\"bioinf.enzyme\\\"}, \\\"ok\\\": 1.0}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 2007237709, \"operationId\": 2007237709, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,573 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,618 - root - INFO - Converting 8371 texts to vectors\n", + "2025-03-06 09:50:11,635 - root - INFO - Token limit for text-embedding-ada-002: 7892\n", + "2025-03-06 09:50:11,639 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): openaipublic.blob.core.windows.net:443\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connection.py\", line 198, in _new_conn\n", + " sock = connection.create_connection(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/util/connection.py\", line 60, in create_connection\n", + " for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/socket.py\", line 955, in getaddrinfo\n", + " for res in _socket.getaddrinfo(host, port, family, type, proto, flags):\n", + "socket.gaierror: [Errno 8] nodename nor servname 
provided, or not known\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 787, in urlopen\n", + " response = self._make_request(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 488, in _make_request\n", + " raise new_e\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 464, in _make_request\n", + " self._validate_conn(conn)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 1093, in _validate_conn\n", + " conn.connect()\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connection.py\", line 704, in connect\n", + " self.sock = sock = self._new_conn()\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connection.py\", line 205, in _new_conn\n", + " raise NameResolutionError(self.host, self, e) from e\n", + "urllib3.exceptions.NameResolutionError: : Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/adapters.py\", line 667, in send\n", + " resp = conn.urlopen(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 841, in urlopen\n", + " retries = retries.increment(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/util/retry.py\", line 519, in increment\n", + " raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]\n", + "urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_base.tiktoken (Caused by NameResolutionError(\": Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)\"))\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/bin/linkml-store\", line 6, in \n", + " sys.exit(cli())\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1161, in __call__\n", + " return self.main(*args, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1082, in main\n", + " rv = self.invoke(ctx)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1697, in invoke\n", + " return 
_process_result(sub_ctx.command.invoke(sub_ctx))\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1443, in invoke\n", + " return ctx.invoke(self.callback, **ctx.params)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 788, in invoke\n", + " return __callback(*args, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/decorators.py\", line 33, in new_func\n", + " return f(get_current_context(), *args, **kwargs)\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/cli.py\", line 882, in search\n", + " result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/api/collection.py\", line 595, in search\n", + " self.index_objects(all_objs, index_name, replace=True, **kwargs)\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/api/collection.py\", line 905, in index_objects\n", + " vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/index/indexer.py\", line 103, in objects_to_vectors\n", + " return self.texts_to_vectors([self.object_to_text(obj) for obj in objs])\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/index/implementations/llm_indexer.py\", line 73, in texts_to_vectors\n", + " encoding = encoding_for_model(\"gpt-4o\")\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/model.py\", line 110, in encoding_for_model\n", + " return get_encoding(encoding_name_for_model(model_name))\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/registry.py\", line 86, in get_encoding\n", + " enc = Encoding(**constructor())\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken_ext/openai_public.py\", line 96, in o200k_base\n", + " mergeable_ranks = load_tiktoken_bpe(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/load.py\", line 148, in load_tiktoken_bpe\n", + " contents = read_file_cached(tiktoken_bpe_file, expected_hash)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/load.py\", line 63, in read_file_cached\n", + " contents = read_file(blobpath)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/load.py\", line 22, in read_file\n", + " resp = requests.get(blobpath)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/api.py\", line 73, in get\n", + " return request(\"get\", url, params=params, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/api.py\", line 59, in request\n", + " return session.request(method=method, url=url, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/sessions.py\", line 589, in request\n", + " resp 
= self.send(prep, **send_kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/sessions.py\", line 703, in send\n", + " r = adapter.send(request, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/adapters.py\", line 700, in send\n", + " raise ConnectionError(e, request=request)\n", + "requests.exceptions.ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_base.tiktoken (Caused by NameResolutionError(\": Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)\"))\n", + "2025-03-06 09:50:12,001 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": true, \"durationMS\": 501.27241597510874, \"failure\": \"\\\"_OperationCancelled('operation cancelled')\\\"\", \"driverConnectionId\": 1, \"message\": \"Server heartbeat failed\"}\n" + ] + }, + { + "ename": "CalledProcessError", + "evalue": "Command 'b'linkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \"degradation pathways\" -l 10 \\n'' returned non-zero exit status 1.", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mCalledProcessError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mget_ipython\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_cell_magic\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mbash\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mlinkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mdegradation pathways\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m -l 10 \u001B[39;49m\u001B[38;5;130;43;01m\\n\u001B[39;49;00m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2543\u001B[0m, in \u001B[0;36mInteractiveShell.run_cell_magic\u001B[0;34m(self, magic_name, line, cell)\u001B[0m\n\u001B[1;32m 2541\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mbuiltin_trap:\n\u001B[1;32m 2542\u001B[0m args \u001B[38;5;241m=\u001B[39m (magic_arg_s, cell)\n\u001B[0;32m-> 2543\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[43mfn\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 2545\u001B[0m \u001B[38;5;66;03m# The code below prevents the output from being displayed\u001B[39;00m\n\u001B[1;32m 2546\u001B[0m \u001B[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001B[39;00m\n\u001B[1;32m 
2547\u001B[0m \u001B[38;5;66;03m# when the last Python token in the expression is a ';'.\u001B[39;00m\n\u001B[1;32m 2548\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mgetattr\u001B[39m(fn, magic\u001B[38;5;241m.\u001B[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001B[38;5;28;01mFalse\u001B[39;00m):\n", + "File \u001B[0;32m~/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/IPython/core/magics/script.py:159\u001B[0m, in \u001B[0;36mScriptMagics._make_script_magic..named_script_magic\u001B[0;34m(line, cell)\u001B[0m\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 158\u001B[0m line \u001B[38;5;241m=\u001B[39m script\n\u001B[0;32m--> 159\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mshebang\u001B[49m\u001B[43m(\u001B[49m\u001B[43mline\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcell\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/IPython/core/magics/script.py:336\u001B[0m, in \u001B[0;36mScriptMagics.shebang\u001B[0;34m(self, line, cell)\u001B[0m\n\u001B[1;32m 331\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m args\u001B[38;5;241m.\u001B[39mraise_error \u001B[38;5;129;01mand\u001B[39;00m p\u001B[38;5;241m.\u001B[39mreturncode \u001B[38;5;241m!=\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[1;32m 332\u001B[0m \u001B[38;5;66;03m# If we get here and p.returncode is still None, we must have\u001B[39;00m\n\u001B[1;32m 333\u001B[0m \u001B[38;5;66;03m# killed it but not yet seen its return code. We don't wait for it,\u001B[39;00m\n\u001B[1;32m 334\u001B[0m \u001B[38;5;66;03m# in case it's stuck in uninterruptible sleep. -9 = SIGKILL\u001B[39;00m\n\u001B[1;32m 335\u001B[0m rc \u001B[38;5;241m=\u001B[39m p\u001B[38;5;241m.\u001B[39mreturncode \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m9\u001B[39m\n\u001B[0;32m--> 336\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m CalledProcessError(rc, cell)\n", + "\u001B[0;31mCalledProcessError\u001B[0m: Command 'b'linkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \"degradation pathways\" -l 10 \\n'' returned non-zero exit status 1." + ] + } + ], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "4264e705488b6d20" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/linkml_store/api/database.py b/src/linkml_store/api/database.py index 5329950..08ad918 100644 --- a/src/linkml_store/api/database.py +++ b/src/linkml_store/api/database.py @@ -595,7 +595,29 @@ def induce_schema_view(self) -> SchemaView: sb.add_class(coll.target_class_name) return SchemaView(sb.schema) - def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]: + def validate_database(self, **kwargs) -> List["ValidationResult"]: + """ + Validate the contents of the database. + + As `iter_validate_database`, but returns a list of validation results. 
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def iter_validate_database(self, ensure_referential_integrity: bool = None, **kwargs) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the database.
 
@@ -635,12 +657,14 @@ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
         'capital' is a required property
         'continent' is a required proper
 
+        :param ensure_referential_integrity: ensure referential integrity
         :param kwargs:
         :return: iterator over validation results
         """
         for collection in self.list_collections():
             yield from collection.iter_validate_collection(**kwargs)
-        if self.metadata.ensure_referential_integrity:
+        if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
+            logger.info(f"Validating referential integrity on {self.alias}")
             yield from self._validate_referential_integrity(**kwargs)
 
     def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
@@ -661,7 +685,9 @@ def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResul
         induced_slots = sv.class_induced_slots(cd.name)
         slot_map = {s.name: s for s in induced_slots}
         # rmap = {s.name: s.range for s in induced_slots}
+        # map slot ranges to a collection where that range is stored
         sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
+        logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
         for obj in collection.find_iter():
             for k, v in obj.items():
                 if k not in sr_to_coll:
diff --git a/src/linkml_store/api/stores/mongodb/mongodb_database.py b/src/linkml_store/api/stores/mongodb/mongodb_database.py
index 499c222..0361bfa 100644
--- a/src/linkml_store/api/stores/mongodb/mongodb_database.py
+++ b/src/linkml_store/api/stores/mongodb/mongodb_database.py
@@ -42,6 +42,8 @@ def _db_name(self) -> str:
             parsed_url = urlparse(self.handle)
             path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
             db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
         else:
             db_name = "default"
         return db_name
diff --git a/src/linkml_store/cli.py b/src/linkml_store/cli.py
index 667357d..9b04618 100644
--- a/src/linkml_store/cli.py
+++ b/src/linkml_store/cli.py
@@ -246,6 +246,8 @@ def insert(ctx, files, replace, object, format, source_field, json_select_query)
     for object_str in object:
         logger.info(f"Parsing: {object_str}")
         objects = yaml.safe_load(object_str)
+        if not isinstance(objects, list):
+            objects = [objects]
         if replace:
             collection.replace(objects)
         else:
@@ -903,12 +905,18 @@ def indexes(ctx):
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--collection-only/--no-collection-only", default=False, show_default=True, help="Only validate specified collection")
+@click.option("--ensure-referential-integrity/--no-ensure-referential-integrity", default=True, show_default=True, help="Ensure referential integrity")
 @click.pass_context
-def validate(ctx, output_type, output):
+def validate(ctx, output_type, output, collection_only, **kwargs):
     """Validate objects in the specified 
collection.""" - collection = ctx.obj["settings"].collection - logger.info(f"Validating collection {collection.alias}") - validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()] + if collection_only: + collection = ctx.obj["settings"].collection + logger.info(f"Validating collection {collection.alias}") + validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection(**kwargs)] + else: + db = ctx.obj["settings"].database + validation_results = [json_dumper.to_dict(x) for x in db.validate_database(**kwargs)] output_data = render_output(validation_results, output_type) if output: with open(output, "w") as f: diff --git a/src/linkml_store/index/implementations/llm_indexer.py b/src/linkml_store/index/implementations/llm_indexer.py index b8707b5..4d19944 100644 --- a/src/linkml_store/index/implementations/llm_indexer.py +++ b/src/linkml_store/index/implementations/llm_indexer.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Optional import numpy as np +import openai from linkml_store.api.config import CollectionConfig from linkml_store.index.indexer import INDEX_ITEM, Indexer @@ -11,6 +12,7 @@ if TYPE_CHECKING: import llm +CHUNK_SIZE = 1000 logger = logging.getLogger(__name__) @@ -25,7 +27,7 @@ class LLMIndexer(Indexer): >>> vector = indexer.text_to_vector("hello") """ - embedding_model_name: str = "ada-002" + embedding_model_name: str = "text-embedding-ada-002" _embedding_model: "llm.EmbeddingModel" = None cached_embeddings_database: str = None cached_embeddings_collection: str = None @@ -52,7 +54,7 @@ def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM: """ return self.texts_to_vectors([text], cache=cache, **kwargs)[0] - def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]: + def texts_to_vectors(self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs) -> List[INDEX_ITEM]: """ Use LLM to embed. 
@@ -60,18 +62,21 @@ def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> Li >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"]) :param texts: + :param cache: + :param token_limit_penalty: :return: """ from tiktoken import encoding_for_model logging.info(f"Converting {len(texts)} texts to vectors") model = self.embedding_model # TODO: make this more accurate - token_limit = get_token_limit(model.model_id) - 200 - encoding = encoding_for_model("gpt-4o") + token_limit = get_token_limit(model.model_id) - token_limit_penalty + logging.info(f"Token limit for {model.model_id}: {token_limit}") + encoding = encoding_for_model(self.embedding_model_name) def truncate_text(text: str) -> str: # split into tokens every 1000 chars: - parts = [text[i : i + 1000] for i in range(0, len(text), 1000)] + parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)] truncated = render_formatted_text( lambda x: "".join(x), parts, @@ -140,5 +145,5 @@ def truncate_text(text: str) -> str: embeddings_collection.commit() else: logger.info(f"Embedding {len(texts)} texts") - embeddings = model.embed_multi(texts) + embeddings = list(model.embed_multi(texts, batch_size=1)) return [np.array(v, dtype=float) for v in embeddings] diff --git a/src/linkml_store/utils/format_utils.py b/src/linkml_store/utils/format_utils.py index 00262ed..c9f9f85 100644 --- a/src/linkml_store/utils/format_utils.py +++ b/src/linkml_store/utils/format_utils.py @@ -440,6 +440,10 @@ def render_output( return "\n".join(json.dumps(obj) for obj in data) elif format == Format.PYTHON: return str(data) + elif format == Format.MARKDOWN: + def as_markdown(obj: dict): + return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()]) + return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data) elif format == Format.TABLE: from tabulate import tabulate return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql") diff --git a/src/linkml_store/utils/llm_utils.py b/src/linkml_store/utils/llm_utils.py index d53d8e7..f28cd15 100644 --- a/src/linkml_store/utils/llm_utils.py +++ b/src/linkml_store/utils/llm_utils.py @@ -76,6 +76,7 @@ def render_formatted_text( return text if not values: raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}") + # remove last element and try again return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit) diff --git a/tests/test_index/test_index.py b/tests/test_index/test_index.py index 2bb5826..e60f8f4 100644 --- a/tests/test_index/test_index.py +++ b/tests/test_index/test_index.py @@ -56,3 +56,4 @@ def test_index(index_class, texts): # Ensure the queried text appears at the top of the search results exact_matches = [r[1] for r in results if np.isclose(r[0], 1.0, rtol=1e-3)] assert text_id in exact_matches, f"Exact match not found in : {results}" + From e9f4e2247f1b4becb968c91655c3a32e8be91260 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 6 Mar 2025 19:05:59 -0800 Subject: [PATCH 2/3] format-code --- src/linkml_store/api/client.py | 5 +- src/linkml_store/api/collection.py | 42 +++--- src/linkml_store/api/database.py | 4 +- .../api/stores/duckdb/duckdb_database.py | 6 +- .../api/stores/filesystem/__init__.py | 2 +- .../api/stores/mongodb/mongodb_collection.py | 27 ++-- .../api/stores/solr/solr_collection.py | 10 +- src/linkml_store/cli.py | 49 +++++-- .../index/implementations/llm_indexer.py | 5 +- src/linkml_store/index/indexer.py | 11 +- 
.../implementations/llm_inference_engine.py | 22 +-- .../implementations/rag_inference_engine.py | 23 +-- .../inference/inference_config.py | 1 + src/linkml_store/utils/dat_parser.py | 26 ++-- src/linkml_store/utils/enrichment_analyzer.py | 46 +++--- src/linkml_store/utils/format_utils.py | 50 ++++--- src/linkml_store/utils/llm_utils.py | 3 +- src/linkml_store/utils/pandas_utils.py | 2 +- src/linkml_store/utils/sql_utils.py | 2 +- src/linkml_store/utils/vector_utils.py | 13 +- tests/test_api/test_api.py | 4 +- tests/test_api/test_mongodb_adapter.py | 14 +- tests/test_api/test_neo4j_adapter.py | 1 + tests/test_index/test_index.py | 1 - tests/test_inference/test_rag_engine.py | 9 +- tests/test_utils/test_dat_parser.py | 7 +- tests/test_utils/test_enrichment_analyzer.py | 138 +++++++++--------- 27 files changed, 283 insertions(+), 240 deletions(-) diff --git a/src/linkml_store/api/client.py b/src/linkml_store/api/client.py index 1208660..214e656 100644 --- a/src/linkml_store/api/client.py +++ b/src/linkml_store/api/client.py @@ -12,7 +12,6 @@ logger = logging.getLogger(__name__) - HANDLE_MAP = { "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase", "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase", @@ -220,14 +219,14 @@ def attach_database( scheme, _ = handle.split(":", 1) if scheme not in HANDLE_MAP: raise ValueError(f"Unknown scheme: {scheme}") - module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1) + module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1) try: module = importlib.import_module(module_path) cls = getattr(module, class_name) except ImportError as e: raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}") - #cls = HANDLE_MAP[scheme] + # cls = HANDLE_MAP[scheme] db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs) if schema_view: db.set_schema_view(schema_view) diff --git a/src/linkml_store/api/collection.py b/src/linkml_store/api/collection.py index a043bcb..556fbad 100644 --- a/src/linkml_store/api/collection.py +++ b/src/linkml_store/api/collection.py @@ -211,7 +211,7 @@ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): """ raise NotImplementedError - def index ( + def index( self, objs: Union[OBJECT, List[OBJECT]], index_name: Optional[str] = None, @@ -231,10 +231,13 @@ def index ( """ raise NotImplementedError - def upsert(self, - objs: Union[OBJECT, List[OBJECT]], - filter_fields: List[str], - update_fields: Union[List[str], None] = None, **kwargs): + def upsert( + self, + objs: Union[OBJECT, List[OBJECT]], + filter_fields: List[str], + update_fields: Union[List[str], None] = None, + **kwargs, + ): """ Add one or more objects to the collection. @@ -455,10 +458,10 @@ def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]: return None def find( - self, - where: Optional[Any] = None, - select_cols: Optional[List[str] ] = None, - **kwargs, + self, + where: Optional[Any] = None, + select_cols: Optional[List[str]] = None, + **kwargs, ) -> QueryResult: """ Find objects in the collection using a where query. 
@@ -596,6 +599,7 @@ def search( assert ix_coll.size() > 0 qr = ix_coll.find(where=where, limit=-1, **kwargs) index_col = ix.index_field + # TODO: optimize this for large indexes def row2array(row): v = row[index_col] @@ -603,6 +607,7 @@ def row2array(row): # sqlite stores arrays as strings v = json.loads(v) return np.array(v, dtype=float) + vector_pairs = [(row, row2array(row)) for row in qr.rows] results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs) for r in results: @@ -618,12 +623,12 @@ def row2array(row): return new_qr def group_by( - self, - group_by_fields: List[str], - inlined_field = "objects", - agg_map: Optional[Dict[str, str]] = None, - where: Optional[Dict] = None, - **kwargs, + self, + group_by_fields: List[str], + inlined_field="objects", + agg_map: Optional[Dict[str, str]] = None, + where: Optional[Dict] = None, + **kwargs, ) -> QueryResult: """ Group objects in the collection by a column. @@ -650,14 +655,9 @@ def group_by( top_obj = {k: v for k, v in zip(pk_fields, pk)} top_obj[inlined_field] = objs results.append(top_obj) - r = QueryResult( - num_rows=len(results), - rows=results - ) + r = QueryResult(num_rows=len(results), rows=results) return r - - @property def is_internal(self) -> bool: """ diff --git a/src/linkml_store/api/database.py b/src/linkml_store/api/database.py index 08ad918..ec4c2b3 100644 --- a/src/linkml_store/api/database.py +++ b/src/linkml_store/api/database.py @@ -617,7 +617,9 @@ def validate_database(self, **kwargs) -> List["ValidationResult"]: """ return list(self.iter_validate_database(**kwargs)) - def iter_validate_database(self, ensure_referential_integrity: bool = None, **kwargs) -> Iterator["ValidationResult"]: + def iter_validate_database( + self, ensure_referential_integrity: bool = None, **kwargs + ) -> Iterator["ValidationResult"]: """ Validate the contents of the database. 
diff --git a/src/linkml_store/api/stores/duckdb/duckdb_database.py b/src/linkml_store/api/stores/duckdb/duckdb_database.py index 50406b6..6fdffac 100644 --- a/src/linkml_store/api/stores/duckdb/duckdb_database.py +++ b/src/linkml_store/api/stores/duckdb/duckdb_database.py @@ -100,9 +100,9 @@ def _table_exists(self, table: str) -> bool: meta_query = Query( from_table="sqlite_master", where_clause={ - #"type": "table", + # "type": "table", "name": table, - } + }, ) else: if table.startswith("information_schema"): @@ -112,7 +112,7 @@ def _table_exists(self, table: str) -> bool: where_clause={ "table_type": "BASE TABLE", "table_name": table, - } + }, ) qr = self.query(meta_query) diff --git a/src/linkml_store/api/stores/filesystem/__init__.py b/src/linkml_store/api/stores/filesystem/__init__.py index 742d463..405eb7a 100644 --- a/src/linkml_store/api/stores/filesystem/__init__.py +++ b/src/linkml_store/api/stores/filesystem/__init__.py @@ -4,7 +4,7 @@ Handles have the form: - ``file:`` for a local file - """ +""" from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase diff --git a/src/linkml_store/api/stores/mongodb/mongodb_collection.py b/src/linkml_store/api/stores/mongodb/mongodb_collection.py index 6e37a8b..2868f58 100644 --- a/src/linkml_store/api/stores/mongodb/mongodb_collection.py +++ b/src/linkml_store/api/stores/mongodb/mongodb_collection.py @@ -41,13 +41,14 @@ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): del obj["_id"] self._post_insert_hook(objs) - - def index(self, - objs: Union[OBJECT, List[OBJECT]], - index_name: Optional[str] = None, - replace: bool = False, - unique: bool = False, - **kwargs): + def index( + self, + objs: Union[OBJECT, List[OBJECT]], + index_name: Optional[str] = None, + replace: bool = False, + unique: bool = False, + **kwargs, + ): """ Create indexes on the collection. @@ -86,11 +87,13 @@ def index(self, else: logging.debug(f"Index already exists for field {obj}, skipping creation.") - def upsert(self, - objs: Union[OBJECT, List[OBJECT]], - filter_fields: List[str], - update_fields: Optional[List[str]] = None, - **kwargs): + def upsert( + self, + objs: Union[OBJECT, List[OBJECT]], + filter_fields: List[str], + update_fields: Optional[List[str]] = None, + **kwargs, + ): """ Upsert one or more documents into the MongoDB collection. 
diff --git a/src/linkml_store/api/stores/solr/solr_collection.py b/src/linkml_store/api/stores/solr/solr_collection.py index bb80dd9..45a67f0 100644 --- a/src/linkml_store/api/stores/solr/solr_collection.py +++ b/src/linkml_store/api/stores/solr/solr_collection.py @@ -63,11 +63,11 @@ def query(self, query: Query, **kwargs) -> QueryResult: def query_facets( self, - where: Optional[Dict] = None, - facet_columns: List[str] = None, - facet_limit=DEFAULT_FACET_LIMIT, - facet_min_count: int = 1, - **kwargs + where: Optional[Dict] = None, + facet_columns: List[str] = None, + facet_limit=DEFAULT_FACET_LIMIT, + facet_min_count: int = 1, + **kwargs, ) -> Dict[str, Dict[str, int]]: solr_query = self._build_solr_query(where) solr_query["facet"] = "true" diff --git a/src/linkml_store/cli.py b/src/linkml_store/cli.py index 9b04618..7cb1bf5 100644 --- a/src/linkml_store/cli.py +++ b/src/linkml_store/cli.py @@ -142,7 +142,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, logger.setLevel(logging.ERROR) ctx.ensure_object(dict) if input: - database = "duckdb" # default: store in duckdb + database = "duckdb" # default: store in duckdb if input.startswith("http"): parts = input.split("/") collection = parts[-1] @@ -150,8 +150,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, else: stem = underscore(Path(input).stem) collection = stem - logger.info(f"Using input file: {input}, " - f"default storage is {database} and collection is {collection}") + logger.info(f"Using input file: {input}, " f"default storage is {database} and collection is {collection}") config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}}) if config is None and DEFAULT_LOCAL_CONF_PATH.exists(): config = DEFAULT_LOCAL_CONF_PATH @@ -206,7 +205,7 @@ def drop(ctx): @click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects") @click.option("--format", "-f", type=format_choice, help="Input format") @click.option("--object", "-i", multiple=True, help="Input object as YAML") -@click.option("--source-field", help="If provided, inject file path source as this field") +@click.option("--source-field", help="If provided, inject file path source as this field") @json_select_query_option @click.pass_context def insert(ctx, files, replace, object, format, source_field, json_select_query): @@ -632,10 +631,12 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output): value_key = tuple([row.get(att) for att in value_atts]) pivoted[index_key][column_key] = value_key pivoted_objs = [] + def detuple(t: Tuple) -> Any: if len(t) == 1: return t[0] return str(t) + for index_key, data in pivoted.items(): obj = {att: key for att, key in zip(index_atts, index_key)} for column_key, value_key in data.items(): @@ -651,16 +652,27 @@ def detuple(t: Tuple) -> Any: @click.option("--output", "-o", type=click.Path(), help="Output file path") @click.option("--sample-field", "-I", help="Field to use as the sample identifier") @click.option("--classification-field", "-L", help="Field to use as for classification") -@click.option("--p-value-threshold", "-P", type=click.FLOAT, - default=0.05, show_default=True, - help="P-value threshold for enrichment") -@click.option("--multiple-testing-correction", "-M", type=click.STRING, - default="bh", show_default=True, - help="Multiple test correction method") +@click.option( + "--p-value-threshold", + "-P", + type=click.FLOAT, + default=0.05, + show_default=True, + 
help="P-value threshold for enrichment", +) +@click.option( + "--multiple-testing-correction", + "-M", + type=click.STRING, + default="bh", + show_default=True, + help="Multiple test correction method", +) @click.argument("samples", type=click.STRING, nargs=-1) @click.pass_context def enrichment(ctx, where, limit, output_type, output, sample_field, classification_field, samples, **kwargs): from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer + collection = ctx.obj["settings"].collection where_clause = yaml.safe_load(where) if where else None column_atts = [sample_field, classification_field] @@ -683,6 +695,7 @@ def enrichment(ctx, where, limit, output_type, output, sample_field, classificat else: click.echo(output_data) + @cli.command() @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format") @click.option("--output", "-o", type=click.Path(), help="Output file path") @@ -690,7 +703,7 @@ def enrichment(ctx, where, limit, output_type, output, sample_field, classificat @click.option( "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)" ) -@click.option("--training-collection", type=click.STRING,help="Collection to use for training") +@click.option("--training-collection", type=click.STRING, help="Collection to use for training") @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file") @click.option("--export-model", "-E", type=click.Path(), help="Export model to file") @click.option("--load-model", "-L", type=click.Path(), help="Load model from file") @@ -905,8 +918,18 @@ def indexes(ctx): @cli.command() @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format") @click.option("--output", "-o", type=click.Path(), help="Output file path") -@click.option("--collection-only/--no-collection-only", default=False, show_default=True, help="Only validate specified collection") -@click.option("--ensure-referential-integrity/--no-ensure-referential-integrity", default=True, show_default=True, help="Ensure referential integrity") +@click.option( + "--collection-only/--no-collection-only", + default=False, + show_default=True, + help="Only validate specified collection", +) +@click.option( + "--ensure-referential-integrity/--no-ensure-referential-integrity", + default=True, + show_default=True, + help="Ensure referential integrity", +) @click.pass_context def validate(ctx, output_type, output, collection_only, **kwargs): """Validate objects in the specified collection.""" diff --git a/src/linkml_store/index/implementations/llm_indexer.py b/src/linkml_store/index/implementations/llm_indexer.py index 4d19944..e45858c 100644 --- a/src/linkml_store/index/implementations/llm_indexer.py +++ b/src/linkml_store/index/implementations/llm_indexer.py @@ -54,7 +54,9 @@ def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM: """ return self.texts_to_vectors([text], cache=cache, **kwargs)[0] - def texts_to_vectors(self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs) -> List[INDEX_ITEM]: + def texts_to_vectors( + self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs + ) -> List[INDEX_ITEM]: """ Use LLM to embed. 
@@ -67,6 +69,7 @@ def texts_to_vectors(self, texts: List[str], cache: bool = None, token_limit_pen :return: """ from tiktoken import encoding_for_model + logging.info(f"Converting {len(texts)} texts to vectors") model = self.embedding_model # TODO: make this more accurate diff --git a/src/linkml_store/index/indexer.py b/src/linkml_store/index/indexer.py index 70e227b..837ad84 100644 --- a/src/linkml_store/index/indexer.py +++ b/src/linkml_store/index/indexer.py @@ -154,8 +154,11 @@ def object_to_text(self, obj: Dict[str, Any]) -> str: return str(obj) def search( - self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None, - mmr_relevance_factor: Optional[float] = None + self, + query: str, + vectors: List[Tuple[str, INDEX_ITEM]], + limit: Optional[int] = None, + mmr_relevance_factor: Optional[float] = None, ) -> List[Tuple[float, Any]]: """ Use the indexer to search against a database of vectors. @@ -175,8 +178,8 @@ def search( vlist = [v for _, v in vectors] idlist = [id for id, _ in vectors] sorted_indices = mmr_diversified_search( - query_vector, vlist, - relevance_factor=mmr_relevance_factor, top_n=limit) + query_vector, vlist, relevance_factor=mmr_relevance_factor, top_n=limit + ) results = [] # TODO: this is inefficient when limit is high for i in range(limit): diff --git a/src/linkml_store/inference/implementations/llm_inference_engine.py b/src/linkml_store/inference/implementations/llm_inference_engine.py index 996221e..4cc10f2 100644 --- a/src/linkml_store/inference/implementations/llm_inference_engine.py +++ b/src/linkml_store/inference/implementations/llm_inference_engine.py @@ -79,21 +79,24 @@ def object_to_text(self, object: OBJECT) -> str: def _schema_str(self) -> str: db = self.training_data.base_collection.parent from linkml_runtime.dumpers import json_dumper + schema_dict = json_dumper.to_dict(db.schema_view.schema) return yaml.dump(schema_dict) - def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]: + def derive( + self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None + ) -> Optional[LLMInference]: import llm model: llm.Model = self.model - #model_name = self.config.llm_config.model_name - #feature_attributes = self.config.feature_attributes + # model_name = self.config.llm_config.model_name + # feature_attributes = self.config.feature_attributes target_attributes = self.config.target_attributes query_text = self.object_to_text(object) if not target_attributes: target_attributes = [k for k, v in object.items() if v is None or v == ""] - #if not feature_attributes: + # if not feature_attributes: # feature_attributes = [k for k, v in object.items() if v is not None and v != ""] system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config) @@ -107,7 +110,9 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ "```yaml\n" f"{stub}\n" "```\n" - "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n" + "---\nQuery:\n" + f"## INCOMPLETE OBJECT:\n{query_text}\n" + "## OUTPUT:\n" ) logger.info(f"Prompt: {prompt}") response = model.prompt(prompt, system=system_prompt) @@ -130,9 +135,8 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ "\nThis was invalid.\n", "Validation errors:\n", ] + [self.object_to_text(e) for e in errs] - return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts) - return 
LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object) - + return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts) + return LLMInference(predicted_object=predicted_object, iterations=iteration + 1, query=object) def export_model( self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs @@ -149,4 +153,4 @@ def save_model(self, output: Union[str, Path]) -> None: @classmethod def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine": - raise NotImplementedError("Does not make sense for this engine") \ No newline at end of file + raise NotImplementedError("Does not make sense for this engine") diff --git a/src/linkml_store/inference/implementations/rag_inference_engine.py b/src/linkml_store/inference/implementations/rag_inference_engine.py index 64d321a..942801b 100644 --- a/src/linkml_store/inference/implementations/rag_inference_engine.py +++ b/src/linkml_store/inference/implementations/rag_inference_engine.py @@ -111,7 +111,9 @@ def initialize_model(self, **kwargs): def object_to_text(self, object: OBJECT) -> str: return yaml.dump(object) - def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]: + def derive( + self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None + ) -> Optional[RAGInference]: import llm from tiktoken import encoding_for_model @@ -131,8 +133,9 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ if not self.rag_collection.indexers: raise ValueError("RAG collection must have an indexer attached") logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}") - rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm", - mmr_relevance_factor=mmr_relevance_factor) + rs = self.rag_collection.search( + query_text, limit=num_examples, index_name="llm", mmr_relevance_factor=mmr_relevance_factor + ) examples = rs.rows logger.info(f"Found {len(examples)} examples") if not examples: @@ -153,11 +156,11 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ input_obj_text = self.object_to_text(input_obj) if input_obj_text == query_text: continue - #raise ValueError( + # raise ValueError( # f"Query object {query_text} is the same as example object {input_obj_text}\n" # "This indicates possible test data leakage\n." 
# "TODO: allow an option that allows user to treat this as a basic lookup\n" - #) + # ) output_obj = select_nested(example, target_attributes) prompt_clause = ( "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n" @@ -176,9 +179,9 @@ def make_text(texts: List[str]): except KeyError: encoding = encoding_for_model("gpt-4") token_limit = get_token_limit(model_name) - prompt = render_formatted_text(make_text, values=prompt_clauses, - encoding=encoding, token_limit=token_limit, - additional_text=system_prompt) + prompt = render_formatted_text( + make_text, values=prompt_clauses, encoding=encoding, token_limit=token_limit, additional_text=system_prompt + ) logger.info(f"Prompt: {prompt}") response = model.prompt(prompt, system=system_prompt) yaml_str = response.text() @@ -199,8 +202,8 @@ def make_text(texts: List[str]): "\nThis was invalid.\n", "Validation errors:\n", ] + [self.object_to_text(e) for e in errs] - return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts) - return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object) + return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts) + return RAGInference(predicted_object=predicted_object, iterations=iteration + 1, query=object) def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]: if "```" in yaml_str: diff --git a/src/linkml_store/inference/inference_config.py b/src/linkml_store/inference/inference_config.py index 1556d27..538320a 100644 --- a/src/linkml_store/inference/inference_config.py +++ b/src/linkml_store/inference/inference_config.py @@ -59,6 +59,7 @@ class Inference(BaseModel, extra="forbid"): """ Result of an inference derivation. """ + query: Optional[OBJECT] = Field(default=None, description="The query object.") predicted_object: OBJECT = Field(..., description="The predicted object.") confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0) diff --git a/src/linkml_store/utils/dat_parser.py b/src/linkml_store/utils/dat_parser.py index 625ada8..f271dfe 100644 --- a/src/linkml_store/utils/dat_parser.py +++ b/src/linkml_store/utils/dat_parser.py @@ -2,6 +2,7 @@ ENTRY = Dict[str, Any] + def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: """ Parse SIB/Swiss-Prot format data into a structured dictionary. 
@@ -13,7 +14,7 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: dict: A dictionary with entry IDs as keys and parsed data as values """ # Split the text into entries (separated by //) - entries = text.split('//\n') + entries = text.split("//\n") header = None # Initialize results dictionary @@ -29,12 +30,12 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: current_code = None # Process each line - for line in entry.strip().split('\n'): + for line in entry.strip().split("\n"): if not line.strip(): continue # Check if this is a new field (starts with a 2-letter code followed by space) - if len(line) > 2 and line[2] == ' ': + if len(line) > 2 and line[2] == " ": current_code = line[0:2] # Remove the code and the following space(s) value = line[3:].strip() @@ -48,7 +49,7 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: # Continuation of previous field elif current_code is not None: # Handle continuation lines (typically indented) - if current_code == 'CC': + if current_code == "CC": # For comments, preserve the indentation current_entry[current_code].append(line) else: @@ -59,35 +60,36 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: # -!- ... # ... # -!- ... - ccs = current_entry.get('CC', []) + ccs = current_entry.get("CC", []) new_ccs = [] for cc in ccs: - if not cc.startswith('-!-') and new_ccs: + if not cc.startswith("-!-") and new_ccs: new_ccs[-1] += " " + cc else: new_ccs.append(cc) - current_entry['CC'] = new_ccs + current_entry["CC"] = new_ccs for k, vs in current_entry.items(): - if k != 'CC': - combined = ''.join(vs) + if k != "CC": + combined = "".join(vs) combined = combined.strip() if combined.endswith("."): combined = combined.split(".") combined = [c.strip() for c in combined if c.strip()] - if k == 'DE': + if k == "DE": combined = combined[0] current_entry[k] = combined - if 'ID' in current_entry: + if "ID" in current_entry: results.append(current_entry) else: header = current_entry return header, results + # Example usage: # data = parse_sib_format(text) # for entry_id, entry_data in data.items(): # print(f"Entry: {entry_id}") # for code, values in entry_data.items(): -# print(f" {code}: {values}") \ No newline at end of file +# print(f" {code}: {values}") diff --git a/src/linkml_store/utils/enrichment_analyzer.py b/src/linkml_store/utils/enrichment_analyzer.py index 5b60058..0d759c7 100644 --- a/src/linkml_store/utils/enrichment_analyzer.py +++ b/src/linkml_store/utils/enrichment_analyzer.py @@ -10,6 +10,7 @@ class EnrichedCategory(BaseModel): """ Information about a category enriched in a sample """ + category: str fold_change: float original_p_value: float @@ -41,7 +42,7 @@ def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str): self.sample_cache: Dict[str, Counter] = {} @classmethod - def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> 'EnrichmentAnalyzer': + def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer": """ Initialize the analyzer with a Collection and key column names. Precomputes category frequencies for the entire dataset. 
@@ -91,7 +92,7 @@ def _get_sample_stats(self, sample_id: str) -> Counter: if sample_data.empty: raise KeyError(f"Sample ID '{sample_id}' not found") sample_data = sample_data.dropna() - #if sample_data.empty: + # if sample_data.empty: # raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA") counter = Counter() @@ -104,10 +105,13 @@ def _get_sample_stats(self, sample_id: str) -> Counter: self.sample_cache[sample_id] = counter return counter - def find_enriched_categories(self, sample_id: str, - min_occurrences: int = 5, - p_value_threshold: float = 0.05, - multiple_testing_correction: str = 'bh') -> List[EnrichedCategory]: + def find_enriched_categories( + self, + sample_id: str, + min_occurrences: int = 5, + p_value_threshold: float = 0.05, + multiple_testing_correction: str = "bh", + ) -> List[EnrichedCategory]: """ Find categories that are enriched in the given sample. @@ -135,14 +139,18 @@ def find_enriched_categories(self, sample_id: str, # Calculate fold change sample_freq = sample_count / total_sample_annotations global_freq = global_count / total_global_annotations - fold_change = sample_freq / global_freq if global_freq > 0 else float('inf') + fold_change = sample_freq / global_freq if global_freq > 0 else float("inf") # Perform Fisher's exact test - contingency_table = np.array([ - [sample_count, global_count - sample_count], - [total_sample_annotations - sample_count, - total_global_annotations - total_sample_annotations - (global_count - sample_count)] - ]) + contingency_table = np.array( + [ + [sample_count, global_count - sample_count], + [ + total_sample_annotations - sample_count, + total_global_annotations - total_sample_annotations - (global_count - sample_count), + ], + ] + ) _, p_value = stats.fisher_exact(contingency_table) @@ -158,12 +166,12 @@ def find_enriched_categories(self, sample_id: str, # Apply multiple testing correction categories, fold_changes, p_values = zip(*results) - if multiple_testing_correction.lower() == 'bonf': + if multiple_testing_correction.lower() == "bonf": # Bonferroni correction n_tests = len(self.global_stats) # Total number of categories tested adjusted_p_values = [min(1.0, p * n_tests) for p in p_values] - elif multiple_testing_correction.lower() == 'bh': + elif multiple_testing_correction.lower() == "bh": # Benjamini-Hochberg correction n = len(p_values) sorted_indices = np.argsort(p_values) @@ -192,12 +200,7 @@ def find_enriched_categories(self, sample_id: str, # Filter by adjusted p-value threshold and create final results # Create EnrichedCategory objects final_results = [ - EnrichedCategory( - category=cat, - fold_change=fc, - original_p_value=p, - adjusted_p_value=adj_p - ) + EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p) for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values) if adj_p < p_value_threshold ] @@ -206,8 +209,9 @@ def find_enriched_categories(self, sample_id: str, final_results.sort(key=lambda x: x.adjusted_p_value) return final_results + # Example usage: # analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories') # enriched = analyzer.find_enriched_categories('sample1') # for category, fold_change, p_value in enriched: -# print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})") \ No newline at end of file +# print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})") diff --git a/src/linkml_store/utils/format_utils.py b/src/linkml_store/utils/format_utils.py index c9f9f85..9e1d589 100644 --- 
a/src/linkml_store/utils/format_utils.py +++ b/src/linkml_store/utils/format_utils.py @@ -139,12 +139,13 @@ def clean_nested_structure(obj): else: return clean_pandas_value(obj) + def process_file( - f: IO, - format: Format, - expected_type: Optional[Type] = None, - header_comment_token: Optional[str] = None, - format_options: Optional[Dict[str, Any]] = None, + f: IO, + format: Format, + expected_type: Optional[Type] = None, + header_comment_token: Optional[str] = None, + format_options: Optional[Dict[str, Any]] = None, ) -> List[Dict[str, Any]]: """ Process a single file and return a list of objects. @@ -173,6 +174,7 @@ def process_file( objs = yaml.safe_load(f) elif format == Format.TOML: import toml + objs = toml.load(f) if not isinstance(objs, list): objs = [objs] @@ -214,13 +216,15 @@ def process_file( for line in f: parts = line.strip().split("\t") desc = parts[1] - objs.append({ - "library": lib_name, - "uid": f"{lib_name}.{parts[0]}", - "name": parts[0], - "description": desc if desc else None, - "genes": parts[2:], - }) + objs.append( + { + "library": lib_name, + "uid": f"{lib_name}.{parts[0]}", + "name": parts[0], + "description": desc if desc else None, + "genes": parts[2:], + } + ) elif format == Format.FASTA: objs = [] current_obj = None @@ -237,29 +241,33 @@ def process_file( elif format == Format.OBO: blocks = split_document(f.read(), "\n\n") id_pattern = re.compile(r"id: (\S+)") + def get_id(block): m = id_pattern.search(block) return m.group(1) if m else None + objs = [{"id": get_id(block), "content": block} for block in blocks] objs = [obj for obj in objs if obj["id"]] elif format == Format.DAT: from linkml_store.utils.dat_parser import parse_sib_format + _, objs = parse_sib_format(f.read()) elif format in (Format.RDFXML, Format.TURTLE): import lightrdf + parser = lightrdf.Parser() objs = [] ext_fmt = "rdfxml" if format == Format.TURTLE: ext_fmt = "ttl" - bytesio = io.BytesIO(f.read().encode('utf-8')) + bytesio = io.BytesIO(f.read().encode("utf-8")) buffer = io.BufferedReader(bytesio) for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt): obj = { - "subject": s, - "predicate": p, - "object": o, - } + "subject": s, + "predicate": p, + "object": o, + } if format_options.get("pivot", False): obj = { "subject": s, @@ -389,7 +397,8 @@ def write_output( def render_output( - data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]], format: Optional[Union[Format, str]] = Format.YAML + data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]], + format: Optional[Union[Format, str]] = Format.YAML, ) -> str: """ Render output data in JSON, JSONLines, YAML, CSV, or TSV format. @@ -441,11 +450,14 @@ def render_output( elif format == Format.PYTHON: return str(data) elif format == Format.MARKDOWN: + def as_markdown(obj: dict): return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()]) + return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data) elif format == Format.TABLE: from tabulate import tabulate + return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql") elif format == Format.YAML: if isinstance(data, list): @@ -510,4 +522,4 @@ def split_document(doc: str, delimiter: str): :param delimiter: The delimiter. :return: The parts of the document. 
""" - return doc.split(delimiter) \ No newline at end of file + return doc.split(delimiter) diff --git a/src/linkml_store/utils/llm_utils.py b/src/linkml_store/utils/llm_utils.py index f28cd15..ca53be1 100644 --- a/src/linkml_store/utils/llm_utils.py +++ b/src/linkml_store/utils/llm_utils.py @@ -105,6 +105,7 @@ def get_token_limit(model_name: str) -> int: def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]: import yaml + if "```" in yaml_str: yaml_str = yaml_str.split("```")[1].strip() if yaml_str.startswith("yaml"): @@ -115,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]: if strict: raise e logger.error(f"Error parsing YAML: {yaml_str}\n{e}") - return None \ No newline at end of file + return None diff --git a/src/linkml_store/utils/pandas_utils.py b/src/linkml_store/utils/pandas_utils.py index 4355bb2..5229dd1 100644 --- a/src/linkml_store/utils/pandas_utils.py +++ b/src/linkml_store/utils/pandas_utils.py @@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame: def facet_summary_to_dataframe_unmelted( - facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]] + facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]], ) -> pd.DataFrame: rows = [] diff --git a/src/linkml_store/utils/sql_utils.py b/src/linkml_store/utils/sql_utils.py index 01004aa..bac73d8 100644 --- a/src/linkml_store/utils/sql_utils.py +++ b/src/linkml_store/utils/sql_utils.py @@ -116,7 +116,7 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul modified_where = " AND ".join(conditions) def make_col_safe(col): - return '"' + quoted_name(col, True) + '"' if ' ' in col else col + return '"' + quoted_name(col, True) + '"' if " " in col else col if isinstance(facet_column, str): facet_column = make_col_safe(facet_column) diff --git a/src/linkml_store/utils/vector_utils.py b/src/linkml_store/utils/vector_utils.py index 98e727e..f091206 100644 --- a/src/linkml_store/utils/vector_utils.py +++ b/src/linkml_store/utils/vector_utils.py @@ -8,6 +8,7 @@ LOL = List[List[float]] + def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float: """ Calculate the cosine similarity between two vectors. 
@@ -77,9 +78,7 @@ def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.nd return top_match_indices, top_match_values -def top_n_matches( - cosine_similarity_matrix: np.ndarray, n: int = 10 -) -> Tuple[np.ndarray, np.ndarray]: +def top_n_matches(cosine_similarity_matrix: np.ndarray, n: int = 10) -> Tuple[np.ndarray, np.ndarray]: # Find the indices that would sort each row in descending order sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1) @@ -136,10 +135,7 @@ def mmr_diversified_search( max_sim_to_selected = max( [ np.dot(document_vectors[idx], document_vectors[s]) - / ( - np.linalg.norm(document_vectors[idx]) - * np.linalg.norm(document_vectors[s]) - ) + / (np.linalg.norm(document_vectors[idx]) * np.linalg.norm(document_vectors[s])) for s in selected_indices ] ) @@ -160,6 +156,3 @@ def mmr_diversified_search( selected_indices.add(best_index) return result_indices - - - diff --git a/tests/test_api/test_api.py b/tests/test_api/test_api.py index e395a4e..2952b03 100644 --- a/tests/test_api/test_api.py +++ b/tests/test_api/test_api.py @@ -65,7 +65,7 @@ def is_persistent(handle: str) -> bool: - #if "duckdb" in handle: + # if "duckdb" in handle: # # NOTE: in previous versions of duckdb, in-memory databases were not persistent # return True return ".db" in handle or "mongodb" in handle or "file:" in handle @@ -334,8 +334,6 @@ def test_group_by(handle): assert False, f"Unexpected id: {row['id']}" - - @pytest.mark.parametrize("handle", SCHEMES_PLUS) def test_collections_of_same_type(handle): """ diff --git a/tests/test_api/test_mongodb_adapter.py b/tests/test_api/test_mongodb_adapter.py index 66e9269..5f3f58a 100644 --- a/tests/test_api/test_mongodb_adapter.py +++ b/tests/test_api/test_mongodb_adapter.py @@ -7,6 +7,7 @@ from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection from pymongo import MongoClient + @pytest.fixture(scope="module") def mongodb_client(): try: @@ -220,11 +221,9 @@ def test_index_creation(mongodb_collection, unique_flag): mongodb_collection.mongo_collection.delete_many({}) # Insert **unique, non-null** values for test_field to avoid duplicate key error - mongodb_collection.mongo_collection.insert_many([ - {"_id": 1, "test_field": "value1"}, - {"_id": 2, "test_field": "value2"}, - {"_id": 3, "test_field": "value3"} - ]) + mongodb_collection.mongo_collection.insert_many( + [{"_id": 1, "test_field": "value1"}, {"_id": 2, "test_field": "value2"}, {"_id": 3, "test_field": "value3"}] + ) # Create the index using the method with the unique flag mongodb_collection.index(index_field, index_name=index_name, replace=True, unique=unique_flag) @@ -239,5 +238,6 @@ def test_index_creation(mongodb_collection, unique_flag): if unique_flag: assert created_indexes[index_name]["unique"], f"Index {index_name} should be unique" else: - assert "unique" not in created_indexes[index_name] or not created_indexes[index_name]["unique"], \ - f"Index {index_name} should not be unique" + assert ( + "unique" not in created_indexes[index_name] or not created_indexes[index_name]["unique"] + ), f"Index {index_name} should not be unique" diff --git a/tests/test_api/test_neo4j_adapter.py b/tests/test_api/test_neo4j_adapter.py index 13e2e85..33b7dab 100644 --- a/tests/test_api/test_neo4j_adapter.py +++ b/tests/test_api/test_neo4j_adapter.py @@ -14,6 +14,7 @@ neo4j """ + import pytest from linkml_runtime import SchemaView from linkml_runtime.utils.schema_builder import SchemaBuilder diff --git a/tests/test_index/test_index.py 
b/tests/test_index/test_index.py index e60f8f4..2bb5826 100644 --- a/tests/test_index/test_index.py +++ b/tests/test_index/test_index.py @@ -56,4 +56,3 @@ def test_index(index_class, texts): # Ensure the queried text appears at the top of the search results exact_matches = [r[1] for r in results if np.isclose(r[0], 1.0, rtol=1e-3)] assert text_id in exact_matches, f"Exact match not found in : {results}" - diff --git a/tests/test_inference/test_rag_engine.py b/tests/test_inference/test_rag_engine.py index 7cf6e24..274bc48 100644 --- a/tests/test_inference/test_rag_engine.py +++ b/tests/test_inference/test_rag_engine.py @@ -105,7 +105,6 @@ def test_inference_nested(handle): # check_accuracy2(ie2, targets, threshold=0.33, features=features, test_data=ie.testing_data.as_dataframe()) - @pytest.mark.integration @pytest.mark.parametrize("handle", SCHEMES) def test_with_validation(handle): @@ -172,7 +171,9 @@ def test_with_validation(handle): sb.add_slot("predicate", range="PredicateType", replace_if_present=True) sv = SchemaView(sb.schema) collection.parent.set_schema_view(sv) - errs = list(collection.iter_validate_collection([{"triples": [{"subject": "a", "predicate": "unknown", "object": "b"}]}])) + errs = list( + collection.iter_validate_collection([{"triples": [{"subject": "a", "predicate": "unknown", "object": "b"}]}]) + ) assert len(errs) == 1 result = ie.derive({"paper": {"abstract": "Mark Hamill played a starring role in the movie Star Wars"}}) assert result @@ -184,7 +185,3 @@ def test_with_validation(handle): # (note that in future this unit test could conceivably be used in training models, in which case # it will need to be modified to a different hard-to-guess predicate) assert result.iterations > 1 - - - - diff --git a/tests/test_utils/test_dat_parser.py b/tests/test_utils/test_dat_parser.py index 6eddd02..191bc59 100644 --- a/tests/test_utils/test_dat_parser.py +++ b/tests/test_utils/test_dat_parser.py @@ -3,13 +3,14 @@ DAT_FILE = INPUT_DIR / "expasy-subset.dat" + def test_parse_dat(): - entries = process_file(open(DAT_FILE) , Format.DAT) + entries = process_file(open(DAT_FILE), Format.DAT) assert len(entries) == 2 e1 = entries[0] dr1 = e1["DR"] - assert dr1.endswith('Q46856, YQHD_ECOLI ;') + assert dr1.endswith("Q46856, YQHD_ECOLI ;") de1 = e1["DE"] - assert de1 == 'alcohol dehydrogenase (NADP(+))' + assert de1 == "alcohol dehydrogenase (NADP(+))" cc1 = e1["CC"] assert len(cc1) == 4 diff --git a/tests/test_utils/test_enrichment_analyzer.py b/tests/test_utils/test_enrichment_analyzer.py index e8928f3..cfc92c5 100644 --- a/tests/test_utils/test_enrichment_analyzer.py +++ b/tests/test_utils/test_enrichment_analyzer.py @@ -2,31 +2,41 @@ import pandas as pd import numpy as np from collections import Counter -from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer # Assuming the previous code is in enrichment_analysis.py +from linkml_store.utils.enrichment_analyzer import ( + EnrichmentAnalyzer, +) # Assuming the previous code is in enrichment_analysis.py @pytest.fixture def sample_df(): """Create a test DataFrame with known enrichment patterns""" data = { - 'sample_id': [ - 'sample1', 'sample1', 'sample1', 'sample1', 'sample1', - 'sample2', 'sample2', 'sample2', - 'sample3', 'sample3', 'sample3' + "sample_id": [ + "sample1", + "sample1", + "sample1", + "sample1", + "sample1", + "sample2", + "sample2", + "sample2", + "sample3", + "sample3", + "sample3", + ], + "categories": [ + ["A", "B"], + ["A", "C"], + ["A", "B"], + ["B", "C"], + ["A"], + ["C", "D"], + ["C", "D"], + 
["D", "E"], + ["E", "F"], + ["E", "F"], + ["F", "G"], ], - 'categories': [ - ['A', 'B'], - ['A', 'C'], - ['A', 'B'], - ['B', 'C'], - ['A'], - ['C', 'D'], - ['C', 'D'], - ['D', 'E'], - ['E', 'F'], - ['E', 'F'], - ['F', 'G'] - ] } return pd.DataFrame(data) @@ -34,64 +44,52 @@ def sample_df(): @pytest.fixture def analyzer(sample_df): """Create an EnrichmentAnalyzer instance with the sample data""" - return EnrichmentAnalyzer(sample_df, 'sample_id', 'categories') + return EnrichmentAnalyzer(sample_df, "sample_id", "categories") def test_initialization(analyzer, sample_df): """Test that the analyzer initializes correctly""" assert analyzer.df.equals(sample_df) - assert analyzer.sample_key == 'sample_id' - assert analyzer.classification_key == 'categories' + assert analyzer.sample_key == "sample_id" + assert analyzer.classification_key == "categories" assert isinstance(analyzer.global_stats, Counter) assert len(analyzer.sample_cache) == 0 def test_global_stats_computation(analyzer): """Test that global statistics are computed correctly""" - expected_counts = { - 'A': 4, - 'B': 3, - 'C': 4, - 'D': 3, - 'E': 3, - 'F': 3, - 'G': 1 - } + expected_counts = {"A": 4, "B": 3, "C": 4, "D": 3, "E": 3, "F": 3, "G": 1} assert dict(analyzer.global_stats) == expected_counts def test_sample_stats_computation(analyzer): """Test that sample-specific statistics are computed correctly""" - sample1_stats = analyzer._get_sample_stats('sample1') - expected_sample1 = { - 'A': 4, - 'B': 3, - 'C': 2 - } + sample1_stats = analyzer._get_sample_stats("sample1") + expected_sample1 = {"A": 4, "B": 3, "C": 2} assert dict(sample1_stats) == expected_sample1 # Test caching - assert 'sample1' in analyzer.sample_cache - assert dict(analyzer.sample_cache['sample1']) == expected_sample1 + assert "sample1" in analyzer.sample_cache + assert dict(analyzer.sample_cache["sample1"]) == expected_sample1 def test_enrichment_analysis(analyzer): """Test the enrichment analysis results with different multiple testing corrections""" # Test without correction - enriched_none = analyzer.find_enriched_categories('sample1', min_occurrences=2, - p_value_threshold=0.05, - multiple_testing_correction='none') + enriched_none = analyzer.find_enriched_categories( + "sample1", min_occurrences=2, p_value_threshold=0.05, multiple_testing_correction="none" + ) # Test with Bonferroni correction - enriched_bonf = analyzer.find_enriched_categories('sample1', min_occurrences=2, - p_value_threshold=0.05, - multiple_testing_correction='bonf') + enriched_bonf = analyzer.find_enriched_categories( + "sample1", min_occurrences=2, p_value_threshold=0.05, multiple_testing_correction="bonf" + ) # Test with Benjamini-Hochberg correction - enriched_bh = analyzer.find_enriched_categories('sample1', min_occurrences=2, - p_value_threshold=0.05, - multiple_testing_correction='bh') + enriched_bh = analyzer.find_enriched_categories( + "sample1", min_occurrences=2, p_value_threshold=0.05, multiple_testing_correction="bh" + ) # Convert results to more easily testable format enriched_dict_none = {result.category: result for result in enriched_none} @@ -103,12 +101,12 @@ def test_enrichment_analysis(analyzer): assert len(enriched_bh) >= len(enriched_bonf) # BH should find more than Bonferroni # Check that A and B are enriched in at least one method - assert any(('A' in d) for d in [enriched_dict_none, enriched_dict_bonf, enriched_dict_bh]) + assert any(("A" in d) for d in [enriched_dict_none, enriched_dict_bonf, enriched_dict_bh]) # Check fold changes make sense for enriched_dict in 
[enriched_dict_none, enriched_dict_bonf, enriched_dict_bh]: - if 'A' in enriched_dict: - result = enriched_dict['A'] + if "A" in enriched_dict: + result = enriched_dict["A"] assert result.fold_change > 1.0 # Should be enriched # Check p-values and adjusted p-values are valid @@ -116,55 +114,51 @@ def test_enrichment_analysis(analyzer): for result in enriched_dict.values(): assert 0 <= result.original_p_value <= 1 assert 0 <= result.adjusted_p_value <= 1 - assert result.adjusted_p_value >= result.original_p_value # Adjusted p-value should never be smaller than original + assert ( + result.adjusted_p_value >= result.original_p_value + ) # Adjusted p-value should never be smaller than original def test_edge_cases(sample_df): """Test edge cases and potential error conditions""" # Test empty DataFrame - empty_df = pd.DataFrame({'sample_id': [], 'categories': []}) - analyzer_empty = EnrichmentAnalyzer(empty_df, 'sample_id', 'categories') + empty_df = pd.DataFrame({"sample_id": [], "categories": []}) + analyzer_empty = EnrichmentAnalyzer(empty_df, "sample_id", "categories") assert len(analyzer_empty.global_stats) == 0 # Test single category - single_cat_data = { - 'sample_id': ['sample1', 'sample2'], - 'categories': [['A'], ['A']] - } + single_cat_data = {"sample_id": ["sample1", "sample2"], "categories": [["A"], ["A"]]} single_cat_df = pd.DataFrame(single_cat_data) - analyzer_single = EnrichmentAnalyzer(single_cat_df, 'sample_id', 'categories') - assert dict(analyzer_single.global_stats) == {'A': 2} + analyzer_single = EnrichmentAnalyzer(single_cat_df, "sample_id", "categories") + assert dict(analyzer_single.global_stats) == {"A": 2} # Test non-list categories (string input) - string_cat_data = { - 'sample_id': ['sample1', 'sample2'], - 'categories': ['A', 'B'] - } + string_cat_data = {"sample_id": ["sample1", "sample2"], "categories": ["A", "B"]} string_cat_df = pd.DataFrame(string_cat_data) - analyzer_string = EnrichmentAnalyzer(string_cat_df, 'sample_id', 'categories') - assert dict(analyzer_string.global_stats) == {'A': 1, 'B': 1} + analyzer_string = EnrichmentAnalyzer(string_cat_df, "sample_id", "categories") + assert dict(analyzer_string.global_stats) == {"A": 1, "B": 1} def test_invalid_sample_id(analyzer): """Test behavior with invalid sample ID""" with pytest.raises(KeyError): - analyzer._get_sample_stats('nonexistent_sample') + analyzer._get_sample_stats("nonexistent_sample") def test_min_occurrences_filter(analyzer): """Test that minimum occurrences filter works""" # Set high minimum occurrences to filter out most categories - enriched = analyzer.find_enriched_categories('sample1', min_occurrences=10) + enriched = analyzer.find_enriched_categories("sample1", min_occurrences=10) assert len(enriched) == 0 # No categories should meet this threshold def test_p_value_threshold(analyzer): """Test that p-value threshold works""" # Set very strict p-value threshold - strict_enriched = analyzer.find_enriched_categories('sample1', p_value_threshold=0.0001) + strict_enriched = analyzer.find_enriched_categories("sample1", p_value_threshold=0.0001) # Set loose p-value threshold - loose_enriched = analyzer.find_enriched_categories('sample1', p_value_threshold=0.5) + loose_enriched = analyzer.find_enriched_categories("sample1", p_value_threshold=0.5) # Should find more enriched categories with looser threshold assert len(strict_enriched) <= len(loose_enriched) @@ -172,10 +166,10 @@ def test_p_value_threshold(analyzer): def test_result_sorting(analyzer): """Test that results are properly sorted by 
p-value""" - enriched = analyzer.find_enriched_categories('sample1') + enriched = analyzer.find_enriched_categories("sample1") p_values = [p for _, _, p in enriched] assert p_values == sorted(p_values) # Should be sorted in ascending order -if __name__ == '__main__': - pytest.main([__file__]) \ No newline at end of file +if __name__ == "__main__": + pytest.main([__file__]) From 993845b563c8bda51366ff098b7a3b5432ec783a Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 6 Mar 2025 19:14:48 -0800 Subject: [PATCH 3/3] checks --- .../inference/implementations/sklearn_inference_engine.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/linkml_store/inference/implementations/sklearn_inference_engine.py b/src/linkml_store/inference/implementations/sklearn_inference_engine.py index c8f8810..2f83990 100644 --- a/src/linkml_store/inference/implementations/sklearn_inference_engine.py +++ b/src/linkml_store/inference/implementations/sklearn_inference_engine.py @@ -94,6 +94,8 @@ def initialize_model(self, **kwargs): if not feature_cols: feature_cols = df.columns.difference(target_cols).tolist() self.config.feature_attributes = feature_cols + if not feature_cols: + raise ValueError("No features found in the data") target_col = target_cols[0] logger.info(f"Feature columns: {feature_cols}") X = df[feature_cols].copy() @@ -102,6 +104,8 @@ def initialize_model(self, **kwargs): # find list of features to skip (categorical with > N categories) skip_features = [] + if not len(X.columns): + raise ValueError("No features to train on") for col in X.columns: unique_values = self._get_unique_values(X[col]) if len(unique_values) > self.maximum_proportion_distinct_features * len(X[col]): @@ -115,6 +119,8 @@ def initialize_model(self, **kwargs): # Encode features encoded_features = [] + if not len(X.columns): + raise ValueError(f"No features to train on from after skipping {skip_features}") for col in X.columns: logger.info(f"Checking whether to encode: {col}") col_encoder = self._get_encoder(X[col]) @@ -153,7 +159,7 @@ def initialize_model(self, **kwargs): y = y_encoder.fit_transform(y.values.ravel()) # Convert to 1D numpy array self.transformed_targets = y_encoder.classes_ - # print(f"Fitting model with features: {X.columns}") + # print(f"Fitting model with features: {X.columns}, y={y}, X={X}") clf = DecisionTreeClassifier(random_state=42) clf.fit(X, y) self.classifier = clf