From 00aa2e630b4ea1119cee7d353952b1a8cfb01166 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 6 Mar 2025 18:58:47 -0800 Subject: [PATCH 1/3] Simplified referential integrity checking. Fixed correct model for tiktoken, fixes #41 --- docs/how-to/Check-Referential-Integrity.ipynb | 569 +++++++++++++++--- .../Index-Bioinformatics-Databases.ipynb | 290 +++++++++ src/linkml_store/api/database.py | 30 +- .../api/stores/mongodb/mongodb_database.py | 2 + src/linkml_store/cli.py | 16 +- .../index/implementations/llm_indexer.py | 17 +- src/linkml_store/utils/format_utils.py | 4 + src/linkml_store/utils/llm_utils.py | 1 + tests/test_index/test_index.py | 1 + 9 files changed, 838 insertions(+), 92 deletions(-) create mode 100644 docs/how-to/Index-Bioinformatics-Databases.ipynb diff --git a/docs/how-to/Check-Referential-Integrity.ipynb b/docs/how-to/Check-Referential-Integrity.ipynb index da826db..730b1aa 100644 --- a/docs/how-to/Check-Referential-Integrity.ipynb +++ b/docs/how-to/Check-Referential-Integrity.ipynb @@ -14,8 +14,6 @@ }, { "cell_type": "code", - "execution_count": 1, - "outputs": [], "source": [ "from linkml_store import Client\n", "\n", @@ -24,36 +22,50 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.760981Z", - "start_time": "2024-05-04T19:51:08.378243Z" + "end_time": "2025-03-07T02:08:28.986444Z", + "start_time": "2025-03-07T02:08:27.758566Z" } }, - "id": "initial_id" + "id": "initial_id", + "outputs": [], + "execution_count": 1 }, { "cell_type": "code", - "execution_count": 2, - "outputs": [], "source": [ - "db = client.attach_database(\"mongodb://localhost:27017\", \"test\")\n", + "db = client.attach_database(\"mongodb://localhost:27017\", \"test-ri\")\n", "db.metadata.ensure_referential_integrity = True\n", - "db.set_schema_view(\"../../tests/input/countries/countries.linkml.yaml\")\n", - "countries_coll = db.create_collection(\"Country\", alias=\"countries\", recreate_if_exists=True)\n", - "routes_coll = db.create_collection(\"Route\", alias=\"routes\", recreate_if_exists=True)" + "db.set_schema_view(\"../../tests/input/countries/countries.linkml.yaml\")\n" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.788932Z", - "start_time": "2024-05-04T19:51:09.771112Z" + "end_time": "2025-03-07T02:08:29.030994Z", + "start_time": "2025-03-07T02:08:28.989892Z" } }, - "id": "cc164c0acbe4c39d" + "id": "cc164c0acbe4c39d", + "outputs": [], + "execution_count": 2 }, { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:08:29.335618Z", + "start_time": "2025-03-07T02:08:29.318131Z" + } + }, "cell_type": "code", - "execution_count": 3, + "source": [ + "countries_coll = db.create_collection(\"Country\", alias=\"countries\", recreate_if_exists=True)\n", + "routes_coll = db.create_collection(\"Route\", alias=\"routes\", recreate_if_exists=True)" + ], + "id": "cec53323f880da30", "outputs": [], + "execution_count": 5 + }, + { + "cell_type": "code", "source": [ "COUNTRIES = \"../../tests/input/countries/countries.jsonl\"\n", "ROUTES = \"../../tests/input/countries/routes.csv\"" @@ -61,25 +73,16 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.789681Z", - "start_time": "2024-05-04T19:51:09.786454Z" + "end_time": "2025-03-07T02:08:29.343921Z", + "start_time": "2025-03-07T02:08:29.341972Z" } }, - "id": "5286ef4e9dd0f316" + "id": "5286ef4e9dd0f316", + "outputs": [], + "execution_count": 6 }, { "cell_type": "code", - "execution_count": 4, - "outputs": [ - { - "data": { - 
"text/plain": "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from linkml_store.utils.format_utils import load_objects\n", "\n", @@ -90,16 +93,27 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.795894Z", - "start_time": "2024-05-04T19:51:09.790413Z" + "end_time": "2025-03-07T02:08:29.353362Z", + "start_time": "2025-03-07T02:08:29.349890Z" } }, - "id": "2e21988e4fc13f58" + "id": "2e21988e4fc13f58", + "outputs": [ + { + "data": { + "text/plain": [ + "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 }, { "cell_type": "code", - "execution_count": 5, - "outputs": [], "source": [ "countries_coll.insert(countries)\n", "routes_coll.insert(routes)" @@ -107,41 +121,43 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.803272Z", - "start_time": "2024-05-04T19:51:09.798758Z" + "end_time": "2025-03-07T02:08:29.583920Z", + "start_time": "2025-03-07T02:08:29.359788Z" } }, - "id": "668e59a8f28e7bfe" + "id": "668e59a8f28e7bfe", + "outputs": [], + "execution_count": 8 }, { "cell_type": "code", - "execution_count": 6, - "outputs": [ - { - "data": { - "text/plain": "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "routes_coll.find().rows" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.810617Z", - "start_time": "2024-05-04T19:51:09.804004Z" + "end_time": "2025-03-07T02:08:29.596327Z", + "start_time": "2025-03-07T02:08:29.591085Z" } }, - "id": "995e63f873ea9353" + "id": "995e63f873ea9353", + "outputs": [ + { + "data": { + "text/plain": [ + "[{'origin': 'DE', 'destination': 'FR', 'method': 'rail'}]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 9 }, { "cell_type": "code", - "execution_count": 7, - "outputs": [], "source": [ "for result in db.iter_validate_database():\n", " print(result)" @@ -149,11 +165,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.956191Z", - "start_time": "2024-05-04T19:51:09.809082Z" + "end_time": "2025-03-07T02:08:29.737342Z", + "start_time": "2025-03-07T02:08:29.602408Z" } }, - "id": "a8ef16a3fbc6bfe6" + "id": "a8ef16a3fbc6bfe6", + "outputs": [], + "execution_count": 10 }, { "cell_type": "markdown", @@ -169,72 +187,463 @@ }, { "cell_type": "code", - "execution_count": 8, - "outputs": [], "source": [ "routes_coll.insert({\"origin\": \"ZZZ\", \"destination\": \"YYY\", \"method\": \"rail\"})" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.961815Z", - "start_time": "2024-05-04T19:51:09.956721Z" + "end_time": "2025-03-07T02:08:29.747005Z", + "start_time": "2025-03-07T02:08:29.743644Z" } }, - "id": "f712a82be775f413" + "id": "f712a82be775f413", + "outputs": [], + "execution_count": 11 }, { "cell_type": "code", - "execution_count": 9, + "source": [ + "routes_coll.find().rows_dataframe" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-07T02:08:29.766855Z", + "start_time": "2025-03-07T02:08:29.753525Z" + } + }, + "id": "18ffa996e3893b96", "outputs": [ { "data": { - "text/plain": " origin destination method\n0 DE FR rail\n1 ZZZ YYY rail", - 
"text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
origindestinationmethod
0DEFRrail
1ZZZYYYrail
\n
" + "text/plain": [ + " origin destination method\n", + "0 DE FR rail\n", + "1 ZZZ YYY rail" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
origindestinationmethod
0DEFRrail
1ZZZYYYrail
\n", + "
" + ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], + "execution_count": 12 + }, + { + "cell_type": "code", + "source": "results = list(db.iter_validate_database())", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-07T02:08:29.880295Z", + "start_time": "2025-03-07T02:08:29.792681Z" + } + }, + "id": "c67517aece5d47c5", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='ZZZ' instance_index=None instantiates='Country' context=[] source=None\n", + "type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='YYY' instance_index=None instantiates='Country' context=[] source=None\n" + ] + } + ], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:09:42.929682Z", + "start_time": "2025-03-07T02:09:42.926860Z" + } + }, + "cell_type": "code", + "source": "assert any(r for r in results if \"Referential integrity\" in r.message)", + "id": "ab65fa35df1319fa", + "outputs": [], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:09:59.275684Z", + "start_time": "2025-03-07T02:09:59.273035Z" + } + }, + "cell_type": "code", "source": [ - "routes_coll.find().rows_dataframe" + "for result in results:\n", + " print(\"Expected error: \", result)" + ], + "id": "755df23ea86fb8fe", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Expected error: type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='ZZZ' instance_index=None instantiates='Country' context=[] source=None\n", + "Expected error: type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='YYY' instance_index=None instantiates='Country' context=[] source=None\n" + ] + } + ], + "execution_count": 16 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Command Line Example using DuckDB\n", + "\n", + "We'll next show a command line example; we will use DuckDB here and CSVs, but the same principles apply to other databases and formats.\n", + "\n", + "First we'll make two CSVs, one for patients and one for samples. 
The samples will refer to patients.\n" ], + "id": "cbfa9918f43120bb" + }, + { "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:51:09.974226Z", - "start_time": "2024-05-04T19:51:09.961675Z" + "end_time": "2025-03-07T02:53:43.124840Z", + "start_time": "2025-03-07T02:53:43.120716Z" } }, - "id": "18ffa996e3893b96" + "cell_type": "code", + "source": [ + "PATIENTS = \"\"\"id,name,age\n", + "p1,John Doe,34\n", + "p2,Jane Doe,65\n", + "\"\"\"\n", + "with open(\"output/patients.csv\", \"w\") as stream:\n", + " stream.write(PATIENTS)" + ], + "id": "c5180f555f0d8532", + "outputs": [], + "execution_count": 81 }, { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:53:44.072343Z", + "start_time": "2025-03-07T02:53:44.069087Z" + } + }, "cell_type": "code", - "execution_count": 16, + "source": [ + "SAMPLES = \"\"\"id,patient\n", + "s1,p1\n", + "s2,p2\n", + "s3,p2\n", + "\"\"\"\n", + "with open(\"output/samples.csv\", \"w\") as stream:\n", + " stream.write(SAMPLES)" + ], + "id": "b98c49c121875d2c", + "outputs": [], + "execution_count": 82 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "Note this dataset is well-behaved, every sample refers to a patient.\n", + "\n", + "There is one issue with the data though, and that is that the default loader doesn't perform ptype inference, so the ages will\n", + "be treated as strings.\n", + "\n", + "Next we'll add a schema file" + ], + "id": "e59cea007cb4677a" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:03.648738Z", + "start_time": "2025-03-07T02:57:03.642949Z" + } + }, + "cell_type": "code", + "source": [ + "SCHEMA = \"\"\"\n", + "id: http://example.org/patients\n", + "name: patients\n", + "description: Patients and samples\n", + "prefixes:\n", + " linkml: http://w3id.org/linkml/\n", + " ex: http://example.org/\n", + "default_prefix: ex \n", + "imports:\n", + " - linkml:types\n", + "classes:\n", + " Sample:\n", + " attributes:\n", + " id:\n", + " identifier: true\n", + " patient:\n", + " range: Patient\n", + "\n", + " Patient:\n", + " attributes:\n", + " id:\n", + " identifier: true\n", + " name:\n", + " required: true\n", + " age:\n", + " range: integer\n", + "\"\"\"\n", + "with open(\"output/patients.linkml.yaml\", \"w\") as stream:\n", + " stream.write(SCHEMA)" + ], + "id": "bce56a2623bda439", + "outputs": [], + "execution_count": 86 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Load data into DuckDB\n", + "\n", + "We'll first clear any older databases we may have created" + ], + "id": "89949c3688a654d2" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:04.596382Z", + "start_time": "2025-03-07T02:57:04.593728Z" + } + }, + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "\n", + "Path(\"output/patient_samples.ddb\").unlink(missing_ok=True)" + ], + "id": "d1688d7868c91f51", + "outputs": [], + "execution_count": 87 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Then we'll load the data", + "id": "d137280f635ffdaf" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:09.101026Z", + "start_time": "2025-03-07T02:57:05.936337Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store \\\n", + " -d output/patient_samples.ddb \\\n", + " -c Patient \\\n", + " insert output/patients.csv\n" + ], + "id": "3fb54173c9dc7ef6", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "type='ReferentialIntegrity' severity= 
message='Referential integrity error: Country not found' instance='ZZZ' instance_index=None instantiates='Country'\n", - "type='ReferentialIntegrity' severity= message='Referential integrity error: Country not found' instance='YYY' instance_index=None instantiates='Country'\n" + "Inserted 2 objects from output/patients.csv into collection 'Patient'.\n" ] } ], + "execution_count": 88 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:11.542085Z", + "start_time": "2025-03-07T02:57:09.108440Z" + } + }, + "cell_type": "code", "source": [ - "results = list(db.iter_validate_database())\n", - "for result in results:\n", - " print(result)" + "%%bash\n", + "linkml-store \\\n", + " -d output/patient_samples.ddb \\\n", + " -c Sample \\\n", + " insert output/samples.csv" + ], + "id": "b02ecd6e707d8c4", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 3 objects from output/samples.csv into collection 'Sample'.\n" + ] + } ], + "execution_count": 89 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Check Referential Integrity (no RI)\n", + "\n", + "We don't expect any referential integrity issues here\n", + "\n" + ], + "id": "beb5290779c89866" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:27.170889Z", + "start_time": "2025-03-07T02:57:24.627460Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store --schema output/patients.linkml.yaml -d output/patient_samples.ddb validate -O csv" + ], + "id": "1e2fac4b84ac1188", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "type,severity,message,instance,instance_index,instantiates,context\r\n", + "jsonschema validation,ERROR,\"'34' is not of type 'integer', 'null' in /age\",\"{'id': 'p1', 'name': 'John Doe', 'age': '34'}\",0,Patient,[]\r\n", + "jsonschema validation,ERROR,\"'65' is not of type 'integer', 'null' in /age\",\"{'id': 'p2', 'name': 'Jane Doe', 'age': '65'}\",0,Patient,[]\r\n", + "\n" + ] + } + ], + "execution_count": 90 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Adding dangling references\n", + "\n", + "We'll deliberately add a sample that refers to a non-existent patient" + ], + "id": "fcfe323a8374efe7" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-07T02:57:51.753795Z", + "start_time": "2025-03-07T02:57:48.526129Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store \\\n", + " -d output/patient_samples.ddb \\\n", + " -c Sample \\\n", + " insert --object '{\"id\": \"s4\", \"patient\": \"p3\"}'" + ], + "id": "fbd2644bdba7b35", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 1 objects from {\"id\": \"s4\", \"patient\": \"p3\"} into collection 'Sample'.\n" + ] + } + ], + "execution_count": 91 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "And then re-validate", + "id": "6632297dfd6934d6" + }, + { "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T19:52:20.044928Z", - "start_time": "2024-05-04T19:52:19.996008Z" + "end_time": "2025-03-07T02:58:06.960138Z", + "start_time": "2025-03-07T02:58:03.546955Z" } }, - "id": "c67517aece5d47c5" + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store --schema output/patients.linkml.yaml --set ensure_referential_integrity=true -d output/patient_samples.ddb validate -O csv" + ], + "id": "9c572e7e68343dee", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "type,severity,message,instance,instance_index,instantiates,context\r\n", + "jsonschema validation,ERROR,\"'34' is not of type 'integer', 'null' in /age\",\"{'id': 'p1', 'name': 'John Doe', 'age': '34'}\",0,Patient,[]\r\n", + "jsonschema validation,ERROR,\"'65' is not of type 'integer', 'null' in /age\",\"{'id': 'p2', 'name': 'Jane Doe', 'age': '65'}\",0,Patient,[]\r\n", + "ReferentialIntegrity,ERROR,Referential integrity error: Patient not found,p3,,Patient,[]\r\n", + "\n" + ] + } + ], + "execution_count": 92 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "edd5d9b201dbfa5f" } ], "metadata": { diff --git a/docs/how-to/Index-Bioinformatics-Databases.ipynb b/docs/how-to/Index-Bioinformatics-Databases.ipynb new file mode 100644 index 0000000..7cf7db1 --- /dev/null +++ b/docs/how-to/Index-Bioinformatics-Databases.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Indexing Bioinformatics Databases\n", + "\n" + ], + "id": "eb43a476bbbf18d1" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## SIB Expasy Enzyme Database", + "id": "5e03abfb81d962cc" + }, + { + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2025-03-06T03:04:45.923151Z", + "start_time": "2025-03-06T03:04:42.738146Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store -d mongodb://localhost:27017/bioinf -c enzyme insert ftp://ftp.expasy.org/databases/enzyme/enzyme.dat" + ], + "id": "initial_id", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 8371 objects from ftp://ftp.expasy.org/databases/enzyme/enzyme.dat into collection 'enzyme'.\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-06T17:50:12.539044Z", + "start_time": "2025-03-06T17:50:05.567015Z" + } + }, + "cell_type": "code", + "source": [ + "%%bash\n", + "linkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \"degradation pathways\" -l 10 " + ], + "id": "eecde4757986f082", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: phenopackets, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gaf_mgi, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gaf_pombase, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gaf_gcrp, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: nmdc, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: amigo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gocams, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: cadsr, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: npatlas, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: obo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: metabolights, base_dir: /Users/cjm\n", + "2025-03-06 
09:50:11,450 - linkml_store.api.client - INFO - Initializing database: mibig, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: mixs, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: mondo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: hpoa, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: hpoa_mongo, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: hpoa_kg, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: maxoa, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: refmet, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: neo4j, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: gold, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Initializing database: nmdc_duckdb, base_dir: /Users/cjm\n", + "2025-03-06 09:50:11,450 - linkml_store.api.client - INFO - Creating/attaching database: mongodb://localhost:27017/bioinf\n", + "2025-03-06 09:50:11,490 - linkml_store.api.client - INFO - Initializing databases\n", + "2025-03-06 09:50:11,490 - linkml_store.api.client - INFO - Attaching mongodb://localhost:27017/bioinf\n", + "2025-03-06 09:50:11,490 - linkml_store.api.database - DEBUG - Initializing collections\n", + "2025-03-06 09:50:11,494 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Starting topology monitoring\"}\n", + "2025-03-06 09:50:11,494 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"previousDescription\": \"\", \"newDescription\": \"]>\", \"message\": \"Topology description changed\"}\n", + "2025-03-06 09:50:11,494 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"message\": \"Starting server monitoring\"}\n", + "2025-03-06 09:50:11,494 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection pool created\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,494 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"listCollections\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,495 - pymongo.serverSelection - DEBUG - {\"message\": \"Waiting for suitable server to become available\", \"selector\": \"Primary()\", \"operation\": \"listCollections\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"remainingTimeMS\": 29}\n", + "2025-03-06 09:50:11,497 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"driverConnectionId\": 1, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": false, \"message\": \"Server heartbeat started\"}\n", + "2025-03-06 09:50:11,500 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"driverConnectionId\": 1, \"serverConnectionId\": 1165, 
\"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": false, \"durationMS\": 2.4912499357014894, \"reply\": \"{\\\"helloOk\\\": true, \\\"ismaster\\\": true, \\\"topologyVersion\\\": {\\\"processId\\\": {\\\"$oid\\\": \\\"67bd405a9a90e0cc87eb813e\\\"}}, \\\"maxBsonObjectSize\\\": 16777216, \\\"maxMessageSizeBytes\\\": 48000000, \\\"maxWriteBatchSize\\\": 100000, \\\"localTime\\\": {\\\"$date\\\": \\\"2025-03-06T17:50:11.499Z\\\"}, \\\"logicalSessionTimeoutMinutes\\\": 30, \\\"connectionId\\\": 1165, \\\"maxWireVersion\\\": 21, \\\"ok\\\": 1.0}\", \"message\": \"Server heartbeat succeeded\"}\n", + "2025-03-06 09:50:11,500 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection pool ready\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,500 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"previousDescription\": \"]>\", \"newDescription\": \"]>\", \"message\": \"Topology description changed\"}\n", + "2025-03-06 09:50:11,500 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"listCollections\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,500 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,500 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection created\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,500 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"driverConnectionId\": 1, \"serverConnectionId\": 1165, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": true, \"message\": \"Server heartbeat started\"}\n", + "2025-03-06 09:50:11,503 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection ready\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 0.0017190839862450957}\n", + "2025-03-06 09:50:11,503 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 0.0026320830220356584}\n", + "2025-03-06 09:50:11,503 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"listCollections\\\": 1, \\\"nameOnly\\\": true, \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"listCollections\", \"databaseName\": \"bioinf\", \"requestId\": 1144108930, \"operationId\": 1144108930, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,507 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 4.3759999999999994, \"reply\": \"{\\\"cursor\\\": {\\\"ns\\\": \\\"bioinf.$cmd.listCollections\\\", 
\\\"firstBatch\\\": [{\\\"name\\\": \\\"enzyme\\\", \\\"type\\\": \\\"collection\\\"}]}, \\\"ok\\\": 1.0}\", \"commandName\": \"listCollections\", \"databaseName\": \"bioinf\", \"requestId\": 1144108930, \"operationId\": 1144108930, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,507 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,507 - linkml_store.cli - INFO - Attaching index to collection enzyme: {'name': 'llm', 'index_type': 'llm', 'index_function': None, 'distance_function': None, 'index_attributes': None, 'text_template': None, 'text_template_syntax': None, 'filter_nulls': True, 'vector_default_length': 1000, 'index_field': '__index__', 'embedding_model_name': 'ada-002', 'cached_embeddings_database': None, 'cached_embeddings_collection': None, 'cache_queries': False, 'truncation_method': None}\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - DEBUG - Pre-query hook (state: None; Q= None\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - INFO - No metadata for enzyme; no derivations\n", + "2025-03-06 09:50:11,507 - linkml_store.api.database - DEBUG - Creating new collection: internal__index__enzyme__llm kwargs: {}\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - DEBUG - Using indexer with name llm\n", + "2025-03-06 09:50:11,507 - linkml_store.api.collection - DEBUG - Pre-query hook (state: None; Q= from_table='internal__index__enzyme__llm' select_cols=None where_clause={} sort_by=None limit=None offset=None include_facet_counts=False facet_slots=None\n", + "2025-03-06 09:50:11,508 - linkml_store.api.collection - INFO - No metadata for internal__index__enzyme__llm; no derivations\n", + "2025-03-06 09:50:11,508 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,508 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,508 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,508 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 3.2374984584748745e-05}\n", + "2025-03-06 09:50:11,508 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"find\\\": \\\"internal__index__enzyme__llm\\\", \\\"limit\\\": 1, \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 470211272, \"operationId\": 470211272, \"driverConnectionId\": 1, \"serverConnectionId\": 
1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 0.9810000000000001, \"reply\": \"{\\\"cursor\\\": {\\\"ns\\\": \\\"bioinf.internal__index__enzyme__llm\\\"}, \\\"ok\\\": 1.0}\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 470211272, \"operationId\": 470211272, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,509 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,509 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,509 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 2.9582995921373367e-05}\n", + "2025-03-06 09:50:11,509 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"aggregate\\\": \\\"internal__index__enzyme__llm\\\", \\\"pipeline\\\": [{\\\"$group\\\": {\\\"_id\\\": 1, \\\"n\\\": {\\\"$sum\\\": 1}}}], \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 101027544, \"operationId\": 101027544, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 0.8019999999999999, \"reply\": \"{\\\"cursor\\\": {\\\"ns\\\": \\\"bioinf.internal__index__enzyme__llm\\\"}, \\\"ok\\\": 1.0}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 101027544, \"operationId\": 101027544, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,510 - linkml_store.api.collection - INFO - Index llm is empty; indexing all objects\n", + "2025-03-06 09:50:11,510 - linkml_store.api.collection - DEBUG - Pre-query hook (state: 
True; Q= from_table='enzyme' select_cols=None where_clause=None sort_by=None limit=None offset=None include_facet_counts=False facet_slots=None\n", + "2025-03-06 09:50:11,510 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,510 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"find\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,510 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 0.0001430839765816927}\n", + "2025-03-06 09:50:11,510 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"find\\\": \\\"enzyme\\\", \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 1457850878, \"operationId\": 1457850878, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,512 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 1.246, \"reply\": \"{\\\"cursor\\\": {\\\"firstBatch\\\": [{\\\"_id\\\": {\\\"$oid\\\": \\\"67c910cdad347dd8ce68a02d\\\"}, \\\"ID\\\": \\\"1.1.1.1\\\", \\\"DE\\\": \\\"alcohol dehydrogenase\\\", \\\"AN\\\": [\\\"aldehyde reductase\\\"], \\\"CA\\\": [\\\"(1) a primary alcohol + NAD(+) = an aldehyde + NADH + H(+)\\\", \\\"(2) a secondary alcohol + NAD(+) = a ketone + NADH + H(+)\\\"], \\\"CC\\\": [\\\"-!- Acts on primary or secondary alcohols or hemi-acetals with very broad specificity; however the enzyme oxidizes methanol much more poorly than ethanol.\\\", \\\"-!- The animal, but not the yeast, enzyme acts also on cyclic secondary alcohols.\\\", \\\"-!- Formerly EC 1.1.1.32.\\\"], \\\"DR\\\": \\\"P07327, ADH1A_HUMAN; P28469, ADH1A_MACMU; Q5RBP7, ADH1A_PONAB;P25405, ADH1A_SAAHA; P25406, ADH1B_SAAHA; P00327, ADH1E_HORSE;P00326, ADH1G_HUMAN; O97959, ADH1G_PAPHA; P00328, ADH1S_HORSE;P80222, ADH1_ALLMI ; P30350, ADH1_ANAPL ; P49645, ADH1_APTAU ;P06525, ADH1_ARATH ; P41747, ADH1_ASPFN ; Q17334, ADH1_CAEEL ;P43067, ADH1_CANAX ; P85440, ADH1_CATRO ; P14219, ADH1_CENAM ;P48814, ADH1_CERCA ; Q70UN9, ADH1_CERCO ; P23...\", \"commandName\": \"find\", \"databaseName\": \"bioinf\", \"requestId\": 1457850878, \"operationId\": 1457850878, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,513 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + 
"2025-03-06 09:50:11,513 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection started\", \"selector\": \"\", \"operation\": \"getMore\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,513 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"\", \"operation\": \"getMore\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,513 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,513 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 3.4708064049482346e-05}\n", + "2025-03-06 09:50:11,513 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"getMore\\\": 3344471647564691207, \\\"collection\\\": \\\"enzyme\\\", \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"getMore\", \"databaseName\": \"bioinf\", \"requestId\": 1458777923, \"operationId\": 1458777923, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,561 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 47.666, \"reply\": \"{\\\"cursor\\\": {\\\"nextBatch\\\": [{\\\"_id\\\": {\\\"$oid\\\": \\\"67c910cdad347dd8ce68a092\\\"}, \\\"ID\\\": \\\"1.1.1.102\\\", \\\"DE\\\": \\\"3-dehydrosphinganine reductase\\\", \\\"AN\\\": [\\\"3-ketosphinganine reductase\\\", \\\"3-oxosphinganine:NADPH oxidoreductase\\\", \\\"3-oxosphinganine reductase\\\", \\\"D-3-dehydrosphinganine reductase\\\", \\\"D-3-oxosphinganine:B-NADPH oxidoreductase\\\", \\\"D-3-oxosphinganine reductase\\\", \\\"DSR\\\", \\\"KTS reductase\\\"], \\\"CA\\\": [\\\"sphinganine + NADP(+) = 3-oxosphinganine + NADPH + H(+)\\\"], \\\"DR\\\": \\\"Q9Y7P2, GPI11_SCHPO; Q0WRJ2, KDSRA_ARATH; F4JZN6, KDSRB_ARATH;Q4WSZ0, KDSR_ASPFU ; Q8A945, KDSR_BACTN ; Q2KIJ5, KDSR_BOVIN ;Q59RQ2, KDSR_CANAL ; Q6FQ42, KDSR_CANGA ; P0CR37, KDSR_CRYNB ;P0CR36, KDSR_CRYNJ ; F1QWW8, KDSR_DANRE ; Q6BQK1, KDSR_DEBHA ;Q556J2, KDSR_DICDI ; Q5BE65, KDSR_EMENI ; Q758B6, KDSR_EREGS ;Q06136, KDSR_HUMAN ; Q6CLN0, KDSR_KLULA ; Q6GV12, KDSR_MOUSE ;Q7RZR2, KDSR_NEUCR ; Q6CE86, KDSR_YARLI ; P38342, KDSR_YEAST ;\\\"}, {\\\"_id\\\": {\\\"$oid\\\": \\\"67c910cdad347dd8ce68a093\\\"}, \\\"ID\\\": \\\"1.1.1.103\\\", \\\"DE\\\": \\\"L-threonine 3-dehydrogen...\", \"commandName\": \"getMore\", \"databaseName\": \"bioinf\", \"requestId\": 1458777923, \"operationId\": 1458777923, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,561 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,566 - pymongo.serverSelection - 
DEBUG - {\"message\": \"Server selection started\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}}\n", + "2025-03-06 09:50:11,567 - pymongo.serverSelection - DEBUG - {\"message\": \"Server selection succeeded\", \"selector\": \"Primary()\", \"operation\": \"count\", \"topologyDescription\": \"]>\", \"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,567 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checkout started\", \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,567 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked out\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1, \"durationMS\": 4.2749918065965176e-05}\n", + "2025-03-06 09:50:11,567 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command started\", \"command\": \"{\\\"aggregate\\\": \\\"enzyme\\\", \\\"pipeline\\\": [{\\\"$group\\\": {\\\"_id\\\": 1, \\\"n\\\": {\\\"$sum\\\": 1}}}], \\\"lsid\\\": {\\\"id\\\": {\\\"$binary\\\": {\\\"base64\\\": \\\"7XlM7LkFRb+PVBWeJpTtRw==\\\", \\\"subType\\\": \\\"04\\\"}}}, \\\"$db\\\": \\\"bioinf\\\"}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 2007237709, \"operationId\": 2007237709, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,573 - pymongo.command - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Command succeeded\", \"durationMS\": 5.842, \"reply\": \"{\\\"cursor\\\": {\\\"firstBatch\\\": [{\\\"_id\\\": 1, \\\"n\\\": 8371}], \\\"ns\\\": \\\"bioinf.enzyme\\\"}, \\\"ok\\\": 1.0}\", \"commandName\": \"aggregate\", \"databaseName\": \"bioinf\", \"requestId\": 2007237709, \"operationId\": 2007237709, \"driverConnectionId\": 1, \"serverConnectionId\": 1167, \"serverHost\": \"localhost\", \"serverPort\": 27017}\n", + "2025-03-06 09:50:11,573 - pymongo.connection - DEBUG - {\"clientId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"message\": \"Connection checked in\", \"serverHost\": \"localhost\", \"serverPort\": 27017, \"driverConnectionId\": 1}\n", + "2025-03-06 09:50:11,618 - root - INFO - Converting 8371 texts to vectors\n", + "2025-03-06 09:50:11,635 - root - INFO - Token limit for text-embedding-ada-002: 7892\n", + "2025-03-06 09:50:11,639 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): openaipublic.blob.core.windows.net:443\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connection.py\", line 198, in _new_conn\n", + " sock = connection.create_connection(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/util/connection.py\", line 60, in create_connection\n", + " for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/socket.py\", line 955, in getaddrinfo\n", + " for res in _socket.getaddrinfo(host, port, family, type, proto, flags):\n", + "socket.gaierror: [Errno 8] nodename nor servname 
provided, or not known\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 787, in urlopen\n", + " response = self._make_request(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 488, in _make_request\n", + " raise new_e\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 464, in _make_request\n", + " self._validate_conn(conn)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 1093, in _validate_conn\n", + " conn.connect()\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connection.py\", line 704, in connect\n", + " self.sock = sock = self._new_conn()\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connection.py\", line 205, in _new_conn\n", + " raise NameResolutionError(self.host, self, e) from e\n", + "urllib3.exceptions.NameResolutionError: : Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/adapters.py\", line 667, in send\n", + " resp = conn.urlopen(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/connectionpool.py\", line 841, in urlopen\n", + " retries = retries.increment(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/urllib3/util/retry.py\", line 519, in increment\n", + " raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]\n", + "urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_base.tiktoken (Caused by NameResolutionError(\": Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)\"))\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/bin/linkml-store\", line 6, in \n", + " sys.exit(cli())\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1161, in __call__\n", + " return self.main(*args, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1082, in main\n", + " rv = self.invoke(ctx)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1697, in invoke\n", + " return 
_process_result(sub_ctx.command.invoke(sub_ctx))\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 1443, in invoke\n", + " return ctx.invoke(self.callback, **ctx.params)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/core.py\", line 788, in invoke\n", + " return __callback(*args, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/click/decorators.py\", line 33, in new_func\n", + " return f(get_current_context(), *args, **kwargs)\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/cli.py\", line 882, in search\n", + " result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/api/collection.py\", line 595, in search\n", + " self.index_objects(all_objs, index_name, replace=True, **kwargs)\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/api/collection.py\", line 905, in index_objects\n", + " vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/index/indexer.py\", line 103, in objects_to_vectors\n", + " return self.texts_to_vectors([self.object_to_text(obj) for obj in objs])\n", + " File \"/Users/cjm/repos/linkml-store/src/linkml_store/index/implementations/llm_indexer.py\", line 73, in texts_to_vectors\n", + " encoding = encoding_for_model(\"gpt-4o\")\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/model.py\", line 110, in encoding_for_model\n", + " return get_encoding(encoding_name_for_model(model_name))\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/registry.py\", line 86, in get_encoding\n", + " enc = Encoding(**constructor())\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken_ext/openai_public.py\", line 96, in o200k_base\n", + " mergeable_ranks = load_tiktoken_bpe(\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/load.py\", line 148, in load_tiktoken_bpe\n", + " contents = read_file_cached(tiktoken_bpe_file, expected_hash)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/load.py\", line 63, in read_file_cached\n", + " contents = read_file(blobpath)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/tiktoken/load.py\", line 22, in read_file\n", + " resp = requests.get(blobpath)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/api.py\", line 73, in get\n", + " return request(\"get\", url, params=params, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/api.py\", line 59, in request\n", + " return session.request(method=method, url=url, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/sessions.py\", line 589, in request\n", + " resp 
= self.send(prep, **send_kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/sessions.py\", line 703, in send\n", + " r = adapter.send(request, **kwargs)\n", + " File \"/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/requests/adapters.py\", line 700, in send\n", + " raise ConnectionError(e, request=request)\n", + "requests.exceptions.ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_base.tiktoken (Caused by NameResolutionError(\": Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)\"))\n", + "2025-03-06 09:50:12,001 - pymongo.topology - DEBUG - {\"topologyId\": {\"$oid\": \"67c9e0533f1b6043a9a9dd16\"}, \"serverHost\": \"localhost\", \"serverPort\": 27017, \"awaited\": true, \"durationMS\": 501.27241597510874, \"failure\": \"\\\"_OperationCancelled('operation cancelled')\\\"\", \"driverConnectionId\": 1, \"message\": \"Server heartbeat failed\"}\n" + ] + }, + { + "ename": "CalledProcessError", + "evalue": "Command 'b'linkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \"degradation pathways\" -l 10 \\n'' returned non-zero exit status 1.", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mCalledProcessError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mget_ipython\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_cell_magic\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mbash\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mlinkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mdegradation pathways\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m -l 10 \u001B[39;49m\u001B[38;5;130;43;01m\\n\u001B[39;49;00m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2543\u001B[0m, in \u001B[0;36mInteractiveShell.run_cell_magic\u001B[0;34m(self, magic_name, line, cell)\u001B[0m\n\u001B[1;32m 2541\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mbuiltin_trap:\n\u001B[1;32m 2542\u001B[0m args \u001B[38;5;241m=\u001B[39m (magic_arg_s, cell)\n\u001B[0;32m-> 2543\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[43mfn\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 2545\u001B[0m \u001B[38;5;66;03m# The code below prevents the output from being displayed\u001B[39;00m\n\u001B[1;32m 2546\u001B[0m \u001B[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001B[39;00m\n\u001B[1;32m 
2547\u001B[0m \u001B[38;5;66;03m# when the last Python token in the expression is a ';'.\u001B[39;00m\n\u001B[1;32m 2548\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mgetattr\u001B[39m(fn, magic\u001B[38;5;241m.\u001B[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001B[38;5;28;01mFalse\u001B[39;00m):\n", + "File \u001B[0;32m~/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/IPython/core/magics/script.py:159\u001B[0m, in \u001B[0;36mScriptMagics._make_script_magic..named_script_magic\u001B[0;34m(line, cell)\u001B[0m\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 158\u001B[0m line \u001B[38;5;241m=\u001B[39m script\n\u001B[0;32m--> 159\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mshebang\u001B[49m\u001B[43m(\u001B[49m\u001B[43mline\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcell\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/IPython/core/magics/script.py:336\u001B[0m, in \u001B[0;36mScriptMagics.shebang\u001B[0;34m(self, line, cell)\u001B[0m\n\u001B[1;32m 331\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m args\u001B[38;5;241m.\u001B[39mraise_error \u001B[38;5;129;01mand\u001B[39;00m p\u001B[38;5;241m.\u001B[39mreturncode \u001B[38;5;241m!=\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[1;32m 332\u001B[0m \u001B[38;5;66;03m# If we get here and p.returncode is still None, we must have\u001B[39;00m\n\u001B[1;32m 333\u001B[0m \u001B[38;5;66;03m# killed it but not yet seen its return code. We don't wait for it,\u001B[39;00m\n\u001B[1;32m 334\u001B[0m \u001B[38;5;66;03m# in case it's stuck in uninterruptible sleep. -9 = SIGKILL\u001B[39;00m\n\u001B[1;32m 335\u001B[0m rc \u001B[38;5;241m=\u001B[39m p\u001B[38;5;241m.\u001B[39mreturncode \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m9\u001B[39m\n\u001B[0;32m--> 336\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m CalledProcessError(rc, cell)\n", + "\u001B[0;31mCalledProcessError\u001B[0m: Command 'b'linkml-store -vv --stacktrace -d mongodb://localhost:27017/bioinf::enzyme search -t llm \"degradation pathways\" -l 10 \\n'' returned non-zero exit status 1." + ] + } + ], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "4264e705488b6d20" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/linkml_store/api/database.py b/src/linkml_store/api/database.py index 5329950..08ad918 100644 --- a/src/linkml_store/api/database.py +++ b/src/linkml_store/api/database.py @@ -595,7 +595,29 @@ def induce_schema_view(self) -> SchemaView: sb.add_class(coll.target_class_name) return SchemaView(sb.schema) - def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]: + def validate_database(self, **kwargs) -> List["ValidationResult"]: + """ + Validate the contents of the database. + + As `iter_validate_database`, but returns a list of validation results. 
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def iter_validate_database(self, ensure_referential_integrity: bool = None, **kwargs) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the database.
 
@@ -635,12 +657,14 @@ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
         'capital' is a required property
         'continent' is a required proper
 
+        :param ensure_referential_integrity: ensure referential integrity
         :param kwargs:
         :return: iterator over validation results
         """
         for collection in self.list_collections():
             yield from collection.iter_validate_collection(**kwargs)
-        if self.metadata.ensure_referential_integrity:
+        if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
+            logger.info(f"Validating referential integrity on {self.alias}")
             yield from self._validate_referential_integrity(**kwargs)
 
     def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
@@ -661,7 +685,9 @@ def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResul
         induced_slots = sv.class_induced_slots(cd.name)
         slot_map = {s.name: s for s in induced_slots}
         # rmap = {s.name: s.range for s in induced_slots}
+        # map slot ranges to a collection where that range is stored
         sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
+        logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
         for obj in collection.find_iter():
             for k, v in obj.items():
                 if k not in sr_to_coll:
diff --git a/src/linkml_store/api/stores/mongodb/mongodb_database.py b/src/linkml_store/api/stores/mongodb/mongodb_database.py
index 499c222..0361bfa 100644
--- a/src/linkml_store/api/stores/mongodb/mongodb_database.py
+++ b/src/linkml_store/api/stores/mongodb/mongodb_database.py
@@ -42,6 +42,8 @@ def _db_name(self) -> str:
             parsed_url = urlparse(self.handle)
             path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
             db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
         else:
             db_name = "default"
         return db_name
diff --git a/src/linkml_store/cli.py b/src/linkml_store/cli.py
index 667357d..9b04618 100644
--- a/src/linkml_store/cli.py
+++ b/src/linkml_store/cli.py
@@ -246,6 +246,8 @@ def insert(ctx, files, replace, object, format, source_field, json_select_query)
     for object_str in object:
         logger.info(f"Parsing: {object_str}")
         objects = yaml.safe_load(object_str)
+        if not isinstance(objects, list):
+            objects = [objects]
         if replace:
             collection.replace(objects)
         else:
@@ -903,12 +905,18 @@ def indexes(ctx):
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--collection-only/--no-collection-only", default=False, show_default=True, help="Only validate specified collection")
+@click.option("--ensure-referential-integrity/--no-ensure-referential-integrity", default=True, show_default=True, help="Ensure referential integrity")
 @click.pass_context
-def validate(ctx, output_type, output):
+def validate(ctx, output_type, output, collection_only, **kwargs):
     """Validate objects in the specified 
collection.""" - collection = ctx.obj["settings"].collection - logger.info(f"Validating collection {collection.alias}") - validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()] + if collection_only: + collection = ctx.obj["settings"].collection + logger.info(f"Validating collection {collection.alias}") + validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection(**kwargs)] + else: + db = ctx.obj["settings"].database + validation_results = [json_dumper.to_dict(x) for x in db.validate_database(**kwargs)] output_data = render_output(validation_results, output_type) if output: with open(output, "w") as f: diff --git a/src/linkml_store/index/implementations/llm_indexer.py b/src/linkml_store/index/implementations/llm_indexer.py index b8707b5..4d19944 100644 --- a/src/linkml_store/index/implementations/llm_indexer.py +++ b/src/linkml_store/index/implementations/llm_indexer.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Optional import numpy as np +import openai from linkml_store.api.config import CollectionConfig from linkml_store.index.indexer import INDEX_ITEM, Indexer @@ -11,6 +12,7 @@ if TYPE_CHECKING: import llm +CHUNK_SIZE = 1000 logger = logging.getLogger(__name__) @@ -25,7 +27,7 @@ class LLMIndexer(Indexer): >>> vector = indexer.text_to_vector("hello") """ - embedding_model_name: str = "ada-002" + embedding_model_name: str = "text-embedding-ada-002" _embedding_model: "llm.EmbeddingModel" = None cached_embeddings_database: str = None cached_embeddings_collection: str = None @@ -52,7 +54,7 @@ def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM: """ return self.texts_to_vectors([text], cache=cache, **kwargs)[0] - def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]: + def texts_to_vectors(self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs) -> List[INDEX_ITEM]: """ Use LLM to embed. 
@@ -60,18 +62,21 @@ def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> Li >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"]) :param texts: + :param cache: + :param token_limit_penalty: :return: """ from tiktoken import encoding_for_model logging.info(f"Converting {len(texts)} texts to vectors") model = self.embedding_model # TODO: make this more accurate - token_limit = get_token_limit(model.model_id) - 200 - encoding = encoding_for_model("gpt-4o") + token_limit = get_token_limit(model.model_id) - token_limit_penalty + logging.info(f"Token limit for {model.model_id}: {token_limit}") + encoding = encoding_for_model(self.embedding_model_name) def truncate_text(text: str) -> str: # split into tokens every 1000 chars: - parts = [text[i : i + 1000] for i in range(0, len(text), 1000)] + parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)] truncated = render_formatted_text( lambda x: "".join(x), parts, @@ -140,5 +145,5 @@ def truncate_text(text: str) -> str: embeddings_collection.commit() else: logger.info(f"Embedding {len(texts)} texts") - embeddings = model.embed_multi(texts) + embeddings = list(model.embed_multi(texts, batch_size=1)) return [np.array(v, dtype=float) for v in embeddings] diff --git a/src/linkml_store/utils/format_utils.py b/src/linkml_store/utils/format_utils.py index 00262ed..c9f9f85 100644 --- a/src/linkml_store/utils/format_utils.py +++ b/src/linkml_store/utils/format_utils.py @@ -440,6 +440,10 @@ def render_output( return "\n".join(json.dumps(obj) for obj in data) elif format == Format.PYTHON: return str(data) + elif format == Format.MARKDOWN: + def as_markdown(obj: dict): + return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()]) + return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data) elif format == Format.TABLE: from tabulate import tabulate return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql") diff --git a/src/linkml_store/utils/llm_utils.py b/src/linkml_store/utils/llm_utils.py index d53d8e7..f28cd15 100644 --- a/src/linkml_store/utils/llm_utils.py +++ b/src/linkml_store/utils/llm_utils.py @@ -76,6 +76,7 @@ def render_formatted_text( return text if not values: raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}") + # remove last element and try again return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit) diff --git a/tests/test_index/test_index.py b/tests/test_index/test_index.py index 2bb5826..e60f8f4 100644 --- a/tests/test_index/test_index.py +++ b/tests/test_index/test_index.py @@ -56,3 +56,4 @@ def test_index(index_class, texts): # Ensure the queried text appears at the top of the search results exact_matches = [r[1] for r in results if np.isclose(r[0], 1.0, rtol=1e-3)] assert text_id in exact_matches, f"Exact match not found in : {results}" + From e9f4e2247f1b4becb968c91655c3a32e8be91260 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 6 Mar 2025 19:05:59 -0800 Subject: [PATCH 2/3] format-code --- src/linkml_store/api/client.py | 5 +- src/linkml_store/api/collection.py | 42 +++--- src/linkml_store/api/database.py | 4 +- .../api/stores/duckdb/duckdb_database.py | 6 +- .../api/stores/filesystem/__init__.py | 2 +- .../api/stores/mongodb/mongodb_collection.py | 27 ++-- .../api/stores/solr/solr_collection.py | 10 +- src/linkml_store/cli.py | 49 +++++-- .../index/implementations/llm_indexer.py | 5 +- src/linkml_store/index/indexer.py | 11 +- 
.../implementations/llm_inference_engine.py | 22 +-- .../implementations/rag_inference_engine.py | 23 +-- .../inference/inference_config.py | 1 + src/linkml_store/utils/dat_parser.py | 26 ++-- src/linkml_store/utils/enrichment_analyzer.py | 46 +++--- src/linkml_store/utils/format_utils.py | 50 ++++--- src/linkml_store/utils/llm_utils.py | 3 +- src/linkml_store/utils/pandas_utils.py | 2 +- src/linkml_store/utils/sql_utils.py | 2 +- src/linkml_store/utils/vector_utils.py | 13 +- tests/test_api/test_api.py | 4 +- tests/test_api/test_mongodb_adapter.py | 14 +- tests/test_api/test_neo4j_adapter.py | 1 + tests/test_index/test_index.py | 1 - tests/test_inference/test_rag_engine.py | 9 +- tests/test_utils/test_dat_parser.py | 7 +- tests/test_utils/test_enrichment_analyzer.py | 138 +++++++++--------- 27 files changed, 283 insertions(+), 240 deletions(-) diff --git a/src/linkml_store/api/client.py b/src/linkml_store/api/client.py index 1208660..214e656 100644 --- a/src/linkml_store/api/client.py +++ b/src/linkml_store/api/client.py @@ -12,7 +12,6 @@ logger = logging.getLogger(__name__) - HANDLE_MAP = { "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase", "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase", @@ -220,14 +219,14 @@ def attach_database( scheme, _ = handle.split(":", 1) if scheme not in HANDLE_MAP: raise ValueError(f"Unknown scheme: {scheme}") - module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1) + module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1) try: module = importlib.import_module(module_path) cls = getattr(module, class_name) except ImportError as e: raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}") - #cls = HANDLE_MAP[scheme] + # cls = HANDLE_MAP[scheme] db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs) if schema_view: db.set_schema_view(schema_view) diff --git a/src/linkml_store/api/collection.py b/src/linkml_store/api/collection.py index a043bcb..556fbad 100644 --- a/src/linkml_store/api/collection.py +++ b/src/linkml_store/api/collection.py @@ -211,7 +211,7 @@ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): """ raise NotImplementedError - def index ( + def index( self, objs: Union[OBJECT, List[OBJECT]], index_name: Optional[str] = None, @@ -231,10 +231,13 @@ def index ( """ raise NotImplementedError - def upsert(self, - objs: Union[OBJECT, List[OBJECT]], - filter_fields: List[str], - update_fields: Union[List[str], None] = None, **kwargs): + def upsert( + self, + objs: Union[OBJECT, List[OBJECT]], + filter_fields: List[str], + update_fields: Union[List[str], None] = None, + **kwargs, + ): """ Add one or more objects to the collection. @@ -455,10 +458,10 @@ def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]: return None def find( - self, - where: Optional[Any] = None, - select_cols: Optional[List[str] ] = None, - **kwargs, + self, + where: Optional[Any] = None, + select_cols: Optional[List[str]] = None, + **kwargs, ) -> QueryResult: """ Find objects in the collection using a where query. 
@@ -596,6 +599,7 @@ def search( assert ix_coll.size() > 0 qr = ix_coll.find(where=where, limit=-1, **kwargs) index_col = ix.index_field + # TODO: optimize this for large indexes def row2array(row): v = row[index_col] @@ -603,6 +607,7 @@ def row2array(row): # sqlite stores arrays as strings v = json.loads(v) return np.array(v, dtype=float) + vector_pairs = [(row, row2array(row)) for row in qr.rows] results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs) for r in results: @@ -618,12 +623,12 @@ def row2array(row): return new_qr def group_by( - self, - group_by_fields: List[str], - inlined_field = "objects", - agg_map: Optional[Dict[str, str]] = None, - where: Optional[Dict] = None, - **kwargs, + self, + group_by_fields: List[str], + inlined_field="objects", + agg_map: Optional[Dict[str, str]] = None, + where: Optional[Dict] = None, + **kwargs, ) -> QueryResult: """ Group objects in the collection by a column. @@ -650,14 +655,9 @@ def group_by( top_obj = {k: v for k, v in zip(pk_fields, pk)} top_obj[inlined_field] = objs results.append(top_obj) - r = QueryResult( - num_rows=len(results), - rows=results - ) + r = QueryResult(num_rows=len(results), rows=results) return r - - @property def is_internal(self) -> bool: """ diff --git a/src/linkml_store/api/database.py b/src/linkml_store/api/database.py index 08ad918..ec4c2b3 100644 --- a/src/linkml_store/api/database.py +++ b/src/linkml_store/api/database.py @@ -617,7 +617,9 @@ def validate_database(self, **kwargs) -> List["ValidationResult"]: """ return list(self.iter_validate_database(**kwargs)) - def iter_validate_database(self, ensure_referential_integrity: bool = None, **kwargs) -> Iterator["ValidationResult"]: + def iter_validate_database( + self, ensure_referential_integrity: bool = None, **kwargs + ) -> Iterator["ValidationResult"]: """ Validate the contents of the database. 
diff --git a/src/linkml_store/api/stores/duckdb/duckdb_database.py b/src/linkml_store/api/stores/duckdb/duckdb_database.py index 50406b6..6fdffac 100644 --- a/src/linkml_store/api/stores/duckdb/duckdb_database.py +++ b/src/linkml_store/api/stores/duckdb/duckdb_database.py @@ -100,9 +100,9 @@ def _table_exists(self, table: str) -> bool: meta_query = Query( from_table="sqlite_master", where_clause={ - #"type": "table", + # "type": "table", "name": table, - } + }, ) else: if table.startswith("information_schema"): @@ -112,7 +112,7 @@ def _table_exists(self, table: str) -> bool: where_clause={ "table_type": "BASE TABLE", "table_name": table, - } + }, ) qr = self.query(meta_query) diff --git a/src/linkml_store/api/stores/filesystem/__init__.py b/src/linkml_store/api/stores/filesystem/__init__.py index 742d463..405eb7a 100644 --- a/src/linkml_store/api/stores/filesystem/__init__.py +++ b/src/linkml_store/api/stores/filesystem/__init__.py @@ -4,7 +4,7 @@ Handles have the form: - ``file:`` for a local file - """ +""" from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase diff --git a/src/linkml_store/api/stores/mongodb/mongodb_collection.py b/src/linkml_store/api/stores/mongodb/mongodb_collection.py index 6e37a8b..2868f58 100644 --- a/src/linkml_store/api/stores/mongodb/mongodb_collection.py +++ b/src/linkml_store/api/stores/mongodb/mongodb_collection.py @@ -41,13 +41,14 @@ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): del obj["_id"] self._post_insert_hook(objs) - - def index(self, - objs: Union[OBJECT, List[OBJECT]], - index_name: Optional[str] = None, - replace: bool = False, - unique: bool = False, - **kwargs): + def index( + self, + objs: Union[OBJECT, List[OBJECT]], + index_name: Optional[str] = None, + replace: bool = False, + unique: bool = False, + **kwargs, + ): """ Create indexes on the collection. @@ -86,11 +87,13 @@ def index(self, else: logging.debug(f"Index already exists for field {obj}, skipping creation.") - def upsert(self, - objs: Union[OBJECT, List[OBJECT]], - filter_fields: List[str], - update_fields: Optional[List[str]] = None, - **kwargs): + def upsert( + self, + objs: Union[OBJECT, List[OBJECT]], + filter_fields: List[str], + update_fields: Optional[List[str]] = None, + **kwargs, + ): """ Upsert one or more documents into the MongoDB collection. 
diff --git a/src/linkml_store/api/stores/solr/solr_collection.py b/src/linkml_store/api/stores/solr/solr_collection.py index bb80dd9..45a67f0 100644 --- a/src/linkml_store/api/stores/solr/solr_collection.py +++ b/src/linkml_store/api/stores/solr/solr_collection.py @@ -63,11 +63,11 @@ def query(self, query: Query, **kwargs) -> QueryResult: def query_facets( self, - where: Optional[Dict] = None, - facet_columns: List[str] = None, - facet_limit=DEFAULT_FACET_LIMIT, - facet_min_count: int = 1, - **kwargs + where: Optional[Dict] = None, + facet_columns: List[str] = None, + facet_limit=DEFAULT_FACET_LIMIT, + facet_min_count: int = 1, + **kwargs, ) -> Dict[str, Dict[str, int]]: solr_query = self._build_solr_query(where) solr_query["facet"] = "true" diff --git a/src/linkml_store/cli.py b/src/linkml_store/cli.py index 9b04618..7cb1bf5 100644 --- a/src/linkml_store/cli.py +++ b/src/linkml_store/cli.py @@ -142,7 +142,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, logger.setLevel(logging.ERROR) ctx.ensure_object(dict) if input: - database = "duckdb" # default: store in duckdb + database = "duckdb" # default: store in duckdb if input.startswith("http"): parts = input.split("/") collection = parts[-1] @@ -150,8 +150,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, else: stem = underscore(Path(input).stem) collection = stem - logger.info(f"Using input file: {input}, " - f"default storage is {database} and collection is {collection}") + logger.info(f"Using input file: {input}, " f"default storage is {database} and collection is {collection}") config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}}) if config is None and DEFAULT_LOCAL_CONF_PATH.exists(): config = DEFAULT_LOCAL_CONF_PATH @@ -206,7 +205,7 @@ def drop(ctx): @click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects") @click.option("--format", "-f", type=format_choice, help="Input format") @click.option("--object", "-i", multiple=True, help="Input object as YAML") -@click.option("--source-field", help="If provided, inject file path source as this field") +@click.option("--source-field", help="If provided, inject file path source as this field") @json_select_query_option @click.pass_context def insert(ctx, files, replace, object, format, source_field, json_select_query): @@ -632,10 +631,12 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output): value_key = tuple([row.get(att) for att in value_atts]) pivoted[index_key][column_key] = value_key pivoted_objs = [] + def detuple(t: Tuple) -> Any: if len(t) == 1: return t[0] return str(t) + for index_key, data in pivoted.items(): obj = {att: key for att, key in zip(index_atts, index_key)} for column_key, value_key in data.items(): @@ -651,16 +652,27 @@ def detuple(t: Tuple) -> Any: @click.option("--output", "-o", type=click.Path(), help="Output file path") @click.option("--sample-field", "-I", help="Field to use as the sample identifier") @click.option("--classification-field", "-L", help="Field to use as for classification") -@click.option("--p-value-threshold", "-P", type=click.FLOAT, - default=0.05, show_default=True, - help="P-value threshold for enrichment") -@click.option("--multiple-testing-correction", "-M", type=click.STRING, - default="bh", show_default=True, - help="Multiple test correction method") +@click.option( + "--p-value-threshold", + "-P", + type=click.FLOAT, + default=0.05, + show_default=True, + 
help="P-value threshold for enrichment", +) +@click.option( + "--multiple-testing-correction", + "-M", + type=click.STRING, + default="bh", + show_default=True, + help="Multiple test correction method", +) @click.argument("samples", type=click.STRING, nargs=-1) @click.pass_context def enrichment(ctx, where, limit, output_type, output, sample_field, classification_field, samples, **kwargs): from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer + collection = ctx.obj["settings"].collection where_clause = yaml.safe_load(where) if where else None column_atts = [sample_field, classification_field] @@ -683,6 +695,7 @@ def enrichment(ctx, where, limit, output_type, output, sample_field, classificat else: click.echo(output_data) + @cli.command() @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format") @click.option("--output", "-o", type=click.Path(), help="Output file path") @@ -690,7 +703,7 @@ def enrichment(ctx, where, limit, output_type, output, sample_field, classificat @click.option( "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)" ) -@click.option("--training-collection", type=click.STRING,help="Collection to use for training") +@click.option("--training-collection", type=click.STRING, help="Collection to use for training") @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file") @click.option("--export-model", "-E", type=click.Path(), help="Export model to file") @click.option("--load-model", "-L", type=click.Path(), help="Load model from file") @@ -905,8 +918,18 @@ def indexes(ctx): @cli.command() @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format") @click.option("--output", "-o", type=click.Path(), help="Output file path") -@click.option("--collection-only/--no-collection-only", default=False, show_default=True, help="Only validate specified collection") -@click.option("--ensure-referential-integrity/--no-ensure-referential-integrity", default=True, show_default=True, help="Ensure referential integrity") +@click.option( + "--collection-only/--no-collection-only", + default=False, + show_default=True, + help="Only validate specified collection", +) +@click.option( + "--ensure-referential-integrity/--no-ensure-referential-integrity", + default=True, + show_default=True, + help="Ensure referential integrity", +) @click.pass_context def validate(ctx, output_type, output, collection_only, **kwargs): """Validate objects in the specified collection.""" diff --git a/src/linkml_store/index/implementations/llm_indexer.py b/src/linkml_store/index/implementations/llm_indexer.py index 4d19944..e45858c 100644 --- a/src/linkml_store/index/implementations/llm_indexer.py +++ b/src/linkml_store/index/implementations/llm_indexer.py @@ -54,7 +54,9 @@ def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM: """ return self.texts_to_vectors([text], cache=cache, **kwargs)[0] - def texts_to_vectors(self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs) -> List[INDEX_ITEM]: + def texts_to_vectors( + self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs + ) -> List[INDEX_ITEM]: """ Use LLM to embed. 
@@ -67,6 +69,7 @@ def texts_to_vectors(self, texts: List[str], cache: bool = None, token_limit_pen :return: """ from tiktoken import encoding_for_model + logging.info(f"Converting {len(texts)} texts to vectors") model = self.embedding_model # TODO: make this more accurate diff --git a/src/linkml_store/index/indexer.py b/src/linkml_store/index/indexer.py index 70e227b..837ad84 100644 --- a/src/linkml_store/index/indexer.py +++ b/src/linkml_store/index/indexer.py @@ -154,8 +154,11 @@ def object_to_text(self, obj: Dict[str, Any]) -> str: return str(obj) def search( - self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None, - mmr_relevance_factor: Optional[float] = None + self, + query: str, + vectors: List[Tuple[str, INDEX_ITEM]], + limit: Optional[int] = None, + mmr_relevance_factor: Optional[float] = None, ) -> List[Tuple[float, Any]]: """ Use the indexer to search against a database of vectors. @@ -175,8 +178,8 @@ def search( vlist = [v for _, v in vectors] idlist = [id for id, _ in vectors] sorted_indices = mmr_diversified_search( - query_vector, vlist, - relevance_factor=mmr_relevance_factor, top_n=limit) + query_vector, vlist, relevance_factor=mmr_relevance_factor, top_n=limit + ) results = [] # TODO: this is inefficient when limit is high for i in range(limit): diff --git a/src/linkml_store/inference/implementations/llm_inference_engine.py b/src/linkml_store/inference/implementations/llm_inference_engine.py index 996221e..4cc10f2 100644 --- a/src/linkml_store/inference/implementations/llm_inference_engine.py +++ b/src/linkml_store/inference/implementations/llm_inference_engine.py @@ -79,21 +79,24 @@ def object_to_text(self, object: OBJECT) -> str: def _schema_str(self) -> str: db = self.training_data.base_collection.parent from linkml_runtime.dumpers import json_dumper + schema_dict = json_dumper.to_dict(db.schema_view.schema) return yaml.dump(schema_dict) - def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]: + def derive( + self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None + ) -> Optional[LLMInference]: import llm model: llm.Model = self.model - #model_name = self.config.llm_config.model_name - #feature_attributes = self.config.feature_attributes + # model_name = self.config.llm_config.model_name + # feature_attributes = self.config.feature_attributes target_attributes = self.config.target_attributes query_text = self.object_to_text(object) if not target_attributes: target_attributes = [k for k, v in object.items() if v is None or v == ""] - #if not feature_attributes: + # if not feature_attributes: # feature_attributes = [k for k, v in object.items() if v is not None and v != ""] system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config) @@ -107,7 +110,9 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ "```yaml\n" f"{stub}\n" "```\n" - "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n" + "---\nQuery:\n" + f"## INCOMPLETE OBJECT:\n{query_text}\n" + "## OUTPUT:\n" ) logger.info(f"Prompt: {prompt}") response = model.prompt(prompt, system=system_prompt) @@ -130,9 +135,8 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ "\nThis was invalid.\n", "Validation errors:\n", ] + [self.object_to_text(e) for e in errs] - return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts) - return 
LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object) - + return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts) + return LLMInference(predicted_object=predicted_object, iterations=iteration + 1, query=object) def export_model( self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs @@ -149,4 +153,4 @@ def save_model(self, output: Union[str, Path]) -> None: @classmethod def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine": - raise NotImplementedError("Does not make sense for this engine") \ No newline at end of file + raise NotImplementedError("Does not make sense for this engine") diff --git a/src/linkml_store/inference/implementations/rag_inference_engine.py b/src/linkml_store/inference/implementations/rag_inference_engine.py index 64d321a..942801b 100644 --- a/src/linkml_store/inference/implementations/rag_inference_engine.py +++ b/src/linkml_store/inference/implementations/rag_inference_engine.py @@ -111,7 +111,9 @@ def initialize_model(self, **kwargs): def object_to_text(self, object: OBJECT) -> str: return yaml.dump(object) - def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]: + def derive( + self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None + ) -> Optional[RAGInference]: import llm from tiktoken import encoding_for_model @@ -131,8 +133,9 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ if not self.rag_collection.indexers: raise ValueError("RAG collection must have an indexer attached") logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}") - rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm", - mmr_relevance_factor=mmr_relevance_factor) + rs = self.rag_collection.search( + query_text, limit=num_examples, index_name="llm", mmr_relevance_factor=mmr_relevance_factor + ) examples = rs.rows logger.info(f"Found {len(examples)} examples") if not examples: @@ -153,11 +156,11 @@ def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[ input_obj_text = self.object_to_text(input_obj) if input_obj_text == query_text: continue - #raise ValueError( + # raise ValueError( # f"Query object {query_text} is the same as example object {input_obj_text}\n" # "This indicates possible test data leakage\n." 
# "TODO: allow an option that allows user to treat this as a basic lookup\n" - #) + # ) output_obj = select_nested(example, target_attributes) prompt_clause = ( "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n" @@ -176,9 +179,9 @@ def make_text(texts: List[str]): except KeyError: encoding = encoding_for_model("gpt-4") token_limit = get_token_limit(model_name) - prompt = render_formatted_text(make_text, values=prompt_clauses, - encoding=encoding, token_limit=token_limit, - additional_text=system_prompt) + prompt = render_formatted_text( + make_text, values=prompt_clauses, encoding=encoding, token_limit=token_limit, additional_text=system_prompt + ) logger.info(f"Prompt: {prompt}") response = model.prompt(prompt, system=system_prompt) yaml_str = response.text() @@ -199,8 +202,8 @@ def make_text(texts: List[str]): "\nThis was invalid.\n", "Validation errors:\n", ] + [self.object_to_text(e) for e in errs] - return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts) - return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object) + return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts) + return RAGInference(predicted_object=predicted_object, iterations=iteration + 1, query=object) def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]: if "```" in yaml_str: diff --git a/src/linkml_store/inference/inference_config.py b/src/linkml_store/inference/inference_config.py index 1556d27..538320a 100644 --- a/src/linkml_store/inference/inference_config.py +++ b/src/linkml_store/inference/inference_config.py @@ -59,6 +59,7 @@ class Inference(BaseModel, extra="forbid"): """ Result of an inference derivation. """ + query: Optional[OBJECT] = Field(default=None, description="The query object.") predicted_object: OBJECT = Field(..., description="The predicted object.") confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0) diff --git a/src/linkml_store/utils/dat_parser.py b/src/linkml_store/utils/dat_parser.py index 625ada8..f271dfe 100644 --- a/src/linkml_store/utils/dat_parser.py +++ b/src/linkml_store/utils/dat_parser.py @@ -2,6 +2,7 @@ ENTRY = Dict[str, Any] + def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: """ Parse SIB/Swiss-Prot format data into a structured dictionary. 
@@ -13,7 +14,7 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: dict: A dictionary with entry IDs as keys and parsed data as values """ # Split the text into entries (separated by //) - entries = text.split('//\n') + entries = text.split("//\n") header = None # Initialize results dictionary @@ -29,12 +30,12 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: current_code = None # Process each line - for line in entry.strip().split('\n'): + for line in entry.strip().split("\n"): if not line.strip(): continue # Check if this is a new field (starts with a 2-letter code followed by space) - if len(line) > 2 and line[2] == ' ': + if len(line) > 2 and line[2] == " ": current_code = line[0:2] # Remove the code and the following space(s) value = line[3:].strip() @@ -48,7 +49,7 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: # Continuation of previous field elif current_code is not None: # Handle continuation lines (typically indented) - if current_code == 'CC': + if current_code == "CC": # For comments, preserve the indentation current_entry[current_code].append(line) else: @@ -59,35 +60,36 @@ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]: # -!- ... # ... # -!- ... - ccs = current_entry.get('CC', []) + ccs = current_entry.get("CC", []) new_ccs = [] for cc in ccs: - if not cc.startswith('-!-') and new_ccs: + if not cc.startswith("-!-") and new_ccs: new_ccs[-1] += " " + cc else: new_ccs.append(cc) - current_entry['CC'] = new_ccs + current_entry["CC"] = new_ccs for k, vs in current_entry.items(): - if k != 'CC': - combined = ''.join(vs) + if k != "CC": + combined = "".join(vs) combined = combined.strip() if combined.endswith("."): combined = combined.split(".") combined = [c.strip() for c in combined if c.strip()] - if k == 'DE': + if k == "DE": combined = combined[0] current_entry[k] = combined - if 'ID' in current_entry: + if "ID" in current_entry: results.append(current_entry) else: header = current_entry return header, results + # Example usage: # data = parse_sib_format(text) # for entry_id, entry_data in data.items(): # print(f"Entry: {entry_id}") # for code, values in entry_data.items(): -# print(f" {code}: {values}") \ No newline at end of file +# print(f" {code}: {values}") diff --git a/src/linkml_store/utils/enrichment_analyzer.py b/src/linkml_store/utils/enrichment_analyzer.py index 5b60058..0d759c7 100644 --- a/src/linkml_store/utils/enrichment_analyzer.py +++ b/src/linkml_store/utils/enrichment_analyzer.py @@ -10,6 +10,7 @@ class EnrichedCategory(BaseModel): """ Information about a category enriched in a sample """ + category: str fold_change: float original_p_value: float @@ -41,7 +42,7 @@ def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str): self.sample_cache: Dict[str, Counter] = {} @classmethod - def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> 'EnrichmentAnalyzer': + def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer": """ Initialize the analyzer with a Collection and key column names. Precomputes category frequencies for the entire dataset. 
@@ -91,7 +92,7 @@ def _get_sample_stats(self, sample_id: str) -> Counter: if sample_data.empty: raise KeyError(f"Sample ID '{sample_id}' not found") sample_data = sample_data.dropna() - #if sample_data.empty: + # if sample_data.empty: # raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA") counter = Counter() @@ -104,10 +105,13 @@ def _get_sample_stats(self, sample_id: str) -> Counter: self.sample_cache[sample_id] = counter return counter - def find_enriched_categories(self, sample_id: str, - min_occurrences: int = 5, - p_value_threshold: float = 0.05, - multiple_testing_correction: str = 'bh') -> List[EnrichedCategory]: + def find_enriched_categories( + self, + sample_id: str, + min_occurrences: int = 5, + p_value_threshold: float = 0.05, + multiple_testing_correction: str = "bh", + ) -> List[EnrichedCategory]: """ Find categories that are enriched in the given sample. @@ -135,14 +139,18 @@ def find_enriched_categories(self, sample_id: str, # Calculate fold change sample_freq = sample_count / total_sample_annotations global_freq = global_count / total_global_annotations - fold_change = sample_freq / global_freq if global_freq > 0 else float('inf') + fold_change = sample_freq / global_freq if global_freq > 0 else float("inf") # Perform Fisher's exact test - contingency_table = np.array([ - [sample_count, global_count - sample_count], - [total_sample_annotations - sample_count, - total_global_annotations - total_sample_annotations - (global_count - sample_count)] - ]) + contingency_table = np.array( + [ + [sample_count, global_count - sample_count], + [ + total_sample_annotations - sample_count, + total_global_annotations - total_sample_annotations - (global_count - sample_count), + ], + ] + ) _, p_value = stats.fisher_exact(contingency_table) @@ -158,12 +166,12 @@ def find_enriched_categories(self, sample_id: str, # Apply multiple testing correction categories, fold_changes, p_values = zip(*results) - if multiple_testing_correction.lower() == 'bonf': + if multiple_testing_correction.lower() == "bonf": # Bonferroni correction n_tests = len(self.global_stats) # Total number of categories tested adjusted_p_values = [min(1.0, p * n_tests) for p in p_values] - elif multiple_testing_correction.lower() == 'bh': + elif multiple_testing_correction.lower() == "bh": # Benjamini-Hochberg correction n = len(p_values) sorted_indices = np.argsort(p_values) @@ -192,12 +200,7 @@ def find_enriched_categories(self, sample_id: str, # Filter by adjusted p-value threshold and create final results # Create EnrichedCategory objects final_results = [ - EnrichedCategory( - category=cat, - fold_change=fc, - original_p_value=p, - adjusted_p_value=adj_p - ) + EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p) for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values) if adj_p < p_value_threshold ] @@ -206,8 +209,9 @@ def find_enriched_categories(self, sample_id: str, final_results.sort(key=lambda x: x.adjusted_p_value) return final_results + # Example usage: # analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories') # enriched = analyzer.find_enriched_categories('sample1') # for category, fold_change, p_value in enriched: -# print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})") \ No newline at end of file +# print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})") diff --git a/src/linkml_store/utils/format_utils.py b/src/linkml_store/utils/format_utils.py index c9f9f85..9e1d589 100644 --- 
a/src/linkml_store/utils/format_utils.py +++ b/src/linkml_store/utils/format_utils.py @@ -139,12 +139,13 @@ def clean_nested_structure(obj): else: return clean_pandas_value(obj) + def process_file( - f: IO, - format: Format, - expected_type: Optional[Type] = None, - header_comment_token: Optional[str] = None, - format_options: Optional[Dict[str, Any]] = None, + f: IO, + format: Format, + expected_type: Optional[Type] = None, + header_comment_token: Optional[str] = None, + format_options: Optional[Dict[str, Any]] = None, ) -> List[Dict[str, Any]]: """ Process a single file and return a list of objects. @@ -173,6 +174,7 @@ def process_file( objs = yaml.safe_load(f) elif format == Format.TOML: import toml + objs = toml.load(f) if not isinstance(objs, list): objs = [objs] @@ -214,13 +216,15 @@ def process_file( for line in f: parts = line.strip().split("\t") desc = parts[1] - objs.append({ - "library": lib_name, - "uid": f"{lib_name}.{parts[0]}", - "name": parts[0], - "description": desc if desc else None, - "genes": parts[2:], - }) + objs.append( + { + "library": lib_name, + "uid": f"{lib_name}.{parts[0]}", + "name": parts[0], + "description": desc if desc else None, + "genes": parts[2:], + } + ) elif format == Format.FASTA: objs = [] current_obj = None @@ -237,29 +241,33 @@ def process_file( elif format == Format.OBO: blocks = split_document(f.read(), "\n\n") id_pattern = re.compile(r"id: (\S+)") + def get_id(block): m = id_pattern.search(block) return m.group(1) if m else None + objs = [{"id": get_id(block), "content": block} for block in blocks] objs = [obj for obj in objs if obj["id"]] elif format == Format.DAT: from linkml_store.utils.dat_parser import parse_sib_format + _, objs = parse_sib_format(f.read()) elif format in (Format.RDFXML, Format.TURTLE): import lightrdf + parser = lightrdf.Parser() objs = [] ext_fmt = "rdfxml" if format == Format.TURTLE: ext_fmt = "ttl" - bytesio = io.BytesIO(f.read().encode('utf-8')) + bytesio = io.BytesIO(f.read().encode("utf-8")) buffer = io.BufferedReader(bytesio) for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt): obj = { - "subject": s, - "predicate": p, - "object": o, - } + "subject": s, + "predicate": p, + "object": o, + } if format_options.get("pivot", False): obj = { "subject": s, @@ -389,7 +397,8 @@ def write_output( def render_output( - data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]], format: Optional[Union[Format, str]] = Format.YAML + data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]], + format: Optional[Union[Format, str]] = Format.YAML, ) -> str: """ Render output data in JSON, JSONLines, YAML, CSV, or TSV format. @@ -441,11 +450,14 @@ def render_output( elif format == Format.PYTHON: return str(data) elif format == Format.MARKDOWN: + def as_markdown(obj: dict): return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()]) + return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data) elif format == Format.TABLE: from tabulate import tabulate + return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql") elif format == Format.YAML: if isinstance(data, list): @@ -510,4 +522,4 @@ def split_document(doc: str, delimiter: str): :param delimiter: The delimiter. :return: The parts of the document. 
""" - return doc.split(delimiter) \ No newline at end of file + return doc.split(delimiter) diff --git a/src/linkml_store/utils/llm_utils.py b/src/linkml_store/utils/llm_utils.py index f28cd15..ca53be1 100644 --- a/src/linkml_store/utils/llm_utils.py +++ b/src/linkml_store/utils/llm_utils.py @@ -105,6 +105,7 @@ def get_token_limit(model_name: str) -> int: def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]: import yaml + if "```" in yaml_str: yaml_str = yaml_str.split("```")[1].strip() if yaml_str.startswith("yaml"): @@ -115,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]: if strict: raise e logger.error(f"Error parsing YAML: {yaml_str}\n{e}") - return None \ No newline at end of file + return None diff --git a/src/linkml_store/utils/pandas_utils.py b/src/linkml_store/utils/pandas_utils.py index 4355bb2..5229dd1 100644 --- a/src/linkml_store/utils/pandas_utils.py +++ b/src/linkml_store/utils/pandas_utils.py @@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame: def facet_summary_to_dataframe_unmelted( - facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]] + facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]], ) -> pd.DataFrame: rows = [] diff --git a/src/linkml_store/utils/sql_utils.py b/src/linkml_store/utils/sql_utils.py index 01004aa..bac73d8 100644 --- a/src/linkml_store/utils/sql_utils.py +++ b/src/linkml_store/utils/sql_utils.py @@ -116,7 +116,7 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul modified_where = " AND ".join(conditions) def make_col_safe(col): - return '"' + quoted_name(col, True) + '"' if ' ' in col else col + return '"' + quoted_name(col, True) + '"' if " " in col else col if isinstance(facet_column, str): facet_column = make_col_safe(facet_column) diff --git a/src/linkml_store/utils/vector_utils.py b/src/linkml_store/utils/vector_utils.py index 98e727e..f091206 100644 --- a/src/linkml_store/utils/vector_utils.py +++ b/src/linkml_store/utils/vector_utils.py @@ -8,6 +8,7 @@ LOL = List[List[float]] + def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float: """ Calculate the cosine similarity between two vectors. 
@@ -77,9 +78,7 @@ def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.nd return top_match_indices, top_match_values -def top_n_matches( - cosine_similarity_matrix: np.ndarray, n: int = 10 -) -> Tuple[np.ndarray, np.ndarray]: +def top_n_matches(cosine_similarity_matrix: np.ndarray, n: int = 10) -> Tuple[np.ndarray, np.ndarray]: # Find the indices that would sort each row in descending order sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1) @@ -136,10 +135,7 @@ def mmr_diversified_search( max_sim_to_selected = max( [ np.dot(document_vectors[idx], document_vectors[s]) - / ( - np.linalg.norm(document_vectors[idx]) - * np.linalg.norm(document_vectors[s]) - ) + / (np.linalg.norm(document_vectors[idx]) * np.linalg.norm(document_vectors[s])) for s in selected_indices ] ) @@ -160,6 +156,3 @@ def mmr_diversified_search( selected_indices.add(best_index) return result_indices - - - diff --git a/tests/test_api/test_api.py b/tests/test_api/test_api.py index e395a4e..2952b03 100644 --- a/tests/test_api/test_api.py +++ b/tests/test_api/test_api.py @@ -65,7 +65,7 @@ def is_persistent(handle: str) -> bool: - #if "duckdb" in handle: + # if "duckdb" in handle: # # NOTE: in previous versions of duckdb, in-memory databases were not persistent # return True return ".db" in handle or "mongodb" in handle or "file:" in handle @@ -334,8 +334,6 @@ def test_group_by(handle): assert False, f"Unexpected id: {row['id']}" - - @pytest.mark.parametrize("handle", SCHEMES_PLUS) def test_collections_of_same_type(handle): """ diff --git a/tests/test_api/test_mongodb_adapter.py b/tests/test_api/test_mongodb_adapter.py index 66e9269..5f3f58a 100644 --- a/tests/test_api/test_mongodb_adapter.py +++ b/tests/test_api/test_mongodb_adapter.py @@ -7,6 +7,7 @@ from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection from pymongo import MongoClient + @pytest.fixture(scope="module") def mongodb_client(): try: @@ -220,11 +221,9 @@ def test_index_creation(mongodb_collection, unique_flag): mongodb_collection.mongo_collection.delete_many({}) # Insert **unique, non-null** values for test_field to avoid duplicate key error - mongodb_collection.mongo_collection.insert_many([ - {"_id": 1, "test_field": "value1"}, - {"_id": 2, "test_field": "value2"}, - {"_id": 3, "test_field": "value3"} - ]) + mongodb_collection.mongo_collection.insert_many( + [{"_id": 1, "test_field": "value1"}, {"_id": 2, "test_field": "value2"}, {"_id": 3, "test_field": "value3"}] + ) # Create the index using the method with the unique flag mongodb_collection.index(index_field, index_name=index_name, replace=True, unique=unique_flag) @@ -239,5 +238,6 @@ def test_index_creation(mongodb_collection, unique_flag): if unique_flag: assert created_indexes[index_name]["unique"], f"Index {index_name} should be unique" else: - assert "unique" not in created_indexes[index_name] or not created_indexes[index_name]["unique"], \ - f"Index {index_name} should not be unique" + assert ( + "unique" not in created_indexes[index_name] or not created_indexes[index_name]["unique"] + ), f"Index {index_name} should not be unique" diff --git a/tests/test_api/test_neo4j_adapter.py b/tests/test_api/test_neo4j_adapter.py index 13e2e85..33b7dab 100644 --- a/tests/test_api/test_neo4j_adapter.py +++ b/tests/test_api/test_neo4j_adapter.py @@ -14,6 +14,7 @@ neo4j """ + import pytest from linkml_runtime import SchemaView from linkml_runtime.utils.schema_builder import SchemaBuilder diff --git a/tests/test_index/test_index.py 
b/tests/test_index/test_index.py index e60f8f4..2bb5826 100644 --- a/tests/test_index/test_index.py +++ b/tests/test_index/test_index.py @@ -56,4 +56,3 @@ def test_index(index_class, texts): # Ensure the queried text appears at the top of the search results exact_matches = [r[1] for r in results if np.isclose(r[0], 1.0, rtol=1e-3)] assert text_id in exact_matches, f"Exact match not found in : {results}" - diff --git a/tests/test_inference/test_rag_engine.py b/tests/test_inference/test_rag_engine.py index 7cf6e24..274bc48 100644 --- a/tests/test_inference/test_rag_engine.py +++ b/tests/test_inference/test_rag_engine.py @@ -105,7 +105,6 @@ def test_inference_nested(handle): # check_accuracy2(ie2, targets, threshold=0.33, features=features, test_data=ie.testing_data.as_dataframe()) - @pytest.mark.integration @pytest.mark.parametrize("handle", SCHEMES) def test_with_validation(handle): @@ -172,7 +171,9 @@ def test_with_validation(handle): sb.add_slot("predicate", range="PredicateType", replace_if_present=True) sv = SchemaView(sb.schema) collection.parent.set_schema_view(sv) - errs = list(collection.iter_validate_collection([{"triples": [{"subject": "a", "predicate": "unknown", "object": "b"}]}])) + errs = list( + collection.iter_validate_collection([{"triples": [{"subject": "a", "predicate": "unknown", "object": "b"}]}]) + ) assert len(errs) == 1 result = ie.derive({"paper": {"abstract": "Mark Hamill played a starring role in the movie Star Wars"}}) assert result @@ -184,7 +185,3 @@ def test_with_validation(handle): # (note that in future this unit test could conceivably be used in training models, in which case # it will need to be modified to a different hard-to-guess predicate) assert result.iterations > 1 - - - - diff --git a/tests/test_utils/test_dat_parser.py b/tests/test_utils/test_dat_parser.py index 6eddd02..191bc59 100644 --- a/tests/test_utils/test_dat_parser.py +++ b/tests/test_utils/test_dat_parser.py @@ -3,13 +3,14 @@ DAT_FILE = INPUT_DIR / "expasy-subset.dat" + def test_parse_dat(): - entries = process_file(open(DAT_FILE) , Format.DAT) + entries = process_file(open(DAT_FILE), Format.DAT) assert len(entries) == 2 e1 = entries[0] dr1 = e1["DR"] - assert dr1.endswith('Q46856, YQHD_ECOLI ;') + assert dr1.endswith("Q46856, YQHD_ECOLI ;") de1 = e1["DE"] - assert de1 == 'alcohol dehydrogenase (NADP(+))' + assert de1 == "alcohol dehydrogenase (NADP(+))" cc1 = e1["CC"] assert len(cc1) == 4 diff --git a/tests/test_utils/test_enrichment_analyzer.py b/tests/test_utils/test_enrichment_analyzer.py index e8928f3..cfc92c5 100644 --- a/tests/test_utils/test_enrichment_analyzer.py +++ b/tests/test_utils/test_enrichment_analyzer.py @@ -2,31 +2,41 @@ import pandas as pd import numpy as np from collections import Counter -from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer # Assuming the previous code is in enrichment_analysis.py +from linkml_store.utils.enrichment_analyzer import ( + EnrichmentAnalyzer, +) # Assuming the previous code is in enrichment_analysis.py @pytest.fixture def sample_df(): """Create a test DataFrame with known enrichment patterns""" data = { - 'sample_id': [ - 'sample1', 'sample1', 'sample1', 'sample1', 'sample1', - 'sample2', 'sample2', 'sample2', - 'sample3', 'sample3', 'sample3' + "sample_id": [ + "sample1", + "sample1", + "sample1", + "sample1", + "sample1", + "sample2", + "sample2", + "sample2", + "sample3", + "sample3", + "sample3", + ], + "categories": [ + ["A", "B"], + ["A", "C"], + ["A", "B"], + ["B", "C"], + ["A"], + ["C", "D"], + ["C", "D"], + 
["D", "E"], + ["E", "F"], + ["E", "F"], + ["F", "G"], ], - 'categories': [ - ['A', 'B'], - ['A', 'C'], - ['A', 'B'], - ['B', 'C'], - ['A'], - ['C', 'D'], - ['C', 'D'], - ['D', 'E'], - ['E', 'F'], - ['E', 'F'], - ['F', 'G'] - ] } return pd.DataFrame(data) @@ -34,64 +44,52 @@ def sample_df(): @pytest.fixture def analyzer(sample_df): """Create an EnrichmentAnalyzer instance with the sample data""" - return EnrichmentAnalyzer(sample_df, 'sample_id', 'categories') + return EnrichmentAnalyzer(sample_df, "sample_id", "categories") def test_initialization(analyzer, sample_df): """Test that the analyzer initializes correctly""" assert analyzer.df.equals(sample_df) - assert analyzer.sample_key == 'sample_id' - assert analyzer.classification_key == 'categories' + assert analyzer.sample_key == "sample_id" + assert analyzer.classification_key == "categories" assert isinstance(analyzer.global_stats, Counter) assert len(analyzer.sample_cache) == 0 def test_global_stats_computation(analyzer): """Test that global statistics are computed correctly""" - expected_counts = { - 'A': 4, - 'B': 3, - 'C': 4, - 'D': 3, - 'E': 3, - 'F': 3, - 'G': 1 - } + expected_counts = {"A": 4, "B": 3, "C": 4, "D": 3, "E": 3, "F": 3, "G": 1} assert dict(analyzer.global_stats) == expected_counts def test_sample_stats_computation(analyzer): """Test that sample-specific statistics are computed correctly""" - sample1_stats = analyzer._get_sample_stats('sample1') - expected_sample1 = { - 'A': 4, - 'B': 3, - 'C': 2 - } + sample1_stats = analyzer._get_sample_stats("sample1") + expected_sample1 = {"A": 4, "B": 3, "C": 2} assert dict(sample1_stats) == expected_sample1 # Test caching - assert 'sample1' in analyzer.sample_cache - assert dict(analyzer.sample_cache['sample1']) == expected_sample1 + assert "sample1" in analyzer.sample_cache + assert dict(analyzer.sample_cache["sample1"]) == expected_sample1 def test_enrichment_analysis(analyzer): """Test the enrichment analysis results with different multiple testing corrections""" # Test without correction - enriched_none = analyzer.find_enriched_categories('sample1', min_occurrences=2, - p_value_threshold=0.05, - multiple_testing_correction='none') + enriched_none = analyzer.find_enriched_categories( + "sample1", min_occurrences=2, p_value_threshold=0.05, multiple_testing_correction="none" + ) # Test with Bonferroni correction - enriched_bonf = analyzer.find_enriched_categories('sample1', min_occurrences=2, - p_value_threshold=0.05, - multiple_testing_correction='bonf') + enriched_bonf = analyzer.find_enriched_categories( + "sample1", min_occurrences=2, p_value_threshold=0.05, multiple_testing_correction="bonf" + ) # Test with Benjamini-Hochberg correction - enriched_bh = analyzer.find_enriched_categories('sample1', min_occurrences=2, - p_value_threshold=0.05, - multiple_testing_correction='bh') + enriched_bh = analyzer.find_enriched_categories( + "sample1", min_occurrences=2, p_value_threshold=0.05, multiple_testing_correction="bh" + ) # Convert results to more easily testable format enriched_dict_none = {result.category: result for result in enriched_none} @@ -103,12 +101,12 @@ def test_enrichment_analysis(analyzer): assert len(enriched_bh) >= len(enriched_bonf) # BH should find more than Bonferroni # Check that A and B are enriched in at least one method - assert any(('A' in d) for d in [enriched_dict_none, enriched_dict_bonf, enriched_dict_bh]) + assert any(("A" in d) for d in [enriched_dict_none, enriched_dict_bonf, enriched_dict_bh]) # Check fold changes make sense for enriched_dict in 
[enriched_dict_none, enriched_dict_bonf, enriched_dict_bh]: - if 'A' in enriched_dict: - result = enriched_dict['A'] + if "A" in enriched_dict: + result = enriched_dict["A"] assert result.fold_change > 1.0 # Should be enriched # Check p-values and adjusted p-values are valid @@ -116,55 +114,51 @@ def test_enrichment_analysis(analyzer): for result in enriched_dict.values(): assert 0 <= result.original_p_value <= 1 assert 0 <= result.adjusted_p_value <= 1 - assert result.adjusted_p_value >= result.original_p_value # Adjusted p-value should never be smaller than original + assert ( + result.adjusted_p_value >= result.original_p_value + ) # Adjusted p-value should never be smaller than original def test_edge_cases(sample_df): """Test edge cases and potential error conditions""" # Test empty DataFrame - empty_df = pd.DataFrame({'sample_id': [], 'categories': []}) - analyzer_empty = EnrichmentAnalyzer(empty_df, 'sample_id', 'categories') + empty_df = pd.DataFrame({"sample_id": [], "categories": []}) + analyzer_empty = EnrichmentAnalyzer(empty_df, "sample_id", "categories") assert len(analyzer_empty.global_stats) == 0 # Test single category - single_cat_data = { - 'sample_id': ['sample1', 'sample2'], - 'categories': [['A'], ['A']] - } + single_cat_data = {"sample_id": ["sample1", "sample2"], "categories": [["A"], ["A"]]} single_cat_df = pd.DataFrame(single_cat_data) - analyzer_single = EnrichmentAnalyzer(single_cat_df, 'sample_id', 'categories') - assert dict(analyzer_single.global_stats) == {'A': 2} + analyzer_single = EnrichmentAnalyzer(single_cat_df, "sample_id", "categories") + assert dict(analyzer_single.global_stats) == {"A": 2} # Test non-list categories (string input) - string_cat_data = { - 'sample_id': ['sample1', 'sample2'], - 'categories': ['A', 'B'] - } + string_cat_data = {"sample_id": ["sample1", "sample2"], "categories": ["A", "B"]} string_cat_df = pd.DataFrame(string_cat_data) - analyzer_string = EnrichmentAnalyzer(string_cat_df, 'sample_id', 'categories') - assert dict(analyzer_string.global_stats) == {'A': 1, 'B': 1} + analyzer_string = EnrichmentAnalyzer(string_cat_df, "sample_id", "categories") + assert dict(analyzer_string.global_stats) == {"A": 1, "B": 1} def test_invalid_sample_id(analyzer): """Test behavior with invalid sample ID""" with pytest.raises(KeyError): - analyzer._get_sample_stats('nonexistent_sample') + analyzer._get_sample_stats("nonexistent_sample") def test_min_occurrences_filter(analyzer): """Test that minimum occurrences filter works""" # Set high minimum occurrences to filter out most categories - enriched = analyzer.find_enriched_categories('sample1', min_occurrences=10) + enriched = analyzer.find_enriched_categories("sample1", min_occurrences=10) assert len(enriched) == 0 # No categories should meet this threshold def test_p_value_threshold(analyzer): """Test that p-value threshold works""" # Set very strict p-value threshold - strict_enriched = analyzer.find_enriched_categories('sample1', p_value_threshold=0.0001) + strict_enriched = analyzer.find_enriched_categories("sample1", p_value_threshold=0.0001) # Set loose p-value threshold - loose_enriched = analyzer.find_enriched_categories('sample1', p_value_threshold=0.5) + loose_enriched = analyzer.find_enriched_categories("sample1", p_value_threshold=0.5) # Should find more enriched categories with looser threshold assert len(strict_enriched) <= len(loose_enriched) @@ -172,10 +166,10 @@ def test_p_value_threshold(analyzer): def test_result_sorting(analyzer): """Test that results are properly sorted by 
p-value""" - enriched = analyzer.find_enriched_categories('sample1') + enriched = analyzer.find_enriched_categories("sample1") p_values = [p for _, _, p in enriched] assert p_values == sorted(p_values) # Should be sorted in ascending order -if __name__ == '__main__': - pytest.main([__file__]) \ No newline at end of file +if __name__ == "__main__": + pytest.main([__file__]) From 993845b563c8bda51366ff098b7a3b5432ec783a Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 6 Mar 2025 19:14:48 -0800 Subject: [PATCH 3/3] checks --- .../inference/implementations/sklearn_inference_engine.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/linkml_store/inference/implementations/sklearn_inference_engine.py b/src/linkml_store/inference/implementations/sklearn_inference_engine.py index c8f8810..2f83990 100644 --- a/src/linkml_store/inference/implementations/sklearn_inference_engine.py +++ b/src/linkml_store/inference/implementations/sklearn_inference_engine.py @@ -94,6 +94,8 @@ def initialize_model(self, **kwargs): if not feature_cols: feature_cols = df.columns.difference(target_cols).tolist() self.config.feature_attributes = feature_cols + if not feature_cols: + raise ValueError("No features found in the data") target_col = target_cols[0] logger.info(f"Feature columns: {feature_cols}") X = df[feature_cols].copy() @@ -102,6 +104,8 @@ def initialize_model(self, **kwargs): # find list of features to skip (categorical with > N categories) skip_features = [] + if not len(X.columns): + raise ValueError("No features to train on") for col in X.columns: unique_values = self._get_unique_values(X[col]) if len(unique_values) > self.maximum_proportion_distinct_features * len(X[col]): @@ -115,6 +119,8 @@ def initialize_model(self, **kwargs): # Encode features encoded_features = [] + if not len(X.columns): + raise ValueError(f"No features to train on from after skipping {skip_features}") for col in X.columns: logger.info(f"Checking whether to encode: {col}") col_encoder = self._get_encoder(X[col]) @@ -153,7 +159,7 @@ def initialize_model(self, **kwargs): y = y_encoder.fit_transform(y.values.ravel()) # Convert to 1D numpy array self.transformed_targets = y_encoder.classes_ - # print(f"Fitting model with features: {X.columns}") + # print(f"Fitting model with features: {X.columns}, y={y}, X={X}") clf = DecisionTreeClassifier(random_state=42) clf.fit(X, y) self.classifier = clf