Merge pull request #21 from linkml/web-api
web api
cmungall authored Jul 26, 2024
2 parents a5ce8d0 + 01addac commit 48c6a08
Showing 17 changed files with 1,051 additions and 178 deletions.
115 changes: 76 additions & 39 deletions docs/how-to/Index-caDSR.ipynb

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions docs/manual/data-model.ipynb
@@ -0,0 +1,39 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Data Model\n",
"\n",
"The LinkML-Store data model is based around a three-level structure:\n",
"\n",
" * A `Client` "
],
"metadata": {
"collapsed": false
},
"id": "d3371bb475f6fe4a"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
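The markdown cell above stops after its first bullet; the other two levels, as exercised throughout this PR (`client.get_database(...)`, `database.get_collection(...)`, `collection.find(...)`), are `Database` and `Collection`. A minimal sketch of walking the three-level hierarchy — the attach handle, collection name, and sample objects are illustrative assumptions:

from linkml_store import Client

# Level 1: the Client is the top-level entry point
client = Client()

# Level 2: a Database, here an in-memory DuckDB instance (assumed handle)
db = client.attach_database("duckdb", alias="demo")

# Level 3: a Collection of objects of one type
collection = db.create_collection("Person", alias="persons")
collection.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])

result = collection.find({"name": "Alice"})
print(result.num_rows, result.rows)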
45 changes: 33 additions & 12 deletions src/linkml_data_browser/app.py
@@ -1,18 +1,28 @@
import logging
import os
from typing import Any, Dict

import numpy as np
import pandas as pd
import streamlit as st
import yaml
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from linkml_store import Client
from linkml_store.api import Collection
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
from linkml_store.api.queries import QueryResult

logger = logging.getLogger(__name__)

# Set page config to make layout "wide" by default
st.set_page_config(layout="wide")

config = None
if os.environ.get("LINKML_STORE_CONFIG"):
with open(os.environ["LINKML_STORE_CONFIG"], "r") as f:
config = yaml.safe_load(f)

# Initialize client
client = Client().from_config(config) if config else Client()

DEFAULT_LIMIT = 25

@@ -33,14 +43,16 @@ def init_reset_filters(cd: ClassDefinition, reset=False):
st.session_state[key] = "" # Assuming text input, adjust for other types


def apply_filters(collection: Collection, filters: Dict[str, Any], offset: int, limit: int, **kwargs):
print(f"FILTERS={filters}")
return collection.find(filters, offset=offset, limit=limit, **kwargs)
def apply_filters(collection: Collection, filters: Dict[str, Any], offset: int, limit: int, **kwargs) -> QueryResult:
print(f"FILTERS={filters} // offset={offset}")
qr = collection.find(filters, offset=offset, limit=limit, **kwargs)
print(f"QR={qr.num_rows}")
return qr


def render_filter_widget(collection: Collection, attribute: SlotDefinition):
"""Render appropriate Streamlit widget based on column type."""
logger.info("Rendering filter widget")
logger.info(f"Rendering filter widget: {attribute.name}")
# print(f"{attribute.name} // RANGE={attribute.range}")
# col_type = attribute.range
col_name = attribute.name
@@ -72,24 +84,30 @@ def render_filter_widget(collection: Collection, attribute: SlotDefinition):
# Main function to render the app
def main():
st.title("LinkML Table Browser")
selected_db = st.selectbox("Select a Database", list(DBS.keys()), key="db_selector")
db_names = list(client.databases.keys())
selected_db = st.selectbox("Select a Database", db_names, key="db_selector")
print(f"DB SELECTED={selected_db}")
# con = duckdb.connect(DB_PATH.format(db=selected_db))
db_name = DB_PATH.format(db=selected_db)
database = DuckDBDatabase(f"duckdb:///{db_name}")
# db_name = DB_PATH.format(db=selected_db)
# database = DuckDBDatabase(f"duckdb:///{db_name}")
database = client.get_database(selected_db)
st.write(f"Connected to {selected_db}")
candidate_tables = DBS.get(selected_db)
candidate_tables = database.list_collection_names()
print(f"COLLECtiONS={candidate_tables}")
if len(candidate_tables) > 1:
curr_table = st.selectbox("Select a Table", candidate_tables, key="table_selector")
else:
curr_table = DBS.get(selected_db)[0]
curr_table = candidate_tables[0]
collection = database.get_collection(curr_table)
print(f"CURR={collection.alias} // {collection.target_class_name}")
cd = collection.class_definition()
print(f"CD={cd.name} // {len(cd.attributes)}")
filters = {}

# Pagination setup
session_state = st.session_state
if "current_page" not in session_state:
print(f"RESETTING CP// {session_state}")
session_state.current_page = 0 # Start with page 0
rows_per_page = DEFAULT_LIMIT

@@ -105,8 +123,8 @@ def main():
if filter_widget is not None and filter_widget != "":
filters[att_name] = filter_widget
new_value = filters.get(att_name)
if prev_value != new_value:
# print(f"CHANGE FOR {att_name}: {prev_value} -> {new_value}")
if prev_value != new_value and not (not prev_value and not new_value):
print(f"CHANGE FOR {att_name}: {prev_value} -> {new_value}")
filter_changed = True
# st.session_state[key] = new_value
facet_key = f"facet_view_{att_name}"
@@ -116,13 +134,15 @@
st.sidebar.write(facet_df)
# If any filter has changed, reset pagination
if filter_changed:
print(f"FILTER CHANGED={filter_changed}")
st.session_state.current_page = 0 # Reset offset
result = apply_filters(collection, filters, session_state.current_page * rows_per_page, rows_per_page)
# if filter_changed:
# facet_results = collection.query_facets(filters, facet_columns=["evidence_type"])
# print(f"FACET={facet_results}")
st.write(f"Number of rows: {result.num_rows}")
st.write(f"Page: {session_state.current_page + 1}")
print(f"SESSION STATE: {session_state}")
filtered_data = pd.DataFrame(result.rows)

# Pagination buttons
@@ -133,6 +153,7 @@
if session_state.current_page > 0:
session_state.current_page -= 1
if next_button.button("Next"):
print(f"NEXT: CP={session_state.current_page} RPP={rows_per_page} NR={result.num_rows}")
# Assuming result.num_rows gives the total number of rows after filtering, not just this page's rows
if (session_state.current_page + 1) * rows_per_page < result.num_rows:
session_state.current_page += 1
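With this change the browser is driven entirely by a linkml-store configuration: the database list and collection names now come from the `Client` rather than the old hard-coded `DBS`/`DB_PATH` maps. A sketch of the same startup logic outside Streamlit — the config file name and its contents are assumptions:

import yaml
from linkml_store import Client

# Mirrors the app's startup: LINKML_STORE_CONFIG names a YAML config file.
with open("linkml-store-config.yaml") as f:  # illustrative file name
    config = yaml.safe_load(f)

client = Client().from_config(config)
print(list(client.databases.keys()))  # the choices offered in the "Select a Database" box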
22 changes: 17 additions & 5 deletions src/linkml_store/api/collection.py
Original file line number Diff line number Diff line change
@@ -346,7 +346,10 @@ def get(self, ids: Optional[List[IDENTIFIER]], **kwargs) -> QueryResult:
id_field = self.identifier_attribute_name
if not id_field:
raise ValueError(f"No identifier for {self.name}")
return self.find({id_field: ids})
if len(ids) == 1:
return self.find({id_field: ids[0]})
else:
return self.find({id_field: {"$in": ids}})

def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
"""
@@ -518,7 +521,7 @@ def exists(self) -> Optional[bool]:
:return:
"""
cd = self.class_definition()
return cd is not None
return cd is not None and bool(cd.attributes)

def load_from_source(self, load_if_exists=False):
"""
@@ -535,11 +538,19 @@
kwargs = source.arguments or {}
if source.local_path:
objects = load_objects(
metadata.source.local_path, format=source.format, expected_type=source.expected_type, **kwargs
metadata.source.local_path,
format=source.format,
expected_type=source.expected_type,
compression=source.compression,
**kwargs,
)
elif metadata.source.url:
objects = load_objects_from_url(
metadata.source.url, format=source.format, expected_type=source.expected_type, **kwargs
metadata.source.url,
format=source.format,
expected_type=source.expected_type,
compression=source.compression,
**kwargs,
)
self.insert(objects)

@@ -746,6 +757,7 @@ def class_definition(self) -> Optional[ClassDefinition]:
sv: SchemaView = self.parent.schema_view
if sv:
cls = sv.get_class(self.target_class_name)
# cls = sv.schema.classes[self.target_class_name]
if cls and not cls.attributes:
if not sv.class_induced_slots(cls.name):
for att in self._induce_attributes():
@@ -868,7 +880,7 @@ def induce_class_definition_from_objects(
exact_dimensions_list.append(v.shape)
break
if isinstance(v, list):
v = v[0]
v = v[0] if v else None
multivalueds.append(True)
elif isinstance(v, dict):
v = list(v.values())[0]
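The reworked `get` dispatches on the number of identifiers: a single id becomes a plain equality filter, several become a MongoDB-style `$in` filter. A sketch of both call shapes, assuming a collection whose identifier attribute is `id`:

# One id: translated to find({"id": "P1"})
qr = collection.get(["P1"])

# Several ids: translated to find({"id": {"$in": ["P1", "P2"]}})
qr = collection.get(["P1", "P2"])
print(qr.num_rows)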
3 changes: 2 additions & 1 deletion src/linkml_store/api/config.py
@@ -33,6 +33,7 @@ class CollectionSource(ConfiguredBaseModel):
refresh_interval_days: Optional[float] = None
expected_type: Optional[str] = None
format: Optional[str] = None
compression: Optional[str] = None
arguments: Optional[Dict[str, Any]] = None


@@ -73,11 +74,11 @@ class CollectionConfig(ConfiguredBaseModel):
default=None,
description="Metadata about the source",
)
# TODO: derived_from
derived_from: Optional[List[DerivationConfiguration]] = Field(
default=None,
description="LinkML-Map derivations",
)
page_size: Optional[int] = Field(default=None, description="Suggested page size (items per page) in apps and APIs")


class DatabaseConfig(ConfiguredBaseModel):
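The new `compression` field is threaded through `load_from_source` (see collection.py above) into `load_objects`/`load_objects_from_url`, and `page_size` gives apps a paging hint. A sketch of building the config models directly — the URL and the `gzip` token are illustrative assumptions:

from linkml_store.api.config import CollectionConfig, CollectionSource

config = CollectionConfig(
    source=CollectionSource(
        url="https://example.org/persons.json.gz",  # illustrative URL
        format="json",
        compression="gzip",  # assumed compression token
    ),
    page_size=25,  # suggested items per page for apps and APIs
)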
35 changes: 32 additions & 3 deletions src/linkml_store/api/database.py
@@ -19,7 +19,7 @@
)

from linkml_store.api.types import CollectionType
from linkml_store.utils.format_utils import load_objects, render_output
from linkml_store.utils.format_utils import Format, load_objects, render_output
from linkml_store.utils.patch_utils import PatchDict

try:
@@ -705,19 +705,35 @@ def drop(self, **kwargs):
"""
raise NotImplementedError()

def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
def import_database(self, location: str, source_format: Optional[Union[str, Format]] = None, **kwargs):
"""
Import a database from a file or location.
:param location: location of the file
:param source_format: source format
:param kwargs: additional arguments
"""
if isinstance(source_format, str):
source_format = Format(source_format)
if isinstance(source_format, Format):
if source_format.is_dump_format() and source_format in [Format.SQLDUMP_DUCKDB, Format.DUMP_MONGODB]:
# import into a test instance
tmp_handle = source_format.value
client = self.parent
tmp_db = client.attach_database(tmp_handle, alias="tmp")
# TODO: check for infinite recursion
tmp_db.import_database(location, source_format=source_format)
obj = {}
for coll in tmp_db.list_collections():
qr = coll.find({}, limit=-1)
obj[coll.alias] = qr.rows
self.store(obj)
return
objects = load_objects(location, format=source_format)
for obj in objects:
self.store(obj)

def export_database(self, location: str, target_format: Optional[str] = None, **kwargs):
def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
"""
Export a database to a file or location.
@@ -726,10 +742,23 @@ def export_database(self, location: str, target_format: Optional[str] = None, **kwargs):
:param kwargs: additional arguments
"""
obj = {}
if isinstance(target_format, str):
target_format = Format(target_format)
for coll in self.list_collections():
qr = coll.find({}, limit=-1)
obj[coll.alias] = qr.rows
logger.info(f"Exporting object with {len(obj)} collections to {location} in {target_format} format")
if isinstance(target_format, Format):
if target_format.is_dump_format() and target_format in [Format.SQLDUMP_DUCKDB, Format.DUMP_MONGODB]:
tmp_handle = target_format.value
client = self.parent
tmp_db = client.attach_database(tmp_handle, alias="tmp")
tmp_db.store(obj)
# TODO: check for infinite recursion
tmp_db.export_database(location, target_format=target_format)
return
if Path(location).is_dir():
raise ValueError(f"{location} is a directory; cannot write {target_format} to a dir")
with open(location, "w", encoding="utf-8") as stream:
stream.write(render_output(obj, format=target_format))

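Both methods now special-case native dump formats (`Format.SQLDUMP_DUCKDB`, `Format.DUMP_MONGODB`) by bouncing the data through a scratch database attached under the alias `tmp`; all other formats still go through `render_output`/`load_objects`. A sketch of a round trip — the handles, paths, and the `yaml` format token are assumptions:

from linkml_store import Client

client = Client()
db = client.attach_database("duckdb:///my.db", alias="main")

# Generic path: every collection rendered into one serialized file
db.export_database("backup.yaml", target_format="yaml")

# Native dump path: delegated to the DuckDB adapter's EXPORT DATABASE
db.export_database("backup_dir", target_format="duckdb")

# Restore the dump into a fresh database
restored = client.attach_database("duckdb:///restored.db", alias="restored")
restored.import_database("backup_dir", source_format="duckdb")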
34 changes: 31 additions & 3 deletions src/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -1,11 +1,10 @@
import json
import logging
from pathlib import Path
from typing import Optional
from typing import Optional, Union

import pandas as pd
import sqlalchemy
from duckdb import DuckDBPyConnection
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from linkml_runtime.utils.schema_builder import SchemaBuilder
@@ -14,6 +13,7 @@
from linkml_store.api import Database
from linkml_store.api.queries import Query, QueryResult
from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
from linkml_store.utils.format_utils import Format
from linkml_store.utils.sql_utils import introspect_schema, query_to_sql

TYPE_MAP = {
@@ -45,7 +45,7 @@ class DuckDBDatabase(Database):
types are used for nested inlined objects.
"""

_connection: DuckDBPyConnection = None
# _connection: DuckDBPyConnection = None
_engine: sqlalchemy.Engine = None
collection_class = DuckDBCollection

@@ -202,3 +202,31 @@ def induce_schema_view(self) -> SchemaView:
cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
schema.classes[cls.name] = cls
return SchemaView(schema)

def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
if target_format == "duckdb" or target_format == Format.SQLDUMP_DUCKDB:
path = Path(location)
if path.exists():
if path.is_file():
path.unlink()
with self.engine.connect() as conn:
sql = text(f"EXPORT DATABASE '{location}'")
conn.execute(sql)
else:
super().export_database(location, target_format=target_format, **kwargs)

def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
"""
Import a database from a file or location.
:param location: location of the file
:param source_format: source format
:param kwargs: additional arguments
"""
if source_format == Format.SQLDUMP_DUCKDB.value or source_format == Format.SQLDUMP_DUCKDB:
with self.engine.connect() as conn:
sql = text(f"IMPORT DATABASE '{location}'")
conn.execute(sql)
conn.commit()
else:
super().import_database(location, source_format=source_format, **kwargs)
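Under the hood these branches run DuckDB's native `EXPORT DATABASE` / `IMPORT DATABASE` statements through the SQLAlchemy engine. A standalone sketch of that SQL — the engine URLs and directory are illustrative, and the `duckdb-engine` dialect is assumed to be installed:

from sqlalchemy import create_engine, text

engine = create_engine("duckdb:///my.db")  # illustrative handle
with engine.connect() as conn:
    conn.execute(text("EXPORT DATABASE 'backup_dir'"))  # writes schema and data files to the directory

fresh = create_engine("duckdb:///restored.db")
with fresh.connect() as conn:
    conn.execute(text("IMPORT DATABASE 'backup_dir'"))
    conn.commit()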
