Skip to content

Commit

Permalink
Merge pull request #42 from linkml/ref-integ
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall authored Mar 7, 2025
2 parents e4aa4ba + 993845b commit fd0cf1d
Show file tree
Hide file tree
Showing 30 changed files with 1,123 additions and 328 deletions.
569 changes: 489 additions & 80 deletions docs/how-to/Check-Referential-Integrity.ipynb

Large diffs are not rendered by default.

290 changes: 290 additions & 0 deletions docs/how-to/Index-Bioinformatics-Databases.ipynb

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions src/linkml_store/api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
logger = logging.getLogger(__name__)



HANDLE_MAP = {
"duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
"sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
Expand Down Expand Up @@ -220,14 +219,14 @@ def attach_database(
scheme, _ = handle.split(":", 1)
if scheme not in HANDLE_MAP:
raise ValueError(f"Unknown scheme: {scheme}")
module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
try:
module = importlib.import_module(module_path)
cls = getattr(module, class_name)
except ImportError as e:
raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")

#cls = HANDLE_MAP[scheme]
# cls = HANDLE_MAP[scheme]
db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
if schema_view:
db.set_schema_view(schema_view)
Expand Down
42 changes: 21 additions & 21 deletions src/linkml_store/api/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
"""
raise NotImplementedError

def index (
def index(
self,
objs: Union[OBJECT, List[OBJECT]],
index_name: Optional[str] = None,
Expand All @@ -231,10 +231,13 @@ def index (
"""
raise NotImplementedError

def upsert(self,
objs: Union[OBJECT, List[OBJECT]],
filter_fields: List[str],
update_fields: Union[List[str], None] = None, **kwargs):
def upsert(
self,
objs: Union[OBJECT, List[OBJECT]],
filter_fields: List[str],
update_fields: Union[List[str], None] = None,
**kwargs,
):
"""
Add one or more objects to the collection.
Expand Down Expand Up @@ -455,10 +458,10 @@ def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
return None

def find(
self,
where: Optional[Any] = None,
select_cols: Optional[List[str] ] = None,
**kwargs,
self,
where: Optional[Any] = None,
select_cols: Optional[List[str]] = None,
**kwargs,
) -> QueryResult:
"""
Find objects in the collection using a where query.
Expand Down Expand Up @@ -596,13 +599,15 @@ def search(
assert ix_coll.size() > 0
qr = ix_coll.find(where=where, limit=-1, **kwargs)
index_col = ix.index_field

# TODO: optimize this for large indexes
def row2array(row):
    """Convert a result row's index column into a 1-D float numpy vector."""
    raw = row[index_col]
    # sqlite stores arrays as JSON-encoded strings; decode before converting
    value = json.loads(raw) if isinstance(raw, str) else raw
    return np.array(value, dtype=float)

vector_pairs = [(row, row2array(row)) for row in qr.rows]
results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
for r in results:
Expand All @@ -618,12 +623,12 @@ def row2array(row):
return new_qr

def group_by(
self,
group_by_fields: List[str],
inlined_field = "objects",
agg_map: Optional[Dict[str, str]] = None,
where: Optional[Dict] = None,
**kwargs,
self,
group_by_fields: List[str],
inlined_field="objects",
agg_map: Optional[Dict[str, str]] = None,
where: Optional[Dict] = None,
**kwargs,
) -> QueryResult:
"""
Group objects in the collection by a column.
Expand All @@ -650,14 +655,9 @@ def group_by(
top_obj = {k: v for k, v in zip(pk_fields, pk)}
top_obj[inlined_field] = objs
results.append(top_obj)
r = QueryResult(
num_rows=len(results),
rows=results
)
r = QueryResult(num_rows=len(results), rows=results)
return r



@property
def is_internal(self) -> bool:
"""
Expand Down
32 changes: 30 additions & 2 deletions src/linkml_store/api/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,31 @@ def induce_schema_view(self) -> SchemaView:
sb.add_class(coll.target_class_name)
return SchemaView(sb.schema)

def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
def validate_database(self, **kwargs) -> List["ValidationResult"]:
    """
    Validate the contents of the database.

    As `iter_validate_database`, but eagerly collects the lazily-yielded
    results into a list.

    :param kwargs: passed through unchanged to `iter_validate_database`
    :return: list of validation results (empty if the database is valid)
    """
    # NOTE(review): this method was previously defined twice verbatim; the
    # second definition silently shadowed the first. Collapsed to one.
    return list(self.iter_validate_database(**kwargs))

def iter_validate_database(
self, ensure_referential_integrity: bool = None, **kwargs
) -> Iterator["ValidationResult"]:
"""
Validate the contents of the database.
Expand Down Expand Up @@ -635,12 +659,14 @@ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
'capital' is a required property
'continent' is a required property
:param ensure_referential_integrity: ensure referential integrity
:param kwargs:
:return: iterator over validation results
"""
for collection in self.list_collections():
yield from collection.iter_validate_collection(**kwargs)
if self.metadata.ensure_referential_integrity:
if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
logger.info(f"Validating referential integrity on {self.alias}")
yield from self._validate_referential_integrity(**kwargs)

def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
Expand All @@ -661,7 +687,9 @@ def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResul
induced_slots = sv.class_induced_slots(cd.name)
slot_map = {s.name: s for s in induced_slots}
# rmap = {s.name: s.range for s in induced_slots}
# map slot ranges to a collection where that range is stored
sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
for obj in collection.find_iter():
for k, v in obj.items():
if k not in sr_to_coll:
Expand Down
6 changes: 3 additions & 3 deletions src/linkml_store/api/stores/duckdb/duckdb_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,9 @@ def _table_exists(self, table: str) -> bool:
meta_query = Query(
from_table="sqlite_master",
where_clause={
#"type": "table",
# "type": "table",
"name": table,
}
},
)
else:
if table.startswith("information_schema"):
Expand All @@ -112,7 +112,7 @@ def _table_exists(self, table: str) -> bool:
where_clause={
"table_type": "BASE TABLE",
"table_name": table,
}
},
)

qr = self.query(meta_query)
Expand Down
2 changes: 1 addition & 1 deletion src/linkml_store/api/stores/filesystem/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Handles have the form:
- ``file:<path>`` for a local file
"""
"""

from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
Expand Down
27 changes: 15 additions & 12 deletions src/linkml_store/api/stores/mongodb/mongodb_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
del obj["_id"]
self._post_insert_hook(objs)


def index(self,
objs: Union[OBJECT, List[OBJECT]],
index_name: Optional[str] = None,
replace: bool = False,
unique: bool = False,
**kwargs):
def index(
self,
objs: Union[OBJECT, List[OBJECT]],
index_name: Optional[str] = None,
replace: bool = False,
unique: bool = False,
**kwargs,
):
"""
Create indexes on the collection.
Expand Down Expand Up @@ -86,11 +87,13 @@ def index(self,
else:
logging.debug(f"Index already exists for field {obj}, skipping creation.")

def upsert(self,
objs: Union[OBJECT, List[OBJECT]],
filter_fields: List[str],
update_fields: Optional[List[str]] = None,
**kwargs):
def upsert(
self,
objs: Union[OBJECT, List[OBJECT]],
filter_fields: List[str],
update_fields: Optional[List[str]] = None,
**kwargs,
):
"""
Upsert one or more documents into the MongoDB collection.
Expand Down
2 changes: 2 additions & 0 deletions src/linkml_store/api/stores/mongodb/mongodb_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def _db_name(self) -> str:
parsed_url = urlparse(self.handle)
path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
db_name = path_parts[0] if path_parts else "default"
if not db_name:
db_name = self.alias
else:
db_name = "default"
return db_name
Expand Down
10 changes: 5 additions & 5 deletions src/linkml_store/api/stores/solr/solr_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ def query(self, query: Query, **kwargs) -> QueryResult:

def query_facets(
self,
where: Optional[Dict] = None,
facet_columns: List[str] = None,
facet_limit=DEFAULT_FACET_LIMIT,
facet_min_count: int = 1,
**kwargs
where: Optional[Dict] = None,
facet_columns: List[str] = None,
facet_limit=DEFAULT_FACET_LIMIT,
facet_min_count: int = 1,
**kwargs,
) -> Dict[str, Dict[str, int]]:
solr_query = self._build_solr_query(where)
solr_query["facet"] = "true"
Expand Down
Loading

0 comments on commit fd0cf1d

Please sign in to comment.