diff --git a/chord_metadata_service/chord/migrations/0008_v8_0_0.py b/chord_metadata_service/chord/migrations/0008_v8_0_0.py new file mode 100644 index 000000000..edf876e5d --- /dev/null +++ b/chord_metadata_service/chord/migrations/0008_v8_0_0.py @@ -0,0 +1,24 @@ +# Generated by Django 4.2.11 on 2024-04-24 14:10 + +import chord_metadata_service.restapi.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('chord', '0007_v7_0_0'), + ] + + operations = [ + migrations.AddField( + model_name='dataset', + name='discovery', + field=models.JSONField(blank=True, help_text='Discovery configuration', null=True, validators=[chord_metadata_service.restapi.validators.JsonSchemaValidator({'$id': '/chord_metadata_service/discovery/discovery', 'additionalProperties': False, 'description': 'Discovery configuration for public fields/search', 'properties': {'fields': {'$id': '/chord_metadata_service/discovery/discovery_named_fields', 'additionalProperties': False, 'description': 'Intermediate schema, enforces field schema with flexible names.', 'patternProperties': {'^.*$': {'$id': '/chord_metadata_service/discovery/discovery_field', 'additionalProperties': False, 'description': 'Field configuration', 'properties': {'config': {'properties': {'bin_size': {'type': 'number'}, 'bins': {'items': {'type': 'number'}, 'type': 'array'}, 'enum': {'oneOf': [{'items': {'type': 'string'}, 'type': 'array'}, {'type': 'null'}]}, 'maximum': {'type': 'number'}, 'minimum': {'type': 'number'}, 'taper_left': {'type': 'number'}, 'taper_right': {'type': 'number'}, 'units': {'type': 'string'}}, 'type': 'object'}, 'datatype': {'enum': ['number', 'string', 'date'], 'type': 'string'}, 'description': {'type': 'string'}, 'group_by': {'type': 'string'}, 'group_by_value': {'type': 'string'}, 'mapping': {'type': 'string'}, 'mapping_for_search_filter': {'type': 'string'}, 'title': {'type': 'string'}, 'value_mapping': {'type': 'string'}}, 'type': 'object'}}, 'type': 'object'}, 'overview': {'description': 'List of overview sections', 'items': {'$id': '/chord_metadata_service/discovery/discovery_overview', 'additionalProperties': False, 'description': 'An overview section containing charts', 'properties': {'charts': {'items': {'$id': '/chord_metadata_service/discovery/discovery_overview_chart', 'additionalProperties': False, 'description': 'Associates a field name with a chart type for overview display', 'properties': {'chart_type': {'enum': ['bar', 'pie'], 'type': 'string'}, 'field': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}, 'rules': {'properties': {'count_threshold': {'type': 'integer'}, 'max_query_parameters': {'type': 'integer'}}, 'type': 'object'}, 'search': {'items': {'$id': '/chord_metadata_service/discovery/discovery_search', 'additionalProperties': False, 'description': 'Groups search fields by section.', 'properties': {'fields': {'items': {'type': 'string'}, 'type': 'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}}, 'type': 'object'}, formats=None)]), + ), + migrations.AddField( + model_name='project', + name='discovery', + field=models.JSONField(blank=True, help_text='Discovery configuration', null=True, validators=[chord_metadata_service.restapi.validators.JsonSchemaValidator({'$id': '/chord_metadata_service/discovery/discovery', 'additionalProperties': False, 'description': 'Discovery configuration for public fields/search', 'properties': {'fields': {'$id': '/chord_metadata_service/discovery/discovery_named_fields', 'additionalProperties': False, 'description': 'Intermediate schema, enforces field schema with flexible names.', 'patternProperties': {'^.*$': {'$id': '/chord_metadata_service/discovery/discovery_field', 'additionalProperties': False, 'description': 'Field configuration', 'properties': {'config': {'properties': {'bin_size': {'type': 'number'}, 'bins': {'items': {'type': 'number'}, 'type': 'array'}, 'enum': {'oneOf': [{'items': {'type': 'string'}, 'type': 'array'}, {'type': 'null'}]}, 'maximum': {'type': 'number'}, 'minimum': {'type': 'number'}, 'taper_left': {'type': 'number'}, 'taper_right': {'type': 'number'}, 'units': {'type': 'string'}}, 'type': 'object'}, 'datatype': {'enum': ['number', 'string', 'date'], 'type': 'string'}, 'description': {'type': 'string'}, 'group_by': {'type': 'string'}, 'group_by_value': {'type': 'string'}, 'mapping': {'type': 'string'}, 'mapping_for_search_filter': {'type': 'string'}, 'title': {'type': 'string'}, 'value_mapping': {'type': 'string'}}, 'type': 'object'}}, 'type': 'object'}, 'overview': {'description': 'List of overview sections', 'items': {'$id': '/chord_metadata_service/discovery/discovery_overview', 'additionalProperties': False, 'description': 'An overview section containing charts', 'properties': {'charts': {'items': {'$id': '/chord_metadata_service/discovery/discovery_overview_chart', 'additionalProperties': False, 'description': 'Associates a field name with a chart type for overview display', 'properties': {'chart_type': {'enum': ['bar', 'pie'], 'type': 'string'}, 'field': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}, 'rules': {'properties': {'count_threshold': {'type': 'integer'}, 'max_query_parameters': {'type': 'integer'}}, 'type': 'object'}, 'search': {'items': {'$id': '/chord_metadata_service/discovery/discovery_search', 'additionalProperties': False, 'description': 'Groups search fields by section.', 'properties': {'fields': {'items': {'type': 'string'}, 'type': 'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}}, 'type': 'object'}, formats=None)]), + ), + ] diff --git a/chord_metadata_service/chord/models.py b/chord_metadata_service/chord/models.py index 3b9168125..f09add395 100644 --- a/chord_metadata_service/chord/models.py +++ b/chord_metadata_service/chord/models.py @@ -6,7 +6,9 @@ from chord_metadata_service.patients.models import Individual from chord_metadata_service.phenopackets.models import Biosample, Phenopacket from chord_metadata_service.resources.models import Resource -from ..restapi.models import SchemaType +from chord_metadata_service.restapi.validators import JsonSchemaValidator +from chord_metadata_service.restapi.models import SchemaType +from chord_metadata_service.discovery.schemas import DISCOVERY_SCHEMA __all__ = ["Project", "Dataset", "ProjectJsonSchema"] @@ -34,6 +36,8 @@ class Project(models.Model): created = models.DateTimeField(auto_now_add=True) updated = models.DateTimeField(auto_now=True) + discovery = models.JSONField(blank=True, null=True, help_text="Discovery configuration", + validators=[JsonSchemaValidator(DISCOVERY_SCHEMA)]) def __str__(self): return f"{self.title} (ID: {self.identifier})" @@ -149,6 +153,8 @@ def resources(self): extra_properties = models.JSONField(blank=True, null=True, help_text="Extra properties that do not fit in the previous " "specified attributes.") + discovery = models.JSONField(blank=True, null=True, help_text="Discovery configuration", + validators=[JsonSchemaValidator(DISCOVERY_SCHEMA)]) # ------------------------------------------------------------------------- diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index cf6b60dc9..a786ecd62 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -34,6 +34,7 @@ class DatasetSerializer(GenericSerializer): "linked_field_sets", "dats_file", "project", + "discovery", ) # noinspection PyMethodMayBeStatic @@ -146,6 +147,11 @@ class Meta: class ProjectSerializer(serializers.ModelSerializer): # Don't inherit GenericSerializer to not pop empty fields + always_include = ( + "title", + "description", + "discovery", + ) datasets = DatasetSerializer(read_only=True, many=True, exclude_when_nested=["project"]) project_schemas = ProjectJsonSchemaSerializer(read_only=True, many=True) diff --git a/chord_metadata_service/chord/views_search.py b/chord_metadata_service/chord/views_search.py index fcc2d860d..4c8c50ad1 100644 --- a/chord_metadata_service/chord/views_search.py +++ b/chord_metadata_service/chord/views_search.py @@ -47,12 +47,20 @@ OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result" -async def experiment_dataset_summary(_request: DrfRequest, dataset): - return await dt_experiment_summary(Experiment.objects.filter(dataset=dataset), low_counts_censored=False) +async def experiment_dataset_summary(request: DrfRequest, dataset: Dataset): + return await dt_experiment_summary( + Experiment.objects.filter(dataset=dataset), + discovery=None, + low_counts_censored=False + ) -async def phenopacket_dataset_summary(_request: DrfRequest, dataset: Dataset): - return await dt_phenopacket_summary(Phenopacket.objects.filter(dataset=dataset), low_counts_censored=False) +async def phenopacket_dataset_summary(request: DrfRequest, dataset: Dataset): + return await dt_phenopacket_summary( + Phenopacket.objects.filter(dataset=dataset), + discovery=None, + low_counts_censored=False + ) # TODO: CHORD-standardized logging diff --git a/chord_metadata_service/discovery/api_views.py b/chord_metadata_service/discovery/api_views.py index 87bf2e4fc..8145f3680 100644 --- a/chord_metadata_service/discovery/api_views.py +++ b/chord_metadata_service/discovery/api_views.py @@ -9,13 +9,18 @@ from rest_framework.request import Request as DrfRequest from rest_framework.response import Response +from chord_metadata_service.discovery.censorship import RULES_NO_PERMISSIONS +from chord_metadata_service.discovery.exceptions import DiscoveryConfigException +from chord_metadata_service.discovery.utils import get_request_discovery + from . import responses as dres from .types import BinWithValue from ..chord import models as cm from ..logger import logger from .fields import get_field_options, get_range_stats, get_categorical_stats, get_date_stats -from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL +from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL, PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS +from .schemas import DISCOVERY_SCHEMA @extend_schema( @@ -33,31 +38,30 @@ ) @api_view(["GET"]) @permission_classes([AllowAny]) -async def public_search_fields(_request: DrfRequest): +async def public_search_fields(request: DrfRequest): """ get: Return public search fields with their configuration """ - # TODO: should be project-scoped - - config_public = settings.CONFIG_PUBLIC + try: + discovery = await get_request_discovery(request) + except DiscoveryConfigException as e: + return Response(e.message, status=status.HTTP_404_NOT_FOUND) - if not config_public: + if not discovery: return Response(dres.NO_PUBLIC_FIELDS_CONFIGURED, status=status.HTTP_404_NOT_FOUND) - field_conf = config_public["fields"] - # Note: the array is wrapped in a dictionary structure to help with JSON # processing by some services. async def _get_field_response(field) -> dict | None: - field_props = field_conf[field] + field_props = discovery.get("fields", {}).get(field, {}) return { **field_props, "id": field, - "options": await get_field_options(field_props, low_counts_censored=True), + "options": await get_field_options(field, discovery=discovery, low_counts_censored=True), } async def _get_section_response(section) -> dict: @@ -67,7 +71,7 @@ async def _get_section_response(section) -> dict: } return Response({ - "sections": await asyncio.gather(*map(_get_section_response, config_public["search"])), + "sections": await asyncio.gather(*map(_get_section_response, discovery["search"])), }) @@ -90,25 +94,43 @@ async def _counts_for_model_name(mn: str) -> tuple[str, int]: ) @api_view(["GET"]) # Don't use BentoAllowAny, we want to be more careful of cases here. @permission_classes([AllowAny]) -async def public_overview(_request: DrfRequest): +async def public_overview(request: DrfRequest): """ get: Overview of all public data in the database """ - config_public = settings.CONFIG_PUBLIC + try: + discovery = await get_request_discovery(request) + except DiscoveryConfigException as e: + return Response(e.message, status=status.HTTP_404_NOT_FOUND) + dataset_id = request.query_params.get("dataset", None) + project_id = request.query_params.get("project", None) - if not config_public: + if not discovery: return Response(dres.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND) - # TODO: public overviews SHOULD be project-scoped at least. + async def _counts_for_scoped_model_name(mn: str) -> tuple[str, int]: + if dataset_id: + scope = "dataset" + value = dataset_id + elif project_id and not dataset_id: + scope = "project" + value = project_id + elif not project_id and not dataset_id: + return await _counts_for_model_name(mn) + filter_query = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][scope]["filter"] + prefetch = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][scope]["prefetch_related"] + return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.prefetch_related(*prefetch).filter( + **{filter_query: value} + ).acount() # Predefined counts - counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL))) + counts = dict(await asyncio.gather(*map(_counts_for_scoped_model_name, PUBLIC_MODEL_NAMES_TO_MODEL))) # Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose # these values based on if we have access to ALL public fields or not. - rules_config = config_public["rules"] + rules_config = discovery["rules"] count_threshold = rules_config["count_threshold"] # Set counts to 0 if they're under the count threshold, and we don't have full data access permissions for the @@ -119,42 +141,40 @@ async def public_overview(_request: DrfRequest): counts[public_model_name] = 0 response = { - "layout": config_public["overview"], + "layout": discovery["overview"], "fields": {}, "counts": { "individuals": counts["individual"], "biosamples": counts["biosample"], "experiments": counts["experiment"], }, - # TODO: remove these in favour of public_rules endpoint - "max_query_parameters": rules_config["max_query_parameters"], - "count_threshold": count_threshold, } # Parse the public config to gather data for each field defined in the overview - fields = [chart["field"] for section in config_public["overview"] for chart in section["charts"]] - field_conf = config_public["fields"] + fields = [chart["field"] for section in discovery["overview"] for chart in section["charts"]] + field_conf = discovery["fields"] - async def _get_field_response(field_id: str, field_props: dict) -> dict: + async def _get_field_response(field: str) -> dict: + field_props = field_conf.get(field, {"datatype": None}) stats: list[BinWithValue] | None if field_props["datatype"] == "string": - stats = await get_categorical_stats(field_props, low_counts_censored=True) + stats = await get_categorical_stats(field, discovery, project_id, dataset_id, low_counts_censored=True) elif field_props["datatype"] == "number": - stats = await get_range_stats(field_props, low_counts_censored=True) + stats = await get_range_stats(field, discovery, project_id, dataset_id, low_counts_censored=True) elif field_props["datatype"] == "date": - stats = await get_date_stats(field_props, low_counts_censored=True) + stats = await get_date_stats(field, discovery, project_id, dataset_id, low_counts_censored=True) else: raise NotImplementedError() return { **field_props, - "id": field_id, + "id": field, **({"data": stats} if stats is not None else {}), } # Parallel async collection of field responses for public overview - field_responses = await asyncio.gather(*(_get_field_response(field, field_conf[field]) for field in fields)) + field_responses = await asyncio.gather(*(_get_field_response(field) for field in fields)) for field, field_res in zip(fields, field_responses): response["fields"][field] = field_res @@ -184,9 +204,26 @@ async def public_dataset(_request: DrfRequest): "dimensions", "primary_publications", "citations", "produced_by", "creators", "licenses", "acknowledges", "keywords", "version", "dats_file", - "extra_properties", "identifier" + "extra_properties", "identifier", "discovery" ) return Response({ "datasets": datasets }) + + +@api_view(["GET"]) +@permission_classes([AllowAny]) +async def discovery_schema(_request: DrfRequest): + return Response(DISCOVERY_SCHEMA) + + +@api_view(["GET"]) +@permission_classes([AllowAny]) +async def public_rules(request: DrfRequest): + try: + discovery = await get_request_discovery(request) + except DiscoveryConfigException as e: + return Response(e.message, status=status.HTTP_404_NOT_FOUND) + rules = discovery["rules"] if discovery and "rules" in discovery else RULES_NO_PERMISSIONS + return Response(rules, status=status.HTTP_200_OK) diff --git a/chord_metadata_service/discovery/censorship.py b/chord_metadata_service/discovery/censorship.py index 85cb34484..05924d750 100644 --- a/chord_metadata_service/discovery/censorship.py +++ b/chord_metadata_service/discovery/censorship.py @@ -1,6 +1,6 @@ import sys -from django.conf import settings +from chord_metadata_service.discovery.types import DiscoveryConfig __all__ = [ "RULES_NO_PERMISSIONS", @@ -16,27 +16,27 @@ } -def get_threshold(low_counts_censored: bool) -> int: +def get_threshold(discovery: DiscoveryConfig, low_counts_censored: bool) -> int: """ Gets the maximum count threshold for hiding censored data (i.e., rounding to 0). """ if not low_counts_censored: return 0 - if not settings.CONFIG_PUBLIC: + if not discovery: return RULES_NO_PERMISSIONS["count_threshold"] - return settings.CONFIG_PUBLIC["rules"]["count_threshold"] + return discovery["rules"]["count_threshold"] -def thresholded_count(c: int, low_counts_censored: bool) -> int: - return 0 if c <= get_threshold(low_counts_censored) else c +def thresholded_count(c: int, discovery: DiscoveryConfig, low_counts_censored: bool) -> int: + return 0 if c <= get_threshold(discovery, low_counts_censored) else c -def get_max_query_parameters(low_counts_censored: bool) -> int: +def get_max_query_parameters(discovery: DiscoveryConfig, low_counts_censored: bool) -> int: """ Gets the maximum number of query parameters allowed for censored discovery. """ if not low_counts_censored: return sys.maxsize - if not settings.CONFIG_PUBLIC: + if not discovery: return RULES_NO_PERMISSIONS["max_query_parameters"] - return settings.CONFIG_PUBLIC["rules"]["max_query_parameters"] + return discovery["rules"]["max_query_parameters"] diff --git a/chord_metadata_service/discovery/exceptions.py b/chord_metadata_service/discovery/exceptions.py new file mode 100644 index 000000000..c72186218 --- /dev/null +++ b/chord_metadata_service/discovery/exceptions.py @@ -0,0 +1,21 @@ +__all__ = [ + "DiscoveryConfigException" +] + + +class DiscoveryConfigException(Exception): + + def __init__(self, dataset_id: str | None = None, project_id: str | None = None, *args: object) -> None: + self.dataset_id = dataset_id + self.project_id = project_id + + message = "Error retrieving {0} scoped discovery config: {0} {1} does not exist." + if dataset_id and project_id: + message = message.format("project-dataset", f"({project_id}, {dataset_id}) pair") + elif dataset_id: + message = message.format("dataset", dataset_id) + elif project_id: + message = message.format("project", project_id) + self.message = {"message": message} + + super().__init__(*args) diff --git a/chord_metadata_service/discovery/fields.py b/chord_metadata_service/discovery/fields.py index 125e8666e..2928d8f40 100644 --- a/chord_metadata_service/discovery/fields.py +++ b/chord_metadata_service/discovery/fields.py @@ -10,8 +10,8 @@ from . import fields_utils as f_utils from .censorship import get_threshold, thresholded_count -from .stats import stats_for_field -from .types import BinWithValue, DiscoveryFieldProps +from .stats import stats_for_field, get_scoped_queryset +from .types import BinWithValue, DiscoveryConfig, DiscoveryFieldProps LENGTH_Y_M = 4 + 1 + 2 # dates stored as yyyy-mm-dd @@ -31,11 +31,12 @@ async def get_field_bins(query_set: QuerySet, field: str, bin_size: int): return stats -async def get_field_options(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[Any]: +async def get_field_options(field: str, discovery: DiscoveryConfig, low_counts_censored: bool) -> list[Any]: """ Given properties for a public field, return the list of authorized options for querying this field. """ + field_props = discovery.get("fields", {}).get(field) if field_props["datatype"] == "string": options = field_props["config"].get("enum") # Special case: no list of values specified @@ -43,7 +44,7 @@ async def get_field_options(field_props: DiscoveryFieldProps, low_counts_censore # We must be careful here not to leak 'small cell' values as options # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this # should be treated as if the field isn't in the database at all. - options = await get_distinct_field_values(field_props, low_counts_censored) + options = await get_distinct_field_values(field, discovery, low_counts_censored) elif field_props["datatype"] == "number": options = [label for floor, ceil, label in f_utils.labelled_range_generator(field_props)] elif field_props["datatype"] == "date": @@ -59,20 +60,20 @@ async def get_field_options(field_props: DiscoveryFieldProps, low_counts_censore return options -async def get_distinct_field_values(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[Any]: +async def get_distinct_field_values(field: str, discovery: DiscoveryConfig, low_counts_censored: bool) -> list[Any]: # We must be careful here not to leak 'small cell' values as options # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this # should be treated as if the field isn't in the database at all. + field_props = discovery.get("fields", {}).get(field) + model, mapping_field = f_utils.get_model_and_field(field_props["mapping"]) + threshold = get_threshold(discovery, low_counts_censored) - model, field = f_utils.get_model_and_field(field_props["mapping"]) - threshold = get_threshold(low_counts_censored) - - field_query = field + field_query = mapping_field if group_by := field_props.get("group_by"): # JSONField containing an array # use jsonb_path_query field expression - field_query = f_utils.get_jsonb_path_query(field, group_by) - values_with_counts = model.objects.values_list(field_query).annotate(count=Count(field)) + field_query = f_utils.get_jsonb_path_query(mapping_field, group_by) + values_with_counts = model.objects.values_list(field_query).annotate(count=Count(mapping_field)) return [ val @@ -103,7 +104,12 @@ async def compute_binned_ages(individual_queryset: QuerySet, bin_size: int) -> l return binned_ages -async def get_age_numeric_binned(individual_queryset: QuerySet, bin_size: int, low_counts_censored: bool) -> dict: +async def get_age_numeric_binned( + individual_queryset: QuerySet, + bin_size: int, + discovery: DiscoveryConfig, + low_counts_censored: bool +) -> dict: """ age_numeric is computed at ingestion time of phenopackets. On some instances it might be unavailable and as a fallback must be computed from the age JSON field which @@ -121,7 +127,7 @@ async def get_age_numeric_binned(individual_queryset: QuerySet, bin_size: int, l ) return { - b: thresholded_count(bv, low_counts_censored) + b: thresholded_count(bv, discovery, low_counts_censored) for b, bv in individuals_age.items() } @@ -164,8 +170,15 @@ async def get_month_date_range(field_props: DiscoveryFieldProps) -> tuple[str | return start, end -async def get_range_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]: - model, field = f_utils.get_model_and_field(field_props["mapping"]) +async def get_range_stats( + field: str, + discovery: DiscoveryConfig, + project_id: str | None = None, + dataset_id: str | None = None, + low_counts_censored: bool = True +) -> list[BinWithValue]: + field_props = discovery.get("fields", {}).get(field) + model, field_mapping = f_utils.get_model_and_field(field_props["mapping"]) # JSONField array specific field props group_by = field_props.get("group_by") @@ -185,15 +198,15 @@ async def get_range_stats(field_props: DiscoveryFieldProps, low_counts_censored: else: whens = [ When( - **{f"{field}__gte": floor} if floor is not None else {}, - **{f"{field}__lt": ceil} if ceil is not None else {}, + **{f"{field_mapping}__gte": floor} if floor is not None else {}, + **{f"{field_mapping}__lt": ceil} if ceil is not None else {}, then=Value(label), ) for floor, ceil, label in f_utils.labelled_range_generator(field_props) ] query_set = ( - model.objects + get_scoped_queryset(model, project_id, dataset_id) .values(label=Case(*whens, default=Value("missing"), output_field=CharField())) .annotate(total=Count("label")) ) @@ -201,7 +214,7 @@ async def get_range_stats(field_props: DiscoveryFieldProps, low_counts_censored: # Maximum number of entries needed to round a count from its true value down to 0 (censored discovery) stats: dict[str, int] = dict() async for item in query_set: - stats[item["label"]] = thresholded_count(item["total"], low_counts_censored) + stats[item["label"]] = thresholded_count(item["total"], discovery, low_counts_censored) # All the bins between start and end must be represented and ordered bins: list[BinWithValue] = [ @@ -215,11 +228,17 @@ async def get_range_stats(field_props: DiscoveryFieldProps, low_counts_censored: return bins -async def get_categorical_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[BinWithValue]: +async def get_categorical_stats( + field: str, + discovery: DiscoveryConfig, + project_id: str | None = None, + dataset_id: str | None = None, + low_counts_censored: bool = True +) -> list[BinWithValue]: """ Fetches statistics for a given categorical field and apply privacy policies """ - + field_props = discovery.get("fields", {}).get(field) model, field_name = f_utils.get_model_and_field(field_props["mapping"]) # Collect stats for the field, censoring low cell counts along the way @@ -227,8 +246,9 @@ async def get_categorical_stats(field_props: DiscoveryFieldProps, low_counts_cen # database - i.e., if the label is pulled from the values in the database, someone could otherwise learn # 1 <= this field <= threshold given it being present at all. # - stats_for_field(...) handles this! - stats: Mapping[str, int] = await stats_for_field(model, field_name, low_counts_censored, - add_missing=True, group_by=field_props.get("group_by")) + stats: Mapping[str, int] = await stats_for_field(model, field_name, discovery, low_counts_censored, + add_missing=True, group_by=field_props.get("group_by"), + project_id=project_id, dataset_id=dataset_id) # Enforce values order from config and apply policies labels: list[str] | None = field_props["config"].get("enum") @@ -255,7 +275,13 @@ async def get_categorical_stats(field_props: DiscoveryFieldProps, low_counts_cen ] -async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]: +async def get_date_stats( + field: str, + discovery: DiscoveryConfig, + project_id: str | None = None, + dataset_id: str | None = None, + low_counts_censored: bool = True, +) -> list[BinWithValue]: """ Fetches statistics for a given date field, fill the gaps in the date range and apply privacy policies. @@ -264,6 +290,10 @@ async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: regular fields when needed. TODO: for now only dates binned by month are handled """ + field_props = discovery.get("fields", {}).get(field) + if not field_props: + msg = f"Field {field} is not in the provided discovery config." + raise NotImplementedError(msg) if (bin_by := field_props["config"]["bin_by"]) != "month": msg = f"Binning dates by `{bin_by}` method not implemented" @@ -277,7 +307,7 @@ async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: # Note: lexical sort works on ISO dates query_set = ( - model.objects + get_scoped_queryset(model, project_id, dataset_id) .values(field_name) .order_by(field_name) .annotate(total=Count(field_name)) @@ -308,12 +338,12 @@ async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: label = f"{month_abbr[month].capitalize()} {year}" # convert key as yyyy-mm to `abbreviated month yyyy` bins.append({ "label": label, - "value": thresholded_count(stats.get(key, 0), low_counts_censored), + "value": thresholded_count(stats.get(key, 0), discovery, low_counts_censored), }) # Append missing items at the end if any if "missing" in stats: - bins.append({"label": "missing", "value": thresholded_count(stats["missing"], low_counts_censored)}) + bins.append({"label": "missing", "value": thresholded_count(stats["missing"], discovery, low_counts_censored)}) return bins diff --git a/chord_metadata_service/discovery/fields_utils.py b/chord_metadata_service/discovery/fields_utils.py index b2196d39f..bfb096fe9 100644 --- a/chord_metadata_service/discovery/fields_utils.py +++ b/chord_metadata_service/discovery/fields_utils.py @@ -1,7 +1,7 @@ from typing import Any, Iterator, Type from django.db.models import Q, Func, BooleanField, F, Value, Model, JSONField -from chord_metadata_service.discovery.model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL +from chord_metadata_service.discovery.model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL, PublicModelNames MAPPING_SEPARATOR = "/" JSON_PATH_ACCESSOR = "." @@ -48,6 +48,13 @@ def get_model_and_field(field_id: str) -> tuple[Type[Model], str]: return model, field_name +def get_public_model_name(model: Type[Model]) -> PublicModelNames: + model_name = [key for key, m in PUBLIC_MODEL_NAMES_TO_MODEL.items() if m == model] + if len(model_name) != 1: + raise NotImplementedError(f"Provided model {model} is not available for public.") + return model_name[0] + + def parse_duration(duration: str | dict): """ Returns years integer. """ if isinstance(duration, dict) and "iso8601duration" in duration: diff --git a/chord_metadata_service/discovery/model_lookups.py b/chord_metadata_service/discovery/model_lookups.py index 3e86aded7..40b09358b 100644 --- a/chord_metadata_service/discovery/model_lookups.py +++ b/chord_metadata_service/discovery/model_lookups.py @@ -1,21 +1,72 @@ from django.db.models import Model -from typing import Type +from typing import Literal, Type, TypedDict from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.experiments import models as exp_models from chord_metadata_service.patients import models as patient_models from chord_metadata_service.phenopackets import models as pheno_models -__all__ = ["PUBLIC_MODEL_NAMES_TO_MODEL", "PUBLIC_MODEL_NAMES_TO_DATA_TYPE"] +__all__ = [ + "PUBLIC_MODEL_NAMES_TO_MODEL", + "PUBLIC_MODEL_NAMES_TO_DATA_TYPE", + "PublicModelNames" +] -PUBLIC_MODEL_NAMES_TO_MODEL: dict[str, Type[Model]] = { +PublicModelNames = Literal["individual", "biosample", "experiment"] + +PUBLIC_MODEL_NAMES_TO_MODEL: dict[PublicModelNames, Type[Model]] = { "individual": patient_models.Individual, "biosample": pheno_models.Biosample, "experiment": exp_models.Experiment, } -PUBLIC_MODEL_NAMES_TO_DATA_TYPE = { +PUBLIC_MODEL_NAMES_TO_DATA_TYPE: dict[PublicModelNames, str] = { "individual": DATA_TYPE_PHENOPACKET, "biosample": DATA_TYPE_PHENOPACKET, "experiment": DATA_TYPE_EXPERIMENT, } + + +class ScopeFilter(TypedDict): + filter: str + prefetch_related: tuple[str, ...] + select_related: tuple[str, ...] + + +class ProjectDatasetScopeFilters(TypedDict): + project: ScopeFilter + dataset: ScopeFilter + + +PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS: dict[PublicModelNames, ProjectDatasetScopeFilters] = { + "individual": { + "project": { + "filter": "phenopackets__dataset__project__identifier", + "prefetch_related": ("phenopackets__dataset__project",) + }, + "dataset": { + "filter": "phenopackets__dataset__identifier", + "prefetch_related": ("phenopackets__dataset",) + }, + }, + "biosample": { + "project": { + "filter": "phenopacket__dataset__project__identifier", + "prefetch_related": ("phenopacket__dataset__project",), + }, + "dataset": { + "filter": "phenopacket__dataset__identifier", + "prefetch_related": ("phenopacket__dataset",), + }, + }, + "experiment": { + "project": { + "filter": "dataset__project__identifier", + "prefetch_related": ("dataset__project",), + }, + "dataset": { + "filter": "dataset__identifier", + "prefetch_related": ("dataset",), + }, + }, +} diff --git a/chord_metadata_service/discovery/schemas.py b/chord_metadata_service/discovery/schemas.py new file mode 100644 index 000000000..9fa5aac7d --- /dev/null +++ b/chord_metadata_service/discovery/schemas.py @@ -0,0 +1,101 @@ +from django.conf import settings +from chord_metadata_service.restapi.schema_utils import ( + sub_schema_uri, array_of, base_type, SchemaTypes, enum_of +) + +discovery_base_uri = sub_schema_uri(settings.SCHEMAS_BASE_URL, "discovery") + +DISCOVERY_FIELD_SCHEMA = { + "description": "Field configuration", + "type": "object", + "properties": { + "mapping": base_type(SchemaTypes.STRING), + "mapping_for_search_filter": base_type(SchemaTypes.STRING), + "group_by": base_type(SchemaTypes.STRING), + "group_by_value": base_type(SchemaTypes.STRING), + "value_mapping": base_type(SchemaTypes.STRING), + "title": base_type(SchemaTypes.STRING), + "description": base_type(SchemaTypes.STRING), + "datatype": enum_of(["number", "string", "date"]), + "config": { + "type": "object", + "properties": { + # datatype == string + "enum": { + "oneOf": [ + # either an array of strings, or null + array_of(base_type(SchemaTypes.STRING)), + base_type(SchemaTypes.NULL) + ] + }, + # datatype == number + "bins": array_of(base_type(SchemaTypes.NUMBER)), + "bin_size": base_type(SchemaTypes.NUMBER), + "taper_left": base_type(SchemaTypes.NUMBER), + "taper_right": base_type(SchemaTypes.NUMBER), + "units": base_type(SchemaTypes.STRING), + "minimum": base_type(SchemaTypes.NUMBER), + "maximum": base_type(SchemaTypes.NUMBER), + } + } + }, + "additionalProperties": False +} + +DISCOVERY_NAMED_FIELDS_SCHEMA = { + "description": "Intermediate schema, enforces field schema with flexible names.", + "type": "object", + "patternProperties": { + "^.*$": DISCOVERY_FIELD_SCHEMA + }, + "additionalProperties": False +} + +DISCOVERY_OVERVIEW_CHART_SCHEMA = { + "description": "Associates a field name with a chart type for overview display", + "type": "object", + "properties": { + "field": base_type(SchemaTypes.STRING), + "chart_type": enum_of(["bar", "pie"]) + }, + "additionalProperties": False +} + +DISCOVERY_OVERVIEW_SCHEMA = { + "description": "An overview section containing charts", + "type": "object", + "properties": { + "section_title": base_type(SchemaTypes.STRING), + "charts": array_of(DISCOVERY_OVERVIEW_CHART_SCHEMA) + }, + "additionalProperties": False +} + +DISCOVERY_SEARCH_SCHEMA = { + "description": "Groups search fields by section.", + "type": "object", + "properties": { + "section_title": base_type(SchemaTypes.STRING), + "fields": array_of(base_type(SchemaTypes.STRING)) + }, + "additionalProperties": False +} + +DISCOVERY_SCHEMA = { + "$id": discovery_base_uri, + "description": "Discovery configuration for public fields/search", + "type": "object", + "properties": { + "overview": array_of(DISCOVERY_OVERVIEW_SCHEMA, "List of overview sections"), + "search": array_of(DISCOVERY_SEARCH_SCHEMA), + "fields": DISCOVERY_NAMED_FIELDS_SCHEMA, + "rules": { + "type": "object", + "properties": { + "count_threshold": base_type(SchemaTypes.INTEGER), + "max_query_parameters": base_type(SchemaTypes.INTEGER) + } + } + }, + "additionalProperties": False +} diff --git a/chord_metadata_service/discovery/stats.py b/chord_metadata_service/discovery/stats.py index 8fe4eecef..9200e393c 100644 --- a/chord_metadata_service/discovery/stats.py +++ b/chord_metadata_service/discovery/stats.py @@ -3,8 +3,9 @@ from typing import Mapping, Type from .censorship import thresholded_count -from .types import BinWithValue -from .fields_utils import get_jsonb_path_query +from .fields_utils import get_jsonb_path_query, get_public_model_name +from .model_lookups import PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS +from .types import BinWithValue, DiscoveryConfig __all__ = [ "individual_experiment_type_stats", @@ -12,11 +13,12 @@ "bento_public_format_count_and_stats_list", "stats_for_field", "queryset_stats_for_field", + "get_scoped_queryset", ] async def individual_experiment_type_stats( - queryset: QuerySet, low_counts_censored: bool + queryset: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool ) -> tuple[int, list[BinWithValue]]: """ Used for a fixed-response public API and beacon. @@ -27,12 +29,13 @@ async def individual_experiment_type_stats( queryset .values(label=F("phenopackets__biosamples__experiment__experiment_type")) .annotate(value=Count("phenopackets__biosamples__experiment", distinct=True)), + discovery, low_counts_censored, ) async def individual_biosample_tissue_stats( - queryset: QuerySet, low_counts_censored: bool + queryset: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool ) -> tuple[int, list[BinWithValue]]: """ Used for a fixed-response public API and beacon. @@ -42,12 +45,14 @@ async def individual_biosample_tissue_stats( queryset .values(label=F("phenopackets__biosamples__sampled_tissue__label")) .annotate(value=Count("phenopackets__biosamples", distinct=True)), + discovery, low_counts_censored, ) async def bento_public_format_count_and_stats_list( annotated_queryset: QuerySet, + discovery: DiscoveryConfig, low_counts_censored: bool, ) -> tuple[int, list[BinWithValue]]: stats_list: list[BinWithValue] = [] @@ -55,7 +60,7 @@ async def bento_public_format_count_and_stats_list( async for q in annotated_queryset: label = q["label"] - value = thresholded_count(int(q["value"]), low_counts_censored) + value = thresholded_count(int(q["value"]), discovery, low_counts_censored) # Be careful not to leak values if they're in the database but below threshold if value == 0: @@ -68,26 +73,54 @@ async def bento_public_format_count_and_stats_list( total += value stats_list.append({"label": label, "value": value}) - return thresholded_count(total, low_counts_censored), stats_list + return thresholded_count(total, discovery, low_counts_censored), stats_list + + +def get_scoped_queryset( + model: Type[Model], + project_id: str | None = None, + dataset_id: str | None = None, +) -> QuerySet: + if project_id and not dataset_id: + scope = "project" + value = project_id + elif project_id and dataset_id: + scope = "dataset" + value = dataset_id + else: + return model.objects.all() + model_name = get_public_model_name(model) + prefetch = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[model_name][scope]["prefetch_related"] + filter_query = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[model_name][scope]["filter"] + return model.objects.prefetch_related(*prefetch).filter(**{filter_query: value}) async def stats_for_field( model: Type[Model], field: str, + discovery: DiscoveryConfig, low_counts_censored: bool, add_missing: bool = False, - group_by=None, + group_by: str | None = None, + project_id: str | None = None, + dataset_id: str | None = None, ) -> Mapping[str, int]: """ Computes counts of distinct values for a given field. Mainly applicable to char fields representing categories """ + qs = get_scoped_queryset(model, project_id, dataset_id) return await queryset_stats_for_field( - model.objects.all(), field, low_counts_censored=low_counts_censored, add_missing=add_missing, group_by=group_by) + qs, field, discovery, low_counts_censored=low_counts_censored, add_missing=add_missing, group_by=group_by) async def queryset_stats_for_field( - queryset: QuerySet, field: str, low_counts_censored: bool, add_missing: bool = False, group_by: str = None + queryset: QuerySet, + field: str, + discovery: DiscoveryConfig, + low_counts_censored: bool, + add_missing: bool = False, + group_by: str | None = None ) -> Mapping[str, int]: """ Computes counts of distinct values for a queryset. @@ -119,12 +152,12 @@ async def queryset_stats_for_field( # Censor low cell counts if necessary - we don't want to betray that the value even exists in the database if # we have a low count for it. - if thresholded_count(item["total"], low_counts_censored) == 0: + if thresholded_count(item["total"], discovery, low_counts_censored) == 0: continue stats[key] = item["total"] if add_missing: - stats["missing"] = thresholded_count(num_missing, low_counts_censored) + stats["missing"] = thresholded_count(num_missing, discovery, low_counts_censored) return stats diff --git a/chord_metadata_service/discovery/tests/constants.py b/chord_metadata_service/discovery/tests/constants.py index 369f9c8ff..95dcae2e2 100644 --- a/chord_metadata_service/discovery/tests/constants.py +++ b/chord_metadata_service/discovery/tests/constants.py @@ -1,6 +1,6 @@ from copy import deepcopy -CONFIG_PUBLIC_TEST = { +DISCOVERY_CONFIG_TEST = { "overview": [ { "section_title": "First Section", @@ -11,34 +11,17 @@ }, { "section_title": "Second Section", - "charts": [ - {"field": "date_of_consent", "chart_type": "bar"}, - {"field": "smoking", "chart_type": "bar"}, - {"field": "baseline_creatinine", "chart_type": "bar"}, - ] - }, - { - "section_title": "Third Section", - "charts": [ - {"field": "lab_test_result_value", "chart_type": "bar"}, - ] - }, - { - "section_title": "Fourth Section", "charts": [ {"field": "diagnostic_markers", "chart_type": "pie"}, - {"field": "measurement_tumor_length", "chart_type": "bar"} + {"field": "measurement_tumor_length", "chart_type": "bar"}, + {"field": "tissues", "chart_type": "pie"} ] } ], "search": [ { "section_title": "First Section", - "fields": [ - "sex", "age", "smoking", "covidstatus", "death_dc", - "lab_test_result_value", "baseline_creatinine", "date_of_consent", - "tissues" - ] + "fields": ["sex", "age", "tissues"] } ], "fields": { @@ -65,7 +48,53 @@ "maximum": 100 } }, - "smoking": { + "tissues": { + "mapping": "biosample/sampled_tissue/label", + "mapping_for_search_filter": "individual/biosamples/sampled_tissue/label", + "title": "Tissue", + "description": "Tissue from which the biosample was extracted", + "datatype": "string", + "config": { + "enum": None + } + }, + "diagnostic_markers": { + "mapping": "individual/phenopackets/biosamples/diagnostic_markers", + "group_by": "label", + "title": "Diagnostic Markers", + "description": "Markers used for diagnosis", + "datatype": "string", + "config": { + "enum": None + } + }, + "measurement_tumor_length": { + "mapping": "individual/phenopackets/measurements", + "group_by": "assay/id", + "group_by_value": "NCIT:C200479", + "value_mapping": "value/quantity/value", + "title": "Tumor lengths", + "description": "measured tumor lengths in millimeters", + "datatype": "number", + "config": { + "minimum": 0, + "maximum": 200, + "bin_size": 20, + "taper_left": 0, + "taper_right": 200, + "units": "mm" + } + }, + }, + "rules": { + "count_threshold": 5, + "max_query_parameters": 2 + } +} + +DISCOVERY_CONFIG_EXTRA_PROPERTIES = deepcopy(DISCOVERY_CONFIG_TEST) +DISCOVERY_CONFIG_EXTRA_PROPERTIES["fields"].update([ + ("smoking", { "mapping": "individual/extra_properties/smoking", "title": "Smoking", "description": "Smoking exposure", @@ -79,8 +108,8 @@ "Not specified" ] } - }, - "covidstatus": { + }), + ("covidstatus", { "mapping": "individual/extra_properties/covidstatus", "title": "Covid status", "description": "Covid status", @@ -92,8 +121,8 @@ "Indeterminate" ] } - }, - "death_dc": { + }), + ("death_dc", { "mapping": "individual/extra_properties/death_dc", "title": "Death", "description": "Death status", @@ -104,8 +133,8 @@ "Deceased" ] } - }, - "lab_test_result_value": { + }), + ("lab_test_result_value", { "mapping": "individual/extra_properties/lab_test_result_value", "title": "Lab Test Result", "description": "This acts as a placeholder for numeric values", @@ -115,8 +144,8 @@ "minimum": 0, "units": "mg/L" } - }, - "baseline_creatinine": { + }), + ("baseline_creatinine", { "mapping": "individual/extra_properties/baseline_creatinine", "title": "Creatinine", "description": "Baseline Creatinine", @@ -129,8 +158,8 @@ "maximum": 600, "units": "mg/L" } - }, - "date_of_consent": { + }), + ("date_of_consent", { "mapping": "individual/extra_properties/date_of_consent", "title": "Verbal consent date", "description": "Date of initial verbal consent(participant, legal representative or tutor), yyyy-mm-dd", @@ -138,55 +167,32 @@ "config": { "bin_by": "month" } - }, - "tissues": { - "mapping": "biosample/sampled_tissue/label", - "mapping_for_search_filter": "individual/biosamples/sampled_tissue/label", - "title": "Tissue", - "description": "Tissue from which the biosample was extracted", - "datatype": "string", - "config": { - "enum": None - } - }, - "diagnostic_markers": { - "mapping": "individual/phenopackets/biosamples/diagnostic_markers", - "group_by": "label", - "title": "Diagnostic Markers", - "description": "Markers used for diagnosis", - "datatype": "string", - "config": { - "enum": None - } - }, - "measurement_tumor_length": { - "mapping": "individual/phenopackets/measurements", - "group_by": "assay/id", - "group_by_value": "NCIT:C200479", - "value_mapping": "value/quantity/value", - "title": "Tumor lengths", - "description": "measured tumor lengths in millimeters", - "datatype": "number", - "config": { - "minimum": 0, - "maximum": 200, - "bin_size": 20, - "taper_left": 0, - "taper_right": 200, - "units": "mm" - } - }, + }) +]) +DISCOVERY_CONFIG_EXTRA_PROPERTIES["search"][0]["fields"].extend( + ["smoking", "covidstatus", "death_dc", "lab_test_result_value", "baseline_creatinine", "date_of_consent"] +) +DISCOVERY_CONFIG_EXTRA_PROPERTIES["overview"].extend([ + { + "section_title": "Dataset individual exta properties specific Section", + "charts": [ + {"field": "date_of_consent", "chart_type": "bar"}, + {"field": "smoking", "chart_type": "bar"}, + {"field": "baseline_creatinine", "chart_type": "bar"}, + ] }, - "rules": { - "count_threshold": 5, - "max_query_parameters": 2 - } -} + { + "section_title": "Lab test results section", + "charts": [ + {"field": "lab_test_result_value", "chart_type": "bar"}, + ] + }, +]) -CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY = deepcopy(DISCOVERY_CONFIG_TEST) CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY["search"][0]["fields"] = ["sex"] -CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS = deepcopy(DISCOVERY_CONFIG_TEST) CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS["fields"].update([ ("unset_date", { diff --git a/chord_metadata_service/discovery/tests/test_api.py b/chord_metadata_service/discovery/tests/test_api.py index b7332fc2a..e60b836f9 100644 --- a/chord_metadata_service/discovery/tests/test_api.py +++ b/chord_metadata_service/discovery/tests/test_api.py @@ -1,16 +1,20 @@ import json import os from copy import deepcopy +import uuid from django.conf import settings from django.urls import reverse -from django.test import override_settings +from django.test import TestCase, override_settings from rest_framework import status from rest_framework.test import APITestCase from chord_metadata_service.chord import models as ch_m from chord_metadata_service.chord.tests import constants as ch_c from chord_metadata_service.discovery import responses as dres +from chord_metadata_service.discovery.censorship import RULES_NO_PERMISSIONS +from chord_metadata_service.discovery.schemas import DISCOVERY_SCHEMA +from chord_metadata_service.discovery.types import DiscoveryConfig from chord_metadata_service.phenopackets import models as ph_m from chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.experiments import models as exp_m @@ -21,10 +25,54 @@ INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST, INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT ) -from .constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS +from .constants import ( + CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY, + DISCOVERY_CONFIG_TEST, + CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS, + DISCOVERY_CONFIG_EXTRA_PROPERTIES +) + + +class ScopedDiscoveryTestCase(TestCase): + + @classmethod + def setUpTestData(cls) -> None: + # fallback on the node's discovery config + cls.project_a = ch_m.Project.objects.create( + title="Test project A", + description="test description", + discovery={}, + ) + cls.id_proj_a = cls.project_a.identifier + # use provided dataset discovery config + cls.dataset_a = ch_m.Dataset.objects.create( + title="Dataset 1", + description="Test dataset", + data_use=ch_c.VALID_DATA_USE_1, + project=cls.project_a, + discovery=DISCOVERY_CONFIG_EXTRA_PROPERTIES, + ) + cls.id_ds_a = cls.dataset_a.identifier + # use provided project discovery config + cls.project_b = ch_m.Project.objects.create( + title="Test project B", + description="test description", + discovery=CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY, + ) + cls.id_proj_b = cls.project_b.identifier + # Should fallback on project's discovery config + cls.dataset_b = ch_m.Dataset.objects.create( + title="Dataset 2", + description="Test dataset 2", + data_use=ch_c.VALID_DATA_USE_1, + project=cls.project_b, + discovery={}, + ) + cls.id_ds_b = cls.dataset_b.identifier -class PublicSearchFieldsTest(APITestCase): + +class PublicSearchFieldsTest(APITestCase, ScopedDiscoveryTestCase): def setUp(self) -> None: # create 2 phenopackets for 2 individuals; each individual has 1 biosample; @@ -32,7 +80,8 @@ def setUp(self) -> None: self.individual_1 = ph_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) self.phenopacket_1 = ph_m.Phenopacket.objects.create( - **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) + **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1), + dataset=self.dataset_a ) self.disease = ph_m.Disease.objects.create(**ph_c.VALID_DISEASE_1) self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) @@ -44,19 +93,88 @@ def setUp(self) -> None: # experiments self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) - self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) + self.experiment = exp_m.Experiment.objects.create( + **exp_c.valid_experiment(self.biosample_1, self.instrument, dataset=self.dataset_a) + ) self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) self.experiment.experiment_results.set([self.experiment_result]) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_public_search_fields_configured(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() + def assert_response_section_fields(self, response_obj: dict, config: dict): self.assertSetEqual( set(field["id"] for section in response_obj["sections"] for field in section["fields"]), - set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) + set(field for section in config["search"] for field in section["fields"]) + ) + + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) + def test_public_search_fields_configured(self): + search_fields_url = reverse("public-search-fields") + # SCOPE: whole node + response = self.client.get(search_fields_url, content_type="application/json") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assert_response_section_fields(response.json(), settings.CONFIG_PUBLIC) + + # SCOPE: project_a (same discovery as whole node) + response_p_a = self.client.get( + f"{search_fields_url}?project={str(self.id_proj_a)}", + content_type="application/json", + ) + self.assertEqual(response_p_a.status_code, status.HTTP_200_OK) + self.assert_response_section_fields(response_p_a.json(), settings.CONFIG_PUBLIC) + + # SCOPE: project_b (discovery search sex only) + response_p_b = self.client.get( + f"{search_fields_url}?project={str(self.id_proj_b)}", + content_type="application/json", + ) + self.assertEqual(response_p_b.status_code, status.HTTP_200_OK) + self.assert_response_section_fields(response_p_b.json(), CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY) + + # SCOPE: dataset_a (discovery with dataset specific extra_properties) + response_d_a = self.client.get( + f"{search_fields_url}?dataset={str(self.id_ds_a)}", + content_type="application/json", + ) + self.assertEqual(response_d_a.status_code, status.HTTP_200_OK) + self.assert_response_section_fields(response_d_a.json(), DISCOVERY_CONFIG_EXTRA_PROPERTIES) + + # SCOPE: non existing dataset + response_d_invalid = self.client.get( + f"{search_fields_url}?dataset={uuid.uuid4()}", + content_type="application/json", + ) + self.assertEqual(response_d_invalid.status_code, status.HTTP_404_NOT_FOUND) + + # SCOPE: non existing project + response_p_invalid = self.client.get( + f"{search_fields_url}?project={uuid.uuid4()}", + content_type="application/json", + ) + self.assertEqual(response_p_invalid.status_code, status.HTTP_404_NOT_FOUND) + + # SCOPE: dataset_b + response_d_b = self.client.get( + f"{search_fields_url}?dataset={self.id_ds_b}", + content_type="application/json", + ) + self.assertEqual(response_d_b.status_code, status.HTTP_200_OK) + # fallback on project's config, responses should be the same + self.assertEqual(response_d_b.json(), response_p_b.json()) + + # SCOPE: project_a + dataset_b (invalid) + response_pd_invalid = self.client.get( + f"{search_fields_url}?project={str(self.id_proj_a)}&dataset={self.id_ds_b}", + content_type="application/json", + ) + self.assertEqual(response_pd_invalid.status_code, status.HTTP_404_NOT_FOUND) + + # SCOPE: project_a + dataset_a (valid) + response_pd_valid = self.client.get( + f"{search_fields_url}?project={str(self.id_proj_a)}&dataset={self.id_ds_a}", + content_type="application/json", ) + self.assertEqual(response_pd_valid.status_code, status.HTTP_200_OK) + # same as dataset_a + self.assertEqual(response_pd_valid.json(), response_d_a.json()) @override_settings(CONFIG_PUBLIC={}) def test_public_search_fields_not_configured(self): @@ -70,20 +188,29 @@ def test_public_search_fields_not_configured(self): def test_public_search_fields_missing_extra_properties(self): response = self.client.get(reverse("public-search-fields"), content_type="application/json") self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertSetEqual( - set(field["id"] for section in response_obj["sections"] for field in section["fields"]), - set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) - ) + self.assert_response_section_fields(response.json(), settings.CONFIG_PUBLIC) -class PublicOverviewTest(APITestCase): +class PublicOverviewTest(APITestCase, ScopedDiscoveryTestCase): def setUp(self) -> None: + self.url = '/api/public_overview' + # individuals (count 8) individuals = { f"individual_{i}": ph_m.Individual.objects.create(**ind) for i, ind in enumerate(VALID_INDIVIDUALS, start=1) } + # all individuals are in phenopackets that belong to dataset_a + phenopackets = { + f"phenopacket_{ind_label}": ph_m.Phenopacket.objects.create( + **ph_c.valid_phenopacket( + subject=ind, + meta_data=ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1), + id=f"phenopacket_{ind_label}" + ), + dataset=self.dataset_a + ) for ind_label, ind in individuals.items() + } # biosamples self.biosample_1 = ph_m.Biosample.objects.create( **ph_c.valid_biosample_1(individuals["individual_1"]) @@ -91,30 +218,93 @@ def setUp(self) -> None: self.biosample_2 = ph_m.Biosample.objects.create( **ph_c.valid_biosample_2(individuals["individual_2"]) ) + phenopackets["phenopacket_individual_1"].biosamples.set([self.biosample_1]) + phenopackets["phenopacket_individual_2"].biosamples.set([self.biosample_2]) # experiments self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) - self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) + self.experiment = exp_m.Experiment.objects.create( + **exp_c.valid_experiment(self.biosample_1, self.instrument, dataset=self.dataset_a) + ) self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) self.experiment.experiment_results.set([self.experiment_result]) # make a copy and create experiment 2 - experiment_2 = deepcopy(exp_c.valid_experiment(self.biosample_2, self.instrument)) + experiment_2 = deepcopy(exp_c.valid_experiment(self.biosample_2, self.instrument, dataset=self.dataset_a)) experiment_2["id"] = "experiment:2" self.experiment = exp_m.Experiment.objects.create(**experiment_2) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + self.data_type_counts: dict[str, int] = { + "individuals": ph_m.Individual.objects.all().count(), + "biosamples": ph_m.Biosample.objects.all().count(), + "experiments": exp_m.Experiment.objects.all().count() + } + + def assert_counts_censored(self, overview_response: dict, discovery: DiscoveryConfig): + count_threshold = discovery["rules"]["count_threshold"] + for data_type in self.data_type_counts.keys(): + response_count = overview_response["counts"][data_type] + if response_count < count_threshold: + self.assertEqual(response_count, 0) + else: + self.assertEqual(response_count, self.data_type_counts[data_type]) + + def assert_scoped_fields(self, overview_response: dict, discovery: DiscoveryConfig): + self.assertSetEqual( + set(field for field in overview_response["fields"].keys()), + set(chart["field"] for section in discovery["overview"] for chart in section["charts"]) + ) + + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_overview(self): - response = self.client.get('/api/public_overview') - response_obj = response.json() - db_count = ph_m.Individual.objects.all().count() + node_discovery = settings.CONFIG_PUBLIC + # SCOPE: whole node + response_whole = self.client.get(self.url) + response_whole_obj = response_whole.json() + self.assertEqual(response_whole.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_whole_obj, dict) + self.assert_counts_censored(response_whole_obj, node_discovery) + self.assert_scoped_fields(response_whole_obj, node_discovery) + + # SCOPE: project_a (whole node fallback) + response_p_a = self.client.get(f"{self.url}?project={self.id_proj_a}") + self.assertEqual(response_p_a.status_code, status.HTTP_200_OK) + self.assertEqual(response_p_a.json(), response_whole_obj) + self.assert_counts_censored(response_whole_obj, node_discovery) + self.assert_scoped_fields(response_whole_obj, node_discovery) + + # SCOPE: dataset_a + response_d_a = self.client.get(f"{self.url}?dataset={self.id_ds_a}") + self.assertEqual(response_d_a.status_code, status.HTTP_200_OK) + self.assert_counts_censored(response_d_a.json(), self.dataset_a.discovery) + self.assert_scoped_fields(response_d_a.json(), self.dataset_a.discovery) + + # SCOPE: project_b + response_p_b = self.client.get(f"{self.url}?project={self.id_proj_b}") + self.assertEqual(response_p_b.status_code, status.HTTP_200_OK) + self.assert_counts_censored(response_p_b.json(), self.project_b.discovery) + self.assert_scoped_fields(response_p_b.json(), self.project_b.discovery) + + # SCOPE: dataset_b (project_b fallback) + response_d_b = self.client.get(f"{self.url}?dataset={self.id_ds_b}") + self.assertEqual(response_d_b.status_code, status.HTTP_200_OK) + self.assert_counts_censored(response_d_b.json(), self.project_b.discovery) + self.assert_scoped_fields(response_d_b.json(), self.project_b.discovery) + + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) + def test_overview_project_dataset(self): + # SCOPE: project_a + dataset_a + response = self.client.get(f"{self.url}?project={self.id_proj_a}&dataset={self.id_ds_a}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj["counts"]["individuals"], db_count) + self.assert_scoped_fields(response.json(), self.dataset_a.discovery) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + # SCOPE: project_a + dataset_b (invalid) + response_invalid = self.client.get(f"{self.url}?project={self.id_proj_a}&dataset={self.id_ds_b}") + self.assertEqual(response_invalid.status_code, status.HTTP_404_NOT_FOUND) + + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_overview_bins(self): # test that there is the correct number of data entries for number # histograms, vs. number of bins - response = self.client.get('/api/public_overview') + response = self.client.get(self.url) response_obj = response.json() self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) @@ -127,7 +317,7 @@ def test_overview_bins(self): @override_settings(CONFIG_PUBLIC={}) def test_overview_no_config(self): - response = self.client.get('/api/public_overview') + response = self.client.get(self.url) response_obj = response.json() self.assertIsInstance(response_obj, dict) self.assertEqual(response_obj, dres.NO_PUBLIC_DATA_AVAILABLE) @@ -136,14 +326,15 @@ def test_overview_no_config(self): class PublicOverviewTest2(APITestCase): def setUp(self) -> None: + self.url = '/api/public_overview' # create only 2 individuals for ind in VALID_INDIVIDUALS[:2]: ph_m.Individual.objects.create(**ind) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_overview_response(self): # test overview response when individuals count < threshold - response = self.client.get('/api/public_overview') + response = self.client.get(self.url) response_obj = response.json() self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) @@ -152,7 +343,7 @@ def test_overview_response(self): @override_settings(CONFIG_PUBLIC={}) def test_overview_response_no_config(self): # test overview response when individuals count < threshold - response = self.client.get('/api/public_overview') + response = self.client.get(self.url) response_obj = response.json() self.assertIsInstance(response_obj, dict) self.assertEqual(response_obj, dres.NO_PUBLIC_DATA_AVAILABLE) @@ -165,7 +356,7 @@ def setUp(self) -> None: for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST: ph_m.Individual.objects.create(**ind) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_overview_response(self): # test overview response with passing TypeError exception response = self.client.get('/api/public_overview') @@ -190,7 +381,7 @@ def setUp(self) -> None: for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT: ph_m.Individual.objects.create(**ind) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_overview_response(self): # test overview response with passing TypeError exception response = self.client.get('/api/public_overview') @@ -223,7 +414,7 @@ def setUp(self) -> None: dats_file=dats_content ) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_dataset(self): response = self.client.get(reverse("public-dataset")) response_obj = response.json() @@ -244,3 +435,50 @@ def test_public_dataset_response_no_config(self): response_obj = response.json() self.assertIsInstance(response_obj, dict) self.assertEqual(response_obj, dres.NO_PUBLIC_DATA_AVAILABLE) + + +class DiscoverySchemaTest(APITestCase): + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) + def test_discover_schema(self): + response = self.client.get(reverse("discovery-schema")) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), DISCOVERY_SCHEMA) + + +class DiscoveryRulesTest(ScopedDiscoveryTestCase): + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) + def test_discovery_rules(self): + # Node scope + url = reverse("public-rules") + response = self.client.get(url) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), DISCOVERY_CONFIG_TEST["rules"]) + + # PROJECTS + response_p_a = self.client.get(f"{url}?project={self.id_proj_a}") + self.assertEqual(response_p_a.status_code, status.HTTP_200_OK) + self.assertEqual(response_p_a.json(), DISCOVERY_CONFIG_TEST["rules"]) # node discovery fallback + + response_p_b = self.client.get(f"{url}?project={self.id_proj_b}") + self.assertEqual(response_p_b.status_code, status.HTTP_200_OK) + self.assertEqual(response_p_b.json(), self.project_b.discovery["rules"]) + + # Dataset scope + response_d_a = self.client.get(f"{url}?dataset={self.id_ds_a}") + self.assertEqual(response_d_a.status_code, status.HTTP_200_OK) + self.assertEqual(response_d_a.json(), self.dataset_a.discovery["rules"]) + + response_d_b = self.client.get(f"{url}?dataset={self.id_ds_b}") + self.assertEqual(response_d_b.status_code, status.HTTP_200_OK) + self.assertEqual(response_d_b.json(), self.project_b.discovery["rules"]) + + @override_settings(CONFIG_PUBLIC={}) + def test_discovery_exp(self): + # Node scope not configured + url = reverse("public-rules") + response = self.client.get(url) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), RULES_NO_PERMISSIONS) + + response_exp = self.client.get(f"{url}?project={self.id_proj_a}&dataset={self.id_ds_b}") + self.assertEqual(response_exp.status_code, status.HTTP_404_NOT_FOUND) diff --git a/chord_metadata_service/discovery/tests/test_censorship.py b/chord_metadata_service/discovery/tests/test_censorship.py index bdf757637..04719cccc 100644 --- a/chord_metadata_service/discovery/tests/test_censorship.py +++ b/chord_metadata_service/discovery/tests/test_censorship.py @@ -2,7 +2,7 @@ from django.test import TestCase, override_settings from chord_metadata_service.discovery.censorship import get_threshold, thresholded_count, get_max_query_parameters -from .constants import CONFIG_PUBLIC_TEST +from .constants import DISCOVERY_CONFIG_TEST class CensorshipGetThresholdTest(TestCase): @@ -10,33 +10,30 @@ class CensorshipGetThresholdTest(TestCase): # get_threshold(...) def test_get_threshold_no_censorship(self): - self.assertEqual(get_threshold(low_counts_censored=False), 0) + self.assertEqual(get_threshold({}, low_counts_censored=False), 0) @override_settings(CONFIG_PUBLIC={}) def test_get_threshold_no_config(self): # no public config configured - self.assertEqual(get_threshold(low_counts_censored=True), sys.maxsize) + self.assertEqual(get_threshold({}, low_counts_censored=True), sys.maxsize) class CensorshipThresholdedCountTest(TestCase): - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) def test_get_threshold_configured(self): - self.assertEqual(get_threshold(low_counts_censored=False), 0) - self.assertEqual(get_threshold(low_counts_censored=True), 5) + self.assertEqual(get_threshold(DISCOVERY_CONFIG_TEST, low_counts_censored=False), 0) + self.assertEqual(get_threshold(DISCOVERY_CONFIG_TEST, low_counts_censored=True), 5) # thresholded_count(...) def test_thresholded_count_no_censorship(self): - self.assertEqual(thresholded_count(1, low_counts_censored=False), 1) + self.assertEqual(thresholded_count(1, {}, low_counts_censored=False), 1) - @override_settings(CONFIG_PUBLIC={}) - def test_thresholded_count_no_config(self): # no public config configured - self.assertEqual(thresholded_count(100000, low_counts_censored=True), 0) + def test_thresholded_count_no_config(self): + self.assertEqual(thresholded_count(100000, {}, low_counts_censored=True), 0) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) def test_thresholded_count_configured(self): - self.assertEqual(thresholded_count(5, low_counts_censored=False), 5) - self.assertEqual(thresholded_count(5, low_counts_censored=True), 0) + self.assertEqual(thresholded_count(5, DISCOVERY_CONFIG_TEST, low_counts_censored=False), 5) + self.assertEqual(thresholded_count(5, DISCOVERY_CONFIG_TEST, low_counts_censored=True), 0) class CensorshipGetMaxQueryParametersTest(TestCase): @@ -44,13 +41,13 @@ class CensorshipGetMaxQueryParametersTest(TestCase): # get_max_query_parameters(...) def test_get_max_query_parameters_no_censorship(self): - self.assertEqual(get_max_query_parameters(low_counts_censored=False), sys.maxsize) + self.assertEqual(get_max_query_parameters({}, low_counts_censored=False), sys.maxsize) @override_settings(CONFIG_PUBLIC={}) def test_get_max_query_parameters_no_config(self): - self.assertEqual(get_max_query_parameters(low_counts_censored=True), 0) + self.assertEqual(get_max_query_parameters({}, low_counts_censored=True), 0) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_get_max_query_parameters_configured(self): - self.assertEqual(get_max_query_parameters(low_counts_censored=False), sys.maxsize) - self.assertEqual(get_max_query_parameters(low_counts_censored=True), 2) + self.assertEqual(get_max_query_parameters(DISCOVERY_CONFIG_TEST, low_counts_censored=False), sys.maxsize) + self.assertEqual(get_max_query_parameters(DISCOVERY_CONFIG_TEST, low_counts_censored=True), 2) diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py index 12c1fb1b5..8dc905b30 100644 --- a/chord_metadata_service/discovery/tests/test_fields.py +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -1,11 +1,14 @@ from django.test import TransactionTestCase, override_settings from rest_framework.test import APITestCase +from copy import deepcopy +from chord_metadata_service.chord.tests.helpers import ProjectTestCase +from chord_metadata_service.discovery.types import DiscoveryConfig from chord_metadata_service.patients import models as pa_m from chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.phenopackets import models as ph_m -from .constants import CONFIG_PUBLIC_TEST +from .constants import DISCOVERY_CONFIG_TEST from ..fields import ( get_field_options, get_categorical_stats, @@ -17,54 +20,59 @@ class TestGetFieldOptions(TransactionTestCase): - - field_some_prop = { - "datatype": "string", - "mapping": "individual/extra_properties/some_prop", - "title": "Some Prop", - "description": "Some property", - "config": { - "enum": ["a", "b"], - }, + discovery: DiscoveryConfig = { + "fields": { + "some_prop": { + "datatype": "string", + "mapping": "individual/extra_properties/some_prop", + "title": "Some Prop", + "description": "Some property", + "config": { + "enum": ["a", "b"], + }, + } + } } async def test_get_string_options(self): - self.assertListEqual(await get_field_options(self.field_some_prop, low_counts_censored=False), ["a", "b"]) + self.assertListEqual(await get_field_options("some_prop", self.discovery, False), ["a", "b"]) async def test_get_field_options_not_impl(self): + # {**self.field_some_prop, "datatype": "made_up"} + invalid_discovery = deepcopy(self.discovery) + invalid_discovery["fields"]["some_prop"]["datatype"] = "made_up" with self.assertRaises(NotImplementedError): # noinspection PyTypeChecker - await get_field_options({**self.field_some_prop, "datatype": "made_up"}, low_counts_censored=False) + await get_field_options("some_prop", invalid_discovery, low_counts_censored=False) -class TestGetCategoricalStats(TransactionTestCase): - - f_sex = { - "mapping": "individual/sex", - "datatype": "string", - "title": "Sex", - "description": "Sex", - "config": { - "enum": None, - }, - } +class TestGetCategoricalStats(ProjectTestCase): def setUp(self): self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + self.meta_data = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) + self.phenopacket = ph_m.Phenopacket.objects.create( + id="phenopacket_id:1", + subject=self.individual_1, + dataset=self.dataset, + meta_data=self.meta_data, + ) async def test_categorical_stats_lcf(self): - res = await get_categorical_stats(self.f_sex, low_counts_censored=False) + res = await get_categorical_stats("sex", DISCOVERY_CONFIG_TEST, project_id=self.project.identifier, + dataset_id=self.dataset.identifier, low_counts_censored=False) self.assertListEqual(res, [{"label": "MALE", "value": 1}, {"label": "missing", "value": 0}]) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) async def test_categorical_stats_lct(self): - res = await get_categorical_stats(self.f_sex, low_counts_censored=True) + res = await get_categorical_stats("sex", DISCOVERY_CONFIG_TEST, project_id=self.project.identifier, + dataset_id=self.dataset.identifier, low_counts_censored=True) self.assertListEqual(res, [{"label": "missing", "value": 0}]) class TestDateStatsExcept(APITestCase): - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) async def test_wrong_bin_config(self): fp = { "title": "Date of Consent", @@ -77,12 +85,12 @@ async def test_wrong_bin_config(self): } with self.assertRaises(NotImplementedError): - await get_date_stats(fp) + await get_date_stats("date_of_consent", DISCOVERY_CONFIG_TEST) with self.assertRaises(NotImplementedError): await get_month_date_range(fp) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) async def test_wrong_field_config(self): fp = { "title": "Date of Consent", @@ -95,17 +103,17 @@ async def test_wrong_field_config(self): } with self.assertRaises(NotImplementedError): - await get_date_stats(fp) + await get_date_stats("date_of_consent", DISCOVERY_CONFIG_TEST) with self.assertRaises(NotImplementedError): await get_month_date_range(fp) -class TestJsonFieldArrayStats(TransactionTestCase): +class TestJsonFieldArrayStats(ProjectTestCase): tumor_lengths = range(1, 50, 5) - dm_fp = CONFIG_PUBLIC_TEST["fields"]["diagnostic_markers"] - mtl_fp = CONFIG_PUBLIC_TEST["fields"]["measurement_tumor_length"] + dm_fp = DISCOVERY_CONFIG_TEST["fields"]["diagnostic_markers"] + mtl_fp = DISCOVERY_CONFIG_TEST["fields"]["measurement_tumor_length"] def setUp(self) -> None: self.tumors = [ph_c.valid_measurement_tumor_length(length) for length in self.tumor_lengths] @@ -117,12 +125,19 @@ def setUp(self) -> None: subject=self.individual, measurements=self.tumors, meta_data=self.meta_data, + dataset=self.dataset, ) self.phenopacket.biosamples.set([self.biosample]) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) async def test_json_categorical_stats_lcf(self): - res = await get_categorical_stats(self.dm_fp, low_counts_censored=False) + res = await get_categorical_stats( + "diagnostic_markers", + DISCOVERY_CONFIG_TEST, + project_id=self.project.identifier, + dataset_id=self.dataset.identifier, + low_counts_censored=False + ) ground_truth = [ {"label": "Genetic Testing", "value": 1}, {"label": "Hematology Test", "value": 1}, @@ -130,9 +145,15 @@ async def test_json_categorical_stats_lcf(self): ] self.assertListEqual(res, ground_truth) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) async def test_json_categorical_stats_lct(self): - res = await get_categorical_stats(self.dm_fp, low_counts_censored=True) + res = await get_categorical_stats( + "diagnostic_markers", + DISCOVERY_CONFIG_TEST, + project_id=self.project.identifier, + dataset_id=self.dataset.identifier, + low_counts_censored=True + ) ground_truth = [ {"label": "missing", "value": 0}, ] @@ -169,10 +190,10 @@ def test_filter_queryset_field_value_number(self): self.assertEqual(qs.count(), 0) async def test_get_distinct_values(self): - dm_values = await get_distinct_field_values(self.dm_fp, False) + dm_values = await get_distinct_field_values("diagnostic_markers", DISCOVERY_CONFIG_TEST, False) self.assertEqual(len(dm_values), 2) self.assertTrue("Genetic Testing" in dm_values) self.assertTrue("Hematology Test" in dm_values) - dm_values_censored = await get_distinct_field_values(self.dm_fp, True) + dm_values_censored = await get_distinct_field_values("diagnostic_markers", DISCOVERY_CONFIG_TEST, True) self.assertListEqual(dm_values_censored, []) diff --git a/chord_metadata_service/discovery/tests/test_fields_utils.py b/chord_metadata_service/discovery/tests/test_fields_utils.py index b944ce7d1..d20407152 100644 --- a/chord_metadata_service/discovery/tests/test_fields_utils.py +++ b/chord_metadata_service/discovery/tests/test_fields_utils.py @@ -2,11 +2,13 @@ from django.db.models import Q from django.db.models.base import ModelBase -from chord_metadata_service.discovery.tests.constants import CONFIG_PUBLIC_TEST +from chord_metadata_service.discovery.tests.constants import DISCOVERY_CONFIG_TEST +from chord_metadata_service.discovery.model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL from ..fields_utils import ( get_jsonb_path_query, get_json_range_condition, get_model_and_field, + get_public_model_name, labelled_range_generator, get_nested_json_condition ) @@ -30,6 +32,14 @@ def test_get_model_nested_field(self): def test_get_wrong_model(self): self.assertRaises(NotImplementedError, get_model_and_field, "junk/age_numeric") + def test_get_public_model_name(self): + for name, model in PUBLIC_MODEL_NAMES_TO_MODEL.items(): + model_name = get_public_model_name(model) + self.assertEqual(name, model_name) + + def test_get_public_model_name_wrong(self): + self.assertRaises(NotImplementedError, get_public_model_name, ModelBase) + class TestLabelledRangeGenerator(TestCase): def setUp(self): @@ -201,7 +211,7 @@ def test_get_nested_json_condition(self): }) def test_get_json_range_condition(self): - field_props = CONFIG_PUBLIC_TEST["fields"]["measurement_tumor_length"] + field_props = DISCOVERY_CONFIG_TEST["fields"]["measurement_tumor_length"] # GTE 0 an LT 20 json_range_condition_0_20 = get_json_range_condition(field_props, min=0, max=20) diff --git a/chord_metadata_service/discovery/tests/test_stats.py b/chord_metadata_service/discovery/tests/test_stats.py index 5796d0eff..3425f3849 100644 --- a/chord_metadata_service/discovery/tests/test_stats.py +++ b/chord_metadata_service/discovery/tests/test_stats.py @@ -29,11 +29,11 @@ def setUp(self): self.experiment.experiment_results.set([self.experiment_result]) async def test_individual_biosample_tissue_stats(self): - count, res = await individual_biosample_tissue_stats(pa_m.Individual.objects.all(), low_counts_censored=False) + count, res = await individual_biosample_tissue_stats(pa_m.Individual.objects.all(), None, False) self.assertEqual(count, 1) self.assertListEqual(res, [{"label": "wall of urinary bladder", "value": 1}]) async def individual_experiment_type_stats(self): - count, res = await individual_experiment_type_stats(pa_m.Individual.objects.all(), low_counts_censored=False) + count, res = await individual_experiment_type_stats(pa_m.Individual.objects.all(), None, False) self.assertEqual(count, 1) self.assertListEqual(res, [{"label": "DNA Methylation", "value": 1}]) diff --git a/chord_metadata_service/discovery/types.py b/chord_metadata_service/discovery/types.py index ee41bfd81..f6677c739 100644 --- a/chord_metadata_service/discovery/types.py +++ b/chord_metadata_service/discovery/types.py @@ -1,11 +1,12 @@ from typing import Any, Literal, TypedDict __all__ = [ - "OverviewSectionChart", - "OverviewSection", + "BinWithValue", "DiscoveryFieldProps", "DiscoveryRules", - "BinWithValue", + "DiscoveryConfig", + "OverviewSectionChart", + "OverviewSection", ] @@ -38,6 +39,13 @@ class DiscoveryRules(TypedDict): count_threshold: int +class DiscoveryConfig(TypedDict): + overview: list[OverviewSection] + search: list[SearchSection] + fields: dict[str, DiscoveryFieldProps] + rules: DiscoveryRules + + class BinWithValue(TypedDict): label: str value: int diff --git a/chord_metadata_service/discovery/utils.py b/chord_metadata_service/discovery/utils.py new file mode 100644 index 000000000..8cfc9ac42 --- /dev/null +++ b/chord_metadata_service/discovery/utils.py @@ -0,0 +1,58 @@ +from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist +from rest_framework.request import Request as DrfRequest + +from chord_metadata_service.discovery.exceptions import DiscoveryConfigException +from chord_metadata_service.discovery.types import DiscoveryConfig +from ..chord import models as cm + +__all__ = [ + "get_discovery", + "get_request_discovery", +] + + +async def _get_project_discovery(project_id: str | None = None, project: cm.Project | None = None) -> dict: + if not project and project_id: + # retrieve project by ID if not provided + project = await cm.Project.objects.aget(identifier=project_id) + if not project.discovery: + # fallback on global discovery config if project has none + return settings.CONFIG_PUBLIC + return project.discovery + + +async def _get_dataset_discovery(dataset_id: str) -> dict: + dataset = await cm.Dataset.objects.aget(identifier=dataset_id) + if not dataset.discovery: + project = await cm.Project.objects.aget(datasets=dataset_id) + return await _get_project_discovery(project=project) + return dataset.discovery + + +async def get_discovery(project_id: str | None = None, dataset_id: str | None = None) -> DiscoveryConfig: + if dataset_id and project_id: + # check if the dataset belongs to the project + is_scope_valid = await cm.Dataset.objects.filter( + identifier=dataset_id, + project__identifier=project_id, + ).aexists() + if not is_scope_valid: + raise DiscoveryConfigException(dataset_id, project_id) + try: + if dataset_id: + # get dataset's discovery config if dataset_id is passed + return await _get_dataset_discovery(dataset_id) + elif project_id: + # get project's discovery config if project_id is passed and dataset_id is not + return await _get_project_discovery(project_id=project_id) + except ObjectDoesNotExist: + raise DiscoveryConfigException(dataset_id, project_id) + # fallback to config.json when no dataset or project is in the request + return settings.CONFIG_PUBLIC + + +async def get_request_discovery(request: DrfRequest) -> DiscoveryConfig: + dataset_id = request.query_params.get("dataset") + project_id = request.query_params.get("project") + return await get_discovery(project_id, dataset_id) diff --git a/chord_metadata_service/experiments/api_views.py b/chord_metadata_service/experiments/api_views.py index 9ce53ec8b..2f67b9e44 100644 --- a/chord_metadata_service/experiments/api_views.py +++ b/chord_metadata_service/experiments/api_views.py @@ -8,7 +8,7 @@ from .serializers import ExperimentSerializer, ExperimentResultSerializer from .models import Experiment, ExperimentResult -from .schemas import EXPERIMENT_SCHEMA +from .schemas import EXPERIMENT_SCHEMA, experiment_resolver, experiment_base_uri from .filters import ExperimentFilter, ExperimentResultFilter from chord_metadata_service.restapi.constants import MODEL_ID_PATTERN from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination @@ -146,4 +146,17 @@ def get_experiment_schema(_request): get: Experiment schema """ + # TODO: project-scope return Response(EXPERIMENT_SCHEMA) + + +@api_view(["GET"]) +@permission_classes([AllowAny]) +def get_experiment_subschema(_request, subschema: str): + """ + get: + Experiment sub-schema + """ + # TODO: project-scope + schema = experiment_resolver.lookup(ref=f"{experiment_base_uri}/{subschema}").contents + return Response(schema) diff --git a/chord_metadata_service/experiments/schemas.py b/chord_metadata_service/experiments/schemas.py index 49db0470f..cad38f7c4 100644 --- a/chord_metadata_service/experiments/schemas.py +++ b/chord_metadata_service/experiments/schemas.py @@ -1,10 +1,14 @@ -from pathlib import Path +from django.conf import settings +from referencing import Registry, Resource from .descriptions import EXPERIMENT, EXPERIMENT_RESULT, INSTRUMENT from chord_metadata_service.restapi.constants import MODEL_ID_PATTERN from chord_metadata_service.restapi.schemas import ONTOLOGY_CLASS_LIST, KEY_VALUE_OBJECT -from chord_metadata_service.restapi.schema_utils import tag_ids_and_describe, get_schema_app_id, sub_schema_uri +from chord_metadata_service.restapi.schema_utils import tag_ids_and_describe, sub_schema_uri __all__ = [ + "experiment_resource", + "experiment_registry", + "experiment_resolver", "DATA_FILE_OR_RECORD_URL_SCHEMA", "EXPERIMENT_RESULT_FILE_INDEX_SCHEMA", "EXPERIMENT_RESULT_FILE_INDEX_LIST_SCHEMA", @@ -13,11 +17,11 @@ "INSTRUMENT_SCHEMA", ] -base_uri = get_schema_app_id(Path(__file__).parent.name) +experiment_base_uri = f"{settings.SCHEMAS_BASE_URL}/experiment" DATA_FILE_OR_RECORD_URL_SCHEMA = { "$schema": "http://json-schema.org/draft-07/schema#", - "$id": sub_schema_uri(base_uri, "data_file_or_record_url"), + "$id": sub_schema_uri(experiment_base_uri, "data_file_or_record_url"), "title": "Data file or record URL", "description": "A URL of a particular scheme, pointing to a data file OR a DRS record which itself points to a " "data file.", @@ -29,7 +33,7 @@ EXPERIMENT_RESULT_FILE_INDEX_SCHEMA = { "$schema": "http://json-schema.org/draft-07/schema#", - "$id": sub_schema_uri(base_uri, "experiment_result_file_index"), + "$id": sub_schema_uri(experiment_base_uri, "experiment_result_file_index"), "title": "Experiment result file index schema", "description": "Schema for describing an object representing an index of an experiment result file.", "type": "object", @@ -51,7 +55,7 @@ } EXPERIMENT_RESULT_FILE_INDEX_LIST_SCHEMA = { "$schema": "http://json-schema.org/draft-07/schema#", - "$id": sub_schema_uri(base_uri, "experiment_result_file_index_list"), + "$id": sub_schema_uri(experiment_base_uri, "experiment_result_file_index_list"), "title": "Experiment result file index list schema", "description": "Schema for describing a list of object representing an indices of an experiment result file.", "type": "array", @@ -60,7 +64,7 @@ EXPERIMENT_RESULT_SCHEMA = tag_ids_and_describe({ "$schema": "http://json-schema.org/draft-07/schema#", - "$id": sub_schema_uri(base_uri, "experiment_result"), + "$id": sub_schema_uri(experiment_base_uri, "experiment_result"), "title": "Experiment result schema", "description": "Schema for describing information about analysis of sequencing data in a file format.", "type": "object", @@ -105,7 +109,7 @@ INSTRUMENT_SCHEMA = tag_ids_and_describe({ "$schema": "http://json-schema.org/draft-07/schema#", - "$id": sub_schema_uri(base_uri, "instrument"), + "$id": sub_schema_uri(experiment_base_uri, "instrument"), "title": "Instrument schema", "description": "Schema for describing an instrument used for a sequencing experiment.", "type": "object", @@ -129,7 +133,7 @@ EXPERIMENT_SCHEMA = tag_ids_and_describe({ "$schema": "http://json-schema.org/draft-07/schema#", - "$id": sub_schema_uri(base_uri, "experiment"), + "$id": experiment_base_uri, "title": "Experiment schema", "description": "Schema for describing an experiment.", "type": "object", @@ -197,4 +201,9 @@ "instrument": INSTRUMENT_SCHEMA }, "required": ["id", "experiment_type"] -}, EXPERIMENT) +}, EXPERIMENT, override_children=True) + +# URI referencing +experiment_resource = Resource.from_contents(EXPERIMENT_SCHEMA) +experiment_registry = Registry().with_resource(uri=experiment_base_uri, resource=experiment_resource) +experiment_resolver = experiment_registry.resolver(base_uri=experiment_base_uri) diff --git a/chord_metadata_service/experiments/summaries.py b/chord_metadata_service/experiments/summaries.py index 261c832d3..2cdc907ed 100644 --- a/chord_metadata_service/experiments/summaries.py +++ b/chord_metadata_service/experiments/summaries.py @@ -4,6 +4,7 @@ from chord_metadata_service.discovery.censorship import thresholded_count from chord_metadata_service.discovery.stats import queryset_stats_for_field +from chord_metadata_service.discovery.types import DiscoveryConfig from . import models __all__ = [ @@ -14,7 +15,7 @@ ] -async def experiment_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: +async def experiment_summary(experiments: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool) -> dict: # TODO: limit to authorized field list if we're in censored discovery mode - based on discovery config ( @@ -29,18 +30,18 @@ async def experiment_summary(experiments: QuerySet, low_counts_censored: bool) - extraction_protocol, ) = await asyncio.gather( experiments.acount(), - queryset_stats_for_field(experiments, "study_type", low_counts_censored), - queryset_stats_for_field(experiments, "experiment_type", low_counts_censored), - queryset_stats_for_field(experiments, "molecule", low_counts_censored), - queryset_stats_for_field(experiments, "library_strategy", low_counts_censored), - queryset_stats_for_field(experiments, "library_source", low_counts_censored), - queryset_stats_for_field(experiments, "library_selection", low_counts_censored), - queryset_stats_for_field(experiments, "library_layout", low_counts_censored), - queryset_stats_for_field(experiments, "extraction_protocol", low_counts_censored), + queryset_stats_for_field(experiments, "study_type", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "experiment_type", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "molecule", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "library_strategy", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "library_source", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "library_selection", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "library_layout", discovery, low_counts_censored), + queryset_stats_for_field(experiments, "extraction_protocol", discovery, low_counts_censored), ) return { - "count": thresholded_count(count, low_counts_censored), + "count": thresholded_count(count, discovery, low_counts_censored), "study_type": study_type, "experiment_type": experiment_type, "molecule": molecule, @@ -52,7 +53,11 @@ async def experiment_summary(experiments: QuerySet, low_counts_censored: bool) - } -async def experiment_result_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: +async def experiment_result_summary( + experiments: QuerySet, + discovery: DiscoveryConfig, + low_counts_censored: bool +) -> dict: experiment_results = models.ExperimentResult.objects.filter(experiment__in=experiments) ( @@ -62,36 +67,36 @@ async def experiment_result_summary(experiments: QuerySet, low_counts_censored: usage, ) = await asyncio.gather( experiment_results.acount(), - queryset_stats_for_field(experiment_results, "file_format", low_counts_censored), - queryset_stats_for_field(experiment_results, "data_output_type", low_counts_censored), - queryset_stats_for_field(experiment_results, "usage", low_counts_censored), + queryset_stats_for_field(experiment_results, "file_format", discovery, low_counts_censored), + queryset_stats_for_field(experiment_results, "data_output_type", discovery, low_counts_censored), + queryset_stats_for_field(experiment_results, "usage", discovery, low_counts_censored), ) return { - "count": thresholded_count(count, low_counts_censored), + "count": thresholded_count(count, discovery, low_counts_censored), "file_format": file_format, "data_output_type": data_output_type, "usage": usage, } -async def instrument_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: +async def instrument_summary(experiments: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool) -> dict: instruments = models.Instrument.objects.filter(experiment__in=experiments).distinct() count, platform, model = await asyncio.gather( instruments.acount(), - queryset_stats_for_field(instruments, "platform", low_counts_censored), - queryset_stats_for_field(instruments, "model", low_counts_censored), + queryset_stats_for_field(instruments, "platform", discovery, low_counts_censored), + queryset_stats_for_field(instruments, "model", discovery, low_counts_censored), ) return { - "count": thresholded_count(count, low_counts_censored), + "count": thresholded_count(count, discovery, low_counts_censored), "platform": platform, "model": model, } -async def dt_experiment_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: +async def dt_experiment_summary(experiments: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool) -> dict: # Parallel-gather all statistics we may need for this response ( experiments_count, @@ -100,13 +105,13 @@ async def dt_experiment_summary(experiments: QuerySet, low_counts_censored: bool instrument_summary_val, ) = await asyncio.gather( experiments.acount(), - experiment_summary(experiments, low_counts_censored), - experiment_result_summary(experiments, low_counts_censored), - instrument_summary(experiments, low_counts_censored), + experiment_summary(experiments, discovery, low_counts_censored), + experiment_result_summary(experiments, discovery, low_counts_censored), + instrument_summary(experiments, discovery, low_counts_censored), ) return { - "count": thresholded_count(experiments_count, low_counts_censored), + "count": thresholded_count(experiments_count, discovery, low_counts_censored), "data_type_specific": { "experiments": experiment_summary_val, "experiment_results": exp_res_summary_val, diff --git a/chord_metadata_service/experiments/tests/test_api.py b/chord_metadata_service/experiments/tests/test_api.py index 2a3d7e73a..2687d82e6 100644 --- a/chord_metadata_service/experiments/tests/test_api.py +++ b/chord_metadata_service/experiments/tests/test_api.py @@ -1,6 +1,8 @@ +from django.urls import reverse from jsonschema.validators import Draft7Validator from django.test import TestCase +from chord_metadata_service.experiments.schemas import EXPERIMENT_SCHEMA from chord_metadata_service.restapi.api_renderers import ExperimentCSVRenderer import csv import io @@ -54,7 +56,7 @@ def test_get_experiment_one(self): self.assertEqual(response_data['id'], 'katsu.experiment:1') def test_get_experiment_schema(self): - response = self.client.get('/api/experiment_schema') + response = self.client.get('/api/schemas/experiment') self.assertEqual(response.status_code, status.HTTP_200_OK) response_data = response.json() Draft7Validator.check_schema(response_data) @@ -188,3 +190,18 @@ def test_csv_render_with_empty_data(self): for row in reader: for key in row: self.assertEqual(row[key], '') + + +class TestExperimentSchema(APITestCase): + + def test_experiment_schema(self): + response = self.client.get(reverse("experiment-schema")) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), EXPERIMENT_SCHEMA) + + def test_experiment_subschema(self): + for subschema in EXPERIMENT_SCHEMA["properties"].values(): + property = subschema["$id"].split('/')[-1] + response = self.client.get(reverse("experiment-subschema", kwargs={"subschema": property})) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), subschema) diff --git a/chord_metadata_service/metadata/settings.py b/chord_metadata_service/metadata/settings.py index 0bd6747d9..b5e5b8923 100644 --- a/chord_metadata_service/metadata/settings.py +++ b/chord_metadata_service/metadata/settings.py @@ -51,6 +51,12 @@ CHORD_URL = os.environ.get("CHORD_URL") # Leave None if not specified, for running in other contexts +SERVICE_URL_BASE_PATH = os.environ.get("SERVICE_URL_BASE_PATH") +if SERVICE_URL_BASE_PATH: + SCHEMAS_BASE_URL = SERVICE_URL_BASE_PATH + "/api/schemas" +else: + SCHEMAS_BASE_URL = "/chord_metadata_service/schemas" + # SECURITY WARNING: Don't run with CHORD_PERMISSIONS turned off in production, # unless an alternative permissions system is in place. CHORD_PERMISSIONS = os.environ.get("CHORD_PERMISSIONS", str(not DEBUG)).lower() == "true" diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index bfef00691..d60ed5c9f 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -5,14 +5,13 @@ from bento_lib.responses import errors from bento_lib.search import build_search_response from datetime import datetime -from django.conf import settings from django.contrib.postgres.aggregates import ArrayAgg from django.core.exceptions import ValidationError from django.db.models import Count, F, Q, QuerySet from django.db.models.functions import Coalesce from django_filters.rest_framework import DjangoFilterBackend from drf_spectacular.utils import extend_schema, inline_serializer -from rest_framework import viewsets, filters, mixins, serializers +from rest_framework import viewsets, filters, mixins, serializers, status from rest_framework.decorators import action from rest_framework.request import Request as DrfRequest from rest_framework.response import Response @@ -20,8 +19,10 @@ from chord_metadata_service.discovery import responses as dres from chord_metadata_service.discovery.censorship import get_max_query_parameters, get_threshold, thresholded_count +from chord_metadata_service.discovery.exceptions import DiscoveryConfigException from chord_metadata_service.discovery.fields import get_field_options, filter_queryset_field_value from chord_metadata_service.discovery.stats import individual_biosample_tissue_stats, individual_experiment_type_stats +from chord_metadata_service.discovery.utils import get_request_discovery from chord_metadata_service.logger import logger from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH from chord_metadata_service.phenopackets.models import Phenopacket @@ -149,11 +150,13 @@ def get_queryset(self): async def public_discovery_filter_queryset(request: DrfRequest, queryset: QuerySet, low_counts_censored: bool): # Check query parameters validity qp = request.query_params - if len(qp) > get_max_query_parameters(low_counts_censored=low_counts_censored): + discovery = await get_request_discovery(request) + # TODO: allow exceeding max query parameters for authorized requests + if len(qp) > get_max_query_parameters(discovery, low_counts_censored): raise ValidationError(f"Wrong number of fields: {len(qp)}") - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] + search_conf = discovery["search"] + field_conf = discovery["fields"] queryable_fields = { f"{f}": field_conf[f] for section in search_conf for f in section["fields"] } @@ -163,7 +166,7 @@ async def public_discovery_filter_queryset(request: DrfRequest, queryset: QueryS raise ValidationError(f"Unsupported field used in query: {field}") field_props = queryable_fields[field] - options = await get_field_options(field_props, low_counts_censored=low_counts_censored) + options = await get_field_options(field, discovery, low_counts_censored) if ( value not in options and not ( @@ -203,7 +206,12 @@ class PublicListIndividuals(APIView): """ async def get(self, request, *_args, **_kwargs): - if not settings.CONFIG_PUBLIC: + try: + discovery = await get_request_discovery(request) + except DiscoveryConfigException as e: + return Response(e.message, status=status.HTTP_404_NOT_FOUND) + + if not discovery: return Response(dres.NO_PUBLIC_DATA_AVAILABLE) base_qs = Individual.objects.all() @@ -214,17 +222,17 @@ async def get(self, request, *_args, **_kwargs): *(e.error_list if hasattr(e, "error_list") else e.error_dict.items()), )) - qct = thresholded_count(await filtered_qs.acount(), low_counts_censored=True) + qct = thresholded_count(await filtered_qs.acount(), discovery, low_counts_censored=True) if qct == 0: logger.info( f"Public individuals endpoint recieved query params {request.query_params} which resulted in " - f"sub-threshold count: {qct} <= {get_threshold(True)}") + f"sub-threshold count: {qct} <= {get_threshold(discovery, low_counts_censored=True)}") return Response(dres.INSUFFICIENT_DATA_AVAILABLE) (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( - individual_biosample_tissue_stats(filtered_qs, low_counts_censored=True), - individual_experiment_type_stats(filtered_qs, low_counts_censored=True), + individual_biosample_tissue_stats(filtered_qs, discovery, low_counts_censored=True), + individual_experiment_type_stats(filtered_qs, discovery, low_counts_censored=True), ) return Response({ @@ -248,7 +256,12 @@ class BeaconListIndividuals(APIView): """ async def get(self, request, *_args, **_kwargs): - if not settings.CONFIG_PUBLIC: + try: + discovery = await get_request_discovery(request) + except DiscoveryConfigException as e: + return Response(e.message, status=status.HTTP_404_NOT_FOUND) + + if not discovery: return Response(dres.NO_PUBLIC_DATA_AVAILABLE, status=404) base_qs = Individual.objects.all() @@ -259,8 +272,8 @@ async def get(self, request, *_args, **_kwargs): *(e.error_list if hasattr(e, "error_list") else e.error_dict.items())), status=400) (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( - individual_biosample_tissue_stats(filtered_qs, low_counts_censored=False), - individual_experiment_type_stats(filtered_qs, low_counts_censored=False), + individual_biosample_tissue_stats(filtered_qs, discovery, low_counts_censored=False), + individual_experiment_type_stats(filtered_qs, discovery, low_counts_censored=False), ) return Response({ diff --git a/chord_metadata_service/patients/schemas.py b/chord_metadata_service/patients/schemas.py index 5443c3f14..148cc3571 100644 --- a/chord_metadata_service/patients/schemas.py +++ b/chord_metadata_service/patients/schemas.py @@ -1,3 +1,4 @@ +from django.conf import settings from chord_metadata_service.restapi.constants import MODEL_ID_PATTERN from chord_metadata_service.restapi.schema_utils import ( DATE_TIME, @@ -8,20 +9,18 @@ string_with_pattern, enum_of, tag_ids_and_describe, - get_schema_app_id, sub_schema_uri, ) from chord_metadata_service.restapi.schemas import ONTOLOGY_CLASS, EXTRA_PROPERTIES_SCHEMA, TIME_ELEMENT_SCHEMA -from pathlib import Path from .descriptions import INDIVIDUAL, VITAL_STATUS from .values import Sex, KaryotypicSex -base_uri = get_schema_app_id(Path(__file__).parent.name) +phenopacket_base_uri = f"{settings.SCHEMAS_BASE_URL}/phenopacket" VITAL_STATUS_SCHEMA = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "vital_status"), + "$id": sub_schema_uri(phenopacket_base_uri, "vital_status"), "type": "object", "properties": { "status": enum_of(["UNKNOWN_STATUS", "ALIVE", "DECEASED"]), @@ -35,7 +34,7 @@ INDIVIDUAL_SCHEMA = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "individual"), + "$id": sub_schema_uri(phenopacket_base_uri, "individual"), "type": "object", "properties": { # Phenopacket V2 Individual fields diff --git a/chord_metadata_service/patients/summaries.py b/chord_metadata_service/patients/summaries.py index a775971ac..025664c16 100644 --- a/chord_metadata_service/patients/summaries.py +++ b/chord_metadata_service/patients/summaries.py @@ -5,6 +5,7 @@ from chord_metadata_service.discovery.censorship import thresholded_count from chord_metadata_service.discovery.fields import get_age_numeric_binned from chord_metadata_service.discovery.stats import queryset_stats_for_field +from chord_metadata_service.discovery.types import DiscoveryConfig from . import models __all__ = ["individual_summary"] @@ -13,7 +14,7 @@ OVERVIEW_AGE_BIN_SIZE = 10 # TODO: configurable -async def individual_summary(phenopackets: QuerySet | None, low_counts_censored: bool): +async def individual_summary(phenopackets: QuerySet | None, discovery: DiscoveryConfig, low_counts_censored: bool): individuals = ( models.Individual.objects.all() if phenopackets is None else models.Individual.objects.filter(phenopackets__in=phenopackets).distinct() @@ -23,16 +24,16 @@ async def individual_summary(phenopackets: QuerySet | None, low_counts_censored: individuals.acount(), # - Sex related fields stats are precomputed here and post processed later # to include missing values inferred from the schema - queryset_stats_for_field(individuals, "sex", low_counts_censored), - queryset_stats_for_field(individuals, "karyotypic_sex", low_counts_censored), + queryset_stats_for_field(individuals, "sex", discovery, low_counts_censored), + queryset_stats_for_field(individuals, "karyotypic_sex", discovery, low_counts_censored), # - Age - get_age_numeric_binned(individuals, OVERVIEW_AGE_BIN_SIZE, low_counts_censored), + get_age_numeric_binned(individuals, OVERVIEW_AGE_BIN_SIZE, discovery, low_counts_censored), # - Taxonomy - queryset_stats_for_field(individuals, "taxonomy__label", low_counts_censored), + queryset_stats_for_field(individuals, "taxonomy__label", discovery, low_counts_censored), ) return { - "count": thresholded_count(individual_count, low_counts_censored), + "count": thresholded_count(individual_count, discovery, low_counts_censored), "sex": {k: individual_sex.get(k, 0) for k in (s[0] for s in models.Individual.SEX)}, "karyotypic_sex": {k: individual_k_sex.get(k, 0) for k in (s[0] for s in models.Individual.KARYOTYPIC_SEX)}, "age": individual_age, diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py index c25f43b49..5e60318af 100644 --- a/chord_metadata_service/patients/tests/test_api.py +++ b/chord_metadata_service/patients/tests/test_api.py @@ -8,7 +8,11 @@ from rest_framework import status from rest_framework.test import APITestCase from chord_metadata_service.discovery import responses as dres -from chord_metadata_service.discovery.tests.constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY +from chord_metadata_service.discovery.tests.constants import ( + DISCOVERY_CONFIG_EXTRA_PROPERTIES, + DISCOVERY_CONFIG_TEST, + CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY +) from chord_metadata_service.patients.models import Individual from chord_metadata_service.phenopackets import models as ph_m from chord_metadata_service.phenopackets.tests import constants as ph_c @@ -17,7 +21,7 @@ from . import constants as c -CONFIG_PUBLIC_TEST_NO_THRESHOLD = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_NO_THRESHOLD = deepcopy(DISCOVERY_CONFIG_TEST) CONFIG_PUBLIC_TEST_NO_THRESHOLD["rules"]["count_threshold"] = 0 @@ -298,7 +302,7 @@ def setUp(self): for individual in individuals: Individual.objects.create(**individual) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_get(self): # no filters GET request to /api/public, returns count or INSUFFICIENT_DATA_AVAILABLE response = self.client.get('/api/public') @@ -330,7 +334,7 @@ def test_public_get_no_config(self): class PublicFilteringIndividualsTest(APITestCase): """ Test for api/public GET filtering """ - response_threshold = CONFIG_PUBLIC_TEST["rules"]["count_threshold"] + response_threshold = DISCOVERY_CONFIG_TEST["rules"]["count_threshold"] num_individuals = 137 random_seed = 341 # do not change this please :)) @@ -349,7 +353,7 @@ def setUp(self): random.seed(self.random_seed) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_filtering_sex(self): # sex field search response = self.client.get('/api/public?sex=female') @@ -365,7 +369,7 @@ def test_public_filtering_sex(self): else: self.assertEqual(nb_female, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_2_fields(self): # sex and extra_properties string search # test GET query string search for extra_properties field @@ -391,7 +395,7 @@ def test_public_filtering_2_fields_config_empty(self): self.assertIsInstance(response_obj, dict) self.assertEqual(response_obj, dres.NO_PUBLIC_DATA_AVAILABLE) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_1(self): # extra_properties string search (multiple values) response = self.client.get('/api/public?smoking=Non-smoker&death_dc=Deceased') @@ -417,7 +421,7 @@ def test_public_filtering_extra_properties_1_config_empty(self): self.assertIsInstance(response_obj, dict) self.assertEqual(response_obj, dres.NO_PUBLIC_DATA_AVAILABLE) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_2(self): # extra_properties string search (multiple values) response = self.client.get( @@ -427,7 +431,7 @@ def test_public_filtering_extra_properties_2(self): response_obj = response.json() self.assertEqual(response_obj["code"], 400) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_invalid_3(self): # if GET query string list has various data types Error response = self.client.get('/api/public?extra_properties=[{"smoking": "Non-smoker"}, 5, "Test"]') @@ -435,7 +439,7 @@ def test_public_filtering_extra_properties_invalid_3(self): response_obj = response.json() self.assertEqual(response_obj["code"], 400) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_range_1(self): # extra_properties range search (both min and max ranges, single value) response = self.client.get( @@ -454,7 +458,7 @@ def test_public_filtering_extra_properties_range_1(self): else: self.assertEqual(db_count, response_obj["count"]) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_range_2(self): # extra_properties range search (above taper, single value) response = self.client.get( @@ -472,7 +476,7 @@ def test_public_filtering_extra_properties_range_2(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_range_3(self): # extra_properties range search (below taper, single value) response = self.client.get( @@ -490,7 +494,7 @@ def test_public_filtering_extra_properties_range_3(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_wrong_range(self): # extra_properties range search, unauthorized range response = self.client.get( @@ -500,7 +504,7 @@ def test_public_filtering_extra_properties_wrong_range(self): response_obj = response.json() self.assertEqual(response_obj["code"], 400) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_range_string_1(self): # sex string search and extra_properties range search response = self.client.get( @@ -520,7 +524,7 @@ def test_public_filtering_extra_properties_range_string_1(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_range_string_2(self): # extra_properties range search and extra_properties string search (single value) response = self.client.get( @@ -540,7 +544,7 @@ def test_public_filtering_extra_properties_range_string_2(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_multiple_ranges_1(self): # extra_properties range search (both min and max range, multiple values) response = self.client.get( @@ -561,7 +565,7 @@ def test_public_filtering_extra_properties_multiple_ranges_1(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_date_range_1(self): # extra_properties date range search (only after or before, single value) # Testing with a date of consent from 1 year ago @@ -580,7 +584,7 @@ def test_public_filtering_extra_properties_date_range_1(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_public_filtering_extra_properties_date_range_and_other_range(self): # extra_properties date range search (both after and before, single value) and other number range search # Testing with a date of consent from 2 years ago @@ -609,7 +613,7 @@ def test_public_filtering_mapping_for_search_filter(self): response_obj = response.json() self.assertEqual(1, response_obj["count"]) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_overview_sex(self): response = self.client.get('/api/public_search_fields') self.assertEqual(response.status_code, status.HTTP_200_OK) @@ -623,7 +627,7 @@ class PublicFilteringIndividualsTestSmallCellCount(PublicFilteringIndividualsTes num_individuals = 3 # below configured test threshold # rest of the methods are inherited - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_overview_sex(self): response = self.client.get('/api/public_search_fields') self.assertEqual(response.status_code, status.HTTP_200_OK) @@ -688,7 +692,7 @@ def setUp(self): individual.age_unit = age_unit if age_unit else "" individual.save() - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_filtering_age_range(self): # age valid range search response = self.client.get('/api/public?age=[20, 30)') @@ -701,7 +705,7 @@ def test_public_filtering_age_range(self): else: self.assertEqual(db_count, response_obj['count']) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_public_filtering_age_invalid_range(self): # age invalid range max search response = self.client.get('/api/public?age=[10, 50)') @@ -738,7 +742,7 @@ def setUp(self): Individual.objects.create(**individual) # test beacon formatted response - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_beacon_search_response(self): response = self.client.get('/api/beacon_search?sex=MALE') male_count = Individual.objects.filter(sex="MALE").count() @@ -752,17 +756,17 @@ def test_beacon_search_response_no_config(self): response = self.client.get('/api/beacon_search?sex=MALE') self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST) def test_beacon_search_response_invalid_search_key(self): response = self.client.get('/api/beacon_search?birdwatcher=yes') self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_beacon_search_response_invalid_search_value(self): response = self.client.get('/api/beacon_search?smoking=on_Sundays') self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_EXTRA_PROPERTIES) def test_beacon_search_more_params_than_censorship_limit(self): response = self.client.get('/api/beacon_search?sex=MALE&smoking=Non-smoker&death_dc=Deceased') self.assertEqual(response.status_code, status.HTTP_200_OK) diff --git a/chord_metadata_service/phenopackets/api_views.py b/chord_metadata_service/phenopackets/api_views.py index 9f29722fc..37c3ea871 100644 --- a/chord_metadata_service/phenopackets/api_views.py +++ b/chord_metadata_service/phenopackets/api_views.py @@ -11,7 +11,7 @@ from chord_metadata_service.restapi.constants import MODEL_ID_PATTERN from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation -from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA +from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA, phenopacket_resolver, phenopacket_base_uri from . import models as m, serializers as s, filters as f from drf_spectacular.utils import extend_schema, inline_serializer from rest_framework import serializers, status @@ -230,4 +230,17 @@ def get_chord_phenopacket_schema(_request): get: Chord phenopacket schema that can be shared with data providers. """ + # TODO: project-scope return Response(PHENOPACKET_SCHEMA) + + +@api_view(["GET"]) +@permission_classes([AllowAny]) +def get_chord_phenopacket_subschema(_request, subschema: str): + """ + get: + Chord phenopacket schema that can be shared with data providers. + """ + # TODO: project-scope + schema = phenopacket_resolver.lookup(ref=f"{phenopacket_base_uri}/{subschema}").contents + return Response(schema) diff --git a/chord_metadata_service/phenopackets/schemas.py b/chord_metadata_service/phenopackets/schemas.py index b35f90882..849a0d3cd 100644 --- a/chord_metadata_service/phenopackets/schemas.py +++ b/chord_metadata_service/phenopackets/schemas.py @@ -1,6 +1,6 @@ # Individual schemas for validation of JSONField values import json -from pathlib import Path +from django.conf import settings from referencing import Registry, Resource from referencing.jsonschema import DRAFT7 from chord_metadata_service.patients.schemas import INDIVIDUAL_SCHEMA @@ -25,7 +25,7 @@ named_one_of, sub_schema_uri, describe_schema, - get_schema_app_id + tag_ids_and_describe, ) from . import descriptions @@ -58,19 +58,20 @@ "VCF_RECORD_SCHEMA", "VRS_REF_REGISTRY", "VRS_VARIATION_SCHEMA", + "phenopacket_resolver", ] -base_uri = get_schema_app_id(Path(__file__).parent.name) +phenopacket_base_uri = f"{settings.SCHEMAS_BASE_URL}/phenopacket" with open("chord_metadata_service/vrs/schema/vrs.json", "r") as file: vrs_schema_definitions = json.load(file) - vrs_schema_definitions["$id"] = sub_schema_uri(base_uri, "vrs") + vrs_schema_definitions["$id"] = sub_schema_uri(phenopacket_base_uri, "vrs") file.close() PHENOPACKET_EXTERNAL_REFERENCE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "external_reference"), + "$id": sub_schema_uri(phenopacket_base_uri, "external_reference"), "title": "External reference schema", "type": "object", "properties": { @@ -82,7 +83,7 @@ PHENOPACKET_UPDATE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "update"), + "$id": sub_schema_uri(phenopacket_base_uri, "update"), "title": "Updates schema", "type": "object", "properties": { @@ -98,7 +99,7 @@ # noinspection PyProtectedMember PHENOPACKET_META_DATA_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "meta_data"), + "$id": sub_schema_uri(phenopacket_base_uri, "meta_data"), "type": "object", "properties": { "created": DATE_TIME, @@ -114,7 +115,7 @@ PHENOPACKET_EVIDENCE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "evidence"), + "$id": sub_schema_uri(phenopacket_base_uri, "evidence"), "title": "Evidence schema", "type": "object", "properties": { @@ -128,7 +129,7 @@ PHENOPACKET_PHENOTYPIC_FEATURE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "phenotypic_feature"), + "$id": sub_schema_uri(phenopacket_base_uri, "phenotypic_feature"), "type": "object", "properties": { "description": base_type(SchemaTypes.STRING), @@ -146,7 +147,7 @@ # TODO: search PHENOPACKET_GENE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "gene"), + "$id": sub_schema_uri(phenopacket_base_uri, "gene"), "type": "object", "properties": { "id": base_type(SchemaTypes.STRING), @@ -159,7 +160,7 @@ PHENOPACKET_PROCEDURE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "procedure"), + "$id": sub_schema_uri(phenopacket_base_uri, "procedure"), "title": "Procedure schema", "type": "object", "properties": { @@ -172,7 +173,7 @@ REFERENCE_RANGE_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "reference_range"), + "$id": sub_schema_uri(phenopacket_base_uri, "reference_range"), "title": "Reference range schema", "type": "object", "properties": { @@ -185,7 +186,7 @@ PHENOPACKET_QUANTITY_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "quantity"), + "$id": sub_schema_uri(phenopacket_base_uri, "quantity"), "title": "Quantity schema", "type": "object", "properties": { @@ -198,7 +199,7 @@ PHENOPACKET_TYPED_QUANTITY_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "typed_quantity"), + "$id": sub_schema_uri(phenopacket_base_uri, "typed_quantity"), "title": "Quantity schema", "type": "object", "properties": { @@ -210,7 +211,7 @@ PHENOPACKET_VALUE_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "value"), + "$id": sub_schema_uri(phenopacket_base_uri, "value"), "title": "Value schema", "type": "object", "oneOf": [ @@ -221,7 +222,7 @@ PHENOPACKET_COMPLEX_VALUE_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "complex_value"), + "$id": sub_schema_uri(phenopacket_base_uri, "complex_value"), "title": "Complex value schema", "type": "object", "properties": { @@ -232,7 +233,7 @@ PHENOPACKET_MEASUREMENT_VALUE_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "measurement_value"), + "$id": sub_schema_uri(phenopacket_base_uri, "measurement_value"), "title": "Measurement value schema", "type": "object", "oneOf": [ @@ -243,7 +244,7 @@ PHENOPACKET_MEASUREMENT_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "measurement"), + "$id": sub_schema_uri(phenopacket_base_uri, "measurement"), "title": "Measurement schema", "type": "object", "properties": { @@ -261,7 +262,7 @@ FILE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "file"), + "$id": sub_schema_uri(phenopacket_base_uri, "file"), "title": "Phenopacket file schema", "description": "The File message allows a Phenopacket to link the structured phenotypic data it " + "contains to external files which can be used to inform analyses.", @@ -276,7 +277,7 @@ # noinspection PyProtectedMember PHENOPACKET_BIOSAMPLE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "biosample"), + "$id": sub_schema_uri(phenopacket_base_uri, "biosample"), "type": "object", "properties": { "id": string_with_pattern(MODEL_ID_PATTERN), @@ -310,7 +311,7 @@ PHENOPACKET_DISEASE_ONSET_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "disease_onset"), + "$id": sub_schema_uri(phenopacket_base_uri, "disease_onset"), "title": "Onset age", "description": "Schema for the age of the onset of the disease.", "type": "object", @@ -323,7 +324,7 @@ PHENOPACKET_DISEASE_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "disease"), + "$id": sub_schema_uri(phenopacket_base_uri, "disease"), "title": "Disease schema", "type": "object", "properties": { @@ -341,7 +342,7 @@ DOSE_INTERVAL = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "dose_interval"), + "$id": sub_schema_uri(phenopacket_base_uri, "dose_interval"), "title": "Phenopacket dose interval for treatment", "description": "Represents the dose intervals of a treatment.", "type": "object", @@ -355,7 +356,7 @@ PHENOPACKET_TREATMENT = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "treatment"), + "$id": sub_schema_uri(phenopacket_base_uri, "treatment"), "title": "Phenopacket treatment", "description": "Represents a treatment with an agent, such as a drug.", "type": "object", @@ -376,7 +377,7 @@ PHENOPACKET_RADIATION_THERAPY = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "radiation_therapy"), + "$id": sub_schema_uri(phenopacket_base_uri, "radiation_therapy"), "title": "Phenopacket radiation therapy", "description": "Radiation therapy (or radiotherapy) uses ionizing radiation, generally as part of cancer " "treatment to control or kill malignant cells.", @@ -392,7 +393,7 @@ PHENOPACKET_THERAPEUTIC_REGIMEN = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "therapeutic_regimen"), + "$id": sub_schema_uri(phenopacket_base_uri, "therapeutic_regimen"), "title": "Phenopacket therapeutic regimen", "description": "This element represents a therapeutic regimen which will involve a specified set of treatments " "for a particular condition.", @@ -411,7 +412,7 @@ ONE_OF_MEDICAL_ACTION = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "one_of_medical_actions"), + "$id": sub_schema_uri(phenopacket_base_uri, "one_of_medical_actions"), "title": "Supported Phenopacket medical actions", "description": "One-of schema for supported medical action schemas", "type": "object", @@ -425,7 +426,7 @@ PHENOPACKET_MEDICAL_ACTION_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "medical_action"), + "$id": sub_schema_uri(phenopacket_base_uri, "medical_action"), "title": "Phenopacket medical action schema", "description": "Describes a medical action", "type": "object", @@ -447,7 +448,7 @@ GENE_DESCRIPTOR = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "gene_descriptor"), + "$id": sub_schema_uri(phenopacket_base_uri, "gene_descriptor"), "title": "Gene descriptor schema", "description": "Schema used to describe genes", "type": "object", @@ -466,7 +467,7 @@ EXPRESSION_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "expression"), + "$id": sub_schema_uri(phenopacket_base_uri, "expression"), "title": "Expression schema", "description": "Enables description of an object based on a nomenclature", "type": "object", @@ -481,7 +482,7 @@ EXTENSION_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "extension"), + "$id": sub_schema_uri(phenopacket_base_uri, "extension"), "title": "Extension schema", "description": "The Extension class provides a means to extend descriptions with other attributes unique to a " "content provider", @@ -495,7 +496,7 @@ VCF_RECORD_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "vcf_record"), + "$id": sub_schema_uri(phenopacket_base_uri, "vcf_record"), "title": "VCF record schema", "description": "This element is used to describe variants using the Variant Call Format.", "type": "object", @@ -516,38 +517,38 @@ VARIATION_ONE_OF_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "variation"), + "$id": sub_schema_uri(phenopacket_base_uri, "variation"), "title": "VRS Variation object", "description": "Provides a computable representation of variation", "type": "object", "oneOf": [ named_one_of("absolute_copy_numer", { - "$ref": sub_schema_uri(base_uri, "vrs#/definitions/AbsoluteCopyNumber") + "$ref": sub_schema_uri(phenopacket_base_uri, "vrs#/definitions/AbsoluteCopyNumber") }), named_one_of("allele", { - "$ref": sub_schema_uri(base_uri, "vrs#/definitions/Allele") + "$ref": sub_schema_uri(phenopacket_base_uri, "vrs#/definitions/Allele") }), named_one_of("genotype", { - "$ref": sub_schema_uri(base_uri, "#/definitions/Genotype") + "$ref": sub_schema_uri(phenopacket_base_uri, "#/definitions/Genotype") }), named_one_of("haplotype", { - "$ref": sub_schema_uri(base_uri, "#/definitions/Haplotype") + "$ref": sub_schema_uri(phenopacket_base_uri, "#/definitions/Haplotype") }), named_one_of("relative_copy_number", { - "$ref": sub_schema_uri(base_uri, "#/definitions/RelativeCopyNumber") + "$ref": sub_schema_uri(phenopacket_base_uri, "#/definitions/RelativeCopyNumber") }), named_one_of("text", { - "$ref": sub_schema_uri(base_uri, "#/definitions/Text") + "$ref": sub_schema_uri(phenopacket_base_uri, "#/definitions/Text") }), named_one_of("variation_set", { - "$ref": sub_schema_uri(base_uri, "#/definitions/VariationSet") + "$ref": sub_schema_uri(phenopacket_base_uri, "#/definitions/VariationSet") }), ] }, {}) VARIANT_DESCRIPTOR = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "variant_descriptor"), + "$id": sub_schema_uri(phenopacket_base_uri, "variant_descriptor"), "title": "Variant descriptor schema", "description": "Schema used to describe variants", "type": "object", @@ -573,7 +574,7 @@ PHENOPACKET_VARIANT_INTERPRETATION = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "variant_interpretation"), + "$id": sub_schema_uri(phenopacket_base_uri, "variant_interpretation"), "title": "Phenopacket variant interpretation schema", "description": "This element represents the interpretation of a variant according to the American College of " "Medical Genetics (ACMG) guidelines.", @@ -590,7 +591,7 @@ PHENOPACKET_GENOMIC_INTERPRETATION = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "genomic_interpretation"), + "$id": sub_schema_uri(phenopacket_base_uri, "genomic_interpretation"), "title": "Phenopacket genomic interpretation schema", "description": "Describes the interpretation for an individual variant or gene", "type": "object", @@ -608,7 +609,7 @@ PHENOPACKET_DIAGNOSIS_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "diagnosis"), + "$id": sub_schema_uri(phenopacket_base_uri, "diagnosis"), "title": "Phenopacket diagnosis schema", "description": "Refers to a disease and its genomic interpretations", "type": "object", @@ -621,7 +622,7 @@ PHENOPACKET_INTERPRETATION_SCHEMA = describe_schema({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "interpretation"), + "$id": sub_schema_uri(phenopacket_base_uri, "interpretation"), "title": "Phenopacket interpretation schema", "description": "This message intends to represent the interpretation of a genomic analysis, such as the report " "from a diagnostic laboratory.", @@ -642,9 +643,9 @@ "required": ["id", "progress_status"] }, descriptions=descriptions.INTERPRETATION) -PHENOPACKET_SCHEMA = describe_schema({ +PHENOPACKET_SCHEMA = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "phenopacket"), + "$id": phenopacket_base_uri, "title": "Phenopacket schema", "description": "Schema for metadata service datasets", "type": "object", @@ -664,8 +665,14 @@ "required": ["id", "meta_data"], }, descriptions.PHENOPACKET) +# Phenopacket schema URI referencing +phenopacket_resource = Resource.from_contents(PHENOPACKET_SCHEMA) +phenopacket_registry = Registry().with_resource(uri=phenopacket_base_uri, resource=phenopacket_resource) +phenopacket_resolver = phenopacket_registry.resolver(base_uri=phenopacket_base_uri) + +# VRS referencing schema resolving VRS_REF_RESOURCE = Resource.from_contents(contents=vrs_schema_definitions, default_specification=DRAFT7) VRS_REF_REGISTRY = VRS_REF_RESOURCE @ Registry() resolver = VRS_REF_REGISTRY.resolver() -VRS_VARIATION_SCHEMA = resolver.lookup(sub_schema_uri(base_uri, "vrs#/definitions/Variation")).contents +VRS_VARIATION_SCHEMA = resolver.lookup(sub_schema_uri(phenopacket_base_uri, "vrs#/definitions/Variation")).contents diff --git a/chord_metadata_service/phenopackets/summaries.py b/chord_metadata_service/phenopackets/summaries.py index c180d2c32..ede42944f 100644 --- a/chord_metadata_service/phenopackets/summaries.py +++ b/chord_metadata_service/phenopackets/summaries.py @@ -4,6 +4,7 @@ from chord_metadata_service.discovery.censorship import thresholded_count from chord_metadata_service.discovery.stats import stats_for_field, queryset_stats_for_field +from chord_metadata_service.discovery.types import DiscoveryConfig from chord_metadata_service.patients.summaries import individual_summary from . import models @@ -16,7 +17,7 @@ ] -async def biosample_summary(phenopackets: QuerySet, low_counts_censored: bool): +async def biosample_summary(phenopackets: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool): biosamples = models.Biosample.objects.filter(phenopacket__in=phenopackets) ( @@ -27,14 +28,14 @@ async def biosample_summary(phenopackets: QuerySet, low_counts_censored: bool): biosamples_is_control_sample, ) = await asyncio.gather( biosamples.acount(), - queryset_stats_for_field(biosamples, "taxonomy__label", low_counts_censored), - queryset_stats_for_field(biosamples, "sampled_tissue__label", low_counts_censored), - queryset_stats_for_field(biosamples, "histological_diagnosis__label", low_counts_censored), - queryset_stats_for_field(biosamples, "is_control_sample", low_counts_censored), + queryset_stats_for_field(biosamples, "taxonomy__label", discovery, low_counts_censored), + queryset_stats_for_field(biosamples, "sampled_tissue__label", discovery, low_counts_censored), + queryset_stats_for_field(biosamples, "histological_diagnosis__label", discovery, low_counts_censored), + queryset_stats_for_field(biosamples, "is_control_sample", discovery, low_counts_censored), ) return { - "count": thresholded_count(biosamples_count, low_counts_censored), + "count": thresholded_count(biosamples_count, discovery, low_counts_censored), "taxonomy": biosamples_taxonomy, "sampled_tissue": biosamples_sampled_tissue, "histological_diagnosis": biosamples_histological_diagnosis, @@ -42,28 +43,33 @@ async def biosample_summary(phenopackets: QuerySet, low_counts_censored: bool): } -async def disease_summary(phenopackets: QuerySet, low_counts_censored: bool): - disease_stats = await queryset_stats_for_field(phenopackets, "diseases__term__label", low_counts_censored) +async def disease_summary(phenopackets: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool): + disease_stats = await queryset_stats_for_field( + queryset=phenopackets, + field="diseases__term__label", + discovery=discovery, + low_counts_censored=low_counts_censored + ) return { # count is a number of unique disease terms (not all diseases in the database) - "count": thresholded_count(len(disease_stats), low_counts_censored), + "count": thresholded_count(len(disease_stats), discovery, low_counts_censored), "term": disease_stats, } -async def phenotypic_feature_summary(phenopackets: QuerySet, low_counts_censored: bool): +async def phenotypic_feature_summary(phenopackets: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool): phenotypic_features_count, phenotypic_features_type = await asyncio.gather( models.PhenotypicFeature.objects.filter(phenopacket__in=phenopackets).distinct('pftype').acount(), - stats_for_field(models.PhenotypicFeature, "pftype__label", low_counts_censored), + stats_for_field(models.PhenotypicFeature, "pftype__label", discovery, low_counts_censored), ) return { # count is a number of unique phenotypic feature types, not all phenotypic features in the database. - "count": thresholded_count(phenotypic_features_count, low_counts_censored), + "count": thresholded_count(phenotypic_features_count, discovery, low_counts_censored), "type": phenotypic_features_type, } -async def dt_phenopacket_summary(phenopackets: QuerySet, low_counts_censored: bool) -> dict: +async def dt_phenopacket_summary(phenopackets: QuerySet, discovery: DiscoveryConfig, low_counts_censored: bool) -> dict: # Parallel-gather all statistics we may need for this response ( phenopackets_count, @@ -73,14 +79,14 @@ async def dt_phenopacket_summary(phenopackets: QuerySet, low_counts_censored: bo pf_summary_val, ) = await asyncio.gather( phenopackets.acount(), - biosample_summary(phenopackets, low_counts_censored), - individual_summary(phenopackets, low_counts_censored), - disease_summary(phenopackets, low_counts_censored), - phenotypic_feature_summary(phenopackets, low_counts_censored), + biosample_summary(phenopackets, discovery, low_counts_censored), + individual_summary(phenopackets, discovery, low_counts_censored), + disease_summary(phenopackets, discovery, low_counts_censored), + phenotypic_feature_summary(phenopackets, discovery, low_counts_censored), ) return { - "count": thresholded_count(phenopackets_count, low_counts_censored), + "count": thresholded_count(phenopackets_count, discovery, low_counts_censored), "data_type_specific": { "biosamples": biosample_summary_val, "diseases": disease_summary_val, diff --git a/chord_metadata_service/phenopackets/tests/test_api.py b/chord_metadata_service/phenopackets/tests/test_api.py index 9c836fe5b..a721c74a1 100644 --- a/chord_metadata_service/phenopackets/tests/test_api.py +++ b/chord_metadata_service/phenopackets/tests/test_api.py @@ -5,6 +5,8 @@ from django.urls import reverse from rest_framework import status from rest_framework.test import APITestCase + +from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA from . import constants as c from .. import models as m, serializers as s @@ -432,3 +434,19 @@ def test_get_phenopackets_with_authz_dataset_5(self): self.assertEqual(response.status_code, status.HTTP_200_OK) response_data = response.json() self.assertEqual(len(response_data["results"]), 0) + + +class PhenopacketSchema(APITestCase): + + def test_get_phenopacket_schema(self): + response = self.client.get("/api/schemas/phenopacket") + schema = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(schema, PHENOPACKET_SCHEMA) + + def test_get_pheno_subschemas(self): + for subschema in PHENOPACKET_SCHEMA["properties"].values(): + prop_key = subschema["$id"].split('/')[-1] + response = self.client.get(reverse("chord-phenopacket-subschema", kwargs={"subschema": prop_key})) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), subschema) diff --git a/chord_metadata_service/restapi/api_views.py b/chord_metadata_service/restapi/api_views.py index 9f5b2afa2..7f7e4f5de 100644 --- a/chord_metadata_service/restapi/api_views.py +++ b/chord_metadata_service/restapi/api_views.py @@ -11,6 +11,8 @@ from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly +from chord_metadata_service.discovery.types import DiscoveryConfig +from chord_metadata_service.discovery.utils import get_discovery from chord_metadata_service.experiments import models as experiments_models from chord_metadata_service.experiments.summaries import dt_experiment_summary from chord_metadata_service.metadata.service_info import get_service_info @@ -33,10 +35,10 @@ async def service_info(_request: DrfRequest): return Response(await get_service_info()) -async def build_overview_response(phenopackets: QuerySet, experiments: QuerySet): +async def build_overview_response(phenopackets: QuerySet, experiments: QuerySet, discovery: DiscoveryConfig): phenopackets_summary, experiments_summary = await asyncio.gather( - dt_phenopacket_summary(phenopackets, low_counts_censored=False), - dt_experiment_summary(experiments, low_counts_censored=False), + dt_phenopacket_summary(phenopackets, discovery, low_counts_censored=False), + dt_experiment_summary(experiments, discovery, low_counts_censored=False), ) return Response({ @@ -59,18 +61,20 @@ async def build_overview_response(phenopackets: QuerySet, experiments: QuerySet) ) @api_view(["GET"]) @permission_classes([OverrideOrSuperUserOnly]) -async def overview(_request: DrfRequest): +async def overview(request: DrfRequest): """ get: Overview of all Phenopackets and experiments in the database - private endpoint """ # TODO: permissions based on project - this endpoint should be scrapped / completely rethought + # use node level discovery config for private overview + discovery = await get_discovery() phenopackets = pheno_models.Phenopacket.objects.all() experiments = experiments_models.Experiment.objects.all() - return await build_overview_response(phenopackets, experiments) + return await build_overview_response(phenopackets, experiments, discovery) @api_view(["GET"]) @@ -94,6 +98,8 @@ async def search_overview(request: DrfRequest): """ # TODO: this should be project / dataset-scoped and probably shouldn't even exist as-is + # use node level discovery config for private search overview + discovery = await get_discovery() individual_ids = request.GET.getlist("id") if request.method == "GET" else request.data.get("id", []) phenopackets = pheno_models.Phenopacket.objects.all().filter(subject_id__in=individual_ids) @@ -104,4 +110,4 @@ async def search_overview(request: DrfRequest): # - in general, this endpoint is less than ideal and should be derived from search results themselves vs. this # hack-y mess of passing IDs around. - return await build_overview_response(phenopackets, experiments) + return await build_overview_response(phenopackets, experiments, discovery) diff --git a/chord_metadata_service/restapi/schema_utils.py b/chord_metadata_service/restapi/schema_utils.py index 41189af09..6b5c78589 100644 --- a/chord_metadata_service/restapi/schema_utils.py +++ b/chord_metadata_service/restapi/schema_utils.py @@ -195,7 +195,7 @@ def tag_schema_with_search_properties(schema, search_descriptions: dict | None): return schema_with_search -def tag_schema_with_nested_ids(schema: dict): +def tag_schema_with_nested_ids(schema: dict, override_children: bool = False): if "$id" not in schema: raise ValueError("Schema to tag with nested IDs must have $id") @@ -206,7 +206,10 @@ def tag_schema_with_nested_ids(schema: dict): return { **schema, "properties": { - k: tag_schema_with_nested_ids({**v, "$id": f"{schema_id}/{k}"} if "$id" not in v else v) + k: tag_schema_with_nested_ids( + schema={**v, "$id": f"{schema_id}/{k}"} if override_children or "$id" not in v else v, + override_children=override_children + ) for k, v in schema["properties"].items() }, } if "properties" in schema else schema @@ -217,15 +220,21 @@ def tag_schema_with_nested_ids(schema: dict): "items": tag_schema_with_nested_ids({ **schema["items"], "$id": f"{schema_id}/item", - } if "$id" not in schema["items"] else schema["items"]), + } if override_children or "$id" not in schema["items"] else schema["items"], override_children), } if "items" in schema else schema # If nothing to tag, return itself (base case) return schema -def tag_ids_and_describe(schema: dict, descriptions: dict): - return tag_schema_with_nested_ids(describe_schema(schema, descriptions)) +def tag_ids_and_describe(schema: dict, descriptions: dict, override_children: bool = False): + """ + Adds descriptions and generates $id fields on a given schema. + Set override_children to True to ignore existing children IDs and use the parent's structure. + Helps to make sure that all the properties of a schema have a resolvable URIs, even if reusing definitions. + e.g. ontology_list may need to be resolved from an experiment or phenopacket URI. + """ + return tag_schema_with_nested_ids(describe_schema(schema, descriptions), override_children) def customize_schema(first_typeof: dict, second_typeof: dict, first_property: str, second_property: str, diff --git a/chord_metadata_service/restapi/schemas.py b/chord_metadata_service/restapi/schemas.py index afbb1d1fc..48ebe90cf 100644 --- a/chord_metadata_service/restapi/schemas.py +++ b/chord_metadata_service/restapi/schemas.py @@ -1,8 +1,8 @@ -from pathlib import Path +from django.conf import settings from . import descriptions from .description_utils import EXTRA_PROPERTIES, ONTOLOGY_CLASS as ONTOLOGY_CLASS_DESC from .schema_utils import DATE_TIME, DRAFT_07, SchemaTypes, base_type, tag_ids_and_describe, \ - tag_schema_with_nested_ids, named_one_of, get_schema_app_id, sub_schema_uri + tag_schema_with_nested_ids, named_one_of, sub_schema_uri # Individual schemas for validation of JSONField values @@ -22,14 +22,15 @@ "TIME_ELEMENT_SCHEMA" ] +base_uri = settings.SCHEMAS_BASE_URL # ======================== Phenopackets based schemas ========================= -base_uri = get_schema_app_id(Path(__file__).parent.name) +phenopacket_base_uri = sub_schema_uri(base_uri=base_uri, name="phenopacket") ONTOLOGY_CLASS = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "ontology_class"), + "$id": sub_schema_uri(phenopacket_base_uri, "ontology_class"), "title": "Ontology class schema", "type": "object", "properties": { @@ -42,7 +43,7 @@ ONTOLOGY_CLASS_LIST = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "ontology_class_list"), + "$id": sub_schema_uri(phenopacket_base_uri, "ontology_class_list"), "title": "Ontology class list", "description": "Ontology class list", "type": "array", @@ -51,7 +52,7 @@ CURIE_SCHEMA = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "curie"), + "$id": sub_schema_uri(phenopacket_base_uri, "curie"), "title": "Curie style string", "description": ("A [W3C Compact URI](https://www.w3.org/TR/curie/) formatted string. " "A CURIE string has the structure ``prefix``:``reference``, as defined by the W3C syntax."), @@ -62,7 +63,7 @@ KEY_VALUE_OBJECT = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "key_value_object"), + "$id": sub_schema_uri(phenopacket_base_uri, "key_value_object"), "title": "Key-value object", "description": "The schema represents a key-value object.", "type": "object", @@ -74,20 +75,20 @@ EXTRA_PROPERTIES_SCHEMA = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "extra_properties"), + "$id": sub_schema_uri(phenopacket_base_uri, "extra_properties"), "type": "object" }, EXTRA_PROPERTIES) AGE_STRING = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "age_string"), + "$id": sub_schema_uri(phenopacket_base_uri, "age_string"), "type": "string" }, descriptions.AGE) AGE = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "age"), + "$id": sub_schema_uri(phenopacket_base_uri, "age"), "title": "Age schema", "type": "object", "properties": { @@ -100,7 +101,7 @@ AGE_RANGE = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "age_range"), + "$id": sub_schema_uri(phenopacket_base_uri, "age_range"), "title": "Age range schema", "type": "object", "properties": { @@ -114,7 +115,7 @@ AGE_OR_AGE_RANGE = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "age_or_age_range"), + "$id": sub_schema_uri(phenopacket_base_uri, "age_or_age_range"), "title": "Age schema", "description": "An age object describing the age of the individual at the time of collection of biospecimens or " "phenotypic observations.", @@ -128,7 +129,7 @@ TIME_INTERVAL = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "time_interval"), + "$id": sub_schema_uri(phenopacket_base_uri, "time_interval"), "title": "Age schema", "description": "An age object describing the age of the individual at the time of collection of biospecimens or " "phenotypic observations.", @@ -144,7 +145,7 @@ DISEASE_ONSET = { "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "disease_onset"), + "$id": sub_schema_uri(phenopacket_base_uri, "disease_onset"), "title": "Onset age", "description": "Schema for the age of the onset of the disease.", "type": "object", @@ -158,7 +159,7 @@ GESTATIONAL_AGE = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "gestational_age"), + "$id": sub_schema_uri(phenopacket_base_uri, "gestational_age"), "title": "Gestational age schema", "type": "object", "properties": { @@ -171,7 +172,7 @@ TIME_ELEMENT_SCHEMA = tag_ids_and_describe({ "$schema": DRAFT_07, - "$id": sub_schema_uri(base_uri, "time_element"), + "$id": sub_schema_uri(phenopacket_base_uri, "time_element"), "title": "Time element schema", "type": "object", "oneOf": [ diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py index a74d8b31f..a06f3be5b 100644 --- a/chord_metadata_service/restapi/tests/test_api.py +++ b/chord_metadata_service/restapi/tests/test_api.py @@ -5,6 +5,7 @@ from rest_framework import status from rest_framework.test import APITestCase +from chord_metadata_service.chord.tests.helpers import ProjectTestCase from chord_metadata_service.metadata.service_info import get_service_info from chord_metadata_service.phenopackets import models as ph_m from chord_metadata_service.phenopackets.tests import constants as ph_c @@ -38,7 +39,7 @@ def test_extra_properties_schema_types(self): self.assertEqual(response_obj, expected_response) -class OverviewTest(APITestCase): +class OverviewTest(APITestCase, ProjectTestCase): def setUp(self) -> None: # create 2 phenopackets for 2 individuals; each individual has 1 biosample; @@ -48,7 +49,8 @@ def setUp(self) -> None: self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) self.metadata_2 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_2) self.phenopacket_1 = ph_m.Phenopacket.objects.create( - **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) + **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1), + dataset=self.dataset, ) self.phenopacket_2 = ph_m.Phenopacket.objects.create( id='phenopacket:2', subject=self.individual_2, meta_data=self.metadata_2 diff --git a/chord_metadata_service/restapi/urls.py b/chord_metadata_service/restapi/urls.py index 62c3b23b1..5b1bf087e 100644 --- a/chord_metadata_service/restapi/urls.py +++ b/chord_metadata_service/restapi/urls.py @@ -2,7 +2,13 @@ from rest_framework import routers from chord_metadata_service.chord import api_views as chord_views -from chord_metadata_service.discovery.api_views import public_search_fields, public_overview, public_dataset +from chord_metadata_service.discovery.api_views import ( + public_rules, + public_search_fields, + public_overview, + public_dataset, + discovery_schema +) from chord_metadata_service.experiments import api_views as experiment_views from chord_metadata_service.patients import api_views as individual_views from chord_metadata_service.phenopackets import api_views as phenopacket_views @@ -56,11 +62,14 @@ path('', include(batch_router.urls)), # apps schemas - path('chord_phenopacket_schema', phenopacket_views.get_chord_phenopacket_schema, + path('schemas/phenopacket', phenopacket_views.get_chord_phenopacket_schema, name="chord-phenopacket-schema"), - path('experiment_schema', experiment_views.get_experiment_schema, + path('schemas/phenopacket/', phenopacket_views.get_chord_phenopacket_subschema, + name="chord-phenopacket-subschema"), + path('schemas/experiment', experiment_views.get_experiment_schema, name="experiment-schema"), - + path('schemas/experiment/', experiment_views.get_experiment_subschema, name="experiment-subschema"), + path('schemas/discovery', discovery_schema, name="discovery-schema"), # extra properties schema types path('extra_properties_schema_types', extra_properties_schema_types, name="extra-properties-schema-types"), @@ -78,9 +87,10 @@ # public endpoints (no confidential information leak) path('public', individual_views.PublicListIndividuals.as_view(), name='public',), - path('public_search_fields', public_search_fields, name='public-search-fields',), + path('public_search_fields', public_search_fields, name='public-search-fields'), path('public_overview', public_overview, name='public-overview',), path('public_dataset', public_dataset, name='public-dataset'), + path('public_rules', public_rules, name='public-rules'), # uncensored endpoint for beacon search using fields from config.json path('beacon_search', individual_views.BeaconListIndividuals.as_view(), name='beacon-search'),