diff --git a/docker-compose.yml b/docker-compose.yml
index ac0c76a64..b20005df4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,6 @@
 services:
   python:
+    container_name: rikolti
     build:
       context: ./
       dockerfile: ./Dockerfile.dev
diff --git a/metadata_mapper/mappers/ucsd_blacklight/ucsd_blacklight_mapper.py b/metadata_mapper/mappers/ucsd_blacklight/ucsd_blacklight_mapper.py
index ac7461613..b3107662b 100644
--- a/metadata_mapper/mappers/ucsd_blacklight/ucsd_blacklight_mapper.py
+++ b/metadata_mapper/mappers/ucsd_blacklight/ucsd_blacklight_mapper.py
@@ -3,7 +3,7 @@
 from typing import Any
 
 
-class UcsdBlacklightMapper(Record):
+class UcsdBlacklightRecord(Record):
     BASE_URL = "https://library.ucsd.edu/dc/object/"
     BASE_ARK = "ark:/20775/"
 
@@ -357,7 +357,7 @@ def identifier_content_match(validation_def: dict, rikolti_value: Any,
 
 
 class UcsdBlacklightVernacular(Vernacular):
-    record_cls = UcsdBlacklightMapper
+    record_cls = UcsdBlacklightRecord
     validator = UcsdBlacklightValidator
 
     def parse(self, api_response) -> list:
diff --git a/metadata_mapper/requirements.txt b/metadata_mapper/requirements.txt
index 210804155..9046d68ef 100644
--- a/metadata_mapper/requirements.txt
+++ b/metadata_mapper/requirements.txt
@@ -4,3 +4,7 @@ lxml
 sickle
 MarkupSafe
 python-dotenv
+faker
+pytest
+pytest-assume
+requests-mock
\ No newline at end of file
diff --git a/metadata_mapper/test/__init__.py b/metadata_mapper/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/metadata_mapper/test/conftest.py b/metadata_mapper/test/conftest.py
new file mode 100644
index 000000000..41aad1646
--- /dev/null
+++ b/metadata_mapper/test/conftest.py
@@ -0,0 +1,3 @@
+def pytest_addoption(parser):
+    parser.addoption("--mappers", action="store", default=None)
+    parser.addoption("--mapper", action="store", default=None)
\ No newline at end of file
diff --git a/metadata_mapper/test/helpers/__init__.py b/metadata_mapper/test/helpers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/metadata_mapper/test/helpers/base_helper.py b/metadata_mapper/test/helpers/base_helper.py
new file mode 100644
index 000000000..334390a42
--- /dev/null
+++ b/metadata_mapper/test/helpers/base_helper.py
@@ -0,0 +1,135 @@
+import importlib
+import os
+
+from datetime import datetime
+from faker import Faker
+from random import randint
+from typing import Any, Optional, Union
+
+
+class BaseTestHelper:
+    """
+    Generates fake data for use in mapper unit tests.
+
+    Each helper class can define SCHEMA; every SCHEMA found along a
+    helper's inheritance chain is merged, base class first, to produce
+    the fixture schema for a given mapper.
+
+    If a field requires special logic to generate, set its value in
+    SCHEMA to the name of a generator method (e.g. "splittable_string"),
+    and that method will be called to produce the field's value.
+    """
+
+    # SCHEMAs will be merged together in order to generate
+    # the final fixture schema.
+    SCHEMA = {}
+
+    @classmethod
+    def for_mapper(cls, module_parts: list[str]) -> Optional[type["BaseTestHelper"]]:
+        helper_path = None
+        module_len = len(module_parts)
+        while module_len and not helper_path:
+            helper_path = (
+                "metadata_mapper/test/helpers/"
+                + "/".join(module_parts[:module_len]).replace("_mapper", "")
+                + "_helper.py"
+            )
+            if not os.path.exists(helper_path):
+                helper_path = None
+            module_len = module_len - 1
+
+        if helper_path:
+            helper_module_parts = [
+                p.replace("_mapper", "_helper") for p in module_parts
+            ]
+            helper_class_name = (
+                "".join(
+                    [
+                        word.title()
+                        for word in module_parts[-1].replace("_mapper", "").split("_")
+                    ]
+                )
+                + "TestHelper"
+            )
+            helper_module = importlib.import_module(
+                f".helpers.{'.'.join(helper_module_parts)}",
+                package="rikolti.metadata_mapper.test",
+            )
+            return getattr(helper_module, helper_class_name)
+        else:
+            return None
+
+    def __init__(self, request_mock=None):
+        self.request_mock = request_mock
+        self.faker = Faker()
+        self.collection_id = self.faker.pyint()
+        self.static = {}
+        self.setup_mocks()
+
+    def setup_mocks(self):
+        pass
+
+    def instantiate_record(self, record_class) -> "Record":
+        fixture = self.generate_fixture()
+        instance = record_class(self.collection_id, fixture)
+        self.prepare_record(instance)
+        return instance
+
+    def prepare_record(self, record) -> None:
+        record.legacy_couch_db_id = "asdf--123123"
+
+    def generate_fixture(self, schema_index: int = 0) -> dict[str, Any]:
+        """
+        Generates a test data fixture.
+        """
+        schema = self.merged_schema(schema_index)
+
+        return {
+            key: self.generate_value_for(key, expected_type)
+            for (key, expected_type) in schema.items()
+        }
+
+    def merged_schema(self, schema_index: int = 0) -> dict[str, Any]:
+        inheritance_chain = list(reversed(type(self).__mro__))
+        superschemas = [
+            super(cls, self).SCHEMA
+            for cls in inheritance_chain
+            if hasattr(super(cls, self), "SCHEMA")
+        ]
+
+        schema = {}
+        for superschema in superschemas:
+            schema = {**schema, **superschema}
+
+        return {**schema, **self.SCHEMA}
+
+    def generate_value_for(
+        self,
+        field_name: Optional[str] = None,
+        expected_type: Union[type, list, str] = str,
+    ) -> Any:
+        if isinstance(expected_type, str):
+            # A string names a generator method on this class,
+            # e.g. "splittable_string"
+            return getattr(self, expected_type)()
+        elif isinstance(expected_type, type):
+            return self.generate_value_of_type(expected_type)
+        elif isinstance(expected_type, list):
+            # Pass each item through as the expected type, not the field name
+            return [
+                self.generate_value_for(field_name, item) for item in expected_type
+            ]
+
+    def generate_value_of_type(self, expected_type: type) -> Any:
+        if expected_type == str:
+            return self.faker.pystr()
+        elif expected_type == datetime:
+            return self.faker.date()
+
+    # Helper methods
+
+    def splittable_string(self) -> str:
+        """Generate a string with semicolons to be split on"""
+        return ";".join([self.faker.pystr() for _ in range(0, randint(1, 3))])
+
+    def list_of_splittable_strings(self) -> list[str]:
+        """
+        Generate content to be split and flattened by mapper#split_and_flatten.
+ """ + return [self.splittable_string() for _ in range(0, randint(1, 3))] diff --git a/metadata_mapper/test/helpers/oai/__init__.py b/metadata_mapper/test/helpers/oai/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/metadata_mapper/test/helpers/oai/cca_vault_helper.py b/metadata_mapper/test/helpers/oai/cca_vault_helper.py new file mode 100644 index 000000000..bb0cb9ad7 --- /dev/null +++ b/metadata_mapper/test/helpers/oai/cca_vault_helper.py @@ -0,0 +1,8 @@ +from .oai_helper import OaiTestHelper + +class CcaVaultTestHelper(OaiTestHelper): + + SCHEMA = { + "language": "list_of_splittable_strings", + "source": [str] + } \ No newline at end of file diff --git a/metadata_mapper/test/helpers/oai/content_dm/contentdm_helper.py b/metadata_mapper/test/helpers/oai/content_dm/contentdm_helper.py new file mode 100644 index 000000000..1892d413e --- /dev/null +++ b/metadata_mapper/test/helpers/oai/content_dm/contentdm_helper.py @@ -0,0 +1,38 @@ +import re + +from ..oai_helper import OaiTestHelper +from .....mappers.oai.content_dm.contentdm_mapper import ContentdmRecord + + +class ContentdmTestHelper(OaiTestHelper): + SCHEMA = { + "contributor": "list_of_splittable_strings", + "coverage": "splittable_string", + "creator": "list_of_splittable_strings", + "identifier": "contentdm_identifier", + "language": "list_of_splittable_strings", + "spatial": "splittable_string", + "subject": "list_of_splittable_strings", + "type": "list_of_splittable_strings", + } + + def setup_mocks(self): + matcher = re.compile('utils/ajaxhelper') + response = { + "imageinfo": { + "height": self.faker.pyint(), + "width": self.faker.pyint() + } + } + self.request_mock.register_uri('GET', matcher, json=response) + + def contentdm_identifier(self): + # See ContentdmMapper#get_identifier_parts for required formated + value = f"http://{self.faker.domain_name()}/" + \ + f"{ContentdmRecord.identifier_match}/" + \ + (f"{self.faker.pystr()}/") + \ + f"{self.collection_id}/" + \ + f"{self.faker.pystr()}/" + \ + str(self.faker.pyint()) + + return [value] diff --git a/metadata_mapper/test/helpers/oai/content_dm/csudh_helper.py b/metadata_mapper/test/helpers/oai/content_dm/csudh_helper.py new file mode 100644 index 000000000..59caf3f2c --- /dev/null +++ b/metadata_mapper/test/helpers/oai/content_dm/csudh_helper.py @@ -0,0 +1,15 @@ +from datetime import datetime +from random import randint +from typing import Any + +from .contentdm_helper import ContentdmTestHelper + + +class CsudhTestHelper(ContentdmTestHelper): + SCHEMA = { + "bibliographicCitation": str, + "title": "csudh_title", + } + + def csudh_title(self): + return [f"csudh-{self.faker.pystr()}", self.faker.pystr()] diff --git a/metadata_mapper/test/helpers/oai/oai_helper.py b/metadata_mapper/test/helpers/oai/oai_helper.py new file mode 100644 index 000000000..674d7213b --- /dev/null +++ b/metadata_mapper/test/helpers/oai/oai_helper.py @@ -0,0 +1,29 @@ +from datetime import datetime +from random import randint + +from ..base_helper import BaseTestHelper + +class OaiTestHelper(BaseTestHelper): + + SCHEMA = { + "contributor": str, + "creator": str, + "date": [datetime] * randint(1, 9), + "description": [str] * randint(1, 3), + "extent": str, + "format": [str] * randint(1, 2), + "id": str, + "identifier": [str] * randint(1, 2), + "provenance": str, + "publisher": str, + "relation": [str] * randint(1, 14), + "rights": [str] * randint(1, 2), + "spatial": [str] * randint(1, 2), + "subject": str, + "temporal": str, + "title": str, + "type": str + } + + def 
+    def prepare_record(self, record) -> None:
+        record.select_id(["id"])
\ No newline at end of file
diff --git a/metadata_mapper/test/test_mapper.py b/metadata_mapper/test/test_mapper.py
new file mode 100644
index 000000000..2875ce412
--- /dev/null
+++ b/metadata_mapper/test/test_mapper.py
@@ -0,0 +1,115 @@
+import importlib
+import os
+import pytest
+import re
+import requests_mock
+import traceback
+
+from textwrap import dedent
+
+from .helpers.base_helper import BaseTestHelper
+from ..mappers.mapper import Record
+
+
+class TestMapper:
+    DEFAULT_TEST_METHOD_NAME = "_test_generic_mapper"
+
+    def find_mappers_to_test(self, start_path="metadata_mapper/mappers"):
+        ret = {}
+
+        for entry in os.scandir(start_path):
+            if entry.is_dir():
+                ret = {**ret, **self.find_mappers_to_test(entry.path)}
+            elif entry.is_file() and entry.name.endswith("_mapper.py"):
+                path_regex_result = re.search(r"([\w/]+?_mapper)\.py", entry.path)
+                if path_regex_result:
+                    full_mapper_path = (
+                        path_regex_result[1].replace("/", ".").lstrip(".")
+                    )
+                    module_parts = [
+                        path
+                        for path in full_mapper_path.split(".")
+                        if path not in ["metadata_mapper", "mappers"]
+                    ]
+                    mapper_name = module_parts[-1]
+                    mapper_path = ".".join(module_parts)
+
+                    if hasattr(self, f"test_{mapper_name}"):
+                        ret[mapper_path] = getattr(self, f"test_{mapper_name}")
+                    else:
+                        ret[mapper_path] = getattr(self, self.DEFAULT_TEST_METHOD_NAME)
+
+        return ret
+
+    def get_record_class(self, module_parts, module) -> type[Record]:
+        mapper_name = module_parts[-1].replace("_mapper", "")
+        class_name = f"{self.camelize(mapper_name)}Record"
+        return getattr(module, class_name)
+
+    def camelize(self, words: str) -> str:
+        return "".join([word.title() for word in words.split("_")])
+
+    def _test_generic_mapper(self, record_class, helper):
+        try:
+            instance = helper.instantiate_record(record_class)
+            try:
+                instance.to_UCLDC()
+            except Exception as exc:
+                pytest.assume(
+                    False,
+                    dedent(f"""\n**{type(instance).__name__}** raised error '{exc}'
+                           at time of mapping.\n Here's the backtrace:\n
+                           {traceback.format_exc()}"""),
+                )
+        except Exception as exc:
+            pytest.assume(
+                False,
+                dedent(f"""\n**{record_class.__name__}** raised error '{exc}'
+                       at time of initialization.\n Here's the backtrace:\n
+                       {traceback.format_exc()}"""),
+            )
+
+    # Test methods (invoked by pytest)
+
+    # This will loop through all mappers that don't have explicit test methods
+    # and run them with default data.
+    def test_mappers(self, pytestconfig):
+        with requests_mock.Mocker() as r_mock:
+            default_test_method = getattr(self, self.DEFAULT_TEST_METHOD_NAME)
+
+            mapper_filter = [
+                mapper
+                for mapper in re.split(
+                    r"[,;]",
+                    pytestconfig.getoption("mappers")
+                    or pytestconfig.getoption("mapper")
+                    or "",
+                )
+                if mapper
+            ] or None
+
+            mappers = [
+                mapper
+                for mapper, method in self.find_mappers_to_test().items()
+                if method == default_test_method
+            ]
+
+            for mapper in mappers:
+                module_parts = mapper.split(".")
+                if (
+                    mapper_filter
+                    and module_parts[-1] not in mapper_filter
+                    and module_parts[-1].replace("_mapper", "") not in mapper_filter
+                ):
+                    continue
+
+                module = importlib.import_module(
+                    f".mappers.{'.'.join(module_parts)}",
+                    package="rikolti.metadata_mapper",
+                )
+                helper_class = BaseTestHelper.for_mapper(module_parts)
+
+                if helper_class:
+                    helper = helper_class(r_mock)
+                    record_class = self.get_record_class(module_parts, module)
+                    default_test_method(record_class, helper)
diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py
index cd7d707cd..214099286 100644
--- a/metadata_mapper/utilities.py
+++ b/metadata_mapper/utilities.py
@@ -1,5 +1,12 @@
 import importlib
-from typing import Callable
+import json
+import os
+from typing import Callable, Union
+
+import boto3
+import requests
+
+from . import settings
 
 
 def returns_callable(func: Callable) -> Callable:
@@ -40,3 +47,233 @@
         exit()
 
     return vernacular_class
+
+def get_files(collection_id: int, directory: str) -> list[str]:
+    """
+    Gets a list of filenames in a given directory.
+    """
+    if settings.DATA_SRC["STORE"] == "file":
+        path = settings.local_path(collection_id, directory)
+        try:
+            return [f for f in os.listdir(path)
+                    if os.path.isfile(os.path.join(path, f))]
+        except Exception as e:
+            raise FileNotFoundError(
+                f"{collection_id:<6}: Error listing files in {path}\n"
+                f"{collection_id:<6}: {e}"
+            )
+    elif settings.DATA_SRC["STORE"] == "s3":
+        s3_client = boto3.client('s3')
+        try:
+            resp = s3_client.list_objects_v2(
+                Bucket=settings.DATA_SRC["BUCKET"],
+                Prefix=f"{collection_id}/{directory}"
+            )
+            # TODO: check resp['IsTruncated'] and use ContinuationToken if needed
+            return [obj['Key'] for obj in resp['Contents']]
+        except Exception as e:
+            s3_url = (
+                f"s3://{settings.DATA_SRC['BUCKET']}/{collection_id}/"
+                f"{directory}/")
+            url = (
+                f"https://{settings.DATA_SRC['BUCKET']}.s3.us-west-2.amazonaws"
+                f".com/index.html#{collection_id}/"
+            )
+            raise FileNotFoundError(
+                f"{collection_id:<6}: Error listing files at {s3_url}\n"
+                f"{collection_id:<6}: Check that {directory} exists at {url}\n"
+                f"{collection_id:<6}: {e}"
+            )
+
+def read_from_bucket(collection_id: int, directory: str,
+                     file_name: Union[str, int]) -> str:
+    """
+    Reads the contents of a file from the appropriate content bucket.
+
+    Data comes from local filesystem or S3, depending on ENV vars.
+
+    Parameters:
+        collection_id: int
+            Files are separated into directories by collection_id
+        directory: str
+        file_name: Union[str, int]
+            The name of the file to read
+
+    Returns: str
+        The file contents
+    """
+    if settings.DATA_SRC["STORE"] == 'file':
+        page_path = os.sep.join([
+            settings.local_path(collection_id, directory),
+            str(file_name)
+        ])
+        try:
+            with open(page_path, "r") as metadata_file:
+                return metadata_file.read()
+        except Exception as e:
+            raise Exception(
+                f"{collection_id:<6}: Error reading {page_path}\n"
+                f"{collection_id:<6}: {e}"
+            )
+    elif settings.DATA_SRC["STORE"] == 's3':
+        s3_client = boto3.client('s3')
+        try:
+            s3_obj_summary = s3_client.get_object(
+                Bucket=settings.DATA_SRC["BUCKET"],
+                Key=f"{file_name}"
+            )
+            return s3_obj_summary['Body'].read()
+        except Exception as e:
+            s3_url = f"s3://{settings.DATA_SRC['BUCKET']}/{file_name}"
+            url = (
+                f"https://{settings.DATA_SRC['BUCKET']}.s3.us-west-2.amazonaws"
+                f".com/index.html#{file_name}/"
+            )
+            raise Exception(
+                f"{collection_id:<6}: Error reading file at {s3_url}\n"
+                f"{collection_id:<6}: Check {url}\n"
+                f"{collection_id:<6}: {e}"
+            )
+
+def read_mapped_metadata(collection_id: int, page_id: int) -> list[dict]:
+    """
+    Reads and parses the content of a mapped metadata file.
+
+    Parameters:
+        collection_id: int
+            The collection ID
+        page_id: int
+            The page ID (filename) to read and parse
+
+    Returns: list[dict]
+        The parsed data
+    """
+    return json.loads(read_from_bucket(collection_id, "mapped_metadata", page_id))
+
+
+def read_vernacular_metadata(collection_id: int, page_id: int) -> list[dict]:
+    """
+    Reads and parses the content of a vernacular (unmapped) metadata file.
+
+    Parameters:
+        collection_id: int
+            The collection ID
+        page_id: int
+            The page ID (filename) to read and parse
+
+    Returns: list[dict]
+        The parsed data
+    """
+    return json.loads(read_from_bucket(collection_id, "vernacular_metadata", page_id))
+
+
+def write_to_bucket(collection_id: int, directory: str,
+                    file_name: Union[str, int], content: Union[str, list, dict],
+                    append: bool = False) -> str:
+    if isinstance(content, (list, dict)):
+        content = json.dumps(content)
+
+    if settings.DATA_SRC["STORE"] == 'file':
+        dir_path = settings.local_path(collection_id, directory)
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+        page_path = os.sep.join([dir_path, str(file_name)])
+
+        with open(page_path, "a" if append else "w") as file:
+            file.write(content)
+        file_location = f"file://{page_path}"
+    elif settings.DATA_SRC["STORE"] == 's3':
+        s3_client = boto3.client('s3')
+        key = (
+            f"{collection_id}/{directory}/"
+            f"{file_name}"
+        )
+        s3_client.put_object(
+            Bucket=settings.DATA_DEST["BUCKET"],
+            Key=key,
+            Body=content)
+        file_location = f"s3://{settings.DATA_DEST['BUCKET']}/{key}"
+
+    return file_location
+
+MAPPER_TYPES = ['oac_dc',
+                'ucd_json',
+                'dublin_core',
+                'ucldc_nuxeo',
+                'ucsb_aleph_marc',
+                'ucb_tind_marc',
+                'ucsc_oai_dpla',
+                'ucsd_blacklight_dc',
+                'csa_omeka',
+                'ucla_solr_dc',
+                'quartex_oai',
+                'sjsu_islandora',
+                'cca_vault_oai_dc',
+                'chs_islandora',
+                'contentdm_oai_dc',
+                'preservica_api',
+                'black_gold_oai',
+                'omeka_santa_clara',
+                'calpoly_oai_dc',
+                'csu_sac_oai_dc',
+                'csudh_contentdm_oai_dc',
+                'chula_vista_pl_contentdm_oai_dc',
+                'lapl_oai',
+                'sfpl_marc',
+                'lapl_26096',
+                'ucsf_solr',
+                'cavpp_islandora',
+                'up_oai_dc',
+                'chapman_oai_dc',
+                'califa_oai_dc',
+                'csl_marc',
+                'contentdm_oai_dc_get_sound_thumbs',
+                'pspl_oai_dc',
+                'omeka',
+                'chico_oai_dc',
+                'ucb_bampfa_solr',
+                'islandora_oai_dc',
+                'flickr_api',
+                'youtube_video_snippet',
+                'csuci_mets',
+                'pastperfect_xml',
+                'caltech_restrict',
+                'usc_oai_dc',
+                'yosemite_oai_dc',
+                'emuseum_xml',
+                'csu_dspace_mets',
+                'sierramadre_marc',
+                'burbank_islandora',
+                'omeka_nothumb',
+                'sanjose_pastperfect',
+                'tv_academy_oai_dc',
+                'flickr_sdasm',
+                'flickr_sppl',
+                'internet_archive',
+                'arck_oai']
+
+def get_mapper_pre_enrichments():
+    ret = {}
+    base_url = "https://registry.cdlib.org"
+
+    for mapper_type in MAPPER_TYPES:
+        ret[mapper_type] = {}
+        url = f"{base_url}/api/v1/rikolticollection?mapper_type={mapper_type}&format=json"
+
+        while url:
+            response = requests.get(url)
+
+            data = response.json()
+            objects = data.get("objects")
+            for obj in objects:
+                pre_mappers = ",".join(obj.get("rikolti__pre_mapping"))
+                if ret[mapper_type].get(pre_mappers):
+                    ret[mapper_type][pre_mappers].append(obj["id"])
+                else:
+                    ret[mapper_type][pre_mappers] = [obj["id"]]
+
+            url = data.get("next")
+            if url:
+                url = f"{base_url}{url}"
+
+    return ret
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 712efe192..7489a7ee1 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -3,3 +3,4 @@
 ipython
 ruff
 isort
+pytest
\ No newline at end of file
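
Usage sketch (illustrative, not part of the patch). Assuming pytest is run
from the directory containing the rikolti package, the options added in
conftest.py filter which mappers receive the generic test; names are accepted
with or without the "_mapper" suffix, separated by commas or semicolons:

    python -m pytest metadata_mapper/test/test_mapper.py
    python -m pytest metadata_mapper/test/test_mapper.py --mappers=contentdm,csudh

And a hypothetical helper for a new OAI mapper, showing the SCHEMA
conventions (a type value generates a fake of that type, a list generates a
list of fakes, and a string names a generator method on the helper):

    from .oai_helper import OaiTestHelper

    class ExampleTestHelper(OaiTestHelper):  # hypothetical; not in this patch
        SCHEMA = {
            "language": "list_of_splittable_strings",  # calls the named helper method
            "source": [str],                           # list of fake strings
        }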