diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index de6750a2..4f52ecfa 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -19,7 +19,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install flake8 pytest==6.2.5 if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | diff --git a/Code_of_Conduct.md b/Code_of_Conduct.md new file mode 100644 index 00000000..90e8037b --- /dev/null +++ b/Code_of_Conduct.md @@ -0,0 +1,72 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge +One of the primary goals of the pyQuARC repository is to cultivate a respectful and collaborative environment for all users, community members, stakeholders, and developers. Our Code of Conduct is grounded in the FAIR principles (Findable, Accessible, Interoperable, and Reusable) and outlines our expectations for all participants, as well as the consequences of unacceptable behavior. We invite all users to help us create a positive experience for every member of the community. + +## Our Standards +We strive to create a space that empowers people to provide outstanding contributions to open science. Achieving this requires an open exchange of ideas, guided by thoughtful and respectful standards. 
* Committing malicious code
defines moderators as follows: “Moderators
If a participant violates the Code of Conduct a third time, demonstrating a repeated pattern of disregarding community standards, they will be permanently banned and removed from the pyQuARC GitHub repository.
+ +## Attributions +The pyQuARC Code of Conduct has been adopted from the following sources: + +* [The GSA Code of Conduct](https://handbook.tts.gsa.gov/about-us/code-of-conduct/) +* [The Contributor Covenant](https://www.contributor-covenant.org/), [version 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.md) +* [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion#code-of-conduct--enforcement) +* [The Citizen Code of Conduct](https://github.com/stumpsyn/policies/blob/master/citizen_code_of_conduct.md) +* [Django Code of Conduct](https://www.djangoproject.com/conduct/) +* [The TTS Handbook](https://handbook.tts.gsa.gov/about-us/code-of-conduct/) +* [Ada Initiative](https://adainitiative.org/) +* [National Aeronautics and Space Administration Open-Source Software Policy](https://www.earthdata.nasa.gov/engage/open-data-services-software-policies/open-source-software-policy) +* [MetaDocencia - Transform to Open Science repository](https://github.com/MetaDocencia/Transform-to-Open-Science_ES) \ No newline at end of file diff --git a/README.md b/README.md index 03da9ce2..89ce3600 100644 --- a/README.md +++ b/README.md @@ -5,79 +5,150 @@ [![DOI](https://zenodo.org/badge/153786129.svg)](https://zenodo.org/doi/10.5281/zenodo.10724716) ## Introduction - The pyQuARC (*pronounced "pie-quark"*) library was designed to read and evaluate descriptive metadata used to catalog Earth observation data products and files. This type of metadata focuses and limits attention to important aspects of data, such as the spatial and temporal extent, in a structured manner that can be leveraged by data catalogs and other applications designed to connect users to data. Therefore, poor quality metadata (e.g. inaccurate, incomplete, improperly formatted, inconsistent) can yield subpar results when users search for data. 
Metadata that inaccurately represents the data it describes risks matching users with data that does not reflect their search criteria and, in the worst-case scenario, can make data impossible to find. Given the importance of high quality metadata, it is necessary that metadata be regularly assessed and updated as needed. pyQuARC is a tool that can help streamline the process of assessing metadata quality by automating it as much as possible. In addition to basic validation checks (e.g. adherence to the metadata schema, controlled vocabularies, and link checking), pyQuARC flags opportunities to improve or add contextual metadata information to help the user connect to, access, and better understand the data product. pyQuARC also ensures that information common to both data product (i.e. collection) and the file-level (i.e. granule) metadata are consistent and compatible. As open source software, pyQuARC can be adapted and customized to allow for quality checks unique to different needs. -## pyQuARC Base Package +## pyQuARC Metadata Quality Framework +pyQuARC was designed to assess metadata in NASA’s [Common Metadata Repository (CMR)](https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components), a centralized repository for all of NASA’s Earth observation data products. In addition, the CMR contains metadata for Earth observation products submitted by external partners. The CMR serves as the backend for NASA’s Earthdata Search ([search.earthdata.nasa.gov](https://search.earthdata.nasa.gov/)) and is also the authoritative metadata source for NASA’s [Earth Observing System Data and Information System (EOSDIS)](https://earthdata.nasa.gov/eosdis). -pyQuARC was specifically designed to assess metadata in NASA’s [Common Metadata Repository (CMR)](https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components), which is a centralized metadata repository for all of NASA’s Earth observation data products. 
In addition to NASA’s ~9,000 data products, the CMR also holds metadata for over 40,000 additional Earth observation data products submitted by external data partners. The CMR serves as the backend for NASA’s Earthdata Search (search.earthdata.nasa.gov) and is also the authoritative metadata source for NASA’s [Earth Observing System Data and Information System (EOSDIS).](https://earthdata.nasa.gov/eosdis) +pyQuARC was initially developed by a group called the [Analysis and Review of the CMR (ARC)](https://www.earthdata.nasa.gov/data/projects/analysis-review-cmr-project) team. The ARC team conducted quality assessments of NASA’s metadata records in the CMR, identified opportunities for improvement in the metadata records, and collaborated with the data archive centers to resolve any identified issues. ARC has developed a [metadata quality assessment framework](http://doi.org/10.5334/dsj-2021-017) which specifies a common set of assessment criteria. These criteria focus on correctness, completeness, and consistency with the goal of making data more discoverable, accessible, and usable. The ARC metadata quality assessment framework is the basis for the metadata checks that have been incorporated into pyQuARC base package. Specific quality criteria for each CMR metadata element are documented in the [Earthdata Wiki space](https://wiki.earthdata.nasa.gov/display/CMR/CMR+Metadata+Best+Practices%3A+Landing+Page). -pyQuARC was developed by a group called the [Analysis and Review of the CMR (ARC)](https://earthdata.nasa.gov/esds/impact/arc) team. The ARC team conducts quality assessments of NASA’s metadata records in the CMR, identifies opportunities for improvement in the metadata records, and collaborates with the data archive centers to resolve any identified issues. ARC has developed a [metadata quality assessment framework](http://doi.org/10.5334/dsj-2021-017) which specifies a common set of assessment criteria. 
These criteria focus on correctness, completeness, and consistency with the goal of making data more discoverable, accessible, and usable. The ARC metadata quality assessment framework is the basis for the metadata checks that have been incorporated into pyQuARC base package. Specific quality criteria for each CMR metadata element is documented in the following wiki: -[https://wiki.earthdata.nasa.gov/display/CMR/CMR+Metadata+Best+Practices%3A+Landing+Page](https://wiki.earthdata.nasa.gov/display/CMR/CMR+Metadata+Best+Practices%3A+Landing+Page) +Each metadata element’s wiki page includes an “Metadata Validation and QA/QC” section that lists quality criteria categorized by priority levels, referred to as a priority matrix. The [priority matrix](https://wiki.earthdata.nasa.gov/spaces/CMR/pages/109874556/ARC+Priority+Matrix) are designated as high (red), medium (yellow), or low (blue), and are intended to communicate the importance of meeting the specified criteria. -There is an “ARC Metadata QA/QC” section on the wiki page for each metadata element that lists quality criteria categorized by level of [priority. Priority categories](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) are designated as high (red), medium (yellow), or low (blue), and are intended to communicate the importance of meeting the specified criteria. +The CMR is designed around its own metadata standard called the [Unified Metadata Model (UMM)](https://www.earthdata.nasa.gov/about/esdis/eosdis/cmr/umm). In addition to being an extensible metadata model, the UMM provides a crosswalk for mapping among the various CMR-supported metadata standards, including DIF10, ECHO10, ISO 19115-1, and ISO 19115-2. 
-The CMR is designed around its own metadata standard called the [Unified Metadata Model (UMM).](https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components/cmr/umm) In addition to being an extensible metadata model, the UMM also provides a cross-walk for mapping between the various CMR-supported metadata standards. CMR-supported metadata standards currently include: -* [DIF10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/directory-interchange-format-dif-standard) (Collection/Data Product-level only) -* [ECHO10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/echo-metadata-standard) (Collection/Data Product and Granule/File-level metadata) -* [ISO19115-1 and ISO19115-2](https://earthdata.nasa.gov/esdis/eso/standards-and-references/iso-19115) (Collection/Data Product and Granule/File-level metadata) +pyQuARC currently supports the following metadata standards: * [UMM-JSON](https://wiki.earthdata.nasa.gov/display/CMR/UMM+Documents) (UMM) - * UMM-C (Collection/Data Product-level metadata) - * UMM-G (Granule/File-level metadata) - * UMM-S (Service metadata) - * UMM-T (Tool metadata) + * Collection/Data Product-level metadata (UMM-C) + * Granule/File-level metadata (UMM-G) +* [ECHO10](https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0) + * Collection/Data Product-level metadata (ECHO-C) + * Granule/File-level metadata (ECHO-G) +* [DIF10](https://git.earthdata.nasa.gov/projects/EMFD/repos/dif-schemas/browse) + * Collection/Data Product-level only +## Install and Clone the Repository +The pyQuARC library requires `Python 3.10` to function properly across all operating systems. -pyQuARC supports DIF10 (collection only), ECHO10 (collection and granule), UMM-C, and UMM-G standards. At this time, there are no plans to add ISO 19115 or UMM-S/T specific checks. **Note that pyQuARC development is still underway, so further enhancements and revisions are planned.** +### 1. 
* `conda create --name <env-name>` # Replace `<env-name>` with the name of your environment.
* `conda activate <env-name>`
For more, please visit the [CMR API documentation](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html).
-The `rule_mapping.json` file specifies which metadata element(s) each rule applies to. The `rule_mapping.json` also references the `messages.json` file which includes messages that can be displayed when a check passes or fails. +Furthermore, the `rule_mapping.json` file specifies the severity level associated with a failure. If a check fails, it is assigned one of three categories: ❌ Error, ⚠️ Warning, or ℹ️ Info. These categories correspond to priority levels in [ARC’s priority matrix](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) and indicate the importance of the failed check. Default severity values are based on ARC’s metadata quality assessment framework but can be customized to meet individual needs. -Furthermore, the `rule_mapping.json` file specifies the level of severity associated with a failure. If a check fails, it will be assigned a severity category of “error”, “warning”, or "info.” These categories correspond to priority categorizations in [ARC’s priority matrix](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) and communicate the importance of the failed check, with “error” being the most critical category, “warning” indicating a failure of medium priority, and “info” indicating a minor issue or inconsistency. Default severity values are assigned based on ARC’s metadata quality assessment framework, but can be customized to meet individual needs. +❌ Error → most critical issues +⚠️ Warning → medium-priority issues +ℹ️ Info → minor issues -## Customization -pyQuARC is designed to be customizable. Output messages can be modified using the `messages_override.json` file - any messages added to `messages_override.json` will display over the default messages in the `message.json` file. Similarly, there is a `rule_mapping_override.json` file which can be used to override the default settings for which rules/checks are applied to which metadata elements. 
* `https://cmr.earthdata.nasa.gov/search/collections.umm-json?entry_id=SHORTNAME_VERSION#&all_revisions=true`
+You will also need to replace `VERSION#` in the path with the actual Version number listed under Collection Details in Earthdata Search (for example: 1). -**Go to the project directory:** `cd pyQuARC` +For the dataset “Aqua AIRS-MODIS 1-km Matchup Indexes V1 (Aqua_AIRS_MODIS1km_IND) at GES_DISC” with Short Name Aqua_AIRS_MODIS1km_IND and Version 1, the path is modified as follows: + +* `https://cmr.earthdata.nasa.gov/search/collections.umm-json?entry_id=Aqua_AIRS_MODIS1km_IND_1&all_revisions=true` + +You should now be able to find the `concept-id` for that collection (data product). + +For individual files (granules), locating the Concept ID is straightforward. In [Earthdata Search](https://search.earthdata.nasa.gov/), find the file of interest, click View Details, and then check the Information tab to see the Concept ID. + +### Running pyQuARC Using the Concept ID +Now that you have identified the Concept ID for the collection (data product) or granule (individual file) metadata, you can use the following command in your code editor to curate it: -**Create a python virtual environment:** `python -m venv env` +* `python pyQuARC/main.py --concept_ids CONCEPT_ID --format FORMAT` -**Activate the environment:** `source env/bin/activate` +`CONCEPT_ID` should be replaced with the Concept ID of the collection or granule-level metadata (for example: `C2515837343-GES_DISC`). +`FORMAT` should be replaced with the schema you are using to validate the metadata. This will differ depending on whether you are curating collection- or granule-level metadata. 
- `dif10` (for collection only)
`python pyQuARC/main.py --concept_ids CONCEPT_ID_1, CONCEPT_ID_2, CONCEPT_ID_3 --format umm-c > pyquarc_output.csv`
+This page is meant to help you learn how you can contribute to pyQuARC! We are passionate about NASA's Open Science initiative and are open to a variety of contributions. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself. + +## How you can contribute to pyQuARC: + +1. **Report a Bug:** for when you find something within the code that does not respond the way you expected/wanted it to. + * To start you will need to proceed to the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. + * From here, look for the green button on the right side of the page labeled **New issue**. + * Select **Bug Report** from the list that appears, so that you can create a report to help us improve an aspect of pyQuARC. + * The page you are directed to will provide a prompt to add a title and explain how to fill in the bug you want to report. + * If you change your mind about reporting a bug, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. + * Beneath the description box, select "Issue Type" and "Bug". + * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors, and pyQuARC developers will automatically be assigned to the Issue and notified. + * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the red __bug__ tag. + +2. **Suggest a New Feature:** for when you think of something that could enhance pyQuARC for other users. + * Suggesting a new feature is very similar to reporting a bug. You will start at the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. 
+ * Select the green **New Issue** button found on the top right side of the page. + * From the menu that appears, select **Feature Request** so that you can suggest an idea for our project. + * The page you are directed to will provide a prompt to add a title and explain how to make a new suggestion. + * If you change your mind about making a feature request, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. + * Beneath the description box, select "Issue Type" and "Feature". + * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors. + * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the green __new check__ tag. + +3. **Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks or new features. + * Fork the repository + * To edit the code, you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibility settings with the original repository and allows you to create your edits. Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo). + * To create your fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. + * On the top right of the page, select the **Fork** tab. + * Under the "Owner" dropdown menu, select yourself as the owner of the new forked repository. + * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. + * You can set an optional description in the 'Description' field below. 
+ * Make sure the checkbox next to 'Copy the master branch only' is selected. + * Click **Create fork** when you are finished to create your fork! + * After completing the steps above, you should be on a new page titled the same as your new fork, with "forked from NASA-IMPACT/pyQuARC" beneath the title. You have successfully created a fork of pyQuARC! + * Clone your fork locally + * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. + * Under the **HTTPS** tab, copy the link to the repository. + * Open a Python terminal in your preferred coding location. + * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. + * Type '__git clone__' and then paste the URL you copied a few steps above. + * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer. + * Create a new branch and make your desired changes. + * Create a PR + * Once your changes are made, push your commits. + * You can then open a Pull Request (PR) on the [**Pull requests** tab](https://github.com/NASA-IMPACT/pyQuARC/pulls) within the pyQuARC Github page. + * Set the base repository to "NASA-IMPACT/pyQuARC" and the base to "dev". + * Fill out a title and description, then submit! + * Feedback may be provided on your PR. Once it is approved, a pyQuARC team member will merge your changes. + +## Thank you for your interest in pyQuARC! +We appreciate your interest in pyQuARC! Everyone is encouraged to help improve pyQuARC, and we welcome your comments, suggestions, and new ideas! +Please contact earthdata-support@nasa.gov with any questions. 
diff --git a/pyQuARC/code/checker.py b/pyQuARC/code/checker.py index 4bb401c7..69904bee 100644 --- a/pyQuARC/code/checker.py +++ b/pyQuARC/code/checker.py @@ -14,6 +14,9 @@ from .string_validator import StringValidator from .url_validator import UrlValidator +from .schema_validator import SchemaValidator +from .constants import UMM_C # or however you define metadata format + from .constants import ECHO10_C, SCHEMA_PATHS @@ -117,9 +120,9 @@ def build_message(self, result, rule_id): rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get( rule_id ) - severity = rule_mapping.get("severity", "error") messages = [] if not (result["valid"]) and result.get("value"): + severity = result.get("severity") or rule_mapping.get("severity", "error") for value in result["value"]: formatted_message = failure_message value = value if isinstance(value, tuple) else (value,) diff --git a/pyQuARC/code/constants.py b/pyQuARC/code/constants.py index e77c74fc..6eadb68b 100644 --- a/pyQuARC/code/constants.py +++ b/pyQuARC/code/constants.py @@ -73,17 +73,26 @@ GCMD_BASIC_URL = "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/" GCMD_LINKS = { - keyword: f"{GCMD_BASIC_URL}{keyword}?format=csv" for keyword in GCMD_KEYWORDS + keyword: f"{GCMD_BASIC_URL}{keyword}?format=csv" + for keyword in GCMD_KEYWORDS } CMR_URL = "https://cmr.earthdata.nasa.gov" DATE_FORMATS = [ - "%Y-%m-%dT%H:%M:%S.%f", # Year to microsecond - "%Y-%m-%dT%H:%M:%S", # Year to second - "%Y-%m-%dT%H:%M", # Year to minute - "%Y-%m-%dT%H", # Year to hour + "%Y-%m-%dT%H:%M:%S.%fZ", # Year to microsecond + "%Y-%m-%dT%H:%M:%SZ", # Year to second + "%Y-%m-%dT%H:%MZ", # Year to minute + "%Y-%m-%dT%HZ", # Year to hour "%Y-%m-%d", # Year to day "%Y-%m", # Year to month "%Y", # Year ] + +CONTENT_TYPE_MAP = { + UMM_C: "vnd.nasa.cmr.umm+json", + UMM_G: "vnd.nasa.cmr.umm+json", + ECHO10_C: "echo10+xml", + ECHO10_G: "echo10+xml", + DIF: "dif10+xml" +} diff --git a/pyQuARC/code/custom_checker.py 
b/pyQuARC/code/custom_checker.py index f38cedda..839ee6cc 100644 --- a/pyQuARC/code/custom_checker.py +++ b/pyQuARC/code/custom_checker.py @@ -45,7 +45,11 @@ def _get_path_value_recursively( or isinstance(root_content, int) or isinstance(root_content, float) ): - container.append(root_content) + # if there is at least one element in new_path, the value can not be found + if new_path: + container.append(None) + else: + container.append(root_content) return elif isinstance(root_content, list): if not new_path: @@ -184,6 +188,7 @@ def run( for future in as_completed(future_results): try: func_return = future.result() + severity = func_return.get("severity") valid = func_return["valid"] # can be True, False or None if valid is not None: if valid: @@ -196,4 +201,6 @@ def run( raise e result["valid"] = validity result["value"] = invalid_values + if severity: + result["severity"] = severity return result diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py index bf3620d1..ab789d3f 100644 --- a/pyQuARC/code/custom_validator.py +++ b/pyQuARC/code/custom_validator.py @@ -2,6 +2,7 @@ from .string_validator import StringValidator from .utils import cmr_request, if_arg, set_cmr_prms +from collections.abc import Mapping class CustomValidator(BaseValidator): @@ -277,3 +278,63 @@ def count_check(count, values, key): items = [items] num_items = len(items) return {"valid": int(count) == num_items, "value": (count, num_items)} + + @staticmethod + def opendap_link_check(related_urls, key, extra=None): + """ + Checks if the related_urls contains an OPeNDAP link by looking for "opendap" in the URL + or matching Type/Subtype fields. This function works with both OrderedDict and regular dict, + as well as a list of dictionaries. + + Args: + related_urls (list or Mapping): The related_urls field of the object, expected to be a list of URL objects + or a single OrderedDict. + key (dict): A dictionary with "type" and "url_keyword" keys for the checks. 
+ extra (optional): An additional argument to match the expected function call signature. This argument is ignored. + + Returns: + dict: A validation result indicating whether a valid OPeNDAP link is present and the link itself if found. + """ + + # If related_urls is None or not provided, initialize it as an empty list + if not related_urls: + related_urls = [] + + # If related_urls is a single Mapping (like OrderedDict), wrap it in a list + elif isinstance(related_urls, Mapping): + related_urls = [related_urls] + + # Default return object if no valid OPeNDAP link is found + return_obj = {"valid": False, "value": "None"} + + # Extract URL keyword and type to check from key + url_keyword = key.get("url_keyword", "opendap").lower() + type_to_check = key.get("type", "OPENDAP DATA").upper() + + # Process each URL object in the list + for url_obj in related_urls: + # Ensure that url_obj is a dictionary-like object before processing + if not isinstance(url_obj, Mapping): + continue + + # Retrieve the URL field + url_value = url_obj.get("URL", "").lower() + + # Check if the URL contains "opendap" + if "opendap" in url_value: + return_obj["valid"] = True + return_obj["value"] = url_value + break + + # Retrieve and normalize Type and Subtype fields + type_field = url_obj.get("Type", "").upper() + subtype_field = url_obj.get("Subtype", "").upper() + + # Check if the Type or Subtype contains "OPENDAP DATA" + if type_to_check in type_field or type_to_check in subtype_field: + return_obj["valid"] = True + return_obj["value"] = url_value if url_value else "None" + break + + return return_obj + diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py index fd67e4ef..0d33cecd 100644 --- a/pyQuARC/code/datetime_validator.py +++ b/pyQuARC/code/datetime_validator.py @@ -87,9 +87,17 @@ def date_or_datetime_format_check(datetime_string): Returns: (dict) An object with the validity of the check and the instance """ + is_datetime = 
DatetimeValidator._iso_datetime(datetime_string) + is_date = DatetimeValidator._iso_date(datetime_string) + + # If it's a datetime, require that it ends with 'Z' + if is_datetime and not datetime_string.endswith("Z"): + is_datetime = False + + valid = is_datetime or is_date + return { - "valid": bool(DatetimeValidator._iso_datetime(datetime_string)) - or bool(DatetimeValidator._iso_date(datetime_string)), + "valid": valid, "value": datetime_string, } @@ -140,20 +148,37 @@ def validate_datetime_against_granules( "granules", ) granules = cmr_request(cmr_prms) - validity = True last_granule_datetime = None + last_granule_datetime_string = None date_time = None # Compare the precision of the two datetime strings if len(granules["feed"]["entry"]) > 0: last_granule = granules["feed"]["entry"][0] - last_granule_datetime = last_granule.get(time_key) + last_granule_datetime_string = last_granule.get(time_key) date_time = get_date_time(datetime_string) - last_granule_datetime = get_date_time(last_granule_datetime) + last_granule_datetime = get_date_time(last_granule_datetime_string) validity = date_time == last_granule_datetime + diff_bigger_than_a_day = abs( + (date_time - last_granule_datetime).total_seconds() / 3600 + ) > 24 + else: + validity = False + + return_value = {} + if ( + (not date_time) + or not last_granule_datetime + or diff_bigger_than_a_day + ): + return_value["severity"] = "error" - return {"valid": validity, "value": (date_time, last_granule_datetime)} + return { + **return_value, + "valid": validity, + "value": (datetime_string, last_granule_datetime_string), + } @staticmethod @if_arg diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..1ba229e9 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,12 +3,26 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from 
lxml import etree from urllib.request import pathname2url +from .utils import read_json_schema_from_url +from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G -from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C +SUPPORTED_UMM_C_VERSIONS = ["v1.18.4"] +DEFAULT_UMM_C_VERSION = "v1.18.4" # Or any other version you prefer as default + +# Define UMM-G versions if you want to make it flexible as well +SUPPORTED_UMM_G_VERSIONS = ["v1.6.6"] +DEFAULT_UMM_G_VERSION = "v1.6.6" + +SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" + +REMOTE_XML_SCHEMAS = { + "echo10_collection": "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0/Collection.xsd", + "echo10_granule": "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0/Granule.xsd" +} class SchemaValidator: """ @@ -21,6 +35,10 @@ def __init__( self, check_messages, metadata_format=ECHO10_C, + # Add a new parameter for UMM-C version + umm_c_version=DEFAULT_UMM_C_VERSION, + # Add a new parameter for UMM-G version (if you want to make it flexible too) + umm_g_version=DEFAULT_UMM_G_VERSION ): """ Args: @@ -29,41 +47,95 @@ def __init__( validation_paths (list of str): The path of the fields in the metadata that need to be validated. In the form ['Collection/StartDate', ...]. + umm_c_version (str): The specific UMM-C version to use for validation (e.g., "v1.18.4"). + umm_g_version (str): The specific UMM-G version to use for validation (e.g., "v1.6.6"). + check_messages (dict): A dictionary of check messages for errors. """ self.metadata_format = metadata_format + # Validate and store the UMM-C version + if umm_c_version not in SUPPORTED_UMM_C_VERSIONS: + raise ValueError( + f"Unsupported UMM-C version: {umm_c_version}. 
" + f"Supported versions are: {', '.join(SUPPORTED_UMM_C_VERSIONS)}" + ) + self.umm_c_version = umm_c_version + + # Validate and store the UMM-G version + if umm_g_version not in SUPPORTED_UMM_G_VERSIONS: + raise ValueError( + f"Unsupported UMM-G version: {umm_g_version}. " + f"Supported versions are: {', '.join(SUPPORTED_UMM_G_VERSIONS)}" + ) + self.umm_g_version = umm_g_version + if metadata_format.startswith("umm-"): self.validator_func = self.run_json_validator else: self.validator_func = self.run_xml_validator self.check_messages = check_messages + + def read_xml_schema(self): """ - Reads the xml schema file + Reads the XML schema file (either from a remote URL or local path). """ - # The XML schema file (echo10_xml.xsd) imports another schema file (MetadataCommon.xsd) - # Python cannot figure out the import if they are in a different location than the calling script - # Thus we need to set an environment variable to let it know where the files are located - # Path to catalog must be a url + from urllib.request import urlopen + + # Maintain XML catalog handling catalog_path = f"file:{pathname2url(str(SCHEMA_PATHS['catalog']))}" - # Temporarily set the environment variable os.environ["XML_CATALOG_FILES"] = os.environ.get( "XML_CATALOG_FILES", catalog_path ) - with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: - file_content = schema_file.read().encode() - xmlschema_doc = etree.parse(BytesIO(file_content)) - schema = etree.XMLSchema(xmlschema_doc) - return schema + def get_raw_schema_url(browse_url: str) -> str: + """Convert /browse/ URL into /raw/ for direct XML download.""" + if "/browse/" in browse_url: + return browse_url.replace("/browse/", "/raw/") + "?at=refs%2Fheads%2Fmaster" + return browse_url + # Select remote schema if metadata_format matches + schema_url = REMOTE_XML_SCHEMAS.get(self.metadata_format) + try: + if schema_url: + raw_url = get_raw_schema_url(schema_url) + print(f"Fetching schema remotely from: {raw_url}") + import 
ssl + ssl_context = ssl._create_unverified_context() # Disable certificate check safely for this fetch + with urlopen(raw_url, context=ssl_context) as response: + file_content = response.read() + else: + # Fallback to local schema file + with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: + file_content = schema_file.read().encode() + + xmlschema_doc = etree.parse(BytesIO(file_content)) + schema = etree.XMLSchema(xmlschema_doc) + return schema + + except Exception as e: + print(f"⚠️ Remote fetch failed or unavailable for {self.metadata_format}: {e}") + print("Falling back to local schema file...") + with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: + file_content = schema_file.read().encode() + xmlschema_doc = etree.parse(BytesIO(file_content)) + schema = etree.XMLSchema(xmlschema_doc) + return schema + def read_json_schema(self): """ Reads the json schema file """ + if self.metadata_format == UMM_C: + schema_url = (f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json") + return read_json_schema_from_url(schema_url) + + if self.metadata_format == UMM_G: + schema_url = (f"{SCHEMA_CDN_BASE}/granule/{self.umm_g_version}/umm-g-json-schema.json") + return read_json_schema_from_url(schema_url) + with open(SCHEMA_PATHS[f"{self.metadata_format}-json-schema"]) as schema_file: - schema = json.load(schema_file) - return schema + return json.load(schema_file) def run_json_validator(self, content_to_validate): """ @@ -77,21 +149,31 @@ def run_json_validator(self, content_to_validate): schema_store = {} if self.metadata_format == UMM_C: - with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as schema_file: - schema_base = json.load(schema_file) - # workaround to read local referenced schema file (only supports uri) - schema_store = { - schema_base.get("$id", "/umm-cmn-json-schema.json"): schema_base, - schema_base.get("$id", "umm-cmn-json-schema.json"): schema_base, - } - errors = {} + #umm_cmn_schema_url = 
f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json" + # If it's *not* versioned and always the latest or a specific fixed version, adjust this URL + # e.g., f"{SCHEMA_CDN_BASE}/common/umm-cmn-json-schema.json" or from SCHEMA_PATHS - resolver = RefResolver.from_schema(schema, store=schema_store) + try: + with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as common_schema_file: + schema_base = json.load(common_schema_file) + # 1. Add the schema using its $id (most common canonical reference) + if "$id" in schema_base: + schema_store[schema_base["$id"]] = schema_base + + # 2. Add the schema using the full URL you fetched it from (if different from $id or for robustness) + schema_store["/umm-cmn-json-schema.json"] = schema_base + schema_store["umm-cmn-json-schema.json"] = schema_base + except Exception as e: + print(f"Error loading UMM Common schema from {SCHEMA_PATHS['umm-cmn-json-schema']}: {e}") + print("Schema validation for UMM-C might proceed without common schema, leading to incomplete validation.") + errors = {} + resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, + format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +218,13 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = 
re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/code/string_validator.py b/pyQuARC/code/string_validator.py index 1bd27715..8ba756c1 100644 --- a/pyQuARC/code/string_validator.py +++ b/pyQuARC/code/string_validator.py @@ -1,7 +1,7 @@ from .base_validator import BaseValidator from .gcmd_validator import GcmdValidator from .utils import cmr_request, collection_in_cmr, if_arg, set_cmr_prms - +import re class StringValidator(BaseValidator): """ @@ -38,15 +38,21 @@ def length_check(string, extent, relation): def compare(first, second, relation): """ Compares two strings based on the relationship - Returns: - (dict) An object with the validity of the check and the instance + (dict) An object with the validity of the check and the instance """ + + # Check if 'first' and 'second' contain any special characters + first_clean = re.sub(r'[^a-zA-Z0-9]', '', first).upper() + second_clean = re.sub(r'[^a-zA-Z0-9]', '', second).upper() + + # If either string contains special characters, return a warning or handle as needed return { - "valid": BaseValidator.compare(first.upper(), second.upper(), relation), + "valid": BaseValidator.compare(first_clean, second_clean, relation), "value": (first, second), } + @staticmethod @if_arg def controlled_keywords_check(value, keywords_list): diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 55a74e61..fc4d7efa 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -34,6 +34,28 @@ def _extract_http_texts(text_with_urls): starts_with_http.add(text) return starts_with_http + @staticmethod + def _status_code_from_request(url): + """ + Return HTTP status code for url, raising requests exceptions to caller. 
+ """ + headers = get_headers() + return requests.get(url, headers=headers, timeout=10).status_code + + @staticmethod + def _extract_and_normalize_urls(text_with_urls): + """ + Extract URLs from text, include tokens that start with 'http', strip trailing dots, + and return (set_of_urls, joined_value_string). + """ + extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) + urls = extractor.find_urls(text_with_urls) + urls.extend(UrlValidator._extract_http_texts(text_with_urls)) + # remove dots at the end and deduplicate + urls = set(url[:-1] if url.endswith(".") else url for url in urls) + value = ", ".join(urls) + return urls, value + @staticmethod @if_arg def health_and_status_check(text_with_urls): @@ -45,48 +67,87 @@ def health_and_status_check(text_with_urls): (dict) An object with the validity of the check and the instance/results """ - def status_code_from_request(url): - headers = get_headers() - # timeout = 10 seconds, to allow for slow but not invalid connections - return requests.get(url, headers=headers, timeout=10).status_code - results = [] validity = True - # extract URLs from text - extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) - urls = extractor.find_urls(text_with_urls) - urls.extend(UrlValidator._extract_http_texts(text_with_urls)) - - # remove dots at the end (The URLExtract library catches URLs, but sometimes appends a '.' at the end) - # remove duplicated urls - urls = set(url[:-1] if url.endswith(".") else url for url in urls) - value = ", ".join(urls) + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) - # check that URL returns a valid response for url in urls: - if not url.startswith("http"): - url = f"http://{url}" - try: - response_code = status_code_from_request(url) - if response_code == 200: - if url.startswith("http://"): - secure_url = url.replace("http://", "https://") - if status_code_from_request(secure_url) == 200: - result = { - "url": url, - "error": "The URL is secure. 
Please use 'https' instead of 'http'.", - } + if url.startswith("https"): + try: + response_code = UrlValidator._status_code_from_request(url) + if response_code != 200: + result = { + "url": url, + "error": f"The url {url} is broken.", + } + results.append(result) else: continue - else: - result = {"url": url, "error": f"Status code {response_code}"} - except requests.ConnectionError: - result = {"url": url, "error": "The URL does not exist on Internet."} - except: - result = {"url": url, "error": "Some unknown error occurred."} - results.append(result) + except requests.ConnectionError: + result = {"url": url, "error": f"The URL {url} does not exist on Internet."} + results.append(result) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def protocol_checks(text_with_urls): + """ + Checks the ftp included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains ftp + Returns: + (dict) An object with the validity of the check and the instance/results + """ + + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) + + for url in urls: + if url.startswith("ftp://"): + results.append({ + "url": url, + "error": f"The URL {url} exists" + }) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def secure_url_checks(text_with_urls): + """ + Checks whether the secure link (https) is included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains https + Returns: + (dict) An object with the validity of the check and the instance/results + """ + + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) + + for url in urls: + if url.startswith("http://"): + results.append({ + "url": url, + "error": f"The URL {url} is not secure" + }) if results: validity = 
False @@ -117,3 +178,25 @@ def doi_link_update(value, bad_urls): validity = False return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def url_update_email_check(url, bad_urls=None): + if bad_urls is None: + bad_urls = [] + + if not url: + return { + "valid": False, + "value": url, + "message": "No email value provided for URL update contact.", + "remediation": "Provide a valid contact email address." + } + + validity = True + # Check if the URL matches 'support-cddis@earthdata.nasa.gov' + if url in bad_urls or url == "support-cddis@earthdata.nasa.gov": + # Update the URL + url = "support-cddis@nasa.gov" + validity = False # Mark as invalid if the URL was updated + return {"valid": validity, "value": url} diff --git a/pyQuARC/code/utils.py b/pyQuARC/code/utils.py index 1fe82270..2187bf70 100644 --- a/pyQuARC/code/utils.py +++ b/pyQuARC/code/utils.py @@ -82,3 +82,18 @@ def get_date_time(dt_str): except ValueError: continue return None + +def read_json_schema_from_url(url): + """ + Downloads and returns a JSON schema from a given URL. + """ + response = requests.get(url) + response.raise_for_status() + return response.json() + +def get_concept_type(concept_id): + """ + Extract the concept type from a given concept ID. + This is useful for determining the type of concept (e.g., 'collection', 'granule') from its ID. 
+ """ + return concept_id.startswith("C") and "collection" or "granule" diff --git a/pyQuARC/main.py b/pyQuARC/main.py index c0890e31..361a4286 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -8,16 +8,21 @@ if __name__ == "__main__": from code.checker import Checker - from code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS + from code.constants import ( + COLOR, + ECHO10_C, + SUPPORTED_FORMATS, + CONTENT_TYPE_MAP, + ) from code.downloader import Downloader from code.utils import get_cmr_url, is_valid_cmr_url - from code.utils import get_headers + from code.utils import get_concept_type, get_headers else: from .code.checker import Checker from .code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS from .code.downloader import Downloader from .code.utils import get_cmr_url, is_valid_cmr_url - from .code.utils import get_headers + from .code.utils import get_concept_type, get_headers ABS_PATH = os.path.abspath(os.path.dirname(__file__)) END = COLOR["reset"] @@ -133,6 +138,61 @@ def _cmr_query(self): query = f"{orig_query}&page_num={page_num}" return concept_ids + + + def _get_collection_version(self, concept_id): + """ + Fetches collection information from CMR for a given concept_id. + Args: + concept_id (str): The concept ID to query. + + Returns: + dict: {"revision_id": str | None, "metadata_version": str | None } A dict of Revision ID and Metadata Version of the collection. 
+ """ + failure_return_value = {"revision_id": None, "metadata_version": None} + try: + url = f"{self.cmr_host}/search/concepts/{concept_id}.umm_json" + headers = get_headers() + response = requests.get(url, headers=headers) + response.raise_for_status() + + data = response.json() if response.content else {} + return { + "revision_id": response.headers.get("CMR-Revision-Id"), + "metadata_version": data.get("MetadataSpecification", {}).get("Version"), + } + + except Exception as e: + # Unified error handling — return dict even on failure + print(f"Error fetching collection info for {concept_id}: {str(e)}") + return failure_return_value + + + def _validate_with_cmr(self, concept_id, metadata_content): + """ + Validates metadata using the CMR API. + + Args: + metadata_content (str): The metadata content to validate. + + Returns: + dict: Results of the CMR API validation. + """ + provider_id = concept_id.split("-")[1] + # native-id is only available in umm-json (sometimes not even) format and it seems like validation works without the actual native-id value, so just leaving in the url + cmr_url = ( + f"{self.cmr_host}/ingest/providers/{provider_id}/validate/" + f"{get_concept_type(concept_id)}/" + ) + headers = { + "Content-Type": ( + f"application/{CONTENT_TYPE_MAP[self.metadata_format]}" + ), + "Accept": "application/json", + "Cmr-Validate-Keywords": "true", + } + response = requests.post(cmr_url, data=metadata_content, headers=headers) + return response def validate(self): """ @@ -150,8 +210,17 @@ def validate(self): if self.concept_ids: for concept_id in tqdm(self.concept_ids): + # If no version specified, get the latest version + # Get both revision and metadata version in one call + info = self._get_collection_version(concept_id) + version_to_use = self.version or info["revision_id"] + + metadata_version = info["metadata_version"] + if metadata_version: + print(f"Collection {concept_id} schema version: {metadata_version}") + downloader = Downloader( - concept_id, 
self.metadata_format, self.version, self.cmr_host + concept_id, self.metadata_format, version_to_use, self.cmr_host ) if not (content := downloader.download()): self.errors.append( @@ -162,12 +231,19 @@ def validate(self): } ) continue + content = content.encode() + cmr_response = self._validate_with_cmr(concept_id, content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { "concept_id": concept_id, "errors": validation_errors, + "cmr_validation": { + "errors": cmr_response.json().get("errors", []), + # TODO: show warnings + "warnings": cmr_response.json().get("warnings", []) + }, "pyquarc_errors": pyquarc_errors, } ) @@ -175,7 +251,6 @@ def validate(self): elif self.file_path: with open(os.path.abspath(self.file_path), "r") as myfile: content = myfile.read().encode() - validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { @@ -184,8 +259,10 @@ def validate(self): "pyquarc_errors": pyquarc_errors, } ) + return self.errors + @staticmethod def _error_message(messages): severities = ["error", "warning", "info"] @@ -199,6 +276,29 @@ def _error_message(messages): result_string += f"\t\t{colored_message}{END}\n" return result_string + @staticmethod + def _format_cmr_error(cmr_validation): + cmr_errors = cmr_validation.get("errors") + if not cmr_errors: + return None + error_msg_dict = {} + error_msg = "" + for error in cmr_errors: + if type(error) is dict and error.get("path"): + if error["path"][0] not in error_msg_dict: + error_msg_dict[error["path"][0]] = [] + error_msg_dict[error["path"][0]].append(error['errors']) + else: + error_msg_dict["Misc"] = [error] + for path, errors in error_msg_dict.items(): + error_msg += f"\n\t>> {path}: {END}\n" + for error in errors: + error_str = str(error) + if isinstance(error, list): + error_str = ", ".join(error) + error_msg += f"\t\t{COLOR['error']}Error:{END} {error_str}\n" + return error_msg + def display_results(self): result_string = """ ******************************** @@ 
-231,7 +331,18 @@ def display_results(self): f"\n\t {COLOR['title']}{COLOR['bright']} pyQuARC ERRORS: {END}\n" ) for error in pyquarc_errors: - error_prompt += f"\t\t ERROR: {error['message']}. Details: {error['details']} \n" + error_prompt += ( + f"\t\t ERROR: {error.get('message', 'No message available')} \n" + f"\t\t DETAILS: {error.get('details', 'No details available')} \n" + ) + + if cmr_validation := error.get("cmr_validation"): + cmr_error_msg = self._format_cmr_error(cmr_validation) + if cmr_error_msg: + error_prompt += ( + f"\n\t {COLOR['title']}{COLOR['bright']} CMR VALIDATION ERRORS: {END}\n" + ) + error_prompt += cmr_error_msg result_string += error_prompt print(result_string) @@ -324,3 +435,4 @@ def display_results(self): ) results = arc.validate() arc.display_results() + \ No newline at end of file diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index aa6bcdd1..ffe03742 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -40,12 +40,36 @@ "remediation": "Recommend updating the Revision date so that it comes chronologically after the Insert/Creation time." }, "url_check": { - "failure": "A URL with a status code other than 200 has been identified: `{}`.", + "failure": "`{}`.", "help": { "message": "", "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" }, - "remediation": "This often indicates a broken link. If the URL is broken, recommend revising." + "remediation": "The following link is broken. Recommend replacing the OnlineAccessURL with a link to directly access the granule via https." + }, + "protocol_check": { + "failure": "The following URL `{}` does not exist.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend removing the ftp access link." 
+ }, + "secure_url_check": { + "failure": "`{}`.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend updating the following link(s) from 'http' to 'https':" + }, + "url_update_email_check": { + "failure": "The listed email contact information must be updated.", + "help": { + "message": "Recommend providing the updated contact information as per the data product.", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" + }, + "remediation": "Recommend changing the contact information to 'support-cddis@nasa.gov'. " }, "shortname_uniqueness": { "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", @@ -53,7 +77,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" }, - "remediation": "The EntryTitle/DataSetId should not be identical to the ShortName. Recommend providing a descriptive, formal title for the dataset. " + "remediation": "Recommend providing a more descriptive title for the dataset. 
" }, "abstract_length_check": { "failure": "The abstract provided may be inadequate based on length.", @@ -93,23 +117,31 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Processing+Level, https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" }, - "remediation": "Recommend changing the Id to match one of the EOSDIS Data Processing Levels, if applicable: [0, 1A, 1B, 1C, 2, 2A, 2B, 3, 3A, 4]" + "remediation": "Recommend changing the Id to match one of the EOSDIS Data Processing Levels, if applicable: [Not Provided, 0, 1, 1A, 1B, 1C, 1T, 2, 2A, 2B, 2G, 2P, 3, 3A 4, NA]" }, "processing_level_description_length_check": { "failure": "The provided description is less than 50 characters and therefore may be lacking in contextual information.", "help": { - "message": "Use the EOSDIS Data Processing Level descriptions as guidance.", + "message": "Use the EOSDIS Data Processing level description as guidance.", "url": "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" }, - "remediation": "Consider providing a more detailed processing level description." 
+ "remediation": "Recommend providing a more detailed processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" }, + "processing_level_description_presence_check": { + "failure": "The Processing Level Description is missing.", + "help": { + "message": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance", + "url": "https://www.earthdata.nasa.gov/learn/earth-observation-data-basics/data-processing-levels" + }, + "remediation": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + }, "science_keywords_gcmd_check": { "failure": "`{}` is not a valid GCMD science keyword.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Science+Keywords" }, - "remediation": "Provide a valid GCMD keyword or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." }, "science_keywords_presence_check": { "failure": "Science keywords are required.", @@ -125,7 +157,7 @@ "message": "", "url": "" }, - "remediation": "Provide a valid GCMD keyword or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." 
}, "eosdis_doi_authority_check": { "failure": "`{}` may be an invalid value.", @@ -157,7 +189,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" }, - "remediation": "Provide a valid short name from the GCMD Providers keyword list or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid short name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." }, "organization_long_name_gcmd_check": { "failure": "The provided data center long name `{}` does not comply with the GCMD. ", @@ -165,7 +197,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" }, - "remediation": "Provide a valid long name name from the GCMD Providers keyword list or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid long name name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." }, "organization_short_long_name_consistency_check": { "failure": "The provided data center short name `{}` and long name `{}` aren't consistent.", @@ -309,7 +341,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid short name, or submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instruments keyword list." + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." 
}, "instrument_long_name_gcmd_check": { "failure": "The provided instrument long name `{}` does not comply with GCMD.", @@ -317,7 +349,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid long name, or submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instruments keyword list." + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." }, "instrument_long_name_presence_check": { "failure": "The provided instrument/sensor short name `{}` is missing the corresponding instrument/sensor long name.", @@ -365,7 +397,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid short name, or submit a request to support@earthdata.nasa.gov to have this platform added to the GCMD Platforms keyword list." + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." }, "platform_short_long_name_consistency_check": { "failure": "The provided platform short name `{}` and long name `{}` are not consistent.", @@ -389,7 +421,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/DataFormat/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid data format, or submit a request to support@earthdata.nasa.gov to have this data format added to the GCMD Data Format keyword list." + "remediation": "Select a valid data format, or submit a request to earthdata-support@nasa.gov to have this data format added to the GCMD Data Format keyword list." 
}, "platform_long_name_gcmd_check": { "failure": "The provided platform long name `{}` does not comply with GCMD.", @@ -397,7 +429,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid long name, or submit a request to support@earthdata.nasa.gov to have this platform added to the GCMD Platforms keyword list." + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." }, "spatial_keyword_gcmd_check": { "failure": "The provided location/spatial keyword `{}` does not comply with GCMD.", @@ -405,7 +437,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/locations/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid location keyword, or submit a request to support@earthdata.nasa.gov to have this value added to the GCMD Locations keyword list." + "remediation": "Select a valid location keyword, or submit a request to earthdata-support@nasa.gov to have this value added to the GCMD Locations keyword list." }, "platform_type_gcmd_check": { "failure": "The provided platform type `{}` does not comply with GCMD.", @@ -413,7 +445,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid platform type, or submit a request to support@earthdata.nasa.gov to have this platform type added to the GCMD Platforms keyword list." + "remediation": "Select a valid platform type, or submit a request to earthdata-support@nasa.gov to have this platform type added to the GCMD Platforms keyword list." 
}, "campaign_short_long_name_consistency_check": { "failure": "The provided project/campaign short name `{}` and long name `{}` are not consistent.", @@ -429,7 +461,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid short name, or submit a request to support@earthdata.nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." }, "campaign_long_name_gcmd_check": { "failure": "The provided project/campaign long name `{}` does not comply with GCMD.", @@ -437,7 +469,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid long name, or submit a request to support@earthdata.nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." }, "campaign_long_name_presence_check": { "failure": "The provided project/campaign short name `{}` is missing the corresponding project/campaign long name.", @@ -629,7 +661,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Project" }, - "remediation": "Please add a GCMD compliant campaign/project name if applicable to the dataset." 
+ "remediation": "Recommend providing a campaign short name from the following list: https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv" }, "spatial_coverage_type_presence_check": { "failure": "The Spatial Coverage Type is missing.", @@ -645,7 +677,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" }, - "remediation": "If appropriate for the dataset, recommend providing information about the horizontal datum." + "remediation": "Information about the datum should be provided in the metadata if possible." }, "online_access_url_presence_check": { "failure": "No Online Access URL is provided. A link to access the data is required.", @@ -768,7 +800,7 @@ "remediation": "Recommend providing a unique name for each characteristic." }, "validate_beginning_datetime_against_granules": { - "failure": "The collection beginning date time `{}` is not consistent with the first granule's beginning date time `{}`.", + "failure": "The collection beginning date time `{}` is not consistent with the beginning date time in the metadata for the first granule `{}`.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Temporal+Extent" @@ -776,7 +808,7 @@ "remediation": "Recommend updating the beginning date time to match the granule extent." }, "validate_ending_datetime_against_granules": { - "failure": "The collection ending date time `{}` is not consistent with the last granule's ending date time `{}`.", + "failure": "The collection ending date time `{}` is not consistent with the ending date time in the metadata for the last granule `{}`.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Temporal+Extent" @@ -1070,5 +1102,13 @@ "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" }, "remediation": "Recommend providing the horizontal pixel resolution, if applicable. 
If provided, this information will be indexed in the EDSC 'Horizontal Data Resolution' search facet which allows users to search by spatial resolution." + }, + "opendap_link_check": { + "failure": "No OPeNDAP URL is provided in the granule fields. An OPeNDAP link is recommended for data access.", + "help": { + "message": "OPeNDAP links allow for direct data access through the OPeNDAP protocol.", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" + }, + "remediation": "Recommend providing an OPeNDAP link in the granule's Online Resources or Related URLs fields for enhanced data accessibility." + } } \ No newline at end of file diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..311847da 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,2 @@ {} + diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 778f4da3..4fa0df4c 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -23,6 +23,21 @@ "data_type": "url", "check_function": "health_and_status_check", "available": true + }, + "protocol_check": { + "data_type": "url", + "check_function": "protocol_checks", + "available": true + }, + "secure_url_check": { + "data_type": "url", + "check_function": "secure_url_checks", + "available": true + }, + "url_update_email_check": { + "data_type": "url", + "check_function": "url_update_email_check", + "available": true }, "string_compare": { "data_type": "string", @@ -298,5 +313,10 @@ "data_type": "custom", "check_function": "count_check", "available": true + }, + "opendap_link_check": { + "data_type": "custom", + "check_function": "opendap_link_check", + "available": true } } diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 9afd5059..64214155 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -812,6 +812,262 @@ 
"severity": "error", "check_id": "url_check" }, + + "protocol_check": { + "rule_name": "protocol_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "error", + "check_id": "protocol_check" + }, + "secure_url_check": { + "rule_name": "secure_url_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + 
"Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "info", + "check_id": "secure_url_check" + }, + "shortname_uniqueness": { "rule_name": "Short Name uniqueness check", "fields_to_apply": { @@ -994,15 +1250,20 @@ "data": [ [ "0", + "1", "1A", "1B", "1C", "2", "2A", "2B", + "2G", + "2P", "3", "3A", - "4" + "4", + "NA", + "Not provided" ] ], "severity": "warning", @@ -1253,6 +1514,27 @@ "severity": "info", "check_id": "length_check" }, + "processing_level_description_presence_check": { + "rule_name": 
"Processing Level Description Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/ProcessingLevelDescription" + ] + } + ], + "umm-c": [ + { + "fields": [ + "ProcessingLevel/ProcessingLevelDescription" + ] + } + ] + }, + "severity": "info", + "check_id": "one_item_presence_check" + }, "umm_controlled_collection_state_list_check": { "rule_name": "UMM Controlled Collection State List", "fields_to_apply": { @@ -3621,7 +3903,7 @@ "umm-g": [ { "fields": [ - "Projects/ShortName" + "Campaign/ShortName" ] } ] @@ -3677,7 +3959,7 @@ "umm-c": [ { "fields": [ - "SpatialRepresentationInfo/HorizontalCoordinateSystem/GeodeticModel/HorizontalDatumName" + "SpatialExtent/HorizontalSpatialDomain/ResolutionAndCoordinateSystem/GeodeticModel/HorizontalDatumName" ] } ] @@ -3745,6 +4027,34 @@ "severity": "error", "check_id": "string_compare" }, + "opendap_link_check": { + "rule_name": "OPeNDAP Link Presence Check", + "fields_to_apply": { + "echo-g": [ + { + "fields": [ + "Granule/OnlineResources/OnlineResource" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls" + ] + } + ] + }, + "data": [ + { + "type": "OPENDAP DATA", + "url_keyword": "opendap" + } + ], + "relation": "contains", + "check_id": "opendap_link_check", + "severity": "warning" + }, "location_keyword_presence_check": { "rule_name": "Location Keyword Presence Check", "fields_to_apply": { @@ -3898,7 +4208,7 @@ ] }, "severity": "warning", - "check_id": "license_url_description_check" + "check_id": "one_item_presence_check" }, "collection_citation_presence_check": { "rule_name": "Collection Citation Presence Check", @@ -4111,6 +4421,11 @@ "fields": [ "RelatedUrls/Type" ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/Type" + ] } ], "umm-g": [ @@ -4144,6 +4459,11 @@ } ], "umm-c": [ + { + "fields": [ + "Collection/OnlineResources/OnlineResource/Type" + ] + }, { "fields": [ "RelatedUrls/Type", @@ -4161,7 +4481,7 @@ ] }, "severity": "warning", - "check_id": 
"availability_check" + "check_id": "one_item_presence_check" }, "characteristic_name_uniqueness_check": { "rule_name": "Characteristic Name Uniqueness Check", @@ -4792,6 +5112,18 @@ "RelatedUrls/Description", "RelatedUrls/URL" ] + }, + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URLDescription", + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/Description", + "Granule/OnlineResources/OnlineResource/URL" + ] } ] }, @@ -5342,7 +5674,7 @@ "URLDescription" ], "severity": "info", - "check_id": "uniqueness_check" + "check_id": "one_item_presence_check" }, "metadata_update_time_logic_check": { "rule_name": "Metadata Update Time Logic Check", @@ -5559,5 +5891,20 @@ }, "severity": "warning", "check_id": "one_item_presence_check" - } + }, + "url_update_email_check": { + "rule_name": "URL Email address check", + "fields_to_apply": { + "umm-c": [ + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/ContactMechanisms/Value", + "DataCenters/ContactGroups/ContactInformation/ContactInstruction" + ] + } + ] + }, + "severity": "info", + "check_id": "url_update_email_check" +} } \ No newline at end of file diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 0967ef42..9e26dfee 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1 +1 @@ -{} +{} \ No newline at end of file diff --git a/pyQuARC/schemas/ruleset.json b/pyQuARC/schemas/ruleset.json index ce3bbeca..aaa4ab14 100644 --- a/pyQuARC/schemas/ruleset.json +++ b/pyQuARC/schemas/ruleset.json @@ -464,12 +464,12 @@ { "name-id": "Instrument Short Name Check", "name-display": "Instrument Short Name Check", - "description": "Check to ensure the provided instrument short name matches a value from the GCMD controlled vocabularly list. 
Provide an error if the provided short name is not an exact match with any of the names on the keyword list, and suggest a request be made to support@earthdata.nasa.gov in order to have it added to the GCMD Instrument KMS .", + "description": "Check to ensure the provided instrument short name matches a value from the GCMD controlled vocabulary list. Provide an error if the provided short name is not an exact match with any of the names on the keyword list, and suggest a request be made to earthdata-support@nasa.gov in order to have it added to the GCMD Instrument KMS .", "severity": "error", "timeframe": null, "scope": null, "message-fail": "1. If the provided short name is not GCMD-compliant: The provided instrument short name does not comply with the GCMD. \n2. If an instrument short name is not provided: The instrument short name appears to be missing from the metadata.", - "remediation": "1. Please submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. Recommend providing the following as the associated instrument short name under the [associated platform short name] platform.", + "remediation": "1. Please submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. Recommend providing the following as the associated instrument short name under the [associated platform short name] platform.", "help_url": null, "specification": null, "spec_version": null, @@ -478,12 +478,12 @@ { "name-id": "Instrument Long Name Check", "name-display": "Instrument Long Name Check", - "description": "Check to determine if the provided long name matches a value from the GCMD controlled vocabulary list and is associated with the correct instrument short name. 
Provide an error if 1) a long name is not provided when one exists in the vocabulary list for the associated instrument short name or 2) if the provided long name is not an exact match with any of the names on the keyword list; suggest a request be made to support@earthdata.nasa.gov in order to have it added to the GCMD Instrument KMS if this is the case.", + "description": "Check to determine if the provided long name matches a value from the GCMD controlled vocabulary list and is associated with the correct instrument short name. Provide an error if 1) a long name is not provided when one exists in the vocabulary list for the associated instrument short name or 2) if the provided long name is not an exact match with any of the names on the keyword list; suggest a request be made to earthdata-support@nasa.gov in order to have it added to the GCMD Instrument KMS if this is the case.", "severity": "warning (if no long name is provided when it should be)\n\nerror (if the names is not an exact match with the keyword list)", "timeframe": null, "scope": null, "message-fail": "1. If the provided long name is not GCMD-compliant: The provided instrument long name does not comply with the GCMD.\n2. If a long name is not provided when one exists: The instrument long name appears to be missing from the metadata.", - "remediation": "1. Please submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. Recommend providing the following as the associated instrument long name for [the instrument] under the [associated platform short name] platform.", + "remediation": "1. Please submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. 
Recommend providing the following as the associated instrument long name for [the instrument] under the [associated platform short name] platform.", "help_url": null, "specification": null, "spec_version": null, diff --git a/pyQuARC/schemas/umm-c-json-schema.json b/pyQuARC/schemas/umm-c-json-schema.json index fd193169..f8103804 100644 --- a/pyQuARC/schemas/umm-c-json-schema.json +++ b/pyQuARC/schemas/umm-c-json-schema.json @@ -1415,9 +1415,9 @@ "enum": ["NEAR_REAL_TIME", "LOW_LATENCY", "EXPEDITED", "SCIENCE_QUALITY", "OTHER"] }, "CollectionProgressEnum": { - "description": "This element describes the production status of the data set. There are five choices for Data Providers: PLANNED refers to data sets to be collected in the future and are thus unavailable at the present time. For Example: The Hydro spacecraft has not been launched, but information on planned data sets may be available. ACTIVE refers to data sets currently in production or data that is continuously being collected or updated. For Example: data from the AIRS instrument on Aqua is being collected continuously. COMPLETE refers to data sets in which no updates or further data collection will be made. For Example: Nimbus-7 SMMR data collection has been completed. DEPRECATED refers to data sets that have been retired, but still can be retrieved. Usually newer products exist that replace the retired data set. NOT APPLICABLE refers to data sets in which a collection progress is not applicable such as a calibration collection. There is a sixth value of NOT PROVIDED that should not be used by a data provider. It is currently being used as a value when a correct translation cannot be done with the current valid values, or when the value is not provided by the data provider.", + "description": "This element describes the production status of the data set. There are multiple choices for Data Providers: PLANNED refers to data sets to be collected in the future and are thus unavailable at the present time. 
For Example: The Hydro spacecraft has not been launched, but information on planned data sets may be available. ACTIVE refers to data sets currently in production or data that is continuously being collected or updated. For Example: data from the AIRS instrument on Aqua is being collected continuously. COMPLETE refers to data sets in which no updates or further data collection will be made. For Example: Nimbus-7 SMMR data collection has been completed. DEPRECATED refers to data sets that have been retired, but still can be retrieved. Usually newer products exist that replace the retired data set. NOT PROVIDED should not be used by a data provider. It is currently being used as a value when a correct translation cannot be done with the current valid values, or when the value is not provided by the data provider. PREPRINT: Refers to datasets which are made available prior to completion of validation and review processes to support manuscript publication processes and open science.\nPreprint datasets are provisional and should not be used for production applications. INREVIEW: Refers to datasets which are made available to support science team final review. In Review datasets are provisional and should not be used for production applications. Note that if restricted access is needed, an INREVIEW dataset may also have an Access Control List applied. SUPERSEDED: Refers to datasets which remain publicly available, but for which a newer version is available.", "type": "string", - "enum": ["ACTIVE", "PLANNED", "COMPLETE", "DEPRECATED", "NOT APPLICABLE", "NOT PROVIDED"] + "enum": ["ACTIVE", "PLANNED", "COMPLETE", "DEPRECATED", "NOT PROVIDED", "PREPRINT", "INREVIEW", "SUPERSEDED"] }, "LocationKeywordType": { "description": "This element defines a hierarchical location list. It replaces SpatialKeywords. The controlled vocabulary for location keywords is maintained in the Keyword Management System (KMS). 
Each tier must have data in the tier above it.", @@ -1468,7 +1468,7 @@ "enum": ["KB", "MB", "GB", "TB", "PB", "NA"] }, "DistributionMediaType": { - "description": "This element defines the media by which the end user can obtain the distributable item. Examples of media include: CD-ROM, 9 track tape, diskettes, hard drives, online, transparencies, hardcopy, etc.", + "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include HTTPS, Earthdata Cloud, etc.", "type": "string", "minLength": 1, "maxLength": 80 @@ -1593,7 +1593,7 @@ "$ref": "#/definitions/ArchiveDistributionFormatDescriptionType" }, "Media": { - "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include: CD-ROM, 9 track tape, diskettes, hard drives, online, transparencies, hardcopy, etc.", + "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include HTTPS, Earthdata Cloud, etc.", "type": "array", "items": { "$ref": "#/definitions/DistributionMediaType" @@ -1656,7 +1656,7 @@ "$ref": "#/definitions/ArchiveDistributionFormatDescriptionType" }, "Media": { - "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include: CD-ROM, 9 track tape, diskettes, hard drives, online, transparencies, hardcopy, etc.", + "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. 
Examples of media include HTTPS, Earthdata Cloud, etc.", "type": "array", "items": { "$ref": "#/definitions/DistributionMediaType" @@ -1811,7 +1811,7 @@ "Type": { "description": "This element describes to what DOI is associated.", "type": "string", - "enum": ["Child Dataset", "Collaborative/Other Agency", "Field Campaign", "Parent Dataset", "Related Dataset"] + "enum": ["Child Dataset", "Collaborative/Other Agency", "Field Campaign", "Parent Dataset", "Related Dataset", "IsPreviousVersionOf", "IsNewVersionOf", "IsDescribedBy"] } }, "required": ["DOI"] @@ -1905,7 +1905,7 @@ "URL": { "description": "This element represents the URL where the schema lives. The schema can be downloaded.", "type": "string", - "enum": ["https://cdn.earthdata.nasa.gov/umm/collection/v1.18.1"] + "enum": ["https://cdn.earthdata.nasa.gov/umm/collection/v1.18.4"] }, "Name": { "description": "This element represents the name of the schema.", @@ -1915,7 +1915,7 @@ "Version": { "description": "This element represents the version of the schema.", "type": "string", - "enum": ["1.18.1"] + "enum": ["1.18.4"] } }, "required": ["URL", "Name", "Version"] diff --git a/pyQuARC/schemas/version.txt b/pyQuARC/schemas/version.txt index adcf29f0..3a6f0812 100644 --- a/pyQuARC/schemas/version.txt +++ b/pyQuARC/schemas/version.txt @@ -1 +1 @@ -2023-04-24 \ No newline at end of file +2025-05-19 \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..29158f47 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +filterwarnings = + ignore:Accessing jsonschema.draft7_format_checker is deprecated:DeprecationWarning + ignore:ssl.PROTOCOL_TLS is deprecated:DeprecationWarning + ignore:ssl.match_hostname.*:DeprecationWarning diff --git a/requirements.txt b/requirements.txt index 30aec17c..6432dc89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ colorama==0.4.4 idna==2.10 jsonschema==4.17.3 -lxml==4.9.1 +lxml==5.3.0 #4.9.1 pytest==5.4.3 pytz==2020.1 
requests==2.24.0 diff --git a/tests/test_datetime_validator.py b/tests/test_datetime_validator.py index 9f9c0262..0358f6d0 100644 --- a/tests/test_datetime_validator.py +++ b/tests/test_datetime_validator.py @@ -1,3 +1,7 @@ +import pytest +from unittest.mock import patch +from datetime import datetime + from pyQuARC.code.datetime_validator import DatetimeValidator from tests.fixtures.validator import INPUT_OUTPUT @@ -18,3 +22,73 @@ def test_datetime_iso_format_check(self): def test_datetime_compare(self): pass + + @patch("pyQuARC.code.datetime_validator.set_cmr_prms") + @patch("pyQuARC.code.datetime_validator.cmr_request") + @patch("pyQuARC.code.datetime_validator.get_date_time") + @pytest.mark.parametrize( + "datetime_string, granule_datetime, expected_valid, expected_severity", + [ + # Exact match → valid, no severity + ("2025-08-01T00:00:00Z", "2025-08-01T00:00:00Z", True, None), + + # Different date but within 24 hours → invalid, no severity + ("2025-08-02T00:00:00Z", "2025-08-01T12:00:00Z", False, None), + + # More than 24 hours difference → invalid, severity error + ("2025-08-03T00:00:00Z", "2025-08-01T00:00:00Z", False, "error"), + + # No granules returned → valid=False, severity error + ("2025-08-01T00:00:00Z", None, False, "error"), + ], + ) + def test_validate_datetime_against_granules( + self, + mock_get_date_time, + mock_cmr_request, + mock_set_cmr_prms, + datetime_string, + granule_datetime, + expected_valid, + expected_severity, + ): + # Arrange: cmr_request mock + if granule_datetime is None: + mock_cmr_request.return_value = {"feed": {"entry": []}} + else: + mock_cmr_request.return_value = { + "feed": { + "entry": [ + { + "time_start": granule_datetime, + "time_end": granule_datetime, + } + ] + } + } + + mock_set_cmr_prms.return_value = {"mock": "params"} + + # Mock get_date_time to return datetime objects or None + def fake_get_date_time(val): + if val is None: + return None + return datetime.strptime(val, "%Y-%m-%dT%H:%M:%SZ") + + 
mock_get_date_time.side_effect = fake_get_date_time + + # Act + result = DatetimeValidator.validate_datetime_against_granules( + datetime_string, + collection_shortname="TEST", + version="1", + sort_key="start_date", + time_key="time_start", + ) + + # Assert + assert result["valid"] == expected_valid + if expected_severity: + assert result["severity"] == expected_severity + else: + assert "severity" not in result diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..ca1762c8 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000042-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1018577631-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf",