-
Notifications
You must be signed in to change notification settings - Fork 8
Enforce data structure and data type consistency for JSON metadata #1421
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 89 commits
1d53e3a
8efb138
2ce744c
20e175b
19213ec
1360a4c
04497cd
853dd15
77e6295
71143ae
a5ca110
bdb6327
42f9f59
b4b3451
c5e8678
8a4a38b
49c5ff3
e90e088
3337cc2
1f8dee6
ac4864c
0c343ad
c6d150f
19a710c
da0e51d
d3d118b
d9c046e
e5f7e96
0351dc2
9107c7d
2833a4d
e3b1429
de18655
15c2f2f
156e56f
c431f7b
9036ca7
9663a76
cdd8419
a08363c
d545e4b
225deae
c2bba5b
3f6606c
6315505
4bb4b5b
2d87449
329ee8f
04de852
44fb68e
598906b
acb17d4
ad28046
c6e093f
da5f9e8
111ec55
c917c85
e44889b
743ac08
235c3f9
fae0e97
d430aee
05b301e
7a96090
2148812
8e953cd
091b665
f27dcee
fe15778
a0b91fe
5e831a9
d719846
e945040
f428d24
3c9ee96
bb141dc
681654a
7a1071e
bb739f4
cf14588
74d0375
86c22a7
d3f2056
64206c3
1162d93
3232373
38f9fbd
466de93
fa2327b
0cc8590
30cf06e
401d91d
13e2288
95f1c99
9fca65c
ccda29d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -2,8 +2,18 @@ | |||||
|
|
||||||
| require "maremma" | ||||||
| require "benchmark" | ||||||
| require "byebug" | ||||||
| require "pp" | ||||||
|
|
||||||
| class Doi < ApplicationRecord | ||||||
| INVALID_SCHEMAS = %w[ | ||||||
| http://datacite.org/schema/kernel-2.1 | ||||||
| http://datacite.org/schema/kernel-2.2 | ||||||
| http://datacite.org/schema/kernel-3.0 | ||||||
| http://datacite.org/schema/kernel-3.1 | ||||||
| http://datacite.org/schema/kernel-3 | ||||||
| ].freeze | ||||||
|
|
||||||
| self.ignored_columns += [:publisher] | ||||||
| PUBLISHER_JSON_SCHEMA = Rails.root.join("app", "models", "schemas", "doi", "publisher.json") | ||||||
| audited only: %i[doi url creators contributors titles publisher_obj publication_year types descriptions container sizes formats version_info language dates identifiers related_identifiers related_items funding_references geo_locations rights_list subjects schema_version content_url landing_page aasm_state source reason] | ||||||
|
|
@@ -112,16 +122,13 @@ class Doi < ApplicationRecord | |||||
| validates_presence_of :doi | ||||||
| validates_presence_of :url, if: Proc.new { |doi| doi.is_registered_or_findable? } | ||||||
|
|
||||||
| json_schema_validation = { | ||||||
| message: ->(errors) { errors }, | ||||||
| schema: PUBLISHER_JSON_SCHEMA | ||||||
| } | ||||||
|
|
||||||
| def validate_publisher_obj?(doi) | ||||||
| doi.validatable? && doi.publisher_obj? && !(doi.publisher_obj.blank? || doi.publisher_obj.all?(nil)) | ||||||
| def validate_json_attribute?(attribute) | ||||||
| validatable? && !self[attribute].nil? && !INVALID_SCHEMAS.include?(self.schema_version) | ||||||
| end | ||||||
|
|
||||||
| validates :publisher_obj, if: ->(doi) { validate_publisher_obj?(doi) }, json: json_schema_validation | ||||||
| def schema_file_path(schema_name) | ||||||
| Rails.root.join("app", "models", "schemas", "doi", "#{schema_name}.json") | ||||||
| end | ||||||
|
|
||||||
| # from https://www.crossref.org/blog/dois-and-matching-regular-expressions/ but using uppercase | ||||||
| validates_format_of :doi, with: /\A10\.\d{4,5}\/[-._;()\/:a-zA-Z0-9*~$=]+\z/, on: :create | ||||||
|
|
@@ -148,6 +155,46 @@ def validate_publisher_obj?(doi) | |||||
| validate :check_geo_locations, if: :geo_locations? | ||||||
| validate :check_language, if: :language? | ||||||
|
|
||||||
| # JSON-SCHEMA VALIDATION | ||||||
| # temporarily commenting out this validation. | ||||||
| # validates :doi, if: proc { |doi| doi.validate_json_attribute?(:identifier) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("identifier") } }, unless: :only_validate | ||||||
| validates :creators, if: proc { |doi| doi.validate_json_attribute?(:creators) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("creators") } }, unless: :only_validate | ||||||
| validates :titles, if: proc { |doi| doi.validate_json_attribute?(:titles) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("titles") } }, unless: :only_validate | ||||||
| validates :publisher_obj, if: proc { |doi| doi.validate_json_attribute?(:publisher_obj) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("publisher") } }, unless: :only_validate | ||||||
| validates :publication_year, if: proc { |doi| doi.validate_json_attribute?(:publication_year) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("publication_year") } }, unless: :only_validate | ||||||
| validates :subjects, if: proc { |doi| doi.validate_json_attribute?(:subjects) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("subjects") } }, unless: :only_validate | ||||||
| validates :contributors, if: proc { |doi| doi.validate_json_attribute?(:contributors) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("contributors") } }, unless: :only_validate | ||||||
| validates :dates, if: proc { |doi| doi.validate_json_attribute?(:dates) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("dates") } }, unless: :only_validate | ||||||
| validates :alternate_identifiers, if: proc { |doi| doi.validate_json_attribute?(:alternate_identifiers) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("alternate_identifiers") } }, unless: :only_validate | ||||||
| validates :related_identifiers, if: proc { |doi| doi.validate_json_attribute?(:related_identifiers) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("related_identifiers") } }, unless: :only_validate | ||||||
| validates :sizes, if: proc { |doi| doi.validate_json_attribute?(:sizes) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("sizes") } }, unless: :only_validate | ||||||
| validates :formats, if: proc { |doi| doi.validate_json_attribute?(:formats) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("formats") } }, unless: :only_validate | ||||||
| validates :version, if: proc { |doi| doi.validate_json_attribute?(:version) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("version") } }, unless: :only_validate | ||||||
| validates :rights_list, if: proc { |doi| doi.validate_json_attribute?(:rights_list) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("rights_list") } }, unless: :only_validate | ||||||
| validates :descriptions, if: proc { |doi| doi.validate_json_attribute?(:descriptions) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("descriptions") } }, unless: :only_validate | ||||||
| validates :geolocations, if: proc { |doi| doi.validate_json_attribute?(:geolocations) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("geolocations") } }, unless: :only_validate | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix geolocation validator key mismatch. Line 175 validates 🔧 Proposed fix- validates :geolocations, if: proc { |doi| doi.validate_json_attribute?(:geolocations) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("geolocations") } }, unless: :only_validate
+ validates :geo_locations, if: proc { |doi| doi.validate_json_attribute?(:geo_locations) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("geolocations") } }, unless: :only_validate📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||
| validates :funding_references, if: proc { |doi| doi.validate_json_attribute?(:funding_references) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("funding_references") } }, unless: :only_validate | ||||||
| validates :related_items, if: proc { |doi| doi.validate_json_attribute?(:related_items) }, json: { message: ->(errors) { errors }, schema: lambda { schema_file_path("related_items") } }, unless: :only_validate | ||||||
|
|
||||||
| validates :raw_language, presence: true, if: proc { |doi| doi.validate_json_attribute?(:raw_language) }, json: { | ||||||
| message: ->(errors) { errors }, | ||||||
| schema: lambda { schema_file_path("language") } | ||||||
| }, unless: :only_validate | ||||||
|
|
||||||
| validates :raw_types, if: proc { |doi| doi.validate_json_attribute?(:raw_types) }, json: { | ||||||
| message: ->(errors) { errors }, | ||||||
| schema: lambda { schema_file_path("resource_type") }, | ||||||
| }, unless: :only_validate | ||||||
|
|
||||||
| # See https://github.com/mirego/activerecord_json_validator for an explanation of why this must be done. | ||||||
| def raw_language | ||||||
| self[:language] | ||||||
| end | ||||||
|
|
||||||
| def raw_types | ||||||
| self[:types] | ||||||
| end | ||||||
|
|
||||||
| after_commit :update_url, on: %i[create update] | ||||||
| after_commit :update_media, on: %i[create update] | ||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| { | ||
| "title": "Affiliation", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "object", | ||
| "properties": { | ||
| "affiliationIdentifier": { | ||
| "type": ["string", "null"] | ||
| }, | ||
| "affiliationIdentifierScheme": { | ||
| "type": ["string", "null"] | ||
| }, | ||
| "name": { | ||
| "type": ["string", "null"] | ||
| }, | ||
| "schemeUri": { | ||
| "type": ["string", "null"] | ||
| } | ||
| }, | ||
| "dependentRequired": { | ||
svogt0511 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| "affiliationIdentifier": ["affiliationIdentifierScheme"] | ||
| }, | ||
| "additionalProperties": false | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| { | ||
| "title": "Affiliations", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "array", | ||
| "minItems": 0, | ||
| "items": { | ||
| "$ref": "affiliation.json" | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| { | ||
| "title": "AlternateIdentifier", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "object", | ||
| "properties": { | ||
| "alternateIdentifier": { | ||
| "type": "string" | ||
| }, | ||
| "alternateIdentifierType": { | ||
| "type": "string" | ||
| } | ||
| }, | ||
| "additionalProperties": false, | ||
| "dependentRequired": { | ||
| "alternateIdentifier": ["alternateIdentifierType"] | ||
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| { | ||
| "title": "AlternateIdentifiers", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "array", | ||
| "minItems": 0, | ||
svogt0511 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| "items": { | ||
| "$ref": "alternate_identifier.json" | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -0,0 +1,42 @@ | ||||
| { | ||||
| "title": "Contributor", | ||||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||||
| "type": "object", | ||||
| "properties": { | ||||
| "name": { | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an interesting one. I was testing with metadata pulled from some recently created/updated DOIs, to see if this validation would have impacted them. I found this DOI which in the JSON has a contributor "contributors": [
{
"nameType": "Personal",
"givenName": "Jacopo",
"familyName": "Torrisi",
"affiliation": [],
"contributorType": "DataManager",
"nameIdentifiers": [
{
"nameIdentifier": "",
"nameIdentifierScheme": "ORCID"
}
]
}
],Using this metadata to create a DOI on staging failed with this error: {
"errors": [
{
"source": "contributors",
"title": "Object at `/0` is missing required properties: name",
"uid": "10.1111/742r-wc63"
}
]
}From my understanding of the XSD, <contributors>
<contributor contributorType="DataManager">
<contributorName nameType="Personal">Torrisi, Jacopo</contributorName>
<givenName>Jacopo</givenName>
<familyName>Torrisi</familyName>
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI=""/>
<affiliation affiliationIdentifierScheme="ROR"/>
</contributor>
</contributors>How is that contributorName being generated for the XML? I am just thinking through the potential impact on user who are currently providing not providing
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tagging @codycooperross for input into this as well :)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With JSON -> XML, we currently generate a contributorName and creatorName based on available familyName and givenName metadata if available. See the description here: https://docs.google.com/spreadsheets/d/1Hy0KXWPxqNx-Pfh-nNFxbsUFXXVYsO8O2sDIytXQv7U/edit?gid=1806954511#gid=1806954511&range=2:2 For the sake of backwards compatibility with existing request patterns and scoping this PR, let's remove the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was always very confusing to me. As I understand it, contributorName is always required. If nameType = Unknown then Contributor can have a given name, family name and a completely independent name. If nameType = Organization then Contributor has any name you want to give it. (**Although, if you are converting the type to organization to one of the other types, givenName and familyName are preserved in the DB (and probably in elasticSearch. A topic for another time.) If nameType = Person, the Name field is generated from the givenName and familyName. I would not remove the name requirement. I would not back out our requirements to fit incorrect data. I thought that if there are incorrect dois in the DB, that the user would be prodded to correct them if/when the doi is updated, otherwise, there is not much we could do, except perhaps, to flag them as being incorrect at some point. Does this make sense? |
||||
| "type": "string" | ||||
| }, | ||||
| "nameType": { | ||||
| "$ref": "controlled_vocabularies/name_type.json" | ||||
| }, | ||||
| "givenName": { | ||||
| "type": ["string", "null"] | ||||
| }, | ||||
| "familyName": { | ||||
| "type": ["string", "null"] | ||||
| }, | ||||
| "contributorType": { | ||||
| "oneOf": [ | ||||
| { | ||||
| "$ref": "controlled_vocabularies/contributor_type.json" | ||||
| }, | ||||
| { | ||||
| "type": "null" | ||||
| } | ||||
| ] | ||||
| }, | ||||
| "lang": { | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think It doesn't look like it's populated in the API: https://api.datacite.org/dois?query=contributors.lang:*
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's interesting, it should be allowed. In the XML, it is attached to contributorName: https://datacite-metadata-schema.readthedocs.io/en/4.6/properties/contributor/#id2 There is
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like we do accept this currently in ParamsSanitizer: lupo/app/lib/params_sanitizer.rb Line 161 in e2556e3
Not sure if it's mapped in OpenSearch or to XML. But I'll close this then. |
||||
| "$ref": "language.json" | ||||
| }, | ||||
| "affiliation": { | ||||
| "$ref": "affiliations.json" | ||||
| }, | ||||
| "nameIdentifiers": { | ||||
| "$ref": "name_identifiers.json" | ||||
| } | ||||
| }, | ||||
| "additionalProperties": false, | ||||
| "required": [ | ||||
| "name" | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||
| ] | ||||
| } | ||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| { | ||
| "title": "Contributors", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "array", | ||
| "minItems": 0, | ||
| "items": { | ||
| "$ref": "contributor.json" | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "title": "ContributorType", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": ["string", "null"], | ||
| "anyOf": [ | ||
| { | ||
| "type": "string", | ||
| "enum": [ | ||
| "ContactPerson", | ||
| "DataCollector", | ||
| "DataCurator", | ||
| "DataManager", | ||
| "Distributor", | ||
| "Editor", | ||
| "HostingInstitution", | ||
| "Producer", | ||
| "ProjectLeader", | ||
| "ProjectManager", | ||
| "ProjectMember", | ||
| "RegistrationAgency", | ||
| "RegistrationAuthority", | ||
| "RelatedPerson", | ||
| "Researcher", | ||
| "ResearchGroup", | ||
| "RightsHolder", | ||
| "Sponsor", | ||
| "Supervisor", | ||
| "Translator", | ||
| "WorkPackageLeader", | ||
| "Other" | ||
| ] | ||
| }, | ||
| { | ||
| "type": "null" | ||
| } | ||
| ] | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| { | ||
| "title": "DateType", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": ["string", "null"], | ||
| "anyOf": [ | ||
| { | ||
| "type": "string", | ||
| "enum": [ | ||
| "Accepted", | ||
| "Available", | ||
| "Copyrighted", | ||
| "Collected", | ||
| "Coverage", | ||
| "Created", | ||
| "Issued", | ||
| "Submitted", | ||
| "Updated", | ||
| "Valid", | ||
| "Withdrawn", | ||
| "Other" | ||
| ] | ||
| }, | ||
| { | ||
| "type": "null" | ||
| } | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| { | ||
| "title": "DescriptionType", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "string", | ||
| "enum": [ | ||
| "Abstract", | ||
| "Methods", | ||
| "SeriesInformation", | ||
| "TableOfContents", | ||
| "TechnicalInfo", | ||
| "Other" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| { | ||
| "title": "FunderIdentifierType", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": "string", | ||
| "enum": [ | ||
| "Crossref Funder ID", | ||
| "GRID", | ||
| "ISNI", | ||
| "ROR", | ||
| "Other" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| { | ||
| "title": "NameType", | ||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||
| "type": ["string", "null"], | ||
| "anyOf": [ | ||
| { | ||
| "type": "string", | ||
| "enum": ["Organizational", "Personal"] | ||
| }, | ||
| { | ||
| "type": "null" | ||
| } | ||
| ] | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: datacite/lupo
Length of output: 1609
Remove
byebugrequire from model code.Requiring debugger gems in production model code causes boot failures in environments where the gem is not installed and is a release blocker. The
require "pp"should also be removed—standard library utilities should not be required at class definition time.Fix
📝 Committable suggestion
🤖 Prompt for AI Agents