From a0c486ca5f2cc82519d2f04344b9f1c23917544d Mon Sep 17 00:00:00 2001 From: jrhoads Date: Mon, 2 Dec 2024 10:14:53 +0100 Subject: [PATCH 01/11] Standardize; always use 'terms' for filters, not 'term' --- app/models/doi/graphql_query.rb | 30 ++++++++-------- .../doi/graphql_query_builder_filters_spec.rb | 36 +++++++++++++------ 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 46611b534..391dba466 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -125,22 +125,22 @@ def filters filter = [] filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? - filter << { term: { resource_type_id: options[:resource_type_id].underscore.dasherize } } if options[:resource_type_id].present? + filter << { terms: { resource_type_id: [options[:resource_type_id].underscore.dasherize] } } if options[:resource_type_id].present? filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? filter << { terms: { agency: options[:agency].split(",").map(&:downcase) } } if options[:agency].present? filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? filter << { terms: { language: options[:language].to_s.split(",").map(&:downcase) } } if options[:language].present? - filter << { term: { uid: options[:uid] } } if options[:uid].present? + filter << { terms: { uid: [options[:uid]] } } if options[:uid].present? filter << { range: { created: { gte: "#{options[:created].split(',').min}||/y", lte: "#{options[:created].split(',').max}||/y", format: "yyyy" } } } if options[:created].present? filter << { range: { publication_year: { gte: "#{options[:published].split(',').min}||/y", lte: "#{options[:published].split(',').max}||/y", format: "yyyy" } } } if options[:published].present? - filter << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" } } if options[:schema_version].present? + filter << { terms: { schema_version: ["http://datacite.org/schema/kernel-#{options[:schema_version]}"] } } if options[:schema_version].present? filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? if options[:pid_entity].present? - filter << { term: { "subjects.subjectScheme": "PidEntity" } } + filter << { terms: { "subjects.subjectScheme": ["PidEntity"] } } filter << { terms: { "subjects.subject": options[:pid_entity].split(",").map(&:humanize) } } end if options[:field_of_science].present? - filter << { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } } + filter << { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } } filter << { terms: { "subjects.subject": options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } end if options[:field_of_science_repository].present? @@ -150,7 +150,7 @@ def filters filter << { terms: { "fields_of_science_combined": options[:field_of_science_combined].split(",").map { |s| s.humanize } } } end filter << { terms: { "rights_list.rightsIdentifier" => options[:license].split(",") } } if options[:license].present? - filter << { term: { source: options[:source] } } if options[:source].present? + filter << { terms: { source: [options[:source]] } } if options[:source].present? filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? @@ -159,25 +159,25 @@ def filters filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? - filter << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present? + filter << { terms: { "landing_page.status": [options[:link_check_status]] } } if options[:link_check_status].present? filter << { exists: { field: "landing_page.checked" } } if options[:link_checked].present? - filter << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] } } if options[:link_check_has_schema_org].present? - filter << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] } } if options[:link_check_body_has_pid].present? + filter << { terms: { "landing_page.hasSchemaOrg": [options[:link_check_has_schema_org]] } } if options[:link_check_has_schema_org].present? + filter << { terms: { "landing_page.bodyHasPid": [options[:link_check_body_has_pid]] } } if options[:link_check_body_has_pid].present? filter << { exists: { field: "landing_page.schemaOrgId" } } if options[:link_check_found_schema_org_id].present? filter << { exists: { field: "landing_page.dcIdentifier" } } if options[:link_check_found_dc_identifier].present? filter << { exists: { field: "landing_page.citationDoi" } } if options[:link_check_found_citation_doi].present? filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? filter << { terms: { aasm_state: options[:state].to_s.split(",") } } if options[:state].present? filter << { range: { registered: { gte: "#{options[:registered].split(',').min}||/y", lte: "#{options[:registered].split(',').max}||/y", format: "yyyy" } } } if options[:registered].present? - filter << { term: { consortium_id: { value: options[:consortium_id], case_insensitive: true } } } if options[:consortium_id].present? + filter << { terms: { consortium_id: [options[:consortium_id].downcase] } } if options[:consortium_id].present? # TODO align PID parsing - filter << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) } } if options[:re3data_id].present? - filter << { term: { "client.opendoar_id" => options[:opendoar_id] } } if options[:opendoar_id].present? + filter << { terms: { "client.re3data_id": [doi_from_url(options[:re3data_id])] } } if options[:re3data_id].present? + filter << { terms: { "client.opendoar_id": [options[:opendoar_id]] } } if options[:opendoar_id].present? filter << { terms: { "client.certificate" => options[:certificate].split(",") } } if options[:certificate].present? filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? - filter << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" } } if options[:has_person].present? - filter << { term: { "client.client_type" => options[:client_type] } } if options[:client_type] - filter << { term: { "types.resourceTypeGeneral" => "PhysicalObject" } } if options[:client_type] == "igsnCatalog" + filter << { terms: { "creators.nameIdentifiers.nameIdentifierScheme": ["ORCID"] } } if options[:has_person].present? + filter << { terms: { "client.client_type": [options[:client_type]] } } if options[:client_type] + filter << { terms: { "types.resourceTypeGeneral": ["PhysicalObject"] } } if options[:client_type] == "igsnCatalog" filter end diff --git a/spec/models/doi/graphql_query_builder_filters_spec.rb b/spec/models/doi/graphql_query_builder_filters_spec.rb index 261ef7af2..5953846a0 100644 --- a/spec/models/doi/graphql_query_builder_filters_spec.rb +++ b/spec/models/doi/graphql_query_builder_filters_spec.rb @@ -1,5 +1,3 @@ - - # frozen_string_literal: true require "rails_helper" @@ -22,7 +20,7 @@ options = { resource_type_id: "Journal_Article" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { resource_type_id: "journal-article" } } + { terms: { resource_type_id: ["journal-article"] } } ) end @@ -63,7 +61,7 @@ options = { uid: "10.5438/0012" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { uid: "10.5438/0012" } } + { terms: { uid: ["10.5438/0012"] } } ) end @@ -79,7 +77,15 @@ options = { consortium_id: "dc" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { consortium_id: { case_insensitive: true, value: "dc" } } } + { terms: { consortium_id: ["dc"] } } + ) + end + + it "handles registered" do + options = { registered: "2021,2023" } + builder = described_class.new(query, options) + expect(builder.filters).to include( + { range: { registered: { gte: "2021||/y", lte: "2023||/y", format: "yyyy" } } } ) end @@ -97,7 +103,7 @@ options = { re3data_id: "10.17616/r31njmjx" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { "client.re3data_id" => "10.17616/r31njmjx" } } + { terms: { "client.re3data_id": ["10.17616/r31njmjx"] } } ) end @@ -105,7 +111,15 @@ options = { opendoar_id: "123456" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { "client.opendoar_id" => "123456" } } + { terms: { "client.opendoar_id": ["123456"] } } + ) + end + + it "handles certificates" do + options = { certificate: "CoreTrustSeal,WDS" } + builder = described_class.new(query, options) + expect(builder.filters).to include( + { terms: { "client.certificate" => ["CoreTrustSeal", "WDS"] } } ) end @@ -207,7 +221,7 @@ options = { pid_entity: "dataset,software" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { "subjects.subjectScheme": "PidEntity" } }, + { terms: { "subjects.subjectScheme": ["PidEntity"] } }, { terms: { "subjects.subject": ["Dataset", "Software"] } } ) end @@ -216,7 +230,7 @@ options = { field_of_science: "computer_science,mathematics" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } }, + { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } }, { terms: { "subjects.subject": ["FOS: Computer science", "FOS: Mathematics"] } } ) end @@ -227,7 +241,7 @@ options = { link_check_status: "200" } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { "landing_page.status": "200" } } + { terms: { "landing_page.status": ["200"] } } ) end @@ -235,7 +249,7 @@ options = { link_check_has_schema_org: true } builder = described_class.new(query, options) expect(builder.filters).to include( - { term: { "landing_page.hasSchemaOrg": true } } + { terms: { "landing_page.hasSchemaOrg": [true] } } ) end end From 10c444cd3a138be1924bc0ab39f1b7b6c2237854 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Mon, 9 Dec 2024 22:06:46 +0100 Subject: [PATCH 02/11] Reorder filter terms. Group by type of field --- app/models/doi/graphql_query.rb | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 391dba466..3780e0f02 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -125,30 +125,16 @@ def filters filter = [] filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? + filter << { terms: { uid: [options[:uid]] } } if options[:uid].present? filter << { terms: { resource_type_id: [options[:resource_type_id].underscore.dasherize] } } if options[:resource_type_id].present? filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? filter << { terms: { agency: options[:agency].split(",").map(&:downcase) } } if options[:agency].present? filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? filter << { terms: { language: options[:language].to_s.split(",").map(&:downcase) } } if options[:language].present? - filter << { terms: { uid: [options[:uid]] } } if options[:uid].present? filter << { range: { created: { gte: "#{options[:created].split(',').min}||/y", lte: "#{options[:created].split(',').max}||/y", format: "yyyy" } } } if options[:created].present? filter << { range: { publication_year: { gte: "#{options[:published].split(',').min}||/y", lte: "#{options[:published].split(',').max}||/y", format: "yyyy" } } } if options[:published].present? filter << { terms: { schema_version: ["http://datacite.org/schema/kernel-#{options[:schema_version]}"] } } if options[:schema_version].present? filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? - if options[:pid_entity].present? - filter << { terms: { "subjects.subjectScheme": ["PidEntity"] } } - filter << { terms: { "subjects.subject": options[:pid_entity].split(",").map(&:humanize) } } - end - if options[:field_of_science].present? - filter << { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } } - filter << { terms: { "subjects.subject": options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } - end - if options[:field_of_science_repository].present? - filter << { terms: { "fields_of_science_repository": options[:field_of_science_repository].split(",").map { |s| s.humanize } } } - end - if options[:field_of_science_combined].present? - filter << { terms: { "fields_of_science_combined": options[:field_of_science_combined].split(",").map { |s| s.humanize } } } - end filter << { terms: { "rights_list.rightsIdentifier" => options[:license].split(",") } } if options[:license].present? filter << { terms: { source: [options[:source]] } } if options[:source].present? filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? @@ -170,14 +156,27 @@ def filters filter << { terms: { aasm_state: options[:state].to_s.split(",") } } if options[:state].present? filter << { range: { registered: { gte: "#{options[:registered].split(',').min}||/y", lte: "#{options[:registered].split(',').max}||/y", format: "yyyy" } } } if options[:registered].present? filter << { terms: { consortium_id: [options[:consortium_id].downcase] } } if options[:consortium_id].present? - # TODO align PID parsing - filter << { terms: { "client.re3data_id": [doi_from_url(options[:re3data_id])] } } if options[:re3data_id].present? + filter << { terms: { "client.re3data_id": [doi_from_url(options[:re3data_id])] } } if options[:re3data_id].present? # TODO align PID parsing filter << { terms: { "client.opendoar_id": [options[:opendoar_id]] } } if options[:opendoar_id].present? filter << { terms: { "client.certificate" => options[:certificate].split(",") } } if options[:certificate].present? filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? filter << { terms: { "creators.nameIdentifiers.nameIdentifierScheme": ["ORCID"] } } if options[:has_person].present? filter << { terms: { "client.client_type": [options[:client_type]] } } if options[:client_type] filter << { terms: { "types.resourceTypeGeneral": ["PhysicalObject"] } } if options[:client_type] == "igsnCatalog" + if options[:pid_entity].present? + filter << { terms: { "subjects.subjectScheme": ["PidEntity"] } } + filter << { terms: { "subjects.subject": options[:pid_entity].split(",").map(&:humanize) } } + end + if options[:field_of_science].present? + filter << { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } } + filter << { terms: { "subjects.subject": options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } + end + if options[:field_of_science_repository].present? + filter << { terms: { "fields_of_science_repository": options[:field_of_science_repository].split(",").map { |s| s.humanize } } } + end + if options[:field_of_science_combined].present? + filter << { terms: { "fields_of_science_combined": options[:field_of_science_combined].split(",").map { |s| s.humanize } } } + end filter end From 948ea00164117c8a0bc38933a94b7e4820d39d3f Mon Sep 17 00:00:00 2001 From: jrhoads Date: Thu, 12 Dec 2024 15:44:48 +0100 Subject: [PATCH 03/11] Refactor filter builder into its own class. Call that from the graphql query builder --- app/models/doi/graphql_query.rb | 62 +---------------- app/models/doi/search/filter_builder.rb | 91 +++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 61 deletions(-) create mode 100644 app/models/doi/search/filter_builder.rb diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 3780e0f02..571fce420 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -118,67 +118,7 @@ def must end def filters - options = @options - - # turn ids into an array if provided as comma-separated string - options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String) - - filter = [] - filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? - filter << { terms: { uid: [options[:uid]] } } if options[:uid].present? - filter << { terms: { resource_type_id: [options[:resource_type_id].underscore.dasherize] } } if options[:resource_type_id].present? - filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? - filter << { terms: { agency: options[:agency].split(",").map(&:downcase) } } if options[:agency].present? - filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? - filter << { terms: { language: options[:language].to_s.split(",").map(&:downcase) } } if options[:language].present? - filter << { range: { created: { gte: "#{options[:created].split(',').min}||/y", lte: "#{options[:created].split(',').max}||/y", format: "yyyy" } } } if options[:created].present? - filter << { range: { publication_year: { gte: "#{options[:published].split(',').min}||/y", lte: "#{options[:published].split(',').max}||/y", format: "yyyy" } } } if options[:published].present? - filter << { terms: { schema_version: ["http://datacite.org/schema/kernel-#{options[:schema_version]}"] } } if options[:schema_version].present? - filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? - filter << { terms: { "rights_list.rightsIdentifier" => options[:license].split(",") } } if options[:license].present? - filter << { terms: { source: [options[:source]] } } if options[:source].present? - filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? - filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? - filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? - filter << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present? - filter << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present? - filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? - filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? - filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? - filter << { terms: { "landing_page.status": [options[:link_check_status]] } } if options[:link_check_status].present? - filter << { exists: { field: "landing_page.checked" } } if options[:link_checked].present? - filter << { terms: { "landing_page.hasSchemaOrg": [options[:link_check_has_schema_org]] } } if options[:link_check_has_schema_org].present? - filter << { terms: { "landing_page.bodyHasPid": [options[:link_check_body_has_pid]] } } if options[:link_check_body_has_pid].present? - filter << { exists: { field: "landing_page.schemaOrgId" } } if options[:link_check_found_schema_org_id].present? - filter << { exists: { field: "landing_page.dcIdentifier" } } if options[:link_check_found_dc_identifier].present? - filter << { exists: { field: "landing_page.citationDoi" } } if options[:link_check_found_citation_doi].present? - filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? - filter << { terms: { aasm_state: options[:state].to_s.split(",") } } if options[:state].present? - filter << { range: { registered: { gte: "#{options[:registered].split(',').min}||/y", lte: "#{options[:registered].split(',').max}||/y", format: "yyyy" } } } if options[:registered].present? - filter << { terms: { consortium_id: [options[:consortium_id].downcase] } } if options[:consortium_id].present? - filter << { terms: { "client.re3data_id": [doi_from_url(options[:re3data_id])] } } if options[:re3data_id].present? # TODO align PID parsing - filter << { terms: { "client.opendoar_id": [options[:opendoar_id]] } } if options[:opendoar_id].present? - filter << { terms: { "client.certificate" => options[:certificate].split(",") } } if options[:certificate].present? - filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? - filter << { terms: { "creators.nameIdentifiers.nameIdentifierScheme": ["ORCID"] } } if options[:has_person].present? - filter << { terms: { "client.client_type": [options[:client_type]] } } if options[:client_type] - filter << { terms: { "types.resourceTypeGeneral": ["PhysicalObject"] } } if options[:client_type] == "igsnCatalog" - if options[:pid_entity].present? - filter << { terms: { "subjects.subjectScheme": ["PidEntity"] } } - filter << { terms: { "subjects.subject": options[:pid_entity].split(",").map(&:humanize) } } - end - if options[:field_of_science].present? - filter << { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } } - filter << { terms: { "subjects.subject": options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } - end - if options[:field_of_science_repository].present? - filter << { terms: { "fields_of_science_repository": options[:field_of_science_repository].split(",").map { |s| s.humanize } } } - end - if options[:field_of_science_combined].present? - filter << { terms: { "fields_of_science_combined": options[:field_of_science_combined].split(",").map { |s| s.humanize } } } - end - - filter + Doi::Search::FilterBuilder.new(@options).build end def get_should_clause diff --git a/app/models/doi/search/filter_builder.rb b/app/models/doi/search/filter_builder.rb new file mode 100644 index 000000000..f1e0955f1 --- /dev/null +++ b/app/models/doi/search/filter_builder.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +class Doi + module Search + class FilterBuilder + include Modelable + + def initialize(options) + @options = options + end + + def build + options = @options + + # turn ids into an array if provided as comma-separated string + options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String) + + filter = [] + filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? + filter << { terms: { uid: [options[:uid]] } } if options[:uid].present? + filter << { terms: { resource_type_id: [options[:resource_type_id].underscore.dasherize] } } if options[:resource_type_id].present? + filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? + filter << { terms: { agency: options[:agency].split(",").map(&:downcase) } } if options[:agency].present? + filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? + filter << { terms: { language: options[:language].to_s.split(",").map(&:downcase) } } if options[:language].present? + filter << { range: { created: { gte: "#{options[:created].split(',').min}||/y", lte: "#{options[:created].split(',').max}||/y", format: "yyyy" } } } if options[:created].present? + filter << { range: { publication_year: { gte: "#{options[:published].split(',').min}||/y", lte: "#{options[:published].split(',').max}||/y", format: "yyyy" } } } if options[:published].present? + filter << { terms: { schema_version: ["http://datacite.org/schema/kernel-#{options[:schema_version]}"] } } if options[:schema_version].present? + filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? + filter << { terms: { "rights_list.rightsIdentifier" => options[:license].split(",") } } if options[:license].present? + filter << { terms: { source: [options[:source]] } } if options[:source].present? + filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? + filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? + filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? + filter << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present? + filter << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present? + filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? + filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? + filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? + filter << { terms: { "landing_page.status": [options[:link_check_status]] } } if options[:link_check_status].present? + filter << { exists: { field: "landing_page.checked" } } if options[:link_checked].present? + filter << { terms: { "landing_page.hasSchemaOrg": [options[:link_check_has_schema_org]] } } if options[:link_check_has_schema_org].present? + filter << { terms: { "landing_page.bodyHasPid": [options[:link_check_body_has_pid]] } } if options[:link_check_body_has_pid].present? + filter << { exists: { field: "landing_page.schemaOrgId" } } if options[:link_check_found_schema_org_id].present? + filter << { exists: { field: "landing_page.dcIdentifier" } } if options[:link_check_found_dc_identifier].present? + filter << { exists: { field: "landing_page.citationDoi" } } if options[:link_check_found_citation_doi].present? + filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? + filter << { terms: { aasm_state: options[:state].to_s.split(",") } } if options[:state].present? + filter << { range: { registered: { gte: "#{options[:registered].split(',').min}||/y", lte: "#{options[:registered].split(',').max}||/y", format: "yyyy" } } } if options[:registered].present? + filter << { terms: { consortium_id: [options[:consortium_id].downcase] } } if options[:consortium_id].present? + filter << { terms: { "client.re3data_id": [doi_from_url(options[:re3data_id])] } } if options[:re3data_id].present? # TODO align PID parsing + filter << { terms: { "client.opendoar_id": [options[:opendoar_id]] } } if options[:opendoar_id].present? + filter << { terms: { "client.certificate" => options[:certificate].split(",") } } if options[:certificate].present? + filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? + filter << { terms: { "creators.nameIdentifiers.nameIdentifierScheme": ["ORCID"] } } if options[:has_person].present? + filter << { terms: { "client.client_type": [options[:client_type]] } } if options[:client_type] + filter << { terms: { "types.resourceTypeGeneral": ["PhysicalObject"] } } if options[:client_type] == "igsnCatalog" + filter.push(*build_pid_entity_filter) if options[:pid_entity].present? + filter.push(*build_field_of_science_filter) if options[:field_of_science].present? + filter << build_field_of_science_repository_filter if options[:field_of_science_repository].present? + filter << build_field_of_science_combined_filter if options[:field_of_science_combined].present? + + filter + end + + private + + def build_pid_entity_filter + [ + { terms: { "subjects.subjectScheme": ["PidEntity"] } }, + { terms: { "subjects.subject": @options[:pid_entity].split(",").map(&:humanize) } } + ] + end + + def build_field_of_science_filter + [ + { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } }, + { terms: { "subjects.subject": @options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } + ] + end + + def build_field_of_science_repository_filter + { terms: { "fields_of_science_repository": @options[:field_of_science_repository].split(",").map { |s| s.humanize } } } + end + + def build_field_of_science_combined_filter + { terms: { "fields_of_science_combined": @options[:field_of_science_combined].split(",").map { |s| s.humanize } } } + end + end + end +end From 24537887a543589a57aa729b484ab61d46a4f30b Mon Sep 17 00:00:00 2001 From: jrhoads Date: Thu, 12 Dec 2024 15:46:33 +0100 Subject: [PATCH 04/11] Appease Rubocop --- app/models/doi/search/filter_builder.rb | 37 ++++++++++++------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/app/models/doi/search/filter_builder.rb b/app/models/doi/search/filter_builder.rb index f1e0955f1..76c9ab3f4 100644 --- a/app/models/doi/search/filter_builder.rb +++ b/app/models/doi/search/filter_builder.rb @@ -64,28 +64,27 @@ def build end private + def build_pid_entity_filter + [ + { terms: { "subjects.subjectScheme": ["PidEntity"] } }, + { terms: { "subjects.subject": @options[:pid_entity].split(",").map(&:humanize) } } + ] + end - def build_pid_entity_filter - [ - { terms: { "subjects.subjectScheme": ["PidEntity"] } }, - { terms: { "subjects.subject": @options[:pid_entity].split(",").map(&:humanize) } } - ] - end + def build_field_of_science_filter + [ + { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } }, + { terms: { "subjects.subject": @options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } + ] + end - def build_field_of_science_filter - [ - { terms: { "subjects.subjectScheme": ["Fields of Science and Technology (FOS)"] } }, - { terms: { "subjects.subject": @options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } - ] - end + def build_field_of_science_repository_filter + { terms: { "fields_of_science_repository": @options[:field_of_science_repository].split(",").map { |s| s.humanize } } } + end - def build_field_of_science_repository_filter - { terms: { "fields_of_science_repository": @options[:field_of_science_repository].split(",").map { |s| s.humanize } } } - end - - def build_field_of_science_combined_filter - { terms: { "fields_of_science_combined": @options[:field_of_science_combined].split(",").map { |s| s.humanize } } } - end + def build_field_of_science_combined_filter + { terms: { "fields_of_science_combined": @options[:field_of_science_combined].split(",").map { |s| s.humanize } } } + end end end end From 2fb7441d3f2492612846e9b5a83c9b092aafafd4 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Mon, 9 Dec 2024 22:09:10 +0100 Subject: [PATCH 05/11] Refactor aggregates, allow selection --- app/models/doi/graphql_query.rb | 386 ++++++++++++++++++++------------ 1 file changed, 247 insertions(+), 139 deletions(-) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 571fce420..4134ba068 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -207,157 +207,265 @@ def inner_query }.compact end - def aggregations + + def facet_sizes + tmp_aggs = selected_aggs facet_count = (@options[:facet_count] || DEFAULT_FACET_COUNT).to_i - if facet_count.positive? - { - resource_types: { terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, - clients: { terms: { field: "client_id_and_name", size: facet_count, min_doc_count: 1 } }, - open_licenses: { - filter: { terms: { - "rights_list.rightsIdentifier": [ - "cc-by-1.0", - "cc-by-2.0", - "cc-by-2.5", - "cc-by-3.0", - "cc-by-3.0-at", - "cc-by-3.0-us", - "cc-by-4.0", - "cc-pddc", - "cc0-1.0", - "cc-pdm-1.0" - ] - } }, - aggs: { - resource_types: { - terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1 } - } - } - }, - published: { - date_histogram: { - field: "publication_year", - interval: "year", - format: "year", - order: { - _key: "desc", - }, + custom_sizes = (@options[:facet_sizes] || {}).select do |key, value| + tmp_aggs.key?(key.to_sym) && value.to_i.positive? + end.transform_keys(&:to_sym) + + if facet_count != DEFAULT_FACET_COUNT && facet_count.positive? + # Create a hash with facet_count for all selected aggregations + default_sizes = tmp_aggs.keys.each_with_object({}) do |key, hash| + # Only set size if the aggregation has a terms query and isn't in custom_sizes + hash[key] = facet_count unless custom_sizes.key?(key) || !tmp_aggs[key]&.dig(:terms) + end + custom_sizes.merge(default_sizes) + else + custom_sizes + end + end + + def requested_aggs + included_aggs = @options[:include_aggregations] || :all + ## if included agg is a string, split it on commas + if included_aggs.is_a?(String) + included_aggs = included_aggs.split(",") + end + return {} if included_aggs == :none || included_aggs == 'none' + Array.wrap(included_aggs).map(&:to_sym) + end + + def selected_aggs + tmp_aggs = if requested_aggs.include?(:all) + AGGREGATION_DEFINITIONS + else + AGGREGATION_DEFINITIONS.slice(*requested_aggs) + end + Marshal.load(Marshal.dump(tmp_aggs)) + end + + def aggregations + aggs = selected_aggs + facet_sizes.each do |key, size| + aggs[key][:terms][:size] = size.to_i if aggs[key]&.dig(:terms) + end + aggs + end + + def self.all_aggregation_keys + AGGREGATION_DEFINITIONS.keys + end + + private + + AGGREGATION_DEFINITIONS = { + resource_types: { + terms: { + field: "resource_type_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + clients: { terms: { + field: "client_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } }, + open_licenses: { + filter: { terms: { + "rights_list.rightsIdentifier": [ + "cc-by-1.0", + "cc-by-2.0", + "cc-by-2.5", + "cc-by-3.0", + "cc-by-3.0-at", + "cc-by-3.0-us", + "cc-by-4.0", + "cc-pddc", + "cc0-1.0", + "cc-pdm-1.0" + ] + } }, + aggs: { + resource_types: { + terms: { + field: "resource_type_id_and_name", + size: DEFAULT_FACET_COUNT, min_doc_count: 1, - }, + } + } + } + }, + published: { + date_histogram: { + field: "publication_year", + interval: "year", + format: "year", + order: { + _key: "desc", }, - registration_agencies: { terms: { field: "agency", size: facet_count, min_doc_count: 1 } }, - affiliations: { terms: { field: "affiliation_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, + min_doc_count: 1, + }, + }, + registration_agencies: { + terms: { + field: "agency", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + }, + }, + affiliations: { + terms: { + field: "affiliation_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + authors: { + terms: { + field: "creators.nameIdentifiers.nameIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "https?://orcid.org/.*", + }, + aggs: { authors: { - terms: { field: "creators.nameIdentifiers.nameIdentifier", size: facet_count, min_doc_count: 1, include: "https?://orcid.org/.*" }, - aggs: { - authors: { - top_hits: { - _source: { - includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] - }, - size: 1 - } - } + top_hits: { + _source: { + includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] + }, + size: 1 } - }, + } + } + }, + creators_and_contributors: { + terms: { + field: "creators_and_contributors.nameIdentifiers.nameIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "https?://orcid.org/.*" + }, + aggs: { creators_and_contributors: { - terms: { - field: "creators_and_contributors.nameIdentifiers.nameIdentifier", - size: facet_count, - min_doc_count: 1, - include: "https?://orcid.org/.*" - }, - aggs: { - creators_and_contributors: { - top_hits: { - _source: { - includes: [ - "creators_and_contributors.name", - "creators_and_contributors.nameIdentifiers.nameIdentifier" - ] - }, - size: 1 - } + top_hits: { + _source: { + includes: [ + "creators_and_contributors.name", + "creators_and_contributors.nameIdentifiers.nameIdentifier" + ] }, - "work_types": { - "terms": { - "field": "resource_type_id_and_name", - "min_doc_count": 1 - } - } + size: 1 } }, - funders: { - terms: { - field: "funding_references.funderIdentifier", - size: facet_count, - min_doc_count: 1 - }, - aggs: { - funders: { - top_hits: { - _source: { - includes: [ - "funding_references.funderName", - "funding_references.funderIdentifier" - ] - }, - size: 1 - } - } + "work_types": { + "terms": { + "field": "resource_type_id_and_name", + "min_doc_count": 1 } - }, - pid_entities: { - filter: { term: { "subjects.subjectScheme": "PidEntity" } }, - aggs: { - subject: { terms: { - field: "subjects.subject", - size: facet_count, - min_doc_count: 1, - include: %w( - Dataset - Publication - Software - Organization - Funder - Person - Grant - Sample - Instrument - Repository - Project - ) - } }, - }, - }, - fields_of_science: { - filter: { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } }, - aggs: { - subject: { terms: { field: "subjects.subject", size: facet_count, min_doc_count: 1, - include: "FOS:.*" } }, - }, - }, - fields_of_science_combined: { - terms: { field: "fields_of_science_combined", size: facet_count, min_doc_count: 1 } - }, - fields_of_science_repository: { - terms: { field: "fields_of_science_repository", size: facet_count, min_doc_count: 1 } - }, - licenses: { terms: { field: "rights_list.rightsIdentifier", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, - languages: { terms: { field: "language", size: facet_count, min_doc_count: 1 } }, - view_count: { sum: { field: "view_count" } }, - download_count: { sum: { field: "download_count" } }, - citation_count: { sum: { field: "citation_count" } }, - content_url_count: { value_count: { field: "content_url" } }, - client_types: { - terms: { - field: "client.client_type", - size: facet_count, - min_doc_count: 1 + } + } + }, + funders: { + terms: { + field: "funding_references.funderIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1 + }, + aggs: { + funders: { + top_hits: { + _source: { + includes: [ + "funding_references.funderName", + "funding_references.funderIdentifier" + ] + }, + size: 1 } } } - end - end + }, + pid_entities: { + filter: { term: { "subjects.subjectScheme": "PidEntity" } }, + aggs: { + subject: { terms: { + field: "subjects.subject", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: %w( + Dataset + Publication + Software + Organization + Funder + Person + Grant + Sample + Instrument + Repository + Project + ) + } }, + }, + }, + fields_of_science: { + filter: { + term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" }, + }, + aggs: { + subject: { terms: { + field: "subjects.subject", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "FOS:.*", + } }, + }, + }, + fields_of_science_combined: { + terms: { + field: "fields_of_science_combined", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } + }, + fields_of_science_repository: { + terms: { + field: "fields_of_science_repository", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } + }, + licenses: { + terms: { + field: "rights_list.rightsIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + languages: { + terms: { + field: "language", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + }, + }, + view_count: { sum: { field: "view_count" } }, + download_count: { sum: { field: "download_count" } }, + citation_count: { sum: { field: "citation_count" } }, + content_url_count: { value_count: { field: "content_url" } }, + client_types: { + terms: { + field: "client.client_type", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1 + } + } + }.freeze end end From b9016c464172b8d80cedae52d56660196bebf66f Mon Sep 17 00:00:00 2001 From: jrhoads Date: Mon, 9 Dec 2024 22:10:08 +0100 Subject: [PATCH 06/11] Add spec for selected aggregations --- .../graphql_query_builder_aggregates_spec.rb | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/spec/models/doi/graphql_query_builder_aggregates_spec.rb b/spec/models/doi/graphql_query_builder_aggregates_spec.rb index 9c10e0911..9228fec7d 100644 --- a/spec/models/doi/graphql_query_builder_aggregates_spec.rb +++ b/spec/models/doi/graphql_query_builder_aggregates_spec.rb @@ -206,4 +206,58 @@ expect(builder.aggregations.keys).to match_array(expected_keys) end end + + describe "include aggregations" do + it "includes all aggregations by default" do + expected_keys = described_class.all_aggregation_keys + options = {} + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array(expected_keys) + end + + it "includes all aggregations when :all symbol provided" do + expected_keys = described_class.all_aggregation_keys + options = { include_aggregations: :all } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array(expected_keys) + end + + it "includes all aggregations when 'all' string provided" do + expected_keys = described_class.all_aggregation_keys + options = { include_aggregations: 'all' } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array(expected_keys) + end + + it "returns empty hash when :none provided" do + options = { include_aggregations: :none } + builder = described_class.new(query, options) + expect(builder.aggregations).to eq({}) + end + + it "returns empty hash when 'none' string provided" do + options = { include_aggregations: 'none' } + builder = described_class.new(query, options) + expect(builder.aggregations).to eq({}) + end + + it "includes only specified aggregations when array of symbols provided" do + options = { include_aggregations: [:clients, :languages] } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array([:clients, :languages]) + end + + it "ignores invalid aggregation keys" do + options = { include_aggregations: [:clients, :invalid_key, :languages] } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array([:clients, :languages]) + end + + it "includes all aggregations when :all is included in array" do + expected_keys = described_class.all_aggregation_keys + options = { include_aggregations: [:clients, :all, :languages] } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array(expected_keys) + end + end end From b2292ce71bfaa77e9d6ff10c9e08033a982357aa Mon Sep 17 00:00:00 2001 From: jrhoads Date: Tue, 10 Dec 2024 15:20:48 +0100 Subject: [PATCH 07/11] Add additional specs for edge cases for aggregate specification --- app/models/doi/graphql_query.rb | 9 +- .../graphql_query_builder_aggregates_spec.rb | 198 +++--------------- 2 files changed, 35 insertions(+), 172 deletions(-) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 4134ba068..23333bca4 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -229,10 +229,10 @@ def facet_sizes def requested_aggs included_aggs = @options[:include_aggregations] || :all - ## if included agg is a string, split it on commas - if included_aggs.is_a?(String) - included_aggs = included_aggs.split(",") - end + ## if included agg is a string, split it on commas + if included_aggs.is_a?(String) + included_aggs = included_aggs.split(",").map(&:strip) + end return {} if included_aggs == :none || included_aggs == 'none' Array.wrap(included_aggs).map(&:to_sym) end @@ -258,7 +258,6 @@ def self.all_aggregation_keys AGGREGATION_DEFINITIONS.keys end - private AGGREGATION_DEFINITIONS = { resource_types: { diff --git a/spec/models/doi/graphql_query_builder_aggregates_spec.rb b/spec/models/doi/graphql_query_builder_aggregates_spec.rb index 9228fec7d..c04d03695 100644 --- a/spec/models/doi/graphql_query_builder_aggregates_spec.rb +++ b/spec/models/doi/graphql_query_builder_aggregates_spec.rb @@ -9,173 +9,7 @@ describe "aggregations" do it "by default all aggregations are enabled" do builder = described_class.new(query, options) - expect(builder.aggregations).to eq( - { - affiliations: { terms: { field: "affiliation_id_and_name", min_doc_count: 1, missing: "__missing__", size: 10 } }, - authors: { - aggs: { authors: { - top_hits: { _source: { - includes: ["creators.name", "creators.nameIdentifiers.nameIdentifier"], - }, size: 1 }, - } }, - terms: { field: "creators.nameIdentifiers.nameIdentifier", include: "https?://orcid.org/.*", min_doc_count: 1, size: 10 }, - }, - citation_count: { - sum: { field: "citation_count" }, - }, - client_types: { - terms: { field: "client.client_type", min_doc_count: 1, size: 10 }, - }, - clients: { - terms: { field: "client_id_and_name", min_doc_count: 1, size: 10 }, - }, - content_url_count: { - value_count: { field: "content_url" }, - }, - creators_and_contributors: { - aggs: { - creators_and_contributors: { - top_hits: { _source: { - includes: ["creators_and_contributors.name", "creators_and_contributors.nameIdentifiers.nameIdentifier"], - }, size: 1 }, - }, - work_types: { terms: { field: "resource_type_id_and_name", min_doc_count: 1 } }, - }, - terms: { - field: "creators_and_contributors.nameIdentifiers.nameIdentifier", - include: "https?://orcid.org/.*", - min_doc_count: 1, - size: 10, - }, - }, - download_count: { - sum: { field: "download_count" }, - }, - fields_of_science: { - aggs: { - subject: { - terms: { - field: "subjects.subject", - include: "FOS:.*", - min_doc_count: 1, - size: 10, - }, - }, - }, - filter: { - term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" }, - }, - }, - fields_of_science_combined: { - terms: { - field: "fields_of_science_combined", - min_doc_count: 1, - size: 10, - }, - }, - fields_of_science_repository: { - terms: { - field: "fields_of_science_repository", - min_doc_count: 1, - size: 10, - }, - }, - funders: { - aggs: { funders: { top_hits: { _source: { includes: ["funding_references.funderName", "funding_references.funderIdentifier"] }, size: 1 } } }, - terms: { - field: "funding_references.funderIdentifier", - min_doc_count: 1, - size: 10, - }, - }, - languages: { - terms: { - field: "language", - min_doc_count: 1, - size: 10, - }, - }, - licenses: { - terms: { - field: "rights_list.rightsIdentifier", - min_doc_count: 1, - missing: "__missing__", - size: 10, - }, - }, - open_licenses: { - aggs: { - resource_types: { - terms: { - field: "resource_type_id_and_name", - min_doc_count: 1, - size: 10, - }, - }, - }, - filter: { - terms: { "rights_list.rightsIdentifier": [ - "cc-by-1.0", - "cc-by-2.0", - "cc-by-2.5", - "cc-by-3.0", - "cc-by-3.0-at", - "cc-by-3.0-us", - "cc-by-4.0", - "cc-pddc", - "cc0-1.0", - "cc-pdm-1.0", - ] }, - }, - }, - pid_entities: { - aggs: { - subject: { - terms: { - field: "subjects.subject", - include: [ - "Dataset", - "Publication", - "Software", - "Organization", - "Funder", - "Person", - "Grant", - "Sample", - "Instrument", - "Repository", - "Project", - ], - min_doc_count: 1, - size: 10, - }, - }, - }, - filter: { term: { "subjects.subjectScheme": "PidEntity" } }, - }, - published: { - date_histogram: { - field: "publication_year", - format: "year", - interval: "year", - min_doc_count: 1, - order: { _key: "desc" }, - }, - }, - registration_agencies: { - terms: { field: "agency", min_doc_count: 1, size: 10 }, - }, - resource_types: { terms: { - field: "resource_type_id_and_name", - min_doc_count: 1, - missing: "__missing__", - size: 10, - } }, - view_count: { - sum: { field: "view_count" }, - }, - } - ) + expect(builder.aggregations).to eq(described_class::AGGREGATION_DEFINITIONS) end it "has keys for all aggregates" do @@ -241,6 +75,18 @@ expect(builder.aggregations).to eq({}) end + it "returns empty hash when empty array provided" do + options = { include_aggregations: [] } + builder = described_class.new(query, options) + expect(builder.aggregations).to eq({}) + end + + it "returns empty hash when empty string provided" do + options = { include_aggregations: '' } + builder = described_class.new(query, options) + expect(builder.aggregations).to eq({}) + end + it "includes only specified aggregations when array of symbols provided" do options = { include_aggregations: [:clients, :languages] } builder = described_class.new(query, options) @@ -253,6 +99,24 @@ expect(builder.aggregations.keys).to match_array([:clients, :languages]) end + it "includes only specified aggregations when array of strings provided" do + options = { include_aggregations: ['clients', 'languages'] } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array([:clients, :languages]) + end + + it "includes only specified aggregations when comma separated string provided" do + options = { include_aggregations: 'clients,languages' } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array([:clients, :languages]) + end + + it "includes only specified aggregations when comma separated string provided with spaces" do + options = { include_aggregations: 'clients, languages' } + builder = described_class.new(query, options) + expect(builder.aggregations.keys).to match_array([:clients, :languages]) + end + it "includes all aggregations when :all is included in array" do expected_keys = described_class.all_aggregation_keys options = { include_aggregations: [:clients, :all, :languages] } From 8adba3a0a4915ec2b16c0f1bb3236520f557f7cd Mon Sep 17 00:00:00 2001 From: jrhoads Date: Thu, 12 Dec 2024 11:47:23 +0100 Subject: [PATCH 08/11] Update specs to skip elasticsearch if they don't use them --- spec/models/doi/graphql_query_builder_aggregates_spec.rb | 4 ++-- spec/models/doi/graphql_query_builder_filters_spec.rb | 2 +- spec/models/doi/graphql_query_builder_spec.rb | 2 +- spec/models/doi/related_doi_indexer_spec.rb | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/models/doi/graphql_query_builder_aggregates_spec.rb b/spec/models/doi/graphql_query_builder_aggregates_spec.rb index c04d03695..2ddcd3f08 100644 --- a/spec/models/doi/graphql_query_builder_aggregates_spec.rb +++ b/spec/models/doi/graphql_query_builder_aggregates_spec.rb @@ -2,14 +2,14 @@ require "rails_helper" -RSpec.describe Doi::GraphqlQuery::Builder do +RSpec.describe Doi::GraphqlQuery::Builder, elasticsearch: false, skip_prefix_pool: true do let(:query) { "" } let(:options) { {} } describe "aggregations" do it "by default all aggregations are enabled" do builder = described_class.new(query, options) - expect(builder.aggregations).to eq(described_class::AGGREGATION_DEFINITIONS) + expect(builder.aggregations).to eq(described_class.all_aggregations) end it "has keys for all aggregates" do diff --git a/spec/models/doi/graphql_query_builder_filters_spec.rb b/spec/models/doi/graphql_query_builder_filters_spec.rb index 5953846a0..6c084ed9c 100644 --- a/spec/models/doi/graphql_query_builder_filters_spec.rb +++ b/spec/models/doi/graphql_query_builder_filters_spec.rb @@ -2,7 +2,7 @@ require "rails_helper" -RSpec.describe Doi::GraphqlQuery::Builder do +RSpec.describe Doi::GraphqlQuery::Builder, elasticsearch: false, skip_prefix_pool: true do let(:query) { "" } let(:options) { {} } diff --git a/spec/models/doi/graphql_query_builder_spec.rb b/spec/models/doi/graphql_query_builder_spec.rb index 034adcbcf..1066b7415 100644 --- a/spec/models/doi/graphql_query_builder_spec.rb +++ b/spec/models/doi/graphql_query_builder_spec.rb @@ -3,7 +3,7 @@ require "rails_helper" -RSpec.describe Doi::GraphqlQuery::Builder do +RSpec.describe Doi::GraphqlQuery::Builder, elasticsearch: false, skip_prefix_pool: true do let(:query) { "" } let(:options) { {} } describe "page size" do diff --git a/spec/models/doi/related_doi_indexer_spec.rb b/spec/models/doi/related_doi_indexer_spec.rb index 3ad7b78e6..5e887f6c5 100644 --- a/spec/models/doi/related_doi_indexer_spec.rb +++ b/spec/models/doi/related_doi_indexer_spec.rb @@ -3,7 +3,7 @@ require "rails_helper" -RSpec.describe Doi::Indexer::RelatedDoiIndexer do +RSpec.describe Doi::Indexer::RelatedDoiIndexer , elasticsearch: false, skip_prefix_pool: true do describe "related_dois with different input" do let(:good_related_identifier) do { From 255bb7bed1c9ad5a5bd4d9abc0fd7fade0994bd3 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Thu, 12 Dec 2024 11:53:04 +0100 Subject: [PATCH 09/11] Refactor building aggregations into their own class --- app/models/doi/graphql_query.rb | 258 +---------------- app/models/doi/search/aggregations_builder.rb | 273 ++++++++++++++++++ 2 files changed, 278 insertions(+), 253 deletions(-) create mode 100644 app/models/doi/search/aggregations_builder.rb diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 23333bca4..68eb23e29 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -207,264 +207,16 @@ def inner_query }.compact end - - def facet_sizes - tmp_aggs = selected_aggs - facet_count = (@options[:facet_count] || DEFAULT_FACET_COUNT).to_i - custom_sizes = (@options[:facet_sizes] || {}).select do |key, value| - tmp_aggs.key?(key.to_sym) && value.to_i.positive? - end.transform_keys(&:to_sym) - - if facet_count != DEFAULT_FACET_COUNT && facet_count.positive? - # Create a hash with facet_count for all selected aggregations - default_sizes = tmp_aggs.keys.each_with_object({}) do |key, hash| - # Only set size if the aggregation has a terms query and isn't in custom_sizes - hash[key] = facet_count unless custom_sizes.key?(key) || !tmp_aggs[key]&.dig(:terms) - end - custom_sizes.merge(default_sizes) - else - custom_sizes - end - end - - def requested_aggs - included_aggs = @options[:include_aggregations] || :all - ## if included agg is a string, split it on commas - if included_aggs.is_a?(String) - included_aggs = included_aggs.split(",").map(&:strip) - end - return {} if included_aggs == :none || included_aggs == 'none' - Array.wrap(included_aggs).map(&:to_sym) - end - - def selected_aggs - tmp_aggs = if requested_aggs.include?(:all) - AGGREGATION_DEFINITIONS - else - AGGREGATION_DEFINITIONS.slice(*requested_aggs) - end - Marshal.load(Marshal.dump(tmp_aggs)) - end - def aggregations - aggs = selected_aggs - facet_sizes.each do |key, size| - aggs[key][:terms][:size] = size.to_i if aggs[key]&.dig(:terms) - end - aggs + Doi::Search::AggregationsBuilder.new(@options).build end def self.all_aggregation_keys - AGGREGATION_DEFINITIONS.keys + Doi::Search::AggregationsBuilder.all_aggregation_keys end - - AGGREGATION_DEFINITIONS = { - resource_types: { - terms: { - field: "resource_type_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - missing: "__missing__", - }, - }, - clients: { terms: { - field: "client_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } }, - open_licenses: { - filter: { terms: { - "rights_list.rightsIdentifier": [ - "cc-by-1.0", - "cc-by-2.0", - "cc-by-2.5", - "cc-by-3.0", - "cc-by-3.0-at", - "cc-by-3.0-us", - "cc-by-4.0", - "cc-pddc", - "cc0-1.0", - "cc-pdm-1.0" - ] - } }, - aggs: { - resource_types: { - terms: { - field: "resource_type_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } - } - } - }, - published: { - date_histogram: { - field: "publication_year", - interval: "year", - format: "year", - order: { - _key: "desc", - }, - min_doc_count: 1, - }, - }, - registration_agencies: { - terms: { - field: "agency", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - }, - }, - affiliations: { - terms: { - field: "affiliation_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - missing: "__missing__", - }, - }, - authors: { - terms: { - field: "creators.nameIdentifiers.nameIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - include: "https?://orcid.org/.*", - }, - aggs: { - authors: { - top_hits: { - _source: { - includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] - }, - size: 1 - } - } - } - }, - creators_and_contributors: { - terms: { - field: "creators_and_contributors.nameIdentifiers.nameIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - include: "https?://orcid.org/.*" - }, - aggs: { - creators_and_contributors: { - top_hits: { - _source: { - includes: [ - "creators_and_contributors.name", - "creators_and_contributors.nameIdentifiers.nameIdentifier" - ] - }, - size: 1 - } - }, - "work_types": { - "terms": { - "field": "resource_type_id_and_name", - "min_doc_count": 1 - } - } - } - }, - funders: { - terms: { - field: "funding_references.funderIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1 - }, - aggs: { - funders: { - top_hits: { - _source: { - includes: [ - "funding_references.funderName", - "funding_references.funderIdentifier" - ] - }, - size: 1 - } - } - } - }, - pid_entities: { - filter: { term: { "subjects.subjectScheme": "PidEntity" } }, - aggs: { - subject: { terms: { - field: "subjects.subject", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - include: %w( - Dataset - Publication - Software - Organization - Funder - Person - Grant - Sample - Instrument - Repository - Project - ) - } }, - }, - }, - fields_of_science: { - filter: { - term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" }, - }, - aggs: { - subject: { terms: { - field: "subjects.subject", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - include: "FOS:.*", - } }, - }, - }, - fields_of_science_combined: { - terms: { - field: "fields_of_science_combined", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } - }, - fields_of_science_repository: { - terms: { - field: "fields_of_science_repository", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } - }, - licenses: { - terms: { - field: "rights_list.rightsIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - missing: "__missing__", - }, - }, - languages: { - terms: { - field: "language", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - }, - }, - view_count: { sum: { field: "view_count" } }, - download_count: { sum: { field: "download_count" } }, - citation_count: { sum: { field: "citation_count" } }, - content_url_count: { value_count: { field: "content_url" } }, - client_types: { - terms: { - field: "client.client_type", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1 - } - } - }.freeze + def self.all_aggregations + Doi::Search::AggregationsBuilder.all_aggregations + end end end diff --git a/app/models/doi/search/aggregations_builder.rb b/app/models/doi/search/aggregations_builder.rb new file mode 100644 index 000000000..6d2a7942d --- /dev/null +++ b/app/models/doi/search/aggregations_builder.rb @@ -0,0 +1,273 @@ +# frozen_string_literal: true +module Doi::Search + class AggregationsBuilder + DEFAULT_FACET_COUNT = 10 + + def initialize(options) + @options = options + end + + def build + aggs = selected_aggs + facet_sizes.each do |key, size| + aggs[key][:terms][:size] = size.to_i if aggs[key]&.dig(:terms) + end + aggs + end + + def facet_sizes + facet_count = (@options[:facet_count] || DEFAULT_FACET_COUNT).to_i + custom_sizes = (@options[:facet_sizes] || {}).select do |key, value| + selected_aggs.key?(key.to_sym) && value.to_i.positive? + end.transform_keys(&:to_sym) + + if facet_count != DEFAULT_FACET_COUNT && facet_count.positive? + # Create a hash with facet_count for all selected aggregations that have terms + default_sizes = selected_aggs.each_with_object({}) do |(key, agg), hash| + hash[key] = facet_count if agg&.dig(:terms) + end + + # Let custom sizes override the defaults + default_sizes.merge(custom_sizes) + else + custom_sizes + end + end + + def requested_aggs + included_aggs = @options[:include_aggregations] || :all + if included_aggs.is_a?(String) + included_aggs = included_aggs.split(",").map(&:strip) + end + return {} if included_aggs == :none || included_aggs == 'none' + + Array.wrap(included_aggs).map(&:to_sym) + end + + def selected_aggs + tmp_aggs = if requested_aggs.include?(:all) + AGGREGATION_DEFINITIONS + else + AGGREGATION_DEFINITIONS.slice(*requested_aggs) + end + Marshal.load(Marshal.dump(tmp_aggs)) + end + + def self.all_aggregation_keys + AGGREGATION_DEFINITIONS.keys + end + + def self.all_aggregations + AGGREGATION_DEFINITIONS + end + + private + AGGREGATION_DEFINITIONS = { + resource_types: { + terms: { + field: "resource_type_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + clients: { terms: { + field: "client_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } }, + open_licenses: { + filter: { terms: { + "rights_list.rightsIdentifier": [ + "cc-by-1.0", + "cc-by-2.0", + "cc-by-2.5", + "cc-by-3.0", + "cc-by-3.0-at", + "cc-by-3.0-us", + "cc-by-4.0", + "cc-pddc", + "cc0-1.0", + "cc-pdm-1.0" + ] + } }, + aggs: { + resource_types: { + terms: { + field: "resource_type_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } + } + } + }, + published: { + date_histogram: { + field: "publication_year", + interval: "year", + format: "year", + order: { + _key: "desc", + }, + min_doc_count: 1, + }, + }, + registration_agencies: { + terms: { + field: "agency", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + }, + }, + affiliations: { + terms: { + field: "affiliation_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + authors: { + terms: { + field: "creators.nameIdentifiers.nameIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "https?://orcid.org/.*", + }, + aggs: { + authors: { + top_hits: { + _source: { + includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] + }, + size: 1 + } + } + } + }, + creators_and_contributors: { + terms: { + field: "creators_and_contributors.nameIdentifiers.nameIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "https?://orcid.org/.*" + }, + aggs: { + creators_and_contributors: { + top_hits: { + _source: { + includes: [ + "creators_and_contributors.name", + "creators_and_contributors.nameIdentifiers.nameIdentifier" + ] + }, + size: 1 + } + }, + "work_types": { + "terms": { + "field": "resource_type_id_and_name", + "min_doc_count": 1 + } + } + } + }, + funders: { + terms: { + field: "funding_references.funderIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1 + }, + aggs: { + funders: { + top_hits: { + _source: { + includes: [ + "funding_references.funderName", + "funding_references.funderIdentifier" + ] + }, + size: 1 + } + } + } + }, + pid_entities: { + filter: { term: { "subjects.subjectScheme": "PidEntity" } }, + aggs: { + subject: { terms: { + field: "subjects.subject", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: %w( + Dataset + Publication + Software + Organization + Funder + Person + Grant + Sample + Instrument + Repository + Project + ) + } }, + }, + }, + fields_of_science: { + filter: { + term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" }, + }, + aggs: { + subject: { terms: { + field: "subjects.subject", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "FOS:.*", + } }, + }, + }, + fields_of_science_combined: { + terms: { + field: "fields_of_science_combined", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } + }, + fields_of_science_repository: { + terms: { + field: "fields_of_science_repository", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } + }, + licenses: { + terms: { + field: "rights_list.rightsIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + languages: { + terms: { + field: "language", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + }, + }, + view_count: { sum: { field: "view_count" } }, + download_count: { sum: { field: "download_count" } }, + citation_count: { sum: { field: "citation_count" } }, + content_url_count: { value_count: { field: "content_url" } }, + client_types: { + terms: { + field: "client.client_type", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1 + } + } + }.freeze + end +end From 13f7087d970eba78ac3faf55c2476431c318758f Mon Sep 17 00:00:00 2001 From: jrhoads Date: Thu, 12 Dec 2024 16:31:57 +0100 Subject: [PATCH 10/11] Appease Rubocop --- app/models/doi/search/aggregations_builder.rb | 371 +++++++++--------- .../graphql_query_builder_aggregates_spec.rb | 12 +- spec/models/doi/related_doi_indexer_spec.rb | 2 +- 3 files changed, 193 insertions(+), 192 deletions(-) diff --git a/app/models/doi/search/aggregations_builder.rb b/app/models/doi/search/aggregations_builder.rb index 6d2a7942d..d13e49cdd 100644 --- a/app/models/doi/search/aggregations_builder.rb +++ b/app/models/doi/search/aggregations_builder.rb @@ -1,4 +1,5 @@ # frozen_string_literal: true + module Doi::Search class AggregationsBuilder DEFAULT_FACET_COUNT = 10 @@ -39,17 +40,17 @@ def requested_aggs if included_aggs.is_a?(String) included_aggs = included_aggs.split(",").map(&:strip) end - return {} if included_aggs == :none || included_aggs == 'none' + return {} if included_aggs == :none || included_aggs == "none" Array.wrap(included_aggs).map(&:to_sym) end def selected_aggs tmp_aggs = if requested_aggs.include?(:all) - AGGREGATION_DEFINITIONS - else - AGGREGATION_DEFINITIONS.slice(*requested_aggs) - end + AGGREGATION_DEFINITIONS + else + AGGREGATION_DEFINITIONS.slice(*requested_aggs) + end Marshal.load(Marshal.dump(tmp_aggs)) end @@ -62,212 +63,212 @@ def self.all_aggregations end private - AGGREGATION_DEFINITIONS = { - resource_types: { - terms: { - field: "resource_type_id_and_name", + AGGREGATION_DEFINITIONS = { + resource_types: { + terms: { + field: "resource_type_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, + }, + clients: { terms: { + field: "client_id_and_name", size: DEFAULT_FACET_COUNT, min_doc_count: 1, - missing: "__missing__", - }, - }, - clients: { terms: { - field: "client_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } }, - open_licenses: { - filter: { terms: { - "rights_list.rightsIdentifier": [ - "cc-by-1.0", - "cc-by-2.0", - "cc-by-2.5", - "cc-by-3.0", - "cc-by-3.0-at", - "cc-by-3.0-us", - "cc-by-4.0", - "cc-pddc", - "cc0-1.0", - "cc-pdm-1.0" - ] } }, - aggs: { - resource_types: { - terms: { - field: "resource_type_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, + open_licenses: { + filter: { terms: { + "rights_list.rightsIdentifier": [ + "cc-by-1.0", + "cc-by-2.0", + "cc-by-2.5", + "cc-by-3.0", + "cc-by-3.0-at", + "cc-by-3.0-us", + "cc-by-4.0", + "cc-pddc", + "cc0-1.0", + "cc-pdm-1.0" + ] + } }, + aggs: { + resource_types: { + terms: { + field: "resource_type_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + } } } - } - }, - published: { - date_histogram: { - field: "publication_year", - interval: "year", - format: "year", - order: { - _key: "desc", - }, - min_doc_count: 1, }, - }, - registration_agencies: { - terms: { - field: "agency", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, + published: { + date_histogram: { + field: "publication_year", + interval: "year", + format: "year", + order: { + _key: "desc", + }, + min_doc_count: 1, + }, }, - }, - affiliations: { - terms: { - field: "affiliation_id_and_name", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - missing: "__missing__", + registration_agencies: { + terms: { + field: "agency", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + }, }, - }, - authors: { - terms: { - field: "creators.nameIdentifiers.nameIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - include: "https?://orcid.org/.*", + affiliations: { + terms: { + field: "affiliation_id_and_name", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, }, - aggs: { - authors: { - top_hits: { - _source: { - includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] - }, - size: 1 + authors: { + terms: { + field: "creators.nameIdentifiers.nameIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "https?://orcid.org/.*", + }, + aggs: { + authors: { + top_hits: { + _source: { + includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] + }, + size: 1 + } } } - } - }, - creators_and_contributors: { - terms: { - field: "creators_and_contributors.nameIdentifiers.nameIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - include: "https?://orcid.org/.*" }, - aggs: { - creators_and_contributors: { - top_hits: { - _source: { - includes: [ - "creators_and_contributors.name", - "creators_and_contributors.nameIdentifiers.nameIdentifier" - ] - }, - size: 1 - } + creators_and_contributors: { + terms: { + field: "creators_and_contributors.nameIdentifiers.nameIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "https?://orcid.org/.*" }, - "work_types": { - "terms": { - "field": "resource_type_id_and_name", - "min_doc_count": 1 + aggs: { + creators_and_contributors: { + top_hits: { + _source: { + includes: [ + "creators_and_contributors.name", + "creators_and_contributors.nameIdentifiers.nameIdentifier" + ] + }, + size: 1 + } + }, + "work_types": { + "terms": { + "field": "resource_type_id_and_name", + "min_doc_count": 1 + } } } - } - }, - funders: { - terms: { - field: "funding_references.funderIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1 }, - aggs: { - funders: { - top_hits: { - _source: { - includes: [ - "funding_references.funderName", - "funding_references.funderIdentifier" - ] - }, - size: 1 + funders: { + terms: { + field: "funding_references.funderIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1 + }, + aggs: { + funders: { + top_hits: { + _source: { + includes: [ + "funding_references.funderName", + "funding_references.funderIdentifier" + ] + }, + size: 1 + } } } - } - }, - pid_entities: { - filter: { term: { "subjects.subjectScheme": "PidEntity" } }, - aggs: { - subject: { terms: { - field: "subjects.subject", + }, + pid_entities: { + filter: { term: { "subjects.subjectScheme": "PidEntity" } }, + aggs: { + subject: { terms: { + field: "subjects.subject", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: %w( + Dataset + Publication + Software + Organization + Funder + Person + Grant + Sample + Instrument + Repository + Project + ) + } }, + }, + }, + fields_of_science: { + filter: { + term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" }, + }, + aggs: { + subject: { terms: { + field: "subjects.subject", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + include: "FOS:.*", + } }, + }, + }, + fields_of_science_combined: { + terms: { + field: "fields_of_science_combined", size: DEFAULT_FACET_COUNT, min_doc_count: 1, - include: %w( - Dataset - Publication - Software - Organization - Funder - Person - Grant - Sample - Instrument - Repository - Project - ) - } }, - }, - }, - fields_of_science: { - filter: { - term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" }, + } }, - aggs: { - subject: { terms: { - field: "subjects.subject", + fields_of_science_repository: { + terms: { + field: "fields_of_science_repository", size: DEFAULT_FACET_COUNT, min_doc_count: 1, - include: "FOS:.*", - } }, + } }, - }, - fields_of_science_combined: { - terms: { - field: "fields_of_science_combined", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } - }, - fields_of_science_repository: { - terms: { - field: "fields_of_science_repository", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - } - }, - licenses: { - terms: { - field: "rights_list.rightsIdentifier", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, - missing: "__missing__", + licenses: { + terms: { + field: "rights_list.rightsIdentifier", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + missing: "__missing__", + }, }, - }, - languages: { - terms: { - field: "language", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1, + languages: { + terms: { + field: "language", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1, + }, }, - }, - view_count: { sum: { field: "view_count" } }, - download_count: { sum: { field: "download_count" } }, - citation_count: { sum: { field: "citation_count" } }, - content_url_count: { value_count: { field: "content_url" } }, - client_types: { - terms: { - field: "client.client_type", - size: DEFAULT_FACET_COUNT, - min_doc_count: 1 + view_count: { sum: { field: "view_count" } }, + download_count: { sum: { field: "download_count" } }, + citation_count: { sum: { field: "citation_count" } }, + content_url_count: { value_count: { field: "content_url" } }, + client_types: { + terms: { + field: "client.client_type", + size: DEFAULT_FACET_COUNT, + min_doc_count: 1 + } } - } - }.freeze + }.freeze end end diff --git a/spec/models/doi/graphql_query_builder_aggregates_spec.rb b/spec/models/doi/graphql_query_builder_aggregates_spec.rb index 2ddcd3f08..5f073d819 100644 --- a/spec/models/doi/graphql_query_builder_aggregates_spec.rb +++ b/spec/models/doi/graphql_query_builder_aggregates_spec.rb @@ -58,7 +58,7 @@ it "includes all aggregations when 'all' string provided" do expected_keys = described_class.all_aggregation_keys - options = { include_aggregations: 'all' } + options = { include_aggregations: "all" } builder = described_class.new(query, options) expect(builder.aggregations.keys).to match_array(expected_keys) end @@ -70,7 +70,7 @@ end it "returns empty hash when 'none' string provided" do - options = { include_aggregations: 'none' } + options = { include_aggregations: "none" } builder = described_class.new(query, options) expect(builder.aggregations).to eq({}) end @@ -82,7 +82,7 @@ end it "returns empty hash when empty string provided" do - options = { include_aggregations: '' } + options = { include_aggregations: "" } builder = described_class.new(query, options) expect(builder.aggregations).to eq({}) end @@ -100,19 +100,19 @@ end it "includes only specified aggregations when array of strings provided" do - options = { include_aggregations: ['clients', 'languages'] } + options = { include_aggregations: ["clients", "languages"] } builder = described_class.new(query, options) expect(builder.aggregations.keys).to match_array([:clients, :languages]) end it "includes only specified aggregations when comma separated string provided" do - options = { include_aggregations: 'clients,languages' } + options = { include_aggregations: "clients,languages" } builder = described_class.new(query, options) expect(builder.aggregations.keys).to match_array([:clients, :languages]) end it "includes only specified aggregations when comma separated string provided with spaces" do - options = { include_aggregations: 'clients, languages' } + options = { include_aggregations: "clients, languages" } builder = described_class.new(query, options) expect(builder.aggregations.keys).to match_array([:clients, :languages]) end diff --git a/spec/models/doi/related_doi_indexer_spec.rb b/spec/models/doi/related_doi_indexer_spec.rb index 5e887f6c5..ddaa362aa 100644 --- a/spec/models/doi/related_doi_indexer_spec.rb +++ b/spec/models/doi/related_doi_indexer_spec.rb @@ -3,7 +3,7 @@ require "rails_helper" -RSpec.describe Doi::Indexer::RelatedDoiIndexer , elasticsearch: false, skip_prefix_pool: true do +RSpec.describe Doi::Indexer::RelatedDoiIndexer, elasticsearch: false, skip_prefix_pool: true do describe "related_dois with different input" do let(:good_related_identifier) do { From 69d49bda47e7f7c687dae780c13a0bd53fe08713 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Fri, 13 Dec 2024 11:50:41 +0100 Subject: [PATCH 11/11] Do not return facets if the facet count is set to 0 --- app/models/doi/search/aggregations_builder.rb | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/app/models/doi/search/aggregations_builder.rb b/app/models/doi/search/aggregations_builder.rb index d13e49cdd..764246dce 100644 --- a/app/models/doi/search/aggregations_builder.rb +++ b/app/models/doi/search/aggregations_builder.rb @@ -6,10 +6,13 @@ class AggregationsBuilder def initialize(options) @options = options + @facet_count = (@options[:facet_count] || DEFAULT_FACET_COUNT).to_i + @selected_aggs = selected_aggs end def build - aggs = selected_aggs + return {} if @facet_count == 0 + aggs = @selected_aggs facet_sizes.each do |key, size| aggs[key][:terms][:size] = size.to_i if aggs[key]&.dig(:terms) end @@ -17,15 +20,14 @@ def build end def facet_sizes - facet_count = (@options[:facet_count] || DEFAULT_FACET_COUNT).to_i custom_sizes = (@options[:facet_sizes] || {}).select do |key, value| - selected_aggs.key?(key.to_sym) && value.to_i.positive? + @selected_aggs.key?(key.to_sym) && value.to_i.positive? end.transform_keys(&:to_sym) - if facet_count != DEFAULT_FACET_COUNT && facet_count.positive? + if @facet_count != DEFAULT_FACET_COUNT && @facet_count.positive? # Create a hash with facet_count for all selected aggregations that have terms - default_sizes = selected_aggs.each_with_object({}) do |(key, agg), hash| - hash[key] = facet_count if agg&.dig(:terms) + default_sizes = @selected_aggs.each_with_object({}) do |(key, agg), hash| + hash[key] = @facet_count if agg&.dig(:terms) end # Let custom sizes override the defaults