From 79c459c11f3b0bbad30ccd06c67ab2e1474dc912 Mon Sep 17 00:00:00 2001 From: Peter Mangiafico Date: Fri, 11 Mar 2022 14:55:12 -0800 Subject: [PATCH 1/2] add provenance convenience column to pub table; rake task to backfill data --- app/models/publication.rb | 6 +++++- .../20220311182838_add_provenance_field.rb | 6 ++++++ db/schema.rb | 2 ++ lib/tasks/data.rake | 21 +++++++++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 db/migrate/20220311182838_add_provenance_field.rb create mode 100644 lib/tasks/data.rake diff --git a/app/models/publication.rb b/app/models/publication.rb index d10d53a16..e16c7d251 100644 --- a/app/models/publication.rb +++ b/app/models/publication.rb @@ -31,6 +31,8 @@ class Publication < ApplicationRecord self.publication_type = pub_hash[:type] if pub_hash[:type].present? self.year = pub_hash[:year] if pub_hash[:year].present? self.wos_uid ||= web_of_science_source_record.uid if web_of_science_source_record.present? + # NOTE: we already validate the presence and value of newly generated provenance with the PubHashValidator, though it is possible for old bad data to exist + self.provenance = pub_hash[:provenance].to_s.downcase # could be nil or CAPS in old bad data end has_one :batch_uploaded_source_record, dependent: :destroy @@ -285,7 +287,9 @@ def authoritative_doi_source? 
private - # @return [String] might be empty, won't be nil + # @return [String] might be empty, won't be nil, normalize since we have some older data in varying cases + # @note obscures ActiveRecord field/attribute getter for provenance, once we are sure we have backfilled all previous + # records with the rake data:add_provenance rake task, we can get rid of this method def provenance pub_hash[:provenance].to_s.downcase end diff --git a/db/migrate/20220311182838_add_provenance_field.rb b/db/migrate/20220311182838_add_provenance_field.rb new file mode 100644 index 000000000..1cbe62292 --- /dev/null +++ b/db/migrate/20220311182838_add_provenance_field.rb @@ -0,0 +1,6 @@ +class AddProvenanceField < ActiveRecord::Migration[6.1] + def change + add_column :publications, :provenance, :string + add_index :publications, :provenance + end +end diff --git a/db/schema.rb b/db/schema.rb index 6d16bcf01..055468448 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -139,9 +139,11 @@ t.string "issn", limit: 255 t.string "publication_type", limit: 255 t.string "wos_uid" + t.string "provenance" t.index ["issn"], name: "index_publications_on_issn" t.index ["pages"], name: "index_publications_on_pages" t.index ["pmid"], name: "index_publications_on_pmid" + t.index ["provenance"], name: "index_publications_on_provenance" t.index ["sciencewire_id"], name: "index_publications_on_sciencewire_id" t.index ["title"], name: "index_publications_on_title" t.index ["updated_at"], name: "index_publications_on_updated_at" diff --git a/lib/tasks/data.rake b/lib/tasks/data.rake new file mode 100644 index 000000000..435a839ff --- /dev/null +++ b/lib/tasks/data.rake @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +namespace :data do + desc 'Backfill provenance into AR column for all records' + # A new field was added to the publication table to allow for querying on publication provenance (already stored in pub_hash). 
+ # This task goes through all publications whose provenance column is still nil and fills it from the pub_hash, normalized the same way as `Publication#provenance` + # After this task completes, we can remove the `Publication#provenance` method + # RAILS_ENV=production bundle exec rake data:add_provenance # will backfill the provenance column for every publication where it is still nil + # rubocop:disable Rails/SkipsModelValidations + task add_provenance: :environment do |_t, _args| + num_pubs = Publication.where(provenance: nil).count + puts "Started at #{Time.zone.now}" + puts "Found #{num_pubs} with missing provenance." + Publication.where(provenance: nil).find_each.with_index do |pub, i| + puts "#{i + 1} of #{num_pubs}" + pub.update_column('provenance', pub.pub_hash[:provenance].to_s.downcase) # skip callbacks and timestamp updates; normalize like Publication#provenance so nil becomes '' and old CAPS data is downcased (keeps re-runs from re-selecting the row) + end + puts "Finished at #{Time.zone.now}" + end + # rubocop:enable Rails/SkipsModelValidations +end From a374e1d952150e39ff7cb69907159588cc37e9bc Mon Sep 17 00:00:00 2001 From: Peter Mangiafico Date: Fri, 18 Mar 2022 12:59:29 -0700 Subject: [PATCH 2/2] add note --- app/models/publication.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/models/publication.rb b/app/models/publication.rb index e16c7d251..7f81db3dd 100644 --- a/app/models/publication.rb +++ b/app/models/publication.rb @@ -288,8 +288,9 @@ def authoritative_doi_source? private # @return [String] might be empty, won't be nil, normalize since we have some older data in varying cases - # @note obscures ActiveRecord field/attribute getter for provenance, once we are sure we have backfilled all previous + # TODO: obscures ActiveRecord field/attribute getter for provenance, once we are sure we have backfilled all previous # records with the rake data:add_provenance rake task, we can get rid of this method + # see https://github.com/sul-dlss/sul_pub/issues/1467 def provenance pub_hash[:provenance].to_s.downcase end