Skip to content

Commit

Permalink
add provenance convenience column to pub table; rake task to backfill…
Browse files Browse the repository at this point in the history
… data
  • Loading branch information
peetucket committed Mar 18, 2022
1 parent 7806c7d commit f726205
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 2 deletions.
6 changes: 5 additions & 1 deletion app/models/publication.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class Publication < ApplicationRecord
self.publication_type = pub_hash[:type] if pub_hash[:type].present?
self.year = pub_hash[:year] if pub_hash[:year].present?
self.wos_uid ||= web_of_science_source_record.uid if web_of_science_source_record.present?
# NOTE: we already validate the presence and value of newly generated provenance with the PubHashValidator, though it is possible for old bad data to exist
self.provenance = pub_hash[:provenance].to_s.downcase # could be nil or CAPS in old bad data
end

has_one :batch_uploaded_source_record, dependent: :destroy
Expand Down Expand Up @@ -270,7 +272,9 @@ def authoritative_doi_source?

private

# @return [String] might be empty, won't be nil
# @return [String] might be empty, won't be nil, normalize since we have some older data in varying cases
# @note shadows the ActiveRecord attribute getter for provenance; once we are sure that all previous records
# have been backfilled with the data:add_provenance rake task, this method can be removed
def provenance
  # Read the raw value out of the pub_hash; it may be absent (nil) or upper-cased in older records.
  raw_provenance = pub_hash[:provenance]
  # nil becomes '' via to_s, then the case is normalized so callers always get a lower-case String.
  raw_provenance.to_s.downcase
end
Expand Down
6 changes: 6 additions & 0 deletions db/migrate/20220311182838_add_provenance_field.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Adds a string `provenance` column, plus an index on it, to the publications table.
class AddProvenanceField < ActiveRecord::Migration[6.1]
  # Declares the column and its index together inside a single change_table block.
  def change
    change_table :publications do |table|
      table.string :provenance
      table.index :provenance
    end
  end
end
4 changes: 3 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2022_01_11_185021) do
ActiveRecord::Schema.define(version: 2022_03_11_182838) do

create_table "author_identities", id: :integer, charset: "utf8", collation: "utf8_unicode_ci", force: :cascade do |t|
t.integer "author_id", null: false
Expand Down Expand Up @@ -140,9 +140,11 @@
t.string "issn"
t.string "publication_type"
t.string "wos_uid"
t.string "provenance"
t.index ["issn"], name: "index_publications_on_issn"
t.index ["pages"], name: "index_publications_on_pages"
t.index ["pmid"], name: "index_publications_on_pmid"
t.index ["provenance"], name: "index_publications_on_provenance"
t.index ["sciencewire_id"], name: "index_publications_on_sciencewire_id"
t.index ["title"], name: "index_publications_on_title", length: 255
t.index ["updated_at"], name: "index_publications_on_updated_at"
Expand Down
21 changes: 21 additions & 0 deletions lib/tasks/data.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# frozen_string_literal: true

namespace :data do
  desc 'Backfill provenance into AR column for all records'
  # A new field was added to the publication table to allow for querying on publication provenance
  # (already stored in pub_hash). This task goes through all publications whose provenance column is
  # still nil and copies the value over from the pub_hash.
  # The value is normalized the same way the Publication model does it (`to_s.downcase`), since older
  # records may have a missing or upper-case provenance in their pub_hash.
  # After this task completes, we can remove the `Publication#provenance` method.
  #
  # Usage:
  #   RAILS_ENV=production bundle exec rake data:add_provenance
  # rubocop:disable Rails/SkipsModelValidations
  task add_provenance: :environment do |_t, _args|
    missing = Publication.where(provenance: nil)
    num_pubs = missing.count
    puts "Started at #{Time.zone.now}"
    puts "Found #{num_pubs} with missing provenance."
    missing.find_each.with_index(1) do |pub, position|
      puts "#{position} of #{num_pubs}"
      # skip callbacks and timestamp updates, just set the (normalized) value
      pub.update_column('provenance', pub.pub_hash[:provenance].to_s.downcase)
    end
    puts "Finished at #{Time.zone.now}"
  end
  # rubocop:enable Rails/SkipsModelValidations
end

0 comments on commit f726205

Please sign in to comment.