diff --git a/app/models/publication.rb b/app/models/publication.rb
index d10d53a16..7f81db3dd 100644
--- a/app/models/publication.rb
+++ b/app/models/publication.rb
@@ -31,6 +31,8 @@ class Publication < ApplicationRecord
     self.publication_type = pub_hash[:type] if pub_hash[:type].present?
     self.year = pub_hash[:year] if pub_hash[:year].present?
     self.wos_uid ||= web_of_science_source_record.uid if web_of_science_source_record.present?
+    # NOTE: we already validate the presence and value of newly generated provenance with the PubHashValidator, though it is possible for old bad data to exist
+    self.provenance = pub_hash[:provenance].to_s.downcase # could be nil or CAPS in old bad data
   end
 
   has_one :batch_uploaded_source_record, dependent: :destroy
@@ -285,7 +287,10 @@ def authoritative_doi_source?
 
   private
 
-  # @return [String] might be empty, won't be nil
+  # @return [String] provenance read from pub_hash, downcased because older data varies in case; may be empty, never nil
+  # TODO: this method shadows the ActiveRecord attribute getter for the provenance column. Once we are sure all
+  #  previous records have been backfilled by the data:add_provenance rake task, we can get rid of this method
+  #  see https://github.com/sul-dlss/sul_pub/issues/1467
   def provenance
     pub_hash[:provenance].to_s.downcase
   end
diff --git a/db/migrate/20220311182838_add_provenance_field.rb b/db/migrate/20220311182838_add_provenance_field.rb
new file mode 100644
index 000000000..1cbe62292
--- /dev/null
+++ b/db/migrate/20220311182838_add_provenance_field.rb
@@ -0,0 +1,6 @@
+class AddProvenanceField < ActiveRecord::Migration[6.1] # adds an indexed string column `provenance` to publications
+  def change
+    add_column :publications, :provenance, :string # no default is given, so existing rows start out NULL
+    add_index :publications, :provenance # index so publications can be queried by provenance
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 6d16bcf01..055468448 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -139,9 +139,11 @@
     t.string "issn", limit: 255
     t.string "publication_type", limit: 255
     t.string "wos_uid"
+    t.string "provenance"
     t.index ["issn"], name: "index_publications_on_issn"
     t.index ["pages"], name: "index_publications_on_pages"
     t.index ["pmid"], name: "index_publications_on_pmid"
+    t.index ["provenance"], name: "index_publications_on_provenance"
     t.index ["sciencewire_id"], name: "index_publications_on_sciencewire_id"
     t.index ["title"], name: "index_publications_on_title"
     t.index ["updated_at"], name: "index_publications_on_updated_at"
diff --git a/lib/tasks/data.rake b/lib/tasks/data.rake
new file mode 100644
index 000000000..435a839ff
--- /dev/null
+++ b/lib/tasks/data.rake
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+namespace :data do
+  desc 'Backfill provenance into AR column for all records'
+  # A new field was added to the publication table to allow for querying on publication provenance (already stored in pub_hash).
+  # This task goes through all publications with a nil provenance column and sets it from the pub_hash, downcased as in Publication
+  # After this task completes, we can remove the `Publication#provenance` method
+  # RAILS_ENV=production bundle exec rake data:add_provenance # will backfill the provenance column from pub_hash[:provenance]
+  # rubocop:disable Rails/SkipsModelValidations
+  task add_provenance: :environment do |_t, _args|
+    num_pubs = Publication.where(provenance: nil).count
+    puts "Started at #{Time.zone.now}"
+    puts "Found #{num_pubs} with missing provenance."
+    Publication.where(provenance: nil).find_each.with_index do |pub, i|
+      puts "#{i + 1} of #{num_pubs}"
+      pub.update_column('provenance', pub.pub_hash[:provenance].to_s.downcase) # skip callbacks/timestamps; old data may be nil or CAPS
+    end
+    puts "Finished at #{Time.zone.now}"
+  end
+  # rubocop:enable Rails/SkipsModelValidations
+end