Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 23 additions & 10 deletions app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,16 @@ def validate_csv(csv_file:, zip_file: nil, admin_set_id: nil) # rubocop:disable
file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id)

# 10. Item hierarchy for UI display
collections, works, file_sets = extract_validation_items(csv_data)
collections, works, file_sets = extract_validation_items(csv_data, all_ids)

# 11. Assemble result
source_id_missing = !headers.map(&:to_s).include?(source_id_key.to_s)
if source_id_missing && Bulkrax.fill_in_blank_source_identifiers.blank?
all_models.each do |model|
missing_required << { model: model, field: source_id_key.to_s }
end
end

row_errors = validator_context[:errors]
has_errors = missing_required.any? || headers.blank? || csv_data.empty? ||
file_validator.missing_files.any? || row_errors.any?
Expand Down Expand Up @@ -189,14 +196,14 @@ def resolve_parent_split_pattern(mappings)
split_val
end

def extract_validation_items(csv_data) # rubocop:disable Metrics/MethodLength
def extract_validation_items(csv_data, all_ids = Set.new) # rubocop:disable Metrics/MethodLength
child_to_parents = build_child_to_parents_map(csv_data)
collections = []
works = []
file_sets = []

csv_data.each do |item|
categorise_validation_item(item, child_to_parents, collections, works, file_sets)
categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets)
end

[collections, works, file_sets]
Expand All @@ -205,32 +212,34 @@ def extract_validation_items(csv_data) # rubocop:disable Metrics/MethodLength
def build_child_to_parents_map(csv_data)
Hash.new { |h, k| h[k] = [] }.tap do |map|
csv_data.each do |item|
next if item[:source_identifier].blank?

parse_relationship_field(item[:children]).each do |child_id|
map[child_id] << item[:source_identifier]
end
end
end
end

def categorise_validation_item(item, child_to_parents, collections, works, file_sets)
def categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets) # rubocop:disable Metrics/ParameterLists
item_id = item[:source_identifier]
title = item[:raw_row]['title'] || item_id
model_str = item[:model].to_s

if model_str.casecmp('collection').zero? || model_str.casecmp('collectionresource').zero?
explicit = parse_relationship_field(item[:parent])
inferred = child_to_parents[item_id] || []
explicit = resolvable_ids(parse_relationship_field(item[:parent]), all_ids)
inferred = resolvable_ids(child_to_parents[item_id] || [], all_ids)
collections << { id: item_id, title: title, type: 'collection',
parentIds: (explicit + inferred).uniq,
childIds: parse_relationship_field(item[:children]) }
childIds: resolvable_ids(parse_relationship_field(item[:children]), all_ids) }
elsif model_str.casecmp('fileset').zero? || model_str.casecmp('hyrax::fileset').zero?
file_sets << { id: item_id, title: title, type: 'file_set' }
else
explicit = parse_relationship_field(item[:parent])
inferred = child_to_parents[item_id] || []
explicit = resolvable_ids(parse_relationship_field(item[:parent]), all_ids)
inferred = resolvable_ids(child_to_parents[item_id] || [], all_ids)
works << { id: item_id, title: title, type: 'work',
parentIds: (explicit + inferred).uniq,
childIds: parse_relationship_field(item[:children]) }
childIds: resolvable_ids(parse_relationship_field(item[:children]), all_ids) }
end
end

Expand All @@ -239,6 +248,10 @@ def parse_relationship_field(value)
value.to_s.split('|').map(&:strip).reject(&:blank?)
end

def resolvable_ids(ids, all_ids)
ids.select { |id| all_ids.include?(id) }
end

def apply_rights_statement_validation_override!(result, missing_required)
only_rights = missing_required.present? &&
missing_required.all? { |h| h[:field].to_s == 'rights_statement' }
Expand Down
8 changes: 7 additions & 1 deletion app/services/bulkrax/csv_template/explanation_builder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def build_explanations(header_row)
def build_explanation(column)
mapping_key = @service.mapping_manager.mapped_to_key(column)

column_description = @descriptor.find_description_for(column)
column_description = source_identifier_description(column) || @descriptor.find_description_for(column)
controlled_vocab_info = controlled_vocab_text(mapping_key)
split_info = split_text(mapping_key, controlled_vocab_info)

Expand All @@ -34,6 +34,12 @@ def build_explanation(column)
components.join("\n")
end

def source_identifier_description(column)
return unless @service.mapping_manager.mapped_to_key(column) == 'source_identifier'
return if Bulkrax.fill_in_blank_source_identifiers.blank?
"Will be auto-generated if left blank.\nProviding one allows round-tripping and deduplication across imports."
end

def controlled_vocab_text(field_name)
vocab_terms = @service.field_analyzer.controlled_vocab_terms
return unless vocab_terms.include?(field_name) || field_name == 'based_near'
Expand Down
2 changes: 2 additions & 0 deletions app/validators/bulkrax/csv_row/duplicate_identifier.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ module CsvRow
module DuplicateIdentifier
def self.call(record, row_index, context)
source_id = record[:source_identifier]
return if source_id.blank? && Bulkrax.fill_in_blank_source_identifiers.present?

source_id_label = context[:source_identifier] || 'source_identifier'
first_row = context[:seen_ids][source_id]

Expand Down
52 changes: 52 additions & 0 deletions spec/services/bulkrax/csv_template/csv_parser_template_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,58 @@
expect(result[:unrecognized].keys).to include('source_idenifier', 'titel', 'creater')
end
end

context 'source_identifier generation' do
context 'when fill_in_blank_source_identifiers is not configured' do
before { allow(Bulkrax).to receive(:fill_in_blank_source_identifiers).and_return(nil) }

it 'does not include source_identifier in missingRequired when the column is present' do
result = described_class.validate_csv(csv_file: csv_file, zip_file: nil)
source_id_missing = result[:missingRequired].any? { |h| h[:field].to_s == 'source_identifier' }
expect(source_id_missing).to be false
end

it 'treats a missing source_identifier column as an error' do
csv_without_source_id = Tempfile.new(['no_source_id', '.csv'])
csv_without_source_id.write("title,model\nTest Work,GenericWork\n")
csv_without_source_id.rewind

result = described_class.validate_csv(csv_file: csv_without_source_id, zip_file: nil)
expect(result[:isValid]).to be false
expect(result[:missingRequired]).to include(a_hash_including(field: 'source_identifier'))
ensure
csv_without_source_id.close
csv_without_source_id.unlink
end
end

context 'when fill_in_blank_source_identifiers is configured' do
before do
allow(Bulkrax).to receive(:fill_in_blank_source_identifiers)
.and_return(->(_parser, _index) { SecureRandom.uuid })
end

it 'does not include source_identifier in missingRequired even when the column is present' do
result = described_class.validate_csv(csv_file: csv_file, zip_file: nil)
source_id_missing = result[:missingRequired].any? { |h| h[:field].to_s == 'source_identifier' }
expect(source_id_missing).to be false
end

it 'does not treat a missing source_identifier column as an error' do
csv_without_source_id = Tempfile.new(['no_source_id', '.csv'])
csv_without_source_id.write("title,model\nTest Work,GenericWork\n")
csv_without_source_id.rewind

result = described_class.validate_csv(csv_file: csv_without_source_id, zip_file: nil)
expect(result[:isValid]).to be true
source_id_missing = result[:missingRequired].any? { |h| h[:field].to_s == 'source_identifier' }
expect(source_id_missing).to be false
ensure
csv_without_source_id.close
csv_without_source_id.unlink
end
end
end
end

describe 'TemplateContext' do
Expand Down
23 changes: 23 additions & 0 deletions spec/validators/bulkrax/csv_row/duplicate_identifier_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,27 @@ def make_record(source_id)
expect(error[:row]).to eq(3)
expect(error[:value]).to eq('work1')
end

context 'when fill_in_blank_source_identifiers is configured' do
before do
allow(Bulkrax).to receive(:fill_in_blank_source_identifiers)
.and_return(->(_parser, _index) { SecureRandom.uuid })
end

it 'skips the duplicate check for blank source_identifiers' do
context = make_context
described_class.call(make_record(nil), 2, context)
described_class.call(make_record(nil), 3, context)

expect(context[:errors]).to be_empty
end

it 'still records an error for duplicate non-blank identifiers' do
context = make_context
described_class.call(make_record('work1'), 2, context)
described_class.call(make_record('work1'), 3, context)

expect(context[:errors].length).to eq(1)
end
end
end
Loading