diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb index f705c35a..cf040de6 100644 --- a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +++ b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb @@ -76,9 +76,16 @@ def validate_csv(csv_file:, zip_file: nil, admin_set_id: nil) # rubocop:disable file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id) # 10. Item hierarchy for UI display - collections, works, file_sets = extract_validation_items(csv_data) + collections, works, file_sets = extract_validation_items(csv_data, all_ids) # 11. Assemble result + source_id_missing = !headers.map(&:to_s).include?(source_id_key.to_s) + if source_id_missing && Bulkrax.fill_in_blank_source_identifiers.blank? + all_models.each do |model| + missing_required << { model: model, field: source_id_key.to_s } + end + end + row_errors = validator_context[:errors] has_errors = missing_required.any? || headers.blank? || csv_data.empty? || file_validator.missing_files.any? || row_errors.any? @@ -189,14 +196,14 @@ def resolve_parent_split_pattern(mappings) split_val end - def extract_validation_items(csv_data) # rubocop:disable Metrics/MethodLength + def extract_validation_items(csv_data, all_ids = Set.new) # rubocop:disable Metrics/MethodLength child_to_parents = build_child_to_parents_map(csv_data) collections = [] works = [] file_sets = [] csv_data.each do |item| - categorise_validation_item(item, child_to_parents, collections, works, file_sets) + categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets) end [collections, works, file_sets] @@ -205,6 +212,8 @@ def extract_validation_items(csv_data) # rubocop:disable Metrics/MethodLength def build_child_to_parents_map(csv_data) Hash.new { |h, k| h[k] = [] }.tap do |map| csv_data.each do |item| + next if item[:source_identifier].blank? + parse_relationship_field(item[:children]).each do |child_id| map[child_id] << item[:source_identifier] end @@ -212,25 +221,25 @@ def build_child_to_parents_map(csv_data) end end - def categorise_validation_item(item, child_to_parents, collections, works, file_sets) + def categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets) # rubocop:disable Metrics/ParameterLists item_id = item[:source_identifier] title = item[:raw_row]['title'] || item_id model_str = item[:model].to_s if model_str.casecmp('collection').zero? || model_str.casecmp('collectionresource').zero? - explicit = parse_relationship_field(item[:parent]) - inferred = child_to_parents[item_id] || [] + explicit = resolvable_ids(parse_relationship_field(item[:parent]), all_ids) + inferred = resolvable_ids(child_to_parents[item_id] || [], all_ids) collections << { id: item_id, title: title, type: 'collection', parentIds: (explicit + inferred).uniq, - childIds: parse_relationship_field(item[:children]) } + childIds: resolvable_ids(parse_relationship_field(item[:children]), all_ids) } elsif model_str.casecmp('fileset').zero? || model_str.casecmp('hyrax::fileset').zero? file_sets << { id: item_id, title: title, type: 'file_set' } else - explicit = parse_relationship_field(item[:parent]) - inferred = child_to_parents[item_id] || [] + explicit = resolvable_ids(parse_relationship_field(item[:parent]), all_ids) + inferred = resolvable_ids(child_to_parents[item_id] || [], all_ids) works << { id: item_id, title: title, type: 'work', parentIds: (explicit + inferred).uniq, - childIds: parse_relationship_field(item[:children]) } + childIds: resolvable_ids(parse_relationship_field(item[:children]), all_ids) } end end @@ -239,6 +248,10 @@ def parse_relationship_field(value) value.to_s.split('|').map(&:strip).reject(&:blank?) end + def resolvable_ids(ids, all_ids) + ids.select { |id| all_ids.include?(id) } + end + def apply_rights_statement_validation_override!(result, missing_required) only_rights = missing_required.present? && missing_required.all? { |h| h[:field].to_s == 'rights_statement' } diff --git a/app/services/bulkrax/csv_template/explanation_builder.rb b/app/services/bulkrax/csv_template/explanation_builder.rb index 7ecf08dc..2d64616e 100644 --- a/app/services/bulkrax/csv_template/explanation_builder.rb +++ b/app/services/bulkrax/csv_template/explanation_builder.rb @@ -21,7 +21,7 @@ def build_explanations(header_row) def build_explanation(column) mapping_key = @service.mapping_manager.mapped_to_key(column) - column_description = @descriptor.find_description_for(column) + column_description = source_identifier_description(column) || @descriptor.find_description_for(column) controlled_vocab_info = controlled_vocab_text(mapping_key) split_info = split_text(mapping_key, controlled_vocab_info) @@ -34,6 +34,12 @@ def build_explanation(column) components.join("\n") end + def source_identifier_description(column) + return unless @service.mapping_manager.mapped_to_key(column) == 'source_identifier' + return if Bulkrax.fill_in_blank_source_identifiers.blank? + "Will be auto-generated if left blank.\nProviding one allows round-tripping and deduplication across imports." + end + def controlled_vocab_text(field_name) vocab_terms = @service.field_analyzer.controlled_vocab_terms return unless vocab_terms.include?(field_name) || field_name == 'based_near' diff --git a/app/validators/bulkrax/csv_row/duplicate_identifier.rb b/app/validators/bulkrax/csv_row/duplicate_identifier.rb index d555731f..8aea9c85 100644 --- a/app/validators/bulkrax/csv_row/duplicate_identifier.rb +++ b/app/validators/bulkrax/csv_row/duplicate_identifier.rb @@ -8,6 +8,8 @@ module CsvRow module DuplicateIdentifier def self.call(record, row_index, context) source_id = record[:source_identifier] + return if source_id.blank? && Bulkrax.fill_in_blank_source_identifiers.present? + source_id_label = context[:source_identifier] || 'source_identifier' first_row = context[:seen_ids][source_id] diff --git a/spec/services/bulkrax/csv_template/csv_parser_template_spec.rb b/spec/services/bulkrax/csv_template/csv_parser_template_spec.rb index bfdd8c88..19f05e69 100644 --- a/spec/services/bulkrax/csv_template/csv_parser_template_spec.rb +++ b/spec/services/bulkrax/csv_template/csv_parser_template_spec.rb @@ -158,6 +158,58 @@ expect(result[:unrecognized].keys).to include('source_idenifier', 'titel', 'creater') end end + + context 'source_identifier generation' do + context 'when fill_in_blank_source_identifiers is not configured' do + before { allow(Bulkrax).to receive(:fill_in_blank_source_identifiers).and_return(nil) } + + it 'does not include source_identifier in missingRequired when the column is present' do + result = described_class.validate_csv(csv_file: csv_file, zip_file: nil) + source_id_missing = result[:missingRequired].any? { |h| h[:field].to_s == 'source_identifier' } + expect(source_id_missing).to be false + end + + it 'treats a missing source_identifier column as an error' do + csv_without_source_id = Tempfile.new(['no_source_id', '.csv']) + csv_without_source_id.write("title,model\nTest Work,GenericWork\n") + csv_without_source_id.rewind + + result = described_class.validate_csv(csv_file: csv_without_source_id, zip_file: nil) + expect(result[:isValid]).to be false + expect(result[:missingRequired]).to include(a_hash_including(field: 'source_identifier')) + ensure + csv_without_source_id.close + csv_without_source_id.unlink + end + end + + context 'when fill_in_blank_source_identifiers is configured' do + before do + allow(Bulkrax).to receive(:fill_in_blank_source_identifiers) + .and_return(->(_parser, _index) { SecureRandom.uuid }) + end + + it 'does not include source_identifier in missingRequired even when the column is present' do + result = described_class.validate_csv(csv_file: csv_file, zip_file: nil) + source_id_missing = result[:missingRequired].any? { |h| h[:field].to_s == 'source_identifier' } + expect(source_id_missing).to be false + end + + it 'does not treat a missing source_identifier column as an error' do + csv_without_source_id = Tempfile.new(['no_source_id', '.csv']) + csv_without_source_id.write("title,model\nTest Work,GenericWork\n") + csv_without_source_id.rewind + + result = described_class.validate_csv(csv_file: csv_without_source_id, zip_file: nil) + expect(result[:isValid]).to be true + source_id_missing = result[:missingRequired].any? { |h| h[:field].to_s == 'source_identifier' } + expect(source_id_missing).to be false + ensure + csv_without_source_id.close + csv_without_source_id.unlink + end + end + end end describe 'TemplateContext' do diff --git a/spec/validators/bulkrax/csv_row/duplicate_identifier_spec.rb b/spec/validators/bulkrax/csv_row/duplicate_identifier_spec.rb index 11dd44cb..f346f6a3 100644 --- a/spec/validators/bulkrax/csv_row/duplicate_identifier_spec.rb +++ b/spec/validators/bulkrax/csv_row/duplicate_identifier_spec.rb @@ -29,4 +29,27 @@ def make_record(source_id) expect(error[:row]).to eq(3) expect(error[:value]).to eq('work1') end + + context 'when fill_in_blank_source_identifiers is configured' do + before do + allow(Bulkrax).to receive(:fill_in_blank_source_identifiers) + .and_return(->(_parser, _index) { SecureRandom.uuid }) + end + + it 'skips the duplicate check for blank source_identifiers' do + context = make_context + described_class.call(make_record(nil), 2, context) + described_class.call(make_record(nil), 3, context) + + expect(context[:errors]).to be_empty + end + + it 'still records an error for duplicate non-blank identifiers' do + context = make_context + described_class.call(make_record('work1'), 2, context) + described_class.call(make_record('work1'), 3, context) + + expect(context[:errors].length).to eq(1) + end + end end