diff --git a/app/assets/javascripts/bulkrax/importers_stepper.js b/app/assets/javascripts/bulkrax/importers_stepper.js index 0f7631c1..e2c9d314 100644 --- a/app/assets/javascripts/bulkrax/importers_stepper.js +++ b/app/assets/javascripts/bulkrax/importers_stepper.js @@ -1579,8 +1579,8 @@ }) allItems.forEach(function (item) { - if (item.childrenIds && item.childrenIds.length > 0) { - item.childrenIds.forEach(function (childId) { + if (item.childIds && item.childIds.length > 0) { + item.childIds.forEach(function (childId) { var child = itemMap[childId] if (child) { if (child.parentIds.indexOf(item.id) === -1) { @@ -1591,12 +1591,38 @@ } }) - // Build hierarchy lookup map from normalized parentIds - var hierarchyMap = {} + // Inject stub nodes for existing-record relationships so the tree can + // render them as children/parents even though they are not in the CSV. allItems.forEach(function (item) { + // existingChildIds → the item in the CSV has a child that lives in the repo + ;(item.existingChildIds || []).forEach(function (childId) { + if (!itemMap[childId]) { + itemMap[childId] = { id: childId, title: childId, type: 'existing', parentIds: [], existing: true } + } + if (itemMap[childId].parentIds.indexOf(item.id) === -1) { + itemMap[childId].parentIds.push(item.id) + } + }) + // existingParentIds → the item in the CSV has a parent that lives in the repo + ;(item.existingParentIds || []).forEach(function (parentId) { + if (!itemMap[parentId]) { + itemMap[parentId] = { id: parentId, title: parentId, type: 'existing', parentIds: [], existing: true } + } + if (item.parentIds.indexOf(parentId) === -1) { + item.parentIds.push(parentId) + } + }) + }) + + // Build hierarchy lookup map from normalized parentIds (including existing stubs) + var hierarchyMap = {} + Object.keys(itemMap).forEach(function (id) { + var item = itemMap[id] item.parentIds.forEach(function (parentId) { if (!hierarchyMap[parentId]) { hierarchyMap[parentId] = [] } - hierarchyMap[parentId].push(item) + 
if (hierarchyMap[parentId].indexOf(item) === -1) { + hierarchyMap[parentId].push(item) + } }) }) @@ -1870,9 +1896,23 @@ var orphanWorks = data.works.filter(function (w) { return !w.parentIds || w.parentIds.length === 0 }) + // Existing-record stubs that are top-level parents (not themselves children of anything) + var existingRoots = Object.keys(hierarchyMap) + .filter(function (id) { + var allCsvIds = data.collections.concat(data.works).map(function (i) { return i.id }) + return allCsvIds.indexOf(id) === -1 + }) + .map(function (id) { + return { id: id, title: id, type: 'existing', parentIds: [], existing: true } + }) var visited = new Set() var hierarchyContent = '
' + + existingRoots + .map(function (e) { + return renderTreeItem(e, hierarchyMap, 0, visited) + }) + .join('') + topLevelCollections .map(function (c) { return renderTreeItem(c, hierarchyMap, 0, visited) @@ -1925,8 +1965,9 @@ var children = hierarchyMap[item.id] || [] var hasChildren = children.length > 0 + var isExisting = !!item.existing var icon = item.type === 'collection' ? 'fa-folder' : 'fa-file-o' - var iconColor = item.type === 'collection' ? 'text-primary' : 'text-muted' + var iconColor = isExisting ? 'text-muted' : (item.type === 'collection' ? 'text-primary' : 'text-muted') // Hidden chevron still takes up space (via fixed width in CSS) to prevent icon shifting var chevronClass = hasChildren ? 'tree-chevron' : 'tree-chevron tree-chevron-hidden' var chevron = '' @@ -1944,8 +1985,13 @@ ? ' tabindex="0" role="treeitem" aria-expanded="false"' : '' + var existingBadge = isExisting + ? '' + + t('existing_record_badge') + '' + : '' + var html = - '
' + - '' + + '' + safeTitle + '' + (item.parentIds && item.parentIds.length > 1 @@ -1965,6 +2011,7 @@ ' ' + t('shared_badge') + '' : '') + count + + existingBadge + '
' if (hasChildren) { diff --git a/app/assets/stylesheets/bulkrax/stepper/_summary.scss b/app/assets/stylesheets/bulkrax/stepper/_summary.scss index c59620b8..edfa7995 100644 --- a/app/assets/stylesheets/bulkrax/stepper/_summary.scss +++ b/app/assets/stylesheets/bulkrax/stepper/_summary.scss @@ -100,6 +100,28 @@ $summary-variants: ( color: $color-text-dark; } +.tree-item-existing { + opacity: 0.7; +} + +.tree-label-existing { + color: $color-text-muted; + font-style: italic; +} + +.tree-existing-badge { + display: inline-block; + font-size: 10px; + font-weight: 600; + color: $color-text-muted; + background: $bg-muted; + border-radius: 10px; + padding: 1px 7px; + margin-left: 6px; + white-space: nowrap; + vertical-align: middle; +} + .tree-shared-badge { display: inline-flex; align-items: center; diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb index ad9e9e40..cd3613bd 100644 --- a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +++ b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb @@ -2,7 +2,7 @@ module Bulkrax class CsvParser < ApplicationParser - module CsvValidation # rubocop:disable Metrics/ModuleLength + module CsvValidation extend ActiveSupport::Concern included do @@ -12,265 +12,102 @@ module CsvValidation # rubocop:disable Metrics/ModuleLength end class_methods do + include CsvValidationHelpers + # Validate a CSV (and optional zip) without a persisted Importer record. # # @param csv_file [File, ActionDispatch::Http::UploadedFile, String] path or file object # @param zip_file [File, ActionDispatch::Http::UploadedFile, nil] # @param admin_set_id [String, nil] # @return [Hash] validation result compatible with the guided import UI - def validate_csv(csv_file:, zip_file: nil, admin_set_id: nil) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize - # 1. 
Read headers — use CsvEntry.read_data so header normalisation - # (special-char stripping, symbolisation) is identical to a real import. + def validate_csv(csv_file:, zip_file: nil, admin_set_id: nil) + raw_csv, headers, mapping_manager, mappings, source_id_key, csv_data, field_metadata, field_analyzer = + parse_csv_inputs(csv_file, admin_set_id) + + all_ids = csv_data.map { |r| r[:source_identifier] }.compact.to_set + + header_issues = check_headers(headers, raw_csv, mapping_manager, mappings, field_metadata, field_analyzer) + missing_required = header_issues[:missing_required] + find_record = build_find_record(mapping_manager, mappings) + row_errors = run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record) + file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id) + collections, works, file_sets = extract_validation_items(csv_data, all_ids, find_record) + + append_missing_source_id!(missing_required, headers, source_id_key, csv_data.map { |r| r[:model] }.compact.uniq) + + result = assemble_result( + headers: headers, + missing_required: missing_required, + header_issues: header_issues, + row_errors: row_errors, + csv_data: csv_data, + file_validator: file_validator, + collections: collections, + works: works, + file_sets: file_sets + ) + apply_rights_statement_validation_override!(result, missing_required) + result + end + + private + + # Reads the CSV, resolves mappings, parses rows, and builds field metadata. + # Returns the values needed by all subsequent validation steps. + def parse_csv_inputs(csv_file, admin_set_id) + # Use CsvEntry.read_data so header normalisation is identical to a real import. raw_csv = CsvEntry.read_data(csv_file) headers = raw_csv.headers.map(&:to_s) - # 2. 
Field mappings / column name resolution mapping_manager = CsvTemplate::MappingManager.new mappings = mapping_manager.mappings - source_id_key = resolve_validation_key(mapping_manager, flag: 'source_identifier', default: :source_identifier) - parent_key = resolve_validation_key(mapping_manager, flag: 'related_parents_field_mapping', default: :parents) - children_key = resolve_validation_key(mapping_manager, flag: 'related_children_field_mapping', default: :children) - file_key = resolve_validation_key(mapping_manager, key: 'file', default: :file) + source_id_key = resolve_validation_key(mapping_manager, flag: 'source_identifier', default: :source_identifier) + parent_key = resolve_validation_key(mapping_manager, flag: 'related_parents_field_mapping', default: :parents) + children_key = resolve_validation_key(mapping_manager, flag: 'related_children_field_mapping', default: :children) + file_key = resolve_validation_key(mapping_manager, key: 'file', default: :file) - # 3. Parse rows — CsvEntry.read_data already filters blank rows and - # returns symbol-keyed rows (same as a real import). - csv_data = parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key) - - # 4. Field metadata + csv_data = parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key) all_models = csv_data.map { |r| r[:model] }.compact.uniq field_analyzer = CsvTemplate::FieldAnalyzer.new(mappings, admin_set_id) field_metadata = build_validation_field_metadata(all_models, field_analyzer) - # 5. Valid-header set (drives unrecognised-header detection) - valid_headers = build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata) - - # 6. Suffixed variants seen in this specific CSV (e.g. 
title_1, creator_2) - suffixed_headers = headers.select { |h| h.match?(/_\d+\z/) } - valid_headers = (valid_headers + suffixed_headers).uniq + [raw_csv, headers, mapping_manager, mappings, source_id_key, csv_data, field_metadata, field_analyzer] + end - # 7. Header-level checks - missing_required = find_missing_required_headers(headers, field_metadata, mapping_manager) - unrecognized = find_unrecognized_validation_headers(headers, valid_headers) - empty_columns = find_empty_column_positions(headers, raw_csv) + # Runs all header-level checks and returns a hash of results. + def check_headers(headers, raw_csv, mapping_manager, mappings, field_metadata, field_analyzer) # rubocop:disable Metrics/ParameterLists + all_models = field_metadata.keys + valid_headers = build_valid_validation_headers(mapping_manager, field_analyzer, + all_models, mappings, field_metadata) + suffixed = headers.select { |h| h.match?(/_\d+\z/) } + valid_headers = (valid_headers + suffixed).uniq + + { + missing_required: find_missing_required_headers(headers, field_metadata, mapping_manager), + unrecognized: find_unrecognized_validation_headers(headers, valid_headers), + empty_columns: find_empty_column_positions(headers, raw_csv) + } + end - # 8. Row-level validators - parent_split = resolve_parent_split_pattern(mappings) - all_ids = csv_data.map { |r| r[:source_identifier] }.compact.to_set - validator_context = { + # Runs all registered row validators and returns the collected errors. 
+ def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record) # rubocop:disable Metrics/ParameterLists + context = { errors: [], warnings: [], seen_ids: {}, all_ids: all_ids, source_identifier: source_id_key.to_s, - parent_split_pattern: parent_split, + parent_split_pattern: resolve_parent_split_pattern(mappings), mappings: mappings, - field_metadata: field_metadata + field_metadata: field_metadata, + find_record_by_source_identifier: find_record } - csv_data.each_with_index do |record, index| row_number = index + 2 # 1-indexed, plus header row - Bulkrax.csv_row_validators.each { |v| v.call(record, row_number, validator_context) } - end - - # 9. File validation - file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id) - - # 10. Item hierarchy for UI display - collections, works, file_sets = extract_validation_items(csv_data, all_ids) - - # 11. Assemble result - source_id_missing = !headers.map(&:to_s).include?(source_id_key.to_s) - if source_id_missing && Bulkrax.fill_in_blank_source_identifiers.blank? - all_models.each do |model| - missing_required << { model: model, field: source_id_key.to_s } - end - end - - row_errors = validator_context[:errors] - has_errors = missing_required.any? || headers.blank? || csv_data.empty? || - file_validator.missing_files.any? || row_errors.any? - has_warnings = unrecognized.any? || empty_columns.any? || file_validator.possible_missing_files? - - result = { - headers: headers, - missingRequired: missing_required, - unrecognized: unrecognized, - emptyColumns: empty_columns, - rowCount: csv_data.length, - isValid: !has_errors, - hasWarnings: has_warnings, - rowErrors: row_errors, - collections: collections, - works: works, - fileSets: file_sets, - totalItems: csv_data.length, - fileReferences: file_validator.count_references, - missingFiles: file_validator.missing_files, - foundFiles: file_validator.found_files_count, - zipIncluded: file_validator.zip_included? 
- } - - apply_rights_statement_validation_override!(result, missing_required) - result - end - - private - - # Resolve a symbol key from mappings for use as a record hash key. - # Returns a Symbol matching the parser's symbol-keyed record hash. - def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:) - options = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s) - options.first&.to_sym || default - end - - # Parse rows from a CsvEntry.read_data result into the canonical record shape. - # CsvEntry.read_data returns CSV::Row objects with symbol headers; blank rows - # are already filtered by CsvWrapper. - def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key) - raw_csv.map do |row| - # CSV::Row#to_h converts symbol headers → string-keyed hash - row_hash = row.to_h.transform_keys(&:to_s) - { - source_identifier: row[source_id_key], - model: row[:model], - parent: row[parent_key], - children: row[children_key], - file: row[file_key], - raw_row: row_hash - } - end - rescue StandardError => e - Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}") - [] - end - - def build_validation_field_metadata(all_models, field_analyzer) - all_models.each_with_object({}) do |model, hash| - field_list = field_analyzer.find_or_create_field_list_for(model_name: model) - hash[model] = { - properties: field_list.dig(model, 'properties') || [], - required_terms: field_list.dig(model, 'required_terms') || [], - controlled_vocab_terms: field_list.dig(model, 'controlled_vocab_terms') || [] - } - end - end - - def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata) - svc = ValidationContext.new( - mapping_manager: mapping_manager, - field_analyzer: field_analyzer, - all_models: all_models, - mappings: mappings - ) - all_cols = CsvTemplate::ColumnBuilder.new(svc).all_columns - all_cols - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES - rescue 
StandardError => e - Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}") - standard = %w[model source_identifier parent parents file] - model_fields = field_metadata.values.flat_map { |m| m[:properties] } - (standard + model_fields).uniq - end - - def find_missing_required_headers(headers, field_metadata, mapping_manager) - csv_keys = headers.map { |h| mapping_manager.mapped_to_key(h).sub(/_\d+\z/, '') }.uniq - missing = [] - field_metadata.each do |model, meta| - (meta[:required_terms] || []).each do |field| - missing << { model: model, field: field } unless csv_keys.include?(field) - end - end - missing.uniq - end - - def find_unrecognized_validation_headers(headers, valid_headers) - checker = DidYouMean::SpellChecker.new(dictionary: valid_headers) - headers - .reject { |h| h.blank? || valid_headers.include?(h) || valid_headers.include?(h.sub(/_\d+\z/, '')) } - .index_with { |h| checker.correct(h).first } - end - - def find_empty_column_positions(headers, raw_csv) - headers.each_with_index.filter_map do |h, i| - next if h.present? - has_data = raw_csv.any? { |row| row.fields[i].present? } - i + 1 if has_data + Bulkrax.csv_row_validators.each { |v| v.call(record, row_number, context) } end - end - - def resolve_parent_split_pattern(mappings) - split_val = mappings.dig('parents', 'split') || mappings.dig(:parents, :split) - return nil if split_val.blank? 
- return Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON if split_val == true - - split_val - end - - def extract_validation_items(csv_data, all_ids = Set.new) # rubocop:disable Metrics/MethodLength - child_to_parents = build_child_to_parents_map(csv_data) - collections = [] - works = [] - file_sets = [] - - csv_data.each do |item| - categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets) - end - - [collections, works, file_sets] - end - - def build_child_to_parents_map(csv_data) - Hash.new { |h, k| h[k] = [] }.tap do |map| - csv_data.each do |item| - next if item[:source_identifier].blank? - - parse_relationship_field(item[:children]).each do |child_id| - map[child_id] << item[:source_identifier] - end - end - end - end - - def categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets) # rubocop:disable Metrics/ParameterLists - item_id = item[:source_identifier] - title = item[:raw_row]['title'] || item_id - model_str = item[:model].to_s - - if model_str.casecmp('collection').zero? || model_str.casecmp('collectionresource').zero? - explicit = resolvable_ids(parse_relationship_field(item[:parent]), all_ids) - inferred = resolvable_ids(child_to_parents[item_id] || [], all_ids) - collections << { id: item_id, title: title, type: 'collection', - parentIds: (explicit + inferred).uniq, - childIds: resolvable_ids(parse_relationship_field(item[:children]), all_ids) } - elsif model_str.casecmp('fileset').zero? || model_str.casecmp('hyrax::fileset').zero? 
- file_sets << { id: item_id, title: title, type: 'file_set' } - else - explicit = resolvable_ids(parse_relationship_field(item[:parent]), all_ids) - inferred = resolvable_ids(child_to_parents[item_id] || [], all_ids) - works << { id: item_id, title: title, type: 'work', - parentIds: (explicit + inferred).uniq, - childIds: resolvable_ids(parse_relationship_field(item[:children]), all_ids) } - end - end - - def parse_relationship_field(value) - return [] if value.blank? - value.to_s.split('|').map(&:strip).reject(&:blank?) - end - - def resolvable_ids(ids, all_ids) - ids.select { |id| all_ids.include?(id) } - end - - def apply_rights_statement_validation_override!(result, missing_required) - only_rights = missing_required.present? && - missing_required.all? { |h| h[:field].to_s == 'rights_statement' } - return unless only_rights && !result[:isValid] - return if result[:headers].blank? - return if result[:missingFiles]&.any? - - result[:isValid] = true - result[:hasWarnings] = true + context[:errors] end end end diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb new file mode 100644 index 00000000..5630200d --- /dev/null +++ b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb @@ -0,0 +1,181 @@ +# frozen_string_literal: true + +module Bulkrax + class CsvParser < ApplicationParser + # Private helper methods for CsvValidation. + module CsvValidationHelpers # rubocop:disable Metrics/ModuleLength + include CsvValidationHierarchy + + # Resolve a symbol key from mappings for use as a record hash key. + # Returns a Symbol matching the parser's symbol-keyed record hash. + def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:) + options = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s) + options.first&.to_sym || default + end + + # Parse rows from a CsvEntry.read_data result into the canonical record shape. 
+ # CsvEntry.read_data returns CSV::Row objects with symbol headers; blank rows + # are already filtered by CsvWrapper. + def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key) + raw_csv.map do |row| + # CSV::Row#to_h converts symbol headers → string-keyed hash + row_hash = row.to_h.transform_keys(&:to_s) + { + source_identifier: row[source_id_key], + model: row[:model], + parent: row[parent_key], + children: row[children_key], + file: row[file_key], + raw_row: row_hash + } + end + rescue StandardError => e + Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}") + [] + end + + def build_validation_field_metadata(all_models, field_analyzer) + all_models.each_with_object({}) do |model, hash| + field_list = field_analyzer.find_or_create_field_list_for(model_name: model) + hash[model] = { + properties: field_list.dig(model, 'properties') || [], + required_terms: field_list.dig(model, 'required_terms') || [], + controlled_vocab_terms: field_list.dig(model, 'controlled_vocab_terms') || [] + } + end + end + + def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata) + svc = ValidationContext.new( + mapping_manager: mapping_manager, + field_analyzer: field_analyzer, + all_models: all_models, + mappings: mappings + ) + all_cols = CsvTemplate::ColumnBuilder.new(svc).all_columns + all_cols - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES + rescue StandardError => e + Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}") + standard = %w[model source_identifier parent parents file] + model_fields = field_metadata.values.flat_map { |m| m[:properties] } + (standard + model_fields).uniq + end + + def find_missing_required_headers(headers, field_metadata, mapping_manager) + csv_keys = headers.map { |h| mapping_manager.mapped_to_key(h).sub(/_\d+\z/, '') }.uniq + missing = [] + field_metadata.each do |model, meta| + (meta[:required_terms] || 
[]).each do |field| + missing << { model: model, field: field } unless csv_keys.include?(field) + end + end + missing.uniq + end + + def find_unrecognized_validation_headers(headers, valid_headers) + checker = DidYouMean::SpellChecker.new(dictionary: valid_headers) + headers + .reject { |h| h.blank? || valid_headers.include?(h) || valid_headers.include?(h.sub(/_\d+\z/, '')) } + .index_with { |h| checker.correct(h).first } + end + + def find_empty_column_positions(headers, raw_csv) + headers.each_with_index.filter_map do |h, i| + next if h.present? + has_data = raw_csv.any? { |row| row.fields[i].present? } + i + 1 if has_data + end + end + + # Adds a missing source_identifier entry to missing_required when the column + # is absent and fill_in_blank_source_identifiers is not configured. + def append_missing_source_id!(missing_required, headers, source_id_key, all_models) + return if headers.map(&:to_s).include?(source_id_key.to_s) + return if Bulkrax.fill_in_blank_source_identifiers.present? + + all_models.each { |model| missing_required << { model: model, field: source_id_key.to_s } } + end + + def apply_rights_statement_validation_override!(result, missing_required) + only_rights = missing_required.present? && + missing_required.all? { |h| h[:field].to_s == 'rights_statement' } + return unless only_rights && !result[:isValid] + return if result[:headers].blank? + return if result[:missingFiles]&.any? + + result[:isValid] = true + result[:hasWarnings] = true + end + + # Assembles the final result hash returned to the guided import UI. + def assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:) # rubocop:disable Metrics/ParameterLists + has_errors = missing_required.any? || headers.blank? || csv_data.empty? || + file_validator.missing_files.any? || row_errors.any? + has_warnings = header_issues[:unrecognized].any? || header_issues[:empty_columns].any? 
|| + file_validator.possible_missing_files? + + { + headers: headers, + missingRequired: missing_required, + unrecognized: header_issues[:unrecognized], + emptyColumns: header_issues[:empty_columns], + rowCount: csv_data.length, + isValid: !has_errors, + hasWarnings: has_warnings, + rowErrors: row_errors, + collections: collections, + works: works, + fileSets: file_sets, + totalItems: csv_data.length, + fileReferences: file_validator.count_references, + missingFiles: file_validator.missing_files, + foundFiles: file_validator.found_files_count, + zipIncluded: file_validator.zip_included? + } + end + + # Builds the find_record lambda used by row validators and hierarchy extraction. + def build_find_record(mapping_manager, mappings) + work_identifier = mapping_manager.resolve_column_name(flag: 'source_identifier', default: 'source').first&.to_s || 'source' + work_identifier_search = Array.wrap(mappings.dig(work_identifier, 'search_field')).first&.to_s || + "#{work_identifier}_sim" + ->(id) { find_record_by_source_identifier(id, work_identifier, work_identifier_search) } + end + + # Attempt to locate an existing repository record by its identifier. + # The identifier may be a Bulkrax source_identifier or a repository object ID. + # This mimics the find behavior of the actual import process, which checks for existing records to determine whether to create or update. + # Since we don't have the full importer context here, we check both the Entry model and the repository directly. + # + # @param identifier [String] + # @param work_identifier [String] the source_identifier property name (e.g. "source") + # @param work_identifier_search [String] the Solr field for source_identifier (e.g. "source_sim") + # @return [Boolean] true if a matching Entry or repository object is found + def find_record_by_source_identifier(identifier, work_identifier, work_identifier_search) + return false if identifier.blank? 
+ + return true if Entry.exists?(identifier: identifier, importerexporter_type: 'Bulkrax::Importer') + return true if Bulkrax.object_factory.find_or_nil(identifier).present? + + [Bulkrax.collection_model_class, *Bulkrax.curation_concerns].any? do |klass| + Bulkrax.object_factory.search_by_property( + value: identifier, + klass: klass, + search_field: work_identifier_search, + name_field: work_identifier + ).present? + end + rescue StandardError + false + end + + def resolve_parent_split_pattern(mappings) + split_val = mappings.dig('parents', 'split') || mappings.dig(:parents, :split) + return nil if split_val.blank? + return Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON if split_val == true + + split_val + end + end + end +end diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb new file mode 100644 index 00000000..69188f0b --- /dev/null +++ b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb @@ -0,0 +1,81 @@ +# frozen_string_literal: true + +module Bulkrax + class CsvParser < ApplicationParser + # Hierarchy-building helpers for CsvValidation. Handles extracting and + # categorising items from parsed CSV data for the guided import tree view. + module CsvValidationHierarchy + def extract_validation_items(csv_data, all_ids = Set.new, find_record = nil) + child_to_parents = build_child_to_parents_map(csv_data) + collections = [] + works = [] + file_sets = [] + + csv_data.each do |item| + categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets, find_record) + end + + [collections, works, file_sets] + end + + def build_child_to_parents_map(csv_data) + Hash.new { |h, k| h[k] = [] }.tap do |map| + csv_data.each do |item| + next if item[:source_identifier].blank? 
+ + parse_relationship_field(item[:children]).each do |child_id| + map[child_id] << item[:source_identifier] + end + end + end + end + + def categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets, find_record = nil) # rubocop:disable Metrics/ParameterLists + item_id = item[:source_identifier] + model_str = item[:model].to_s + + if model_str.casecmp('collection').zero? || model_str.casecmp('collectionresource').zero? + collections << build_item_hash(item, child_to_parents, all_ids, find_record, type: 'collection') + elsif model_str.casecmp('fileset').zero? || model_str.casecmp('hyrax::fileset').zero? + file_sets << { id: item_id, title: item[:raw_row]['title'] || item_id, type: 'file_set' } + else + works << build_item_hash(item, child_to_parents, all_ids, find_record, type: 'work') + end + end + + def build_item_hash(item, child_to_parents, all_ids, find_record, type:) + item_id = item[:source_identifier] + title = item[:raw_row]['title'] || item_id + parents = parse_relationship_field(item[:parent]) + children = parse_relationship_field(item[:children]) + + { + id: item_id, + title: title, + type: type, + parentIds: (resolvable_ids(parents, all_ids) + resolvable_ids(child_to_parents[item_id] || [], all_ids)).uniq, + childIds: resolvable_ids(children, all_ids), + existingParentIds: external_ids(parents, all_ids, find_record), + existingChildIds: external_ids(children, all_ids, find_record) + } + end + + def parse_relationship_field(value) + return [] if value.blank? + value.to_s.split('|').map(&:strip).reject(&:blank?) + end + + def resolvable_ids(ids, all_ids) + ids.select { |id| all_ids.include?(id) } + end + + # Returns ids from the list that are NOT in the CSV but exist in the repository. + def external_ids(ids, all_ids, find_record) + return [] if find_record.nil? 
+ + ids.reject { |id| all_ids.include?(id) } + .select { |id| find_record.call(id) } + end + end + end +end diff --git a/app/validators/bulkrax/csv_row/child_reference.rb b/app/validators/bulkrax/csv_row/child_reference.rb new file mode 100644 index 00000000..4a464ae5 --- /dev/null +++ b/app/validators/bulkrax/csv_row/child_reference.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module Bulkrax + module CsvRow + ## + # Validates that any child references in a row point to source identifiers + # that exist either elsewhere in the same CSV or as existing repository records. + # Uses context[:all_ids] (Set of all source identifiers) to validate references + # within the CSV, and context[:find_record_by_source_identifier] (callable) to + # look up existing records in the same way the importer does at runtime. + # Skips validation when all_ids is empty and fill_in_blank_source_identifiers is + # configured, since generated identifiers cannot be cross-referenced at validation time. + module ChildReference + def self.call(record, row_index, context) + children = record[:children] + return if children.blank? + + all_ids = context[:all_ids] + return if all_ids.empty? && Bulkrax.fill_in_blank_source_identifiers.present? + + find_record = context[:find_record_by_source_identifier] + child_ids = children.to_s.split('|').map(&:strip).reject(&:blank?) 
+ + child_ids.each do |child_id| + next if all_ids.include?(child_id) + next if find_record&.call(child_id) + + context[:errors] << { + row: row_index, + source_identifier: record[:source_identifier], + severity: 'error', + category: 'invalid_child_reference', + column: 'children', + value: child_id, + message: I18n.t('bulkrax.importer.guided_import.validation.child_reference_validator.errors.message', + value: child_id, + field: 'source_identifier'), + suggestion: I18n.t('bulkrax.importer.guided_import.validation.child_reference_validator.errors.suggestion') + } + end + end + end + end +end diff --git a/app/validators/bulkrax/csv_row/parent_reference.rb b/app/validators/bulkrax/csv_row/parent_reference.rb index fd64f410..f1e6a4f8 100644 --- a/app/validators/bulkrax/csv_row/parent_reference.rb +++ b/app/validators/bulkrax/csv_row/parent_reference.rb @@ -4,8 +4,10 @@ module Bulkrax module CsvRow ## # Validates that any parent references in a row point to source identifiers - # that exist elsewhere in the same CSV. - # Uses context[:all_ids] (Set of all source identifiers) to validate references. + # that exist either elsewhere in the same CSV or as existing repository records. + # Uses context[:all_ids] (Set of all source identifiers) to validate references + # within the CSV, and context[:find_record_by_source_identifier] (callable) to + # look up existing records in the same way the importer does at runtime. # Uses context[:parent_split_pattern] (String/Regexp, may be nil) for multi-value splitting. module ParentReference def self.call(record, row_index, context) @@ -14,6 +16,7 @@ def self.call(record, row_index, context) all_ids = context[:all_ids] split_pattern = context[:parent_split_pattern] + find_record = context[:find_record_by_source_identifier] parent_ids = if split_pattern parents.to_s.split(split_pattern).map(&:strip).reject(&:blank?) 
@@ -23,6 +26,7 @@ def self.call(record, row_index, context) parent_ids.each do |parent_id| next if all_ids.include?(parent_id) + next if find_record&.call(parent_id) context[:errors] << { row: row_index, diff --git a/config/locales/bulkrax.en.yml b/config/locales/bulkrax.en.yml index a39529f3..f13d6efb 100644 --- a/config/locales/bulkrax.en.yml +++ b/config/locales/bulkrax.en.yml @@ -209,6 +209,8 @@ en: review_total: "%{total} total — %{collections} collections, %{works} works, %{file_sets} file sets" review_visibility: 'Visibility:' server_error: Server error during validation. Please try again or contact support. + existing_record_badge: existing + existing_record_title: This record already exists in the repository and will be linked during import shared_badge: shared starting: Starting... upload_csv_and_zip: CSV + files uploaded separately @@ -340,6 +342,10 @@ en: errors: message: "Referenced parent '%{value}' does not exist as a %{field} in this CSV." suggestion: "Check for typos or add the parent record." + child_reference_validator: + errors: + message: "Referenced child '%{value}' does not exist as a %{field} in this CSV or as an existing repository record." + suggestion: "Check for typos, add the child record to the CSV, or confirm it exists in the repository." 
passed: Validation Passed passed_warnings: Validation Passed with Warnings recognized_fields: 'Recognized fields: %{fields}' diff --git a/lib/bulkrax.rb b/lib/bulkrax.rb index 4f8b10b2..12649e29 100644 --- a/lib/bulkrax.rb +++ b/lib/bulkrax.rb @@ -183,6 +183,7 @@ def csv_row_validators @csv_row_validators ||= [ Bulkrax::CsvRow::DuplicateIdentifier, Bulkrax::CsvRow::ParentReference, + Bulkrax::CsvRow::ChildReference, Bulkrax::CsvRow::RequiredValues, Bulkrax::CsvRow::ControlledVocabulary ] diff --git a/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb b/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb new file mode 100644 index 00000000..7fe7c6d7 --- /dev/null +++ b/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb @@ -0,0 +1,207 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe Bulkrax::CsvParser::CsvValidationHelpers do + # Minimal host object that mixes in the concern under test. + let(:host) do + Object.new.tap { |o| o.extend(described_class) } + end + + # All specs in this file exercise the Valkyrie path. ActiveFedora / Wings is + # not verified to work with this feature, so we configure the factory + # globally for the file rather than repeating it in every context. 
+ before { Bulkrax.object_factory = Bulkrax::ValkyrieObjectFactory } + after { Bulkrax.object_factory = Bulkrax::ObjectFactory } + + describe '#find_record_by_source_identifier' do + let(:work_identifier) { 'source' } + let(:work_identifier_search) { 'source_sim' } + + def find(id) + host.find_record_by_source_identifier(id, work_identifier, work_identifier_search) + end + + context 'when the identifier is blank' do + it 'returns false for nil' do + expect(find(nil)).to be false + end + + it 'returns false for an empty string' do + expect(find('')).to be false + end + end + + context 'when a matching Bulkrax::Entry exists in the database' do + let!(:importer) { FactoryBot.create(:bulkrax_importer) } + let!(:entry) { FactoryBot.create(:bulkrax_csv_entry, identifier: 'entry_id_001', importerexporter: importer) } + + it 'returns true without querying the repository' do + # ValkyrieObjectFactory.find delegates to Hyrax.query_service.find_by; + # the Entry short-circuit means it should never be reached. + expect(Hyrax.query_service).not_to receive(:find_by) + expect(find('entry_id_001')).to be true + end + end + + context 'when no Entry exists but the repository has a matching object by ID' do + # ValkyrieObjectFactory.find_or_nil calls ValkyrieObjectFactory.find which + # calls Hyrax.query_service.find_by(id:). Stub at that level so we verify + # the full Valkyrie call chain. + before do + allow(Hyrax.query_service).to receive(:find_by) + .with(id: 'repo-uuid-001') + .and_return(instance_double(Hyrax::Work)) + end + + it 'returns true' do + expect(find('repo-uuid-001')).to be true + end + + it 'does not fall through to search_by_property' do + expect(Bulkrax::ValkyrieObjectFactory).not_to receive(:search_by_property) + find('repo-uuid-001') + end + end + + context 'when no Entry exists and find_or_nil returns nil' do + # ValkyrieObjectFactory.find raises ObjectNotFoundError when the object + # does not exist; find_or_nil rescues that to nil. 
+ before do + allow(Hyrax.query_service).to receive(:find_by) + .and_raise(Hyrax::ObjectNotFoundError) + end + + context 'when search_by_property finds a match on one of the model classes' do + before do + allow(Bulkrax).to receive(:collection_model_class).and_return(Collection) + allow(Bulkrax).to receive(:curation_concerns).and_return([Work]) + + # Collection misses, Work hits. + allow(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property) + .with(value: 'custom_source_001', klass: Collection, + search_field: work_identifier_search, name_field: work_identifier) + .and_return(nil) + allow(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property) + .with(value: 'custom_source_001', klass: Work, + search_field: work_identifier_search, name_field: work_identifier) + .and_return(instance_double(Hyrax::Work)) + end + + it 'returns true' do + expect(find('custom_source_001')).to be true + end + end + + context 'when search_by_property finds nothing across all model classes' do + before do + allow(Bulkrax).to receive(:collection_model_class).and_return(Collection) + allow(Bulkrax).to receive(:curation_concerns).and_return([Work]) + allow(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).and_return(nil) + end + + it 'returns false' do + expect(find('nonexistent_id')).to be false + end + end + + context 'when search_by_property is called with the correct field arguments' do + let(:work_identifier) { 'local_id' } + let(:work_identifier_search) { 'local_id_sim' } + + before do + allow(Bulkrax).to receive(:collection_model_class).and_return(Collection) + allow(Bulkrax).to receive(:curation_concerns).and_return([]) + allow(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).and_return(nil) + end + + it 'passes the resolved work_identifier and search field through to search_by_property' do + expect(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).with( + value: 'some_local_id', + klass: Collection, + search_field: 
'local_id_sim', + name_field: 'local_id' + ) + find('some_local_id') + end + end + end + + context 'when an exception is raised during lookup' do + before do + allow(Bulkrax::Entry).to receive(:exists?).and_raise(StandardError, 'DB unavailable') + end + + it 'returns false instead of propagating the error' do + expect(find('some_id')).to be false + end + end + end + + describe '#build_find_record' do + let(:mapping_manager) { instance_double(Bulkrax::CsvTemplate::MappingManager) } + let(:mappings) { {} } + + before do + allow(Hyrax.query_service).to receive(:find_by).and_raise(Hyrax::ObjectNotFoundError) + allow(Bulkrax).to receive(:collection_model_class).and_return(Collection) + allow(Bulkrax).to receive(:curation_concerns).and_return([Work]) + allow(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).and_return(nil) + end + + context 'with default source_identifier mapping' do + before do + allow(mapping_manager).to receive(:resolve_column_name) + .with(flag: 'source_identifier', default: 'source') + .and_return(['source']) + end + + it 'returns a callable lambda' do + lam = host.build_find_record(mapping_manager, mappings) + expect(lam).to respond_to(:call) + end + + it 'defaults the search field to _sim when no search_field mapping is present' do + lam = host.build_find_record(mapping_manager, mappings) + expect(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).with( + hash_including(search_field: 'source_sim', name_field: 'source') + ).and_return(nil) + lam.call('anything') + end + end + + context 'when the mapping provides a custom search_field' do + let(:mappings) { { 'local_id' => { 'search_field' => 'local_id_tesim' } } } + + before do + allow(mapping_manager).to receive(:resolve_column_name) + .with(flag: 'source_identifier', default: 'source') + .and_return(['local_id']) + end + + it 'uses the mapped search_field instead of the default _sim suffix' do + lam = host.build_find_record(mapping_manager, mappings) + 
expect(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).with( + hash_including(search_field: 'local_id_tesim', name_field: 'local_id') + ).and_return(nil) + lam.call('anything') + end + end + + context 'when resolve_column_name returns nothing' do + before do + allow(mapping_manager).to receive(:resolve_column_name) + .with(flag: 'source_identifier', default: 'source') + .and_return([]) + end + + it 'falls back to "source" as the work_identifier' do + lam = host.build_find_record(mapping_manager, mappings) + expect(Bulkrax::ValkyrieObjectFactory).to receive(:search_by_property).with( + hash_including(search_field: 'source_sim', name_field: 'source') + ).and_return(nil) + lam.call('anything') + end + end + end +end diff --git a/spec/validators/bulkrax/csv_row/child_reference_spec.rb b/spec/validators/bulkrax/csv_row/child_reference_spec.rb new file mode 100644 index 00000000..d788e7c9 --- /dev/null +++ b/spec/validators/bulkrax/csv_row/child_reference_spec.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe Bulkrax::CsvRow::ChildReference do + def make_context(all_ids: Set.new(%w[col1 work1]), find_record: nil) + { errors: [], warnings: [], all_ids: all_ids, parent_split_pattern: nil, + find_record_by_source_identifier: find_record } + end + + def make_record(children: nil) + { source_identifier: 'col1', model: 'Collection', children: children, raw_row: {} } + end + + it 'adds no error when children field is blank' do + context = make_context + described_class.call(make_record(children: nil), 2, context) + expect(context[:errors]).to be_empty + end + + it 'adds no error when the child exists in the CSV' do + context = make_context + described_class.call(make_record(children: 'work1'), 2, context) + expect(context[:errors]).to be_empty + end + + it 'adds an error when the child does not exist in the CSV' do + context = make_context + described_class.call(make_record(children: 'missing_child'), 2, context) + 
expect(context[:errors].length).to eq(1) + expect(context[:errors].first[:category]).to eq('invalid_child_reference') + expect(context[:errors].first[:value]).to eq('missing_child') + end + + it 'adds an error for each unresolvable id in a pipe-separated list' do + context = make_context + described_class.call(make_record(children: 'work1|missing1|missing2'), 2, context) + expect(context[:errors].length).to eq(2) + expect(context[:errors].map { |e| e[:value] }).to contain_exactly('missing1', 'missing2') + end + + it 'adds no error when the child is not in the CSV but exists as a repository record' do + find_record = ->(id) { id == 'existing_repo_child' } + context = make_context(find_record: find_record) + described_class.call(make_record(children: 'existing_repo_child'), 2, context) + expect(context[:errors]).to be_empty + end + + it 'adds an error when the child is not in the CSV and not found in the repository' do + find_record = ->(_id) { false } + context = make_context(find_record: find_record) + described_class.call(make_record(children: 'truly_missing'), 2, context) + expect(context[:errors].length).to eq(1) + expect(context[:errors].first[:category]).to eq('invalid_child_reference') + end + + it 'resolves mixed pipe-separated ids using both CSV and repository lookup' do + find_record = ->(id) { id == 'repo_child' } + context = make_context(find_record: find_record) + described_class.call(make_record(children: 'work1|repo_child|truly_missing'), 2, context) + expect(context[:errors].length).to eq(1) + expect(context[:errors].first[:value]).to eq('truly_missing') + end + + context 'when fill_in_blank_source_identifiers is configured and all_ids is empty' do + before do + allow(Bulkrax).to receive(:fill_in_blank_source_identifiers) + .and_return(->(_parser, _index) { SecureRandom.uuid }) + end + + it 'skips the check — child ids cannot be validated against generated identifiers' do + context = make_context(all_ids: Set.new) + 
described_class.call(make_record(children: 'bcd123'), 2, context) + expect(context[:errors]).to be_empty + end + end +end diff --git a/spec/validators/bulkrax/csv_row/parent_reference_spec.rb b/spec/validators/bulkrax/csv_row/parent_reference_spec.rb index 53ba10f9..9ff0ae22 100644 --- a/spec/validators/bulkrax/csv_row/parent_reference_spec.rb +++ b/spec/validators/bulkrax/csv_row/parent_reference_spec.rb @@ -3,8 +3,9 @@ require 'rails_helper' RSpec.describe Bulkrax::CsvRow::ParentReference do - def make_context(all_ids: Set.new(%w[col1 work1])) - { errors: [], warnings: [], all_ids: all_ids, parent_split_pattern: nil } + def make_context(all_ids: Set.new(%w[col1 work1]), find_record: nil) + { errors: [], warnings: [], all_ids: all_ids, parent_split_pattern: nil, + find_record_by_source_identifier: find_record } end def make_record(parent: nil) @@ -30,4 +31,19 @@ def make_record(parent: nil) expect(context[:errors].first[:category]).to eq('invalid_parent_reference') expect(context[:errors].first[:value]).to eq('missing_parent') end + + it 'adds no error when the parent is not in the CSV but exists as a repository record' do + find_record = ->(id) { id == 'existing_repo_parent' } + context = make_context(find_record: find_record) + described_class.call(make_record(parent: 'existing_repo_parent'), 2, context) + expect(context[:errors]).to be_empty + end + + it 'adds an error when the parent is not in the CSV and not found in the repository' do + find_record = ->(_id) { false } + context = make_context(find_record: find_record) + described_class.call(make_record(parent: 'truly_missing'), 2, context) + expect(context[:errors].length).to eq(1) + expect(context[:errors].first[:category]).to eq('invalid_parent_reference') + end end