diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb index e4231b39..fc483605 100644 --- a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +++ b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb @@ -112,7 +112,8 @@ def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadat children_column: resolve_relationship_column(mappings, 'related_children_field_mapping', 'children'), mappings: mappings, field_metadata: field_metadata, - find_record_by_source_identifier: find_record + find_record_by_source_identifier: find_record, + relationship_graph: build_relationship_graph(csv_data, mappings) } csv_data.each_with_index do |record, index| row_number = index + 2 # 1-indexed, plus header row diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb index e30b7a21..42168d9d 100644 --- a/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +++ b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb @@ -192,6 +192,64 @@ def resolve_children_split_pattern(mappings) split_val end + + # Builds a graph of { source_identifier => [parent_ids] } from all CSV records. + # Used by CircularReference validator to detect cycles across the whole CSV. + # + # Parent edges are collected from both directions: + # - explicit parent declarations (parents / parents_N columns) + # - inverted child declarations (children / children_N columns), mirroring + # the normalisation done in importers_stepper.js#normalizeRelationships + def build_relationship_graph(csv_data, mappings) + parent_column = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents') + children_column = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children') + parent_suffix = /\A#{Regexp.escape(parent_column)}_\d+\z/ + children_suffix = /\A#{Regexp.escape(children_column)}_\d+\z/ + + graph = build_parent_edges(csv_data, parent_suffix, resolve_parent_split_pattern(mappings)) + invert_child_edges(graph, csv_data, children_suffix, resolve_children_split_pattern(mappings)) + graph + end + + def build_parent_edges(csv_data, suffix_pattern, split_pattern) + csv_data.each_with_object({}) do |record, graph| + id = record[:source_identifier] + next if id.blank? + + base_ids = split_or_single(record[:parent], split_pattern) + suffix_ids = suffixed_values(record[:raw_row], suffix_pattern) + graph[id] = (base_ids + suffix_ids).uniq + end + end + + def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern) + csv_data.each do |record| + id = record[:source_identifier] + next if id.blank? + + child_ids = split_or_single(record[:children], split_pattern) + + suffixed_values(record[:raw_row], suffix_pattern) + child_ids.each do |child_id| + graph[child_id] ||= [] + graph[child_id] << id unless graph[child_id].include?(id) + end + end + end + + def split_or_single(value, split_pattern) + if split_pattern + value.to_s.split(split_pattern).map(&:strip).reject(&:blank?) + elsif value.present? + [value.to_s.strip] + else + [] + end + end + + def suffixed_values(raw_row, suffix_pattern) + raw_row.select { |k, _| k.to_s.match?(suffix_pattern) } + .values.map(&:to_s).map(&:strip).reject(&:blank?) + end end end end diff --git a/app/validators/bulkrax/csv_row/circular_reference.rb b/app/validators/bulkrax/csv_row/circular_reference.rb new file mode 100644 index 00000000..cdcaf185 --- /dev/null +++ b/app/validators/bulkrax/csv_row/circular_reference.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +module Bulkrax + module CsvRow + ## + # Detects circular parent-child relationships in the CSV. + # A circular reference occurs when following the parent chain from any record + # eventually leads back to itself (e.g. A→B→C→A). + # + # The validator builds a directed graph (child → parents) from all records on + # first invocation and caches the set of all record ids involved in any cycle. + # Subsequent per-row calls simply check membership in that set. + # + # Requires context key: + # :relationship_graph – Hash { source_identifier => [parent_ids] } built by + # run_row_validators before iterating rows. + module CircularReference + def self.call(record, row_index, context) + cycle_ids = context[:circular_reference_ids] ||= detect_cycle_ids(context[:relationship_graph] || {}) + return unless cycle_ids.include?(record[:source_identifier]) + + context[:errors] << { + row: row_index, + source_identifier: record[:source_identifier], + severity: 'error', + category: 'circular_reference', + column: 'parents', + value: record[:source_identifier], + message: I18n.t('bulkrax.importer.guided_import.validation.circular_reference_validator.errors.message', + value: record[:source_identifier]), + suggestion: I18n.t('bulkrax.importer.guided_import.validation.circular_reference_validator.errors.suggestion') + } + end + + # Returns the set of all source identifiers that participate in at least one cycle. + # Uses recursive DFS with a per-branch ancestry set to detect back-edges. + def self.detect_cycle_ids(graph) + all_nodes = graph.keys.to_set | graph.values.flatten.to_set + visited = Set.new + cycle_ids = Set.new + + all_nodes.each do |node| + next if visited.include?(node) + dfs(node, graph, visited, [], cycle_ids) + end + + cycle_ids + end + private_class_method :detect_cycle_ids + + def self.dfs(node, graph, visited, ancestors, cycle_ids) # rubocop:disable Metrics/MethodLength + visited.add(node) + ancestors.push(node) + + (graph[node] || []).each do |neighbor| + if ancestors.include?(neighbor) + # Back-edge found: mark every node in the cycle path + cycle_start = ancestors.index(neighbor) + ancestors[cycle_start..].each { |n| cycle_ids.add(n) } + cycle_ids.add(neighbor) + elsif !visited.include?(neighbor) + dfs(neighbor, graph, visited, ancestors, cycle_ids) + end + end + + ancestors.pop + end + private_class_method :dfs + end + end +end diff --git a/config/locales/bulkrax.en.yml b/config/locales/bulkrax.en.yml index f13d6efb..33588ad8 100644 --- a/config/locales/bulkrax.en.yml +++ b/config/locales/bulkrax.en.yml @@ -346,6 +346,10 @@ en: errors: message: "Referenced child '%{value}' does not exist as a %{field} in this CSV." suggestion: "Check for typos or add the child record." + circular_reference_validator: + errors: + message: "'%{value}' is part of a circular parent-child relationship." + suggestion: "Review the parent/child columns and remove the incorrect references." passed: Validation Passed passed_warnings: Validation Passed with Warnings recognized_fields: 'Recognized fields: %{fields}' diff --git a/lib/bulkrax.rb b/lib/bulkrax.rb index 12649e29..c05ad29e 100644 --- a/lib/bulkrax.rb +++ b/lib/bulkrax.rb @@ -184,6 +184,7 @@ def csv_row_validators Bulkrax::CsvRow::DuplicateIdentifier, Bulkrax::CsvRow::ParentReference, Bulkrax::CsvRow::ChildReference, + Bulkrax::CsvRow::CircularReference, Bulkrax::CsvRow::RequiredValues, Bulkrax::CsvRow::ControlledVocabulary ] diff --git a/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb b/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb index 04d6a4f4..5a51719b 100644 --- a/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb +++ b/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb @@ -189,6 +189,93 @@ def find(id) end end + describe '#build_relationship_graph' do + let(:mappings) { {} } + + def record(source_identifier, parent: nil, children: nil, raw_row: {}) + { source_identifier: source_identifier, parent: parent, children: children, raw_row: raw_row } + end + + def graph(csv_data) + host.build_relationship_graph(csv_data, mappings) + end + + context 'with only parent declarations' do + it 'maps each record to its declared parents' do + data = [record('child', parent: 'parent1'), record('parent1')] + expect(graph(data)).to include('child' => ['parent1'], 'parent1' => []) + end + end + + context 'with suffixed parent columns (parents_1, parents_2)' do + it 'includes all suffixed parent values' do + data = [ + record('child', raw_row: { 'parents_1' => 'p1', 'parents_2' => 'p2' }), + record('p1'), + record('p2') + ] + result = graph(data) + expect(result['child']).to contain_exactly('p1', 'p2') + end + end + + context 'with only children declarations' do + it 'inverts children into parent edges on the child records' do + data = [record('parent1', children: 'child1'), record('child1')] + result = graph(data) + expect(result['child1']).to include('parent1') + end + end + + context 'with suffixed children columns (children_1, children_2)' do + it 'inverts all suffixed children into parent edges' do + data = [ + record('parent1', raw_row: { 'children_1' => 'c1', 'children_2' => 'c2' }), + record('c1'), + record('c2') + ] + result = graph(data) + expect(result['c1']).to include('parent1') + expect(result['c2']).to include('parent1') + end + end + + context 'with a cycle declared via children columns (matching rel-circular-ref.csv pattern)' do + # child1 declares children=child2; child2 declares children=child1 + # After inversion: child1 → [child2] and child2 → [child1] + it 'builds a graph that allows cycle detection to flag both nodes' do + data = [ + record('parent1'), + record('parent2'), + record('child1', children: 'child2', raw_row: { 'parents_1' => 'parent1', 'parents_2' => 'parent2' }), + record('child2', children: 'child1') + ] + result = graph(data) + expect(result['child1']).to include('child2') + expect(result['child2']).to include('child1') + end + end + + context 'when a child is already declared as a parent of the same record' do + it 'does not add duplicate edges' do + data = [ + record('a', parent: 'b'), + record('b', children: 'a') + ] + result = graph(data) + expect(result['a'].count('b')).to eq(1) + end + end + + context 'with blank source_identifier' do + it 'skips the record' do + data = [record(nil, parent: 'p1'), record('p1')] + result = graph(data) + expect(result.keys).not_to include(nil) + end + end + end + describe '#build_find_record' do let(:mapping_manager) { instance_double(Bulkrax::CsvTemplate::MappingManager) } let(:mappings) { {} } diff --git a/spec/validators/bulkrax/csv_row/circular_reference_spec.rb b/spec/validators/bulkrax/csv_row/circular_reference_spec.rb new file mode 100644 index 00000000..1772b51d --- /dev/null +++ b/spec/validators/bulkrax/csv_row/circular_reference_spec.rb @@ -0,0 +1,134 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe Bulkrax::CsvRow::CircularReference do + def make_context(graph) + { errors: [], relationship_graph: graph } + end + + def make_record(source_identifier) + { source_identifier: source_identifier } + end + + def call(record, context, row: 2) + described_class.call(record, row, context) + end + + # ─── detect_cycle_ids ──────────────────────────────────────────────────────── + + describe '.detect_cycle_ids (via call)' do + context 'with no relationships' do + it 'produces no errors' do + ctx = make_context('work1' => [], 'work2' => []) + call(make_record('work1'), ctx) + expect(ctx[:errors]).to be_empty + end + end + + context 'with a simple linear chain (no cycle)' do + # col1 ← work1 ← work2 (work1 has parent col1, work2 has parent work1) + it 'produces no errors' do + graph = { 'col1' => [], 'work1' => ['col1'], 'work2' => ['work1'] } + ctx = make_context(graph) + %w[col1 work1 work2].each { |id| call(make_record(id), ctx) } + expect(ctx[:errors]).to be_empty + end + end + + context 'with a direct self-reference (A → A)' do + it 'flags the self-referencing record' do + ctx = make_context('work1' => ['work1']) + call(make_record('work1'), ctx) + expect(ctx[:errors].length).to eq(1) + expect(ctx[:errors].first[:source_identifier]).to eq('work1') + expect(ctx[:errors].first[:category]).to eq('circular_reference') + end + end + + context 'with a two-node cycle (A → B → A)' do + it 'flags both records in the cycle' do + graph = { 'work1' => ['work2'], 'work2' => ['work1'] } + ctx = make_context(graph) + call(make_record('work1'), ctx, row: 2) + call(make_record('work2'), ctx, row: 3) + ids = ctx[:errors].map { |e| e[:source_identifier] } + expect(ids).to contain_exactly('work1', 'work2') + end + end + + context 'with a three-node cycle (A → B → C → A)' do + it 'flags all three records' do + graph = { 'a' => ['c'], 'b' => ['a'], 'c' => ['b'] } + ctx = make_context(graph) + %w[a b c].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) } + ids = ctx[:errors].map { |e| e[:source_identifier] } + expect(ids).to contain_exactly('a', 'b', 'c') + end + end + + context 'with a cycle on a branch (D → E → F → E)' do + it 'flags only the nodes in the cycle, not the entry node' do + # D has parent E; E has parent F; F has parent E → cycle between E and F + graph = { 'd' => ['e'], 'e' => ['f'], 'f' => ['e'] } + ctx = make_context(graph) + %w[d e f].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) } + ids = ctx[:errors].map { |e| e[:source_identifier] } + expect(ids).to include('e', 'f') + expect(ids).not_to include('d') + end + end + + context 'with a cycle and unrelated records' do + it 'does not flag records outside the cycle' do + graph = { 'col1' => [], 'work1' => ['col1'], 'a' => ['b'], 'b' => ['a'] } + ctx = make_context(graph) + %w[col1 work1 a b].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) } + ids = ctx[:errors].map { |e| e[:source_identifier] } + expect(ids).to contain_exactly('a', 'b') + end + end + end + + # ─── caching ───────────────────────────────────────────────────────────────── + + describe 'cycle detection caching' do + it 'only detects cycles once per context and reuses the result' do + graph = { 'a' => ['b'], 'b' => ['a'] } + ctx = make_context(graph) + + expect(described_class).to receive(:detect_cycle_ids).once.and_call_original + + call(make_record('a'), ctx, row: 2) + call(make_record('b'), ctx, row: 3) + end + end + + # ─── error shape ───────────────────────────────────────────────────────────── + + describe 'error message content' do + it 'includes the expected keys' do + ctx = make_context('work1' => ['work1']) + call(make_record('work1'), ctx) + error = ctx[:errors].first + expect(error).to include( + row: 2, + source_identifier: 'work1', + severity: 'error', + category: 'circular_reference' + ) + expect(error[:message]).to be_present + expect(error[:suggestion]).to be_present + end + end + + # ─── empty graph ───────────────────────────────────────────────────────────── + + describe 'when relationship_graph is absent from context' do + it 'does not raise and adds no errors' do + ctx = { errors: [] } + expect { call(make_record('work1'), ctx) }.not_to raise_error + expect(ctx[:errors]).to be_empty + end + end +end