samvera · laritakr · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb
@@ -112,7 +112,8 @@ def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadat
             children_column: resolve_relationship_column(mappings, 'related_children_field_mapping', 'children'),
             mappings: mappings,
             field_metadata: field_metadata,
-            find_record_by_source_identifier: find_record
+            find_record_by_source_identifier: find_record,
+            relationship_graph: build_relationship_graph(csv_data, mappings)
           }
           csv_data.each_with_index do |record, index|
             row_number = index + 2 # 1-indexed, plus header row

diff --git a/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb b/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb
@@ -192,6 +192,64 @@ def resolve_children_split_pattern(mappings)
 
         split_val
       end
+
+      # Builds a graph of { source_identifier => [parent_ids] } from all CSV records.
+      # Used by CircularReference validator to detect cycles across the whole CSV.
+      #
+      # Parent edges are collected from both directions:
+      #   - explicit parent declarations (parents / parents_N columns)
+      #   - inverted child declarations (children / children_N columns), mirroring
+      #     the normalisation done in importers_stepper.js#normalizeRelationships
+      def build_relationship_graph(csv_data, mappings)
+        parent_column   = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
+        children_column = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
+        parent_suffix   = /\A#{Regexp.escape(parent_column)}_\d+\z/
+        children_suffix = /\A#{Regexp.escape(children_column)}_\d+\z/
+
+        graph = build_parent_edges(csv_data, parent_suffix, resolve_parent_split_pattern(mappings))
+        invert_child_edges(graph, csv_data, children_suffix, resolve_children_split_pattern(mappings))
+        graph
+      end
+
+      def build_parent_edges(csv_data, suffix_pattern, split_pattern)
+        csv_data.each_with_object({}) do |record, graph|
+          id = record[:source_identifier]
+          next if id.blank?
+
+          base_ids = split_or_single(record[:parent], split_pattern)
+          suffix_ids = suffixed_values(record[:raw_row], suffix_pattern)
+          graph[id] = (base_ids + suffix_ids).uniq
+        end
+      end
+
+      def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern)
+        csv_data.each do |record|
+          id = record[:source_identifier]
+          next if id.blank?
+
+          child_ids = split_or_single(record[:children], split_pattern) +
+                      suffixed_values(record[:raw_row], suffix_pattern)
+          child_ids.each do |child_id|
+            graph[child_id] ||= []
+            graph[child_id] << id unless graph[child_id].include?(id)
+          end
+        end
+      end
+
+      def split_or_single(value, split_pattern)
+        if split_pattern
+          value.to_s.split(split_pattern).map(&:strip).reject(&:blank?)
+        elsif value.present?
+          [value.to_s.strip]
+        else
+          []
+        end
+      end
+
+      def suffixed_values(raw_row, suffix_pattern)
+        raw_row.select { |k, _| k.to_s.match?(suffix_pattern) }
+               .values.map(&:to_s).map(&:strip).reject(&:blank?)
+      end
     end
   end
 end
diff --git a/app/validators/bulkrax/csv_row/circular_reference.rb b/app/validators/bulkrax/csv_row/circular_reference.rb
@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+
+module Bulkrax
+  module CsvRow
+    ##
+    # Detects circular parent-child relationships in the CSV.
+    # A circular reference occurs when following the parent chain from any record
+    # eventually leads back to itself (e.g. A→B→C→A).
+    #
+    # The validator builds a directed graph (child → parents) from all records on
+    # first invocation and caches the set of all record ids involved in any cycle.
+    # Subsequent per-row calls simply check membership in that set.
+    #
+    # Requires context key:
+    #   :relationship_graph  – Hash { source_identifier => [parent_ids] } built by
+    #                          run_row_validators before iterating rows.
+    module CircularReference
+      def self.call(record, row_index, context)
+        cycle_ids = context[:circular_reference_ids] ||= detect_cycle_ids(context[:relationship_graph] || {})
+        return unless cycle_ids.include?(record[:source_identifier])
+
+        context[:errors] << {
+          row: row_index,
+          source_identifier: record[:source_identifier],
+          severity: 'error',
+          category: 'circular_reference',
+          column: 'parents',
+          value: record[:source_identifier],
+          message: I18n.t('bulkrax.importer.guided_import.validation.circular_reference_validator.errors.message',
+                          value: record[:source_identifier]),
+          suggestion: I18n.t('bulkrax.importer.guided_import.validation.circular_reference_validator.errors.suggestion')
+        }
+      end
+
+      # Returns the set of all source identifiers that participate in at least one cycle.
+      # Uses recursive DFS with a per-branch ancestry set to detect back-edges.
+      def self.detect_cycle_ids(graph)
+        all_nodes = graph.keys.to_set | graph.values.flatten.to_set
+        visited   = Set.new
+        cycle_ids = Set.new
+
+        all_nodes.each do |node|
+          next if visited.include?(node)
+          dfs(node, graph, visited, [], cycle_ids)
+        end
+
+        cycle_ids
+      end
+      private_class_method :detect_cycle_ids
+
+      def self.dfs(node, graph, visited, ancestors, cycle_ids) # rubocop:disable Metrics/MethodLength
+        visited.add(node)
+        ancestors.push(node)
+
+        (graph[node] || []).each do |neighbor|
+          if ancestors.include?(neighbor)
+            # Back-edge found: mark every node in the cycle path
+            cycle_start = ancestors.index(neighbor)
+            ancestors[cycle_start..].each { |n| cycle_ids.add(n) }
+            cycle_ids.add(neighbor)
+          elsif !visited.include?(neighbor)
+            dfs(neighbor, graph, visited, ancestors, cycle_ids)
+          end
+        end
+
+        ancestors.pop
+      end
+      private_class_method :dfs
+    end
+  end
+end
diff --git a/config/locales/bulkrax.en.yml b/config/locales/bulkrax.en.yml
@@ -346,6 +346,10 @@ en:
             errors:
               message: "Referenced child '%{value}' does not exist as a %{field} in this CSV."
               suggestion: "Check for typos or add the child record."
+          circular_reference_validator:
+            errors:
+              message: "'%{value}' is part of a circular parent-child relationship."
+              suggestion: "Review the parent/child columns and remove the incorrect references."
           passed: Validation Passed
           passed_warnings: Validation Passed with Warnings
           recognized_fields: 'Recognized fields: %{fields}'

diff --git a/lib/bulkrax.rb b/lib/bulkrax.rb
@@ -184,6 +184,7 @@ def csv_row_validators
         Bulkrax::CsvRow::DuplicateIdentifier,
         Bulkrax::CsvRow::ParentReference,
         Bulkrax::CsvRow::ChildReference,
+        Bulkrax::CsvRow::CircularReference,
         Bulkrax::CsvRow::RequiredValues,
         Bulkrax::CsvRow::ControlledVocabulary
       ]

diff --git a/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb b/spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb
@@ -189,6 +189,93 @@ def find(id)
     end
   end
 
+  describe '#build_relationship_graph' do
+    let(:mappings) { {} }
+
+    def record(source_identifier, parent: nil, children: nil, raw_row: {})
+      { source_identifier: source_identifier, parent: parent, children: children, raw_row: raw_row }
+    end
+
+    def graph(csv_data)
+      host.build_relationship_graph(csv_data, mappings)
+    end
+
+    context 'with only parent declarations' do
+      it 'maps each record to its declared parents' do
+        data = [record('child', parent: 'parent1'), record('parent1')]
+        expect(graph(data)).to include('child' => ['parent1'], 'parent1' => [])
+      end
+    end
+
+    context 'with suffixed parent columns (parents_1, parents_2)' do
+      it 'includes all suffixed parent values' do
+        data = [
+          record('child', raw_row: { 'parents_1' => 'p1', 'parents_2' => 'p2' }),
+          record('p1'),
+          record('p2')
+        ]
+        result = graph(data)
+        expect(result['child']).to contain_exactly('p1', 'p2')
+      end
+    end
+
+    context 'with only children declarations' do
+      it 'inverts children into parent edges on the child records' do
+        data = [record('parent1', children: 'child1'), record('child1')]
+        result = graph(data)
+        expect(result['child1']).to include('parent1')
+      end
+    end
+
+    context 'with suffixed children columns (children_1, children_2)' do
+      it 'inverts all suffixed children into parent edges' do
+        data = [
+          record('parent1', raw_row: { 'children_1' => 'c1', 'children_2' => 'c2' }),
+          record('c1'),
+          record('c2')
+        ]
+        result = graph(data)
+        expect(result['c1']).to include('parent1')
+        expect(result['c2']).to include('parent1')
+      end
+    end
+
+    context 'with a cycle declared via children columns (matching rel-circular-ref.csv pattern)' do
+      # child1 declares children=child2; child2 declares children=child1
+      # After inversion: child1 → [child2] and child2 → [child1]
+      it 'builds a graph that allows cycle detection to flag both nodes' do
+        data = [
+          record('parent1'),
+          record('parent2'),
+          record('child1', children: 'child2', raw_row: { 'parents_1' => 'parent1', 'parents_2' => 'parent2' }),
+          record('child2', children: 'child1')
+        ]
+        result = graph(data)
+        expect(result['child1']).to include('child2')
+        expect(result['child2']).to include('child1')
+      end
+    end
+
+    context 'when a child is already declared as a parent of the same record' do
+      it 'does not add duplicate edges' do
+        data = [
+          record('a', parent: 'b'),
+          record('b', children: 'a')
+        ]
+        result = graph(data)
+        expect(result['a'].count('b')).to eq(1)
+      end
+    end
+
+    context 'with blank source_identifier' do
+      it 'skips the record' do
+        data = [record(nil, parent: 'p1'), record('p1')]
+        result = graph(data)
+        expect(result.keys).not_to include(nil)
+      end
+    end
+  end
+
   describe '#build_find_record' do
     let(:mapping_manager) { instance_double(Bulkrax::CsvTemplate::MappingManager) }
     let(:mappings)        { {} }

diff --git a/spec/validators/bulkrax/csv_row/circular_reference_spec.rb b/spec/validators/bulkrax/csv_row/circular_reference_spec.rb
@@ -0,0 +1,134 @@
+# frozen_string_literal: true
+
+require 'rails_helper'
+
+RSpec.describe Bulkrax::CsvRow::CircularReference do
+  def make_context(graph)
+    { errors: [], relationship_graph: graph }
+  end
+
+  def make_record(source_identifier)
+    { source_identifier: source_identifier }
+  end
+
+  def call(record, context, row: 2)
+    described_class.call(record, row, context)
+  end
+
+  # ─── detect_cycle_ids ────────────────────────────────────────────────────────
+
+  describe '.detect_cycle_ids (via call)' do
+    context 'with no relationships' do
+      it 'produces no errors' do
+        ctx = make_context('work1' => [], 'work2' => [])
+        call(make_record('work1'), ctx)
+        expect(ctx[:errors]).to be_empty
+      end
+    end
+
+    context 'with a simple linear chain (no cycle)' do
+      # col1 ← work1 ← work2  (work1 has parent col1, work2 has parent work1)
+      it 'produces no errors' do
+        graph = { 'col1' => [], 'work1' => ['col1'], 'work2' => ['work1'] }
+        ctx = make_context(graph)
+        %w[col1 work1 work2].each { |id| call(make_record(id), ctx) }
+        expect(ctx[:errors]).to be_empty
+      end
+    end
+
+    context 'with a direct self-reference (A → A)' do
+      it 'flags the self-referencing record' do
+        ctx = make_context('work1' => ['work1'])
+        call(make_record('work1'), ctx)
+        expect(ctx[:errors].length).to eq(1)
+        expect(ctx[:errors].first[:source_identifier]).to eq('work1')
+        expect(ctx[:errors].first[:category]).to eq('circular_reference')
+      end
+    end
+
+    context 'with a two-node cycle (A → B → A)' do
+      it 'flags both records in the cycle' do
+        graph = { 'work1' => ['work2'], 'work2' => ['work1'] }
+        ctx = make_context(graph)
+        call(make_record('work1'), ctx, row: 2)
+        call(make_record('work2'), ctx, row: 3)
+        ids = ctx[:errors].map { |e| e[:source_identifier] }
+        expect(ids).to contain_exactly('work1', 'work2')
+      end
+    end
+
+    context 'with a three-node cycle (A → B → C → A)' do
+      it 'flags all three records' do
+        graph = { 'a' => ['c'], 'b' => ['a'], 'c' => ['b'] }
+        ctx = make_context(graph)
+        %w[a b c].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) }
+        ids = ctx[:errors].map { |e| e[:source_identifier] }
+        expect(ids).to contain_exactly('a', 'b', 'c')
+      end
+    end
+
+    context 'with a cycle on a branch (D → E → F → E)' do
+      it 'flags only the nodes in the cycle, not the entry node' do
+        # D has parent E; E has parent F; F has parent E → cycle between E and F
+        graph = { 'd' => ['e'], 'e' => ['f'], 'f' => ['e'] }
+        ctx = make_context(graph)
+        %w[d e f].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) }
+        ids = ctx[:errors].map { |e| e[:source_identifier] }
+        expect(ids).to include('e', 'f')
+        expect(ids).not_to include('d')
+      end
+    end
+
+    context 'with a cycle and unrelated records' do
+      it 'does not flag records outside the cycle' do
+        graph = { 'col1' => [], 'work1' => ['col1'], 'a' => ['b'], 'b' => ['a'] }
+        ctx = make_context(graph)
+        %w[col1 work1 a b].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) }
+        ids = ctx[:errors].map { |e| e[:source_identifier] }
+        expect(ids).to contain_exactly('a', 'b')
+      end
+    end
+  end
+
+  # ─── caching ─────────────────────────────────────────────────────────────────
+
+  describe 'cycle detection caching' do
+    it 'only detects cycles once per context and reuses the result' do
+      graph = { 'a' => ['b'], 'b' => ['a'] }
+      ctx = make_context(graph)
+
+      expect(described_class).to receive(:detect_cycle_ids).once.and_call_original
+
+      call(make_record('a'), ctx, row: 2)
+      call(make_record('b'), ctx, row: 3)
+    end
+  end
+
+  # ─── error shape ─────────────────────────────────────────────────────────────
+
+  describe 'error message content' do
+    it 'includes the expected keys' do
+      ctx = make_context('work1' => ['work1'])
+      call(make_record('work1'), ctx)
+      error = ctx[:errors].first
+      expect(error).to include(
+        row: 2,
+        source_identifier: 'work1',
+        severity: 'error',
+        category: 'circular_reference'
+      )
+      expect(error[:message]).to be_present
+      expect(error[:suggestion]).to be_present
+    end
+  end
+
+  # ─── empty graph ─────────────────────────────────────────────────────────────
+
+  describe 'when relationship_graph is absent from context' do
+    it 'does not raise and adds no errors' do
+      ctx = { errors: [] }
+      expect { call(make_record('work1'), ctx) }.not_to raise_error
+      expect(ctx[:errors]).to be_empty
+    end
+  end
+end