Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadat
children_column: resolve_relationship_column(mappings, 'related_children_field_mapping', 'children'),
mappings: mappings,
field_metadata: field_metadata,
find_record_by_source_identifier: find_record
find_record_by_source_identifier: find_record,
relationship_graph: build_relationship_graph(csv_data, mappings)
}
csv_data.each_with_index do |record, index|
row_number = index + 2 # 1-indexed, plus header row
Expand Down
58 changes: 58 additions & 0 deletions app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,64 @@ def resolve_children_split_pattern(mappings)

split_val
end

# Builds a graph of { source_identifier => [parent_ids] } from all CSV records.
# Used by CircularReference validator to detect cycles across the whole CSV.
#
# Parent edges are collected from both directions:
# - explicit parent declarations (parents / parents_N columns)
# - inverted child declarations (children / children_N columns), mirroring
# the normalisation done in importers_stepper.js#normalizeRelationships
def build_relationship_graph(csv_data, mappings)
parent_column = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
children_column = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
parent_suffix = /\A#{Regexp.escape(parent_column)}_\d+\z/
children_suffix = /\A#{Regexp.escape(children_column)}_\d+\z/

graph = build_parent_edges(csv_data, parent_suffix, resolve_parent_split_pattern(mappings))
invert_child_edges(graph, csv_data, children_suffix, resolve_children_split_pattern(mappings))
graph
end

def build_parent_edges(csv_data, suffix_pattern, split_pattern)
csv_data.each_with_object({}) do |record, graph|
id = record[:source_identifier]
next if id.blank?

base_ids = split_or_single(record[:parent], split_pattern)
suffix_ids = suffixed_values(record[:raw_row], suffix_pattern)
graph[id] = (base_ids + suffix_ids).uniq
end
end

def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern)
csv_data.each do |record|
id = record[:source_identifier]
next if id.blank?

child_ids = split_or_single(record[:children], split_pattern) +
suffixed_values(record[:raw_row], suffix_pattern)
child_ids.each do |child_id|
graph[child_id] ||= []
graph[child_id] << id unless graph[child_id].include?(id)
end
end
end

def split_or_single(value, split_pattern)
if split_pattern
value.to_s.split(split_pattern).map(&:strip).reject(&:blank?)
elsif value.present?
[value.to_s.strip]
else
[]
end
end

def suffixed_values(raw_row, suffix_pattern)
raw_row.select { |k, _| k.to_s.match?(suffix_pattern) }
.values.map(&:to_s).map(&:strip).reject(&:blank?)
end
end
end
end
71 changes: 71 additions & 0 deletions app/validators/bulkrax/csv_row/circular_reference.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# frozen_string_literal: true

module Bulkrax
module CsvRow
##
# Detects circular parent-child relationships in the CSV.
# A circular reference occurs when following the parent chain from any record
# eventually leads back to itself (e.g. A→B→C→A).
#
# The validator builds a directed graph (child → parents) from all records on
# first invocation and caches the set of all record ids involved in any cycle.
# Subsequent per-row calls simply check membership in that set.
#
# Requires context key:
# :relationship_graph – Hash { source_identifier => [parent_ids] } built by
# run_row_validators before iterating rows.
module CircularReference
def self.call(record, row_index, context)
cycle_ids = context[:circular_reference_ids] ||= detect_cycle_ids(context[:relationship_graph] || {})
return unless cycle_ids.include?(record[:source_identifier])

context[:errors] << {
row: row_index,
source_identifier: record[:source_identifier],
severity: 'error',
category: 'circular_reference',
column: 'parents',
value: record[:source_identifier],
message: I18n.t('bulkrax.importer.guided_import.validation.circular_reference_validator.errors.message',
value: record[:source_identifier]),
suggestion: I18n.t('bulkrax.importer.guided_import.validation.circular_reference_validator.errors.suggestion')
}
end

# Returns the set of all source identifiers that participate in at least one cycle.
# Uses recursive DFS with a per-branch ancestry set to detect back-edges.
def self.detect_cycle_ids(graph)
all_nodes = graph.keys.to_set | graph.values.flatten.to_set
visited = Set.new
cycle_ids = Set.new

all_nodes.each do |node|
next if visited.include?(node)
dfs(node, graph, visited, [], cycle_ids)
end

cycle_ids
end
private_class_method :detect_cycle_ids

def self.dfs(node, graph, visited, ancestors, cycle_ids) # rubocop:disable Metrics/MethodLength
visited.add(node)
ancestors.push(node)

(graph[node] || []).each do |neighbor|
if ancestors.include?(neighbor)
# Back-edge found: mark every node in the cycle path
cycle_start = ancestors.index(neighbor)
ancestors[cycle_start..].each { |n| cycle_ids.add(n) }
cycle_ids.add(neighbor)
elsif !visited.include?(neighbor)
dfs(neighbor, graph, visited, ancestors, cycle_ids)
end
end

ancestors.pop
end
private_class_method :dfs
end
end
end
4 changes: 4 additions & 0 deletions config/locales/bulkrax.en.yml
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,10 @@ en:
errors:
message: "Referenced child '%{value}' does not exist as a %{field} in this CSV."
suggestion: "Check for typos or add the child record."
circular_reference_validator:
errors:
message: "'%{value}' is part of a circular parent-child relationship."
suggestion: "Review the parent/child columns and remove the incorrect references."
passed: Validation Passed
passed_warnings: Validation Passed with Warnings
recognized_fields: 'Recognized fields: %{fields}'
Expand Down
1 change: 1 addition & 0 deletions lib/bulkrax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ def csv_row_validators
Bulkrax::CsvRow::DuplicateIdentifier,
Bulkrax::CsvRow::ParentReference,
Bulkrax::CsvRow::ChildReference,
Bulkrax::CsvRow::CircularReference,
Bulkrax::CsvRow::RequiredValues,
Bulkrax::CsvRow::ControlledVocabulary
]
Expand Down
87 changes: 87 additions & 0 deletions spec/parsers/bulkrax/csv_parser/csv_validation_helpers_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,93 @@ def find(id)
end
end

describe '#build_relationship_graph' do
let(:mappings) { {} }

def record(source_identifier, parent: nil, children: nil, raw_row: {})
{ source_identifier: source_identifier, parent: parent, children: children, raw_row: raw_row }
end

def graph(csv_data)
host.build_relationship_graph(csv_data, mappings)
end

context 'with only parent declarations' do
it 'maps each record to its declared parents' do
data = [record('child', parent: 'parent1'), record('parent1')]
expect(graph(data)).to include('child' => ['parent1'], 'parent1' => [])
end
end

context 'with suffixed parent columns (parents_1, parents_2)' do
it 'includes all suffixed parent values' do
data = [
record('child', raw_row: { 'parents_1' => 'p1', 'parents_2' => 'p2' }),
record('p1'),
record('p2')
]
result = graph(data)
expect(result['child']).to contain_exactly('p1', 'p2')
end
end

context 'with only children declarations' do
it 'inverts children into parent edges on the child records' do
data = [record('parent1', children: 'child1'), record('child1')]
result = graph(data)
expect(result['child1']).to include('parent1')
end
end

context 'with suffixed children columns (children_1, children_2)' do
it 'inverts all suffixed children into parent edges' do
data = [
record('parent1', raw_row: { 'children_1' => 'c1', 'children_2' => 'c2' }),
record('c1'),
record('c2')
]
result = graph(data)
expect(result['c1']).to include('parent1')
expect(result['c2']).to include('parent1')
end
end

context 'with a cycle declared via children columns (matching rel-circular-ref.csv pattern)' do
# child1 declares children=child2; child2 declares children=child1
# After inversion: child1 → [child2] and child2 → [child1]
it 'builds a graph that allows cycle detection to flag both nodes' do
data = [
record('parent1'),
record('parent2'),
record('child1', children: 'child2', raw_row: { 'parents_1' => 'parent1', 'parents_2' => 'parent2' }),
record('child2', children: 'child1')
]
result = graph(data)
expect(result['child1']).to include('child2')
expect(result['child2']).to include('child1')
end
end

context 'when a child is already declared as a parent of the same record' do
it 'does not add duplicate edges' do
data = [
record('a', parent: 'b'),
record('b', children: 'a')
]
result = graph(data)
expect(result['a'].count('b')).to eq(1)
end
end

context 'with blank source_identifier' do
it 'skips the record' do
data = [record(nil, parent: 'p1'), record('p1')]
result = graph(data)
expect(result.keys).not_to include(nil)
end
end
end

describe '#build_find_record' do
let(:mapping_manager) { instance_double(Bulkrax::CsvTemplate::MappingManager) }
let(:mappings) { {} }
Expand Down
134 changes: 134 additions & 0 deletions spec/validators/bulkrax/csv_row/circular_reference_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# frozen_string_literal: true

require 'rails_helper'

RSpec.describe Bulkrax::CsvRow::CircularReference do
def make_context(graph)
{ errors: [], relationship_graph: graph }
end

def make_record(source_identifier)
{ source_identifier: source_identifier }
end

def call(record, context, row: 2)
described_class.call(record, row, context)
end

# ─── detect_cycle_ids ────────────────────────────────────────────────────────

describe '.detect_cycle_ids (via call)' do
context 'with no relationships' do
it 'produces no errors' do
ctx = make_context('work1' => [], 'work2' => [])
call(make_record('work1'), ctx)
expect(ctx[:errors]).to be_empty
end
end

context 'with a simple linear chain (no cycle)' do
# col1 ← work1 ← work2 (work1 has parent col1, work2 has parent work1)
it 'produces no errors' do
graph = { 'col1' => [], 'work1' => ['col1'], 'work2' => ['work1'] }
ctx = make_context(graph)
%w[col1 work1 work2].each { |id| call(make_record(id), ctx) }
expect(ctx[:errors]).to be_empty
end
end

context 'with a direct self-reference (A → A)' do
it 'flags the self-referencing record' do
ctx = make_context('work1' => ['work1'])
call(make_record('work1'), ctx)
expect(ctx[:errors].length).to eq(1)
expect(ctx[:errors].first[:source_identifier]).to eq('work1')
expect(ctx[:errors].first[:category]).to eq('circular_reference')
end
end

context 'with a two-node cycle (A → B → A)' do
it 'flags both records in the cycle' do
graph = { 'work1' => ['work2'], 'work2' => ['work1'] }
ctx = make_context(graph)
call(make_record('work1'), ctx, row: 2)
call(make_record('work2'), ctx, row: 3)
ids = ctx[:errors].map { |e| e[:source_identifier] }
expect(ids).to contain_exactly('work1', 'work2')
end
end

context 'with a three-node cycle (A → B → C → A)' do
it 'flags all three records' do
graph = { 'a' => ['c'], 'b' => ['a'], 'c' => ['b'] }
ctx = make_context(graph)
%w[a b c].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) }
ids = ctx[:errors].map { |e| e[:source_identifier] }
expect(ids).to contain_exactly('a', 'b', 'c')
end
end

context 'with a cycle on a branch (D → E → F → E)' do
it 'flags only the nodes in the cycle, not the entry node' do
# D has parent E; E has parent F; F has parent E → cycle between E and F
graph = { 'd' => ['e'], 'e' => ['f'], 'f' => ['e'] }
ctx = make_context(graph)
%w[d e f].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) }
ids = ctx[:errors].map { |e| e[:source_identifier] }
expect(ids).to include('e', 'f')
expect(ids).not_to include('d')
end
end

context 'with a cycle and unrelated records' do
it 'does not flag records outside the cycle' do
graph = { 'col1' => [], 'work1' => ['col1'], 'a' => ['b'], 'b' => ['a'] }
ctx = make_context(graph)
%w[col1 work1 a b].each_with_index { |id, i| call(make_record(id), ctx, row: i + 2) }
ids = ctx[:errors].map { |e| e[:source_identifier] }
expect(ids).to contain_exactly('a', 'b')
end
end
end

# ─── caching ─────────────────────────────────────────────────────────────────

describe 'cycle detection caching' do
it 'only detects cycles once per context and reuses the result' do
graph = { 'a' => ['b'], 'b' => ['a'] }
ctx = make_context(graph)

expect(described_class).to receive(:detect_cycle_ids).once.and_call_original

call(make_record('a'), ctx, row: 2)
call(make_record('b'), ctx, row: 3)
end
end

# ─── error shape ─────────────────────────────────────────────────────────────

describe 'error message content' do
it 'includes the expected keys' do
ctx = make_context('work1' => ['work1'])
call(make_record('work1'), ctx)
error = ctx[:errors].first
expect(error).to include(
row: 2,
source_identifier: 'work1',
severity: 'error',
category: 'circular_reference'
)
expect(error[:message]).to be_present
expect(error[:suggestion]).to be_present
end
end

# ─── empty graph ─────────────────────────────────────────────────────────────

describe 'when relationship_graph is absent from context' do
it 'does not raise and adds no errors' do
ctx = { errors: [] }
expect { call(make_record('work1'), ctx) }.not_to raise_error
expect(ctx[:errors]).to be_empty
end
end
end
Loading