From 891b0920f6825c8d5772124c7423e0511dfca34f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Wed, 17 Sep 2025 12:11:32 +0200 Subject: [PATCH 01/12] Add knn implementation --- cpp/CMakeLists.txt | 1 + cpp/knn_module/CMakeLists.txt | 12 + cpp/knn_module/algorithms/knn.hpp | 586 ++++++++++++++++++++++++++++++ cpp/knn_module/knn_module.cpp | 346 ++++++++++++++++++ 4 files changed, 945 insertions(+) create mode 100644 cpp/knn_module/CMakeLists.txt create mode 100644 cpp/knn_module/algorithms/knn.hpp create mode 100644 cpp/knn_module/knn_module.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cd6dd03ab..b2369e304 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -149,5 +149,6 @@ add_subdirectory(algo_module) add_subdirectory(set_property_module) add_subdirectory(leiden_community_detection_module) add_subdirectory(math_module) +add_subdirectory(knn_module) add_cugraph_subdirectory(cugraph_module) diff --git a/cpp/knn_module/CMakeLists.txt b/cpp/knn_module/CMakeLists.txt new file mode 100644 index 000000000..6049ffcd7 --- /dev/null +++ b/cpp/knn_module/CMakeLists.txt @@ -0,0 +1,12 @@ +set(knn_module_src + knn_module.cpp + algorithms/knn.hpp) + +add_query_module(knn 1 "${knn_module_src}") + +# Find OpenMP +find_package(OpenMP REQUIRED) + +# Link external libraries +target_link_libraries(knn PRIVATE mg_utility fmt::fmt OpenMP::OpenMP_CXX) +target_include_directories(knn PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp new file mode 100644 index 000000000..fc6f705af --- /dev/null +++ b/cpp/knn_module/algorithms/knn.hpp @@ -0,0 +1,586 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace knn_util { + +// Similarity functions supported by KNN +enum class SimilarityFunction { COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT }; + +// Initial sampler types +constexpr std::string_view kSamplerUniform = "uniform"; +constexpr std::string_view kSamplerRandomWalk = "randomWalk"; + +// Property configuration for KNN +struct PropertyConfig { + std::string name; + SimilarityFunction metric; + + PropertyConfig(const std::string &prop_name, SimilarityFunction sim_func) : name(prop_name), metric(sim_func) {} +}; + +// Configuration for KNN algorithm +struct KNNConfig { + int top_k = 1; + double similarity_cutoff = 0.0; + double delta_threshold = 0.001; + int max_iterations = 100; + int random_seed = 42; + double sample_rate = 0.5; + int concurrency = 1; + std::string initial_sampler = "uniform"; + std::vector node_properties; + SimilarityFunction default_similarity_function = SimilarityFunction::COSINE; +}; + +// Result structure for KNN +struct KNNResult { + mgp::Id node1_id; + mgp::Id node2_id; + double similarity; + + // Default constructor for std::vector compatibility + KNNResult() : similarity(0.0) { + // Initialize with default constructed Ids + node1_id = mgp::Id(); + node2_id = mgp::Id(); + } + + KNNResult(const mgp::Node &n1, const mgp::Node &n2, double sim) + : node1_id(n1.Id()), node2_id(n2.Id()), similarity(sim) {} + + KNNResult(mgp::Id id1, mgp::Id id2, double sim) : node1_id(id1), node2_id(id2), similarity(sim) {} +}; + +} // namespace knn_util + +namespace knn_algs { + +// Extract property values from a node +std::vector ExtractPropertyValues(const mgp::Node &node, const std::vector &properties) { + std::vector values; + values.reserve(properties.size()); + + for (const auto &property : properties) { + try { + auto prop_value = 
node.GetProperty(property);
+      if (prop_value.IsNull()) {
+        throw mgp::ValueException(fmt::format("Node missing property: {}", property));
+      }
+
+      if (prop_value.IsList()) {
+        // For vector properties, take the first element or average
+        auto list = prop_value.ValueList();
+        if (list.Size() > 0) {
+          values.push_back(list[0].ValueDouble());
+        } else {
+          values.push_back(0.0);
+        }
+      } else if (prop_value.IsDouble()) {
+        values.push_back(prop_value.ValueDouble());
+      } else if (prop_value.IsInt()) {
+        values.push_back(static_cast<double>(prop_value.ValueInt()));
+      } else {
+        throw mgp::ValueException(fmt::format("Property {} must be numeric or list of numbers", property));
+      }
+    } catch (const mgp::ValueException &e) {
+      throw mgp::ValueException(fmt::format("Error extracting property {} : {}", property, e.what()));
+    }
+  }
+
+  return values;
+}
+
+// Cosine similarity between two vectors
+double CosineSimilarity(const std::vector<double> &vec1, const std::vector<double> &vec2) {
+  if (vec1.size() != vec2.size()) {
+    throw mgp::ValueException("Vectors must have the same size for cosine similarity");
+  }
+
+  double dot_product = 0.0;
+  double norm1 = 0.0;
+  double norm2 = 0.0;
+
+  for (size_t i = 0; i < vec1.size(); ++i) {
+    dot_product += vec1[i] * vec2[i];
+    norm1 += vec1[i] * vec1[i];
+    norm2 += vec2[i] * vec2[i];
+  }
+
+  double denominator = std::sqrt(norm1) * std::sqrt(norm2);
+  if (denominator < 1e-9) {
+    return 0.0;
+  }
+
+  return dot_product / denominator;
+}
+
+// Euclidean similarity (1 / (1 + distance))
+double EuclideanSimilarity(const std::vector<double> &vec1, const std::vector<double> &vec2) {
+  if (vec1.size() != vec2.size()) {
+    throw mgp::ValueException("Vectors must have the same size for euclidean similarity");
+  }
+
+  double sum_squared_diff = 0.0;
+  for (size_t i = 0; i < vec1.size(); ++i) {
+    double diff = vec1[i] - vec2[i];
+    sum_squared_diff += diff * diff;
+  }
+
+  double distance = std::sqrt(sum_squared_diff);
+  return 1.0 / (1.0 + distance);
+}
+
+// Pearson correlation coefficient
+double PearsonSimilarity(const std::vector<double> &vec1, const std::vector<double> &vec2) {
+  if (vec1.size() != vec2.size()) {
+    throw mgp::ValueException("Vectors must have the same size for pearson similarity");
+  }
+
+  if (vec1.size() < 2) {
+    return 1.0;  // Perfect correlation for single values
+  }
+
+  // Calculate means
+  double mean1 = 0.0, mean2 = 0.0;
+  for (size_t i = 0; i < vec1.size(); ++i) {
+    mean1 += vec1[i];
+    mean2 += vec2[i];
+  }
+  mean1 /= vec1.size();
+  mean2 /= vec2.size();
+
+  // Calculate correlation
+  double numerator = 0.0;
+  double sum_sq1 = 0.0;
+  double sum_sq2 = 0.0;
+
+  for (size_t i = 0; i < vec1.size(); ++i) {
+    double diff1 = vec1[i] - mean1;
+    double diff2 = vec2[i] - mean2;
+    numerator += diff1 * diff2;
+    sum_sq1 += diff1 * diff1;
+    sum_sq2 += diff2 * diff2;
+  }
+
+  double denominator = std::sqrt(sum_sq1 * sum_sq2);
+  if (denominator < 1e-9) {
+    return 0.0;
+  }
+
+  return numerator / denominator;
+}
+
+// Overlap similarity (intersection / min size)
+double OverlapSimilarity(const std::vector<double> &vec1, const std::vector<double> &vec2) {
+  if (vec1.size() != vec2.size()) {
+    throw mgp::ValueException("Vectors must have the same size for overlap similarity");
+  }
+
+  // For numeric vectors, we consider values as "overlapping" if they're close
+  const double threshold = 1e-6;
+  int overlap_count = 0;
+
+  for (size_t i = 0; i < vec1.size(); ++i) {
+    if (std::abs(vec1[i] - vec2[i]) < threshold) {
+      overlap_count++;
+    }
+  }
+
+  int min_size = std::min(vec1.size(), vec2.size());
+  if (min_size == 0) {
+    return 0.0;
+  }
+
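+  // Worked example (illustrative values only): for vec1 = {1.0, 2.0, 3.0} and vec2 = {1.0, 2.5, 3.0},
+  // elements 0 and 2 agree within the 1e-6 threshold, so overlap_count = 2, min_size = 3, and the
+  // division below yields 2 / 3 ≈ 0.667.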
return static_cast(overlap_count) / min_size; +} + +// Jaccard similarity (intersection / union) +double JaccardSimilarity(const std::vector &vec1, const std::vector &vec2) { + if (vec1.size() != vec2.size()) { + throw mgp::ValueException("Vectors must have the same size for jaccard similarity"); + } + + // For binary vectors (0 or 1), Jaccard = intersection / union + int intersection_count = 0; + int union_count = 0; + + for (size_t i = 0; i < vec1.size(); ++i) { + bool has_v1 = vec1[i] > 0; + bool has_v2 = vec2[i] > 0; + + if (has_v1 && has_v2) { + intersection_count++; + } + if (has_v1 || has_v2) { + union_count++; + } + } + + if (union_count == 0) { + return 0.0; + } + + return static_cast(intersection_count) / union_count; +} + +// Helper function to determine if a list contains integers +bool IsIntegerList(const mgp::List &list) { + for (size_t i = 0; i < list.Size(); ++i) { + if (list[i].IsNumeric()) { + double val = list[i].ValueNumeric(); + // Check if the numeric value is actually an integer + if (val != std::floor(val)) { + return false; + } + } else { + return false; + } + } + return true; +} + +// Helper function to get default similarity function based on data type +knn_util::SimilarityFunction GetDefaultSimilarityFunction(const mgp::Value &prop_value) { + if (prop_value.IsNumeric()) { + // Single numeric value - use cosine as default + return knn_util::SimilarityFunction::COSINE; + } else if (prop_value.IsList()) { + mgp::List list = prop_value.ValueList(); + if (IsIntegerList(list)) { + // List of integers - use Jaccard as default + return knn_util::SimilarityFunction::JACCARD; + } else { + // List of floats - use Cosine as default + return knn_util::SimilarityFunction::COSINE; + } + } else { + // Non-numeric property - use Cosine as fallback + return knn_util::SimilarityFunction::COSINE; + } +} + + +// Structure to hold pre-loaded node data for efficient comparison +struct NodeData { + mgp::Node node; + std::vector> property_values; // One vector per property + std::vector resolved_metrics; // Resolved metrics per property + + NodeData(const mgp::Node& n, size_t num_properties) + : node(n), property_values(num_properties), resolved_metrics(num_properties) {} +}; + +// Pre-load node properties into memory for efficient comparison +std::vector PreloadNodeData(const std::vector& nodes, const knn_util::KNNConfig& config) { + std::vector node_data; + node_data.reserve(nodes.size()); + + if (config.node_properties.empty()) { + throw mgp::ValueException("No node properties configured for similarity calculation"); + } + + for (const auto& node : nodes) { + NodeData node_info(node, config.node_properties.size()); + + // Load each property - throw immediately on any error + for (size_t prop_idx = 0; prop_idx < config.node_properties.size(); ++prop_idx) { + const auto& prop_config = config.node_properties[prop_idx]; + + mgp::Value prop_value = node.GetProperty(prop_config.name); + std::vector values; + + // Resolve DEFAULT metric based on data type + knn_util::SimilarityFunction resolved_metric = prop_config.metric; + if (resolved_metric == knn_util::SimilarityFunction::DEFAULT) { + if (prop_value.IsNumeric()) { + // Single value - will use scalar formula + resolved_metric = knn_util::SimilarityFunction::DEFAULT; // Keep as DEFAULT for scalar + } else if (prop_value.IsList()) { + mgp::List list = prop_value.ValueList(); + if (list.Size() > 0 && list[0].IsNumeric()) { + // Infer based on first element type + resolved_metric = list[0].IsInt() ? 
knn_util::SimilarityFunction::JACCARD : knn_util::SimilarityFunction::COSINE; + } + } + } + + // Determine expected data type based on resolved metric + bool expects_integers = (resolved_metric == knn_util::SimilarityFunction::JACCARD || + resolved_metric == knn_util::SimilarityFunction::OVERLAP); + + if (prop_value.IsNumeric()) { + // For scalar numbers, validate type and store the single value + if (expects_integers && !prop_value.IsInt()) { + throw mgp::ValueException(fmt::format("Property {} must be integer for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); + } + if (!expects_integers && !prop_value.IsDouble()) { + throw mgp::ValueException(fmt::format("Property {} must be double for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" : + (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) ? "EUCLIDEAN" : "PEARSON")); + } + values.push_back(prop_value.ValueNumeric()); + } else if (prop_value.IsList()) { + // For lists, validate type of first element and extract all numeric values + mgp::List list = prop_value.ValueList(); + if (list.Size() > 0 && list[0].IsNumeric()) { + // Check type of first element only + if (expects_integers && !list[0].IsInt()) { + throw mgp::ValueException(fmt::format("Property {} list elements must be integers for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); + } + if (!expects_integers && !list[0].IsDouble()) { + throw mgp::ValueException(fmt::format("Property {} list elements must be doubles for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" : + (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) ? 
"EUCLIDEAN" : "PEARSON")); + } + } + + // Extract all numeric values (trusting the rest are the same type) + for (size_t i = 0; i < list.Size(); ++i) { + if (list[i].IsNumeric()) { + values.push_back(list[i].ValueNumeric()); + } + } + } else { + throw mgp::ValueException(fmt::format("Property {} must be numeric or list of numbers for similarity calculation", prop_config.name)); + } + + if (values.empty()) { + throw mgp::ValueException(fmt::format("Invalid property values: empty lists for property {}", prop_config.name)); + } + + node_info.property_values[prop_idx] = values; + node_info.resolved_metrics[prop_idx] = resolved_metric; + } + + node_data.push_back(node_info); + } + + return node_data; +} + +// Calculate similarity between pre-loaded node data +double CalculateNodeSimilarity(const NodeData& node1_data, const NodeData& node2_data, const knn_util::KNNConfig& config) { + double total_similarity = 0.0; + const size_t num_properties = config.node_properties.size(); + + // Calculate similarity for each property and compute the mean + for (size_t prop_idx = 0; prop_idx < num_properties; ++prop_idx) { + const auto& values1 = node1_data.property_values[prop_idx]; + const auto& values2 = node2_data.property_values[prop_idx]; + + double property_similarity = 0.0; + + // Use the pre-resolved metric from NodeData + knn_util::SimilarityFunction metric = node1_data.resolved_metrics[prop_idx]; + + // For scalar numbers, use the formula: 1 / (1 + |a - b|) + if (values1.size() == 1) { + property_similarity = 1.0 / (1.0 + std::abs(values1[0] - values2[0])); + } else { + // For vectors, use the pre-resolved similarity function + switch (metric) { + case knn_util::SimilarityFunction::COSINE: + property_similarity = CosineSimilarity(values1, values2); + break; + case knn_util::SimilarityFunction::EUCLIDEAN: + property_similarity = EuclideanSimilarity(values1, values2); + break; + case knn_util::SimilarityFunction::PEARSON: + property_similarity = PearsonSimilarity(values1, values2); + break; + case knn_util::SimilarityFunction::OVERLAP: + property_similarity = OverlapSimilarity(values1, values2); + break; + case knn_util::SimilarityFunction::JACCARD: + property_similarity = JaccardSimilarity(values1, values2); + break; + default: + property_similarity = CosineSimilarity(values1, values2); + break; + } + } + + total_similarity += property_similarity; + } + + // Return the mean of all property similarities + return total_similarity / num_properties; +} + +// Validate configuration parameters +void ValidateConfig(const knn_util::KNNConfig& config) { + if (config.initial_sampler == knn_util::kSamplerRandomWalk) { + throw mgp::ValueException("Random walk sampling not implemented"); + } else if (config.initial_sampler != knn_util::kSamplerUniform) { + throw mgp::ValueException(fmt::format("Unknown initial sampler: {}", config.initial_sampler)); + } +} + +// Get candidate indices for comparison, excluding self +std::vector GetCandidateIndices( + size_t node_idx, + size_t total_nodes, + const knn_util::KNNConfig& config) { + + std::vector comparison_indices; + + if (config.sample_rate < 1.0) { + // Create indices for all nodes except self + std::vector all_indices; + all_indices.reserve(total_nodes - 1); + for (size_t i = 0; i < total_nodes; ++i) { + if (i != node_idx) { // Skip self + all_indices.push_back(i); + } + } + + // Shuffle indices for uniform sampling + std::mt19937 rng(config.random_seed); + std::shuffle(all_indices.begin(), all_indices.end(), rng); + + // Calculate sample size + size_t 
sample_size = static_cast(all_indices.size() * config.sample_rate); + comparison_indices.reserve(sample_size); + + // Take the first sample_size indices + for (size_t i = 0; i < sample_size; ++i) { + comparison_indices.push_back(all_indices[i]); + } + } else { + // Compare against all other nodes + comparison_indices.reserve(total_nodes - 1); + for (size_t j = 0; j < total_nodes; ++j) { + if (j != node_idx) { // Skip self-comparison + comparison_indices.push_back(j); + } + } + } + + return comparison_indices; +} + +// Calculate similarity for one node against all candidates (parallel implementation) +std::vector CalculateSimilarityForNode( + size_t node_idx, + const std::vector& node_data, + const std::vector& comparison_indices, + const knn_util::KNNConfig& config) { + + const auto& node1_data = node_data[node_idx]; + + // Pre-allocate results vector + std::vector results; + results.reserve(comparison_indices.size()); + + // Convert comparison_indices to array for OpenMP (similar to betweenness_centrality_online.cpp) + auto array_size = comparison_indices.size(); + std::vector comparison_indices_array(array_size); + std::copy(comparison_indices.begin(), comparison_indices.end(), comparison_indices_array.begin()); + + // Pre-allocate parallel results vector + std::vector parallel_results(array_size); + + // Set OpenMP parameters + omp_set_dynamic(0); + omp_set_num_threads(config.concurrency); + + // Parallel similarity calculation using OpenMP +#pragma omp parallel for + for (size_t i = 0; i < array_size; ++i) { + size_t idx = comparison_indices_array[i]; + const auto& node2_data = node_data[idx]; + double similarity = CalculateNodeSimilarity(node1_data, node2_data, config); + + // Store result (will be filtered later) + parallel_results[i] = knn_util::KNNResult(node1_data.node.Id(), node2_data.node.Id(), similarity); + } + + // Filter results based on similarity cutoff and add to final results + for (const auto& result : parallel_results) { + if (result.similarity >= config.similarity_cutoff) { + results.push_back(result); + } + } + + return results; +} + +// Sort and insert top-k results into final results +void InsertTopKResults( + const std::vector& top_k_results, + const mgp::Graph& graph, + std::vector>& final_results) { + + // Sort by similarity (descending) + std::vector sorted_results = top_k_results; + std::sort(sorted_results.begin(), sorted_results.end(), + [](const knn_util::KNNResult& a, const knn_util::KNNResult& b) { + return a.similarity > b.similarity; + }); + + // Convert to final results with actual nodes + for (const auto& result : sorted_results) { + try { + auto node1 = graph.GetNodeById(result.node1_id); + auto node2 = graph.GetNodeById(result.node2_id); + final_results.push_back(std::make_tuple(node1, node2, result.similarity)); + } catch (const std::exception& e) { + // Skip if node not found + continue; + } + } +} + +// Main KNN algorithm implementation +std::vector> CalculateKNN(const mgp::Graph &graph, + const knn_util::KNNConfig &config) { + std::vector> results; + std::vector nodes; + + // 1. 
Validate configuration + ValidateConfig(config); + + // Collect all nodes + for (const auto &node : graph.Nodes()) { + nodes.push_back(node); + } + + if (nodes.size() < 2) { + return results; // Need at least 2 nodes for similarity + } + + // Pre-load node properties into memory for efficient comparison + std::vector node_data = PreloadNodeData(nodes, config); + + // For each node, find its top-k most similar nodes + for (size_t i = 0; i < node_data.size(); ++i) { + // Get candidate indices for comparison + std::vector comparison_indices = GetCandidateIndices(i, node_data.size(), config); + + // 2. Calculate similarity for one node + std::vector top_k_results = CalculateSimilarityForNode( + i, node_data, comparison_indices, config); + + // Take only top-k results + if (top_k_results.size() > static_cast(config.top_k)) { + top_k_results.erase(top_k_results.begin() + config.top_k, top_k_results.end()); + } + + // 3. Insert sorted top-k results + InsertTopKResults(top_k_results, graph, results); + } + + return results; +} + +} // namespace knn_algs diff --git a/cpp/knn_module/knn_module.cpp b/cpp/knn_module/knn_module.cpp new file mode 100644 index 000000000..e54129c93 --- /dev/null +++ b/cpp/knn_module/knn_module.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include + +#include "algorithms/knn.hpp" + +// Procedure names +constexpr std::string_view kProcedureGet = "get"; + +// Argument names +constexpr std::string_view kArgumentConfig = "config"; +constexpr std::string_view kConfigNodeProperties = "nodeProperties"; +constexpr std::string_view kConfigTopK = "topK"; +constexpr std::string_view kConfigSimilarityCutoff = "similarityCutoff"; +constexpr std::string_view kConfigDeltaThreshold = "deltaThreshold"; +constexpr std::string_view kConfigMaxIterations = "maxIterations"; +constexpr std::string_view kConfigRandomSeed = "randomSeed"; +constexpr std::string_view kConfigSampleRate = "sampleRate"; +constexpr std::string_view kConfigConcurrency = "concurrency"; +constexpr std::string_view kConfigSimilarityFunction = "similarityFunction"; +constexpr std::string_view kConfigInitialSampler = "initialSampler"; + +// Return field names +constexpr std::string_view kFieldNode = "node"; +constexpr std::string_view kFieldNeighbour = "neighbour"; +constexpr std::string_view kFieldSimilarity = "similarity"; + +// Similarity function values +constexpr std::string_view kSimilarityCosine = "COSINE"; +constexpr std::string_view kSimilarityEuclidean = "EUCLIDEAN"; +constexpr std::string_view kSimilarityPearson = "PEARSON"; +constexpr std::string_view kSimilarityOverlap = "OVERLAP"; +constexpr std::string_view kSimilarityJaccard = "JACCARD"; +constexpr std::string_view kSimilarityDefault = "DEFAULT"; + +// Default parameter values +constexpr int kDefaultTopK = 1; +constexpr double kDefaultSimilarityCutoff = 0.0; +constexpr double kDefaultDeltaThreshold = 0.001; +constexpr int kDefaultMaxIterations = 100; +constexpr int kDefaultConcurrency = 1; +constexpr double kDefaultSampleRate = 0.5; +constexpr std::string_view kDefaultInitialSampler = "uniform"; + +// Initial sampler values (using constants from knn.hpp) +// constexpr std::string_view kSamplerUniform = knn_util::kSamplerUniform; +// constexpr std::string_view kSamplerRandomWalk = knn_util::kSamplerRandomWalk; + +// Helper function to validate if a string is a valid similarity function +bool IsValidSimilarityFunction(const std::string& func_str) { + return func_str == kSimilarityCosine || + func_str == kSimilarityEuclidean || + 
func_str == kSimilarityPearson || + func_str == kSimilarityOverlap || + func_str == kSimilarityJaccard || + func_str == kSimilarityDefault; +} + +// Helper function to validate if a string is a valid initial sampler +bool IsValidInitialSampler(const std::string& sampler_str) { + std::string lower_sampler = sampler_str; + std::transform(lower_sampler.begin(), lower_sampler.end(), lower_sampler.begin(), ::tolower); + return lower_sampler == knn_util::kSamplerUniform || lower_sampler == knn_util::kSamplerRandomWalk; +} + +// Helper function to validate parameter ranges +void ValidateParameterRanges(const knn_util::KNNConfig& config) { + // Validate range [0, 1] parameters + if (config.sample_rate < 0.0 || config.sample_rate > 1.0) { + throw mgp::ValueException(fmt::format("knn: sampleRate must be between 0 and 1, got {}", config.sample_rate)); + } + + if (config.delta_threshold < 0.0 || config.delta_threshold > 1.0) { + throw mgp::ValueException(fmt::format("knn: deltaThreshold must be between 0 and 1, got {}", config.delta_threshold)); + } + + if (config.similarity_cutoff < 0.0 || config.similarity_cutoff > 1.0) { + throw mgp::ValueException(fmt::format("knn: similarityCutoff must be between 0 and 1, got {}", config.similarity_cutoff)); + } + + // Validate positive integer parameters + if (config.top_k <= 0) { + throw mgp::ValueException(fmt::format("knn: topK must be a positive integer, got {}", config.top_k)); + } + + if (config.concurrency <= 0) { + throw mgp::ValueException(fmt::format("knn: concurrency must be a positive integer, got {}", config.concurrency)); + } + + if (config.max_iterations <= 0) { + throw mgp::ValueException(fmt::format("knn: maxIterations must be a positive integer, got {}", config.max_iterations)); + } + + // randomSeed can be negative, so we only check it's not zero + if (config.random_seed == 0) { + throw mgp::ValueException("knn: randomSeed cannot be 0"); + } +} + +// Helper function to parse similarity function from string +knn_util::SimilarityFunction ParseSimilarityFunction(const std::string& func_str) { + if (func_str == kSimilarityCosine) { + return knn_util::SimilarityFunction::COSINE; + } else if (func_str == kSimilarityEuclidean) { + return knn_util::SimilarityFunction::EUCLIDEAN; + } else if (func_str == kSimilarityPearson) { + return knn_util::SimilarityFunction::PEARSON; + } else if (func_str == kSimilarityOverlap) { + return knn_util::SimilarityFunction::OVERLAP; + } else if (func_str == kSimilarityJaccard) { + return knn_util::SimilarityFunction::JACCARD; + } else if (func_str == kSimilarityDefault) { + return knn_util::SimilarityFunction::DEFAULT; + } else { + return knn_util::SimilarityFunction::COSINE; // Default fallback + } +} + +// Helper function to parse nodeProperties configuration +std::vector ParseNodeProperties(const mgp::Value& node_props_value) { + std::vector properties; + + if (node_props_value.IsString()) { + // Single property name - use default similarity function + std::string prop_name = std::string(node_props_value.ValueString()); + if (prop_name.empty()) { + throw mgp::ValueException("knn: Property name cannot be empty"); + } + properties.emplace_back(prop_name, knn_util::SimilarityFunction::DEFAULT); + } else if (node_props_value.IsMap()) { + // Map of property names to metrics + mgp::Map prop_map = node_props_value.ValueMap(); + if (prop_map.Size() == 0) { + throw mgp::ValueException("knn: Property map cannot be empty"); + } + + for (const auto& entry : prop_map) { + // Validate property name + std::string prop_name = 
std::string(entry.key); + if (prop_name.empty()) { + throw mgp::ValueException("knn: Property name cannot be empty"); + } + + // Validate metric value + if (!entry.value.IsString()) { + throw mgp::ValueException(fmt::format("knn: Metric value must be a string for property '{}'", prop_name)); + } + + std::string metric_str = std::string(entry.value.ValueString()); + if (metric_str.empty()) { + throw mgp::ValueException(fmt::format("knn: Metric value cannot be empty for property '{}'", prop_name)); + } + + if (!IsValidSimilarityFunction(metric_str)) { + throw mgp::ValueException(fmt::format("knn: Invalid metric '{}' for property '{}'. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name)); + } + + knn_util::SimilarityFunction metric = ParseSimilarityFunction(metric_str); + properties.emplace_back(prop_name, metric); + } + } else if (node_props_value.IsList()) { + // List of strings and/or maps + mgp::List prop_list = node_props_value.ValueList(); + if (prop_list.Size() == 0) { + throw mgp::ValueException("knn: Property list cannot be empty"); + } + + for (size_t i = 0; i < prop_list.Size(); ++i) { + if (prop_list[i].IsString()) { + // String property name - use default similarity function + std::string prop_name = std::string(prop_list[i].ValueString()); + if (prop_name.empty()) { + throw mgp::ValueException(fmt::format("knn: Property name at index {} cannot be empty", i)); + } + properties.emplace_back(prop_name, knn_util::SimilarityFunction::DEFAULT); + } else if (prop_list[i].IsMap()) { + // Map entry + mgp::Map prop_map = prop_list[i].ValueMap(); + if (prop_map.Size() == 0) { + throw mgp::ValueException(fmt::format("knn: Property map at index {} cannot be empty", i)); + } + + for (const auto& entry : prop_map) { + // Validate property name + std::string prop_name = std::string(entry.key); + if (prop_name.empty()) { + throw mgp::ValueException(fmt::format("knn: Property name cannot be empty in map at index {}", i)); + } + + // Validate metric value + if (!entry.value.IsString()) { + throw mgp::ValueException(fmt::format("knn: Metric value must be a string for property '{}' in map at index {}", prop_name, i)); + } + + std::string metric_str = std::string(entry.value.ValueString()); + if (metric_str.empty()) { + throw mgp::ValueException(fmt::format("knn: Metric value cannot be empty for property '{}' in map at index {}", prop_name, i)); + } + + if (!IsValidSimilarityFunction(metric_str)) { + throw mgp::ValueException(fmt::format("knn: Invalid metric '{}' for property '{}' in map at index {}. 
Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name, i)); + } + + knn_util::SimilarityFunction metric = ParseSimilarityFunction(metric_str); + properties.emplace_back(prop_name, metric); + } + } else { + throw mgp::ValueException(fmt::format("knn: Property list element at index {} must be a string or map", i)); + } + } + } else { + throw mgp::ValueException("knn: nodeProperties must be a string, map, or list"); + } + + if (properties.empty()) { + throw mgp::ValueException("knn: No valid properties found in nodeProperties configuration"); + } + + return properties; +} + +// Helper function to insert results into record factory +void InsertResults(const std::vector>& results, const mgp::RecordFactory& record_factory) { + for (const auto& result : results) { + auto new_record = record_factory.NewRecord(); + new_record.Insert(kFieldNode.data(), std::get<0>(result)); + new_record.Insert(kFieldNeighbour.data(), std::get<1>(result)); + new_record.Insert(kFieldSimilarity.data(), std::get<2>(result)); + } +} + +// Get procedure - returns similarity pairs +void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory) { + mgp::MemoryDispatcherGuard guard{memory}; + const auto record_factory = mgp::RecordFactory(result); + const auto &arguments = mgp::List(args); + const auto &config_map = arguments[0].ValueMap(); + + try { + knn_util::KNNConfig config; + + // Parse node properties - required parameter + if (!config_map.KeyExists(kConfigNodeProperties)) { + throw mgp::ValueException("knn: Required parameter 'nodeProperties' is missing from config"); + } + + config.node_properties = ParseNodeProperties(config_map[kConfigNodeProperties]); + + // Parse other parameters with defaults + config.top_k = config_map.KeyExists(kConfigTopK) ? + static_cast(config_map[kConfigTopK].ValueInt()) : kDefaultTopK; + config.similarity_cutoff = config_map.KeyExists(kConfigSimilarityCutoff) ? + config_map[kConfigSimilarityCutoff].ValueDouble() : kDefaultSimilarityCutoff; + config.delta_threshold = config_map.KeyExists(kConfigDeltaThreshold) ? + config_map[kConfigDeltaThreshold].ValueDouble() : kDefaultDeltaThreshold; + config.max_iterations = config_map.KeyExists(kConfigMaxIterations) ? + static_cast(config_map[kConfigMaxIterations].ValueInt()) : kDefaultMaxIterations; + // Parse concurrency first (needed for validation) + config.concurrency = config_map.KeyExists(kConfigConcurrency) ? + static_cast(config_map[kConfigConcurrency].ValueInt()) : kDefaultConcurrency; + + // Parse random seed with validation + if (config_map.KeyExists(kConfigRandomSeed)) { + config.random_seed = static_cast(config_map[kConfigRandomSeed].ValueInt()); + // If seed is provided, concurrency must be 1 for deterministic results + if (config.concurrency != 1) { + throw mgp::ValueException("knn: When 'randomSeed' is specified, 'concurrency' must be set to 1 for deterministic results"); + } + } else { + // Generate completely random seed + std::random_device rd; + config.random_seed = static_cast(rd()); + } + + config.sample_rate = config_map.KeyExists(kConfigSampleRate) ? + config_map[kConfigSampleRate].ValueDouble() : kDefaultSampleRate; + + // Parse initial sampler + if (config_map.KeyExists(kConfigInitialSampler)) { + std::string sampler_str = std::string(config_map[kConfigInitialSampler].ValueString()); + if (!IsValidInitialSampler(sampler_str)) { + throw mgp::ValueException(fmt::format("knn: Invalid initialSampler '{}'. 
Valid values are: uniform, randomWalk", sampler_str)); + } + // Convert to lowercase for consistency + std::transform(sampler_str.begin(), sampler_str.end(), sampler_str.begin(), ::tolower); + config.initial_sampler = sampler_str; + } else { + config.initial_sampler = kDefaultInitialSampler; + } + + // Parse default similarity function + if (config_map.KeyExists(kConfigSimilarityFunction)) { + std::string func_str = std::string(config_map[kConfigSimilarityFunction].ValueString()); + if (!IsValidSimilarityFunction(func_str)) { + throw mgp::ValueException(fmt::format("knn: Invalid similarityFunction '{}'. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", func_str)); + } + config.default_similarity_function = ParseSimilarityFunction(func_str); + } else { + config.default_similarity_function = knn_util::SimilarityFunction::COSINE; // Default + } + + // Validate all parameter ranges + ValidateParameterRanges(config); + + auto results = knn_algs::CalculateKNN(mgp::Graph(memgraph_graph), config); + InsertResults(results, record_factory); + } catch (const mgp::ValueException &e) { + record_factory.SetErrorMessage(e.what()); + } catch (const std::exception &e) { + record_factory.SetErrorMessage(fmt::format("Unexpected error: {}", e.what())); + } +} + + +extern "C" int mgp_init_module(struct mgp_module *module, struct mgp_memory *memory) { + try { + mgp::MemoryDispatcherGuard guard{memory}; + + // Return types for get procedure + std::vector returns = { + mgp::Return(kFieldNode, mgp::Type::Node), + mgp::Return(kFieldNeighbour, mgp::Type::Node), + mgp::Return(kFieldSimilarity, mgp::Type::Double) + }; + + // Single config parameter + std::vector parameters = { + mgp::Parameter(kArgumentConfig, mgp::Type::Map) + }; + + // Add the single get procedure + mgp::AddProcedure(Get, kProcedureGet, mgp::ProcedureType::Read, + parameters, returns, module, memory); + + } catch(const std::exception &e) { + return 1; + } + return 0; +} + +extern "C" int mgp_shutdown_module() { + return 0; +} From 9c51c82dd34f44b354a90ff6c18603a6efa0a2ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Wed, 17 Sep 2025 18:29:08 +0200 Subject: [PATCH 02/12] Improve parallelization --- cpp/CMakeLists.txt | 4 +- cpp/knn_module/algorithms/knn.hpp | 217 +++++++++++++++--------------- cpp/knn_module/knn_module.cpp | 54 ++++---- 3 files changed, 137 insertions(+), 138 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b2369e304..4daeaae71 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,10 +28,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall \ # Don't omit frame pointer in RelWithDebInfo, for additional callchain debug. set(CMAKE_CXX_FLAGS_RELWITHDEBINFO - "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-omit-frame-pointer") + "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -march=x86-64-v3 -ffast-math -fopt-info-vec-optimized -fno-omit-frame-pointer") # Release flags. 
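+# Note: -march=x86-64-v3 assumes an AVX2-capable build and runtime host, and -ffast-math trades
+# strict IEEE floating-point semantics for better auto-vectorization of the similarity loops.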
-set(CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=x86-64-v3 -ffast-math -DNDEBUG") set(CMAKE_SHARED_LIBRARY_PREFIX "") find_package(Threads REQUIRED) diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp index fc6f705af..c1388df36 100644 --- a/cpp/knn_module/algorithms/knn.hpp +++ b/cpp/knn_module/algorithms/knn.hpp @@ -1,13 +1,13 @@ #include #include +#include #include #include #include #include -#include #include -#include +#include namespace knn_util { @@ -99,22 +99,18 @@ std::vector ExtractPropertyValues(const mgp::Node &node, const std::vect } // Cosine similarity between two vectors -double CosineSimilarity(const std::vector &vec1, const std::vector &vec2) { - if (vec1.size() != vec2.size()) { - throw mgp::ValueException("Vectors must have the same size for cosine similarity"); - } +double CosineSimilarity(const std::vector &vec1, const std::vector &vec2, double norm1, double norm2) { + const size_t n = vec1.size(); + const double *a = vec1.data(); + const double *b = vec2.data(); double dot_product = 0.0; - double norm1 = 0.0; - double norm2 = 0.0; - for (size_t i = 0; i < vec1.size(); ++i) { - dot_product += vec1[i] * vec2[i]; - norm1 += vec1[i] * vec1[i]; - norm2 += vec2[i] * vec2[i]; + for (size_t i = 0; i < n; ++i) { + dot_product += a[i] * b[i]; } - double denominator = std::sqrt(norm1) * std::sqrt(norm2); + double denominator = norm1 * norm2; if (denominator < 1e-9) { return 0.0; } @@ -124,10 +120,6 @@ double CosineSimilarity(const std::vector &vec1, const std::vector &vec1, const std::vector &vec2) { - if (vec1.size() != vec2.size()) { - throw mgp::ValueException("Vectors must have the same size for euclidean similarity"); - } - double sum_squared_diff = 0.0; for (size_t i = 0; i < vec1.size(); ++i) { double diff = vec1[i] - vec2[i]; @@ -140,10 +132,6 @@ double EuclideanSimilarity(const std::vector &vec1, const std::vector &vec1, const std::vector &vec2) { - if (vec1.size() != vec2.size()) { - throw mgp::ValueException("Vectors must have the same size for pearson similarity"); - } - if (vec1.size() < 2) { return 1.0; // Perfect correlation for single values } @@ -180,10 +168,6 @@ double PearsonSimilarity(const std::vector &vec1, const std::vector &vec1, const std::vector &vec2) { - if (vec1.size() != vec2.size()) { - throw mgp::ValueException("Vectors must have the same size for overlap similarity"); - } - // For numeric vectors, we consider values as "overlapping" if they're close const double threshold = 1e-6; int overlap_count = 0; @@ -204,10 +188,6 @@ double OverlapSimilarity(const std::vector &vec1, const std::vector &vec1, const std::vector &vec2) { - if (vec1.size() != vec2.size()) { - throw mgp::ValueException("Vectors must have the same size for jaccard similarity"); - } - // For binary vectors (0 or 1), Jaccard = intersection / union int intersection_count = 0; int union_count = 0; @@ -267,65 +247,69 @@ knn_util::SimilarityFunction GetDefaultSimilarityFunction(const mgp::Value &prop } } - // Structure to hold pre-loaded node data for efficient comparison struct NodeData { mgp::Node node; - std::vector> property_values; // One vector per property + std::vector> property_values; // One vector per property + std::vector norms; std::vector resolved_metrics; // Resolved metrics per property - - NodeData(const mgp::Node& n, size_t num_properties) - : node(n), property_values(num_properties), resolved_metrics(num_properties) {} + + NodeData(const mgp::Node &n, size_t num_properties) + : node(n), 
property_values(num_properties), resolved_metrics(num_properties) {} }; // Pre-load node properties into memory for efficient comparison -std::vector PreloadNodeData(const std::vector& nodes, const knn_util::KNNConfig& config) { +std::vector PreloadNodeData(const std::vector &nodes, const knn_util::KNNConfig &config) { std::vector node_data; node_data.reserve(nodes.size()); - + if (config.node_properties.empty()) { throw mgp::ValueException("No node properties configured for similarity calculation"); } - - for (const auto& node : nodes) { + + for (const auto &node : nodes) { NodeData node_info(node, config.node_properties.size()); - + // Load each property - throw immediately on any error for (size_t prop_idx = 0; prop_idx < config.node_properties.size(); ++prop_idx) { - const auto& prop_config = config.node_properties[prop_idx]; - + const auto &prop_config = config.node_properties[prop_idx]; + mgp::Value prop_value = node.GetProperty(prop_config.name); std::vector values; - + // Resolve DEFAULT metric based on data type knn_util::SimilarityFunction resolved_metric = prop_config.metric; if (resolved_metric == knn_util::SimilarityFunction::DEFAULT) { if (prop_value.IsNumeric()) { // Single value - will use scalar formula - resolved_metric = knn_util::SimilarityFunction::DEFAULT; // Keep as DEFAULT for scalar + resolved_metric = knn_util::SimilarityFunction::DEFAULT; // Keep as DEFAULT for scalar } else if (prop_value.IsList()) { mgp::List list = prop_value.ValueList(); if (list.Size() > 0 && list[0].IsNumeric()) { // Infer based on first element type - resolved_metric = list[0].IsInt() ? knn_util::SimilarityFunction::JACCARD : knn_util::SimilarityFunction::COSINE; + resolved_metric = + list[0].IsInt() ? knn_util::SimilarityFunction::JACCARD : knn_util::SimilarityFunction::COSINE; } } } - + // Determine expected data type based on resolved metric - bool expects_integers = (resolved_metric == knn_util::SimilarityFunction::JACCARD || - resolved_metric == knn_util::SimilarityFunction::OVERLAP); - + bool expects_integers = (resolved_metric == knn_util::SimilarityFunction::JACCARD || + resolved_metric == knn_util::SimilarityFunction::OVERLAP); + if (prop_value.IsNumeric()) { // For scalar numbers, validate type and store the single value if (expects_integers && !prop_value.IsInt()) { - throw mgp::ValueException(fmt::format("Property {} must be integer for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); + throw mgp::ValueException( + fmt::format("Property {} must be integer for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); } if (!expects_integers && !prop_value.IsDouble()) { throw mgp::ValueException(fmt::format("Property {} must be double for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" : - (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) ? "EUCLIDEAN" : "PEARSON")); + (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" + : (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) + ? 
"EUCLIDEAN" + : "PEARSON")); } values.push_back(prop_value.ValueNumeric()); } else if (prop_value.IsList()) { @@ -334,16 +318,19 @@ std::vector PreloadNodeData(const std::vector& nodes, const if (list.Size() > 0 && list[0].IsNumeric()) { // Check type of first element only if (expects_integers && !list[0].IsInt()) { - throw mgp::ValueException(fmt::format("Property {} list elements must be integers for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); + throw mgp::ValueException( + fmt::format("Property {} list elements must be integers for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); } if (!expects_integers && !list[0].IsDouble()) { - throw mgp::ValueException(fmt::format("Property {} list elements must be doubles for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" : - (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) ? "EUCLIDEAN" : "PEARSON")); + throw mgp::ValueException( + fmt::format("Property {} list elements must be doubles for {} metric", prop_config.name, + (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" + : (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) ? "EUCLIDEAN" + : "PEARSON")); } } - + // Extract all numeric values (trusting the rest are the same type) for (size_t i = 0; i < list.Size(); ++i) { if (list[i].IsNumeric()) { @@ -351,38 +338,61 @@ std::vector PreloadNodeData(const std::vector& nodes, const } } } else { - throw mgp::ValueException(fmt::format("Property {} must be numeric or list of numbers for similarity calculation", prop_config.name)); + throw mgp::ValueException( + fmt::format("Property {} must be numeric or list of numbers for similarity calculation", prop_config.name)); } - + if (values.empty()) { - throw mgp::ValueException(fmt::format("Invalid property values: empty lists for property {}", prop_config.name)); + throw mgp::ValueException( + fmt::format("Invalid property values: empty lists for property {}", prop_config.name)); } - + node_info.property_values[prop_idx] = values; node_info.resolved_metrics[prop_idx] = resolved_metric; } - + node_data.push_back(node_info); } - + + for (auto i = 1; i < node_data.size(); i++) { + for (auto j = 0; j < node_data[i].property_values.size(); j++) { + if (node_data[i].property_values[j].size() != node_data[0].property_values[j].size()) { + throw mgp::ValueException("Vectors must have the same size for similarity calculation"); + } + } + } + return node_data; } +void PreloadNorms(std::vector &node_data, const knn_util::KNNConfig &config) { + for (auto &node : node_data) { + for (auto i = 0; i < node.property_values.size(); i++) { + if (config.node_properties[i].metric == knn_util::SimilarityFunction::COSINE) { + node.norms.push_back(std::sqrt(std::inner_product(node.property_values[i].begin(), node.property_values[i].end(), node.property_values[i].begin(), 0.0))); + } else { + node.norms.push_back(0.0); + } + } + } +} + // Calculate similarity between pre-loaded node data -double CalculateNodeSimilarity(const NodeData& node1_data, const NodeData& node2_data, const knn_util::KNNConfig& config) { +double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2_data, + const knn_util::KNNConfig &config) { double total_similarity = 0.0; const size_t num_properties = config.node_properties.size(); - + // Calculate similarity for each property and compute the mean for 
(size_t prop_idx = 0; prop_idx < num_properties; ++prop_idx) { - const auto& values1 = node1_data.property_values[prop_idx]; - const auto& values2 = node2_data.property_values[prop_idx]; - + const auto &values1 = node1_data.property_values[prop_idx]; + const auto &values2 = node2_data.property_values[prop_idx]; + double property_similarity = 0.0; - + // Use the pre-resolved metric from NodeData knn_util::SimilarityFunction metric = node1_data.resolved_metrics[prop_idx]; - + // For scalar numbers, use the formula: 1 / (1 + |a - b|) if (values1.size() == 1) { property_similarity = 1.0 / (1.0 + std::abs(values1[0] - values2[0])); @@ -390,7 +400,7 @@ double CalculateNodeSimilarity(const NodeData& node1_data, const NodeData& node2 // For vectors, use the pre-resolved similarity function switch (metric) { case knn_util::SimilarityFunction::COSINE: - property_similarity = CosineSimilarity(values1, values2); + property_similarity = CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); break; case knn_util::SimilarityFunction::EUCLIDEAN: property_similarity = EuclideanSimilarity(values1, values2); @@ -405,20 +415,20 @@ double CalculateNodeSimilarity(const NodeData& node1_data, const NodeData& node2 property_similarity = JaccardSimilarity(values1, values2); break; default: - property_similarity = CosineSimilarity(values1, values2); + property_similarity = CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); break; } } - + total_similarity += property_similarity; } - + // Return the mean of all property similarities return total_similarity / num_properties; } // Validate configuration parameters -void ValidateConfig(const knn_util::KNNConfig& config) { +void ValidateConfig(const knn_util::KNNConfig &config) { if (config.initial_sampler == knn_util::kSamplerRandomWalk) { throw mgp::ValueException("Random walk sampling not implemented"); } else if (config.initial_sampler != knn_util::kSamplerUniform) { @@ -427,13 +437,9 @@ void ValidateConfig(const knn_util::KNNConfig& config) { } // Get candidate indices for comparison, excluding self -std::vector GetCandidateIndices( - size_t node_idx, - size_t total_nodes, - const knn_util::KNNConfig& config) { - +std::vector GetCandidateIndices(size_t node_idx, size_t total_nodes, const knn_util::KNNConfig &config) { std::vector comparison_indices; - + if (config.sample_rate < 1.0) { // Create indices for all nodes except self std::vector all_indices; @@ -443,15 +449,15 @@ std::vector GetCandidateIndices( all_indices.push_back(i); } } - + // Shuffle indices for uniform sampling std::mt19937 rng(config.random_seed); std::shuffle(all_indices.begin(), all_indices.end(), rng); - + // Calculate sample size size_t sample_size = static_cast(all_indices.size() * config.sample_rate); comparison_indices.reserve(sample_size); - + // Take the first sample_size indices for (size_t i = 0; i < sample_size; ++i) { comparison_indices.push_back(all_indices[i]); @@ -465,19 +471,16 @@ std::vector GetCandidateIndices( } } } - + return comparison_indices; } // Calculate similarity for one node against all candidates (parallel implementation) -std::vector CalculateSimilarityForNode( - size_t node_idx, - const std::vector& node_data, - const std::vector& comparison_indices, - const knn_util::KNNConfig& config) { - - const auto& node1_data = node_data[node_idx]; - +std::vector CalculateSimilarityForNode(size_t node_idx, const std::vector &node_data, + const std::vector &comparison_indices, + const 
knn_util::KNNConfig &config) { + const auto &node1_data = node_data[node_idx]; + // Pre-allocate results vector std::vector results; results.reserve(comparison_indices.size()); @@ -498,15 +501,15 @@ std::vector CalculateSimilarityForNode( #pragma omp parallel for for (size_t i = 0; i < array_size; ++i) { size_t idx = comparison_indices_array[i]; - const auto& node2_data = node_data[idx]; + const auto &node2_data = node_data[idx]; double similarity = CalculateNodeSimilarity(node1_data, node2_data, config); - + // Store result (will be filtered later) parallel_results[i] = knn_util::KNNResult(node1_data.node.Id(), node2_data.node.Id(), similarity); } // Filter results based on similarity cutoff and add to final results - for (const auto& result : parallel_results) { + for (const auto &result : parallel_results) { if (result.similarity >= config.similarity_cutoff) { results.push_back(result); } @@ -516,25 +519,20 @@ std::vector CalculateSimilarityForNode( } // Sort and insert top-k results into final results -void InsertTopKResults( - const std::vector& top_k_results, - const mgp::Graph& graph, - std::vector>& final_results) { - +void InsertTopKResults(const std::vector &top_k_results, const mgp::Graph &graph, + std::vector> &final_results) { // Sort by similarity (descending) std::vector sorted_results = top_k_results; std::sort(sorted_results.begin(), sorted_results.end(), - [](const knn_util::KNNResult& a, const knn_util::KNNResult& b) { - return a.similarity > b.similarity; - }); + [](const knn_util::KNNResult &a, const knn_util::KNNResult &b) { return a.similarity > b.similarity; }); // Convert to final results with actual nodes - for (const auto& result : sorted_results) { + for (const auto &result : sorted_results) { try { auto node1 = graph.GetNodeById(result.node1_id); auto node2 = graph.GetNodeById(result.node2_id); final_results.push_back(std::make_tuple(node1, node2, result.similarity)); - } catch (const std::exception& e) { + } catch (const std::exception &e) { // Skip if node not found continue; } @@ -561,6 +559,7 @@ std::vector> CalculateKNN(const mgp::Gr // Pre-load node properties into memory for efficient comparison std::vector node_data = PreloadNodeData(nodes, config); + PreloadNorms(node_data, config); // For each node, find its top-k most similar nodes for (size_t i = 0; i < node_data.size(); ++i) { @@ -568,8 +567,8 @@ std::vector> CalculateKNN(const mgp::Gr std::vector comparison_indices = GetCandidateIndices(i, node_data.size(), config); // 2. 
Calculate similarity for one node - std::vector top_k_results = CalculateSimilarityForNode( - i, node_data, comparison_indices, config); + std::vector top_k_results = + CalculateSimilarityForNode(i, node_data, comparison_indices, config); // Take only top-k results if (top_k_results.size() > static_cast(config.top_k)) { diff --git a/cpp/knn_module/knn_module.cpp b/cpp/knn_module/knn_module.cpp index e54129c93..ac1c5f48f 100644 --- a/cpp/knn_module/knn_module.cpp +++ b/cpp/knn_module/knn_module.cpp @@ -70,33 +70,33 @@ bool IsValidInitialSampler(const std::string& sampler_str) { void ValidateParameterRanges(const knn_util::KNNConfig& config) { // Validate range [0, 1] parameters if (config.sample_rate < 0.0 || config.sample_rate > 1.0) { - throw mgp::ValueException(fmt::format("knn: sampleRate must be between 0 and 1, got {}", config.sample_rate)); + throw mgp::ValueException(fmt::format("sampleRate must be between 0 and 1, got {}", config.sample_rate)); } if (config.delta_threshold < 0.0 || config.delta_threshold > 1.0) { - throw mgp::ValueException(fmt::format("knn: deltaThreshold must be between 0 and 1, got {}", config.delta_threshold)); + throw mgp::ValueException(fmt::format("deltaThreshold must be between 0 and 1, got {}", config.delta_threshold)); } if (config.similarity_cutoff < 0.0 || config.similarity_cutoff > 1.0) { - throw mgp::ValueException(fmt::format("knn: similarityCutoff must be between 0 and 1, got {}", config.similarity_cutoff)); + throw mgp::ValueException(fmt::format("similarityCutoff must be between 0 and 1, got {}", config.similarity_cutoff)); } // Validate positive integer parameters if (config.top_k <= 0) { - throw mgp::ValueException(fmt::format("knn: topK must be a positive integer, got {}", config.top_k)); + throw mgp::ValueException(fmt::format("topK must be a positive integer, got {}", config.top_k)); } if (config.concurrency <= 0) { - throw mgp::ValueException(fmt::format("knn: concurrency must be a positive integer, got {}", config.concurrency)); + throw mgp::ValueException(fmt::format("concurrency must be a positive integer, got {}", config.concurrency)); } if (config.max_iterations <= 0) { - throw mgp::ValueException(fmt::format("knn: maxIterations must be a positive integer, got {}", config.max_iterations)); + throw mgp::ValueException(fmt::format("maxIterations must be a positive integer, got {}", config.max_iterations)); } // randomSeed can be negative, so we only check it's not zero if (config.random_seed == 0) { - throw mgp::ValueException("knn: randomSeed cannot be 0"); + throw mgp::ValueException("randomSeed cannot be 0"); } } @@ -127,35 +127,35 @@ std::vector ParseNodeProperties(const mgp::Value& node // Single property name - use default similarity function std::string prop_name = std::string(node_props_value.ValueString()); if (prop_name.empty()) { - throw mgp::ValueException("knn: Property name cannot be empty"); + throw mgp::ValueException("Property name cannot be empty"); } properties.emplace_back(prop_name, knn_util::SimilarityFunction::DEFAULT); } else if (node_props_value.IsMap()) { // Map of property names to metrics mgp::Map prop_map = node_props_value.ValueMap(); if (prop_map.Size() == 0) { - throw mgp::ValueException("knn: Property map cannot be empty"); + throw mgp::ValueException("Property map cannot be empty"); } for (const auto& entry : prop_map) { // Validate property name std::string prop_name = std::string(entry.key); if (prop_name.empty()) { - throw mgp::ValueException("knn: Property name cannot be empty"); + throw 
mgp::ValueException("Property name cannot be empty"); } // Validate metric value if (!entry.value.IsString()) { - throw mgp::ValueException(fmt::format("knn: Metric value must be a string for property '{}'", prop_name)); + throw mgp::ValueException(fmt::format("Metric value must be a string for property '{}'", prop_name)); } std::string metric_str = std::string(entry.value.ValueString()); if (metric_str.empty()) { - throw mgp::ValueException(fmt::format("knn: Metric value cannot be empty for property '{}'", prop_name)); + throw mgp::ValueException(fmt::format("Metric value cannot be empty for property '{}'", prop_name)); } if (!IsValidSimilarityFunction(metric_str)) { - throw mgp::ValueException(fmt::format("knn: Invalid metric '{}' for property '{}'. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name)); + throw mgp::ValueException(fmt::format("Invalid metric '{}' for property '{}'. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name)); } knn_util::SimilarityFunction metric = ParseSimilarityFunction(metric_str); @@ -165,7 +165,7 @@ std::vector ParseNodeProperties(const mgp::Value& node // List of strings and/or maps mgp::List prop_list = node_props_value.ValueList(); if (prop_list.Size() == 0) { - throw mgp::ValueException("knn: Property list cannot be empty"); + throw mgp::ValueException("Property list cannot be empty"); } for (size_t i = 0; i < prop_list.Size(); ++i) { @@ -173,50 +173,50 @@ std::vector ParseNodeProperties(const mgp::Value& node // String property name - use default similarity function std::string prop_name = std::string(prop_list[i].ValueString()); if (prop_name.empty()) { - throw mgp::ValueException(fmt::format("knn: Property name at index {} cannot be empty", i)); + throw mgp::ValueException(fmt::format("Property name at index {} cannot be empty", i)); } properties.emplace_back(prop_name, knn_util::SimilarityFunction::DEFAULT); } else if (prop_list[i].IsMap()) { // Map entry mgp::Map prop_map = prop_list[i].ValueMap(); if (prop_map.Size() == 0) { - throw mgp::ValueException(fmt::format("knn: Property map at index {} cannot be empty", i)); + throw mgp::ValueException(fmt::format("Property map at index {} cannot be empty", i)); } for (const auto& entry : prop_map) { // Validate property name std::string prop_name = std::string(entry.key); if (prop_name.empty()) { - throw mgp::ValueException(fmt::format("knn: Property name cannot be empty in map at index {}", i)); + throw mgp::ValueException(fmt::format("Property name cannot be empty in map at index {}", i)); } // Validate metric value if (!entry.value.IsString()) { - throw mgp::ValueException(fmt::format("knn: Metric value must be a string for property '{}' in map at index {}", prop_name, i)); + throw mgp::ValueException(fmt::format("Metric value must be a string for property '{}' in map at index {}", prop_name, i)); } std::string metric_str = std::string(entry.value.ValueString()); if (metric_str.empty()) { - throw mgp::ValueException(fmt::format("knn: Metric value cannot be empty for property '{}' in map at index {}", prop_name, i)); + throw mgp::ValueException(fmt::format("Metric value cannot be empty for property '{}' in map at index {}", prop_name, i)); } if (!IsValidSimilarityFunction(metric_str)) { - throw mgp::ValueException(fmt::format("knn: Invalid metric '{}' for property '{}' in map at index {}. 
Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name, i)); + throw mgp::ValueException(fmt::format("Invalid metric '{}' for property '{}' in map at index {}. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name, i)); } knn_util::SimilarityFunction metric = ParseSimilarityFunction(metric_str); properties.emplace_back(prop_name, metric); } } else { - throw mgp::ValueException(fmt::format("knn: Property list element at index {} must be a string or map", i)); + throw mgp::ValueException(fmt::format("Property list element at index {} must be a string or map", i)); } } } else { - throw mgp::ValueException("knn: nodeProperties must be a string, map, or list"); + throw mgp::ValueException("nodeProperties must be a string, map, or list"); } if (properties.empty()) { - throw mgp::ValueException("knn: No valid properties found in nodeProperties configuration"); + throw mgp::ValueException("No valid properties found in nodeProperties configuration"); } return properties; @@ -244,7 +244,7 @@ void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memo // Parse node properties - required parameter if (!config_map.KeyExists(kConfigNodeProperties)) { - throw mgp::ValueException("knn: Required parameter 'nodeProperties' is missing from config"); + throw mgp::ValueException("Required parameter 'nodeProperties' is missing from config"); } config.node_properties = ParseNodeProperties(config_map[kConfigNodeProperties]); @@ -267,7 +267,7 @@ void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memo config.random_seed = static_cast(config_map[kConfigRandomSeed].ValueInt()); // If seed is provided, concurrency must be 1 for deterministic results if (config.concurrency != 1) { - throw mgp::ValueException("knn: When 'randomSeed' is specified, 'concurrency' must be set to 1 for deterministic results"); + throw mgp::ValueException("When 'randomSeed' is specified, 'concurrency' must be set to 1 for deterministic results"); } } else { // Generate completely random seed @@ -282,7 +282,7 @@ void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memo if (config_map.KeyExists(kConfigInitialSampler)) { std::string sampler_str = std::string(config_map[kConfigInitialSampler].ValueString()); if (!IsValidInitialSampler(sampler_str)) { - throw mgp::ValueException(fmt::format("knn: Invalid initialSampler '{}'. Valid values are: uniform, randomWalk", sampler_str)); + throw mgp::ValueException(fmt::format("Invalid initialSampler '{}'. Valid values are: uniform, randomWalk", sampler_str)); } // Convert to lowercase for consistency std::transform(sampler_str.begin(), sampler_str.end(), sampler_str.begin(), ::tolower); @@ -295,7 +295,7 @@ void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memo if (config_map.KeyExists(kConfigSimilarityFunction)) { std::string func_str = std::string(config_map[kConfigSimilarityFunction].ValueString()); if (!IsValidSimilarityFunction(func_str)) { - throw mgp::ValueException(fmt::format("knn: Invalid similarityFunction '{}'. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", func_str)); + throw mgp::ValueException(fmt::format("Invalid similarityFunction '{}'. 
Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", func_str)); } config.default_similarity_function = ParseSimilarityFunction(func_str); } else { From 09df1a3203a1324df48aebb3a0d8be9d7eb73bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Wed, 17 Sep 2025 19:23:13 +0200 Subject: [PATCH 03/12] Parallelize norms --- cpp/CMakeLists.txt | 2 +- cpp/knn_module/algorithms/knn.hpp | 84 +++++++++++++++---------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4daeaae71..c9fa299ec 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall \ # Don't omit frame pointer in RelWithDebInfo, for additional callchain debug. set(CMAKE_CXX_FLAGS_RELWITHDEBINFO - "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -march=x86-64-v3 -ffast-math -fopt-info-vec-optimized -fno-omit-frame-pointer") + "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -march=x86-64-v3 -ffast-math -fopenmp -fopt-info-vec-optimized -fno-omit-frame-pointer") # Release flags. set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=x86-64-v3 -ffast-math -DNDEBUG") diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp index c1388df36..e8a6110d4 100644 --- a/cpp/knn_module/algorithms/knn.hpp +++ b/cpp/knn_module/algorithms/knn.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ struct KNNResult { namespace knn_algs { + // Extract property values from a node std::vector ExtractPropertyValues(const mgp::Node &node, const std::vector &properties) { std::vector values; @@ -98,24 +100,13 @@ std::vector ExtractPropertyValues(const mgp::Node &node, const std::vect return values; } -// Cosine similarity between two vectors -double CosineSimilarity(const std::vector &vec1, const std::vector &vec2, double norm1, double norm2) { - const size_t n = vec1.size(); - const double *a = vec1.data(); - const double *b = vec2.data(); - - double dot_product = 0.0; - - for (size_t i = 0; i < n; ++i) { - dot_product += a[i] * b[i]; - } - - double denominator = norm1 * norm2; - if (denominator < 1e-9) { - return 0.0; - } +inline double CosineSimilarity(const std::vector &vec1, const std::vector &vec2, double norm1, double norm2) { + const double dot = + std::transform_reduce(vec1.begin(), vec1.end(), vec2.begin(), 0.0, std::plus<>(), std::multiplies<>()); - return dot_product / denominator; + const double denom = norm1 * norm2; + if (denom < 1e-9) return 0.0; + return dot / denom; } // Euclidean similarity (1 / (1 + distance)) @@ -250,7 +241,7 @@ knn_util::SimilarityFunction GetDefaultSimilarityFunction(const mgp::Value &prop // Structure to hold pre-loaded node data for efficient comparison struct NodeData { mgp::Node node; - std::vector> property_values; // One vector per property + std::vector> property_values; // One vector per property std::vector norms; std::vector resolved_metrics; // Resolved metrics per property @@ -354,8 +345,8 @@ std::vector PreloadNodeData(const std::vector &nodes, const node_data.push_back(node_info); } - for (auto i = 1; i < node_data.size(); i++) { - for (auto j = 0; j < node_data[i].property_values.size(); j++) { + for (size_t i = 1; i < node_data.size(); i++) { + for (size_t j = 0; j < node_data[i].property_values.size(); j++) { if (node_data[i].property_values[j].size() != node_data[0].property_values[j].size()) { throw mgp::ValueException("Vectors must have the same size for similarity calculation"); } @@ -366,12 
+357,14 @@ std::vector PreloadNodeData(const std::vector &nodes, const } void PreloadNorms(std::vector<NodeData> &node_data, const knn_util::KNNConfig &config) { - for (auto &node : node_data) { - for (auto i = 0; i < node.property_values.size(); i++) { - if (config.node_properties[i].metric == knn_util::SimilarityFunction::COSINE) { - node.norms.push_back(std::sqrt(std::inner_product(node.property_values[i].begin(), node.property_values[i].end(), node.property_values[i].begin(), 0.0))); - } else { - node.norms.push_back(0.0); + #pragma omp parallel for + for (size_t ni = 0; ni < node_data.size(); ++ni) { + auto &node = node_data[ni]; + node.norms.resize(node.property_values.size(), 0.0); + for (size_t i = 0; i < node.property_values.size(); ++i) { + if (node.resolved_metrics[i] == knn_util::SimilarityFunction::COSINE) { + const auto &v = node.property_values[i]; + node.norms[i] = std::sqrt(std::inner_product(v.begin(), v.end(), v.begin(), 0.0)); } } } @@ -400,7 +393,8 @@ double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2 // For vectors, use the pre-resolved similarity function switch (metric) { case knn_util::SimilarityFunction::COSINE: - property_similarity = CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); + property_similarity = + CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); break; case knn_util::SimilarityFunction::EUCLIDEAN: property_similarity = EuclideanSimilarity(values1, values2); @@ -415,7 +409,8 @@ double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2 property_similarity = JaccardSimilarity(values1, values2); break; default: - property_similarity = CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); + property_similarity = + CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); break; } } @@ -502,9 +497,11 @@ std::vector CalculateSimilarityForNode(size_t node_idx, con for (size_t i = 0; i < array_size; ++i) { size_t idx = comparison_indices_array[i]; const auto &node2_data = node_data[idx]; + + // Calculate similarity directly double similarity = CalculateNodeSimilarity(node1_data, node2_data, config); - // Store result (will be filtered later) + // Store result parallel_results[i] = knn_util::KNNResult(node1_data.node.Id(), node2_data.node.Id(), similarity); } @@ -515,19 +512,27 @@ std::vector CalculateSimilarityForNode(size_t node_idx, con } } + const size_t k = std::min(results.size(), static_cast<size_t>(config.top_k)); + auto cmp = [](const knn_util::KNNResult &a, const knn_util::KNNResult &b) { + return a.similarity > b.similarity; // descending + }; + + if (k > 0 && results.size() > k) { + std::nth_element(results.begin(), results.begin() + k, results.end(), cmp); + results.resize(k); + std::sort(results.begin(), results.end(), cmp); // sort only top-k + } else { + std::sort(results.begin(), results.end(), cmp); // small n or k >= n + } + return results; } -// Sort and insert top-k results into final results +// Insert top-k results into final results void InsertTopKResults(const std::vector<knn_util::KNNResult> &top_k_results, const mgp::Graph &graph, std::vector<std::tuple<mgp::Node, mgp::Node, double>> &final_results) { - // Sort by similarity (descending) - std::vector<knn_util::KNNResult> sorted_results = top_k_results; - std::sort(sorted_results.begin(), sorted_results.end(), - [](const knn_util::KNNResult &a, const knn_util::KNNResult &b) { return a.similarity > b.similarity; }); - - // Convert to final results with actual nodes - for (const 
auto &result : sorted_results) { + // Convert to final results with actual nodes (results are already sorted) + for (const auto &result : top_k_results) { try { auto node1 = graph.GetNodeById(result.node1_id); auto node2 = graph.GetNodeById(result.node2_id); @@ -570,11 +575,6 @@ std::vector> CalculateKNN(const mgp::Gr std::vector top_k_results = CalculateSimilarityForNode(i, node_data, comparison_indices, config); - // Take only top-k results - if (top_k_results.size() > static_cast(config.top_k)) { - top_k_results.erase(top_k_results.begin() + config.top_k, top_k_results.end()); - } - // 3. Insert sorted top-k results InsertTopKResults(top_k_results, graph, results); } From 0774438112d7fccced4c873b098dcd1798f23eb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Wed, 24 Sep 2025 12:15:55 +0200 Subject: [PATCH 04/12] Add implementation of knn with basic tests --- cpp/knn_module/algorithms/knn.hpp | 463 ++++++----------- cpp/knn_module/knn_module.cpp | 489 +++++++----------- e2e/knn_test/test_knn_avg/input.cyp | 2 + e2e/knn_test/test_knn_avg/test.yml | 11 + e2e/knn_test/test_knn_empty/input.cyp | 25 + e2e/knn_test/test_knn_empty/test.yml | 5 + .../test_knn_error_size_of_vectors/input.cyp | 2 + .../test_knn_error_size_of_vectors/test.yml | 5 + e2e/knn_test/test_knn_simple/input.cyp | 2 + e2e/knn_test/test_knn_simple/test.yml | 11 + 10 files changed, 410 insertions(+), 605 deletions(-) create mode 100644 e2e/knn_test/test_knn_avg/input.cyp create mode 100644 e2e/knn_test/test_knn_avg/test.yml create mode 100644 e2e/knn_test/test_knn_empty/input.cyp create mode 100644 e2e/knn_test/test_knn_empty/test.yml create mode 100644 e2e/knn_test/test_knn_error_size_of_vectors/input.cyp create mode 100644 e2e/knn_test/test_knn_error_size_of_vectors/test.yml create mode 100644 e2e/knn_test/test_knn_simple/input.cyp create mode 100644 e2e/knn_test/test_knn_simple/test.yml diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp index e8a6110d4..4a05cd845 100644 --- a/cpp/knn_module/algorithms/knn.hpp +++ b/cpp/knn_module/algorithms/knn.hpp @@ -7,25 +7,12 @@ #include #include #include -#include #include namespace knn_util { -// Similarity functions supported by KNN -enum class SimilarityFunction { COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT }; - -// Initial sampler types -constexpr std::string_view kSamplerUniform = "uniform"; -constexpr std::string_view kSamplerRandomWalk = "randomWalk"; - -// Property configuration for KNN -struct PropertyConfig { - std::string name; - SimilarityFunction metric; - - PropertyConfig(const std::string &prop_name, SimilarityFunction sim_func) : name(prop_name), metric(sim_func) {} -}; +// Aggregate methods for merging property values +enum class AggregateMethod { NONE, APPEND, MIN, MAX, AVG, SUM }; // Configuration for KNN algorithm struct KNNConfig { @@ -36,9 +23,8 @@ struct KNNConfig { int random_seed = 42; double sample_rate = 0.5; int concurrency = 1; - std::string initial_sampler = "uniform"; - std::vector node_properties; - SimilarityFunction default_similarity_function = SimilarityFunction::COSINE; + std::vector node_properties; + AggregateMethod aggregate_method = AggregateMethod::NONE; }; // Result structure for KNN @@ -64,43 +50,8 @@ struct KNNResult { namespace knn_algs { - -// Extract property values from a node -std::vector ExtractPropertyValues(const mgp::Node &node, const std::vector &properties) { - std::vector values; - values.reserve(properties.size()); - - for (const auto &property : properties) { - try 
{ - auto prop_value = node.GetProperty(property); - if (prop_value.IsNull()) { - throw mgp::ValueException(fmt::format("Node missing property: {}", property)); - } - - if (prop_value.IsList()) { - // For vector properties, take the first element or average - auto list = prop_value.ValueList(); - if (list.Size() > 0) { - values.push_back(list[0].ValueDouble()); - } else { - values.push_back(0.0); - } - } else if (prop_value.IsDouble()) { - values.push_back(prop_value.ValueDouble()); - } else if (prop_value.IsInt()) { - values.push_back(static_cast(prop_value.ValueInt())); - } else { - throw mgp::ValueException(fmt::format("Property {} must be numeric or list of numbers", property)); - } - } catch (const mgp::ValueException &e) { - throw mgp::ValueException(fmt::format("Error extracting property {} : {}", property, e.what())); - } - } - - return values; -} - -inline double CosineSimilarity(const std::vector &vec1, const std::vector &vec2, double norm1, double norm2) { +inline double CosineSimilarity(const std::vector &vec1, const std::vector &vec2, const double norm1, + const double norm2) { const double dot = std::transform_reduce(vec1.begin(), vec1.end(), vec2.begin(), 0.0, std::plus<>(), std::multiplies<>()); @@ -109,144 +60,106 @@ inline double CosineSimilarity(const std::vector &vec1, const std::vecto return dot / denom; } -// Euclidean similarity (1 / (1 + distance)) -double EuclideanSimilarity(const std::vector &vec1, const std::vector &vec2) { - double sum_squared_diff = 0.0; - for (size_t i = 0; i < vec1.size(); ++i) { - double diff = vec1[i] - vec2[i]; - sum_squared_diff += diff * diff; - } - - double distance = std::sqrt(sum_squared_diff); - return 1.0 / (1.0 + distance); -} - -// Pearson correlation coefficient -double PearsonSimilarity(const std::vector &vec1, const std::vector &vec2) { - if (vec1.size() < 2) { - return 1.0; // Perfect correlation for single values +// Helper function to aggregate property values based on the specified method +std::vector AggregatePropertyValues(const std::vector> &property_vectors, + knn_util::AggregateMethod method) { + if (property_vectors.empty()) { + throw mgp::ValueException("Properties of the nodes for aggregation cannot be empty!"); } - // Calculate means - double mean1 = 0.0, mean2 = 0.0; - for (size_t i = 0; i < vec1.size(); ++i) { - mean1 += vec1[i]; - mean2 += vec2[i]; - } - mean1 /= vec1.size(); - mean2 /= vec2.size(); - - // Calculate correlation - double numerator = 0.0; - double sum_sq1 = 0.0; - double sum_sq2 = 0.0; - - for (size_t i = 0; i < vec1.size(); ++i) { - double diff1 = vec1[i] - mean1; - double diff2 = vec2[i] - mean2; - numerator += diff1 * diff2; - sum_sq1 += diff1 * diff1; - sum_sq2 += diff2 * diff2; - } - - double denominator = std::sqrt(sum_sq1 * sum_sq2); - if (denominator < 1e-9) { - return 0.0; - } - - return numerator / denominator; -} - -// Overlap similarity (intersection / min size) -double OverlapSimilarity(const std::vector &vec1, const std::vector &vec2) { - // For numeric vectors, we consider values as "overlapping" if they're close - const double threshold = 1e-6; - int overlap_count = 0; - - for (size_t i = 0; i < vec1.size(); ++i) { - if (std::abs(vec1[i] - vec2[i]) < threshold) { - overlap_count++; + // Validate vector sizes for methods that require same-sized vectors + bool requires_same_size = (method == knn_util::AggregateMethod::MIN || method == knn_util::AggregateMethod::MAX || + method == knn_util::AggregateMethod::AVG || method == knn_util::AggregateMethod::SUM); + + if (requires_same_size) { 
+ size_t expected_size = property_vectors[0].size(); + for (size_t i = 1; i < property_vectors.size(); ++i) { + if (property_vectors[i].size() != expected_size) { + throw mgp::ValueException(fmt::format( + "All property vectors must have the same size for aggregation. Expected size: {}, but vector has size: {}", + expected_size, property_vectors[i].size())); + } } } - int min_size = std::min(vec1.size(), vec2.size()); - if (min_size == 0) { - return 0.0; - } - - return static_cast(overlap_count) / min_size; -} - -// Jaccard similarity (intersection / union) -double JaccardSimilarity(const std::vector &vec1, const std::vector &vec2) { - // For binary vectors (0 or 1), Jaccard = intersection / union - int intersection_count = 0; - int union_count = 0; + std::vector result; - for (size_t i = 0; i < vec1.size(); ++i) { - bool has_v1 = vec1[i] > 0; - bool has_v2 = vec2[i] > 0; - - if (has_v1 && has_v2) { - intersection_count++; + switch (method) { + case knn_util::AggregateMethod::NONE: { + throw mgp::ValueException( + "Unexpected error: aggregation of property values happened without aggregation method specified! Please " + "contact Memgraph support."); } - if (has_v1 || has_v2) { - union_count++; + case knn_util::AggregateMethod::APPEND: { + // Concatenate all property vectors + for (const auto &vec : property_vectors) { + result.insert(result.end(), vec.begin(), vec.end()); + } + break; } - } - - if (union_count == 0) { - return 0.0; - } - - return static_cast(intersection_count) / union_count; -} - -// Helper function to determine if a list contains integers -bool IsIntegerList(const mgp::List &list) { - for (size_t i = 0; i < list.Size(); ++i) { - if (list[i].IsNumeric()) { - double val = list[i].ValueNumeric(); - // Check if the numeric value is actually an integer - if (val != std::floor(val)) { - return false; + case knn_util::AggregateMethod::MIN: { + // Take minimum value from each position across all vectors + size_t vector_size = property_vectors[0].size(); + result.resize(vector_size); + for (size_t i = 0; i < vector_size; ++i) { + result[i] = property_vectors[0][i]; + for (size_t j = 1; j < property_vectors.size(); ++j) { + result[i] = std::min(result[i], property_vectors[j][i]); + } } - } else { - return false; + break; } - } - return true; -} - -// Helper function to get default similarity function based on data type -knn_util::SimilarityFunction GetDefaultSimilarityFunction(const mgp::Value &prop_value) { - if (prop_value.IsNumeric()) { - // Single numeric value - use cosine as default - return knn_util::SimilarityFunction::COSINE; - } else if (prop_value.IsList()) { - mgp::List list = prop_value.ValueList(); - if (IsIntegerList(list)) { - // List of integers - use Jaccard as default - return knn_util::SimilarityFunction::JACCARD; - } else { - // List of floats - use Cosine as default - return knn_util::SimilarityFunction::COSINE; + case knn_util::AggregateMethod::MAX: { + // Take maximum value from each position across all vectors + size_t vector_size = property_vectors[0].size(); + result.resize(vector_size); + for (size_t i = 0; i < vector_size; ++i) { + result[i] = property_vectors[0][i]; + for (size_t j = 1; j < property_vectors.size(); ++j) { + result[i] = std::max(result[i], property_vectors[j][i]); + } + } + break; + } + case knn_util::AggregateMethod::AVG: { + // Take average value from each position across all vectors + size_t vector_size = property_vectors[0].size(); + result.resize(vector_size); + for (size_t i = 0; i < vector_size; ++i) { + double sum = 0.0; + for 
(const auto &vec : property_vectors) { + sum += vec[i]; + } + result[i] = sum / property_vectors.size(); + } + break; + } + case knn_util::AggregateMethod::SUM: { + // Take sum value from each position across all vectors + size_t vector_size = property_vectors[0].size(); + result.resize(vector_size); + for (size_t i = 0; i < vector_size; ++i) { + result[i] = 0.0; + for (const auto &vec : property_vectors) { + result[i] += vec[i]; + } + } + break; } - } else { - // Non-numeric property - use Cosine as fallback - return knn_util::SimilarityFunction::COSINE; } + + return result; } // Structure to hold pre-loaded node data for efficient comparison struct NodeData { - mgp::Node node; - std::vector> property_values; // One vector per property - std::vector norms; - std::vector resolved_metrics; // Resolved metrics per property + mgp::Id node_id; + std::vector> + property_values; // One vector per property (for NONE) or single aggregated vector (for aggregation) + std::vector norms; // Norms for each property - NodeData(const mgp::Node &n, size_t num_properties) - : node(n), property_values(num_properties), resolved_metrics(num_properties) {} + NodeData(const mgp::Node &n, const std::vector> &prop_values) + : node_id(n.Id()), property_values(prop_values) {} }; // Pre-load node properties into memory for efficient comparison @@ -254,101 +167,60 @@ std::vector PreloadNodeData(const std::vector &nodes, const std::vector node_data; node_data.reserve(nodes.size()); - if (config.node_properties.empty()) { - throw mgp::ValueException("No node properties configured for similarity calculation"); - } - for (const auto &node : nodes) { - NodeData node_info(node, config.node_properties.size()); + // Collect all property values first + std::vector> property_values(config.node_properties.size()); - // Load each property - throw immediately on any error + // Load all properties into temporary vectors for (size_t prop_idx = 0; prop_idx < config.node_properties.size(); ++prop_idx) { - const auto &prop_config = config.node_properties[prop_idx]; - - mgp::Value prop_value = node.GetProperty(prop_config.name); + const std::string &prop_name = config.node_properties[prop_idx]; + mgp::Value prop_value = node.GetProperty(prop_name); std::vector values; - // Resolve DEFAULT metric based on data type - knn_util::SimilarityFunction resolved_metric = prop_config.metric; - if (resolved_metric == knn_util::SimilarityFunction::DEFAULT) { - if (prop_value.IsNumeric()) { - // Single value - will use scalar formula - resolved_metric = knn_util::SimilarityFunction::DEFAULT; // Keep as DEFAULT for scalar - } else if (prop_value.IsList()) { - mgp::List list = prop_value.ValueList(); - if (list.Size() > 0 && list[0].IsNumeric()) { - // Infer based on first element type - resolved_metric = - list[0].IsInt() ? 
knn_util::SimilarityFunction::JACCARD : knn_util::SimilarityFunction::COSINE; - } - } + if (!prop_value.IsList()) { + throw mgp::ValueException( + fmt::format("Property {} must be a list of doubles for similarity calculation", prop_name)); } - // Determine expected data type based on resolved metric - bool expects_integers = (resolved_metric == knn_util::SimilarityFunction::JACCARD || - resolved_metric == knn_util::SimilarityFunction::OVERLAP); - - if (prop_value.IsNumeric()) { - // For scalar numbers, validate type and store the single value - if (expects_integers && !prop_value.IsInt()) { + mgp::List list = prop_value.ValueList(); + for (size_t i = 0; i < list.Size(); ++i) { + if (!list[i].IsDouble()) { throw mgp::ValueException( - fmt::format("Property {} must be integer for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); - } - if (!expects_integers && !prop_value.IsDouble()) { - throw mgp::ValueException(fmt::format("Property {} must be double for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" - : (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) - ? "EUCLIDEAN" - : "PEARSON")); - } - values.push_back(prop_value.ValueNumeric()); - } else if (prop_value.IsList()) { - // For lists, validate type of first element and extract all numeric values - mgp::List list = prop_value.ValueList(); - if (list.Size() > 0 && list[0].IsNumeric()) { - // Check type of first element only - if (expects_integers && !list[0].IsInt()) { - throw mgp::ValueException( - fmt::format("Property {} list elements must be integers for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::JACCARD) ? "JACCARD" : "OVERLAP")); - } - if (!expects_integers && !list[0].IsDouble()) { - throw mgp::ValueException( - fmt::format("Property {} list elements must be doubles for {} metric", prop_config.name, - (resolved_metric == knn_util::SimilarityFunction::COSINE) ? "COSINE" - : (resolved_metric == knn_util::SimilarityFunction::EUCLIDEAN) ? 
"EUCLIDEAN" - : "PEARSON")); - } - } - - // Extract all numeric values (trusting the rest are the same type) - for (size_t i = 0; i < list.Size(); ++i) { - if (list[i].IsNumeric()) { - values.push_back(list[i].ValueNumeric()); - } + fmt::format("Property {} must be a list of doubles for similarity calculation", prop_name)); } - } else { - throw mgp::ValueException( - fmt::format("Property {} must be numeric or list of numbers for similarity calculation", prop_config.name)); + values.push_back(list[i].ValueDouble()); } if (values.empty()) { - throw mgp::ValueException( - fmt::format("Invalid property values: empty lists for property {}", prop_config.name)); + throw mgp::ValueException(fmt::format("Invalid property values: empty lists for property {}", prop_name)); } - node_info.property_values[prop_idx] = values; - node_info.resolved_metrics[prop_idx] = resolved_metric; + property_values[prop_idx] = values; } + // Handle aggregation if needed + const bool is_aggregated = (config.aggregate_method != knn_util::AggregateMethod::NONE); + if (is_aggregated) { + // Aggregate the property values + std::vector aggregated = AggregatePropertyValues(property_values, config.aggregate_method); + property_values.clear(); + property_values.push_back(aggregated); + } + + // Create node_info at the end with the final property_values + NodeData node_info(node, property_values); node_data.push_back(node_info); } - for (size_t i = 1; i < node_data.size(); i++) { - for (size_t j = 0; j < node_data[i].property_values.size(); j++) { - if (node_data[i].property_values[j].size() != node_data[0].property_values[j].size()) { - throw mgp::ValueException("Vectors must have the same size for similarity calculation"); + // Validate vector sizes + if (node_data.size() > 1) { + // Validate that all property vectors have the same size + for (size_t prop_idx = 0; prop_idx < node_data[0].property_values.size(); ++prop_idx) { + size_t expected_size = node_data[0].property_values[prop_idx].size(); + for (size_t i = 1; i < node_data.size(); ++i) { + if (node_data[i].property_values[prop_idx].size() != expected_size) { + throw mgp::ValueException("Property vectors must have the same size for similarity calculation"); + } } } } @@ -357,15 +229,15 @@ std::vector PreloadNodeData(const std::vector &nodes, const } void PreloadNorms(std::vector &node_data, const knn_util::KNNConfig &config) { - #pragma omp parallel for +#pragma omp parallel for for (size_t ni = 0; ni < node_data.size(); ++ni) { auto &node = node_data[ni]; + + // Calculate norms for each property vector node.norms.resize(node.property_values.size(), 0.0); for (size_t i = 0; i < node.property_values.size(); ++i) { - if (node.resolved_metrics[i] == knn_util::SimilarityFunction::COSINE) { - const auto &v = node.property_values[i]; - node.norms[i] = std::sqrt(std::inner_product(v.begin(), v.end(), v.begin(), 0.0)); - } + const auto &v = node.property_values[i]; + node.norms[i] = std::sqrt(std::inner_product(v.begin(), v.end(), v.begin(), 0.0)); } } } @@ -374,46 +246,15 @@ void PreloadNorms(std::vector &node_data, const knn_util::KNNConfig &c double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2_data, const knn_util::KNNConfig &config) { double total_similarity = 0.0; - const size_t num_properties = config.node_properties.size(); + const size_t num_properties = node1_data.property_values.size(); - // Calculate similarity for each property and compute the mean for (size_t prop_idx = 0; prop_idx < num_properties; ++prop_idx) { const auto &values1 = 
node1_data.property_values[prop_idx]; const auto &values2 = node2_data.property_values[prop_idx]; - double property_similarity = 0.0; - - // Use the pre-resolved metric from NodeData - knn_util::SimilarityFunction metric = node1_data.resolved_metrics[prop_idx]; - - // For scalar numbers, use the formula: 1 / (1 + |a - b|) - if (values1.size() == 1) { - property_similarity = 1.0 / (1.0 + std::abs(values1[0] - values2[0])); - } else { - // For vectors, use the pre-resolved similarity function - switch (metric) { - case knn_util::SimilarityFunction::COSINE: - property_similarity = - CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); - break; - case knn_util::SimilarityFunction::EUCLIDEAN: - property_similarity = EuclideanSimilarity(values1, values2); - break; - case knn_util::SimilarityFunction::PEARSON: - property_similarity = PearsonSimilarity(values1, values2); - break; - case knn_util::SimilarityFunction::OVERLAP: - property_similarity = OverlapSimilarity(values1, values2); - break; - case knn_util::SimilarityFunction::JACCARD: - property_similarity = JaccardSimilarity(values1, values2); - break; - default: - property_similarity = - CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); - break; - } - } + // Use cosine similarity for each property + double property_similarity = + CosineSimilarity(values1, values2, node1_data.norms[prop_idx], node2_data.norms[prop_idx]); total_similarity += property_similarity; } @@ -422,17 +263,9 @@ double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2 return total_similarity / num_properties; } -// Validate configuration parameters -void ValidateConfig(const knn_util::KNNConfig &config) { - if (config.initial_sampler == knn_util::kSamplerRandomWalk) { - throw mgp::ValueException("Random walk sampling not implemented"); - } else if (config.initial_sampler != knn_util::kSamplerUniform) { - throw mgp::ValueException(fmt::format("Unknown initial sampler: {}", config.initial_sampler)); - } -} - // Get candidate indices for comparison, excluding self -std::vector<size_t> GetCandidateIndices(size_t node_idx, size_t total_nodes, const knn_util::KNNConfig &config) { +std::vector<size_t> GetCandidateIndices(const size_t node_idx, const size_t total_nodes, + const knn_util::KNNConfig &config) { std::vector<size_t> comparison_indices; if (config.sample_rate < 1.0) { @@ -450,7 +283,7 @@ std::vector GetCandidateIndices(size_t node_idx, size_t total_nodes, con std::shuffle(all_indices.begin(), all_indices.end(), rng); // Calculate sample size - size_t sample_size = static_cast<size_t>(all_indices.size() * config.sample_rate); + const size_t sample_size = static_cast<size_t>(all_indices.size() * config.sample_rate); comparison_indices.reserve(sample_size); // Take the first sample_size indices @@ -471,7 +304,8 @@ std::vector GetCandidateIndices(size_t node_idx, size_t total_nodes, con } // Calculate similarity for one node against all candidates (parallel implementation) -std::vector<knn_util::KNNResult> CalculateSimilarityForNode(size_t node_idx, const std::vector<NodeData> &node_data, +std::vector<knn_util::KNNResult> CalculateSimilarityForNode(const size_t node_idx, + const std::vector<NodeData> &node_data, const std::vector<size_t> &comparison_indices, const knn_util::KNNConfig &config) { const auto &node1_data = node_data[node_idx]; @@ -481,7 +315,7 @@ std::vector CalculateSimilarityForNode(size_t node_idx, con results.reserve(comparison_indices.size()); // Convert comparison_indices to array for OpenMP (similar to betweenness_centrality_online.cpp) - auto array_size = 
comparison_indices.size(); + const auto array_size = comparison_indices.size(); std::vector<size_t> comparison_indices_array(array_size); std::copy(comparison_indices.begin(), comparison_indices.end(), comparison_indices_array.begin()); @@ -495,14 +329,14 @@ std::vector CalculateSimilarityForNode(size_t node_idx, con // Parallel similarity calculation using OpenMP #pragma omp parallel for for (size_t i = 0; i < array_size; ++i) { - size_t idx = comparison_indices_array[i]; + const size_t idx = comparison_indices_array[i]; const auto &node2_data = node_data[idx]; - + // Calculate similarity directly - double similarity = CalculateNodeSimilarity(node1_data, node2_data, config); + const double similarity = CalculateNodeSimilarity(node1_data, node2_data, config); // Store result - parallel_results[i] = knn_util::KNNResult(node1_data.node.Id(), node2_data.node.Id(), similarity); + parallel_results[i] = knn_util::KNNResult(node1_data.node_id, node2_data.node_id, similarity); } // Filter results based on similarity cutoff and add to final results @@ -534,8 +368,8 @@ void InsertTopKResults(const std::vector &top_k_results, co // Convert to final results with actual nodes (results are already sorted) for (const auto &result : top_k_results) { try { - auto node1 = graph.GetNodeById(result.node1_id); - auto node2 = graph.GetNodeById(result.node2_id); + const auto node1 = graph.GetNodeById(result.node1_id); + const auto node2 = graph.GetNodeById(result.node2_id); final_results.push_back(std::make_tuple(node1, node2, result.similarity)); } catch (const std::exception &e) { // Skip if node not found @@ -550,9 +384,6 @@ std::vector> CalculateKNN(const mgp::Gr std::vector<std::tuple<mgp::Node, mgp::Node, double>> results; std::vector<mgp::Node> nodes; - // 1. Validate configuration - ValidateConfig(config); - // Collect all nodes for (const auto &node : graph.Nodes()) { nodes.push_back(node); @@ -569,10 +400,10 @@ std::vector> CalculateKNN(const mgp::Gr // For each node, find its top-k most similar nodes for (size_t i = 0; i < node_data.size(); ++i) { // Get candidate indices for comparison - std::vector<size_t> comparison_indices = GetCandidateIndices(i, node_data.size(), config); + const std::vector<size_t> comparison_indices = GetCandidateIndices(i, node_data.size(), config); // 2. Calculate similarity for one node - std::vector<knn_util::KNNResult> top_k_results = + const std::vector<knn_util::KNNResult> top_k_results = CalculateSimilarityForNode(i, node_data, comparison_indices, config); // 3. 
Insert sorted top-k results diff --git a/cpp/knn_module/knn_module.cpp b/cpp/knn_module/knn_module.cpp index ac1c5f48f..232dd4d07 100644 --- a/cpp/knn_module/knn_module.cpp +++ b/cpp/knn_module/knn_module.cpp @@ -1,9 +1,8 @@ -#include +#include #include +#include #include -#include #include -#include #include "algorithms/knn.hpp" @@ -20,22 +19,13 @@ constexpr std::string_view kConfigMaxIterations = "maxIterations"; constexpr std::string_view kConfigRandomSeed = "randomSeed"; constexpr std::string_view kConfigSampleRate = "sampleRate"; constexpr std::string_view kConfigConcurrency = "concurrency"; -constexpr std::string_view kConfigSimilarityFunction = "similarityFunction"; -constexpr std::string_view kConfigInitialSampler = "initialSampler"; +constexpr std::string_view kConfigAggregateMethod = "aggregateMethod"; // Return field names constexpr std::string_view kFieldNode = "node"; constexpr std::string_view kFieldNeighbour = "neighbour"; constexpr std::string_view kFieldSimilarity = "similarity"; -// Similarity function values -constexpr std::string_view kSimilarityCosine = "COSINE"; -constexpr std::string_view kSimilarityEuclidean = "EUCLIDEAN"; -constexpr std::string_view kSimilarityPearson = "PEARSON"; -constexpr std::string_view kSimilarityOverlap = "OVERLAP"; -constexpr std::string_view kSimilarityJaccard = "JACCARD"; -constexpr std::string_view kSimilarityDefault = "DEFAULT"; - // Default parameter values constexpr int kDefaultTopK = 1; constexpr double kDefaultSimilarityCutoff = 0.0; @@ -43,304 +33,225 @@ constexpr double kDefaultDeltaThreshold = 0.001; constexpr int kDefaultMaxIterations = 100; constexpr int kDefaultConcurrency = 1; constexpr double kDefaultSampleRate = 0.5; -constexpr std::string_view kDefaultInitialSampler = "uniform"; - -// Initial sampler values (using constants from knn.hpp) -// constexpr std::string_view kSamplerUniform = knn_util::kSamplerUniform; -// constexpr std::string_view kSamplerRandomWalk = knn_util::kSamplerRandomWalk; - -// Helper function to validate if a string is a valid similarity function -bool IsValidSimilarityFunction(const std::string& func_str) { - return func_str == kSimilarityCosine || - func_str == kSimilarityEuclidean || - func_str == kSimilarityPearson || - func_str == kSimilarityOverlap || - func_str == kSimilarityJaccard || - func_str == kSimilarityDefault; + +// Aggregate method values +constexpr std::string_view kAggregateNone = "NONE"; +constexpr std::string_view kAggregateAppend = "APPEND"; +constexpr std::string_view kAggregateMin = "MIN"; +constexpr std::string_view kAggregateMax = "MAX"; +constexpr std::string_view kAggregateAvg = "AVG"; +constexpr std::string_view kAggregateSum = "SUM"; + +// Helper function to validate if a string is a valid aggregate method +bool IsValidAggregateMethod(const std::string &method_str) { + return method_str == kAggregateNone || method_str == kAggregateAppend || method_str == kAggregateMin || + method_str == kAggregateMax || method_str == kAggregateAvg || method_str == kAggregateSum; } -// Helper function to validate if a string is a valid initial sampler -bool IsValidInitialSampler(const std::string& sampler_str) { - std::string lower_sampler = sampler_str; - std::transform(lower_sampler.begin(), lower_sampler.end(), lower_sampler.begin(), ::tolower); - return lower_sampler == knn_util::kSamplerUniform || lower_sampler == knn_util::kSamplerRandomWalk; +// Helper function to parse aggregate method from string +knn_util::AggregateMethod ParseAggregateMethod(const std::string &method_str) { + if 
(method_str == kAggregateNone) { + return knn_util::AggregateMethod::NONE; + } else if (method_str == kAggregateAppend) { + return knn_util::AggregateMethod::APPEND; + } else if (method_str == kAggregateMin) { + return knn_util::AggregateMethod::MIN; + } else if (method_str == kAggregateMax) { + return knn_util::AggregateMethod::MAX; + } else if (method_str == kAggregateAvg) { + return knn_util::AggregateMethod::AVG; + } else if (method_str == kAggregateSum) { + return knn_util::AggregateMethod::SUM; + } else { + return knn_util::AggregateMethod::NONE; // Default fallback + } } // Helper function to validate parameter ranges -void ValidateParameterRanges(const knn_util::KNNConfig& config) { - // Validate range [0, 1] parameters - if (config.sample_rate < 0.0 || config.sample_rate > 1.0) { - throw mgp::ValueException(fmt::format("sampleRate must be between 0 and 1, got {}", config.sample_rate)); - } - - if (config.delta_threshold < 0.0 || config.delta_threshold > 1.0) { - throw mgp::ValueException(fmt::format("deltaThreshold must be between 0 and 1, got {}", config.delta_threshold)); - } - - if (config.similarity_cutoff < 0.0 || config.similarity_cutoff > 1.0) { - throw mgp::ValueException(fmt::format("similarityCutoff must be between 0 and 1, got {}", config.similarity_cutoff)); - } - - // Validate positive integer parameters - if (config.top_k <= 0) { - throw mgp::ValueException(fmt::format("topK must be a positive integer, got {}", config.top_k)); - } - - if (config.concurrency <= 0) { - throw mgp::ValueException(fmt::format("concurrency must be a positive integer, got {}", config.concurrency)); - } - - if (config.max_iterations <= 0) { - throw mgp::ValueException(fmt::format("maxIterations must be a positive integer, got {}", config.max_iterations)); - } - - // randomSeed can be negative, so we only check it's not zero - if (config.random_seed == 0) { - throw mgp::ValueException("randomSeed cannot be 0"); - } -} +void ValidateParameterRanges(const knn_util::KNNConfig &config) { + // Validate range [0, 1] parameters + if (config.sample_rate < 0.0 || config.sample_rate > 1.0) { + throw mgp::ValueException(fmt::format("sampleRate must be between 0 and 1, got {}", config.sample_rate)); + } -// Helper function to parse similarity function from string -knn_util::SimilarityFunction ParseSimilarityFunction(const std::string& func_str) { - if (func_str == kSimilarityCosine) { - return knn_util::SimilarityFunction::COSINE; - } else if (func_str == kSimilarityEuclidean) { - return knn_util::SimilarityFunction::EUCLIDEAN; - } else if (func_str == kSimilarityPearson) { - return knn_util::SimilarityFunction::PEARSON; - } else if (func_str == kSimilarityOverlap) { - return knn_util::SimilarityFunction::OVERLAP; - } else if (func_str == kSimilarityJaccard) { - return knn_util::SimilarityFunction::JACCARD; - } else if (func_str == kSimilarityDefault) { - return knn_util::SimilarityFunction::DEFAULT; - } else { - return knn_util::SimilarityFunction::COSINE; // Default fallback - } + if (config.delta_threshold < 0.0 || config.delta_threshold > 1.0) { + throw mgp::ValueException(fmt::format("deltaThreshold must be between 0 and 1, got {}", config.delta_threshold)); + } + + if (config.similarity_cutoff < 0.0 || config.similarity_cutoff > 1.0) { + throw mgp::ValueException( + fmt::format("similarityCutoff must be between 0 and 1, got {}", config.similarity_cutoff)); + } + + // Validate positive integer parameters + if (config.top_k <= 0) { + throw mgp::ValueException(fmt::format("topK must be a positive 
integer, got {}", config.top_k)); + } + + if (config.concurrency <= 0) { + throw mgp::ValueException(fmt::format("concurrency must be a positive integer, got {}", config.concurrency)); + } + + if (config.max_iterations <= 0) { + throw mgp::ValueException(fmt::format("maxIterations must be a positive integer, got {}", config.max_iterations)); + } + + // randomSeed can be negative, so we only check it's not zero + if (config.random_seed == 0) { + throw mgp::ValueException("randomSeed cannot be 0"); + } } // Helper function to parse nodeProperties configuration -std::vector ParseNodeProperties(const mgp::Value& node_props_value) { - std::vector properties; - - if (node_props_value.IsString()) { - // Single property name - use default similarity function - std::string prop_name = std::string(node_props_value.ValueString()); +std::vector ParseNodeProperties(const mgp::Value &node_props_value) { + std::vector properties; + + if (node_props_value.IsString()) { + // Single property name + const std::string prop_name = std::string(node_props_value.ValueString()); + if (prop_name.empty()) { + throw mgp::ValueException("Property name cannot be empty"); + } + properties.push_back(prop_name); + } else if (node_props_value.IsList()) { + // List of property names + mgp::List prop_list = node_props_value.ValueList(); + if (prop_list.Size() == 0) { + throw mgp::ValueException("Property list cannot be empty"); + } + + for (size_t i = 0; i < prop_list.Size(); ++i) { + if (prop_list[i].IsString()) { + const std::string prop_name = std::string(prop_list[i].ValueString()); if (prop_name.empty()) { - throw mgp::ValueException("Property name cannot be empty"); - } - properties.emplace_back(prop_name, knn_util::SimilarityFunction::DEFAULT); - } else if (node_props_value.IsMap()) { - // Map of property names to metrics - mgp::Map prop_map = node_props_value.ValueMap(); - if (prop_map.Size() == 0) { - throw mgp::ValueException("Property map cannot be empty"); - } - - for (const auto& entry : prop_map) { - // Validate property name - std::string prop_name = std::string(entry.key); - if (prop_name.empty()) { - throw mgp::ValueException("Property name cannot be empty"); - } - - // Validate metric value - if (!entry.value.IsString()) { - throw mgp::ValueException(fmt::format("Metric value must be a string for property '{}'", prop_name)); - } - - std::string metric_str = std::string(entry.value.ValueString()); - if (metric_str.empty()) { - throw mgp::ValueException(fmt::format("Metric value cannot be empty for property '{}'", prop_name)); - } - - if (!IsValidSimilarityFunction(metric_str)) { - throw mgp::ValueException(fmt::format("Invalid metric '{}' for property '{}'. 
Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name)); - } - - knn_util::SimilarityFunction metric = ParseSimilarityFunction(metric_str); - properties.emplace_back(prop_name, metric); - } - } else if (node_props_value.IsList()) { - // List of strings and/or maps - mgp::List prop_list = node_props_value.ValueList(); - if (prop_list.Size() == 0) { - throw mgp::ValueException("Property list cannot be empty"); - } - - for (size_t i = 0; i < prop_list.Size(); ++i) { - if (prop_list[i].IsString()) { - // String property name - use default similarity function - std::string prop_name = std::string(prop_list[i].ValueString()); - if (prop_name.empty()) { - throw mgp::ValueException(fmt::format("Property name at index {} cannot be empty", i)); - } - properties.emplace_back(prop_name, knn_util::SimilarityFunction::DEFAULT); - } else if (prop_list[i].IsMap()) { - // Map entry - mgp::Map prop_map = prop_list[i].ValueMap(); - if (prop_map.Size() == 0) { - throw mgp::ValueException(fmt::format("Property map at index {} cannot be empty", i)); - } - - for (const auto& entry : prop_map) { - // Validate property name - std::string prop_name = std::string(entry.key); - if (prop_name.empty()) { - throw mgp::ValueException(fmt::format("Property name cannot be empty in map at index {}", i)); - } - - // Validate metric value - if (!entry.value.IsString()) { - throw mgp::ValueException(fmt::format("Metric value must be a string for property '{}' in map at index {}", prop_name, i)); - } - - std::string metric_str = std::string(entry.value.ValueString()); - if (metric_str.empty()) { - throw mgp::ValueException(fmt::format("Metric value cannot be empty for property '{}' in map at index {}", prop_name, i)); - } - - if (!IsValidSimilarityFunction(metric_str)) { - throw mgp::ValueException(fmt::format("Invalid metric '{}' for property '{}' in map at index {}. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", metric_str, prop_name, i)); - } - - knn_util::SimilarityFunction metric = ParseSimilarityFunction(metric_str); - properties.emplace_back(prop_name, metric); - } - } else { - throw mgp::ValueException(fmt::format("Property list element at index {} must be a string or map", i)); - } + throw mgp::ValueException(fmt::format("Property name at index {} cannot be empty", i)); } - } else { - throw mgp::ValueException("nodeProperties must be a string, map, or list"); - } - - if (properties.empty()) { - throw mgp::ValueException("No valid properties found in nodeProperties configuration"); + properties.push_back(prop_name); + } else { + throw mgp::ValueException(fmt::format("Property list element at index {} must be a string", i)); + } } - - return properties; + } else { + throw mgp::ValueException( + "nodeProperties must be a string or list of strings defining properties to be used for similarity calculation. 
" + "Each property must be a list of numbers."); + } + + if (properties.empty()) { + throw mgp::ValueException("No valid properties found in nodeProperties configuration"); + } + + return properties; } // Helper function to insert results into record factory -void InsertResults(const std::vector>& results, const mgp::RecordFactory& record_factory) { - for (const auto& result : results) { - auto new_record = record_factory.NewRecord(); - new_record.Insert(kFieldNode.data(), std::get<0>(result)); - new_record.Insert(kFieldNeighbour.data(), std::get<1>(result)); - new_record.Insert(kFieldSimilarity.data(), std::get<2>(result)); - } +void InsertResults(const std::vector> &results, + const mgp::RecordFactory &record_factory) { + for (const auto &result : results) { + auto new_record = record_factory.NewRecord(); + new_record.Insert(kFieldNode.data(), std::get<0>(result)); + new_record.Insert(kFieldNeighbour.data(), std::get<1>(result)); + new_record.Insert(kFieldSimilarity.data(), std::get<2>(result)); + } } // Get procedure - returns similarity pairs void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory) { - mgp::MemoryDispatcherGuard guard{memory}; - const auto record_factory = mgp::RecordFactory(result); - const auto &arguments = mgp::List(args); - const auto &config_map = arguments[0].ValueMap(); - - try { - knn_util::KNNConfig config; - - // Parse node properties - required parameter - if (!config_map.KeyExists(kConfigNodeProperties)) { - throw mgp::ValueException("Required parameter 'nodeProperties' is missing from config"); - } - - config.node_properties = ParseNodeProperties(config_map[kConfigNodeProperties]); - - // Parse other parameters with defaults - config.top_k = config_map.KeyExists(kConfigTopK) ? - static_cast(config_map[kConfigTopK].ValueInt()) : kDefaultTopK; - config.similarity_cutoff = config_map.KeyExists(kConfigSimilarityCutoff) ? - config_map[kConfigSimilarityCutoff].ValueDouble() : kDefaultSimilarityCutoff; - config.delta_threshold = config_map.KeyExists(kConfigDeltaThreshold) ? - config_map[kConfigDeltaThreshold].ValueDouble() : kDefaultDeltaThreshold; - config.max_iterations = config_map.KeyExists(kConfigMaxIterations) ? - static_cast(config_map[kConfigMaxIterations].ValueInt()) : kDefaultMaxIterations; - // Parse concurrency first (needed for validation) - config.concurrency = config_map.KeyExists(kConfigConcurrency) ? - static_cast(config_map[kConfigConcurrency].ValueInt()) : kDefaultConcurrency; - - // Parse random seed with validation - if (config_map.KeyExists(kConfigRandomSeed)) { - config.random_seed = static_cast(config_map[kConfigRandomSeed].ValueInt()); - // If seed is provided, concurrency must be 1 for deterministic results - if (config.concurrency != 1) { - throw mgp::ValueException("When 'randomSeed' is specified, 'concurrency' must be set to 1 for deterministic results"); - } - } else { - // Generate completely random seed - std::random_device rd; - config.random_seed = static_cast(rd()); - } - - config.sample_rate = config_map.KeyExists(kConfigSampleRate) ? - config_map[kConfigSampleRate].ValueDouble() : kDefaultSampleRate; - - // Parse initial sampler - if (config_map.KeyExists(kConfigInitialSampler)) { - std::string sampler_str = std::string(config_map[kConfigInitialSampler].ValueString()); - if (!IsValidInitialSampler(sampler_str)) { - throw mgp::ValueException(fmt::format("Invalid initialSampler '{}'. 
Valid values are: uniform, randomWalk", sampler_str)); - } - // Convert to lowercase for consistency - std::transform(sampler_str.begin(), sampler_str.end(), sampler_str.begin(), ::tolower); - config.initial_sampler = sampler_str; - } else { - config.initial_sampler = kDefaultInitialSampler; - } - - // Parse default similarity function - if (config_map.KeyExists(kConfigSimilarityFunction)) { - std::string func_str = std::string(config_map[kConfigSimilarityFunction].ValueString()); - if (!IsValidSimilarityFunction(func_str)) { - throw mgp::ValueException(fmt::format("Invalid similarityFunction '{}'. Valid metrics are: COSINE, EUCLIDEAN, PEARSON, OVERLAP, JACCARD, DEFAULT", func_str)); - } - config.default_similarity_function = ParseSimilarityFunction(func_str); - } else { - config.default_similarity_function = knn_util::SimilarityFunction::COSINE; // Default - } - - // Validate all parameter ranges - ValidateParameterRanges(config); - - auto results = knn_algs::CalculateKNN(mgp::Graph(memgraph_graph), config); - InsertResults(results, record_factory); - } catch (const mgp::ValueException &e) { - record_factory.SetErrorMessage(e.what()); - } catch (const std::exception &e) { - record_factory.SetErrorMessage(fmt::format("Unexpected error: {}", e.what())); + mgp::MemoryDispatcherGuard guard{memory}; + const auto record_factory = mgp::RecordFactory(result); + const auto &arguments = mgp::List(args); + const auto &config_map = arguments[0].ValueMap(); + + try { + knn_util::KNNConfig config; + + // Parse node properties - required parameter + if (!config_map.KeyExists(kConfigNodeProperties)) { + throw mgp::ValueException("Required parameter 'nodeProperties' is missing from config"); } -} + config.node_properties = ParseNodeProperties(config_map[kConfigNodeProperties]); -extern "C" int mgp_init_module(struct mgp_module *module, struct mgp_memory *memory) { - try { - mgp::MemoryDispatcherGuard guard{memory}; - - // Return types for get procedure - std::vector returns = { - mgp::Return(kFieldNode, mgp::Type::Node), - mgp::Return(kFieldNeighbour, mgp::Type::Node), - mgp::Return(kFieldSimilarity, mgp::Type::Double) - }; - - // Single config parameter - std::vector parameters = { - mgp::Parameter(kArgumentConfig, mgp::Type::Map) - }; - - // Add the single get procedure - mgp::AddProcedure(Get, kProcedureGet, mgp::ProcedureType::Read, - parameters, returns, module, memory); - - } catch(const std::exception &e) { - return 1; - } - return 0; + // Parse other parameters with defaults + config.top_k = + config_map.KeyExists(kConfigTopK) ? static_cast(config_map[kConfigTopK].ValueInt()) : kDefaultTopK; + config.similarity_cutoff = config_map.KeyExists(kConfigSimilarityCutoff) + ? config_map[kConfigSimilarityCutoff].ValueDouble() + : kDefaultSimilarityCutoff; + config.delta_threshold = config_map.KeyExists(kConfigDeltaThreshold) + ? config_map[kConfigDeltaThreshold].ValueDouble() + : kDefaultDeltaThreshold; + config.max_iterations = config_map.KeyExists(kConfigMaxIterations) + ? static_cast(config_map[kConfigMaxIterations].ValueInt()) + : kDefaultMaxIterations; + // Parse concurrency first (needed for validation) + config.concurrency = config_map.KeyExists(kConfigConcurrency) + ? 
static_cast(config_map[kConfigConcurrency].ValueInt()) + : kDefaultConcurrency; + + // Parse random seed with validation + if (config_map.KeyExists(kConfigRandomSeed)) { + config.random_seed = static_cast(config_map[kConfigRandomSeed].ValueInt()); + // If seed is provided, concurrency must be 1 for deterministic results + if (config.concurrency != 1) { + throw mgp::ValueException( + "When 'randomSeed' is specified, 'concurrency' must be set to 1 for deterministic results"); + } + } else { + // Generate completely random seed + std::random_device rd; + config.random_seed = static_cast(rd()); + } + + config.sample_rate = + config_map.KeyExists(kConfigSampleRate) ? config_map[kConfigSampleRate].ValueDouble() : kDefaultSampleRate; + + // Parse aggregate method + if (config_map.KeyExists(kConfigAggregateMethod)) { + const std::string method_str = std::string(config_map[kConfigAggregateMethod].ValueString()); + if (!IsValidAggregateMethod(method_str)) { + throw mgp::ValueException(fmt::format( + "Invalid aggregateMethod '{}'. Valid methods are: NONE, APPEND, MIN, MAX, AVG, SUM", method_str)); + } + if (config.node_properties.size() < 2) { + throw mgp::ValueException("aggregateMethod can only be used when nodeProperties has at least two properties"); + } + config.aggregate_method = ParseAggregateMethod(method_str); + } else { + config.aggregate_method = knn_util::AggregateMethod::NONE; // Default + } + + // Validate all parameter ranges + ValidateParameterRanges(config); + + const auto results = knn_algs::CalculateKNN(mgp::Graph(memgraph_graph), config); + InsertResults(results, record_factory); + } catch (const mgp::ValueException &e) { + record_factory.SetErrorMessage(e.what()); + } catch (const std::exception &e) { + record_factory.SetErrorMessage(fmt::format("Unexpected error: {}", e.what())); + } } -extern "C" int mgp_shutdown_module() { - return 0; +extern "C" int mgp_init_module(struct mgp_module *module, struct mgp_memory *memory) { + try { + mgp::MemoryDispatcherGuard guard{memory}; + + // Return types for get procedure + std::vector returns = {mgp::Return(kFieldNode, mgp::Type::Node), + mgp::Return(kFieldNeighbour, mgp::Type::Node), + mgp::Return(kFieldSimilarity, mgp::Type::Double)}; + + // Single config parameter + std::vector parameters = {mgp::Parameter(kArgumentConfig, mgp::Type::Map)}; + + // Add the single get procedure + mgp::AddProcedure(Get, kProcedureGet, mgp::ProcedureType::Read, parameters, returns, module, memory); + + } catch (const std::exception &e) { + return 1; + } + return 0; } + +extern "C" int mgp_shutdown_module() { return 0; } diff --git a/e2e/knn_test/test_knn_avg/input.cyp b/e2e/knn_test/test_knn_avg/input.cyp new file mode 100644 index 000000000..ea515ed7e --- /dev/null +++ b/e2e/knn_test/test_knn_avg/input.cyp @@ -0,0 +1,2 @@ +CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5], embedding2: [1.0, 0.0, 0.5]}); +CREATE (:Node {id:2, embedding: [1.0, 0.0, 0.5], embedding2: [1.0, 0.0, 0.5]}); diff --git a/e2e/knn_test/test_knn_avg/test.yml b/e2e/knn_test/test_knn_avg/test.yml new file mode 100644 index 000000000..9f3b98299 --- /dev/null +++ b/e2e/knn_test/test_knn_avg/test.yml @@ -0,0 +1,11 @@ +query: > + CALL knn.get({nodeProperties: ["embedding", "embedding2"], aggregateMethod: "AVG"}) YIELD node, neighbour, similarity + RETURN node.id as node_id, neighbour.id as neighbour_id, similarity + +output: + - node_id: 1 + neighbour_id: 2 + similarity: 1 + - node_id: 2 + neighbour_id: 1 + similarity: 1 diff --git a/e2e/knn_test/test_knn_empty/input.cyp 
b/e2e/knn_test/test_knn_empty/input.cyp new file mode 100644 index 000000000..7144a2605 --- /dev/null +++ b/e2e/knn_test/test_knn_empty/input.cyp @@ -0,0 +1,25 @@ +CREATE INDEX ON :Node(id); +CREATE (:Node {id:0, embedding: [0.90678340196609497, 0.74690568447113037, -0.65984714031219482]}); +CREATE (:Node {id:1, embedding: [1.2019195556640625, 0.42643040418624878, -0.4709840714931488]}); +CREATE (:Node {id:2, embedding: [1.1005796194076538, 0.67131000757217407, -0.5418705940246582]}); +CREATE (:Node {id:4, embedding: [1.1840434074401855, 0.39269298315048218, -0.5063326358795166]}); +CREATE (:Node {id:5, embedding: [0.83302301168441772, 0.5545622706413269, -0.31265774369239807]}); +CREATE (:Node {id:6, embedding: [0.78877884149551392, 0.5189281702041626, -0.097793936729431152]}); +CREATE (:Node {id:7, embedding: [0.61398810148239136, 0.5255049467086792, -0.3551192581653595]}); +CREATE (:Node {id:8, embedding: [0.83923488855361938, -0.0041203685104846954, -0.51874136924743652]}); +CREATE (:Node {id:9, embedding: [0.60883384943008423, 0.60958302021026611, -0.40317356586456299]}); +MATCH (a:Node {id: 0}) MATCH (b:Node {id: 1}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 1}) MATCH (b:Node {id: 2}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 2}) MATCH (b:Node {id: 0}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 0}) MATCH (b:Node {id: 4}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 4}) MATCH (b:Node {id: 1}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 4}) MATCH (b:Node {id: 2}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 0}) MATCH (b:Node {id: 5}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 5}) MATCH (b:Node {id: 6}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 6}) MATCH (b:Node {id: 7}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 7}) MATCH (b:Node {id: 8}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 8}) MATCH (b:Node {id: 6}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 6}) MATCH (b:Node {id: 9}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 9}) MATCH (b:Node {id: 7}) MERGE (a)-[:RELATION]->(b); +MATCH (a:Node {id: 9}) MATCH (b:Node {id: 8}) MERGE (a)-[:RELATION]->(b); +DROP INDEX ON :Node(id); diff --git a/e2e/knn_test/test_knn_empty/test.yml b/e2e/knn_test/test_knn_empty/test.yml new file mode 100644 index 000000000..8276187b8 --- /dev/null +++ b/e2e/knn_test/test_knn_empty/test.yml @@ -0,0 +1,5 @@ +query: > + CALL knn.get({nodeProperties: ["embedding"]}) YIELD node, neighbour, similarity + RETURN node.id as node_id, neighbour.id as neighbour_id, similarity + +output: [] diff --git a/e2e/knn_test/test_knn_error_size_of_vectors/input.cyp b/e2e/knn_test/test_knn_error_size_of_vectors/input.cyp new file mode 100644 index 000000000..53d095c81 --- /dev/null +++ b/e2e/knn_test/test_knn_error_size_of_vectors/input.cyp @@ -0,0 +1,2 @@ +CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5], embedding2: [1.0, 0.0]}); +CREATE (:Node {id:2, embedding: [1.0, 0.0, -0.5], embedding2: [1.0, 0.0]}); diff --git a/e2e/knn_test/test_knn_error_size_of_vectors/test.yml b/e2e/knn_test/test_knn_error_size_of_vectors/test.yml new file mode 100644 index 000000000..dc59c7d43 --- /dev/null +++ b/e2e/knn_test/test_knn_error_size_of_vectors/test.yml @@ -0,0 +1,5 @@ +query: > + CALL knn.get({nodeProperties: ["embedding", "embedding2"], aggregateMethod: "AVG"}) YIELD node, neighbour, similarity + RETURN node.id as node_id, neighbour.id as neighbour_id, similarity + +exception: "All property vectors must have the same size for aggregation. 
Expected size: 3, but vector has size: 2" diff --git a/e2e/knn_test/test_knn_simple/input.cyp b/e2e/knn_test/test_knn_simple/input.cyp new file mode 100644 index 000000000..aa31ea74d --- /dev/null +++ b/e2e/knn_test/test_knn_simple/input.cyp @@ -0,0 +1,2 @@ +CREATE (:Node {id:1, embedding: [1, 0, 0.5]}); +CREATE (:Node {id:2, embedding: [1, 0, 0.5]}); diff --git a/e2e/knn_test/test_knn_simple/test.yml b/e2e/knn_test/test_knn_simple/test.yml new file mode 100644 index 000000000..cec536109 --- /dev/null +++ b/e2e/knn_test/test_knn_simple/test.yml @@ -0,0 +1,11 @@ +query: > + CALL knn.get({nodeProperties: ["embedding"]}) YIELD node, neighbour, similarity + RETURN node.id as node_id, neighbour.id as neighbour_id, similarity + +output: + - node_id: 1 + neighbour_id: 2 + similarity: 1 + - node_id: 2 + neighbour_id: 1 + similarity: 1 From 68170375aec1760161f62c87f3f159dea2d5f1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Mon, 29 Sep 2025 13:51:50 +0200 Subject: [PATCH 05/12] Refactor knn module --- cpp/knn_module/algorithms/knn.hpp | 188 ++++++------------------------ cpp/knn_module/knn_module.cpp | 49 -------- 2 files changed, 36 insertions(+), 201 deletions(-) diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp index 4a05cd845..343707a42 100644 --- a/cpp/knn_module/algorithms/knn.hpp +++ b/cpp/knn_module/algorithms/knn.hpp @@ -11,9 +11,6 @@ namespace knn_util { -// Aggregate methods for merging property values -enum class AggregateMethod { NONE, APPEND, MIN, MAX, AVG, SUM }; - // Configuration for KNN algorithm struct KNNConfig { int top_k = 1; @@ -24,7 +21,6 @@ struct KNNConfig { double sample_rate = 0.5; int concurrency = 1; std::vector node_properties; - AggregateMethod aggregate_method = AggregateMethod::NONE; }; // Result structure for KNN @@ -60,103 +56,11 @@ inline double CosineSimilarity(const std::vector &vec1, const std::vecto return dot / denom; } -// Helper function to aggregate property values based on the specified method -std::vector AggregatePropertyValues(const std::vector> &property_vectors, - knn_util::AggregateMethod method) { - if (property_vectors.empty()) { - throw mgp::ValueException("Properties of the nodes for aggregation cannot be empty!"); - } - - // Validate vector sizes for methods that require same-sized vectors - bool requires_same_size = (method == knn_util::AggregateMethod::MIN || method == knn_util::AggregateMethod::MAX || - method == knn_util::AggregateMethod::AVG || method == knn_util::AggregateMethod::SUM); - - if (requires_same_size) { - size_t expected_size = property_vectors[0].size(); - for (size_t i = 1; i < property_vectors.size(); ++i) { - if (property_vectors[i].size() != expected_size) { - throw mgp::ValueException(fmt::format( - "All property vectors must have the same size for aggregation. Expected size: {}, but vector has size: {}", - expected_size, property_vectors[i].size())); - } - } - } - - std::vector result; - - switch (method) { - case knn_util::AggregateMethod::NONE: { - throw mgp::ValueException( - "Unexpected error: aggregation of property values happened without aggregation method specified! 
Please " - "contact Memgraph support."); - } - case knn_util::AggregateMethod::APPEND: { - // Concatenate all property vectors - for (const auto &vec : property_vectors) { - result.insert(result.end(), vec.begin(), vec.end()); - } - break; - } - case knn_util::AggregateMethod::MIN: { - // Take minimum value from each position across all vectors - size_t vector_size = property_vectors[0].size(); - result.resize(vector_size); - for (size_t i = 0; i < vector_size; ++i) { - result[i] = property_vectors[0][i]; - for (size_t j = 1; j < property_vectors.size(); ++j) { - result[i] = std::min(result[i], property_vectors[j][i]); - } - } - break; - } - case knn_util::AggregateMethod::MAX: { - // Take maximum value from each position across all vectors - size_t vector_size = property_vectors[0].size(); - result.resize(vector_size); - for (size_t i = 0; i < vector_size; ++i) { - result[i] = property_vectors[0][i]; - for (size_t j = 1; j < property_vectors.size(); ++j) { - result[i] = std::max(result[i], property_vectors[j][i]); - } - } - break; - } - case knn_util::AggregateMethod::AVG: { - // Take average value from each position across all vectors - size_t vector_size = property_vectors[0].size(); - result.resize(vector_size); - for (size_t i = 0; i < vector_size; ++i) { - double sum = 0.0; - for (const auto &vec : property_vectors) { - sum += vec[i]; - } - result[i] = sum / property_vectors.size(); - } - break; - } - case knn_util::AggregateMethod::SUM: { - // Take sum value from each position across all vectors - size_t vector_size = property_vectors[0].size(); - result.resize(vector_size); - for (size_t i = 0; i < vector_size; ++i) { - result[i] = 0.0; - for (const auto &vec : property_vectors) { - result[i] += vec[i]; - } - } - break; - } - } - - return result; -} - // Structure to hold pre-loaded node data for efficient comparison struct NodeData { mgp::Id node_id; - std::vector> - property_values; // One vector per property (for NONE) or single aggregated vector (for aggregation) - std::vector norms; // Norms for each property + std::vector> property_values; // One vector per property + std::vector norms; // Norms for each property NodeData(const mgp::Node &n, const std::vector> &prop_values) : node_id(n.Id()), property_values(prop_values) {} @@ -182,8 +86,11 @@ std::vector PreloadNodeData(const std::vector &nodes, const fmt::format("Property {} must be a list of doubles for similarity calculation", prop_name)); } - mgp::List list = prop_value.ValueList(); - for (size_t i = 0; i < list.Size(); ++i) { + const auto &list = prop_value.ValueList(); + const auto size = list.Size(); + values.reserve(size); + + for (size_t i = 0; i < size; ++i) { if (!list[i].IsDouble()) { throw mgp::ValueException( fmt::format("Property {} must be a list of doubles for similarity calculation", prop_name)); @@ -198,18 +105,8 @@ std::vector PreloadNodeData(const std::vector &nodes, const property_values[prop_idx] = values; } - // Handle aggregation if needed - const bool is_aggregated = (config.aggregate_method != knn_util::AggregateMethod::NONE); - if (is_aggregated) { - // Aggregate the property values - std::vector aggregated = AggregatePropertyValues(property_values, config.aggregate_method); - property_values.clear(); - property_values.push_back(aggregated); - } - // Create node_info at the end with the final property_values - NodeData node_info(node, property_values); - node_data.push_back(node_info); + node_data.emplace_back(node, std::move(property_values)); } // Validate vector sizes @@ -264,40 +161,20 @@ 
double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2 } // Get candidate indices for comparison, excluding self -std::vector GetCandidateIndices(const size_t node_idx, const size_t total_nodes, +std::vector GetCandidateIndices(const size_t node_idx, std::vector &all_indices, const knn_util::KNNConfig &config) { - std::vector comparison_indices; - - if (config.sample_rate < 1.0) { - // Create indices for all nodes except self - std::vector all_indices; - all_indices.reserve(total_nodes - 1); - for (size_t i = 0; i < total_nodes; ++i) { - if (i != node_idx) { // Skip self - all_indices.push_back(i); - } - } - - // Shuffle indices for uniform sampling - std::mt19937 rng(config.random_seed); - std::shuffle(all_indices.begin(), all_indices.end(), rng); + // Shuffle indices for uniform sampling + std::mt19937 rng(config.random_seed); + std::shuffle(all_indices.begin(), all_indices.end(), rng); - // Calculate sample size - const size_t sample_size = static_cast(all_indices.size() * config.sample_rate); - comparison_indices.reserve(sample_size); + const size_t sample_size = static_cast(all_indices.size() * config.sample_rate); - // Take the first sample_size indices - for (size_t i = 0; i < sample_size; ++i) { + std::vector comparison_indices; + comparison_indices.reserve(sample_size); + for (size_t i = 0; i < sample_size; ++i) { + if (all_indices[i] != node_idx) { comparison_indices.push_back(all_indices[i]); } - } else { - // Compare against all other nodes - comparison_indices.reserve(total_nodes - 1); - for (size_t j = 0; j < total_nodes; ++j) { - if (j != node_idx) { // Skip self-comparison - comparison_indices.push_back(j); - } - } } return comparison_indices; @@ -309,18 +186,14 @@ std::vector CalculateSimilarityForNode(const size_t node_id const std::vector &comparison_indices, const knn_util::KNNConfig &config) { const auto &node1_data = node_data[node_idx]; + const auto num_of_similarities = comparison_indices.size(); // Pre-allocate results vector std::vector results; - results.reserve(comparison_indices.size()); - - // Convert comparison_indices to array for OpenMP (similar to betweenness_centrality_online.cpp) - const auto array_size = comparison_indices.size(); - std::vector comparison_indices_array(array_size); - std::copy(comparison_indices.begin(), comparison_indices.end(), comparison_indices_array.begin()); + results.reserve(num_of_similarities); // Pre-allocate parallel results vector - std::vector parallel_results(array_size); + std::vector parallel_results(num_of_similarities); // Set OpenMP parameters omp_set_dynamic(0); @@ -328,8 +201,8 @@ std::vector CalculateSimilarityForNode(const size_t node_id // Parallel similarity calculation using OpenMP #pragma omp parallel for - for (size_t i = 0; i < array_size; ++i) { - const size_t idx = comparison_indices_array[i]; + for (size_t i = 0; i < num_of_similarities; ++i) { + const size_t idx = comparison_indices[i]; const auto &node2_data = node_data[idx]; // Calculate similarity directly @@ -382,6 +255,8 @@ void InsertTopKResults(const std::vector &top_k_results, co std::vector> CalculateKNN(const mgp::Graph &graph, const knn_util::KNNConfig &config) { std::vector> results; + + // we can't reserve here because it's an iterator std::vector nodes; // Collect all nodes @@ -390,17 +265,26 @@ std::vector> CalculateKNN(const mgp::Gr } if (nodes.size() < 2) { - return results; // Need at least 2 nodes for similarity + // Need at least 2 nodes for similarity + return results; } // Pre-load node properties into 
memory for efficient comparison std::vector node_data = PreloadNodeData(nodes, config); PreloadNorms(node_data, config); + const auto num_nodes = nodes.size(); + + std::vector all_indices; + all_indices.reserve(num_nodes); + for (size_t i = 0; i < num_nodes; ++i) { + all_indices[i] = i; + } + // For each node, find its top-k most similar nodes - for (size_t i = 0; i < node_data.size(); ++i) { + for (size_t i = 0; i < num_nodes; ++i) { // Get candidate indices for comparison - const std::vector comparison_indices = GetCandidateIndices(i, node_data.size(), config); + const std::vector comparison_indices = GetCandidateIndices(i, all_indices, config); // 2. Calculate similarity for one node const std::vector top_k_results = diff --git a/cpp/knn_module/knn_module.cpp b/cpp/knn_module/knn_module.cpp index 232dd4d07..48c34b2f2 100644 --- a/cpp/knn_module/knn_module.cpp +++ b/cpp/knn_module/knn_module.cpp @@ -19,7 +19,6 @@ constexpr std::string_view kConfigMaxIterations = "maxIterations"; constexpr std::string_view kConfigRandomSeed = "randomSeed"; constexpr std::string_view kConfigSampleRate = "sampleRate"; constexpr std::string_view kConfigConcurrency = "concurrency"; -constexpr std::string_view kConfigAggregateMethod = "aggregateMethod"; // Return field names constexpr std::string_view kFieldNode = "node"; @@ -34,39 +33,6 @@ constexpr int kDefaultMaxIterations = 100; constexpr int kDefaultConcurrency = 1; constexpr double kDefaultSampleRate = 0.5; -// Aggregate method values -constexpr std::string_view kAggregateNone = "NONE"; -constexpr std::string_view kAggregateAppend = "APPEND"; -constexpr std::string_view kAggregateMin = "MIN"; -constexpr std::string_view kAggregateMax = "MAX"; -constexpr std::string_view kAggregateAvg = "AVG"; -constexpr std::string_view kAggregateSum = "SUM"; - -// Helper function to validate if a string is a valid aggregate method -bool IsValidAggregateMethod(const std::string &method_str) { - return method_str == kAggregateNone || method_str == kAggregateAppend || method_str == kAggregateMin || - method_str == kAggregateMax || method_str == kAggregateAvg || method_str == kAggregateSum; -} - -// Helper function to parse aggregate method from string -knn_util::AggregateMethod ParseAggregateMethod(const std::string &method_str) { - if (method_str == kAggregateNone) { - return knn_util::AggregateMethod::NONE; - } else if (method_str == kAggregateAppend) { - return knn_util::AggregateMethod::APPEND; - } else if (method_str == kAggregateMin) { - return knn_util::AggregateMethod::MIN; - } else if (method_str == kAggregateMax) { - return knn_util::AggregateMethod::MAX; - } else if (method_str == kAggregateAvg) { - return knn_util::AggregateMethod::AVG; - } else if (method_str == kAggregateSum) { - return knn_util::AggregateMethod::SUM; - } else { - return knn_util::AggregateMethod::NONE; // Default fallback - } -} - // Helper function to validate parameter ranges void ValidateParameterRanges(const knn_util::KNNConfig &config) { // Validate range [0, 1] parameters @@ -206,21 +172,6 @@ void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memo config.sample_rate = config_map.KeyExists(kConfigSampleRate) ? 
config_map[kConfigSampleRate].ValueDouble() : kDefaultSampleRate; - // Parse aggregate method - if (config_map.KeyExists(kConfigAggregateMethod)) { - const std::string method_str = std::string(config_map[kConfigAggregateMethod].ValueString()); - if (!IsValidAggregateMethod(method_str)) { - throw mgp::ValueException(fmt::format( - "Invalid aggregateMethod '{}'. Valid methods are: NONE, APPEND, MIN, MAX, AVG, SUM", method_str)); - } - if (config.node_properties.size() < 2) { - throw mgp::ValueException("aggregateMethod can only be used when nodeProperties has at least two properties"); - } - config.aggregate_method = ParseAggregateMethod(method_str); - } else { - config.aggregate_method = knn_util::AggregateMethod::NONE; // Default - } - // Validate all parameter ranges ValidateParameterRanges(config); From 8b25778a7805edfe074db0d5bda890d3741d642e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Mon, 29 Sep 2025 16:23:26 +0200 Subject: [PATCH 06/12] Refactor flags --- cpp/CMakeLists.txt | 72 +++++++++++++++---------------- cpp/knn_module/algorithms/knn.hpp | 2 +- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c9fa299ec..ea0c762cc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,10 +28,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall \ # Don't omit frame pointer in RelWithDebInfo, for additional callchain debug. set(CMAKE_CXX_FLAGS_RELWITHDEBINFO - "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -march=x86-64-v3 -ffast-math -fopenmp -fopt-info-vec-optimized -fno-omit-frame-pointer") + "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -fno-omit-frame-pointer") # Release flags. -set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=x86-64-v3 -ffast-math -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") set(CMAKE_SHARED_LIBRARY_PREFIX "") find_package(Threads REQUIRED) @@ -115,40 +115,40 @@ endfunction() # Available query modules. 
include(CTest) enable_testing() -add_subdirectory(betweenness_centrality_module) -add_subdirectory(biconnected_components_module) -add_subdirectory(bipartite_matching_module) -add_subdirectory(bridges_module) -add_subdirectory(connectivity_module) -add_subdirectory(cycles_module) -add_subdirectory(community_detection_module) -add_subdirectory(pagerank_module) -add_subdirectory(uuid_module) -add_subdirectory(katz_centrality_module) -add_subdirectory(degree_centrality_module) -add_subdirectory(graph_util_module) -add_subdirectory(node_similarity_module) -add_subdirectory(distance_calculator) -add_subdirectory(do_module) -add_subdirectory(periodic_module) -add_subdirectory(util_module) -add_subdirectory(create_module) -add_subdirectory(map_module) -add_subdirectory(collections_module) -add_subdirectory(nodes_module) -add_subdirectory(meta_module) -add_subdirectory(label_module) -add_subdirectory(text_module) -add_subdirectory(path_module) -add_subdirectory(node_module) -add_subdirectory(neighbors_module) -add_subdirectory(refactor_module) -add_subdirectory(merge_module) -add_subdirectory(csv_utils_module) -add_subdirectory(algo_module) -add_subdirectory(set_property_module) -add_subdirectory(leiden_community_detection_module) -add_subdirectory(math_module) +# add_subdirectory(betweenness_centrality_module) +# add_subdirectory(biconnected_components_module) +# add_subdirectory(bipartite_matching_module) +# add_subdirectory(bridges_module) +# add_subdirectory(connectivity_module) +# add_subdirectory(cycles_module) +# add_subdirectory(community_detection_module) +# add_subdirectory(pagerank_module) +# add_subdirectory(uuid_module) +# add_subdirectory(katz_centrality_module) +# add_subdirectory(degree_centrality_module) +# add_subdirectory(graph_util_module) +# add_subdirectory(node_similarity_module) +# add_subdirectory(distance_calculator) +# add_subdirectory(do_module) +# add_subdirectory(periodic_module) +# add_subdirectory(util_module) +# add_subdirectory(create_module) +# add_subdirectory(map_module) +# add_subdirectory(collections_module) +# add_subdirectory(nodes_module) +# add_subdirectory(meta_module) +# add_subdirectory(label_module) +# add_subdirectory(text_module) +# add_subdirectory(path_module) +# add_subdirectory(node_module) +# add_subdirectory(neighbors_module) +# add_subdirectory(refactor_module) +# add_subdirectory(merge_module) +# add_subdirectory(csv_utils_module) +# add_subdirectory(algo_module) +# add_subdirectory(set_property_module) +# add_subdirectory(leiden_community_detection_module) +# add_subdirectory(math_module) add_subdirectory(knn_module) add_cugraph_subdirectory(cugraph_module) diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp index 343707a42..6722190b6 100644 --- a/cpp/knn_module/algorithms/knn.hpp +++ b/cpp/knn_module/algorithms/knn.hpp @@ -278,7 +278,7 @@ std::vector> CalculateKNN(const mgp::Gr std::vector all_indices; all_indices.reserve(num_nodes); for (size_t i = 0; i < num_nodes; ++i) { - all_indices[i] = i; + all_indices.push_back(i); } // For each node, find its top-k most similar nodes From a39eaf0c5bc5b5a0deccc17d18efa0b09e2a9b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Mon, 29 Sep 2025 16:44:48 +0200 Subject: [PATCH 07/12] Add tests --- e2e/knn_test/test_knn_avg/input.cyp | 2 - .../test_knn_cosine_distance copy/input.cyp | 2 + .../test.yml | 6 +-- e2e/knn_test/test_knn_empty/input.cyp | 25 ----------- .../test_knn_error_int_passed/input.cyp | 2 + .../test_knn_error_int_passed/test.yml | 5 
+++ .../test_knn_error_size_of_vectors/input.cyp | 2 - .../test_knn_error_size_of_vectors/test.yml | 5 --- e2e/knn_test/test_knn_neighbours/input.cyp | 6 +++ e2e/knn_test/test_knn_neighbours/test.yml | 41 +++++++++++++++++++ e2e/knn_test/test_knn_simple/input.cyp | 4 +- e2e/knn_test/test_knn_simple/test.yml | 6 +-- 12 files changed, 64 insertions(+), 42 deletions(-) delete mode 100644 e2e/knn_test/test_knn_avg/input.cyp create mode 100644 e2e/knn_test/test_knn_cosine_distance copy/input.cyp rename e2e/knn_test/{test_knn_avg => test_knn_cosine_distance copy}/test.yml (50%) create mode 100644 e2e/knn_test/test_knn_error_int_passed/input.cyp create mode 100644 e2e/knn_test/test_knn_error_int_passed/test.yml delete mode 100644 e2e/knn_test/test_knn_error_size_of_vectors/input.cyp delete mode 100644 e2e/knn_test/test_knn_error_size_of_vectors/test.yml create mode 100644 e2e/knn_test/test_knn_neighbours/input.cyp create mode 100644 e2e/knn_test/test_knn_neighbours/test.yml diff --git a/e2e/knn_test/test_knn_avg/input.cyp b/e2e/knn_test/test_knn_avg/input.cyp deleted file mode 100644 index ea515ed7e..000000000 --- a/e2e/knn_test/test_knn_avg/input.cyp +++ /dev/null @@ -1,2 +0,0 @@ -CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5], embedding2: [1.0, 0.0, 0.5]}); -CREATE (:Node {id:2, embedding: [1.0, 0.0, 0.5], embedding2: [1.0, 0.0, 0.5]}); diff --git a/e2e/knn_test/test_knn_cosine_distance copy/input.cyp b/e2e/knn_test/test_knn_cosine_distance copy/input.cyp new file mode 100644 index 000000000..4989fee19 --- /dev/null +++ b/e2e/knn_test/test_knn_cosine_distance copy/input.cyp @@ -0,0 +1,2 @@ +CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5]}); +CREATE (:Node {id:2, embedding: [1.0, 0.0, -0.5]}); diff --git a/e2e/knn_test/test_knn_avg/test.yml b/e2e/knn_test/test_knn_cosine_distance copy/test.yml similarity index 50% rename from e2e/knn_test/test_knn_avg/test.yml rename to e2e/knn_test/test_knn_cosine_distance copy/test.yml index 9f3b98299..d9db80e5c 100644 --- a/e2e/knn_test/test_knn_avg/test.yml +++ b/e2e/knn_test/test_knn_cosine_distance copy/test.yml @@ -1,11 +1,11 @@ query: > - CALL knn.get({nodeProperties: ["embedding", "embedding2"], aggregateMethod: "AVG"}) YIELD node, neighbour, similarity + CALL knn.get({nodeProperties: ["embedding"], sampleRate: 1.0}) YIELD node, neighbour, similarity RETURN node.id as node_id, neighbour.id as neighbour_id, similarity output: - node_id: 1 neighbour_id: 2 - similarity: 1 + similarity: 0.6 - node_id: 2 neighbour_id: 1 - similarity: 1 + similarity: 0.6 diff --git a/e2e/knn_test/test_knn_empty/input.cyp b/e2e/knn_test/test_knn_empty/input.cyp index 7144a2605..e69de29bb 100644 --- a/e2e/knn_test/test_knn_empty/input.cyp +++ b/e2e/knn_test/test_knn_empty/input.cyp @@ -1,25 +0,0 @@ -CREATE INDEX ON :Node(id); -CREATE (:Node {id:0, embedding: [0.90678340196609497, 0.74690568447113037, -0.65984714031219482]}); -CREATE (:Node {id:1, embedding: [1.2019195556640625, 0.42643040418624878, -0.4709840714931488]}); -CREATE (:Node {id:2, embedding: [1.1005796194076538, 0.67131000757217407, -0.5418705940246582]}); -CREATE (:Node {id:4, embedding: [1.1840434074401855, 0.39269298315048218, -0.5063326358795166]}); -CREATE (:Node {id:5, embedding: [0.83302301168441772, 0.5545622706413269, -0.31265774369239807]}); -CREATE (:Node {id:6, embedding: [0.78877884149551392, 0.5189281702041626, -0.097793936729431152]}); -CREATE (:Node {id:7, embedding: [0.61398810148239136, 0.5255049467086792, -0.3551192581653595]}); -CREATE (:Node {id:8, embedding: [0.83923488855361938, 
-0.0041203685104846954, -0.51874136924743652]}); -CREATE (:Node {id:9, embedding: [0.60883384943008423, 0.60958302021026611, -0.40317356586456299]}); -MATCH (a:Node {id: 0}) MATCH (b:Node {id: 1}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 1}) MATCH (b:Node {id: 2}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 2}) MATCH (b:Node {id: 0}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 0}) MATCH (b:Node {id: 4}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 4}) MATCH (b:Node {id: 1}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 4}) MATCH (b:Node {id: 2}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 0}) MATCH (b:Node {id: 5}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 5}) MATCH (b:Node {id: 6}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 6}) MATCH (b:Node {id: 7}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 7}) MATCH (b:Node {id: 8}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 8}) MATCH (b:Node {id: 6}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 6}) MATCH (b:Node {id: 9}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 9}) MATCH (b:Node {id: 7}) MERGE (a)-[:RELATION]->(b); -MATCH (a:Node {id: 9}) MATCH (b:Node {id: 8}) MERGE (a)-[:RELATION]->(b); -DROP INDEX ON :Node(id); diff --git a/e2e/knn_test/test_knn_error_int_passed/input.cyp b/e2e/knn_test/test_knn_error_int_passed/input.cyp new file mode 100644 index 000000000..dc41b7a15 --- /dev/null +++ b/e2e/knn_test/test_knn_error_int_passed/input.cyp @@ -0,0 +1,2 @@ +CREATE (:Node {id:1, embedding: [1, 0.0, 0.5]}); +CREATE (:Node {id:2, embedding: [1, 0.0, 0.5]}); diff --git a/e2e/knn_test/test_knn_error_int_passed/test.yml b/e2e/knn_test/test_knn_error_int_passed/test.yml new file mode 100644 index 000000000..0af204ff5 --- /dev/null +++ b/e2e/knn_test/test_knn_error_int_passed/test.yml @@ -0,0 +1,5 @@ +query: > + CALL knn.get({nodeProperties: ["embedding"]}) YIELD node, neighbour, similarity + RETURN node.id as node_id, neighbour.id as neighbour_id, similarity + +exception: "Property 'embedding' must be a list of doubles for similarity calculation" \ No newline at end of file diff --git a/e2e/knn_test/test_knn_error_size_of_vectors/input.cyp b/e2e/knn_test/test_knn_error_size_of_vectors/input.cyp deleted file mode 100644 index 53d095c81..000000000 --- a/e2e/knn_test/test_knn_error_size_of_vectors/input.cyp +++ /dev/null @@ -1,2 +0,0 @@ -CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5], embedding2: [1.0, 0.0]}); -CREATE (:Node {id:2, embedding: [1.0, 0.0, -0.5], embedding2: [1.0, 0.0]}); diff --git a/e2e/knn_test/test_knn_error_size_of_vectors/test.yml b/e2e/knn_test/test_knn_error_size_of_vectors/test.yml deleted file mode 100644 index dc59c7d43..000000000 --- a/e2e/knn_test/test_knn_error_size_of_vectors/test.yml +++ /dev/null @@ -1,5 +0,0 @@ -query: > - CALL knn.get({nodeProperties: ["embedding", "embedding2"], aggregateMethod: "AVG"}) YIELD node, neighbour, similarity - RETURN node.id as node_id, neighbour.id as neighbour_id, similarity - -exception: "All property vectors must have the same size for aggregation. 
Expected size: 3, but vector has size: 2" diff --git a/e2e/knn_test/test_knn_neighbours/input.cyp b/e2e/knn_test/test_knn_neighbours/input.cyp new file mode 100644 index 000000000..e098cbdd8 --- /dev/null +++ b/e2e/knn_test/test_knn_neighbours/input.cyp @@ -0,0 +1,6 @@ +CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5]}); +CREATE (:Node {id:2, embedding: [1.0, 0.0, 0.5]}); +CREATE (:Node {id:3, embedding: [1.0, 0.0, 0.5]}); +CREATE (:Node {id:4, embedding: [1.0, 0.0, -0.5]}); +CREATE (:Node {id:5, embedding: [1.0, 0.0, -0.5]}); +CREATE (:Node {id:6, embedding: [1.0, 0.0, -0.5]}); diff --git a/e2e/knn_test/test_knn_neighbours/test.yml b/e2e/knn_test/test_knn_neighbours/test.yml new file mode 100644 index 000000000..689574861 --- /dev/null +++ b/e2e/knn_test/test_knn_neighbours/test.yml @@ -0,0 +1,41 @@ +query: > + CALL knn.get({nodeProperties: ["embedding"], sampleRate: 1.0, topK: 2}) YIELD node, neighbour, similarity + RETURN node.id as node_id, neighbour.id as neighbour_id, similarity ORDER BY node_id, similarity DESC, neighbour_id + +output: + - node_id: 1 + neighbour_id: 2 + similarity: 1 + - node_id: 1 + neighbour_id: 3 + similarity: 1 + - node_id: 2 + neighbour_id: 1 + similarity: 1 + - node_id: 2 + neighbour_id: 3 + similarity: 1 + - node_id: 3 + neighbour_id: 1 + similarity: 1 + - node_id: 3 + neighbour_id: 2 + similarity: 1 + - node_id: 4 + neighbour_id: 5 + similarity: 1 + - node_id: 4 + neighbour_id: 6 + similarity: 1 + - node_id: 5 + neighbour_id: 4 + similarity: 1 + - node_id: 5 + neighbour_id: 6 + similarity: 1 + - node_id: 6 + neighbour_id: 4 + similarity: 1 + - node_id: 6 + neighbour_id: 5 + similarity: 1 \ No newline at end of file diff --git a/e2e/knn_test/test_knn_simple/input.cyp b/e2e/knn_test/test_knn_simple/input.cyp index aa31ea74d..884801de0 100644 --- a/e2e/knn_test/test_knn_simple/input.cyp +++ b/e2e/knn_test/test_knn_simple/input.cyp @@ -1,2 +1,2 @@ -CREATE (:Node {id:1, embedding: [1, 0, 0.5]}); -CREATE (:Node {id:2, embedding: [1, 0, 0.5]}); +CREATE (:Node {id:1, embedding: [1.0, 0.0, 0.5]}); +CREATE (:Node {id:2, embedding: [1.0, 0.0, 0.5]}); diff --git a/e2e/knn_test/test_knn_simple/test.yml b/e2e/knn_test/test_knn_simple/test.yml index cec536109..52d1bd6bb 100644 --- a/e2e/knn_test/test_knn_simple/test.yml +++ b/e2e/knn_test/test_knn_simple/test.yml @@ -1,11 +1,11 @@ query: > - CALL knn.get({nodeProperties: ["embedding"]}) YIELD node, neighbour, similarity + CALL knn.get({nodeProperties: ["embedding"], sampleRate: 1.0}) YIELD node, neighbour, similarity RETURN node.id as node_id, neighbour.id as neighbour_id, similarity output: - node_id: 1 neighbour_id: 2 - similarity: 1 + similarity: 1.0 - node_id: 2 neighbour_id: 1 - similarity: 1 + similarity: 1.0 From 13d2d7de6292ac0da9889078843d04404927cc5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Tue, 30 Sep 2025 09:27:12 +0200 Subject: [PATCH 08/12] Uncomment modules --- cpp/CMakeLists.txt | 68 +++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ea0c762cc..3110c6c86 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -115,40 +115,40 @@ endfunction() # Available query modules. 
include(CTest) enable_testing() -# add_subdirectory(betweenness_centrality_module) -# add_subdirectory(biconnected_components_module) -# add_subdirectory(bipartite_matching_module) -# add_subdirectory(bridges_module) -# add_subdirectory(connectivity_module) -# add_subdirectory(cycles_module) -# add_subdirectory(community_detection_module) -# add_subdirectory(pagerank_module) -# add_subdirectory(uuid_module) -# add_subdirectory(katz_centrality_module) -# add_subdirectory(degree_centrality_module) -# add_subdirectory(graph_util_module) -# add_subdirectory(node_similarity_module) -# add_subdirectory(distance_calculator) -# add_subdirectory(do_module) -# add_subdirectory(periodic_module) -# add_subdirectory(util_module) -# add_subdirectory(create_module) -# add_subdirectory(map_module) -# add_subdirectory(collections_module) -# add_subdirectory(nodes_module) -# add_subdirectory(meta_module) -# add_subdirectory(label_module) -# add_subdirectory(text_module) -# add_subdirectory(path_module) -# add_subdirectory(node_module) -# add_subdirectory(neighbors_module) -# add_subdirectory(refactor_module) -# add_subdirectory(merge_module) -# add_subdirectory(csv_utils_module) -# add_subdirectory(algo_module) -# add_subdirectory(set_property_module) -# add_subdirectory(leiden_community_detection_module) -# add_subdirectory(math_module) +add_subdirectory(betweenness_centrality_module) +add_subdirectory(biconnected_components_module) +add_subdirectory(bipartite_matching_module) +add_subdirectory(bridges_module) +add_subdirectory(connectivity_module) +add_subdirectory(cycles_module) +add_subdirectory(community_detection_module) +add_subdirectory(pagerank_module) +add_subdirectory(uuid_module) +add_subdirectory(katz_centrality_module) +add_subdirectory(degree_centrality_module) +add_subdirectory(graph_util_module) +add_subdirectory(node_similarity_module) +add_subdirectory(distance_calculator) +add_subdirectory(do_module) +add_subdirectory(periodic_module) +add_subdirectory(util_module) +add_subdirectory(create_module) +add_subdirectory(map_module) +add_subdirectory(collections_module) +add_subdirectory(nodes_module) +add_subdirectory(meta_module) +add_subdirectory(label_module) +add_subdirectory(text_module) +add_subdirectory(path_module) +add_subdirectory(node_module) +add_subdirectory(neighbors_module) +add_subdirectory(refactor_module) +add_subdirectory(merge_module) +add_subdirectory(csv_utils_module) +add_subdirectory(algo_module) +add_subdirectory(set_property_module) +add_subdirectory(leiden_community_detection_module) +add_subdirectory(math_module) add_subdirectory(knn_module) add_cugraph_subdirectory(cugraph_module) From c843fea5fe657914ff444b5e85c2f157aab1bccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josip=20Mr=C4=91en?= Date: Tue, 30 Sep 2025 15:44:56 +0200 Subject: [PATCH 09/12] Update cmakelists --- cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3110c6c86..c3b3ed432 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,10 +28,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall \ # Don't omit frame pointer in RelWithDebInfo, for additional callchain debug. set(CMAKE_CXX_FLAGS_RELWITHDEBINFO - "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -fno-omit-frame-pointer") + "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O2 -funsafe-math-optimizations -fvect-cost-model=dynamic -fno-omit-frame-pointer") # Release flags. 
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+set(CMAKE_CXX_FLAGS_RELEASE "-O2 -funsafe-math-optimizations -fvect-cost-model=dynamic -DNDEBUG")
 
 set(CMAKE_SHARED_LIBRARY_PREFIX "")
 
 find_package(Threads REQUIRED)

From 25c16d72a640a94e9afd1435f18171931f170d0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josip=20Mr=C4=91en?=
Date: Wed, 1 Oct 2025 10:10:38 +0200
Subject: [PATCH 10/12] Address PR comments

---
 cpp/knn_module/algorithms/knn.hpp | 17 +++++------------
 cpp/knn_module/knn_module.cpp     |  8 +++-----
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp
index 6722190b6..ded99e1db 100644
--- a/cpp/knn_module/algorithms/knn.hpp
+++ b/cpp/knn_module/algorithms/knn.hpp
@@ -17,7 +17,7 @@ struct KNNConfig {
   double similarity_cutoff = 0.0;
   double delta_threshold = 0.001;
   int max_iterations = 100;
-  int random_seed = 42;
+  int random_seed = 42;  // the value is being set from the knn_module.cpp file
   double sample_rate = 0.5;
   int concurrency = 1;
   std::vector node_properties;
@@ -227,10 +227,8 @@ std::vector CalculateSimilarityForNode(const size_t node_id
   if (k > 0 && results.size() > k) {
     std::nth_element(results.begin(), results.begin() + k, results.end(), cmp);
     results.resize(k);
-    std::sort(results.begin(), results.end(), cmp);  // sort only top-k
-  } else {
-    std::sort(results.begin(), results.end(), cmp);  // small n or k >= n
   }
+  std::sort(results.begin(), results.end(), cmp);
 
   return results;
 }
@@ -240,14 +238,9 @@ void InsertTopKResults(const std::vector &top_k_results, co
                        std::vector<std::tuple<mgp::Node, mgp::Node, double>> &final_results) {
   // Convert to final results with actual nodes (results are already sorted)
   for (const auto &result : top_k_results) {
-    try {
-      const auto node1 = graph.GetNodeById(result.node1_id);
-      const auto node2 = graph.GetNodeById(result.node2_id);
-      final_results.push_back(std::make_tuple(node1, node2, result.similarity));
-    } catch (const std::exception &e) {
-      // Skip if node not found
-      continue;
-    }
+    const auto node1 = graph.GetNodeById(result.node1_id);
+    const auto node2 = graph.GetNodeById(result.node2_id);
+    final_results.emplace_back(node1, node2, result.similarity);
   }
 }
 
diff --git a/cpp/knn_module/knn_module.cpp b/cpp/knn_module/knn_module.cpp
index 48c34b2f2..3b2d516f8 100644
--- a/cpp/knn_module/knn_module.cpp
+++ b/cpp/knn_module/knn_module.cpp
@@ -157,12 +157,10 @@ void Get(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memo
     // Parse random seed with validation
     if (config_map.KeyExists(kConfigRandomSeed)) {
-      config.random_seed = static_cast<int>(config_map[kConfigRandomSeed].ValueInt());
-      // If seed is provided, concurrency must be 1 for deterministic results
-      if (config.concurrency != 1) {
-        throw mgp::ValueException(
-            "When 'randomSeed' is specified, 'concurrency' must be set to 1 for deterministic results");
+      if (!config_map[kConfigRandomSeed].IsInt()) {
+        throw mgp::ValueException("randomSeed must be an integer");
       }
+      config.random_seed = static_cast<int>(config_map[kConfigRandomSeed].ValueInt());
     } else {
       // Generate completely random seed
       std::random_device rd;
       config.random_seed = static_cast<int>(rd());
     }

From e41e7c1d7630d21b8ce1bac3eba56278c8d4d71b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josip=20Mr=C4=91en?=
Date: Wed, 1 Oct 2025 10:19:39 +0200
Subject: [PATCH 11/12] Add comment for SonarCloud

---
 cpp/knn_module/algorithms/knn.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp
index ded99e1db..8e26fe72d 100644
--- a/cpp/knn_module/algorithms/knn.hpp
+++ b/cpp/knn_module/algorithms/knn.hpp
@@ -163,7 +163,7 @@ double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2
 // Get candidate indices for comparison, excluding self
 std::vector<size_t> GetCandidateIndices(const size_t node_idx, std::vector<size_t> &all_indices,
                                         const knn_util::KNNConfig &config) {
-  // Shuffle indices for uniform sampling
+  // Safe: std::mt19937 is used for reproducible simulations, not cryptography
   std::mt19937 rng(config.random_seed);
   std::shuffle(all_indices.begin(), all_indices.end(), rng);

From 007d0c9390c5f61b7e559a8c087b71b6bb23f109 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josip=20Mr=C4=91en?=
Date: Wed, 1 Oct 2025 10:27:33 +0200
Subject: [PATCH 12/12] Suppress sonar

---
 cpp/knn_module/algorithms/knn.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/knn_module/algorithms/knn.hpp b/cpp/knn_module/algorithms/knn.hpp
index 8e26fe72d..2fba169fa 100644
--- a/cpp/knn_module/algorithms/knn.hpp
+++ b/cpp/knn_module/algorithms/knn.hpp
@@ -164,8 +164,8 @@ double CalculateNodeSimilarity(const NodeData &node1_data, const NodeData &node2
 std::vector<size_t> GetCandidateIndices(const size_t node_idx, std::vector<size_t> &all_indices,
                                         const knn_util::KNNConfig &config) {
   // Safe: std::mt19937 is used for reproducible simulations, not cryptography
-  std::mt19937 rng(config.random_seed);
-  std::shuffle(all_indices.begin(), all_indices.end(), rng);
+  std::mt19937 rng(config.random_seed);                       // NOSONAR
+  std::shuffle(all_indices.begin(), all_indices.end(), rng);  // NOSONAR
 
   const size_t sample_size = static_cast<size_t>(all_indices.size() * config.sample_rate);