-
Notifications
You must be signed in to change notification settings - Fork 1
Add type-safe index parameter system with dynamic IVF calculation based on join window vector count #55
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add type-safe index parameter system with dynamic IVF calculation based on join window vector count #55
Changes from 2 commits
d94ab26
c4bcbef
4ecae83
fb513c8
2bca13f
7805b6a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| #include <cassert> | ||
| #include <iostream> | ||
| #include <algorithm> | ||
| #include <cmath> | ||
|
|
||
| #include "utils/logger.h" | ||
|
|
||
|
|
@@ -28,6 +29,13 @@ bool JoinOperator::createIndexPair(IndexType type, const std::string& prefix) { | |
| return left_index_id_ != -1 && right_index_id_ != -1; | ||
| } | ||
|
|
||
| bool JoinOperator::createIndexPair(IndexType type, const std::string& prefix, const IndexParameters& params) { | ||
| if (!concurrency_manager_) return false; | ||
| left_index_id_ = concurrency_manager_->create_index(prefix + "_left", type, join_func_->getDim(), params); | ||
| right_index_id_ = concurrency_manager_->create_index(prefix + "_right", type, join_func_->getDim(), params); | ||
| return left_index_id_ != -1 && right_index_id_ != -1; | ||
| } | ||
|
|
||
| static inline std::string to_lower_copy(std::string v) { | ||
| std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c){return char(std::tolower(c));}); | ||
| return v; | ||
|
|
@@ -66,31 +74,50 @@ JoinOperator::JoinOperator(std::unique_ptr<Function> &join_func, | |
|
|
||
| if (algo == "ivf") { | ||
| index_kind_ = InternalIndexKind::IVF; | ||
| if (createIndexPair(IndexType::IVF, "join_ivf")) { | ||
| // Calculate IVF parameters based on window size | ||
| // nlist = 4 * sqrt(window_size/step_size), rebuild_threshold = 1.5, nprobes = 10 | ||
| int64_t window_size = join_func_->getWindowSize(); | ||
| int64_t step_size = join_func_->getStepSize(); | ||
| // Calculate actual vector count in window | ||
| int64_t vector_count = (step_size > 0) ? (window_size / step_size) : window_size; | ||
|
||
| int nlist = static_cast<int>(4.0 * std::sqrt(static_cast<double>(vector_count))); | ||
| // Ensure nlist is at least 1 | ||
| if (nlist < 1) nlist = 1; | ||
|
|
||
| IVFParameters ivf_params{ | ||
| .nlist = nlist, | ||
| .rebuild_threshold = 1.5, | ||
| .nprobes = 10 | ||
| }; | ||
|
|
||
| if (createIndexPair(IndexType::IVF, "join_ivf", ivf_params)) { | ||
| use_index_ = true; | ||
| join_method_ = std::make_unique<IvfJoinMethod>(left_index_id_, right_index_id_, | ||
| join_similarity_threshold_, concurrency_manager_); | ||
| join_similarity_threshold_, concurrency_manager_); | ||
| } else { | ||
| index_kind_ = InternalIndexKind::NONE; | ||
| use_index_ = false; | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>(-1, -1, join_similarity_threshold_, concurrency_manager_); | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>( | ||
| -1, -1, join_similarity_threshold_, concurrency_manager_); | ||
| } | ||
| } else if (algo == "bruteforce" || algo == "bf" ) { | ||
| index_kind_ = InternalIndexKind::BRUTEFORCE; | ||
| if (createIndexPair(IndexType::BruteForce, "join_bf")) { | ||
| use_index_ = true; | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>(left_index_id_, right_index_id_, | ||
| join_similarity_threshold_, concurrency_manager_); | ||
| join_similarity_threshold_, concurrency_manager_); | ||
| } else { | ||
| index_kind_ = InternalIndexKind::NONE; | ||
| use_index_ = false; | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>(-1, -1, join_similarity_threshold_, concurrency_manager_); | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>( | ||
| -1, -1, join_similarity_threshold_, concurrency_manager_); | ||
| } | ||
| } else { | ||
| index_kind_ = InternalIndexKind::NONE; | ||
| use_index_ = false; | ||
| is_eager_ = false; | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>(-1, -1, join_similarity_threshold_, concurrency_manager_); | ||
| join_method_ = std::make_unique<BruteForceJoinMethod>( | ||
| -1, -1, join_similarity_threshold_, concurrency_manager_); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment states the formula uses
`window_size/step_size`, but the actual implementation on line 82 uses `vector_count`, which is calculated differently. The comment should accurately reflect that `vector_count` represents the actual number of vectors in the window, not just the raw division result.