Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Term indexer benchmark #304

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 21 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ endif()
# end up defined differently. There is probably a better way to achieve
# this than assuming what absl used.
# Using CACHE allows the user to override the default.
set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ standard to build with")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# No compiler-specific extensions, i.e. -std=c++11, not -std=gnu++11.
set(CMAKE_CXX_EXTENSIONS OFF)
Expand Down Expand Up @@ -73,6 +73,7 @@ endif()
# add_subdirectory(s2-submodule)
if (NOT TARGET absl::base)
find_package(absl REQUIRED)
find_package(GTest)
endif()
find_package(OpenSSL REQUIRED)
# pthreads isn't used directly, but this is still required for std::thread.
Expand Down Expand Up @@ -216,13 +217,11 @@ add_library(s2
src/s2/util/math/mathutil.cc
src/s2/util/units/length-units.cc)

if (GTEST_ROOT)
add_library(s2testing STATIC
src/s2/s2builderutil_testing.cc
src/s2/s2shapeutil_testing.cc
src/s2/s2testing.cc
src/s2/thread_testing.cc)
endif()

target_link_libraries(
s2
Expand All @@ -248,13 +247,11 @@ target_link_libraries(
absl::utility
${CMAKE_THREAD_LIBS_INIT})

if (GTEST_ROOT)
target_link_libraries(
s2testing
${GFLAGS_LIBRARIES} ${GLOG_LIBRARIES}
absl::memory
absl::strings)
endif()

# Allow other CMake projects to use this one with:
# list(APPEND CMAKE_MODULE_PATH "<path_to_s2geometry_dir>/third_party/cmake")
Expand Down Expand Up @@ -428,22 +425,13 @@ install(FILES src/s2/util/units/length-units.h
src/s2/util/units/physical-units.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/s2/util/units")

if (GTEST_ROOT)
set(S2_TARGETS s2 s2testing)
else()
set(S2_TARGETS s2)
endif()

install(TARGETS ${S2_TARGETS}
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")

message("GTEST_ROOT: ${GTEST_ROOT}")
if (GTEST_ROOT)
add_subdirectory(${GTEST_ROOT} build_gtest)
include_directories(${GTEST_ROOT}/include)

set(S2TestFiles
src/s2/encoded_s2cell_id_vector_test.cc
src/s2/encoded_s2point_vector_test.cc
Expand Down Expand Up @@ -570,10 +558,9 @@ if (GTEST_ROOT)
absl::span
absl::strings
absl::synchronization
gtest_main)
GTest::gtest_main)
add_test(${test} ${test})
endforeach()
endif()

if (BUILD_EXAMPLES AND TARGET s2testing)
add_subdirectory("doc/examples" examples)
Expand All @@ -582,3 +569,21 @@ endif()
if (${SWIG_FOUND} AND ${Python3_FOUND})
add_subdirectory("src/python" python)
endif()

find_package(benchmark)

add_executable(s2region_term_indexer_benchmark src/s2/s2region_term_indexer_benchmark.cpp)
target_link_libraries(
s2region_term_indexer_benchmark
PUBLIC
s2testing s2
absl::base
absl::btree
absl::core_headers
absl::flags_reflection
absl::memory
absl::span
absl::strings
absl::synchronization
benchmark::benchmark
benchmark::benchmark_main)
91 changes: 65 additions & 26 deletions src/s2/s2region_term_indexer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@
// never be any document regions larger than the query region. This can
// significantly reduce the size of queries.
//
// + If the query will contain only points (rather than general regions), then
// we can skip all the ancestor terms mentioned above (except for the last
// cell; see `GetIndexTerms(const S2Point& point...` for details) because
// there will never be any query regions larger than a document region. This
// can significantly reduce the size of the index.
//
// + If it is more important to optimize index size rather than query speed,
// the number of index terms can be reduced by creating ancestor terms only
// for the *proper* ancestors of the cells in a document region, and
Expand Down Expand Up @@ -126,6 +132,14 @@ string S2RegionTermIndexer::GetTerm(TermType term_type, const S2CellId id,

// Convenience overload that returns the index terms for "point" by value.
// It creates an empty vector, lets the output-parameter overload of
// GetIndexTerms() fill it in, and returns the result.
vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Point& point,
string_view prefix) {
vector<string> terms;
GetIndexTerms(point, prefix, &terms);
return terms;
}

void S2RegionTermIndexer::GetIndexTerms(const S2Point& point,
string_view prefix,
vector<string>* terms) {
// See the top of this file for an overview of the indexing strategy.
//
// The last cell generated by this loop is effectively the covering for
Expand All @@ -136,12 +150,13 @@ vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Point& point,
// max_level() != true_max_level() (see S2RegionCoverer::Options).

const S2CellId id(point);
vector<string> terms;
for (int level = options_.min_level(); level <= options_.max_level();
level += options_.level_mod()) {
terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
int level = options_.min_level();
if (options_.query_contains_points_only()) {
level = options_.true_max_level();
}
for (; level <= options_.max_level(); level += options_.level_mod()) {
terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
}
return terms;
}

vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Region& region,
Expand All @@ -154,6 +169,13 @@ vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Region& region,

// Convenience overload that returns the index terms for "covering" by value.
// It creates an empty vector, lets the output-parameter overload of
// GetIndexTermsForCanonicalCovering() fill it in, and returns the result.
vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
const S2CellUnion& covering, string_view prefix) {
vector<string> terms;
GetIndexTermsForCanonicalCovering(covering, prefix, &terms);
return terms;
}

void S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
const S2CellUnion& covering, string_view prefix, vector<string>* terms) {
// See the top of this file for an overview of the indexing strategy.
//
// Cells in the covering are normally indexed as covering terms. If we are
Expand All @@ -168,7 +190,6 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
*coverer_.mutable_options() = options_;
S2_CHECK(coverer_.IsCanonical(covering));
}
vector<string> terms;
S2CellId prev_id = S2CellId::None();
int true_max_level = options_.true_max_level();
for (S2CellId id : covering) {
Expand All @@ -178,14 +199,20 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
S2_DCHECK_GE(level, options_.min_level());
S2_DCHECK_LE(level, options_.max_level());
S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod());
S2_DCHECK_LE(level, options_.true_max_level());

if (level < true_max_level) {
// Add a covering term for this cell.
terms.push_back(GetTerm(TermType::COVERING, id, prefix));
}
if (level == true_max_level || !options_.optimize_for_space()) {
// Add an ancestor term for this cell at the constrained level.
terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
const bool is_max_level_cell = level == true_max_level;
// Add a term for this cell. As an optimization, true_max_level() cells are
// indexed as ANCESTOR terms rather than COVERING terms.
terms->push_back(GetTerm(is_max_level_cell ? TermType::ANCESTOR
: TermType::COVERING,
id, prefix));

// If the query only contains points, no other terms are needed.
if (options_.query_contains_points_only()) continue;

if (!options_.optimize_for_space() && !is_max_level_cell) {
// Add an ancestor term for this cell.
terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix));
}
// Finally, add ancestor terms for all the ancestors of this cell.
while ((level -= options_.level_mod()) >= options_.min_level()) {
Expand All @@ -194,29 +221,34 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
prev_id.parent(level) == ancestor_id) {
break; // We have already processed this cell and its ancestors.
}
terms.push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix));
terms->push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix));
}
prev_id = id;
}
return terms;
}

// Convenience overload that returns the query terms for "point" by value.
// It creates an empty vector, lets the output-parameter overload of
// GetQueryTerms() fill it in, and returns the result.
vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Point& point,
string_view prefix) {
vector<string> terms;
GetQueryTerms(point, prefix, &terms);
return terms;
}

void S2RegionTermIndexer::GetQueryTerms(const S2Point& point,
string_view prefix,
vector<string>* terms) {
// See the top of this file for an overview of the indexing strategy.

const S2CellId id(point);
vector<string> terms;
// Recall that all true_max_level() cells are indexed only as ancestor terms.
int level = options_.true_max_level();
terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
if (options_.index_contains_points_only()) return terms;
terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
if (options_.index_contains_points_only()) return;

// Add covering terms for all the ancestor cells.
for (; level >= options_.min_level(); level -= options_.level_mod()) {
terms.push_back(GetTerm(TermType::COVERING, id.parent(level), prefix));
terms->push_back(GetTerm(TermType::COVERING, id.parent(level), prefix));
}
return terms;
}

vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Region& region,
Expand All @@ -229,13 +261,20 @@ vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Region& region,

// Convenience overload that returns the query terms for "covering" by value.
// It creates an empty vector, lets the output-parameter overload of
// GetQueryTermsForCanonicalCovering() fill it in, and returns the result.
vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
const S2CellUnion& covering, string_view prefix) {
vector<string> terms;
GetQueryTermsForCanonicalCovering(covering, prefix, &terms);
return terms;
}

void S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
const S2CellUnion& covering, string_view prefix, vector<string>* terms) {
// See the top of this file for an overview of the indexing strategy.

S2_CHECK(!options_.query_contains_points_only());
if (google::DEBUG_MODE) {
*coverer_.mutable_options() = options_;
S2_CHECK(coverer_.IsCanonical(covering));
}
vector<string> terms;
S2CellId prev_id = S2CellId::None();
int true_max_level = options_.true_max_level();
for (S2CellId id : covering) {
Expand All @@ -245,18 +284,19 @@ vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
S2_DCHECK_GE(level, options_.min_level());
S2_DCHECK_LE(level, options_.max_level());
S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod());
S2_DCHECK_LE(level, options_.true_max_level());

// Cells in the covering are always queried as ancestor terms.
terms.push_back(GetTerm(TermType::ANCESTOR, id, prefix));
terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix));

// If the index only contains points, there are no covering terms.
if (options_.index_contains_points_only()) continue;

// If we are optimizing for index space rather than query time, cells are
// also queried as covering terms (except for true_max_level() cells,
// which are indexed and queried as ancestor cells only).
if (options_.optimize_for_space() && level < true_max_level) {
terms.push_back(GetTerm(TermType::COVERING, id, prefix));
if (options_.optimize_for_space() && level != true_max_level) {
terms->push_back(GetTerm(TermType::COVERING, id, prefix));
}
// Finally, add covering terms for all the ancestors of this cell.
while ((level -= options_.level_mod()) >= options_.min_level()) {
Expand All @@ -265,9 +305,8 @@ vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
prev_id.parent(level) == ancestor_id) {
break; // We have already processed this cell and its ancestors.
}
terms.push_back(GetTerm(TermType::COVERING, ancestor_id, prefix));
terms->push_back(GetTerm(TermType::COVERING, ancestor_id, prefix));
}
prev_id = id;
}
return terms;
}
35 changes: 32 additions & 3 deletions src/s2/s2region_term_indexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,21 @@ class S2RegionTermIndexer {
// this flag if your index consists entirely of points.)
//
// DEFAULT: false
bool index_contains_points_only() const { return points_only_; }
void set_index_contains_points_only(bool value) { points_only_ = value; }
bool index_contains_points_only() const { return index_points_only_; }
void set_index_contains_points_only(bool value) { index_points_only_ = value; }

// If your queries will only contain points (rather than regions), be sure
// to set this flag. This will generate a smaller and faster index that
// is specialized for the points-only case.
//
// With the default quality settings, this flag reduces the number of
// index terms by about a factor of two. (The improvement gets smaller
// as max_cells() is increased, but there is really no reason not to use
// this flag if your queries consist entirely of points.)
//
// DEFAULT: false
bool query_contains_points_only() const { return query_points_only_; }
void set_query_contains_points_only(bool value) { query_points_only_ = value; }

// If true, the index will be optimized for space rather than for query
// time. With the default quality settings, this flag reduces the number
Expand All @@ -221,7 +234,8 @@ class S2RegionTermIndexer {
void set_marker_character(char ch);

private:
bool points_only_ = false;
bool index_points_only_ = false;
bool query_points_only_ = false;
bool optimize_for_space_ = false;
std::string marker_ = std::string(1, '$');
};
Expand Down Expand Up @@ -287,6 +301,21 @@ class S2RegionTermIndexer {
std::vector<std::string> GetQueryTermsForCanonicalCovering(
const S2CellUnion& covering, absl::string_view prefix);

// Same as above, but appends the terms to "*terms". This allows the same
// buffer to be reused for different points, or a single buffer to be used
// for multiple points (a common case is a GeoJSON MultiPoint).
void GetIndexTerms(const S2Point& point, absl::string_view prefix,
std::vector<std::string>* terms);
void GetQueryTerms(const S2Point& point, absl::string_view prefix,
std::vector<std::string>* terms);

// Same as above, but appends the terms to "*terms", which allows the same
// buffer to be reused for different coverings.
void GetIndexTermsForCanonicalCovering(const S2CellUnion &covering,
absl::string_view prefix,
std::vector<std::string> *terms);
void GetQueryTermsForCanonicalCovering(const S2CellUnion &covering,
absl::string_view prefix,
std::vector<std::string> *terms);

private:
enum TermType { ANCESTOR, COVERING };

Expand Down
Loading