diff --git a/CMakeLists.txt b/CMakeLists.txt index 4044cd90..cad8ef6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ endif() # end up defined differently. There is probably a better way to achieve # this than assuming what absl used. # Using CACHE allows the user to override the default. -set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ standard to build with") +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) # No compiler-specific extensions, i.e. -std=c++11, not -std=gnu++11. set(CMAKE_CXX_EXTENSIONS OFF) @@ -73,6 +73,7 @@ endif() # add_subdirectory(s2-submodule) if (NOT TARGET absl::base) find_package(absl REQUIRED) + find_package(GTest) endif() find_package(OpenSSL REQUIRED) # pthreads isn't used directly, but this is still required for std::thread. @@ -216,13 +217,11 @@ add_library(s2 src/s2/util/math/mathutil.cc src/s2/util/units/length-units.cc) -if (GTEST_ROOT) add_library(s2testing STATIC src/s2/s2builderutil_testing.cc src/s2/s2shapeutil_testing.cc src/s2/s2testing.cc src/s2/thread_testing.cc) -endif() target_link_libraries( s2 @@ -248,13 +247,11 @@ target_link_libraries( absl::utility ${CMAKE_THREAD_LIBS_INIT}) -if (GTEST_ROOT) target_link_libraries( s2testing ${GFLAGS_LIBRARIES} ${GLOG_LIBRARIES} absl::memory absl::strings) -endif() # Allow other CMake projects to use this one with: # list(APPEND CMAKE_MODULE_PATH "/third_party/cmake") @@ -428,22 +425,13 @@ install(FILES src/s2/util/units/length-units.h src/s2/util/units/physical-units.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/s2/util/units") -if (GTEST_ROOT) set(S2_TARGETS s2 s2testing) -else() - set(S2_TARGETS s2) -endif() install(TARGETS ${S2_TARGETS} RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}") -message("GTEST_ROOT: ${GTEST_ROOT}") -if (GTEST_ROOT) - add_subdirectory(${GTEST_ROOT} build_gtest) - include_directories(${GTEST_ROOT}/include) - set(S2TestFiles src/s2/encoded_s2cell_id_vector_test.cc src/s2/encoded_s2point_vector_test.cc @@ -570,10 +558,9 @@ if (GTEST_ROOT) absl::span absl::strings absl::synchronization - gtest_main) + GTest::gtest_main) add_test(${test} ${test}) endforeach() -endif() if (BUILD_EXAMPLES AND TARGET s2testing) add_subdirectory("doc/examples" examples) @@ -582,3 +569,21 @@ endif() if (${SWIG_FOUND} AND ${Python3_FOUND}) add_subdirectory("src/python" python) endif() + +find_package(benchmark) + +add_executable(s2region_term_indexer_benchmark src/s2/s2region_term_indexer_benchmark.cpp) +target_link_libraries( + s2region_term_indexer_benchmark + PUBLIC + s2testing s2 + absl::base + absl::btree + absl::core_headers + absl::flags_reflection + absl::memory + absl::span + absl::strings + absl::synchronization + benchmark::benchmark + benchmark::benchmark_main) diff --git a/src/s2/s2region_term_indexer.cc b/src/s2/s2region_term_indexer.cc index 5c73e66d..8deee62e 100644 --- a/src/s2/s2region_term_indexer.cc +++ b/src/s2/s2region_term_indexer.cc @@ -61,6 +61,12 @@ // never be any document regions larger than the query region. This can // significantly reduce the size of queries. // +// + If the query will contain only points (rather than general regions), then +// we can skip all the ancestor terms mentioned above (except last cell see +// `GetIndexTerms(const S2Point& point...` for details) because there will +// never be any document regions larger than the index region. This can +// significantly reduce the size of index. +// // + If it is more important to optimize index size rather than query speed, // the number of index terms can be reduced by creating ancestor terms only // for the *proper* ancestors of the cells in a document region, and @@ -126,6 +132,14 @@ string S2RegionTermIndexer::GetTerm(TermType term_type, const S2CellId id, vector S2RegionTermIndexer::GetIndexTerms(const S2Point& point, string_view prefix) { + vector terms; + GetIndexTerms(point, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetIndexTerms(const S2Point& point, + string_view prefix, + vector* terms) { // See the top of this file for an overview of the indexing strategy. // // The last cell generated by this loop is effectively the covering for @@ -136,12 +150,13 @@ vector S2RegionTermIndexer::GetIndexTerms(const S2Point& point, // max_level() != true_max_level() (see S2RegionCoverer::Options). const S2CellId id(point); - vector terms; - for (int level = options_.min_level(); level <= options_.max_level(); - level += options_.level_mod()) { - terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); + int level = options_.min_level(); + if (options_.query_contains_points_only()) { + level = options_.true_max_level(); + } + for (; level <= options_.max_level(); level += options_.level_mod()) { + terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); } - return terms; } vector S2RegionTermIndexer::GetIndexTerms(const S2Region& region, @@ -154,6 +169,13 @@ vector S2RegionTermIndexer::GetIndexTerms(const S2Region& region, vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( const S2CellUnion& covering, string_view prefix) { + vector terms; + GetIndexTermsForCanonicalCovering(covering, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( + const S2CellUnion& covering, string_view prefix, vector* terms) { // See the top of this file for an overview of the indexing strategy. // // Cells in the covering are normally indexed as covering terms. If we are @@ -168,7 +190,6 @@ vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( *coverer_.mutable_options() = options_; S2_CHECK(coverer_.IsCanonical(covering)); } - vector terms; S2CellId prev_id = S2CellId::None(); int true_max_level = options_.true_max_level(); for (S2CellId id : covering) { @@ -178,14 +199,20 @@ vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( S2_DCHECK_GE(level, options_.min_level()); S2_DCHECK_LE(level, options_.max_level()); S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod()); + S2_DCHECK_LE(level, options_.true_max_level()); - if (level < true_max_level) { - // Add a covering term for this cell. - terms.push_back(GetTerm(TermType::COVERING, id, prefix)); - } - if (level == true_max_level || !options_.optimize_for_space()) { - // Add an ancestor term for this cell at the constrained level. - terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); + const bool is_max_level_cell = level == true_max_level; + // Add a term for this cell, max_level cell ANCESTOR is optimization + terms->push_back(GetTerm(is_max_level_cell ? TermType::ANCESTOR + : TermType::COVERING, + id, prefix)); + + // If query only contains points, there are no need other terms. + if (options_.query_contains_points_only()) continue; + + if (!options_.optimize_for_space() && !is_max_level_cell) { + // Add an ancestor term for this cell. + terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix)); } // Finally, add ancestor terms for all the ancestors of this cell. while ((level -= options_.level_mod()) >= options_.min_level()) { @@ -194,29 +221,34 @@ vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( prev_id.parent(level) == ancestor_id) { break; // We have already processed this cell and its ancestors. } - terms.push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix)); + terms->push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix)); } prev_id = id; } - return terms; } vector S2RegionTermIndexer::GetQueryTerms(const S2Point& point, string_view prefix) { + vector terms; + GetQueryTerms(point, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetQueryTerms(const S2Point& point, + string_view prefix, + vector* terms) { // See the top of this file for an overview of the indexing strategy. const S2CellId id(point); - vector terms; // Recall that all true_max_level() cells are indexed only as ancestor terms. int level = options_.true_max_level(); - terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); - if (options_.index_contains_points_only()) return terms; + terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); + if (options_.index_contains_points_only()) return; // Add covering terms for all the ancestor cells. for (; level >= options_.min_level(); level -= options_.level_mod()) { - terms.push_back(GetTerm(TermType::COVERING, id.parent(level), prefix)); + terms->push_back(GetTerm(TermType::COVERING, id.parent(level), prefix)); } - return terms; } vector S2RegionTermIndexer::GetQueryTerms(const S2Region& region, @@ -229,13 +261,20 @@ vector S2RegionTermIndexer::GetQueryTerms(const S2Region& region, vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( const S2CellUnion& covering, string_view prefix) { + vector terms; + GetQueryTermsForCanonicalCovering(covering, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( + const S2CellUnion& covering, string_view prefix, vector* terms) { // See the top of this file for an overview of the indexing strategy. + S2_CHECK(!options_.query_contains_points_only()); if (google::DEBUG_MODE) { *coverer_.mutable_options() = options_; S2_CHECK(coverer_.IsCanonical(covering)); } - vector terms; S2CellId prev_id = S2CellId::None(); int true_max_level = options_.true_max_level(); for (S2CellId id : covering) { @@ -245,9 +284,10 @@ vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( S2_DCHECK_GE(level, options_.min_level()); S2_DCHECK_LE(level, options_.max_level()); S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod()); + S2_DCHECK_LE(level, options_.true_max_level()); // Cells in the covering are always queried as ancestor terms. - terms.push_back(GetTerm(TermType::ANCESTOR, id, prefix)); + terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix)); // If the index only contains points, there are no covering terms. if (options_.index_contains_points_only()) continue; @@ -255,8 +295,8 @@ vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( // If we are optimizing for index space rather than query time, cells are // also queried as covering terms (except for true_max_level() cells, // which are indexed and queried as ancestor cells only). - if (options_.optimize_for_space() && level < true_max_level) { - terms.push_back(GetTerm(TermType::COVERING, id, prefix)); + if (options_.optimize_for_space() && level != true_max_level) { + terms->push_back(GetTerm(TermType::COVERING, id, prefix)); } // Finally, add covering terms for all the ancestors of this cell. while ((level -= options_.level_mod()) >= options_.min_level()) { @@ -265,9 +305,8 @@ vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( prev_id.parent(level) == ancestor_id) { break; // We have already processed this cell and its ancestors. } - terms.push_back(GetTerm(TermType::COVERING, ancestor_id, prefix)); + terms->push_back(GetTerm(TermType::COVERING, ancestor_id, prefix)); } prev_id = id; } - return terms; } diff --git a/src/s2/s2region_term_indexer.h b/src/s2/s2region_term_indexer.h index 44650799..17587c5d 100644 --- a/src/s2/s2region_term_indexer.h +++ b/src/s2/s2region_term_indexer.h @@ -196,8 +196,21 @@ class S2RegionTermIndexer { // this flag if your index consists entirely of points.) // // DEFAULT: false - bool index_contains_points_only() const { return points_only_; } - void set_index_contains_points_only(bool value) { points_only_ = value; } + bool index_contains_points_only() const { return index_points_only_; } + void set_index_contains_points_only(bool value) { index_points_only_ = value; } + + // If your query will only contain points (rather than regions), be sure + // to set this flag. This will generate smaller and faster index that + // are specialized for the points-only case. + // + // With the default quality settings, this flag reduces the number of + // index terms by about a factor of two. (The improvement gets smaller + // as max_cells() is increased, but there is really no reason not to use + // this flag if your query consist entirely of points.) + // + // DEFAULT: false + bool query_contains_points_only() const { return query_points_only_; } + void set_query_contains_points_only(bool value) { query_points_only_ = value; } // If true, the index will be optimized for space rather than for query // time. With the default quality settings, this flag reduces the number @@ -221,7 +234,8 @@ class S2RegionTermIndexer { void set_marker_character(char ch); private: - bool points_only_ = false; + bool index_points_only_ = false; + bool query_points_only_ = false; bool optimize_for_space_ = false; std::string marker_ = std::string(1, '$'); }; @@ -287,6 +301,21 @@ class S2RegionTermIndexer { std::vector GetQueryTermsForCanonicalCovering( const S2CellUnion& covering, absl::string_view prefix); + // Same as above but allows to reuse same buffer for different points or use + // single buffer for multiple points (common case is GeoJson MultiPoint) + void GetIndexTerms(const S2Point& point, absl::string_view prefix, + std::vector* terms); + void GetQueryTerms(const S2Point& point, absl::string_view prefix, + std::vector* terms); + + // Same as above but allows to reuse same buffer for different covering + void GetIndexTermsForCanonicalCovering(const S2CellUnion &covering, + absl::string_view prefix, + std::vector *terms); + void GetQueryTermsForCanonicalCovering(const S2CellUnion &covering, + absl::string_view prefix, + std::vector *terms); + private: enum TermType { ANCESTOR, COVERING }; diff --git a/src/s2/s2region_term_indexer_benchmark.cpp b/src/s2/s2region_term_indexer_benchmark.cpp new file mode 100644 index 00000000..98f602a8 --- /dev/null +++ b/src/s2/s2region_term_indexer_benchmark.cpp @@ -0,0 +1,67 @@ +#include "s2region_term_indexer.h" +#include "s2testing.h" +#include + +static void BM_OldApiGetIndexTerms(benchmark::State &state) { + S2RegionTermIndexer indexer; + benchmark::DoNotOptimize(indexer); + auto const count = state.range(0); + for (auto _ : state) { + for (int64_t i = 0; i < count; ++i) { + auto value = indexer.GetIndexTerms(S2Testing::RandomPoint(), {}); + benchmark::DoNotOptimize(value); + } + } +} + +BENCHMARK(BM_OldApiGetIndexTerms)->RangeMultiplier(4)->Range(1, 65536); + +static void BM_NewApiGetIndexTerms(benchmark::State &state) { + S2RegionTermIndexer indexer; + benchmark::DoNotOptimize(indexer); + auto const count = state.range(0); + for (auto _ : state) { + std::vector terms; + benchmark::DoNotOptimize(terms); + for (int64_t i = 0; i < count; ++i) { + terms.clear(); + indexer.GetIndexTerms(S2Testing::RandomPoint(), {}, &terms); + benchmark::DoNotOptimize(terms); + } + } +} + +BENCHMARK(BM_NewApiGetIndexTerms)->RangeMultiplier(4)->Range(1, 65536); + +static void BM_OldApiGetQueryTerms(benchmark::State &state) { + S2RegionTermIndexer indexer; + benchmark::DoNotOptimize(indexer); + auto const count = state.range(0); + for (auto _ : state) { + for (int64_t i = 0; i < count; ++i) { + auto value = indexer.GetQueryTerms(S2Testing::RandomPoint(), {}); + benchmark::DoNotOptimize(value); + } + } +} + +BENCHMARK(BM_OldApiGetQueryTerms)->RangeMultiplier(4)->Range(1, 65536); + +static void BM_NewApiGetQueryTerms(benchmark::State &state) { + S2RegionTermIndexer indexer; + benchmark::DoNotOptimize(indexer); + auto const count = state.range(0); + for (auto _ : state) { + std::vector terms; + benchmark::DoNotOptimize(terms); + for (int64_t i = 0; i < count; ++i) { + terms.clear(); + indexer.GetQueryTerms(S2Testing::RandomPoint(), {}, &terms); + benchmark::DoNotOptimize(terms); + } + } +} + +BENCHMARK(BM_NewApiGetQueryTerms)->RangeMultiplier(4)->Range(1, 65536); + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/src/s2/s2region_term_indexer_test.cc b/src/s2/s2region_term_indexer_test.cc index bff52a00..65cd6f6b 100644 --- a/src/s2/s2region_term_indexer_test.cc +++ b/src/s2/s2region_term_indexer_test.cc @@ -27,6 +27,7 @@ #include "absl/container/flat_hash_map.h" #include "absl/flags/flag.h" +#include "absl/strings/str_cat.h" #include "s2/base/commandlineflags.h" #include "s2/base/logging.h" @@ -44,10 +45,13 @@ S2_DEFINE_int32(iters, 400, "number of iterations for testing"); namespace { -enum QueryType { POINT, CAP }; +enum DataType { + POINT = 0, + CAP = 1, +}; void TestRandomCaps(const S2RegionTermIndexer::Options& options, - QueryType query_type) { + DataType index_type, DataType query_type) { // This function creates an index consisting either of points (if // options.index_contains_points_only() is true) or S2Caps of random size. // It then executes queries consisting of points (if query_type == POINT) @@ -63,7 +67,7 @@ void TestRandomCaps(const S2RegionTermIndexer::Options& options, // of random size (up to a full sphere). S2Cap cap; vector terms; - if (options.index_contains_points_only()) { + if (index_type == DataType::POINT) { cap = S2Cap::FromPoint(S2Testing::RandomPoint()); terms = indexer.GetIndexTerms(cap.center(), ""); } else { @@ -84,7 +88,7 @@ void TestRandomCaps(const S2RegionTermIndexer::Options& options, // random size. S2Cap cap; vector terms; - if (query_type == QueryType::CAP) { + if (query_type == DataType::POINT) { cap = S2Cap::FromPoint(S2Testing::RandomPoint()); terms = indexer.GetQueryTerms(cap.center(), ""); } else { @@ -112,62 +116,80 @@ void TestRandomCaps(const S2RegionTermIndexer::Options& options, static_cast(query_terms) / absl::GetFlag(FLAGS_iters)); } -// We run one test case for each combination of space vs. time optimization, -// and indexing regions vs. only points. +using TestCase = std::tuple; + +class S2RegionTermIndexerTest : public testing::TestWithParam { +protected: + void SetUp() override { + index_type = std::get<0>(GetParam()); + query_type = std::get<1>(GetParam()); + options.set_optimize_for_space(std::get<2>(GetParam())); + options.set_index_contains_points_only(std::get<3>(GetParam())); + options.set_query_contains_points_only(std::get<4>(GetParam())); + if (index_type != DataType::POINT && options.index_contains_points_only()) { + GTEST_SKIP() << "Case index_type != DataType::POINT && " + "options.index_contains_points_only() is invalid."; + } + if (query_type != DataType::POINT && options.query_contains_points_only()) { + GTEST_SKIP() << "Case query_type != DataType::POINT && " + "options.query_contains_points_only() is invalid."; + } + } -TEST(S2RegionTermIndexer, IndexRegionsQueryRegionsOptimizeTime) { S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(0); // Use face cells. - options.set_max_level(16); - options.set_max_cells(20); - TestRandomCaps(options, QueryType::CAP); + DataType index_type{}; + DataType query_type{}; +}; + +// We run one test case for each combination: +// index_type: POINT, CAP +// query_type: POINT, CAP +// optimize_for_space: false, true +// index_contains_points_only: false, true +// query_contains_points_only: false, true +INSTANTIATE_TEST_CASE_P( + S2RegionTermIndexerTests, S2RegionTermIndexerTest, + testing::Combine(testing::Values(DataType::POINT, DataType::CAP), + testing::Values(DataType::POINT, DataType::CAP), + testing::Bool(), testing::Bool(), testing::Bool()), + [](const testing::TestParamInfo& info) { + return absl::StrCat( + "Index", std::get<0>(info.param), "Query", std::get<1>(info.param), + "SpaceOpt", std::get<2>(info.param), "IndexOpt", + std::get<3>(info.param), "QueryOpt", std::get<4>(info.param)); + }); + +TEST_P(S2RegionTermIndexerTest, DefaultParametersValues) { + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexRegionsQueryPointsOptimizeTime) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(0); // Use face cells. +TEST_P(S2RegionTermIndexerTest, UseFaceCells) { + options.set_min_level(0); options.set_max_level(16); options.set_max_cells(20); - TestRandomCaps(options, QueryType::POINT); + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexRegionsQueryRegionsOptimizeTimeWithLevelMod) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(6); // Constrain min/max levels. +TEST_P(S2RegionTermIndexerTest, ConstrainMinMaxLevels) { + options.set_min_level(6); options.set_max_level(12); options.set_level_mod(3); - TestRandomCaps(options, QueryType::CAP); + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexRegionsQueryRegionsOptimizeSpace) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(true); // Optimize for space. +TEST_P(S2RegionTermIndexerTest, UseLeafCells) { options.set_min_level(4); - options.set_max_level(S2CellId::kMaxLevel); // Use leaf cells. + options.set_max_level(S2CellId::kMaxLevel); options.set_max_cells(8); - TestRandomCaps(options, QueryType::CAP); + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexPointsQueryRegionsOptimizeTime) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(0); // Use face cells. +TEST_P(S2RegionTermIndexerTest, UseFaceCells2) { + options.set_min_level(0); options.set_max_level(S2CellId::kMaxLevel); options.set_level_mod(2); options.set_max_cells(20); - options.set_index_contains_points_only(true); - TestRandomCaps(options, QueryType::CAP); -} - -TEST(S2RegionTermIndexer, IndexPointsQueryRegionsOptimizeSpace) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(true); // Optimize for space. - options.set_index_contains_points_only(true); - // Use default parameter values. - TestRandomCaps(options, QueryType::CAP); + TestRandomCaps(options, index_type, query_type); } TEST(S2RegionTermIndexer, MarkerCharacter) {