From 2db2e90786f92aa65b773cc3c129632c6fb053b9 Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Fri, 7 Feb 2025 11:58:52 -0800 Subject: [PATCH] fix: wildcard support in json extract using simdjson (#12281) Summary: This change addresses 2 bugs in the wildcard support of json_extract: Bug 1: Failure to iterate on values inside an object, resulting in empty arrays. For eg: calling JSON_EXTRACT( JSON '{"foo": {"a" : {"c": 1 , "d": 3} }, "b": 2}', '$.foo.a.[*]') will return an empty array whereas we expect [1, 3] Bug 2: Inconsistent handling of indefinite results, always returning a JSON array even when the token is not evaluated. or eg: JSON_EXTRACT('({"a": {"b": [123, 456]}})', '$.a.c[*]') should return NULL instead of [] as the key 'c' does not exist. Reviewed By: kevinwilfong Differential Revision: D69281233 --- velox/functions/prestosql/JsonFunctions.h | 13 +++-- .../prestosql/json/SIMDJsonExtractor.h | 53 ++++++++++++------- .../json/tests/SIMDJsonExtractorTest.cpp | 18 ++++++- .../prestosql/tests/JsonFunctionsTest.cpp | 13 +++++ 4 files changed, 73 insertions(+), 24 deletions(-) diff --git a/velox/functions/prestosql/JsonFunctions.h b/velox/functions/prestosql/JsonFunctions.h index 1596a95a0c94..539d2e9e215e 100644 --- a/velox/functions/prestosql/JsonFunctions.h +++ b/velox/functions/prestosql/JsonFunctions.h @@ -254,7 +254,8 @@ struct JsonExtractScalarFunction { }; auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); - SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer)); + bool isDefinitePath = true; + SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer, isDefinitePath)); if (resultStr.has_value()) { result.copy_from(*resultStr); @@ -321,16 +322,17 @@ struct JsonExtractFunction { }; auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); - SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer)); + bool isDefinitePath = true; + SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer, isDefinitePath)); if (resultSize == 0) { - if (extractor.isDefinitePath()) { + if (isDefinitePath) { // If the path didn't map to anything in the JSON object, return null. return simdjson::NO_SUCH_FIELD; } result.copy_from("[]"); - } else if (resultSize == 1 && extractor.isDefinitePath()) { + } else if (resultSize == 1 && isDefinitePath) { // If there was only one value mapped to by the path, don't wrap it in an // array. result.copy_from(results); @@ -392,7 +394,8 @@ struct JsonSizeFunction { }; auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); - SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer)); + bool isDefinitePath = true; + SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer, isDefinitePath)); if (resultCount == 0) { // If the path didn't map to anything in the JSON object, return null. diff --git a/velox/functions/prestosql/json/SIMDJsonExtractor.h b/velox/functions/prestosql/json/SIMDJsonExtractor.h index a816ec08874d..98733a9aff89 100644 --- a/velox/functions/prestosql/json/SIMDJsonExtractor.h +++ b/velox/functions/prestosql/json/SIMDJsonExtractor.h @@ -29,10 +29,14 @@ namespace facebook::velox::functions { class SIMDJsonExtractor { public: + /// Executes json extract on 'json' and passes all the matches to 'consumer'. + /// 'isDefinitePath' is an ouput param that will get set to false if a token + /// is evaluated which can return multiple results like '*'. template simdjson::error_code extract( simdjson::ondemand::value& json, TConsumer& consumer, + bool& isDefinitePath, size_t tokenStartIndex = 0); /// Returns true if this extractor was initialized with the trivial path "$". @@ -40,18 +44,6 @@ class SIMDJsonExtractor { return tokens_.empty(); } - /// Returns true if this extractor was initialized with a path that's - /// guaranteed to match at most one entry. - bool isDefinitePath() { - for (const auto& token : tokens_) { - if (token == "*") { - return false; - } - } - - return true; - } - /// Use this method to get an instance of SIMDJsonExtractor given a JSON path. /// Given the nature of the cache, it's important this is only used by /// the callers of simdJsonExtract. @@ -87,6 +79,7 @@ template simdjson::error_code SIMDJsonExtractor::extract( simdjson::ondemand::value& json, TConsumer& consumer, + bool& isDefinitePath, size_t tokenStartIndex) { simdjson::ondemand::value input = json; // Temporary extraction result holder. @@ -95,8 +88,29 @@ simdjson::error_code SIMDJsonExtractor::extract( for (int tokenIndex = tokenStartIndex; tokenIndex < tokens_.size(); tokenIndex++) { auto& token = tokens_[tokenIndex]; + if (token == "*") { + isDefinitePath = false; + } if (input.type() == simdjson::ondemand::json_type::object) { - SIMDJSON_TRY(extractObject(input, token, result)); + if (token == "*") { + SIMDJSON_ASSIGN_OR_RAISE(auto jsonObj, input.get_object()); + for (auto field : jsonObj) { + simdjson::ondemand::value val = field.value(); + if (tokenIndex == tokens_.size() - 1) { + // If this is the last token in the path, consume each element in + // the object. + SIMDJSON_TRY(consumer(val)); + } else { + // If not, then recursively call the extract function on each + // element in the object. + SIMDJSON_TRY( + extract(val, consumer, isDefinitePath, tokenIndex + 1)); + } + } + return simdjson::SUCCESS; + } else { + SIMDJSON_TRY(extractObject(input, token, result)); + } } else if (input.type() == simdjson::ondemand::json_type::array) { if (token == "*") { for (auto child : input.get_array()) { @@ -107,10 +121,10 @@ simdjson::error_code SIMDJsonExtractor::extract( } else { // If not, then recursively call the extract function on each // element in the array. - SIMDJSON_TRY(extract(child.value(), consumer, tokenIndex + 1)); + SIMDJSON_TRY(extract( + child.value(), consumer, isDefinitePath, tokenIndex + 1)); } } - return simdjson::SUCCESS; } else { SIMDJSON_TRY(extractArray(input, token, result)); @@ -135,7 +149,8 @@ simdjson::error_code SIMDJsonExtractor::extract( * object, an array, or a scalar. * "." Child operator to get a child object. * "[]" Subscript operator for array. - * "*" Wildcard for [], get all the elements of an array. + * "*" Wildcard for [], get all the elements of an array or + * all values in an object. * @param consumer: Function to consume the extracted elements. Should be able * to take an argument that can either be a * simdjson::ondemand::document or a simdjson::ondemand::value. @@ -149,7 +164,8 @@ template simdjson::error_code simdJsonExtract( const velox::StringView& json, SIMDJsonExtractor& extractor, - TConsumer&& consumer) { + TConsumer&& consumer, + bool& isDefinitePath) { simdjson::padded_string paddedJson(json.data(), json.size()); SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, simdjsonParse(paddedJson)); @@ -160,7 +176,8 @@ simdjson::error_code simdJsonExtract( return consumer(jsonDoc); } SIMDJSON_ASSIGN_OR_RAISE(auto value, jsonDoc.get_value()); - return extractor.extract(value, std::forward(consumer)); + return extractor.extract( + value, std::forward(consumer), isDefinitePath); } } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp b/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp index 503e5cc7ddc2..490594a63fb8 100644 --- a/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp +++ b/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp @@ -31,8 +31,12 @@ simdjson::error_code simdJsonExtract( const std::string& path, TConsumer&& consumer) { auto& extractor = SIMDJsonExtractor::getInstance(path); + bool isDefinitePath = true; return simdJsonExtract( - velox::StringView(json), extractor, std::forward(consumer)); + velox::StringView(json), + extractor, + std::forward(consumer), + isDefinitePath); } class SIMDJsonExtractorTest : public testing::Test { @@ -177,6 +181,17 @@ TEST_F(SIMDJsonExtractorTest, generalJsonTest) { testExtract(json, "$[\"e mail\"]", "\"amy@only_for_json_udf_test.net\""); testExtract(json, "$.owner", "\"amy\""); + // Wildcard over object's value elements + testExtract( + json, + "$.store.book[0].[*]", + std::vector{ + "\"Nigel Rees\"", + "\"ayings of the Century\"", + "\"reference\"", + "8.95"}); + testExtract(json, "$.store.[*].price", std::vector{"19.95"}); + testExtract("[[1.1,[2.1,2.2]],2,{\"a\":\"b\"}]", "$[0][1][1]", "2.2"); json = "[1,2,{\"a\":\"b\"}]"; @@ -190,6 +205,7 @@ TEST_F(SIMDJsonExtractorTest, generalJsonTest) { testExtract("{\"a\":\"b\"}", " $ ", "{\"a\":\"b\"}"); + // Wildcard over array elements json = "[[{\"key\": 1, \"value\": 2}," "{\"key\": 2, \"value\": 4}]," diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 22e84d68452d..5695acec445e 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -916,6 +916,19 @@ TEST_F(JsonFunctionsTest, jsonExtract) { EXPECT_EQ("[]", jsonExtract(R"({"a": [{"b": 123}]})", "$.a[*].c")); EXPECT_EQ(std::nullopt, jsonExtract(R"({"a": [{"b": 123}]})", "$.a[0].c")); + // Wildcard on empty object and array + EXPECT_EQ("[]", jsonExtract("{\"a\": {}", "$.a.[*]")); + EXPECT_EQ("[]", jsonExtract("{\"a\": []}", "$.a.[*]")); + + // Calling wildcard on a scalar + EXPECT_EQ("[]", jsonExtract(R"({"a": {"b": [123, 456]}})", "$.a.b.[0].[*]")); + + // non-definite paths that end up being evaluated vs. not evaluated + EXPECT_EQ( + "[123,456]", jsonExtract(R"({"a": {"b": [123, 456]}})", "$.a.b[*]")); + EXPECT_EQ( + std::nullopt, jsonExtract(R"({"a": {"b": [123, 456]}})", "$.a.c[*]")); + // TODO The following paths are supported by Presto via Jayway, but do not // work in Velox yet. Figure out how to add support for these. VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path");