Skip to content

Commit

Permalink
fix: wildcard support in json extract using simdjson (#12281)
Browse files Browse the repository at this point in the history
Summary:

This change addresses 2 bugs in the wildcard support of json_extract:

Bug 1: Failure to iterate on values inside an object, resulting in
empty arrays.
For eg: calling JSON_EXTRACT(
   JSON '{"foo": {"a" : {"c": 1 , "d": 3} }, "b": 2}', '$.foo.a.[*]')
will return an empty array whereas we expect [1, 3]

Bug 2: Inconsistent handling of indefinite results, always returning a
JSON array even when the token is not evaluated.
or eg: JSON_EXTRACT('({"a": {"b": [123, 456]}})', '$.a.c[*]') should
return NULL instead of [] as the key 'c' does not exist.

Reviewed By: kevinwilfong

Differential Revision: D69281233
  • Loading branch information
Bikramjeet Vig authored and facebook-github-bot committed Feb 7, 2025
1 parent cf48065 commit 2db2e90
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 24 deletions.
13 changes: 8 additions & 5 deletions velox/functions/prestosql/JsonFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ struct JsonExtractScalarFunction {
};

auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer));
bool isDefinitePath = true;
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer, isDefinitePath));

if (resultStr.has_value()) {
result.copy_from(*resultStr);
Expand Down Expand Up @@ -321,16 +322,17 @@ struct JsonExtractFunction {
};

auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer));
bool isDefinitePath = true;
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer, isDefinitePath));

if (resultSize == 0) {
if (extractor.isDefinitePath()) {
if (isDefinitePath) {
// If the path didn't map to anything in the JSON object, return null.
return simdjson::NO_SUCH_FIELD;
}

result.copy_from("[]");
} else if (resultSize == 1 && extractor.isDefinitePath()) {
} else if (resultSize == 1 && isDefinitePath) {
// If there was only one value mapped to by the path, don't wrap it in an
// array.
result.copy_from(results);
Expand Down Expand Up @@ -392,7 +394,8 @@ struct JsonSizeFunction {
};

auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer));
bool isDefinitePath = true;
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer, isDefinitePath));

if (resultCount == 0) {
// If the path didn't map to anything in the JSON object, return null.
Expand Down
53 changes: 35 additions & 18 deletions velox/functions/prestosql/json/SIMDJsonExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,21 @@ namespace facebook::velox::functions {

class SIMDJsonExtractor {
public:
/// Executes json extract on 'json' and passes all the matches to 'consumer'.
/// 'isDefinitePath' is an ouput param that will get set to false if a token
/// is evaluated which can return multiple results like '*'.
template <typename TConsumer>
simdjson::error_code extract(
simdjson::ondemand::value& json,
TConsumer& consumer,
bool& isDefinitePath,
size_t tokenStartIndex = 0);

/// Returns true if this extractor was initialized with the trivial path "$".
bool isRootOnlyPath() {
return tokens_.empty();
}

/// Returns true if this extractor was initialized with a path that's
/// guaranteed to match at most one entry.
bool isDefinitePath() {
for (const auto& token : tokens_) {
if (token == "*") {
return false;
}
}

return true;
}

/// Use this method to get an instance of SIMDJsonExtractor given a JSON path.
/// Given the nature of the cache, it's important this is only used by
/// the callers of simdJsonExtract.
Expand Down Expand Up @@ -87,6 +79,7 @@ template <typename TConsumer>
simdjson::error_code SIMDJsonExtractor::extract(
simdjson::ondemand::value& json,
TConsumer& consumer,
bool& isDefinitePath,
size_t tokenStartIndex) {
simdjson::ondemand::value input = json;
// Temporary extraction result holder.
Expand All @@ -95,8 +88,29 @@ simdjson::error_code SIMDJsonExtractor::extract(
for (int tokenIndex = tokenStartIndex; tokenIndex < tokens_.size();
tokenIndex++) {
auto& token = tokens_[tokenIndex];
if (token == "*") {
isDefinitePath = false;
}
if (input.type() == simdjson::ondemand::json_type::object) {
SIMDJSON_TRY(extractObject(input, token, result));
if (token == "*") {
SIMDJSON_ASSIGN_OR_RAISE(auto jsonObj, input.get_object());
for (auto field : jsonObj) {
simdjson::ondemand::value val = field.value();
if (tokenIndex == tokens_.size() - 1) {
// If this is the last token in the path, consume each element in
// the object.
SIMDJSON_TRY(consumer(val));
} else {
// If not, then recursively call the extract function on each
// element in the object.
SIMDJSON_TRY(
extract(val, consumer, isDefinitePath, tokenIndex + 1));
}
}
return simdjson::SUCCESS;
} else {
SIMDJSON_TRY(extractObject(input, token, result));
}
} else if (input.type() == simdjson::ondemand::json_type::array) {
if (token == "*") {
for (auto child : input.get_array()) {
Expand All @@ -107,10 +121,10 @@ simdjson::error_code SIMDJsonExtractor::extract(
} else {
// If not, then recursively call the extract function on each
// element in the array.
SIMDJSON_TRY(extract(child.value(), consumer, tokenIndex + 1));
SIMDJSON_TRY(extract(
child.value(), consumer, isDefinitePath, tokenIndex + 1));
}
}

return simdjson::SUCCESS;
} else {
SIMDJSON_TRY(extractArray(input, token, result));
Expand All @@ -135,7 +149,8 @@ simdjson::error_code SIMDJsonExtractor::extract(
* object, an array, or a scalar.
* "." Child operator to get a child object.
* "[]" Subscript operator for array.
* "*" Wildcard for [], get all the elements of an array.
* "*" Wildcard for [], get all the elements of an array or
* all values in an object.
* @param consumer: Function to consume the extracted elements. Should be able
* to take an argument that can either be a
* simdjson::ondemand::document or a simdjson::ondemand::value.
Expand All @@ -149,7 +164,8 @@ template <typename TConsumer>
simdjson::error_code simdJsonExtract(
const velox::StringView& json,
SIMDJsonExtractor& extractor,
TConsumer&& consumer) {
TConsumer&& consumer,
bool& isDefinitePath) {
simdjson::padded_string paddedJson(json.data(), json.size());
SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, simdjsonParse(paddedJson));

Expand All @@ -160,7 +176,8 @@ simdjson::error_code simdJsonExtract(
return consumer(jsonDoc);
}
SIMDJSON_ASSIGN_OR_RAISE(auto value, jsonDoc.get_value());
return extractor.extract(value, std::forward<TConsumer>(consumer));
return extractor.extract(
value, std::forward<TConsumer>(consumer), isDefinitePath);
}

} // namespace facebook::velox::functions
18 changes: 17 additions & 1 deletion velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@ simdjson::error_code simdJsonExtract(
const std::string& path,
TConsumer&& consumer) {
auto& extractor = SIMDJsonExtractor::getInstance(path);
bool isDefinitePath = true;
return simdJsonExtract(
velox::StringView(json), extractor, std::forward<TConsumer>(consumer));
velox::StringView(json),
extractor,
std::forward<TConsumer>(consumer),
isDefinitePath);
}

class SIMDJsonExtractorTest : public testing::Test {
Expand Down Expand Up @@ -177,6 +181,17 @@ TEST_F(SIMDJsonExtractorTest, generalJsonTest) {
testExtract(json, "$[\"e mail\"]", "\"amy@only_for_json_udf_test.net\"");
testExtract(json, "$.owner", "\"amy\"");

// Wildcard over object's value elements
testExtract(
json,
"$.store.book[0].[*]",
std::vector<std::string>{
"\"Nigel Rees\"",
"\"ayings of the Century\"",
"\"reference\"",
"8.95"});
testExtract(json, "$.store.[*].price", std::vector<std::string>{"19.95"});

testExtract("[[1.1,[2.1,2.2]],2,{\"a\":\"b\"}]", "$[0][1][1]", "2.2");

json = "[1,2,{\"a\":\"b\"}]";
Expand All @@ -190,6 +205,7 @@ TEST_F(SIMDJsonExtractorTest, generalJsonTest) {

testExtract("{\"a\":\"b\"}", " $ ", "{\"a\":\"b\"}");

// Wildcard over array elements
json =
"[[{\"key\": 1, \"value\": 2},"
"{\"key\": 2, \"value\": 4}],"
Expand Down
13 changes: 13 additions & 0 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,19 @@ TEST_F(JsonFunctionsTest, jsonExtract) {
EXPECT_EQ("[]", jsonExtract(R"({"a": [{"b": 123}]})", "$.a[*].c"));
EXPECT_EQ(std::nullopt, jsonExtract(R"({"a": [{"b": 123}]})", "$.a[0].c"));

// Wildcard on empty object and array
EXPECT_EQ("[]", jsonExtract("{\"a\": {}", "$.a.[*]"));
EXPECT_EQ("[]", jsonExtract("{\"a\": []}", "$.a.[*]"));

// Calling wildcard on a scalar
EXPECT_EQ("[]", jsonExtract(R"({"a": {"b": [123, 456]}})", "$.a.b.[0].[*]"));

// non-definite paths that end up being evaluated vs. not evaluated
EXPECT_EQ(
"[123,456]", jsonExtract(R"({"a": {"b": [123, 456]}})", "$.a.b[*]"));
EXPECT_EQ(
std::nullopt, jsonExtract(R"({"a": {"b": [123, 456]}})", "$.a.c[*]"));

// TODO The following paths are supported by Presto via Jayway, but do not
// work in Velox yet. Figure out how to add support for these.
VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path");
Expand Down

0 comments on commit 2db2e90

Please sign in to comment.