Skip to content

Commit

Permalink
feat(fuzzer): Add input generator for json_parse in expression fuzzer (
Browse files Browse the repository at this point in the history
…facebookincubator#12019)

Summary:

Make the expression fuzzer generate input vectors of valid JSON strings for
the json_parse function. To test corner cases, the JSON strings may be
randomly truncated or have a space character inserted.

Differential Revision: D67820571
  • Loading branch information
kagamiori authored and facebook-github-bot committed Feb 7, 2025
1 parent 788555c commit 9cf9fb0
Show file tree
Hide file tree
Showing 18 changed files with 396 additions and 204 deletions.
1 change: 0 additions & 1 deletion velox/exec/fuzzer/PrestoQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ bool PrestoQueryRunner::isSupported(const exec::FunctionSignature& signature) {
usesTypeName(signature, "interval year to month") ||
usesTypeName(signature, "hugeint") ||
usesTypeName(signature, "hyperloglog") ||
usesInputTypeName(signature, "json") ||
usesInputTypeName(signature, "ipaddress") ||
usesInputTypeName(signature, "ipprefix") ||
usesInputTypeName(signature, "uuid"));
Expand Down
82 changes: 82 additions & 0 deletions velox/expression/fuzzer/ArgsOverrideFunctions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/expression/fuzzer/ArgsOverrideFunctions.h"

#include "velox/common/fuzzer/ConstrainedGenerators.h"
#include "velox/common/fuzzer/Utils.h"
#include "velox/core/Expressions.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"

namespace facebook::velox::fuzzer {

/// Builds the single argument expression for a one-argument signature by
/// adding a new input column to 'state' and attaching a JSON-specific input
/// generator to that column.
///
/// @param signature Callable signature; must have exactly one argument.
/// @param options Vector fuzzer options; only nullRatio is read here.
/// @param rng Random source used to pick the represented type and the seed.
/// @param state Fuzzer state that receives the new column's type, name and
/// custom input generator.
/// @return A one-element vector holding a field access on the new column.
std::vector<core::TypedExprPtr> generateJsonParseArgs::generate(
    const CallableSignature& signature,
    const VectorFuzzer::Options& options,
    FuzzerGenerator& rng,
    ExpressionFuzzerState& state) {
  VELOX_CHECK_EQ(signature.args.size(), 1);
  const auto& jsonArgType = signature.args[0];

  // Register a fresh input column named after its position ("c<index>").
  state.inputRowTypes_.emplace_back(jsonArgType);
  const auto columnIndex = state.inputRowTypes_.size() - 1;
  state.inputRowNames_.emplace_back(fmt::format("c{}", columnIndex));

  // Draw the represented type first and the seed second so the sequence of
  // values consumed from 'rng' stays the same.
  const auto representedType = facebook::velox::randType(rng, 3);
  const auto seed = rand<uint32_t>(rng);

  // The same seed drives both the JSON generator and the nested generator for
  // the represented type.
  // NOTE(review): the trailing 'true' presumably turns on random variations
  // of the produced strings -- confirm against JsonInputGenerator.
  state.customInputGenerators_.emplace_back(
      std::make_shared<fuzzer::JsonInputGenerator>(
          seed,
          jsonArgType,
          options.nullRatio,
          fuzzer::getRandomInputGenerator(
              seed, representedType, options.nullRatio),
          true));

  return {std::make_shared<core::FieldAccessTypedExpr>(
      jsonArgType, state.inputRowNames_.back())};
}

/*std::vector<core::TypedExprPtr> generateJsonParseArg(
const CallableSignature& signature,
const VectorFuzzer::Options& options,
FuzzerGenerator& rng,
ExpressionFuzzer::State& state) {
VELOX_CHECK_EQ(signature.args.size(), 1);
std::vector<core::TypedExprPtr> inputExpressions;
state.inputRowTypes_.emplace_back(signature.args[0]);
state.inputRowNames_.emplace_back(
fmt::format("c{}", state.inputRowTypes_.size() - 1));
const auto representedType = facebook::velox::randType(rng, 3);
const auto seed = rand<uint32_t>(rng);
const auto nullRatio = options.nullRatio;
state.customInputGenerators_.emplace_back(
std::make_shared<fuzzer::JsonInputGenerator>(
seed,
signature.args[0],
nullRatio,
fuzzer::getRandomInputGenerator(seed, representedType, nullRatio),
true));
inputExpressions.push_back(std::make_shared<core::FieldAccessTypedExpr>(
signature.args[0], state.inputRowNames_.back()));
return inputExpressions;
}*/

} // namespace facebook::velox::fuzzer
33 changes: 33 additions & 0 deletions velox/expression/fuzzer/ArgsOverrideFunctions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "velox/expression/fuzzer/FuzzerToolkit.h"

namespace facebook::velox::fuzzer {

/// ArgValuesGenerator implementation that produces the argument expressions
/// for a call taking a single JSON argument (registered for json_parse per
/// the class name).
class generateJsonParseArgs : public ArgValuesGenerator {
public:
~generateJsonParseArgs() override = default;

/// Generates the argument expressions for 'signature'.
///
/// @param signature Signature of the callable being fuzzed.
/// @param options Options of the vector fuzzer in use.
/// @param rng Random number generator to draw from.
/// @param state Expression fuzzer state that may be updated with new input
/// columns and custom input generators.
/// @return One expression per argument of 'signature'.
std::vector<core::TypedExprPtr> generate(
const CallableSignature& signature,
const VectorFuzzer::Options& options,
FuzzerGenerator& rng,
ExpressionFuzzerState& state) override;
};

} // namespace facebook::velox::fuzzer
5 changes: 4 additions & 1 deletion velox/expression/fuzzer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ target_link_libraries(

add_library(
velox_expression_fuzzer
ArgsOverrideFunctions.cpp
ArgumentTypeFuzzer.cpp
DecimalArgGeneratorBase.cpp
ExpressionFuzzer.cpp
Expand All @@ -38,11 +39,13 @@ target_link_libraries(
velox_type
velox_vector_fuzzer
velox_vector_test_lib
velox_constrained_input_generators
velox_function_registry
velox_expression_test_utility
velox_file
velox_hive_connector
velox_fuzzer_util)
velox_fuzzer_util
velox_common_fuzzer_util)

add_executable(velox_expression_fuzzer_test ExpressionFuzzerTest.cpp)

Expand Down
121 changes: 46 additions & 75 deletions velox/expression/fuzzer/ExpressionFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <unordered_set>

#include "velox/common/base/Exceptions.h"
#include "velox/common/fuzzer/ConstrainedGenerators.h"
#include "velox/exec/fuzzer/FuzzerUtil.h"
#include "velox/expression/Expr.h"
#include "velox/expression/FunctionSignature.h"
Expand Down Expand Up @@ -272,11 +273,14 @@ ExpressionFuzzer::ExpressionFuzzer(
const std::shared_ptr<VectorFuzzer>& vectorFuzzer,
const std::optional<ExpressionFuzzer::Options>& options,
const std::unordered_map<std::string, std::shared_ptr<ArgGenerator>>&
argGenerators)
argGenerators,
const std::unordered_map<std::string, std::shared_ptr<ArgValuesGenerator>>&
argsOverrideFuncs)
: options_(options.value_or(Options())),
vectorFuzzer_(vectorFuzzer),
state{rng_, std::max(1, options_.maxLevelOfNesting)},
argGenerators_(argGenerators) {
state_{rng_, std::max(1, options_.maxLevelOfNesting)},
argGenerators_(argGenerators),
funcArgOverrides_{argsOverrideFuncs} {
VELOX_CHECK(vectorFuzzer, "Vector fuzzer must be provided");
seed(initialSeed);

Expand Down Expand Up @@ -432,10 +436,6 @@ ExpressionFuzzer::ExpressionFuzzer(
addToTypeToExpressionListByTicketTimes("row", "row_constructor");
addToTypeToExpressionListByTicketTimes(kTypeParameterName, "dereference");
}

// Register function override (for cases where we want to restrict the types
// or parameters we pass to functions).
registerFuncOverride(&ExpressionFuzzer::generateSwitchArgs, "switch");
}

bool ExpressionFuzzer::isSupportedSignature(
Expand Down Expand Up @@ -519,13 +519,6 @@ void ExpressionFuzzer::addToTypeToExpressionListByTicketTimes(
}
}

// Registers an argument-generation override for the function called 'name'.
// 'func' is bound to this ExpressionFuzzer instance with one forwarded call
// argument (std::placeholders::_1) and stored in funcArgOverrides_, replacing
// any entry already stored under the same name.
// NOTE(review): other lines in this view initialize funcArgOverrides_ from a
// map of std::shared_ptr<ArgValuesGenerator>; this helper looks like
// pre-change code -- confirm whether it is still needed.
template <typename TFunc>
void ExpressionFuzzer::registerFuncOverride(
TFunc func,
const std::string& name) {
funcArgOverrides_[name] = std::bind(func, this, std::placeholders::_1);
}

void ExpressionFuzzer::seed(size_t seed) {
rng_.seed(seed);
vectorFuzzer_->reSeed(seed);
Expand All @@ -548,22 +541,23 @@ core::TypedExprPtr ExpressionFuzzer::generateArgConstant(const TypePtr& arg) {
// columns of the same type exist then there is a 30% chance that it will
// re-use one of them.
core::TypedExprPtr ExpressionFuzzer::generateArgColumn(const TypePtr& arg) {
auto& listOfCandidateCols = state.typeToColumnNames_[arg->toString()];
auto& listOfCandidateCols = state_.typeToColumnNames_[arg->toString()];
bool reuseColumn = options_.enableColumnReuse &&
!listOfCandidateCols.empty() && vectorFuzzer_->coinToss(0.3);

if (!reuseColumn && options_.maxInputsThreshold.has_value() &&
state.inputRowTypes_.size() >= options_.maxInputsThreshold.value()) {
state_.inputRowTypes_.size() >= options_.maxInputsThreshold.value()) {
reuseColumn = !listOfCandidateCols.empty();
}

if (!reuseColumn) {
state.inputRowTypes_.emplace_back(arg);
state.inputRowNames_.emplace_back(
fmt::format("c{}", state.inputRowTypes_.size() - 1));
listOfCandidateCols.push_back(state.inputRowNames_.back());
state_.inputRowTypes_.emplace_back(arg);
state_.inputRowNames_.emplace_back(
fmt::format("c{}", state_.inputRowTypes_.size() - 1));
state_.customInputGenerators_.emplace_back(nullptr);
listOfCandidateCols.push_back(state_.inputRowNames_.back());
return std::make_shared<core::FieldAccessTypedExpr>(
arg, state.inputRowNames_.back());
arg, state_.inputRowNames_.back());
}
size_t chosenColIndex = rand32(0, listOfCandidateCols.size() - 1);
return std::make_shared<core::FieldAccessTypedExpr>(
Expand All @@ -582,7 +576,7 @@ core::TypedExprPtr ExpressionFuzzer::generateArg(const TypePtr& arg) {
// - Lambdas
// - Try
if (argClass >= kArgExpression) {
if (state.remainingLevelOfNesting_ > 0) {
if (state_.remainingLevelOfNesting_ > 0) {
return generateExpression(arg);
}
argClass = rand32(0, 1);
Expand Down Expand Up @@ -732,18 +726,19 @@ std::vector<core::TypedExprPtr> ExpressionFuzzer::generateSwitchArgs(

/// Generates one random expression per child of 'outType'.
///
/// Resets the fuzzer state first, so input columns, custom input generators
/// and expression statistics collected by an earlier call do not leak into
/// this one.
///
/// @param outType Row type whose children are the requested return types.
/// @return The generated expressions together with the input row type, the
/// per-column custom input generators and the expression statistics, all
/// moved out of the fuzzer state.
ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions(
    const RowTypePtr& outType) {
  state_.reset();
  VELOX_CHECK_EQ(
      state_.remainingLevelOfNesting_, std::max(1, options_.maxLevelOfNesting));

  std::vector<core::TypedExprPtr> expressions;
  expressions.reserve(outType->size());
  for (int i = 0; i < outType->size(); i++) {
    expressions.push_back(generateExpression(outType->childAt(i)));
  }

  // Only the 'state_' member is used: the stale 'state.' lines duplicated
  // reset(), the ROW(...) argument and the statistics field.
  return {
      std::move(expressions),
      ROW(std::move(state_.inputRowNames_), std::move(state_.inputRowTypes_)),
      std::move(state_.customInputGenerators_),
      std::move(state_.expressionStats_)};
}

ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions(
Expand All @@ -760,16 +755,16 @@ ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpression() {
// chance that it will re-use one of them.
core::TypedExprPtr ExpressionFuzzer::generateExpression(
const TypePtr& returnType) {
VELOX_CHECK_GT(state.remainingLevelOfNesting_, 0);
--state.remainingLevelOfNesting_;
auto guard = folly::makeGuard([&] { ++state.remainingLevelOfNesting_; });
VELOX_CHECK_GT(state_.remainingLevelOfNesting_, 0);
--state_.remainingLevelOfNesting_;
auto guard = folly::makeGuard([&] { ++state_.remainingLevelOfNesting_; });

core::TypedExprPtr expression;
bool reuseExpression =
options_.enableExpressionReuse && vectorFuzzer_->coinToss(0.3);
if (reuseExpression) {
expression = state.expressionBank_.getRandomExpression(
returnType, state.remainingLevelOfNesting_ + 1);
expression = state_.expressionBank_.getRandomExpression(
returnType, state_.remainingLevelOfNesting_ + 1);
if (expression) {
return expression;
}
Expand All @@ -796,11 +791,11 @@ core::TypedExprPtr ExpressionFuzzer::generateExpression(

auto exprTransformer = options_.exprTransformers.find(chosenFunctionName);
if (exprTransformer != options_.exprTransformers.end()) {
state.remainingLevelOfNesting_ -=
state_.remainingLevelOfNesting_ -=
exprTransformer->second->extraLevelOfNesting();
}

if (state.remainingLevelOfNesting_ >= 0) {
if (state_.remainingLevelOfNesting_ >= 0) {
if (chosenFunctionName == "cast") {
expression = generateCastExpression(returnType);
} else if (chosenFunctionName == "row_constructor") {
Expand All @@ -825,7 +820,7 @@ core::TypedExprPtr ExpressionFuzzer::generateExpression(
if (expression) {
expression = exprTransformer->second->transform(std::move(expression));
}
state.remainingLevelOfNesting_ +=
state_.remainingLevelOfNesting_ +=
exprTransformer->second->extraLevelOfNesting();
}
}
Expand All @@ -841,17 +836,32 @@ core::TypedExprPtr ExpressionFuzzer::generateExpression(
return generateArgColumn(returnType);
}
}
state.expressionBank_.insert(expression);
state_.expressionBank_.insert(expression);
return expression;
}

/// Produces the argument expressions for 'callable'.
///
/// Resolution order:
///  1. "switch" always goes through generateSwitchArgs().
///  2. A function with an entry in funcArgOverrides_ uses that generator; any
///     argument the generator leaves as nullptr is filled in by generateArg().
///  3. Everything else uses generateArgs().
///
/// @param callable Signature of the function to generate arguments for.
/// @return One expression per argument.
std::vector<core::TypedExprPtr> ExpressionFuzzer::getArgsForCallable(
    const CallableSignature& callable) {
  // Special case for switch because it has a variable number of arguments not
  // specified in the signature. Other functions' argument override should be
  // specified through funcArgOverrides_.
  if (callable.name == "switch") {
    return generateSwitchArgs(callable);
  }

  const auto funcIt = funcArgOverrides_.find(callable.name);
  if (funcIt == funcArgOverrides_.end()) {
    return generateArgs(callable);
  }

  // The mapped value is a generator object, so call generate() on it rather
  // than invoking it like a function and returning early.
  auto args = funcIt->second->generate(
      callable, vectorFuzzer_->getOptions(), rng_, state_);
  for (size_t i = 0; i < args.size(); ++i) {
    // Generate arguments not specified in the override.
    if (args[i] == nullptr) {
      args[i] = generateArg(callable.args.at(i), callable.constantArgs.at(i));
    }
  }
  return args;
}

core::TypedExprPtr ExpressionFuzzer::getCallExprFromCallable(
Expand Down Expand Up @@ -1124,45 +1134,6 @@ core::TypedExprPtr ExpressionFuzzer::generateDereferenceExpression(
inputExpressions[0],
fmt::format("row_field{}", referencedIndex));
}
/// Adds 'expression' to the bank, filed under the string form of its type and
/// under its nesting level.
///
/// A type seen for the first time gets maxLevelOfNesting_ + 1 empty level
/// buckets before the nesting level is validated.
void ExpressionFuzzer::ExprBank::insert(const core::TypedExprPtr& expression) {
  const auto typeKey = expression->type()->toString();

  // Find the per-type buckets, creating them on first use.
  auto entry = typeToExprsByLevel_.find(typeKey);
  if (entry == typeToExprsByLevel_.end()) {
    entry = typeToExprsByLevel_
                .insert({typeKey, ExprsIndexedByLevel(maxLevelOfNesting_ + 1)})
                .first;
  }

  const int level = getNestedLevel(expression);
  VELOX_CHECK_LE(level, maxLevelOfNesting_);
  entry->second[level].push_back(expression);
}

/// Picks a stored expression of type 'returnType' whose nesting level is at
/// most 'uptoLevelOfNesting', choosing uniformly among all candidates.
///
/// @param returnType Type the returned expression must have.
/// @param uptoLevelOfNesting Highest nesting level to consider; must not
/// exceed maxLevelOfNesting_.
/// @return A stored expression, or nullptr when none qualifies. The random
/// generator is consulted only when at least one candidate exists.
core::TypedExprPtr ExpressionFuzzer::ExprBank::getRandomExpression(
    const facebook::velox::TypePtr& returnType,
    int uptoLevelOfNesting) {
  VELOX_CHECK_LE(uptoLevelOfNesting, maxLevelOfNesting_);

  const auto bucket = typeToExprsByLevel_.find(returnType->toString());
  if (bucket == typeToExprsByLevel_.end()) {
    return nullptr;
  }
  auto& byLevel = bucket->second;

  // Count every candidate at the permitted levels.
  int candidateCount = 0;
  for (int level = 0; level <= uptoLevelOfNesting; ++level) {
    candidateCount += byLevel[level].size();
  }
  if (candidateCount <= 0) {
    return nullptr;
  }

  // Draw an index over the concatenation of all levels, then walk the levels
  // to find the bucket that contains it.
  int pick = boost::random::uniform_int_distribution<uint32_t>(
      0, candidateCount - 1)(rng_);
  for (int level = 0; level <= uptoLevelOfNesting; ++level) {
    const auto& exprsAtLevel = byLevel[level];
    if (pick < exprsAtLevel.size()) {
      return exprsAtLevel[pick];
    }
    pick -= exprsAtLevel.size();
  }
  VELOX_CHECK(false, "Should have found an expression.");
  return nullptr;
}

TypePtr ExpressionFuzzer::fuzzReturnType() {
auto chooseFromConcreteSignatures = rand32(0, 1);
Expand Down
Loading

0 comments on commit 9cf9fb0

Please sign in to comment.