From 075922272d9cc3cac5cadf76e350373e47e8ed21 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Mon, 7 Oct 2024 17:32:23 -0700 Subject: [PATCH] Move Aggregation Function Data Specs into the Query Runners (#11192) Summary: Aggregation Function Data Specs declare when we don't want to generate NaN/Infinity for the arguments to a particular aggregate. Today these are hard coded in the AggregationFuzzer itself, but whether or not we support NaN/Infinity in fuzzing an aggregate function is more tightly tied to what we are comparing against (in what cases it's compatible with Velox) which Fuzzer we're running (as evidenced by the fact the Presto and Spark Aggregation Fuzzers are using the same lists currently). We are seeing cases where fuzzing with the Presto Query Runner is succeeding while fuzzing with the DuckDB Query Runner is failing. To fix this I've moved the Aggregation Function Data Specs map into the Query Runner, so that we can specify per Query Runner in what aggregation functions we don't want to produce Nan/Infinity for. Differential Revision: D64011761 --- velox/exec/fuzzer/AggregationFuzzerOptions.h | 2 -- velox/exec/fuzzer/AggregationFuzzerRunner.h | 5 +++- velox/exec/fuzzer/DuckQueryRunner.cpp | 23 +++++++++++++++++++ velox/exec/fuzzer/DuckQueryRunner.h | 3 +++ velox/exec/fuzzer/PrestoQueryRunner.cpp | 23 +++++++++++++++++++ velox/exec/fuzzer/PrestoQueryRunner.h | 3 +++ velox/exec/fuzzer/ReferenceQueryRunner.h | 3 +++ velox/exec/fuzzer/WindowFuzzerRunner.h | 5 +++- .../fuzzer/AggregationFuzzerTest.cpp | 20 +--------------- .../prestosql/fuzzer/WindowFuzzerTest.cpp | 19 --------------- .../fuzzer/SparkAggregationFuzzerTest.cpp | 19 --------------- .../sparksql/fuzzer/SparkQueryRunner.cpp | 8 +++++++ .../sparksql/fuzzer/SparkQueryRunner.h | 3 +++ 13 files changed, 75 insertions(+), 61 deletions(-) diff --git a/velox/exec/fuzzer/AggregationFuzzerOptions.h b/velox/exec/fuzzer/AggregationFuzzerOptions.h index 1d372d5743ab..71807617817f 100644 --- a/velox/exec/fuzzer/AggregationFuzzerOptions.h +++ b/velox/exec/fuzzer/AggregationFuzzerOptions.h @@ -50,8 +50,6 @@ struct AggregationFuzzerOptions { std::unordered_set orderDependentFunctions; - std::unordered_map functionDataSpec; - /// Timestamp precision to use when generating inputs of type TIMESTAMP. VectorFuzzer::Options::TimestampPrecision timestampPrecision{ VectorFuzzer::Options::TimestampPrecision::kMilliSeconds}; diff --git a/velox/exec/fuzzer/AggregationFuzzerRunner.h b/velox/exec/fuzzer/AggregationFuzzerRunner.h index cc62a52911a5..0d299bbae462 100644 --- a/velox/exec/fuzzer/AggregationFuzzerRunner.h +++ b/velox/exec/fuzzer/AggregationFuzzerRunner.h @@ -107,12 +107,15 @@ class AggregationFuzzerRunner { registerVectorSerde(); facebook::velox::filesystems::registerLocalFileSystem(); + auto& aggregationFunctionDataSpecs = + referenceQueryRunner->aggregationFunctionDataSpecs(); + facebook::velox::exec::test::aggregateFuzzer( filteredSignatures, seed, options.customVerificationFunctions, options.customInputGenerators, - options.functionDataSpec, + aggregationFunctionDataSpecs, options.timestampPrecision, options.queryConfigs, options.hiveConfigs, diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp index 1491333c4948..3f4e3e6a6320 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.cpp +++ b/velox/exec/fuzzer/DuckQueryRunner.cpp @@ -79,6 +79,29 @@ const std::vector& DuckQueryRunner::supportedScalarTypes() const { return kScalarTypes; } +const std::unordered_map& +DuckQueryRunner::aggregationFunctionDataSpecs() const { + // There are some functions for which DuckDB and Velox have inconsistent + // behavior with Nan and Infinity, so we exclude those. + static const std::unordered_map + kAggregationFunctionDataSpecs{ + {"covar_pop", DataSpec{true, false}}, + {"covar_samp", DataSpec{true, false}}, + {"histogram", DataSpec{false, false}}, + {"regr_avgx", DataSpec{true, false}}, + {"regr_avgy", DataSpec{true, false}}, + {"regr_intercept", DataSpec{false, false}}, + {"regr_r2", DataSpec{false, false}}, + {"regr_replacement", DataSpec{false, false}}, + {"regr_slope", DataSpec{false, false}}, + {"regr_sxx", DataSpec{false, false}}, + {"regr_sxy", DataSpec{false, false}}, + {"regr_syy", DataSpec{false, false}}, + {"var_pop", DataSpec{false, false}}}; + + return kAggregationFunctionDataSpecs; +} + std::multiset> DuckQueryRunner::execute( const std::string& sql, const std::vector& input, diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h index 491cc14aa368..4fa826af0488 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.h +++ b/velox/exec/fuzzer/DuckQueryRunner.h @@ -34,6 +34,9 @@ class DuckQueryRunner : public ReferenceQueryRunner { /// TODO Investigate mismatches reported when comparing Varbinary. const std::vector& supportedScalarTypes() const override; + const std::unordered_map& + aggregationFunctionDataSpecs() const override; + /// Specify names of aggregate function to exclude from the list of supported /// functions. Used to exclude functions that are non-determonistic, have bugs /// or whose semantics differ from Velox. diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index 956e7e235943..1172992c90a1 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -273,6 +273,29 @@ const std::vector& PrestoQueryRunner::supportedScalarTypes() const { return kScalarTypes; } +const std::unordered_map& +PrestoQueryRunner::aggregationFunctionDataSpecs() const { + // For some functions, velox supports NaN, Infinity better than presto query + // runner, which makes the comparison impossible. + // Add data constraint in vector fuzzer to enforce to not generate such data + // for those functions before they are fixed in presto query runner + static const std::unordered_map + kAggregationFunctionDataSpecs{ + {"regr_avgx", DataSpec{false, false}}, + {"regr_avgy", DataSpec{false, false}}, + {"regr_r2", DataSpec{false, false}}, + {"regr_sxx", DataSpec{false, false}}, + {"regr_syy", DataSpec{false, false}}, + {"regr_sxy", DataSpec{false, false}}, + {"regr_slope", DataSpec{false, false}}, + {"regr_replacement", DataSpec{false, false}}, + {"covar_pop", DataSpec{true, false}}, + {"covar_samp", DataSpec{true, false}}, + }; + + return kAggregationFunctionDataSpecs; +} + std::optional PrestoQueryRunner::toSql( const std::shared_ptr& aggregationNode) { // Assume plan is Aggregation over Values. diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h index cc4ec047eef9..17b289366b2c 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.h +++ b/velox/exec/fuzzer/PrestoQueryRunner.h @@ -53,6 +53,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { const std::vector& supportedScalarTypes() const override; + const std::unordered_map& + aggregationFunctionDataSpecs() const override; + /// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or /// Window with an optional Project on top. /// diff --git a/velox/exec/fuzzer/ReferenceQueryRunner.h b/velox/exec/fuzzer/ReferenceQueryRunner.h index c666eeb6a789..75226a7b9ecc 100644 --- a/velox/exec/fuzzer/ReferenceQueryRunner.h +++ b/velox/exec/fuzzer/ReferenceQueryRunner.h @@ -45,6 +45,9 @@ class ReferenceQueryRunner { return defaultScalarTypes(); } + virtual const std::unordered_map& + aggregationFunctionDataSpecs() const = 0; + /// Converts Velox plan into SQL accepted by the reference database. /// @return std::nullopt if the plan uses features not supported by the /// reference database. diff --git a/velox/exec/fuzzer/WindowFuzzerRunner.h b/velox/exec/fuzzer/WindowFuzzerRunner.h index 4f66d2c4918c..d24536ff8175 100644 --- a/velox/exec/fuzzer/WindowFuzzerRunner.h +++ b/velox/exec/fuzzer/WindowFuzzerRunner.h @@ -79,6 +79,9 @@ class WindowFuzzerRunner { registerVectorSerde(); facebook::velox::filesystems::registerLocalFileSystem(); + auto& aggregationFunctionDataSpecs = + referenceQueryRunner->aggregationFunctionDataSpecs(); + facebook::velox::exec::test::windowFuzzer( filteredAggregationSignatures, filteredWindowSignatures, @@ -86,7 +89,7 @@ class WindowFuzzerRunner { options.customVerificationFunctions, options.customInputGenerators, options.orderDependentFunctions, - options.functionDataSpec, + aggregationFunctionDataSpecs, options.timestampPrecision, options.queryConfigs, options.hiveConfigs, diff --git a/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp index 64391a42f83b..12650de90df4 100644 --- a/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp +++ b/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp @@ -184,24 +184,6 @@ int main(int argc, char** argv) { {"sum_data_size_for_stats", nullptr}, }; - using facebook::velox::DataSpec; - // For some functions, velox supports NaN, Infinity better than presto query - // runner, which makes the comparison impossible. - // Add data constraint in vector fuzzer to enforce to not generate such data - // for those functions before they are fixed in presto query runner - static const std::unordered_map functionDataSpec = { - {"regr_avgx", DataSpec{false, false}}, - {"regr_avgy", DataSpec{false, false}}, - {"regr_r2", DataSpec{false, false}}, - {"regr_sxx", DataSpec{false, false}}, - {"regr_syy", DataSpec{false, false}}, - {"regr_sxy", DataSpec{false, false}}, - {"regr_slope", DataSpec{false, false}}, - {"regr_replacement", DataSpec{false, false}}, - {"covar_pop", DataSpec{true, false}}, - {"covar_samp", DataSpec{true, false}}, - }; - using Runner = facebook::velox::exec::test::AggregationFuzzerRunner; using Options = facebook::velox::exec::test::AggregationFuzzerOptions; @@ -213,9 +195,9 @@ int main(int argc, char** argv) { facebook::velox::exec::test::getCustomInputGenerators(); options.timestampPrecision = facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds; - options.functionDataSpec = functionDataSpec; std::shared_ptr rootPool{ facebook::velox::memory::memoryManager()->addRootPool()}; + return Runner::run( initialSeed, setupReferenceQueryRunner( diff --git a/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp index 2cdd0ebceb6e..f7d863d7b809 100644 --- a/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp +++ b/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp @@ -141,24 +141,6 @@ int main(int argc, char** argv) { {"sum_data_size_for_stats", nullptr}, }; - using facebook::velox::DataSpec; - // For some functions, velox supports NaN, Infinity better than presto query - // runner, which makes the comparison impossible. - // Add data spec in vector fuzzer to enforce to not generate such data - // for those functions before they are fixed in presto query runner - static const std::unordered_map functionDataSpec = { - {"regr_avgx", DataSpec{false, false}}, - {"regr_avgy", DataSpec{false, false}}, - {"regr_r2", DataSpec{false, false}}, - {"regr_sxx", DataSpec{false, false}}, - {"regr_syy", DataSpec{false, false}}, - {"regr_sxy", DataSpec{false, false}}, - {"regr_slope", DataSpec{false, false}}, - {"regr_replacement", DataSpec{false, false}}, - {"covar_pop", DataSpec{true, false}}, - {"covar_samp", DataSpec{true, false}}, - }; - static const std::unordered_set orderDependentFunctions = { // Window functions. "first_value", @@ -199,7 +181,6 @@ int main(int argc, char** argv) { options.orderDependentFunctions = orderDependentFunctions; options.timestampPrecision = facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds; - options.functionDataSpec = functionDataSpec; std::shared_ptr rootPool{ facebook::velox::memory::memoryManager()->addRootPool()}; return Runner::run( diff --git a/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp b/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp index 3ed339e94d23..e5c1c3e17981 100644 --- a/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp +++ b/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp @@ -111,24 +111,6 @@ int main(int argc, char** argv) { // formula. The results from the two methods are completely different. "kurtosis"}); - using facebook::velox::DataSpec; - // For some functions, velox supports NaN, Infinity better than presto query - // runner, which makes the comparison impossible. - // Add data spec in vector fuzzer to enforce to not generate such data - // for those functions before they are fixed in presto query runner - static const std::unordered_map functionDataSpec = { - {"regr_avgx", DataSpec{false, false}}, - {"regr_avgy", DataSpec{false, false}}, - {"regr_r2", DataSpec{false, false}}, - {"regr_sxx", DataSpec{false, false}}, - {"regr_syy", DataSpec{false, false}}, - {"regr_sxy", DataSpec{false, false}}, - {"regr_slope", DataSpec{false, false}}, - {"regr_replacement", DataSpec{false, false}}, - {"covar_pop", DataSpec{true, false}}, - {"covar_samp", DataSpec{true, false}}, - }; - using Runner = facebook::velox::exec::test::AggregationFuzzerRunner; using Options = facebook::velox::exec::test::AggregationFuzzerOptions; @@ -137,6 +119,5 @@ int main(int argc, char** argv) { options.skipFunctions = skipFunctions; options.customVerificationFunctions = customVerificationFunctions; options.orderableGroupKeys = true; - options.functionDataSpec = functionDataSpec; return Runner::run(initialSeed, std::move(duckQueryRunner), options); } diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp index 89d05aeb604d..07cf0dd70392 100644 --- a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp +++ b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp @@ -90,6 +90,14 @@ const std::vector& SparkQueryRunner::supportedScalarTypes() const { return kScalarTypes; } +const std::unordered_map& +SparkQueryRunner::aggregationFunctionDataSpecs() const { + static const std::unordered_map + kAggregationFunctionDataSpecs{}; + + return kAggregationFunctionDataSpecs; +} + std::optional SparkQueryRunner::toSql( const velox::core::PlanNodePtr& plan) { if (const auto aggregationNode = diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.h b/velox/functions/sparksql/fuzzer/SparkQueryRunner.h index 038f3a82b04f..078deed96534 100644 --- a/velox/functions/sparksql/fuzzer/SparkQueryRunner.h +++ b/velox/functions/sparksql/fuzzer/SparkQueryRunner.h @@ -73,6 +73,9 @@ class SparkQueryRunner : public velox::exec::test::ReferenceQueryRunner { const std::vector& supportedScalarTypes() const override; + const std::unordered_map& + aggregationFunctionDataSpecs() const override; + bool supportsVeloxVectorResults() const override { return true; }