From 68e7149986d9b78721311852ab4c62bc57e733bc Mon Sep 17 00:00:00 2001 From: Piotr Grabowski Date: Fri, 3 Jan 2025 09:49:36 +0100 Subject: [PATCH] Use "Array" variants of aggregates in schema_array_transformer schema_array_transformer transforms the SQL query for Array columns. Before this change, if an aggregation was performed on an Array column, e.g. sum(myArrayColumn), the transformer would change it into sum(arrayJoin(myArrayColumn)). However, using the arrayJoin function has problems: arrayJoin modifies the result set of the SQL query, introducing additional rows. If there are many arrayJoins, a Cartesian product of many rows will be introduced: this causes query slowdown and makes the result invalid (we don't actually want to do a Cartesian product!). Solve the problem by using "Array" variants of aggregates (e.g. sumArray instead of sum(arrayJoin())), which do not inflate the number of result rows. Note that this PR does NOT get rid of arrayJoin() fully. There are panels that actually need it, such as "Top products this week" in the eCommerce dashboard. --- quesma/quesma/schema_array_transformer.go | 177 +++++++++++++++++++--- quesma/quesma/schema_transformer_test.go | 2 +- 2 files changed, 155 insertions(+), 24 deletions(-) diff --git a/quesma/quesma/schema_array_transformer.go b/quesma/quesma/schema_array_transformer.go index 0ccf39c92..3adfba283 100644 --- a/quesma/quesma/schema_array_transformer.go +++ b/quesma/quesma/schema_array_transformer.go @@ -16,6 +16,130 @@ import ( // // +// Aggregate function names, generated from ClickHouse documentation: +// git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git +// cd ClickHouse/docs/en/sql-reference/aggregate-functions/reference +// find .
-type f | cut -c3- | rev | cut -c4- | rev | sort + +var aggregateFunctions = map[string]bool{ + "aggthrow": true, + "analysisofvariance": true, + "any": true, + "anyheavy": true, + "anylast": true, + "approxtopk": true, + "approxtopsum": true, + "argmax": true, + "argmin": true, + "arrayconcatagg": true, + "avg": true, + "avgweighted": true, + "boundrat": true, + "categoricalinformationvalue": true, + "contingency": true, + "corr": true, + "corrmatrix": true, + "corrstable": true, + "count": true, + "covarpop": true, + "covarpopmatrix": true, + "covarpopstable": true, + "covarsamp": true, + "covarsampmatrix": true, + "covarsampstable": true, + "cramersv": true, + "cramersvbiascorrected": true, + "deltasum": true, + "deltasumtimestamp": true, + "entropy": true, + "exponentialmovingaverage": true, + "exponentialtimedecayedavg": true, + "exponentialtimedecayedcount": true, + "exponentialtimedecayedmax": true, + "exponentialtimedecayedsum": true, + "first_value": true, + "flamegraph": true, + "grouparray": true, + "grouparrayinsertat": true, + "grouparrayintersect": true, + "grouparraylast": true, + "grouparraymovingavg": true, + "grouparraymovingsum": true, + "grouparraysample": true, + "grouparraysorted": true, + "groupbitand": true, + "groupbitmap": true, + "groupbitmapand": true, + "groupbitmapor": true, + "groupbitmapxor": true, + "groupbitor": true, + "groupbitxor": true, + "groupconcat": true, + "groupuniqarray": true, + "index": true, + "intervallengthsum": true, + "kolmogorovsmirnovtest": true, + "kurtpop": true, + "kurtsamp": true, + "largesttrianglethreebuckets": true, + "last_value": true, + "mannwhitneyutest": true, + "max": true, + "maxintersections": true, + "maxintersectionsposition": true, + "maxmap": true, + "meanztest": true, + "median": true, + "min": true, + "minmap": true, + "quantile": true, + "quantilegk": true, + "quantilebfloat16": true, + "quantileddsketch": true, + "quantiledeterministic": true, + "quantileexact": true, + 
"quantileexactweighted": true, + "quantileinterpolatedweighted": true, + "quantiles": true, + "quantiletdigest": true, + "quantiletdigestweighted": true, + "quantiletiming": true, + "quantiletimingweighted": true, + "rankcorr": true, + "simplelinearregression": true, + "singlevalueornull": true, + "skewpop": true, + "skewsamp": true, + "sparkbar": true, + "stddevpop": true, + "stddevpopstable": true, + "stddevsamp": true, + "stddevsampstable": true, + "stochasticlinearregression": true, + "stochasticlogisticregression": true, + "studentttest": true, + "sum": true, + "sumcount": true, + "sumkahan": true, + "summap": true, + "summapwithoverflow": true, + "sumwithoverflow": true, + "theilsu": true, + "topk": true, + "topkweighted": true, + "uniq": true, + "uniqcombined": true, + "uniqcombined64": true, + "uniqexact": true, + "uniqhll12": true, + "uniqthetasketch": true, + "varpop": true, + "varpopstable": true, + "varsamp": true, + "varsampstable": true, + "welchttest": true, +} + type arrayTypeResolver struct { indexSchema schema.Schema } @@ -81,13 +205,28 @@ func NewArrayTypeVisitor(resolver arrayTypeResolver) model.ExprVisitor { if ok { dbType := resolver.dbColumnType(column.ColumnName) if strings.HasPrefix(dbType, "Array") { - if strings.HasPrefix(e.Name, "sum") { - // here we apply -Array combinator to the sum function + funcName := e.Name + + ifSuffix := strings.HasSuffix(funcName, "If") + if ifSuffix { + funcName = strings.TrimSuffix(funcName, "If") + } + orNullSuffix := strings.HasSuffix(funcName, "OrNull") + if orNullSuffix { + funcName = strings.TrimSuffix(funcName, "OrNull") + } + + if aggregateFunctions[strings.ToLower(funcName)] { + // Use a variant of the function with "Array" suffix (aggregateFunctions keys are all lowercase): // https://clickhouse.com/docs/en/sql-reference/aggregate-functions/combinators#-array - // - // TODO this can be rewritten to transform all aggregate functions as well - // - e.Name = strings.ReplaceAll(e.Name, "sum", "sumArray") + newName := funcName + "Array" + if
orNullSuffix { + newName = newName + "OrNull" + } + if ifSuffix { + newName = newName + "If" + } + e.Name = newName } else { logger.Error().Msgf("Unhandled array function %s, column %v (%v)", e.Name, column.ColumnName, dbType) } @@ -98,6 +237,15 @@ func NewArrayTypeVisitor(resolver arrayTypeResolver) model.ExprVisitor { args := b.VisitChildren(e.Args) return model.NewFunction(e.Name, args...) } + + visitor.OverrideVisitColumnRef = func(b *model.BaseExprVisitor, e model.ColumnRef) interface{} { + dbType := resolver.dbColumnType(e.ColumnName) + if strings.HasPrefix(dbType, "Array") { + logger.Error().Msgf("Unhandled array column ref %v (%v)", e.ColumnName, dbType) + } + return e + } + return visitor } @@ -148,23 +296,6 @@ func checkIfGroupingByArrayColumn(selectCommand model.SelectCommand, resolver ar return &e } - visitor.OverrideVisitFunction = func(b *model.BaseExprVisitor, e model.FunctionExpr) interface{} { - - if strings.HasPrefix(e.Name, "sum") || strings.HasPrefix(e.Name, "count") { - - if len(e.Args) > 0 { - arg := e.Args[0] - - if isArrayColumn(arg) { - found = true - } - - } - - } - return e - } - selectCommand.Accept(visitor) return found diff --git a/quesma/quesma/schema_transformer_test.go b/quesma/quesma/schema_transformer_test.go index d5430fb66..cb9b91399 100644 --- a/quesma/quesma/schema_transformer_test.go +++ b/quesma/quesma/schema_transformer_test.go @@ -487,7 +487,7 @@ func Test_arrayType(t *testing.T) { FromClause: model.NewTableRef("kibana_sample_data_ecommerce"), Columns: []model.Expr{ model.NewColumnRef("order_date"), - model.NewAliasedExpr(model.NewFunction("sumOrNull", model.NewFunction("arrayJoin", model.NewColumnRef("products_quantity"))), "column_1"), + model.NewAliasedExpr(model.NewFunction("sumArrayOrNull", model.NewColumnRef("products_quantity")), "column_1"), }, GroupBy: []model.Expr{model.NewColumnRef("order_date")}, },