From fe6f1258346d2f11d3d87ec3baab8a41bd43522e Mon Sep 17 00:00:00 2001
From: Xinyu Hao <75524174+ishaoxy@users.noreply.github.com>
Date: Tue, 4 Nov 2025 10:53:04 +0800
Subject: [PATCH 1/3] backport 4297

Signed-off-by: Xinyu Hao <75524174+ishaoxy@users.noreply.github.com>
---
 .../org/opensearch/sql/analysis/Analyzer.java |    6 +
 .../sql/ast/AbstractNodeVisitor.java          |    5 +
 .../opensearch/sql/ast/tree/StreamWindow.java |   71 +
 .../sql/calcite/CalciteRelNodeVisitor.java    | 2092 +++++++++++++++--
 .../calcite/plan/PPLAggGroupMergeRule.java    |  156 ++
 .../sql/calcite/utils/PlanUtils.java          |  182 +-
 docs/category.json                            |   67 +-
 docs/user/ppl/cmd/streamstats.rst             |  229 ++
 docs/user/ppl/index.rst                       |   72 +-
 .../sql/calcite/CalciteNoPushdownIT.java      |    1 +
 .../sql/calcite/remote/CalciteExplainIT.java  | 1520 +++++++++++-
 .../remote/CalcitePPLEventstatsIT.java        |  257 +-
 .../remote/CalciteStreamstatsCommandIT.java   | 1095 +++++++++
 .../calcite/explain_streamstats_dc.yaml       |    9 +
 .../explain_streamstats_distinct_count.yaml   |   15 +
 .../explain_streamstats_earliest_latest.yaml  |   15 +
 ...reamstats_earliest_latest_custom_time.yaml |   15 +
 ..._streamstats_earliest_latest_no_group.yaml |    9 +
 .../calcite/explain_streamstats_global.yaml   |   29 +
 .../calcite/explain_streamstats_reset.yaml    |   38 +
 .../explain_streamstats_dc.yaml               |   10 +
 .../explain_streamstats_distinct_count.yaml   |   15 +
 .../explain_streamstats_earliest_latest.yaml  |   15 +
 ...reamstats_earliest_latest_custom_time.yaml |   15 +
 ..._streamstats_earliest_latest_no_group.yaml |   10 +
 .../explain_streamstats_global.yaml           |   30 +
 .../explain_streamstats_reset.yaml            |   38 +
 ppl/src/main/antlr/OpenSearchPPLLexer.g4      |  100 +-
 ppl/src/main/antlr/OpenSearchPPLParser.g4     |  648 +++--
 .../opensearch/sql/ppl/parser/AstBuilder.java |  830 ++++++-
 .../sql/ppl/utils/ArgumentFactory.java        |  280 ++-
 .../sql/ppl/utils/PPLQueryDataAnonymizer.java |  377 ++-
 .../calcite/CalcitePPLStreamstatsTest.java    |  189 ++
 .../ppl/utils/PPLQueryDataAnonymizerTest.java |  573 ++++-
 34 files changed, 8202 insertions(+), 811 deletions(-)
 create mode 100644 core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java
 create mode 100644 core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java
 create mode 100644 docs/user/ppl/cmd/streamstats.rst
 create mode 100644 integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml
 create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml
 create mode 100644 ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java

diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java
index 29b6480f3ce..9f528319742 100644
--- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java
+++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java
@@ -82,6 +82,7 @@ import org.opensearch.sql.ast.tree.Rename;
 import org.opensearch.sql.ast.tree.Sort;
 import org.opensearch.sql.ast.tree.Sort.SortOption;
+import org.opensearch.sql.ast.tree.StreamWindow;
 import org.opensearch.sql.ast.tree.SubqueryAlias;
 import org.opensearch.sql.ast.tree.TableFunction;
 import org.opensearch.sql.ast.tree.Trendline;
@@ -675,6 +676,11 @@ public LogicalPlan visitTrendline(Trendline node, AnalysisContext context) {
         computationsAndTypes.build());
   }
 
+  @Override
+  public LogicalPlan visitStreamWindow(StreamWindow node, AnalysisContext context) {
+    throw getOnlyForCalciteException("Streamstats");
+  }
+
   @Override
   public LogicalPlan visitFlatten(Flatten node, AnalysisContext context) {
     throw new UnsupportedOperationException(
diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
index 5e67f3fba0e..ab1181a9d5f 100644
--- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
+++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
@@ -70,6 +70,7 @@ import org.opensearch.sql.ast.tree.RelationSubquery;
 import org.opensearch.sql.ast.tree.Rename;
 import org.opensearch.sql.ast.tree.Sort;
+import org.opensearch.sql.ast.tree.StreamWindow;
 import org.opensearch.sql.ast.tree.SubqueryAlias;
 import org.opensearch.sql.ast.tree.TableFunction;
 import org.opensearch.sql.ast.tree.Trendline;
@@ -368,6 +369,10 @@ public T visitWindow(Window window, C context) {
     return visitChildren(window, context);
   }
 
+  public T visitStreamWindow(StreamWindow node, C context) {
+    return visitChildren(node, context);
+  }
+
   public T visitJoin(Join node, C context) {
     return visitChildren(node, context);
   }
diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java b/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java
new file mode 100644
index 00000000000..ed7bcf10289
--- /dev/null
+++ b/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java
@@ -0,0 +1,71 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.ast.tree;
+
+import com.google.common.collect.ImmutableList;
+import java.util.List;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.ToString;
+import org.opensearch.sql.ast.AbstractNodeVisitor;
+import org.opensearch.sql.ast.expression.UnresolvedExpression;
+
+@Getter
+@ToString
+@EqualsAndHashCode(callSuper = false)
+public class StreamWindow extends UnresolvedPlan {
+
+  private final List<UnresolvedExpression> windowFunctionList;
+  private final List<UnresolvedExpression> groupList;
+  private final boolean current;
+  private final int window;
+  private final boolean global;
+  private final UnresolvedExpression resetBefore;
+  private final UnresolvedExpression resetAfter;
+  @ToString.Exclude private UnresolvedPlan child;
+
+  /** StreamWindow Constructor. */
+  public StreamWindow(
+      List<UnresolvedExpression> windowFunctionList,
+      List<UnresolvedExpression> groupList,
+      boolean current,
+      int window,
+      boolean global,
+      UnresolvedExpression resetBefore,
+      UnresolvedExpression resetAfter) {
+    this.windowFunctionList = windowFunctionList;
+    this.groupList = groupList;
+    this.current = current;
+    this.window = window;
+    this.global = global;
+    this.resetBefore = resetBefore;
+    this.resetAfter = resetAfter;
+  }
+
+  public boolean isCurrent() {
+    return current;
+  }
+
+  public boolean isGlobal() {
+    return global;
+  }
+
+  @Override
+  public StreamWindow attach(UnresolvedPlan child) {
+    this.child = child;
+    return this;
+  }
+
+  @Override
+  public List<UnresolvedPlan> getChild() {
+    return this.child == null ? ImmutableList.of() : ImmutableList.of(this.child);
+  }
+
+  @Override
+  public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
+    return nodeVisitor.visitStreamWindow(this, context);
+  }
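+
+  /*
+   * Editorial sketch, not part of this patch: a query such as
+   *   source=t | streamstats current=true window=5 avg(a) by b
+   * is expected to be parsed into a node along the lines of
+   *   new StreamWindow(windowFunctions, groupList, true, 5, false, null, null)
+   * where windowFunctions and groupList are hypothetical expression lists built by the
+   * parser; the exact construction lives in AstBuilder and may differ.
+   */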
+}
diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
index 44ee4d90386..573a51de2a7 100644
--- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
+++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
@@ -6,6 +6,7 @@
 package org.opensearch.sql.calcite;
 
 import static org.apache.calcite.sql.SqlKind.AS;
+import static org.opensearch.sql.analysis.DataSourceSchemaIdentifierNameResolver.INFORMATION_SCHEMA_NAME;
 import static org.opensearch.sql.ast.tree.Join.JoinType.ANTI;
 import static org.opensearch.sql.ast.tree.Join.JoinType.SEMI;
 import static org.opensearch.sql.ast.tree.Sort.NullOrder.NULL_FIRST;
@@ -13,17 +14,23 @@ import static org.opensearch.sql.ast.tree.Sort.SortOption.DEFAULT_DESC;
 import static org.opensearch.sql.ast.tree.Sort.SortOrder.ASC;
 import static org.opensearch.sql.ast.tree.Sort.SortOrder.DESC;
-import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME;
-import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME_MAIN;
-import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME_SUBSEARCH;
+import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_DEDUP;
+import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_MAIN;
+import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_RARE_TOP;
+import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_STREAMSTATS;
+import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_SUBSEARCH;
 import static org.opensearch.sql.calcite.utils.PlanUtils.getRelation;
+import static org.opensearch.sql.calcite.utils.PlanUtils.getRexCall;
 import static org.opensearch.sql.calcite.utils.PlanUtils.transformPlanToAttachChild;
+import static org.opensearch.sql.utils.SystemIndexUtils.DATASOURCES_TABLE_NAME;
 
 import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Streams;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.List;
@@ -33,20 +40,31 @@ import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
+import org.apache.calcite.adapter.enumerable.RexToLixTranslator;
 import org.apache.calcite.plan.RelOptTable;
 import org.apache.calcite.plan.ViewExpanders;
 import org.apache.calcite.rel.RelNode;
 import org.apache.calcite.rel.core.Aggregate;
 import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.rel.hint.HintStrategyTable;
+import org.apache.calcite.rel.hint.RelHint;
+import org.apache.calcite.rel.logical.LogicalAggregate;
 import org.apache.calcite.rel.logical.LogicalValues;
+import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rel.type.RelDataTypeFamily;
 import org.apache.calcite.rel.type.RelDataTypeField;
 import org.apache.calcite.rex.RexCall;
 import org.apache.calcite.rex.RexCorrelVariable;
 import org.apache.calcite.rex.RexInputRef;
 import org.apache.calcite.rex.RexLiteral;
 import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexVisitorImpl;
 import org.apache.calcite.rex.RexWindowBounds;
+import org.apache.calcite.sql.SqlKind;
 import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.type.ArraySqlType;
+import org.apache.calcite.sql.type.MapSqlType;
+import org.apache.calcite.sql.type.SqlTypeFamily;
 import org.apache.calcite.sql.type.SqlTypeName;
 import org.apache.calcite.tools.RelBuilder;
 import org.apache.calcite.tools.RelBuilder.AggCall;
@@ -54,7 +72,9 @@ import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.tuple.Pair;
 import org.checkerframework.checker.nullness.qual.Nullable;
+import org.opensearch.sql.analysis.DataSourceSchemaIdentifierNameResolver;
 import org.opensearch.sql.ast.AbstractNodeVisitor;
+import org.opensearch.sql.ast.EmptySourcePropagateVisitor;
 import org.opensearch.sql.ast.Node;
 import org.opensearch.sql.ast.dsl.AstDSL;
 import org.opensearch.sql.ast.expression.AggregateFunction;
@@ -70,6 +90,8 @@ import org.opensearch.sql.ast.expression.ParseMethod;
 import org.opensearch.sql.ast.expression.PatternMethod;
 import org.opensearch.sql.ast.expression.PatternMode;
+import org.opensearch.sql.ast.expression.Span;
+import org.opensearch.sql.ast.expression.SpanUnit;
 import org.opensearch.sql.ast.expression.UnresolvedExpression;
 import org.opensearch.sql.ast.expression.WindowFrame;
 import org.opensearch.sql.ast.expression.WindowFrame.FrameType;
@@ -77,7 +99,9 @@ import org.opensearch.sql.ast.expression.subquery.SubqueryExpression;
 import org.opensearch.sql.ast.tree.AD;
 import org.opensearch.sql.ast.tree.Aggregation;
+import org.opensearch.sql.ast.tree.Append;
 import org.opensearch.sql.ast.tree.AppendCol;
+import org.opensearch.sql.ast.tree.Bin;
 import org.opensearch.sql.ast.tree.CloseCursor;
 import org.opensearch.sql.ast.tree.Dedupe;
 import org.opensearch.sql.ast.tree.Eval;
@@ -92,41 +116,59 @@ import org.opensearch.sql.ast.tree.Lookup;
 import org.opensearch.sql.ast.tree.Lookup.OutputStrategy;
 import org.opensearch.sql.ast.tree.ML;
+import org.opensearch.sql.ast.tree.Multisearch;
 import org.opensearch.sql.ast.tree.Paginate;
 import org.opensearch.sql.ast.tree.Parse;
 import org.opensearch.sql.ast.tree.Patterns;
 import org.opensearch.sql.ast.tree.Project;
 import org.opensearch.sql.ast.tree.RareTopN;
+import org.opensearch.sql.ast.tree.Regex;
 import org.opensearch.sql.ast.tree.Relation;
 import org.opensearch.sql.ast.tree.Rename;
+import org.opensearch.sql.ast.tree.Replace;
+import org.opensearch.sql.ast.tree.ReplacePair;
+import org.opensearch.sql.ast.tree.Rex;
+import org.opensearch.sql.ast.tree.SPath;
+import org.opensearch.sql.ast.tree.Search;
 import org.opensearch.sql.ast.tree.Sort;
 import org.opensearch.sql.ast.tree.Sort.SortOption;
+import org.opensearch.sql.ast.tree.StreamWindow;
 import org.opensearch.sql.ast.tree.SubqueryAlias;
 import org.opensearch.sql.ast.tree.TableFunction;
 import org.opensearch.sql.ast.tree.Trendline;
 import org.opensearch.sql.ast.tree.Trendline.TrendlineType;
 import org.opensearch.sql.ast.tree.UnresolvedPlan;
+import org.opensearch.sql.ast.tree.Values;
 import org.opensearch.sql.ast.tree.Window;
+import org.opensearch.sql.calcite.plan.LogicalSystemLimit;
+import org.opensearch.sql.calcite.plan.LogicalSystemLimit.SystemLimitType;
 import org.opensearch.sql.calcite.plan.OpenSearchConstants;
+import org.opensearch.sql.calcite.utils.BinUtils;
 import org.opensearch.sql.calcite.utils.JoinAndLookupUtils;
 import org.opensearch.sql.calcite.utils.PlanUtils;
 import org.opensearch.sql.calcite.utils.UserDefinedFunctionUtils;
+import org.opensearch.sql.calcite.utils.WildcardUtils;
 import org.opensearch.sql.common.patterns.PatternUtils;
 import org.opensearch.sql.common.utils.StringUtils;
+import org.opensearch.sql.datasource.DataSourceService;
 import org.opensearch.sql.exception.CalciteUnsupportedException;
 import org.opensearch.sql.exception.SemanticCheckException;
 import org.opensearch.sql.expression.function.BuiltinFunctionName;
 import org.opensearch.sql.expression.function.PPLFuncImpTable;
+import org.opensearch.sql.expression.parse.RegexCommonUtils;
 import org.opensearch.sql.utils.ParseUtils;
+import org.opensearch.sql.utils.WildcardRenameUtils;
 
 public class CalciteRelNodeVisitor extends AbstractNodeVisitor<RelNode, CalcitePlanContext> {
 
   private final CalciteRexNodeVisitor rexVisitor;
   private final CalciteAggCallVisitor aggVisitor;
+  private final DataSourceService dataSourceService;
 
-  public CalciteRelNodeVisitor() {
+  public CalciteRelNodeVisitor(DataSourceService dataSourceService) {
     this.rexVisitor = new CalciteRexNodeVisitor(this);
     this.aggVisitor = new CalciteAggCallVisitor(rexVisitor);
+    this.dataSourceService = dataSourceService;
   }
 
   public RelNode analyze(UnresolvedPlan unresolved, CalcitePlanContext context) {
@@ -135,6 +177,21 @@ public RelNode analyze(UnresolvedPlan unresolved, CalcitePlanContext context) {
 
   @Override
   public RelNode visitRelation(Relation node, CalcitePlanContext context) {
+    DataSourceSchemaIdentifierNameResolver nameResolver =
+        new DataSourceSchemaIdentifierNameResolver(
+            dataSourceService, node.getTableQualifiedName().getParts());
+    if (!nameResolver
+        .getDataSourceName()
+        .equals(DataSourceSchemaIdentifierNameResolver.DEFAULT_DATASOURCE_NAME)) {
+      throw new CalciteUnsupportedException(
+          "Datasource " + nameResolver.getDataSourceName() + " is unsupported in Calcite");
+    }
+    if (nameResolver.getIdentifierName().equals(DATASOURCES_TABLE_NAME)) {
+      throw new CalciteUnsupportedException("SHOW DATASOURCES is unsupported in Calcite");
+    }
+    if (nameResolver.getSchemaName().equals(INFORMATION_SCHEMA_NAME)) {
+      throw new CalciteUnsupportedException("information_schema is unsupported in Calcite");
+    }
     context.relBuilder.scan(node.getTableQualifiedName().getParts());
     return context.relBuilder.peek();
   }
@@ -150,6 +207,21 @@ private RelBuilder scan(RelOptTable tableSchema, CalcitePlanContext context) {
     return context.relBuilder;
   }
 
+  @Override
+  public RelNode visitSearch(Search node, CalcitePlanContext context) {
+    // Visit the Relation child to get the scan
+    node.getChild().get(0).accept(this, context);
+    // Create query_string function
+    Function queryStringFunc =
+        AstDSL.function(
+            "query_string",
+            AstDSL.unresolvedArg("query", AstDSL.stringLiteral(node.getQueryString())));
+    RexNode queryStringRex = rexVisitor.analyze(queryStringFunc, context);
+
+    context.relBuilder.filter(queryStringRex);
+    return context.relBuilder.peek();
+  }
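+
+  // Editorial sketch, not part of this patch: a bare search such as
+  //   search source=t error
+  // is expected to reach this visitor with a query string like "error" and produce
+  // Filter(query_string(query='error')) over the scan; the exact query text is
+  // determined by the PPL parser and is an assumption here.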
+
   @Override
   public RelNode visitFilter(Filter node, CalcitePlanContext context) {
     visitChildren(node, context);
@@ -169,6 +241,97 @@ public RelNode visitFilter(Filter node, CalcitePlanContext context) {
     return context.relBuilder.peek();
   }
 
+  @Override
+  public RelNode visitRegex(Regex node, CalcitePlanContext context) {
+    visitChildren(node, context);
+
+    RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
+    RexNode patternRex = rexVisitor.analyze(node.getPattern(), context);
+
+    if (!SqlTypeFamily.CHARACTER.contains(fieldRex.getType())) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Regex command requires field of string type, but got %s for field '%s'",
+              fieldRex.getType().getSqlTypeName(), node.getField().toString()));
+    }
+
+    RexNode regexCondition =
+        context.rexBuilder.makeCall(
+            org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS, fieldRex, patternRex);
+
+    if (node.isNegated()) {
+      regexCondition = context.rexBuilder.makeCall(SqlStdOperatorTable.NOT, regexCondition);
+    }
+
+    context.relBuilder.filter(regexCondition);
+    return context.relBuilder.peek();
+  }
+
+  public RelNode visitRex(Rex node, CalcitePlanContext context) {
+    visitChildren(node, context);
+
+    RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
+    String patternStr = (String) node.getPattern().getValue();
+
+    if (node.getMode() == Rex.RexMode.SED) {
+      RexNode sedCall = createOptimizedSedCall(fieldRex, patternStr, context);
+      String fieldName = node.getField().toString();
+      projectPlusOverriding(List.of(sedCall), List.of(fieldName), context);
+      return context.relBuilder.peek();
+    }
+
+    List<String> namedGroups = RegexCommonUtils.getNamedGroupCandidates(patternStr);
+
+    if (namedGroups.isEmpty()) {
+      throw new IllegalArgumentException(
+          "Rex pattern must contain at least one named capture group");
+    }
+
+    // TODO: Once JDK 20+ is supported, consider using Pattern.namedGroups() API for more efficient
+    // named group handling instead of manual parsing in RegexCommonUtils
+
+    List<RexNode> newFields = new ArrayList<>();
+    List<String> newFieldNames = new ArrayList<>();
+
+    for (String groupName : namedGroups) {
+      RexNode extractCall;
+      if (node.getMaxMatch().isPresent() && node.getMaxMatch().get() > 1) {
+        extractCall =
+            PPLFuncImpTable.INSTANCE.resolve(
+                context.rexBuilder,
+                BuiltinFunctionName.REX_EXTRACT_MULTI,
+                fieldRex,
+                context.rexBuilder.makeLiteral(patternStr),
+                context.rexBuilder.makeLiteral(groupName),
+                context.relBuilder.literal(node.getMaxMatch().get()));
+      } else {
+        extractCall =
+            PPLFuncImpTable.INSTANCE.resolve(
+                context.rexBuilder,
+                BuiltinFunctionName.REX_EXTRACT,
+                fieldRex,
+                context.rexBuilder.makeLiteral(patternStr),
+                context.rexBuilder.makeLiteral(groupName));
+      }
+      newFields.add(extractCall);
+      newFieldNames.add(groupName);
+    }
+
+    if (node.getOffsetField().isPresent()) {
+      RexNode offsetCall =
+          PPLFuncImpTable.INSTANCE.resolve(
+              context.rexBuilder,
+              BuiltinFunctionName.REX_OFFSET,
+              fieldRex,
+              context.rexBuilder.makeLiteral(patternStr));
+      newFields.add(offsetCall);
+      newFieldNames.add(node.getOffsetField().get());
+    }
+
+    projectPlusOverriding(newFields, newFieldNames, context);
+    return context.relBuilder.peek();
+  }
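+
+  // Editorial sketch, not part of this patch: `rex field=msg "(?<user>\\w+)"` is expected
+  // to append a computed column `user` via REX_EXTRACT(msg, pattern, 'user'); when
+  // max_match is greater than 1, REX_EXTRACT_MULTI collects repeated matches instead.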
+
   private boolean containsSubqueryExpression(Node expr) {
     if (expr == null) {
       return false;
     }
@@ -176,8 +339,7 @@ private boolean containsSubqueryExpression(Node expr) {
     if (expr instanceof SubqueryExpression) {
       return true;
     }
-    if (expr instanceof Let) {
-      Let l = (Let) expr;
+    if (expr instanceof Let l) {
       return containsSubqueryExpression(l.getExpression());
     }
     for (Node child : expr.getChild()) {
@@ -191,31 +353,121 @@ private boolean containsSubqueryExpression(Node expr) {
   @Override
   public RelNode visitProject(Project node, CalcitePlanContext context) {
     visitChildren(node, context);
-    List<RexNode> projectList;
-    if (node.getProjectList().size() == 1
-        && node.getProjectList().get(0) instanceof AllFields) {
-      AllFields allFields = (AllFields) node.getProjectList().get(0);
-      tryToRemoveNestedFields(context);
-      tryToRemoveMetaFields(context, allFields instanceof AllFieldsExcludeMeta);
-      return context.relBuilder.peek();
-    } else {
-      projectList =
-          node.getProjectList().stream()
-              .map(expr -> rexVisitor.analyze(expr, context))
-              .collect(Collectors.toList());
+
+    if (isSingleAllFieldsProject(node)) {
+      return handleAllFieldsProject(node, context);
     }
+
+    List<String> currentFields = context.relBuilder.peek().getRowType().getFieldNames();
+    List<RexNode> expandedFields =
+        expandProjectFields(node.getProjectList(), currentFields, context);
+
     if (node.isExcluded()) {
-      context.relBuilder.projectExcept(projectList);
+      validateExclusion(expandedFields, currentFields);
+      context.relBuilder.projectExcept(expandedFields);
     } else {
-      // Only set when not resolving subquery and it's not projectExcept.
       if (!context.isResolvingSubquery()) {
         context.setProjectVisited(true);
       }
-      context.relBuilder.project(projectList);
+      context.relBuilder.project(expandedFields);
+    }
+    return context.relBuilder.peek();
+  }
+
+  private boolean isSingleAllFieldsProject(Project node) {
+    return node.getProjectList().size() == 1
+        && node.getProjectList().getFirst() instanceof AllFields;
+  }
+
+  private RelNode handleAllFieldsProject(Project node, CalcitePlanContext context) {
+    if (node.isExcluded()) {
+      throw new IllegalArgumentException(
+          "Invalid field exclusion: operation would exclude all fields from the result set");
     }
+    AllFields allFields = (AllFields) node.getProjectList().getFirst();
+    if (!(allFields instanceof AllFieldsExcludeMeta)) {
+      // Should not remove nested fields for AllFieldsExcludeMeta.
+      tryToRemoveNestedFields(context);
+    }
+    tryToRemoveMetaFields(context, allFields instanceof AllFieldsExcludeMeta);
     return context.relBuilder.peek();
   }
 
+  private List<RexNode> expandProjectFields(
+      List<UnresolvedExpression> projectList,
+      List<String> currentFields,
+      CalcitePlanContext context) {
+    List<RexNode> expandedFields = new ArrayList<>();
+    Set<String> addedFields = new HashSet<>();
+
+    for (UnresolvedExpression expr : projectList) {
+      switch (expr) {
+        case Field field -> {
+          String fieldName = field.getField().toString();
+          if (WildcardUtils.containsWildcard(fieldName)) {
+            List<String> matchingFields =
+                WildcardUtils.expandWildcardPattern(fieldName, currentFields).stream()
+                    .filter(f -> !isMetadataField(f))
+                    .filter(addedFields::add)
+                    .toList();
+            if (matchingFields.isEmpty()) {
+              continue;
+            }
+            matchingFields.forEach(f -> expandedFields.add(context.relBuilder.field(f)));
+          } else if (addedFields.add(fieldName)) {
+            expandedFields.add(rexVisitor.analyze(field, context));
+          }
+        }
+        case AllFields ignored -> {
+          currentFields.stream()
+              .filter(field -> !isMetadataField(field))
+              .filter(addedFields::add)
+              .forEach(field -> expandedFields.add(context.relBuilder.field(field)));
+        }
+        default -> throw new IllegalStateException(
+            "Unexpected expression type in project list: " + expr.getClass().getSimpleName());
+      }
+    }
+
+    if (expandedFields.isEmpty()) {
+      validateWildcardPatterns(projectList, currentFields);
+    }
+
+    return expandedFields;
+  }
+
+  private void validateExclusion(List<RexNode> fieldsToExclude, List<String> currentFields) {
+    Set<String> nonMetaFields =
+        currentFields.stream().filter(field -> !isMetadataField(field)).collect(Collectors.toSet());
+
+    if (fieldsToExclude.size() >= nonMetaFields.size()) {
+      throw new IllegalArgumentException(
+          "Invalid field exclusion: operation would exclude all fields from the result set");
+    }
+  }
+
+  private void validateWildcardPatterns(
+      List<UnresolvedExpression> projectList, List<String> currentFields) {
+    String firstWildcardPattern =
+        projectList.stream()
+            .filter(
+                expr ->
+                    expr instanceof Field field
+                        && WildcardUtils.containsWildcard(field.getField().toString()))
+            .map(expr -> ((Field) expr).getField().toString())
+            .findFirst()
+            .orElse(null);
+
+    if (firstWildcardPattern != null) {
+      throw new IllegalArgumentException(
+          String.format("wildcard pattern [%s] matches no fields", firstWildcardPattern));
+    }
+  }
+
+  private boolean isMetadataField(String fieldName) {
+    return OpenSearchConstants.METADATAFIELD_TYPE_MAP.containsKey(fieldName);
+  }
+
   /** See logic in {@link org.opensearch.sql.analysis.symbol.SymbolTable#lookupAllFields} */
   private static void tryToRemoveNestedFields(CalcitePlanContext context) {
     Set<String> allFields = new HashSet<>(context.relBuilder.peek().getRowType().getFieldNames());
@@ -227,7 +479,7 @@ private static void tryToRemoveNestedFields(CalcitePlanContext context) {
               return -1 != lastDot && allFields.contains(field.substring(0, lastDot));
             })
             .map(field -> (RexNode) context.relBuilder.field(field))
-            .collect(Collectors.toList());
+            .toList();
     if (!duplicatedNestedFields.isEmpty()) {
       // This is a workaround to avoid the bug in Calcite:
       // In {@link RelBuilder#project_(Iterable, Iterable, Iterable, boolean, Iterable)},
@@ -281,7 +533,7 @@ private static void tryToRemoveMetaFields(CalcitePlanContext context, boolean ex
         originalFields.stream()
             .filter(OpenSearchConstants.METADATAFIELD_TYPE_MAP::containsKey)
            .map(metaField -> (RexNode) context.relBuilder.field(metaField))
-            .collect(Collectors.toList());
+            .toList();
     // Remove metadata fields if there is and ensure there are other fields.
     if (!metaFieldsRef.isEmpty() && metaFieldsRef.size() != originalFields.size()) {
       context.relBuilder.projectExcept(metaFieldsRef);
@@ -294,27 +546,52 @@ public RelNode visitRename(Rename node, CalcitePlanContext context) {
     visitChildren(node, context);
     List<String> originalNames = context.relBuilder.peek().getRowType().getFieldNames();
     List<String> newNames = new ArrayList<>(originalNames);
+
     for (org.opensearch.sql.ast.expression.Map renameMap : node.getRenameList()) {
-      if (renameMap.getTarget() instanceof Field) {
-        Field t = (Field) renameMap.getTarget();
-        String newName = t.getField().toString();
-        RexNode check = rexVisitor.analyze(renameMap.getOrigin(), context);
-        if (check instanceof RexInputRef) {
-          RexInputRef ref = (RexInputRef) check;
-          newNames.set(ref.getIndex(), newName);
-        } else {
-          throw new SemanticCheckException(
-              String.format("the original field %s cannot be resolved", renameMap.getOrigin()));
-        }
-      } else {
+      if (!(renameMap.getTarget() instanceof Field)) {
         throw new SemanticCheckException(
             String.format("the target expected to be field, but is %s", renameMap.getTarget()));
       }
+
+      String sourcePattern = ((Field) renameMap.getOrigin()).getField().toString();
+      String targetPattern = ((Field) renameMap.getTarget()).getField().toString();
+
+      if (WildcardRenameUtils.isWildcardPattern(sourcePattern)
+          && !WildcardRenameUtils.validatePatternCompatibility(sourcePattern, targetPattern)) {
+        throw new SemanticCheckException(
+            "Source and target patterns have different wildcard counts");
+      }
+
+      List<String> matchingFields = WildcardRenameUtils.matchFieldNames(sourcePattern, newNames);
+
+      for (String fieldName : matchingFields) {
+        String newName =
+            WildcardRenameUtils.applyWildcardTransformation(
+                sourcePattern, targetPattern, fieldName);
+        if (newNames.contains(newName) && !newName.equals(fieldName)) {
+          removeFieldIfExists(newName, newNames, context);
+        }
+        int fieldIndex = newNames.indexOf(fieldName);
+        if (fieldIndex != -1) {
+          newNames.set(fieldIndex, newName);
+        }
+      }
+
+      if (matchingFields.isEmpty() && newNames.contains(targetPattern)) {
+        removeFieldIfExists(targetPattern, newNames, context);
+        context.relBuilder.rename(newNames);
+      }
     }
     context.relBuilder.rename(newNames);
     return context.relBuilder.peek();
   }
 
+  private void removeFieldIfExists(
+      String fieldName, List<String> newNames, CalcitePlanContext context) {
+    newNames.remove(fieldName);
+    context.relBuilder.projectExcept(context.relBuilder.field(fieldName));
+  }
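+
+  // Editorial sketch, not part of this patch: `rename foo_* as bar_*` is expected to map
+  // foo_a -> bar_a, foo_b -> bar_b, and so on; an already-existing bar_a is dropped first
+  // so the renamed column can take its place.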
+
   @Override
   public RelNode visitSort(Sort node, CalcitePlanContext context) {
     visitChildren(node, context);
@@ -337,6 +614,11 @@ public RelNode visitSort(Sort node, CalcitePlanContext context) {
             })
         .collect(Collectors.toList());
     context.relBuilder.sort(sortList);
+    // Apply count parameter as limit
+    if (node.getCount() != 0) {
+      context.relBuilder.limit(0, node.getCount());
+    }
+
     return context.relBuilder.peek();
   }
 
@@ -359,6 +641,43 @@ public RelNode visitHead(Head node, CalcitePlanContext context) {
     return context.relBuilder.peek();
   }
 
+  private static final String REVERSE_ROW_NUM = "__reverse_row_num__";
+
+  @Override
+  public RelNode visitReverse(
+      org.opensearch.sql.ast.tree.Reverse node, CalcitePlanContext context) {
+    visitChildren(node, context);
+    // Add ROW_NUMBER() column
+    RexNode rowNumber =
+        context
+            .relBuilder
+            .aggregateCall(SqlStdOperatorTable.ROW_NUMBER)
+            .over()
+            .rowsTo(RexWindowBounds.CURRENT_ROW)
+            .as(REVERSE_ROW_NUM);
+    context.relBuilder.projectPlus(rowNumber);
+    // Sort by row number descending
+    context.relBuilder.sort(context.relBuilder.desc(context.relBuilder.field(REVERSE_ROW_NUM)));
+    // Remove row number column
+    context.relBuilder.projectExcept(context.relBuilder.field(REVERSE_ROW_NUM));
+    return context.relBuilder.peek();
+  }
+
+  @Override
+  public RelNode visitBin(Bin node, CalcitePlanContext context) {
+    visitChildren(node, context);
+
+    RexNode fieldExpr = rexVisitor.analyze(node.getField(), context);
+    String fieldName = BinUtils.extractFieldName(node);
+
+    RexNode binExpression = BinUtils.createBinExpression(node, fieldExpr, context, rexVisitor);
+
+    String alias = node.getAlias() != null ? node.getAlias() : fieldName;
+    projectPlusOverriding(List.of(binExpression), List.of(alias), context);
+
+    return context.relBuilder.peek();
+  }
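+
+  // Editorial sketch, not part of this patch: `bin age span=10` is expected to compute a
+  // bucket expression for `age` via BinUtils and overwrite the column in place, or write
+  // to the alias when one is supplied.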
+
   @Override
   public RelNode visitParse(Parse node, CalcitePlanContext context) {
     visitChildren(node, context);
@@ -366,9 +685,17 @@ public RelNode visitParse(Parse node, CalcitePlanContext context) {
     return context.relBuilder.peek();
   }
 
+  @Override
+  public RelNode visitSpath(SPath node, CalcitePlanContext context) {
+    return visitEval(node.rewriteAsEval(), context);
+  }
+
   @Override
   public RelNode visitPatterns(Patterns node, CalcitePlanContext context) {
     visitChildren(node, context);
+    RexNode showNumberedTokenExpr = rexVisitor.analyze(node.getShowNumberedToken(), context);
+    Boolean showNumberedToken =
+        Boolean.TRUE.equals(((RexLiteral) showNumberedTokenExpr).getValueAs(Boolean.class));
     if (PatternMethod.SIMPLE_PATTERN.equals(node.getPatternMethod())) {
       Parse parseNode =
           new Parse(
@@ -391,42 +718,58 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) {
                         node.getSourceField(), ImmutableList.of(node.getPatternMaxSampleCount()))))
             .map(aggFun -> aggVisitor.analyze(aggFun, context))
-            .collect(Collectors.toList());
+            .toList();
         List<RexNode> groupByList = new ArrayList<>();
         groupByList.add(rexVisitor.analyze(patternField, context));
         groupByList.addAll(
             node.getPartitionByList().stream()
                 .map(expr -> rexVisitor.analyze(expr, context))
-                .collect(Collectors.toList()));
+                .toList());
         context.relBuilder.aggregate(context.relBuilder.groupKey(groupByList), aggCalls);
-        RexNode parsedNode =
-            PPLFuncImpTable.INSTANCE.resolve(
-                context.rexBuilder,
-                BuiltinFunctionName.INTERNAL_PATTERN_PARSER,
-                context.relBuilder.field(node.getAlias()),
-                context.relBuilder.field(PatternUtils.SAMPLE_LOGS));
-        flattenParsedPattern(node.getAlias(), parsedNode, context, false);
-        context.relBuilder.projectExcept(context.relBuilder.field(PatternUtils.SAMPLE_LOGS));
-      } else {
+        if (showNumberedToken) {
+          RexNode parsedNode =
+              PPLFuncImpTable.INSTANCE.resolve(
+                  context.rexBuilder,
+                  BuiltinFunctionName.INTERNAL_PATTERN_PARSER,
+                  context.relBuilder.field(node.getAlias()),
+                  context.relBuilder.field(PatternUtils.SAMPLE_LOGS));
+          flattenParsedPattern(node.getAlias(), parsedNode, context, false, true);
+          // Reorder fields for consistency with Brain's output
+          projectPlusOverriding(
+              List.of(
+                  context.relBuilder.field(node.getAlias()),
+                  context.relBuilder.field(PatternUtils.PATTERN_COUNT),
+                  context.relBuilder.field(PatternUtils.TOKENS),
+                  context.relBuilder.field(PatternUtils.SAMPLE_LOGS)),
+              List.of(
+                  node.getAlias(),
+                  PatternUtils.PATTERN_COUNT,
+                  PatternUtils.TOKENS,
+                  PatternUtils.SAMPLE_LOGS),
+              context);
+        }
+      } else if (showNumberedToken) {
         RexNode parsedNode =
             PPLFuncImpTable.INSTANCE.resolve(
                 context.rexBuilder,
                 BuiltinFunctionName.INTERNAL_PATTERN_PARSER,
                 context.relBuilder.field(node.getAlias()),
                 rexVisitor.analyze(node.getSourceField(), context));
-        flattenParsedPattern(node.getAlias(), parsedNode, context, false);
+        flattenParsedPattern(node.getAlias(), parsedNode, context, false, true);
       }
     } else {
       List<UnresolvedExpression> funcParamList = new ArrayList<>();
       funcParamList.add(node.getSourceField());
       funcParamList.add(node.getPatternMaxSampleCount());
       funcParamList.add(node.getPatternBufferLimit());
+      funcParamList.add(node.getShowNumberedToken());
       funcParamList.addAll(
           node.getArguments().entrySet().stream()
+              .filter(entry -> PatternUtils.VALID_BRAIN_PARAMETERS.contains(entry.getKey()))
              .map(entry -> new Argument(entry.getKey(), entry.getValue()))
              .sorted(Comparator.comparing(Argument::getArgName))
-              .collect(Collectors.toList()));
+              .toList());
       if (PatternMode.LABEL.equals(
           node.getPatternMode())) { // Label mode, resolve the plan as window function
         RexNode windowNode =
@@ -444,11 +787,16 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) {
                     context.rexBuilder,
                     BuiltinFunctionName.INTERNAL_PATTERN_PARSER,
                     rexVisitor.analyze(node.getSourceField(), context),
-                    windowNode),
+                    windowNode,
+                    showNumberedTokenExpr),
                 node.getAlias());
         context.relBuilder.projectPlus(nestedNode);
         flattenParsedPattern(
-            node.getAlias(), context.relBuilder.field(node.getAlias()), context, false);
+            node.getAlias(),
+            context.relBuilder.field(node.getAlias()),
+            context,
+            false,
+            showNumberedToken);
       } else { // Aggregation mode, resolve plan as aggregation
         AggCall aggCall =
             aggVisitor
@@ -461,12 +809,16 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) {
         List<RexNode> groupByList =
             node.getPartitionByList().stream()
                 .map(expr -> rexVisitor.analyze(expr, context))
-                .collect(Collectors.toList());
+                .toList();
         context.relBuilder.aggregate(context.relBuilder.groupKey(groupByList), aggCall);
         buildExpandRelNode(
             context.relBuilder.field(node.getAlias()), node.getAlias(), node.getAlias(), context);
         flattenParsedPattern(
-            node.getAlias(), context.relBuilder.field(node.getAlias()), context, true);
+            node.getAlias(),
+            context.relBuilder.field(node.getAlias()),
+            context,
+            true,
+            showNumberedToken);
       }
     }
     return context.relBuilder.peek();
@@ -475,7 +827,6 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) {
   @Override
   public RelNode visitEval(Eval node, CalcitePlanContext context) {
     visitChildren(node, context);
-    List<String> originalFieldNames = context.relBuilder.peek().getRowType().getFieldNames();
     node.getExpressionList()
         .forEach(
             expr -> {
@@ -510,9 +861,9 @@ private void projectPlusOverriding(
     List<String> originalFieldNames = context.relBuilder.peek().getRowType().getFieldNames();
     List<RexNode> toOverrideList =
         originalFieldNames.stream()
-            .filter(newNames::contains)
+            .filter(originalName -> shouldOverrideField(originalName, newNames))
            .map(a -> (RexNode) context.relBuilder.field(a))
-            .collect(Collectors.toList());
+            .toList();
     // 1. add the new fields, For example "age0, country0"
     context.relBuilder.projectPlus(newFields);
     // 2. drop the overriding field list, it's duplicated now. For example "age, country"
@@ -530,9 +881,37 @@
     context.relBuilder.rename(expectedRenameFields);
   }
 
+  private boolean shouldOverrideField(String originalName, List<String> newNames) {
+    return newNames.stream()
+        .anyMatch(
+            newName ->
+                // Match exact field names (e.g., "age" == "age") for flat fields
+                newName.equals(originalName)
+                    // OR match nested paths (e.g., "resource.attributes..." starts with
+                    // "resource.")
+                    || newName.startsWith(originalName + "."));
+  }
+
+  private List<List<RexInputRef>> extractInputRefList(List<AggCall> aggCalls) {
+    return aggCalls.stream()
+        .map(RelBuilder.AggCall::over)
+        .map(RelBuilder.OverCall::toRex)
+        .map(node -> getRexCall(node, this::isCountField))
+        .map(list -> list.isEmpty() ? null : list.getFirst())
+        .map(PlanUtils::getInputRefs)
+        .toList();
+  }
+
+  /** Is count(FIELD) */
+  private boolean isCountField(RexCall call) {
+    return call.isA(SqlKind.COUNT)
+        && call.getOperands().size() == 1 // count(FIELD)
+        && call.getOperands().get(0) instanceof RexInputRef;
+  }
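+
+  // Editorial note: this predicate matches e.g. count(a), a single column reference;
+  // count() and computed arguments such as count(a + 1) deliberately fall outside the
+  // doc_count-style isNotNull optimization applied in aggregateWithTrimming below.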
+
   /**
    * Resolve the aggregation with trimming unused fields to avoid bugs in {@link
-   * org.apache.calcite.sql2rel.RelDecorrelator#decorrelateRel(Aggregate, boolean)}
+   * org.apache.calcite.sql2rel.RelDecorrelator#decorrelateRel(Aggregate, boolean, boolean)}
    *
    * @param groupExprList group by expression list
    * @param aggExprList aggregate expression list
@@ -543,6 +922,72 @@ private Pair<List<RexNode>, List<AggCall>> aggregateWithTrimming(
       List<UnresolvedExpression> groupExprList,
       List<UnresolvedExpression> aggExprList,
       CalcitePlanContext context) {
+    Pair<List<RexNode>, List<AggCall>> resolved =
+        resolveAttributesForAggregation(groupExprList, aggExprList, context);
+    List<RexNode> resolvedGroupByList = resolved.getLeft();
+    List<AggCall> resolvedAggCallList = resolved.getRight();
+
+    // `doc_count` optimization required a filter `isNotNull(RexInputRef)` for the
+    // `count(FIELD)` aggregation which only can be applied to single FIELD without grouping:
+    //
+    // Example 1: source=t | stats count(a)
+    // Before: Aggregate(count(a))
+    //          \- Scan t
+    // After: Aggregate(count(a))
+    //          \- Filter(isNotNull(a))
+    //            \- Scan t
+    //
+    // Example 2: source=t | stats count(a), count(a)
+    // Before: Aggregate(count(a), count(a))
+    //          \- Scan t
+    // After: Aggregate(count(a), count(a))
+    //          \- Filter(isNotNull(a))
+    //            \- Scan t
+    //
+    // Example 3: source=t | stats count(a) by b
+    // Before & After: Aggregate(count(a) by b)
+    //          \- Scan t
+    //
+    // Example 4: source=t | stats count()
+    // Before & After: Aggregate(count())
+    //          \- Scan t
+    //
+    // Example 5: source=t | stats count(), count(a)
+    // Before & After: Aggregate(count(), count(a))
+    //          \- Scan t
+    //
+    // Example 6: source=t | stats count(a), count(b)
+    // Before & After: Aggregate(count(a), count(b))
+    //          \- Scan t
+    //
+    // Example 7: source=t | stats count(a+1)
+    // Before & After: Aggregate(count(a+1))
+    //          \- Scan t
+    if (resolvedGroupByList.isEmpty()) {
+      List<List<RexInputRef>> refsPerCount = extractInputRefList(resolvedAggCallList);
+      List<RexInputRef> distinctRefsOfCounts;
+      if (context.relBuilder.peek() instanceof org.apache.calcite.rel.core.Project project) {
+        List<RexNode> mappedInProject =
+            refsPerCount.stream()
+                .flatMap(List::stream)
+                .map(ref -> project.getProjects().get(ref.getIndex()))
+                .toList();
+        if (mappedInProject.stream().allMatch(RexInputRef.class::isInstance)) {
+          distinctRefsOfCounts =
+              mappedInProject.stream().map(RexInputRef.class::cast).distinct().toList();
+        } else {
+          distinctRefsOfCounts = List.of();
+        }
+      } else {
+        distinctRefsOfCounts = refsPerCount.stream().flatMap(List::stream).distinct().toList();
+      }
+      if (distinctRefsOfCounts.size() == 1 && refsPerCount.stream().noneMatch(List::isEmpty)) {
+        context.relBuilder.filter(context.relBuilder.isNotNull(distinctRefsOfCounts.getFirst()));
+      }
+    }
+
+    // Add project before aggregate:
+    //
     // Example 1: source=t | where a > 1 | stats avg(b + 1) by c
     // Before: Aggregate(avg(b + 1))
     //          \- Filter(a > 1)
@@ -553,18 +998,22 @@ private Pair<List<RexNode>, List<AggCall>> aggregateWithTrimming(
     //            \- Scan t
     //
     // Example 2: source=t | where a > 1 | top b by c
-    // Before: Aggregate(count)
-    //          \-Filter(a > 1)
+    // Before: Aggregate(count(b) by c)
+    //          \-Filter(a > 1 && isNotNull(b))
     //          \- Scan t
-    // After: Aggregate(count)
+    // After: Aggregate(count(b) by c)
     //          \- Project([c, b])
-    //          \- Filter(a > 1)
+    //          \- Filter(a > 1 && isNotNull(b))
     //            \- Scan t
-    Pair<List<RexNode>, List<AggCall>> resolved =
-        resolveAttributesForAggregation(groupExprList, aggExprList, context);
+    //
+    // Example 3: source=t | stats count(): no change for count()
+    // Before: Aggregate(count())
+    //          \- Scan t
+    // After: Aggregate(count())
+    //          \- Scan t
     List<RexNode> trimmedRefs = new ArrayList<>();
-    trimmedRefs.addAll(PlanUtils.getInputRefs(resolved.getLeft())); // group-by keys first
-    trimmedRefs.addAll(PlanUtils.getInputRefsFromAggCall(resolved.getRight()));
+    trimmedRefs.addAll(PlanUtils.getInputRefs(resolvedGroupByList)); // group-by keys first
+    trimmedRefs.addAll(PlanUtils.getInputRefsFromAggCall(resolvedAggCallList));
     context.relBuilder.project(trimmedRefs);
 
     // Re-resolve all attributes based on adding trimmed Project.
@@ -572,11 +1021,57 @@ private Pair<List<RexNode>, List<AggCall>> aggregateWithTrimming(
     // because that Mapping only works for RexNode, but we need both AggCall and RexNode list.
     Pair<List<RexNode>, List<AggCall>> reResolved =
         resolveAttributesForAggregation(groupExprList, aggExprList, context);
+
+    List<String> intendedGroupKeyAliases = getGroupKeyNamesAfterAggregation(reResolved.getLeft());
     context.relBuilder.aggregate(
         context.relBuilder.groupKey(reResolved.getLeft()), reResolved.getRight());
+    // During aggregation, Calcite projects both input dependencies and output group-by fields.
+    // When names conflict, Calcite adds numeric suffixes (e.g., "value0").
+    // Apply explicit renaming to restore the intended aliases.
+    context.relBuilder.rename(intendedGroupKeyAliases);
+
     return Pair.of(reResolved.getLeft(), reResolved.getRight());
   }
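+
+  // Editorial note: the isNotNull pre-filter above is only added when there is no grouping
+  // and every count references the same single column, e.g. `stats count(a), count(a)`;
+  // mixed cases such as `stats count(a), count(b)` are intentionally left untouched.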
+
+  /**
+   * Imitates {@code Registrar.registerExpression} of {@link RelBuilder} to derive the output order
+   * of group-by keys after aggregation.
+   *
+   * <p>The projected input reference comes first, while any other computed expression follows.
+   */
+  private List<String> getGroupKeyNamesAfterAggregation(List<RexNode> nodes) {
+    List<RexNode> reordered = new ArrayList<>();
+    List<RexNode> left = new ArrayList<>();
+    for (RexNode n : nodes) {
+      // The same group-key won't be added twice
+      if (reordered.contains(n) || left.contains(n)) {
+        continue;
+      }
+      if (isInputRef(n)) {
+        reordered.add(n);
+      } else {
+        left.add(n);
+      }
+    }
+    reordered.addAll(left);
+    return reordered.stream()
+        .map(this::extractAliasLiteral)
+        .flatMap(Optional::stream)
+        .map(RexLiteral::stringValue)
+        .toList();
+  }
+
+  /** Whether a rex node is an aliased input reference */
+  private boolean isInputRef(RexNode node) {
+    return switch (node.getKind()) {
+      case AS, DESCENDING, NULLS_FIRST, NULLS_LAST -> {
+        final List<RexNode> operands = ((RexCall) node).operands;
+        yield isInputRef(operands.getFirst());
+      }
+      default -> node instanceof RexInputRef;
+    };
+  }
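+
+  // Editorial note: for `stats count() by b, span(ts, 1d) as s`, plain input references
+  // such as b are registered before computed expressions such as the span, so the intended
+  // aliases are re-applied after aggregation to undo Calcite's automatic renames.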
+
   /**
    * Resolve attributes for aggregation.
    *
@@ -590,9 +1085,9 @@ private Pair<List<RexNode>, List<AggCall>> resolveAttributesForAggregation(
       List<UnresolvedExpression> aggExprList,
       CalcitePlanContext context) {
     List<AggCall> aggCallList =
-        aggExprList.stream().map(expr -> aggVisitor.analyze(expr, context)).collect(Collectors.toList());
+        aggExprList.stream().map(expr -> aggVisitor.analyze(expr, context)).toList();
     List<RexNode> groupByList =
-        groupExprList.stream().map(expr -> rexVisitor.analyze(expr, context)).collect(Collectors.toList());
+        groupExprList.stream().map(expr -> rexVisitor.analyze(expr, context)).toList();
     return Pair.of(groupByList, aggCallList);
   }
 
@@ -605,12 +1100,44 @@ public RelNode visitAggregation(Aggregation node, CalcitePlanContext context) {
     // The span column is always the first column in result whatever
     // the order of span in query is first or last one
     UnresolvedExpression span = node.getSpan();
-    if (!Objects.isNull(span)) {
+    if (Objects.nonNull(span)) {
       groupExprList.add(span);
+      List<RexNode> timeSpanFilters =
+          getTimeSpanField(span).stream()
+              .map(f -> rexVisitor.analyze(f, context))
+              .map(context.relBuilder::isNotNull)
+              .toList();
+      if (!timeSpanFilters.isEmpty()) {
+        // add isNotNull filter before aggregation for time span
+        context.relBuilder.filter(timeSpanFilters);
+      }
     }
     groupExprList.addAll(node.getGroupExprList());
+
+    // add stats hint to LogicalAggregation
+    Argument.ArgumentMap statsArgs = Argument.ArgumentMap.of(node.getArgExprList());
+    Boolean bucketNullable =
+        (Boolean) statsArgs.getOrDefault(Argument.BUCKET_NULLABLE, Literal.TRUE).getValue();
+    boolean toAddHintsOnAggregate = false;
+    if (!bucketNullable
+        && !groupExprList.isEmpty()
+        && !(groupExprList.size() == 1 && getTimeSpanField(span).isPresent())) {
+      toAddHintsOnAggregate = true;
+      // add isNotNull filter before aggregation for non-nullable buckets
+      List<RexNode> groupByList =
+          groupExprList.stream().map(expr -> rexVisitor.analyze(expr, context)).toList();
+      context.relBuilder.filter(
+          PlanUtils.getSelectColumns(groupByList).stream()
+              .map(context.relBuilder::field)
+              .map(context.relBuilder::isNotNull)
+              .toList());
+    }
+
     Pair<List<RexNode>, List<AggCall>> aggregationAttributes =
         aggregateWithTrimming(groupExprList, aggExprList, context);
+    if (toAddHintsOnAggregate) {
+      addIgnoreNullBucketHintToAggregate(context);
+    }
 
     // schema reordering
     // As an example, in command `stats count() by colA, colB`,
@@ -628,16 +1155,27 @@ public RelNode visitAggregation(Aggregation node, CalcitePlanContext context) {
         aggregationAttributes.getLeft().stream()
             .map(this::extractAliasLiteral)
            .flatMap(Optional::stream)
-            .map(ref -> ((RexLiteral) ref).getValueAs(String.class))
+            .map(ref -> ref.getValueAs(String.class))
            .map(context.relBuilder::field)
            .map(f -> (RexNode) f)
-            .collect(Collectors.toList());
+            .toList();
     reordered.addAll(aliasedGroupByList);
     context.relBuilder.project(reordered);
 
     return context.relBuilder.peek();
   }
 
+  private Optional<UnresolvedExpression> getTimeSpanField(UnresolvedExpression expr) {
+    if (Objects.isNull(expr)) return Optional.empty();
+    if (expr instanceof Span span && SpanUnit.isTimeUnit(span.getUnit())) {
+      return Optional.of(span.getField());
+    }
+    if (expr instanceof Alias alias) {
+      return getTimeSpanField(alias.getDelegated());
+    }
+    return Optional.empty();
+  }
+
   /** extract the RexLiteral of Alias from a node */
   private Optional<RexLiteral> extractAliasLiteral(RexNode node) {
     if (node == null) {
@@ -653,6 +1191,79 @@ private Optional<RexLiteral> extractAliasLiteral(RexNode node) {
   public RelNode visitJoin(Join node, CalcitePlanContext context) {
     List<UnresolvedPlan> children = node.getChildren();
     children.forEach(c -> analyze(c, context));
+    // add join.subsearch_maxout limit to subsearch side, 0 and negative means unlimited.
+    if (context.sysLimit.joinSubsearchLimit() > 0) {
+      PlanUtils.replaceTop(
+          context.relBuilder,
+          LogicalSystemLimit.create(
+              SystemLimitType.JOIN_SUBSEARCH_MAXOUT,
+              context.relBuilder.peek(),
+              context.relBuilder.literal(context.sysLimit.joinSubsearchLimit())));
+    }
+    if (node.getJoinCondition().isEmpty()) {
+      // join-with-field-list grammar
+      List<String> leftColumns = context.relBuilder.peek(1).getRowType().getFieldNames();
+      List<String> rightColumns = context.relBuilder.peek().getRowType().getFieldNames();
+      List<String> duplicatedFieldNames =
+          leftColumns.stream().filter(rightColumns::contains).toList();
+      RexNode joinCondition;
+      if (node.getJoinFields().isPresent()) {
+        joinCondition =
+            node.getJoinFields().get().stream()
+                .map(field -> buildJoinConditionByFieldName(context, field.getField().toString()))
+                .reduce(context.rexBuilder::and)
+                .orElse(context.relBuilder.literal(true));
+      } else {
+        joinCondition =
+            duplicatedFieldNames.stream()
+                .map(fieldName -> buildJoinConditionByFieldName(context, fieldName))
+                .reduce(context.rexBuilder::and)
+                .orElse(context.relBuilder.literal(true));
+      }
+      if (node.getJoinType() == SEMI || node.getJoinType() == ANTI) {
+        // semi and anti join only return left table outputs
+        context.relBuilder.join(
+            JoinAndLookupUtils.translateJoinType(node.getJoinType()), joinCondition);
+        return context.relBuilder.peek();
+      }
+      List<RexNode> toBeRemovedFields;
+      if (node.getArgumentMap().get("overwrite") == null // 'overwrite' default value is true
+          || (node.getArgumentMap().get("overwrite").equals(Literal.TRUE))) {
+        toBeRemovedFields =
+            duplicatedFieldNames.stream()
+                .map(field -> JoinAndLookupUtils.analyzeFieldsForLookUp(field, true, context))
+                .toList();
+      } else {
+        toBeRemovedFields =
+            duplicatedFieldNames.stream()
+                .map(field -> JoinAndLookupUtils.analyzeFieldsForLookUp(field, false, context))
+                .toList();
+      }
+      Literal max = node.getArgumentMap().get("max");
+      if (max != null && !max.equals(Literal.ZERO)) {
+        // max != 0 means the right-side should be dedup
+        Integer allowedDuplication = (Integer) max.getValue();
+        if (allowedDuplication < 0) {
+          throw new SemanticCheckException("max option must be a positive integer");
+        }
+        List<RexNode> dedupeFields =
+            node.getJoinFields().isPresent()
+                ? node.getJoinFields().get().stream()
+                    .map(a -> (RexNode) context.relBuilder.field(a.getField().toString()))
+                    .toList()
+                : duplicatedFieldNames.stream()
+                    .map(a -> (RexNode) context.relBuilder.field(a))
+                    .toList();
+        buildDedupNotNull(context, dedupeFields, allowedDuplication);
+      }
+      context.relBuilder.join(
+          JoinAndLookupUtils.translateJoinType(node.getJoinType()), joinCondition);
+      if (!toBeRemovedFields.isEmpty()) {
+        context.relBuilder.projectExcept(toBeRemovedFields);
+      }
+      return context.relBuilder.peek();
+    }
+
     // The join-with-criteria grammar doesn't allow empty join condition
     RexNode joinCondition =
         node.getJoinCondition()
             .map(c -> rexVisitor.analyzeJoinCondition(c, context))
@@ -676,7 +1287,7 @@ public RelNode visitJoin(Join node, CalcitePlanContext context) {
     // If the plan convert to Spark plan, and there are two table1: database1.table1 and
     // database2.table1. The query with column `table1.id` can only be resolved in the namespace
     // of "database1". User should run `using database1` before the query which access `table1.id`
-    String rightTableQualifiedName = rightTableName.get(rightTableName.size() - 1);
+    String rightTableQualifiedName = rightTableName.getLast();
     // new columns with alias or table;
     List<String> rightColumnsWithAliasIfConflict =
         rightColumns.stream()
@@ -687,7 +1298,20 @@ public RelNode visitJoin(Join node, CalcitePlanContext context) {
                         .map(a -> a + "." + col)
                         .orElse(rightTableQualifiedName + "." + col)
                     : col)
-            .collect(Collectors.toList());
+            .toList();
+
+    Literal max = node.getArgumentMap().get("max");
+    if (max != null && !max.equals(Literal.ZERO)) {
+      // max != 0 means the right-side should be dedup
+      Integer allowedDuplication = (Integer) max.getValue();
+      if (allowedDuplication < 0) {
+        throw new SemanticCheckException("max option must be a positive integer");
+      }
+      List<RexNode> dedupeFields =
+          getRightColumnsInJoinCriteria(context.relBuilder, joinCondition);
+
+      buildDedupNotNull(context, dedupeFields, allowedDuplication);
+    }
     context.relBuilder.join(
         JoinAndLookupUtils.translateJoinType(node.getJoinType()), joinCondition);
     JoinAndLookupUtils.renameToExpectedFields(
@@ -696,6 +1320,37 @@ public RelNode visitJoin(Join node, CalcitePlanContext context) {
     return context.relBuilder.peek();
   }
 
+  private List<RexNode> getRightColumnsInJoinCriteria(
+      RelBuilder relBuilder, RexNode joinCondition) {
+    int stackSize = relBuilder.size();
+    int leftFieldCount = relBuilder.peek(stackSize - 1).getRowType().getFieldCount();
+    RelNode right = relBuilder.peek(stackSize - 2);
+    List<String> allColumnNamesOfRight = right.getRowType().getFieldNames();
+
+    List<Integer> rightColumnIndexes = new ArrayList<>();
+    joinCondition.accept(
+        new RexVisitorImpl<Void>(true) {
+          @Override
+          public Void visitInputRef(RexInputRef inputRef) {
+            if (inputRef.getIndex() >= leftFieldCount) {
+              rightColumnIndexes.add(inputRef.getIndex() - leftFieldCount);
+            }
+            return super.visitInputRef(inputRef);
+          }
+        });
+    return rightColumnIndexes.stream()
+        .map(allColumnNamesOfRight::get)
+        .map(n -> (RexNode) relBuilder.field(n))
+        .toList();
+  }
+
+  private static RexNode buildJoinConditionByFieldName(
+      CalcitePlanContext context, String fieldName) {
+    RexNode lookupKey = JoinAndLookupUtils.analyzeFieldsForLookUp(fieldName, false, context);
+    RexNode sourceKey = JoinAndLookupUtils.analyzeFieldsForLookUp(fieldName, true, context);
+    return context.rexBuilder.equals(sourceKey, lookupKey);
+  }
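+
+  // Editorial note: in the join-with-field-list form, each shared field f yields an
+  // equi-condition left.f = right.f; with no explicit field list every same-named column
+  // is used, and `max=N` first dedups the right side to at most N rows per join key.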
+
   @Override
   public RelNode visitSubqueryAlias(SubqueryAlias node, CalcitePlanContext context) {
     visitChildren(node, context);
@@ -725,15 +1380,15 @@ public RelNode visitLookup(Lookup node, CalcitePlanContext context) {
     List<String> toBeRemovedLookupFieldNames =
         node.getMappingAliasMap().keySet().stream()
             .filter(k -> !node.getOutputAliasMap().containsKey(k))
-            .collect(Collectors.toList());
+            .toList();
     List<String> providedFieldNames =
         lookupTableFieldNames.stream()
             .filter(k -> !toBeRemovedLookupFieldNames.contains(k))
-            .collect(Collectors.toList());
+            .toList();
     List<RexNode> toBeRemovedLookupFields =
         toBeRemovedLookupFieldNames.stream()
            .map(d -> (RexNode) context.relBuilder.field(2, 1, d))
-            .collect(Collectors.toList());
+            .toList();
     List<RexNode> toBeRemovedFields = new ArrayList<>(toBeRemovedLookupFields);
 
     // 4. Find duplicated fields between source table fields and lookup table provided fields.
@@ -744,19 +1399,19 @@ public RelNode visitLookup(Lookup node, CalcitePlanContext context) {
     List<RexNode> duplicatedSourceFields =
         duplicatedFieldNamesMap.keySet().stream()
            .map(field -> JoinAndLookupUtils.analyzeFieldsForLookUp(field, true, context))
-            .collect(Collectors.toList());
+            .toList();
     // Duplicated fields in source-field should always be removed.
     toBeRemovedFields.addAll(duplicatedSourceFields);
     // Construct a new field name for the new provided-fields.
     List<String> expectedProvidedFieldNames =
-        providedFieldNames.stream().map(k -> node.getOutputAliasMap().getOrDefault(k, k)).collect(Collectors.toList());
+        providedFieldNames.stream().map(k -> node.getOutputAliasMap().getOrDefault(k, k)).toList();
 
     List<RexNode> newCoalesceList = new ArrayList<>();
     if (!duplicatedFieldNamesMap.isEmpty() && node.getOutputStrategy() == OutputStrategy.APPEND) {
       List<RexNode> duplicatedProvidedFields =
           duplicatedFieldNamesMap.values().stream()
              .map(field -> JoinAndLookupUtils.analyzeFieldsForLookUp(field, false, context))
-              .collect(Collectors.toList());
+              .toList();
       for (int i = 0; i < duplicatedProvidedFields.size(); ++i) {
         newCoalesceList.add(
             context.rexBuilder.coalesce(
@@ -772,7 +1427,7 @@ public RelNode visitLookup(Lookup node, CalcitePlanContext context) {
         new ArrayList<>(
             expectedProvidedFieldNames.stream()
                 .filter(k -> !duplicatedFieldNamesMap.containsKey(k))
-                .collect(Collectors.toList()));
+                .toList());
       newExpectedFieldNames.addAll(duplicatedFieldNamesMap.keySet());
       expectedProvidedFieldNames = newExpectedFieldNames;
     }
@@ -812,103 +1467,476 @@ public RelNode visitDedupe(Dedupe node, CalcitePlanContext context) {
       throw new IllegalArgumentException("Number of duplicate events must be greater than 0");
     }
     if (consecutive) {
-      throw new UnsupportedOperationException("Consecutive deduplication is not supported");
+      throw new CalciteUnsupportedException("Consecutive deduplication is unsupported in Calcite");
     }
     // Columns to deduplicate
     List<RexNode> dedupeFields =
-        node.getFields().stream().map(f -> rexVisitor.analyze(f, context)).collect(Collectors.toList());
+        node.getFields().stream().map(f -> rexVisitor.analyze(f, context)).toList();
     if (keepEmpty) {
-      /*
-       * | dedup 2 a, b keepempty=false
-       * DropColumns('_row_number_)
-       * +- Filter ('_row_number_ <= n OR isnull('a) OR isnull('b))
-       *    +- Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST, specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_], ['a, 'b], ['a ASC NULLS FIRST, 'b ASC NULLS FIRST]
-       *       +- ...
-       */
-      // Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST,
-      // specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_], ['a, 'b], ['a ASC
-      // NULLS FIRST, 'b ASC NULLS FIRST]
-      RexNode rowNumber =
-          context
-              .relBuilder
-              .aggregateCall(SqlStdOperatorTable.ROW_NUMBER)
-              .over()
-              .partitionBy(dedupeFields)
-              .orderBy(dedupeFields)
-              .rowsTo(RexWindowBounds.CURRENT_ROW)
-              .as("_row_number_");
-      context.relBuilder.projectPlus(rowNumber);
-      RexNode _row_number_ = context.relBuilder.field("_row_number_");
-      // Filter (isnull('a) OR isnull('b) OR '_row_number_ <= n)
-      context.relBuilder.filter(
-          context.relBuilder.or(
-              context.relBuilder.or(dedupeFields.stream().map(context.relBuilder::isNull).collect(Collectors.toList())),
-              context.relBuilder.lessThanOrEqual(
-                  _row_number_, context.relBuilder.literal(allowedDuplication))));
-      // DropColumns('_row_number_)
-      context.relBuilder.projectExcept(_row_number_);
+      buildDedupOrNull(context, dedupeFields, allowedDuplication);
     } else {
-      /*
-       * | dedup 2 a, b keepempty=false
-       * DropColumns('_row_number_)
-       * +- Filter ('_row_number_ <= n)
-       *    +- Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST, specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_], ['a, 'b], ['a ASC NULLS FIRST, 'b ASC NULLS FIRST]
-       *       +- Filter (isnotnull('a) AND isnotnull('b))
-       *          +- ...
-       */
-      // Filter (isnotnull('a) AND isnotnull('b))
-      context.relBuilder.filter(
-          context.relBuilder.and(
-              dedupeFields.stream().map(context.relBuilder::isNotNull).collect(Collectors.toList())));
-      // Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST,
-      // specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_], ['a, 'b], ['a ASC
-      // NULLS FIRST, 'b ASC NULLS FIRST]
-      RexNode rowNumber =
-          context
-              .relBuilder
-              .aggregateCall(SqlStdOperatorTable.ROW_NUMBER)
-              .over()
-              .partitionBy(dedupeFields)
-              .orderBy(dedupeFields)
-              .rowsTo(RexWindowBounds.CURRENT_ROW)
-              .as("_row_number_");
-      context.relBuilder.projectPlus(rowNumber);
-      RexNode _row_number_ = context.relBuilder.field("_row_number_");
-      // Filter ('_row_number_ <= n)
-      context.relBuilder.filter(
-          context.relBuilder.lessThanOrEqual(
-              _row_number_, context.relBuilder.literal(allowedDuplication)));
-      // DropColumns('_row_number_)
-      context.relBuilder.projectExcept(_row_number_);
+      buildDedupNotNull(context, dedupeFields, allowedDuplication);
     }
     return context.relBuilder.peek();
   }
 
+  private static void buildDedupOrNull(
+      CalcitePlanContext context, List<RexNode> dedupeFields, Integer allowedDuplication) {
+    /*
+     * | dedup 2 a, b keepempty=true
+     * DropColumns('_row_number_dedup_)
+     * +- Filter ('_row_number_dedup_ <= n OR isnull('a) OR isnull('b))
+     *    +- Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST, specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_dedup_], ['a, 'b], ['a ASC NULLS FIRST, 'b ASC NULLS FIRST]
+     *       +- ...
+ */ + // Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST, + // specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_dedup_], ['a, 'b], ['a + // ASC + // NULLS FIRST, 'b ASC NULLS FIRST] + RexNode rowNumber = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .partitionBy(dedupeFields) + .orderBy(dedupeFields) + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_DEDUP); + context.relBuilder.projectPlus(rowNumber); + RexNode _row_number_dedup_ = context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_DEDUP); + // Filter (isnull('a) OR isnull('b) OR '_row_number_dedup_ <= n) + context.relBuilder.filter( + context.relBuilder.or( + context.relBuilder.or(dedupeFields.stream().map(context.relBuilder::isNull).toList()), + context.relBuilder.lessThanOrEqual( + _row_number_dedup_, context.relBuilder.literal(allowedDuplication)))); + // DropColumns('_row_number_dedup_) + context.relBuilder.projectExcept(_row_number_dedup_); + } + + private static void buildDedupNotNull( + CalcitePlanContext context, List dedupeFields, Integer allowedDuplication) { + /* + * | dedup 2 a, b keepempty=false + * DropColumns('_row_number_dedup_) + * +- Filter ('_row_number_dedup_ <= n) + * +- Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST, specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_dedup_], ['a, 'b], ['a ASC NULLS FIRST, 'b ASC NULLS FIRST] + * +- Filter (isnotnull('a) AND isnotnull('b)) + * +- ... + */ + // Filter (isnotnull('a) AND isnotnull('b)) + context.relBuilder.filter( + context.relBuilder.and(dedupeFields.stream().map(context.relBuilder::isNotNull).toList())); + // Window [row_number() windowspecdefinition('a, 'b, 'a ASC NULLS FIRST, 'b ASC NULLS FIRST, + // specifiedwindowoundedpreceding$(), currentrow$())) AS _row_number_dedup_], ['a, 'b], ['a ASC + // NULLS FIRST, 'b ASC NULLS FIRST] + RexNode rowNumber = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .partitionBy(dedupeFields) + .orderBy(dedupeFields) + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_DEDUP); + context.relBuilder.projectPlus(rowNumber); + RexNode _row_number_dedup_ = context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_DEDUP); + // Filter ('_row_number_dedup_ <= n) + context.relBuilder.filter( + context.relBuilder.lessThanOrEqual( + _row_number_dedup_, context.relBuilder.literal(allowedDuplication))); + // DropColumns('_row_number_dedup_) + context.relBuilder.projectExcept(_row_number_dedup_); + } + @Override public RelNode visitWindow(Window node, CalcitePlanContext context) { visitChildren(node, context); List overExpressions = - node.getWindowFunctionList().stream().map(w -> rexVisitor.analyze(w, context)).collect(Collectors.toList()); + node.getWindowFunctionList().stream().map(w -> rexVisitor.analyze(w, context)).toList(); context.relBuilder.projectPlus(overExpressions); return context.relBuilder.peek(); } + /** + * Validates type compatibility between replacement value and field for fillnull operation. Throws + * SemanticCheckException if types are incompatible. 
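+   * <p>For example, assuming a hypothetical integer field {@code age}: {@code fillnull with 0 in age}
+   * passes this check, while {@code fillnull with 'N/A' in age} is rejected because VARCHAR and
+   * INTEGER belong to different type families.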
+ */ + private void validateFillNullTypeCompatibility( + RexNode replacement, RexNode fieldRef, String fieldName) { + RelDataTypeFamily replacementFamily = replacement.getType().getFamily(); + RelDataTypeFamily fieldFamily = fieldRef.getType().getFamily(); + + // Check if the replacement type is compatible with the field type + // Allow NULL type family as it's compatible with any type + if (fieldFamily != replacementFamily + && fieldFamily != SqlTypeFamily.NULL + && replacementFamily != SqlTypeFamily.NULL) { + throw new SemanticCheckException( + String.format( + "fillnull failed: replacement value type %s is not compatible with field '%s' " + + "(type: %s). The replacement value type must match the field type.", + replacement.getType().getSqlTypeName(), + fieldName, + fieldRef.getType().getSqlTypeName())); + } + } + @Override - public RelNode visitFillNull(FillNull node, CalcitePlanContext context) { + public RelNode visitStreamWindow(StreamWindow node, CalcitePlanContext context) { visitChildren(node, context); - if (node.getFields().size() - != new HashSet<>(node.getFields().stream().map(f -> f.getField().toString()).collect(Collectors.toList())) - .size()) { - throw new IllegalArgumentException("The field list cannot be duplicated in fillnull"); + + List groupList = node.getGroupList(); + boolean hasGroup = groupList != null && !groupList.isEmpty(); + boolean hasWindow = node.getWindow() > 0; + boolean hasReset = node.getResetBefore() != null || node.getResetAfter() != null; + + // Local helper column names + final String RESET_BEFORE_FLAG_COL = "__reset_before_flag__"; // flag for reset_before + final String RESET_AFTER_FLAG_COL = "__reset_after_flag__"; // flag for reset_after + final String SEGMENT_ID_COL = "__seg_id__"; // segment id + + // CASE: reset + if (hasReset) { + // 1. Build helper columns: seq, before/after flags, segment_id + RelNode leftWithSeg = buildResetHelperColumns(context, node); + + // 2. Run correlate + aggregate with reset-specific filter and cleanup + return buildStreamWindowJoinPlan( + context, + leftWithSeg, + node, + groupList, + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + SEGMENT_ID_COL, + new String[] { + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + RESET_BEFORE_FLAG_COL, + RESET_AFTER_FLAG_COL, + SEGMENT_ID_COL + }); } - List projects = new ArrayList<>(); - List fieldsList = context.relBuilder.peek().getRowType().getFieldList(); - for (RelDataTypeField field : fieldsList) { + + // CASE: global=true + window>0 + has group + if (node.isGlobal() && hasWindow && hasGroup) { + // 1. Add global sequence column for sliding window + RexNode streamSeq = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(streamSeq); + RelNode left = context.relBuilder.build(); + + // 2. 
Run correlate + aggregate + return buildStreamWindowJoinPlan( + context, + left, + node, + groupList, + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + null, + new String[] {ROW_NUMBER_COLUMN_FOR_STREAMSTATS}); + } + + // Default + if (hasGroup) { + // only build sequence when there is by condition + RexNode streamSeq = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(streamSeq); + } + + List overExpressions = + node.getWindowFunctionList().stream().map(w -> rexVisitor.analyze(w, context)).toList(); + context.relBuilder.projectPlus(overExpressions); + + // resort when there is by condition + if (hasGroup) { + context.relBuilder.sort(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_STREAMSTATS)); + context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_STREAMSTATS)); + } + + return context.relBuilder.peek(); + } + + private RelNode buildStreamWindowJoinPlan( + CalcitePlanContext context, + RelNode leftWithHelpers, + StreamWindow node, + List groupList, + String seqCol, + String segmentCol, + String[] helperColsToCleanup) { + + final Holder<@Nullable RexCorrelVariable> v = Holder.empty(); + context.relBuilder.push(leftWithHelpers); + context.relBuilder.variable(v::set); + + context.relBuilder.push(leftWithHelpers); + RexNode rightSeq = context.relBuilder.field(seqCol); + RexNode outerSeq = context.relBuilder.field(v.get(), seqCol); + + RexNode filter; + if (segmentCol != null) { // reset condition + RexNode segRight = context.relBuilder.field(segmentCol); + RexNode segOuter = context.relBuilder.field(v.get(), segmentCol); + RexNode frame = buildResetFrameFilter(context, node, outerSeq, rightSeq, segOuter, segRight); + RexNode group = buildGroupFilter(context, groupList, v.get()); + filter = (group == null) ? frame : context.relBuilder.and(frame, group); + } else { // global + window + by condition + RexNode frame = buildFrameFilter(context, node, outerSeq, rightSeq); + RexNode group = buildGroupFilter(context, groupList, v.get()); + filter = context.relBuilder.and(frame, group); + } + context.relBuilder.filter(filter); + + // aggregate all window functions on right side + List aggCalls = buildAggCallsForWindowFunctions(node.getWindowFunctionList(), context); + context.relBuilder.aggregate(context.relBuilder.groupKey(), aggCalls); + RelNode rightAgg = context.relBuilder.build(); + + // correlate LEFT with RIGHT using seq + group fields + context.relBuilder.push(leftWithHelpers); + context.relBuilder.push(rightAgg); + List requiredLeft = buildRequiredLeft(context, seqCol, groupList); + if (segmentCol != null) { // also require seg_id for reset segmentation equality + requiredLeft = new ArrayList<>(requiredLeft); + requiredLeft.add(context.relBuilder.field(2, 0, segmentCol)); + } + context.relBuilder.correlate(JoinRelType.LEFT, v.get().id, requiredLeft); + + // resort to original order + boolean hasGroup = !groupList.isEmpty(); + // resort when 1. global + window + by condition 2.reset + by condition + if (hasGroup) { + context.relBuilder.sort(context.relBuilder.field(seqCol)); + } + + // cleanup helper columns + List cleanup = new ArrayList<>(); + for (String c : helperColsToCleanup) { + cleanup.add(context.relBuilder.field(c)); + } + context.relBuilder.projectExcept(cleanup); + return context.relBuilder.peek(); + } + + private RelNode buildResetHelperColumns(CalcitePlanContext context, StreamWindow node) { + // 1. 
global sequence to define order + RexNode rowNum = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(rowNum); + + // 2. before/after flags + RexNode beforePred = + (node.getResetBefore() == null) + ? context.relBuilder.literal(false) + : rexVisitor.analyze(node.getResetBefore(), context); + RexNode afterPred = + (node.getResetAfter() == null) + ? context.relBuilder.literal(false) + : rexVisitor.analyze(node.getResetAfter(), context); + RexNode beforeFlag = + context.relBuilder.call( + SqlStdOperatorTable.CASE, + beforePred, + context.relBuilder.literal(1), + context.relBuilder.literal(0)); + RexNode afterFlag = + context.relBuilder.call( + SqlStdOperatorTable.CASE, + afterPred, + context.relBuilder.literal(1), + context.relBuilder.literal(0)); + context.relBuilder.projectPlus(context.relBuilder.alias(beforeFlag, "__reset_before_flag__")); + context.relBuilder.projectPlus(context.relBuilder.alias(afterFlag, "__reset_after_flag__")); + + // 3. session id = SUM(beforeFlag) over (to current) + SUM(afterFlag) over (to 1 preceding) + RexNode sumBefore = + context + .relBuilder + .aggregateCall( + SqlStdOperatorTable.SUM, context.relBuilder.field("__reset_before_flag__")) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .toRex(); + RexNode sumAfterPrev = + context + .relBuilder + .aggregateCall( + SqlStdOperatorTable.SUM, context.relBuilder.field("__reset_after_flag__")) + .over() + .rowsBetween( + RexWindowBounds.UNBOUNDED_PRECEDING, + RexWindowBounds.preceding(context.relBuilder.literal(1))) + .toRex(); + sumBefore = + context.relBuilder.call( + SqlStdOperatorTable.COALESCE, sumBefore, context.relBuilder.literal(0)); + sumAfterPrev = + context.relBuilder.call( + SqlStdOperatorTable.COALESCE, sumAfterPrev, context.relBuilder.literal(0)); + + RexNode segId = context.relBuilder.call(SqlStdOperatorTable.PLUS, sumBefore, sumAfterPrev); + context.relBuilder.projectPlus(context.relBuilder.alias(segId, "__seg_id__")); + return context.relBuilder.build(); + } + + private RexNode buildFrameFilter( + CalcitePlanContext context, StreamWindow node, RexNode outerSeq, RexNode rightSeq) { + // window always >0 + // frame: either [outer-(w-1), outer] or [outer-w, outer-1] + if (node.isCurrent()) { + RexNode lower = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, + outerSeq, + context.relBuilder.literal(node.getWindow() - 1)); + return context.relBuilder.between(rightSeq, lower, outerSeq); + } else { + RexNode lower = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, outerSeq, context.relBuilder.literal(node.getWindow())); + RexNode upper = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, outerSeq, context.relBuilder.literal(1)); + return context.relBuilder.between(rightSeq, lower, upper); + } + } + + private RexNode buildResetFrameFilter( + CalcitePlanContext context, + StreamWindow node, + RexNode outerSeq, + RexNode rightSeq, + RexNode segIdOuter, + RexNode segIdRight) { + // 1. Compute sequence range (handle running window semantics when window == 0) + RexNode seqFilter; + if (node.getWindow() == 0) { + // running: current => rightSeq <= outerSeq; excluding current => rightSeq < outerSeq + seqFilter = + node.isCurrent() + ? 
context.relBuilder.lessThanOrEqual(rightSeq, outerSeq) + : context.relBuilder.lessThan(rightSeq, outerSeq); + } else { + // Reuse normal frame filter logic when window > 0 + seqFilter = buildFrameFilter(context, node, outerSeq, rightSeq); + } + // 2. Ensure same segment (seg_id) for reset partitioning + RexNode segFilter = context.relBuilder.equals(segIdRight, segIdOuter); + // 3. Combine filters + return context.relBuilder.and(seqFilter, segFilter); + } + + private RexNode buildGroupFilter( + CalcitePlanContext context, List groupList, RexCorrelVariable correl) { + // build conjunctive equality filters: right.g_i = outer.g_i + if (groupList.isEmpty()) { + return null; + } + List equalsList = + groupList.stream() + .map( + expr -> { + String groupName = extractGroupFieldName(expr); + RexNode rightGroup = context.relBuilder.field(groupName); + RexNode outerGroup = context.relBuilder.field(correl, groupName); + return context.relBuilder.equals(rightGroup, outerGroup); + }) + .toList(); + return context.relBuilder.and(equalsList); + } + + private String extractGroupFieldName(UnresolvedExpression groupExpr) { + if (groupExpr instanceof Alias groupAlias + && groupAlias.getDelegated() instanceof Field groupField) { + return groupField.getField().toString(); + } else if (groupExpr instanceof Field groupField) { + return groupField.getField().toString(); + } else { + throw new IllegalArgumentException( + "Unsupported group expression: only field or alias(field) is supported"); + } + } + + private List buildAggCallsForWindowFunctions( + List windowExprs, CalcitePlanContext context) { + List aggCalls = new ArrayList<>(); + for (UnresolvedExpression expr : windowExprs) { + if (expr instanceof Alias a && a.getDelegated() instanceof WindowFunction wf) { + Function func = (Function) wf.getFunction(); + List args = func.getFuncArgs(); + // first argument is the input field, others are function params + UnresolvedExpression field = args.isEmpty() ? null : args.get(0); + List rest = + args.size() <= 1 ? 
List.of() : args.subList(1, args.size()); + AggregateFunction aggFunc = new AggregateFunction(func.getFuncName(), field, rest); + AggCall call = aggVisitor.analyze(new Alias(a.getName(), aggFunc), context); + aggCalls.add(call); + } else { + throw new IllegalArgumentException("Unsupported window function in streamstats"); + } + } + return aggCalls; + } + + private List buildRequiredLeft( + CalcitePlanContext context, String seqCol, List groupList) { + List requiredLeft = new ArrayList<>(); + // reference to left seq column + requiredLeft.add(context.relBuilder.field(2, 0, seqCol)); + for (UnresolvedExpression groupExpr : groupList) { + String groupName = extractGroupFieldName(groupExpr); + requiredLeft.add(context.relBuilder.field(2, 0, groupName)); + } + return requiredLeft; + } + + @Override + public RelNode visitFillNull(FillNull node, CalcitePlanContext context) { + visitChildren(node, context); + if (node.getFields().size() + != new HashSet<>(node.getFields().stream().map(f -> f.getField().toString()).toList()) + .size()) { + throw new IllegalArgumentException("The field list cannot be duplicated in fillnull"); + } + + // Validate type compatibility when replacementForAll is present + if (node.getReplacementForAll().isPresent()) { + List fieldsList = context.relBuilder.peek().getRowType().getFieldList(); + RexNode replacement = rexVisitor.analyze(node.getReplacementForAll().get(), context); + + // Validate all fields are compatible with the replacement value + for (RelDataTypeField field : fieldsList) { + RexNode fieldRef = context.rexBuilder.makeInputRef(field.getType(), field.getIndex()); + validateFillNullTypeCompatibility(replacement, fieldRef, field.getName()); + } + } + + List projects = new ArrayList<>(); + List fieldsList = context.relBuilder.peek().getRowType().getFieldList(); + for (RelDataTypeField field : fieldsList) { RexNode fieldRef = context.rexBuilder.makeInputRef(field.getType(), field.getIndex()); boolean toReplace = false; for (Pair pair : node.getReplacementPairs()) { if (field.getName().equalsIgnoreCase(pair.getLeft().getField().toString())) { RexNode replacement = rexVisitor.analyze(pair.getRight(), context); + // Validate type compatibility before COALESCE + validateFillNullTypeCompatibility(replacement, fieldRef, field.getName()); RexNode coalesce = context.rexBuilder.coalesce(fieldRef, replacement); RexNode coalesceWithAlias = context.relBuilder.alias(coalesce, field.getName()); projects.add(coalesceWithAlias); @@ -944,7 +1972,7 @@ public RelNode visitAppendCol(AppendCol node, CalcitePlanContext context) { List.of(), WindowFrame.toCurrentRow()); context.relBuilder.projectPlus( - context.relBuilder.alias(mainRowNumber, ROW_NUMBER_COLUMN_NAME_MAIN)); + context.relBuilder.alias(mainRowNumber, ROW_NUMBER_COLUMN_FOR_MAIN)); // 3. build subsearch tree (attach relation to subsearch) UnresolvedPlan relation = getRelation(node); @@ -962,22 +1990,22 @@ public RelNode visitAppendCol(AppendCol node, CalcitePlanContext context) { List.of(), WindowFrame.toCurrentRow()); context.relBuilder.projectPlus( - context.relBuilder.alias(subsearchRowNumber, ROW_NUMBER_COLUMN_NAME_SUBSEARCH)); + context.relBuilder.alias(subsearchRowNumber, ROW_NUMBER_COLUMN_FOR_SUBSEARCH)); List subsearchFields = context.relBuilder.peek().getRowType().getFieldNames(); List mainFields = context.relBuilder.peek(1).getRowType().getFieldNames(); if (!node.isOverride()) { // 6. 
if override = false, drop all the duplicated columns in subsearch before join List subsearchProjectList = - subsearchFields.stream().filter(r -> !mainFields.contains(r)).collect(Collectors.toList()); + subsearchFields.stream().filter(r -> !mainFields.contains(r)).toList(); context.relBuilder.project(context.relBuilder.fields(subsearchProjectList)); } // 7. join with condition `_row_number_main_ = _row_number_subsearch_` RexNode joinCondition = context.relBuilder.equals( - context.relBuilder.field(2, 0, ROW_NUMBER_COLUMN_NAME_MAIN), - context.relBuilder.field(2, 1, ROW_NUMBER_COLUMN_NAME_SUBSEARCH)); + context.relBuilder.field(2, 0, ROW_NUMBER_COLUMN_FOR_MAIN), + context.relBuilder.field(2, 1, ROW_NUMBER_COLUMN_FOR_SUBSEARCH)); context.relBuilder.join( JoinAndLookupUtils.translateJoinType(Join.JoinType.FULL), joinCondition); @@ -985,8 +2013,8 @@ public RelNode visitAppendCol(AppendCol node, CalcitePlanContext context) { // 8. if override = false, drop both _row_number_ columns context.relBuilder.projectExcept( List.of( - context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_MAIN), - context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_SUBSEARCH))); + context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_MAIN), + context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_SUBSEARCH))); return context.relBuilder.peek(); } else { // 9. if override = true, override the duplicated columns in main by subsearch values @@ -998,11 +2026,11 @@ public RelNode visitAppendCol(AppendCol node, CalcitePlanContext context) { mainFields.stream().filter(subsearchFields::contains).collect(Collectors.toSet()); RexNode caseCondition = context.relBuilder.equals( - context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_MAIN), - context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_SUBSEARCH)); + context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_MAIN), + context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_SUBSEARCH)); for (int mainFieldIndex = 0; mainFieldIndex < mainFields.size(); mainFieldIndex++) { String mainFieldName = mainFields.get(mainFieldIndex); - if (mainFieldName.equals(ROW_NUMBER_COLUMN_NAME_MAIN)) { + if (mainFieldName.equals(ROW_NUMBER_COLUMN_FOR_MAIN)) { continue; } finalFieldNames.add(mainFieldName); @@ -1027,7 +2055,7 @@ public RelNode visitAppendCol(AppendCol node, CalcitePlanContext context) { subsearchFieldIndex < subsearchFields.size(); subsearchFieldIndex++) { String subsearchFieldName = subsearchFields.get(subsearchFieldIndex); - if (subsearchFieldName.equals(ROW_NUMBER_COLUMN_NAME_SUBSEARCH)) { + if (subsearchFieldName.equals(ROW_NUMBER_COLUMN_FOR_SUBSEARCH)) { continue; } if (!duplicatedFields.contains(subsearchFieldName)) { @@ -1040,6 +2068,81 @@ public RelNode visitAppendCol(AppendCol node, CalcitePlanContext context) { } } + @Override + public RelNode visitAppend(Append node, CalcitePlanContext context) { + // 1. Resolve main plan + visitChildren(node, context); + + // 2. Resolve subsearch plan + UnresolvedPlan prunedSubSearch = + node.getSubSearch().accept(new EmptySourcePropagateVisitor(), null); + prunedSubSearch.accept(this, context); + + // 3. Merge two query schemas using shared logic + RelNode subsearchNode = context.relBuilder.build(); + RelNode mainNode = context.relBuilder.build(); + + // Use shared schema merging logic that handles type conflicts via field renaming + List nodesToMerge = Arrays.asList(mainNode, subsearchNode); + List projectedNodes = + SchemaUnifier.buildUnifiedSchemaWithConflictResolution(nodesToMerge, context); + + // 4. 
Union the projected plans + for (RelNode projectedNode : projectedNodes) { + context.relBuilder.push(projectedNode); + } + context.relBuilder.union(true); + return context.relBuilder.peek(); + } + + @Override + public RelNode visitMultisearch(Multisearch node, CalcitePlanContext context) { + List subsearchNodes = new ArrayList<>(); + for (UnresolvedPlan subsearch : node.getSubsearches()) { + UnresolvedPlan prunedSubSearch = subsearch.accept(new EmptySourcePropagateVisitor(), null); + prunedSubSearch.accept(this, context); + subsearchNodes.add(context.relBuilder.build()); + } + + // Use shared schema merging logic that handles type conflicts via field renaming + List alignedNodes = + SchemaUnifier.buildUnifiedSchemaWithConflictResolution(subsearchNodes, context); + + for (RelNode alignedNode : alignedNodes) { + context.relBuilder.push(alignedNode); + } + context.relBuilder.union(true, alignedNodes.size()); + + RelDataType rowType = context.relBuilder.peek().getRowType(); + String timestampField = findTimestampField(rowType); + if (timestampField != null) { + RelDataTypeField timestampFieldRef = rowType.getField(timestampField, false, false); + if (timestampFieldRef != null) { + RexNode timestampRef = + context.rexBuilder.makeInputRef( + context.relBuilder.peek(), timestampFieldRef.getIndex()); + context.relBuilder.sort(context.relBuilder.desc(timestampRef)); + } + } + + return context.relBuilder.peek(); + } + + /** + * Finds the @timestamp field for multisearch ordering. Only @timestamp field is used for + * timestamp interleaving. Other timestamp-like fields are ignored. + * + * @param rowType The row type to search for @timestamp field + * @return "@timestamp" if the field exists, or null if not found + */ + private String findTimestampField(RelDataType rowType) { + RelDataTypeField field = rowType.getField("@timestamp", false, false); + if (field != null) { + return "@timestamp"; + } + return null; + } + /* * Unsupported Commands of PPL with Calcite for OpenSearch 3.0.0-beta */ @@ -1076,9 +2179,8 @@ public RelNode visitKmeans(Kmeans node, CalcitePlanContext context) { @Override public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) { visitChildren(node, context); - - ArgumentMap arguments = ArgumentMap.of(node.getArguments()); - String countFieldName = (String) arguments.get("countField").getValue(); + ArgumentMap argumentMap = ArgumentMap.of(node.getArguments()); + String countFieldName = (String) argumentMap.get(RareTopN.Option.countField.name()).getValue(); if (context.relBuilder.peek().getRowType().getFieldNames().contains(countFieldName)) { throw new IllegalArgumentException( "Field `" @@ -1089,13 +2191,32 @@ public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) { // 1. 
group the group-by list + field list and add a count() aggregation List groupExprList = new ArrayList<>(node.getGroupExprList()); List fieldList = - node.getFields().stream().map(f -> (UnresolvedExpression) f).collect(Collectors.toList()); + node.getFields().stream().map(f -> (UnresolvedExpression) f).toList(); groupExprList.addAll(fieldList); List aggExprList = List.of(AstDSL.alias(countFieldName, AstDSL.aggregate("count", null))); + + // if usenull=false, add a isNotNull before Aggregate and the hint to this Aggregate + Boolean bucketNullable = (Boolean) argumentMap.get(RareTopN.Option.useNull.name()).getValue(); + boolean toAddHintsOnAggregate = false; + if (!bucketNullable && !groupExprList.isEmpty()) { + toAddHintsOnAggregate = true; + // add isNotNull filter before aggregation to filter out null bucket + List groupByList = + groupExprList.stream().map(expr -> rexVisitor.analyze(expr, context)).toList(); + context.relBuilder.filter( + PlanUtils.getSelectColumns(groupByList).stream() + .map(context.relBuilder::field) + .map(context.relBuilder::isNotNull) + .toList()); + } aggregateWithTrimming(groupExprList, aggExprList, context); - // 2. add a window column + if (toAddHintsOnAggregate) { + addIgnoreNullBucketHintToAggregate(context); + } + + // 2. add count() column with sort direction List partitionKeys = rexVisitor.analyze(node.getGroupExprList(), context); RexNode countField; if (node.getCommandType() == RareTopN.CommandType.TOP) { @@ -1103,6 +2224,7 @@ public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) { } else { countField = context.relBuilder.field(countFieldName); } + RexNode rowNumberWindowOver = PlanUtils.makeOver( context, @@ -1113,26 +2235,46 @@ public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) { List.of(countField), WindowFrame.toCurrentRow()); context.relBuilder.projectPlus( - context.relBuilder.alias(rowNumberWindowOver, ROW_NUMBER_COLUMN_NAME)); + context.relBuilder.alias(rowNumberWindowOver, ROW_NUMBER_COLUMN_FOR_RARE_TOP)); // 3. filter row_number() <= k in each partition - Integer N = (Integer) arguments.get("noOfResults").getValue(); + int k = node.getNoOfResults(); context.relBuilder.filter( context.relBuilder.lessThanOrEqual( - context.relBuilder.field(ROW_NUMBER_COLUMN_NAME), context.relBuilder.literal(N))); + context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_RARE_TOP), + context.relBuilder.literal(k))); // 4. project final output. 
the default output is group by list + field list - Boolean showCount = (Boolean) arguments.get("showCount").getValue(); + Boolean showCount = (Boolean) argumentMap.get(RareTopN.Option.showCount.name()).getValue(); if (showCount) { - context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_NAME)); + context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_RARE_TOP)); } else { context.relBuilder.projectExcept( - context.relBuilder.field(ROW_NUMBER_COLUMN_NAME), + context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_RARE_TOP), context.relBuilder.field(countFieldName)); } return context.relBuilder.peek(); } + private static void addIgnoreNullBucketHintToAggregate(CalcitePlanContext context) { + final RelHint statHits = + RelHint.builder("stats_args").hintOption(Argument.BUCKET_NULLABLE, "false").build(); + assert context.relBuilder.peek() instanceof LogicalAggregate + : "Stats hits should be added to LogicalAggregate"; + context.relBuilder.hints(statHits); + context + .relBuilder + .getCluster() + .setHintStrategies( + HintStrategyTable.builder() + .hintStrategy( + "stats_args", + (hint, rel) -> { + return rel instanceof LogicalAggregate; + }) + .build()); + } + @Override public RelNode visitTableFunction(TableFunction node, CalcitePlanContext context) { throw new CalciteUnsupportedException("Table function is unsupported in Calcite"); @@ -1159,7 +2301,7 @@ public RelNode visitFlatten(Flatten node, CalcitePlanContext context) { List fieldsToExpand = relBuilder.peek().getRowType().getFieldList().stream() .filter(f -> f.getName().startsWith(fieldName + ".")) - .collect(Collectors.toList()); + .toList(); List expandedFieldNames; if (node.getAliases() != null) { @@ -1195,6 +2337,334 @@ public RelNode visitFlatten(Flatten node, CalcitePlanContext context) { return relBuilder.peek(); } + /** Helper method to get the function name for proper column naming */ + private String getValueFunctionName(UnresolvedExpression aggregateFunction) { + if (aggregateFunction instanceof Alias) { + return ((Alias) aggregateFunction).getName(); + } + if (!(aggregateFunction instanceof AggregateFunction)) { + return "value"; + } + + AggregateFunction aggFunc = (AggregateFunction) aggregateFunction; + String funcName = aggFunc.getFuncName().toLowerCase(); + List args = new ArrayList<>(); + if (aggFunc.getField() != null) { + args.add(aggFunc.getField()); + } + if (aggFunc.getArgList() != null) { + args.addAll(aggFunc.getArgList()); + } + + if (args.isEmpty() || funcName.equals("count")) { + // Special case for count() to show as just "count" instead of "count(AllFields())" + return "count"; + } + + // Build the full function call string like "avg(cpu_usage)" + StringBuilder sb = new StringBuilder(funcName).append("("); + for (int i = 0; i < args.size(); i++) { + if (i > 0) sb.append(", "); + if (args.get(i) instanceof Field) { + sb.append(((Field) args.get(i)).getField().toString()); + } else { + sb.append(args.get(i).toString()); + } + } + sb.append(")"); + return sb.toString(); + } + + /** Transforms timechart command into SQL-based operations. 
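A minimal sketch, assuming a hypothetical index and {@code host} field: {@code source=metrics | timechart span=1m avg(cpu_usage) by host} aggregates per span and host, keeps the {@code limit} (default 10) hosts with the largest grand totals, folds the rest into an {@code OTHER} series, and zero-fills {@code count} results per timestamp. 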
*/ + @Override + public RelNode visitTimechart( + org.opensearch.sql.ast.tree.Timechart node, CalcitePlanContext context) { + visitChildren(node, context); + + // Extract parameters + UnresolvedExpression spanExpr = node.getBinExpression(); + + List groupExprList = Arrays.asList(spanExpr); + + // Handle no by field case + if (node.getByField() == null) { + String valueFunctionName = getValueFunctionName(node.getAggregateFunction()); + + // Create group expression list with just the timestamp span but use a different alias + // to avoid @timestamp naming conflict + List simpleGroupExprList = new ArrayList<>(); + simpleGroupExprList.add(new Alias("timestamp", spanExpr)); + // Create agg expression list with the aggregate function + List simpleAggExprList = + List.of(new Alias(valueFunctionName, node.getAggregateFunction())); + // Create an Aggregation object + Aggregation aggregation = + new Aggregation( + simpleAggExprList, + Collections.emptyList(), + simpleGroupExprList, + null, + Collections.emptyList()); + // Use visitAggregation to handle the aggregation and column naming + RelNode result = visitAggregation(aggregation, context); + // Push the result and add explicit projection to get [@timestamp, count] order + context.relBuilder.push(result); + // Reorder fields: timestamp first, then count + context.relBuilder.project( + context.relBuilder.field("timestamp"), context.relBuilder.field(valueFunctionName)); + // Rename timestamp to @timestamp + context.relBuilder.rename(List.of("@timestamp", valueFunctionName)); + + context.relBuilder.sort(context.relBuilder.field(0)); + return context.relBuilder.peek(); + } + + // Extract parameters for byField case + UnresolvedExpression byField = node.getByField(); + String byFieldName = ((Field) byField).getField().toString(); + String valueFunctionName = getValueFunctionName(node.getAggregateFunction()); + + int limit = Optional.ofNullable(node.getLimit()).orElse(10); + boolean useOther = Optional.ofNullable(node.getUseOther()).orElse(true); + + try { + // Step 1: Initial aggregation - IMPORTANT: order is [spanExpr, byField] + groupExprList = Arrays.asList(spanExpr, byField); + aggregateWithTrimming(groupExprList, List.of(node.getAggregateFunction()), context); + + // First rename the timestamp field (2nd to last) to @timestamp + List fieldNames = context.relBuilder.peek().getRowType().getFieldNames(); + List renamedFields = new ArrayList<>(fieldNames); + // TODO: Fix aggregateWithTrimming reordering + renamedFields.set(fieldNames.size() - 2, "@timestamp"); + context.relBuilder.rename(renamedFields); + + // Then reorder: @timestamp first, then byField, then value function + List outputFields = context.relBuilder.fields(); + List reordered = new ArrayList<>(); + reordered.add(context.relBuilder.field("@timestamp")); // timestamp first + reordered.add(context.relBuilder.field(byFieldName)); // byField second + reordered.add(outputFields.get(outputFields.size() - 1)); // value function last + context.relBuilder.project(reordered); + + // Handle no limit case - just sort and return with proper field aliases + if (limit == 0) { + // Add final projection with proper aliases: [@timestamp, byField, valueFunctionName] + context.relBuilder.project( + context.relBuilder.alias(context.relBuilder.field(0), "@timestamp"), + context.relBuilder.alias(context.relBuilder.field(1), byFieldName), + context.relBuilder.alias(context.relBuilder.field(2), valueFunctionName)); + context.relBuilder.sort(context.relBuilder.field(0), context.relBuilder.field(1)); + return 
context.relBuilder.peek(); + } + + // Use known field positions after reordering: 0=@timestamp, 1=byField, 2=value + RelNode completeResults = context.relBuilder.build(); + + // Step 2: Find top N categories using window function approach (more efficient than separate + // aggregation) + RelNode topCategories = buildTopCategoriesQuery(completeResults, limit, context); + + // Step 3: Apply OTHER logic with single pass + return buildFinalResultWithOther( + completeResults, topCategories, byFieldName, valueFunctionName, useOther, limit, context); + + } catch (Exception e) { + throw new RuntimeException("Error in visitTimechart: " + e.getMessage(), e); + } + } + + /** Build top categories query - simpler approach that works better with OTHER handling */ + private RelNode buildTopCategoriesQuery( + RelNode completeResults, int limit, CalcitePlanContext context) { + context.relBuilder.push(completeResults); + + // Filter out null values when determining top categories - null should not count towards limit + context.relBuilder.filter(context.relBuilder.isNotNull(context.relBuilder.field(1))); + + // Get totals for non-null categories - field positions: 0=@timestamp, 1=byField, 2=value + context.relBuilder.aggregate( + context.relBuilder.groupKey(context.relBuilder.field(1)), + context.relBuilder.sum(context.relBuilder.field(2)).as("grand_total")); + + // Apply sorting and limit to non-null categories only + context.relBuilder.sort(context.relBuilder.desc(context.relBuilder.field("grand_total"))); + if (limit > 0) { + context.relBuilder.limit(0, limit); + } + + return context.relBuilder.build(); + } + + /** Build final result with OTHER category using efficient single-pass approach */ + private RelNode buildFinalResultWithOther( + RelNode completeResults, + RelNode topCategories, + String byFieldName, + String valueFunctionName, + boolean useOther, + int limit, + CalcitePlanContext context) { + + // Use zero-filling for count aggregations, standard result for others + if (valueFunctionName.equals("count")) { + return buildZeroFilledResult( + completeResults, topCategories, byFieldName, valueFunctionName, useOther, limit, context); + } else { + return buildStandardResult( + completeResults, topCategories, byFieldName, valueFunctionName, useOther, context); + } + } + + /** Build standard result without zero-filling */ + private RelNode buildStandardResult( + RelNode completeResults, + RelNode topCategories, + String byFieldName, + String valueFunctionName, + boolean useOther, + CalcitePlanContext context) { + + context.relBuilder.push(completeResults); + context.relBuilder.push(topCategories); + + // LEFT JOIN to identify top categories - field positions: 0=@timestamp, 1=byField, 2=value + context.relBuilder.join( + org.apache.calcite.rel.core.JoinRelType.LEFT, + context.relBuilder.equals( + context.relBuilder.field(2, 0, 1), context.relBuilder.field(2, 1, 0))); + + // Calculate field position after join + int topCategoryFieldIndex = completeResults.getRowType().getFieldCount(); + + // Create CASE expression for OTHER logic + RexNode categoryExpr = createOtherCaseExpression(topCategoryFieldIndex, 1, context); + + // Project and aggregate + context.relBuilder.project( + context.relBuilder.alias(context.relBuilder.field(0), "@timestamp"), + context.relBuilder.alias(categoryExpr, byFieldName), + context.relBuilder.alias(context.relBuilder.field(2), valueFunctionName)); + + context.relBuilder.aggregate( + context.relBuilder.groupKey(context.relBuilder.field(0), context.relBuilder.field(1)), + 
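+        // summing per (timestamp, category) collapses every row remapped to OTHER into a single bucket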
context.relBuilder.sum(context.relBuilder.field(2)).as(valueFunctionName)); + + applyFiltersAndSort(useOther, context); + return context.relBuilder.peek(); + } + + /** Helper to create OTHER case expression - preserves NULL as a category */ + private RexNode createOtherCaseExpression( + int topCategoryFieldIndex, int byIndex, CalcitePlanContext context) { + return context.relBuilder.call( + org.apache.calcite.sql.fun.SqlStdOperatorTable.CASE, + context.relBuilder.isNotNull(context.relBuilder.field(topCategoryFieldIndex)), + context.relBuilder.field(byIndex), // Keep original value (including NULL) + context.relBuilder.call( + org.apache.calcite.sql.fun.SqlStdOperatorTable.CASE, + context.relBuilder.isNull(context.relBuilder.field(byIndex)), + context.relBuilder.literal(null), // Preserve NULL as NULL + context.relBuilder.literal("OTHER"))); + } + + /** Helper to apply filters and sorting */ + private void applyFiltersAndSort(boolean useOther, CalcitePlanContext context) { + if (!useOther) { + context.relBuilder.filter( + context.relBuilder.notEquals( + context.relBuilder.field(1), context.relBuilder.literal("OTHER"))); + } + context.relBuilder.sort(context.relBuilder.field(0), context.relBuilder.field(1)); + } + + /** Build zero-filled result using fillnull pattern - treat NULL as just another category */ + private RelNode buildZeroFilledResult( + RelNode completeResults, + RelNode topCategories, + String byFieldName, + String valueFunctionName, + boolean useOther, + int limit, + CalcitePlanContext context) { + + // Get all unique timestamps - field positions: 0=@timestamp, 1=byField, 2=value + context.relBuilder.push(completeResults); + context.relBuilder.aggregate(context.relBuilder.groupKey(context.relBuilder.field(0))); + RelNode allTimestamps = context.relBuilder.build(); + + // Get all categories for zero-filling - apply OTHER logic here too + context.relBuilder.push(completeResults); + context.relBuilder.push(topCategories); + context.relBuilder.join( + org.apache.calcite.rel.core.JoinRelType.LEFT, + context.relBuilder.call( + org.apache.calcite.sql.fun.SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, + context.relBuilder.field(2, 0, 1), + context.relBuilder.field(2, 1, 0))); + + int topCategoryFieldIndex = completeResults.getRowType().getFieldCount(); + RexNode categoryExpr = createOtherCaseExpression(topCategoryFieldIndex, 1, context); + + context.relBuilder.project(categoryExpr); + context.relBuilder.aggregate(context.relBuilder.groupKey(context.relBuilder.field(0))); + RelNode allCategories = context.relBuilder.build(); + + // Cross join timestamps with ALL categories (including OTHER) for zero-filling + context.relBuilder.push(allTimestamps); + context.relBuilder.push(allCategories); + context.relBuilder.join( + org.apache.calcite.rel.core.JoinRelType.INNER, context.relBuilder.literal(true)); + + // Create zero-filled combinations with count=0 + context.relBuilder.project( + context.relBuilder.alias( + context.relBuilder.cast(context.relBuilder.field(0), SqlTypeName.TIMESTAMP), + "@timestamp"), + context.relBuilder.alias(context.relBuilder.field(1), byFieldName), + context.relBuilder.alias(context.relBuilder.literal(0), valueFunctionName)); + RelNode zeroFilledCombinations = context.relBuilder.build(); + + // Get actual results with OTHER logic applied + context.relBuilder.push(completeResults); + context.relBuilder.push(topCategories); + context.relBuilder.join( + org.apache.calcite.rel.core.JoinRelType.LEFT, + // Use IS NOT DISTINCT FROM for proper null handling in join + 
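+        // (a plain equals-join would drop NULL categories, since NULL = NULL evaluates to UNKNOWN in SQL)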
context.relBuilder.call( + org.apache.calcite.sql.fun.SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, + context.relBuilder.field(2, 0, 1), + context.relBuilder.field(2, 1, 0))); + + int actualTopCategoryFieldIndex = completeResults.getRowType().getFieldCount(); + RexNode actualCategoryExpr = createOtherCaseExpression(actualTopCategoryFieldIndex, 1, context); + + context.relBuilder.project( + context.relBuilder.alias( + context.relBuilder.cast(context.relBuilder.field(0), SqlTypeName.TIMESTAMP), + "@timestamp"), + context.relBuilder.alias(actualCategoryExpr, byFieldName), + context.relBuilder.alias(context.relBuilder.field(2), valueFunctionName)); + + context.relBuilder.aggregate( + context.relBuilder.groupKey(context.relBuilder.field(0), context.relBuilder.field(1)), + context.relBuilder.sum(context.relBuilder.field(2)).as("actual_count")); + RelNode actualResults = context.relBuilder.build(); + + // UNION zero-filled with actual results + context.relBuilder.push(actualResults); + context.relBuilder.push(zeroFilledCombinations); + context.relBuilder.union(false); + + // Aggregate to combine actual and zero-filled data + context.relBuilder.aggregate( + context.relBuilder.groupKey(context.relBuilder.field(0), context.relBuilder.field(1)), + context.relBuilder.sum(context.relBuilder.field(2)).as(valueFunctionName)); + + applyFiltersAndSort(useOther, context); + return context.relBuilder.peek(); + } + @Override public RelNode visitTrendline(Trendline node, CalcitePlanContext context) { visitChildren(node, context); @@ -1244,7 +2714,7 @@ public RelNode visitTrendline(Trendline node, CalcitePlanContext context) { RexNode thenExpr; switch (trendlineComputation.getComputationType()) { - case SMA: + case TrendlineType.SMA: // THEN avg(field) over (ROWS (windowSize-1) PRECEDING) thenExpr = PlanUtils.makeOver( @@ -1256,7 +2726,7 @@ public RelNode visitTrendline(Trendline node, CalcitePlanContext context) { List.of(), windowFrame); break; - case WMA: + case TrendlineType.WMA: // THEN wma expression thenExpr = buildWmaRexNode( @@ -1345,6 +2815,61 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { return context.relBuilder.peek(); } + @Override + public RelNode visitValues(Values values, CalcitePlanContext context) { + if (values.getValues() == null || values.getValues().isEmpty()) { + context.relBuilder.values(context.relBuilder.getTypeFactory().builder().build()); + return context.relBuilder.peek(); + } else { + throw new CalciteUnsupportedException("Explicit values node is unsupported in Calcite"); + } + } + + @Override + public RelNode visitReplace(Replace node, CalcitePlanContext context) { + visitChildren(node, context); + + List fieldNames = context.relBuilder.peek().getRowType().getFieldNames(); + + // Create a set of field names to replace for quick lookup + Set fieldsToReplace = + node.getFieldList().stream().map(f -> f.getField().toString()).collect(Collectors.toSet()); + + // Validate that all fields to replace exist by calling field() on each + // This leverages relBuilder.field()'s built-in validation which throws + // IllegalArgumentException if any field doesn't exist + for (String fieldToReplace : fieldsToReplace) { + context.relBuilder.field(fieldToReplace); + } + + List projectList = new ArrayList<>(); + + // Project all fields, replacing specified ones in-place + for (String fieldName : fieldNames) { + if (fieldsToReplace.contains(fieldName)) { + // Replace this field in-place with all pattern/replacement pairs applied sequentially + RexNode fieldRef = 
context.relBuilder.field(fieldName); + + // Apply all replacement pairs sequentially (nested REPLACE calls) + for (ReplacePair pair : node.getReplacePairs()) { + RexNode patternNode = rexVisitor.analyze(pair.getPattern(), context); + RexNode replacementNode = rexVisitor.analyze(pair.getReplacement(), context); + fieldRef = + context.relBuilder.call( + SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); + } + + projectList.add(fieldRef); + } else { + // Keep original field unchanged + projectList.add(context.relBuilder.field(fieldName)); + } + } + + context.relBuilder.project(projectList, fieldNames); + return context.relBuilder.peek(); + } + private void buildParseRelNode(Parse node, CalcitePlanContext context) { RexNode sourceField = rexVisitor.analyze(node.getSourceField(), context); ParseMethod parseMethod = node.getParseMethod(); @@ -1363,32 +2888,82 @@ private void buildParseRelNode(Parse node, CalcitePlanContext context) { pattern, context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), true) }; if (ParseMethod.PATTERNS.equals(parseMethod)) { - rexNodeList = ArrayUtils.add(rexNodeList, context.relBuilder.literal("<*>")); + rexNodeList = + ArrayUtils.add( + rexNodeList, + context.rexBuilder.makeLiteral( + "<*>", + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true)); + } else { + rexNodeList = + ArrayUtils.add( + rexNodeList, + context.rexBuilder.makeLiteral( + parseMethod.getName(), + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true)); } List newFields = new ArrayList<>(); for (String groupCandidate : groupCandidates) { RexNode innerRex = PPLFuncImpTable.INSTANCE.resolve( context.rexBuilder, ParseUtils.BUILTIN_FUNCTION_MAP.get(parseMethod), rexNodeList); - if (ParseMethod.GROK.equals(parseMethod)) { + if (!ParseMethod.PATTERNS.equals(parseMethod)) { newFields.add( PPLFuncImpTable.INSTANCE.resolve( context.rexBuilder, BuiltinFunctionName.INTERNAL_ITEM, innerRex, - context.relBuilder.literal(groupCandidate))); + context.rexBuilder.makeLiteral( + groupCandidate, + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true))); } else { - newFields.add(innerRex); + RexNode emptyString = + context.rexBuilder.makeLiteral( + "", context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), true); + RexNode isEmptyCondition = + context.rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, sourceField, emptyString); + RexNode isNullCondition = + context.rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, sourceField); + // Calcite regexp_replace(string, string, string) doesn't accept empty string. + // So use case when condition here to handle corner cases + newFields.add( + context.rexBuilder.makeCall( + SqlStdOperatorTable.CASE, // case + isNullCondition, + emptyString, // when field is NULL then '' + isEmptyCondition, + emptyString, // when field = '' then '' + innerRex // else regexp_replace(field, regex, replace_string) + )); } } projectPlusOverriding(newFields, groupCandidates, context); } + /** + * CALCITE-6981 introduced a stricter type checking for Array type in {@link RexToLixTranslator}. + * We defined a MAP(VARCHAR, ANY) in {@link UserDefinedFunctionUtils#nullablePatternAggList}, when + * we convert the value type to ArraySqlType, it will check the source data type by {@link + * RelDataType#getComponentType()} which will return null due to the source type is ANY. 
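+   * As a workaround, this method rewrites the input ref's declared type to a MAP whose value type is
+   * an explicit ARRAY(targetType), so {@code getComponentType()} is well-defined before the cast.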
+ */ + private RexNode explicitMapType( + CalcitePlanContext context, RexNode origin, SqlTypeName targetType) { + MapSqlType originalMapType = (MapSqlType) origin.getType(); + ArraySqlType newValueType = + new ArraySqlType(context.rexBuilder.getTypeFactory().createSqlType(targetType), true); + MapSqlType newMapType = new MapSqlType(originalMapType.getKeyType(), newValueType, true); + return new RexInputRef(((RexInputRef) origin).getIndex(), newMapType); + } + private void flattenParsedPattern( String originalPatternResultAlias, RexNode parsedNode, CalcitePlanContext context, - boolean flattenPatternCount) { + boolean flattenPatternAggResult, + Boolean showNumberedToken) { List fattenedNodes = new ArrayList<>(); List projectNames = new ArrayList<>(); // Flatten map struct fields @@ -1404,7 +2979,7 @@ private void flattenParsedPattern( true); fattenedNodes.add(context.relBuilder.alias(patternExpr, originalPatternResultAlias)); projectNames.add(originalPatternResultAlias); - if (flattenPatternCount) { + if (flattenPatternAggResult) { RexNode patternCountExpr = context.rexBuilder.makeCast( context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.BIGINT), @@ -1418,18 +2993,38 @@ private void flattenParsedPattern( fattenedNodes.add(context.relBuilder.alias(patternCountExpr, PatternUtils.PATTERN_COUNT)); projectNames.add(PatternUtils.PATTERN_COUNT); } - RexNode tokensExpr = - context.rexBuilder.makeCast( - UserDefinedFunctionUtils.tokensMap, - PPLFuncImpTable.INSTANCE.resolve( - context.rexBuilder, - BuiltinFunctionName.INTERNAL_ITEM, - parsedNode, - context.rexBuilder.makeLiteral(PatternUtils.TOKENS)), - true, - true); - fattenedNodes.add(context.relBuilder.alias(tokensExpr, PatternUtils.TOKENS)); - projectNames.add(PatternUtils.TOKENS); + if (showNumberedToken) { + RexNode tokensExpr = + context.rexBuilder.makeCast( + UserDefinedFunctionUtils.tokensMap, + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_ITEM, + parsedNode, + context.rexBuilder.makeLiteral(PatternUtils.TOKENS)), + true, + true); + fattenedNodes.add(context.relBuilder.alias(tokensExpr, PatternUtils.TOKENS)); + projectNames.add(PatternUtils.TOKENS); + } + if (flattenPatternAggResult) { + RexNode sampleLogsExpr = + context.rexBuilder.makeCast( + context + .rexBuilder + .getTypeFactory() + .createArrayType( + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), -1), + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_ITEM, + explicitMapType(context, parsedNode, SqlTypeName.VARCHAR), + context.rexBuilder.makeLiteral(PatternUtils.SAMPLE_LOGS)), + true, + true); + fattenedNodes.add(context.relBuilder.alias(sampleLogsExpr, PatternUtils.SAMPLE_LOGS)); + projectNames.add(PatternUtils.SAMPLE_LOGS); + } projectPlusOverriding(fattenedNodes, projectNames, context); } @@ -1480,4 +3075,115 @@ private void buildExpandRelNode( context.relBuilder.rename(names); } } + + /** Creates an optimized sed call using native Calcite functions */ + private RexNode createOptimizedSedCall( + RexNode fieldRex, String sedExpression, CalcitePlanContext context) { + if (sedExpression.startsWith("s/")) { + return createOptimizedSubstitution(fieldRex, sedExpression, context); + } else if (sedExpression.startsWith("y/")) { + return createOptimizedTransliteration(fieldRex, sedExpression, context); + } else { + throw new RuntimeException("Unsupported sed pattern: " + sedExpression); + } + } + + /** Creates optimized substitution calls for s/pattern/replacement/flags 
syntax. */ + private RexNode createOptimizedSubstitution( + RexNode fieldRex, String sedExpression, CalcitePlanContext context) { + try { + // Parse sed substitution: s/pattern/replacement/flags + if (!sedExpression.matches("s/.+/.*/.*")) { + throw new IllegalArgumentException("Invalid sed substitution format"); + } + + // Find the delimiters - sed format is s/pattern/replacement/flags + int firstDelimiter = sedExpression.indexOf('/', 2); // First '/' after 's/' + int secondDelimiter = sedExpression.indexOf('/', firstDelimiter + 1); // Second '/' + int thirdDelimiter = sedExpression.indexOf('/', secondDelimiter + 1); // Third '/' (optional) + + if (firstDelimiter == -1 || secondDelimiter == -1) { + throw new IllegalArgumentException("Invalid sed substitution format"); + } + + String pattern = sedExpression.substring(2, firstDelimiter); + String replacement = sedExpression.substring(firstDelimiter + 1, secondDelimiter); + String flags = + secondDelimiter + 1 < sedExpression.length() + ? sedExpression.substring(secondDelimiter + 1) + : ""; + + // Convert sed backreferences (\1, \2) to Java style ($1, $2) + String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1"); + + if (flags.isEmpty()) { + // 3-parameter REGEXP_REPLACE + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_3, + fieldRex, + context.rexBuilder.makeLiteral(pattern), + context.rexBuilder.makeLiteral(javaReplacement)); + } else if (flags.matches("[gi]+")) { + // 4-parameter REGEXP_REPLACE with flags + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4, + fieldRex, + context.rexBuilder.makeLiteral(pattern), + context.rexBuilder.makeLiteral(javaReplacement), + context.rexBuilder.makeLiteral(flags)); + } else if (flags.matches("\\d+")) { + // 5-parameter REGEXP_REPLACE with occurrence + int occurrence = Integer.parseInt(flags); + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5, + fieldRex, + context.rexBuilder.makeLiteral(pattern), + context.rexBuilder.makeLiteral(javaReplacement), + context.relBuilder.literal(1), // start position + context.relBuilder.literal(occurrence)); + } else { + throw new RuntimeException( + "Unsupported sed flags: " + flags + " in expression: " + sedExpression); + } + } catch (Exception e) { + throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e); + } + } + + /** Creates optimized transliteration calls for y/from/to/ syntax. */ + private RexNode createOptimizedTransliteration( + RexNode fieldRex, String sedExpression, CalcitePlanContext context) { + try { + // Parse sed transliteration: y/from/to/ + if (!sedExpression.matches("y/.+/.*/.*")) { + throw new IllegalArgumentException("Invalid sed transliteration format"); + } + + int firstSlash = sedExpression.indexOf('/', 1); + int secondSlash = sedExpression.indexOf('/', firstSlash + 1); + int thirdSlash = sedExpression.indexOf('/', secondSlash + 1); + + if (firstSlash == -1 || secondSlash == -1) { + throw new IllegalArgumentException("Invalid sed transliteration format"); + } + + String from = sedExpression.substring(firstSlash + 1, secondSlash); + String to = + sedExpression.substring( + secondSlash + 1, thirdSlash != -1 ? 
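/* trailing delimiter is optional */ 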
thirdSlash : sedExpression.length()); + + // Use Calcite's native TRANSLATE3 function + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_TRANSLATE3, + fieldRex, + context.rexBuilder.makeLiteral(from), + context.rexBuilder.makeLiteral(to)); + } catch (Exception e) { + throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e); + } + } } diff --git a/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java b/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java new file mode 100644 index 00000000000..f1671e0eb63 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java @@ -0,0 +1,156 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.plan; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelRule; +import org.apache.calcite.rel.logical.LogicalAggregate; +import org.apache.calcite.rel.logical.LogicalProject; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexUtil; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.tools.RelBuilder; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.calcite.util.mapping.Mapping; +import org.apache.calcite.util.mapping.Mappings; +import org.apache.commons.lang3.tuple.Pair; +import org.immutables.value.Value; +import org.opensearch.sql.calcite.utils.CalciteUtils; + +/** + * Planner rule that merge multiple agg group fields into a single one, on which all other group + * fields depend. e.g. + * + *

stats ... by a, f1(a), f2(a) -> stats ... by a | eval `f1(a)` = f1(a), `f2(a)` = f2(a) + * + *

TODO: this rule could be expanded further for more cases: 1. support multiple base group
+ * fields, e.g. stats ... by a, f1(a), b, f2(b), f3(a, b) -> stats ... by a, b | eval `f1(a)` =
+ * f1(a), `f2(b)` = f2(b), `f3(a, b)` = f3(a, b) 2. support no base fields, e.g. stats ... by f1(a),
+ * f2(a) -> stats ... by a | eval `f1(a)` = f1(a), `f2(a)` = f2(a) | fields - a Note that one of
+ * these UDFs' outputs must have the same cardinality as `a`.
+ */
+@Value.Enclosing
+public class PPLAggGroupMergeRule extends RelRule<PPLAggGroupMergeRule.Config> {
+
+  /** Creates a PPLAggGroupMergeRule. */
+  protected PPLAggGroupMergeRule(Config config) {
+    super(config);
+  }
+
+  @Override
+  public void onMatch(RelOptRuleCall call) {
+    if (call.rels.length == 2) {
+      final LogicalAggregate aggregate = call.rel(0);
+      final LogicalProject project = call.rel(1);
+      apply(call, aggregate, project);
+    } else {
+      throw new AssertionError(
+          String.format(
+              "The length of rels should be %s but got %s",
+              this.operands.size(), call.rels.length));
+    }
+  }
+
+  public void apply(RelOptRuleCall call, LogicalAggregate aggregate, LogicalProject project) {
+    List<Integer> groupSet = aggregate.getGroupSet().asList();
+    List<RexNode> groupNodes =
+        groupSet.stream().map(group -> project.getProjects().get(group)).toList();
+    Pair<List<Integer>, List<Integer>> baseFieldsAndOthers =
+        CalciteUtils.partition(
+            groupSet, i -> project.getProjects().get(i).getKind() == SqlKind.INPUT_REF);
+    List<Integer> baseGroupList = baseFieldsAndOthers.getLeft();
+    // TODO: support more base fields in the future.
+    if (baseGroupList.size() != 1) return;
+    Integer baseGroupField = baseGroupList.get(0);
+    RexInputRef baseGroupRef = (RexInputRef) project.getProjects().get(baseGroupField);
+    List<Integer> otherGroupList = baseFieldsAndOthers.getRight();
+    boolean allDependOnBaseField =
+        otherGroupList.stream()
+            .map(i -> project.getProjects().get(i))
+            .allMatch(node -> isDependentField(node, List.of(baseGroupRef)));
+    if (!allDependOnBaseField) return;
+
+    final RelBuilder relBuilder = call.builder();
+    relBuilder.push(project);
+
+    relBuilder.aggregate(
+        relBuilder.groupKey(ImmutableBitSet.of(baseGroupField)), aggregate.getAggCallList());
+
+    /* Build the final project-aggregate-project */
+    final Mapping mapping =
+        Mappings.target(
+            List.of(baseGroupRef.getIndex()),
+            baseGroupRef.getIndex() + 1); // set source count greater than the max ref index
+    List<RexNode> parentProjections = new ArrayList<>(RexUtil.apply(mapping, groupNodes));
+    List<RexNode> aggCallRefs =
+        relBuilder.fields(
+            IntStream.range(baseGroupList.size(), relBuilder.peek().getRowType().getFieldCount())
+                .boxed()
+                .toList());
+    parentProjections.addAll(aggCallRefs);
+    relBuilder.project(parentProjections);
+    call.transformTo(relBuilder.build());
+  }
+
+  /** Rule configuration. */
+  @Value.Immutable
+  public interface Config extends RelRule.Config {
+    Config GROUP_MERGE =
+        ImmutablePPLAggGroupMergeRule.Config.builder()
+            .build()
+            .withOperandSupplier(
+                b0 ->
+                    b0.operand(LogicalAggregate.class)
+                        .predicate(Config::containsMultipleGroupSets)
+                        .oneInput(
+                            b1 ->
+                                b1.operand(LogicalProject.class)
+                                    .predicate(Config::containsDependentFields)
+                                    .anyInputs()));
+
+    static boolean containsMultipleGroupSets(LogicalAggregate aggregate) {
+      return aggregate.getGroupSet().cardinality() > 1;
+    }
+
+    // Only a rough pre-check here, since we don't yet know which fields are group fields.
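+    // The precise dependency check runs again in apply(), once the Aggregate's group set
+    // identifies the base field.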
+ static boolean containsDependentFields(LogicalProject project) { + Set<RexNode> baseFields = + project.getProjects().stream() + .filter(node -> node.getKind() == SqlKind.INPUT_REF) + .collect(Collectors.toUnmodifiableSet()); + return project.getProjects().stream() + .anyMatch(node -> PPLAggGroupMergeRule.isDependentField(node, baseFields)); + } + + @Override + default PPLAggGroupMergeRule toRule() { + return new PPLAggGroupMergeRule(this); + } + } + + public static boolean isDependentField(RexNode node, Collection<RexNode> baseFields) { + // Always view a literal field as a dependent field here, since we can always implement a function + // to transform a field into such a literal. + if (node.getKind() == SqlKind.LITERAL) return true; + if (node.getKind() == SqlKind.INPUT_REF && baseFields.contains(node)) return true; + // Use !isAggregator to rule out window functions like row_number() + if (node instanceof RexCall + && ((RexCall) node).getOperator().isDeterministic() + && !((RexCall) node).getOperator().isAggregator()) { + return ((RexCall) node) + .getOperands().stream().allMatch(op -> isDependentField(op, baseFields)); + } + return false; + } +} diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java index def0e28bcdc..c9f30d650ec 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java @@ -12,23 +12,38 @@ import static org.apache.calcite.rex.RexWindowBounds.preceding; import com.google.common.collect.ImmutableList; +import java.lang.reflect.Method; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; import java.util.stream.Collectors; import javax.annotation.Nullable; import org.apache.calcite.plan.RelOptTable; import org.apache.calcite.rel.RelHomogeneousShuttle; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.RelShuttle; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.core.Sort; import org.apache.calcite.rel.core.TableScan; +import org.apache.calcite.rel.logical.LogicalProject; +import org.apache.calcite.rel.logical.LogicalSort; +import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexCorrelVariable; import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; import org.apache.calcite.rex.RexVisitorImpl; +import org.apache.calcite.rex.RexWindow; import org.apache.calcite.rex.RexWindowBound; +import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.tools.RelBuilder; +import org.apache.calcite.util.Pair; import org.apache.calcite.util.Util; import org.opensearch.sql.ast.AbstractNodeVisitor; import org.opensearch.sql.ast.Node; @@ -44,9 +59,13 @@ public interface PlanUtils { - String ROW_NUMBER_COLUMN_NAME = "_row_number_"; - String ROW_NUMBER_COLUMN_NAME_MAIN = "_row_number_main_"; - String ROW_NUMBER_COLUMN_NAME_SUBSEARCH = "_row_number_subsearch_"; + /** This is only for the dedup command; do not reuse it in other commands. */ + String ROW_NUMBER_COLUMN_FOR_DEDUP = "_row_number_dedup_"; + + String ROW_NUMBER_COLUMN_FOR_RARE_TOP = "_row_number_rare_top_"; + String ROW_NUMBER_COLUMN_FOR_MAIN = "_row_number_main_"; + String
ROW_NUMBER_COLUMN_FOR_SUBSEARCH = "_row_number_subsearch_"; + String ROW_NUMBER_COLUMN_FOR_STREAMSTATS = "__stream_seq__"; static SpanUnit intervalUnitToSpanUnit(IntervalUnit unit) { SpanUnit result; @@ -273,6 +292,9 @@ static RelBuilder.AggCall makeAggCall( /** Get all uniq input references from a RexNode. */ static List<Integer> getInputRefs(RexNode node) { + if (node == null) { + return List.of(); + } List<Integer> inputRefs = new ArrayList<>(); node.accept( new RexVisitorImpl<Void>(true) { @@ -289,7 +311,27 @@ public Void visitInputRef(RexInputRef inputRef) { /** Get all uniq input references from a list of RexNodes. */ static List<Integer> getInputRefs(List<RexNode> nodes) { - return nodes.stream().flatMap(node -> getInputRefs(node).stream()).collect(Collectors.toList()); + return nodes.stream().flatMap(node -> getInputRefs(node).stream()).toList(); + } + + /** Get all unique RexCalls from a RexNode that match a predicate. */ + static List<RexCall> getRexCall(RexNode node, Predicate<RexCall> predicate) { + List<RexCall> list = new ArrayList<>(); + node.accept( + new RexVisitorImpl<Void>(true) { + @Override + public Void visitCall(RexCall inputCall) { + if (predicate.test(inputCall)) { + if (!list.contains(inputCall)) { + list.add(inputCall); + } + } else { + inputCall.getOperands().forEach(call -> call.accept(this)); + } + return null; + } + }); + return list; } /** Get all uniq input references from a list of agg calls. */ @@ -298,7 +340,7 @@ static List<Integer> getInputRefsFromAggCall(List<RelBuilder.AggCall> aggCal .map(RelBuilder.AggCall::over) .map(RelBuilder.OverCall::toRex) .flatMap(rex -> getInputRefs(rex).stream()) - .collect(Collectors.toList()); + .toList(); } /** @@ -375,4 +417,134 @@ static RexNode derefMapCall(RexNode rexNode) { } return rexNode; } + + /** Check if the project contains a RexOver introduced by dedup. */ + static boolean containsRowNumberDedup(LogicalProject project) { + return project.getProjects().stream() + .anyMatch(p -> p instanceof RexOver && p.getKind() == SqlKind.ROW_NUMBER) + && project.getRowType().getFieldNames().contains(ROW_NUMBER_COLUMN_FOR_DEDUP); + } + + /** Check if the project contains a RexOver introduced by top/rare. */ + static boolean containsRowNumberRareTop(LogicalProject project) { + return project.getProjects().stream() + .anyMatch(p -> p instanceof RexOver && p.getKind() == SqlKind.ROW_NUMBER) + && project.getRowType().getFieldNames().contains(ROW_NUMBER_COLUMN_FOR_RARE_TOP); + } + + /** Get the list of all RexWindows from a LogicalProject. */ + static List<RexWindow> getRexWindowFromProject(LogicalProject project) { + final List<RexWindow> res = new ArrayList<>(); + final RexVisitorImpl<Void> visitor = + new RexVisitorImpl<>(true) { + @Override + public Void visitOver(RexOver over) { + res.add(over.getWindow()); + return null; + } + }; + visitor.visitEach(project.getProjects()); + return res; + } + + static List<Integer> getSelectColumns(List<RexNode> rexNodes) { + final List<Integer> selectedColumns = new ArrayList<>(); + final RexVisitorImpl<Void> visitor = + new RexVisitorImpl<>(true) { + @Override + public Void visitInputRef(RexInputRef inputRef) { + if (!selectedColumns.contains(inputRef.getIndex())) { + selectedColumns.add(inputRef.getIndex()); + } + return null; + } + }; + visitor.visitEach(rexNodes); + return selectedColumns; + } + + // `RelDecorrelator` may generate a Project with duplicated fields, e.g. Project($0,$0). + // There will be a problem if pushing down a pattern like `Aggregate(AGG($0),{1})-Project($0,$0)`, + // as it will lead to a field-name conflict.
+ // We should wait and rely on `AggregateProjectMergeRule` to mitigate it by having this constraint. + // Nevertheless, that rule cannot handle all cases if there is a RexCall in the Project, + // e.g. Project($0, $0, +($0,1)). We cannot push down the Aggregate for this corner case. + // TODO: Simplify the Project where there is a RexCall by adding a new rule. + static boolean distinctProjectList(LogicalProject project) { + // Change to Set<Pair<RexNode, String>> to resolve + // https://github.com/opensearch-project/sql/issues/4347 + Set<Pair<RexNode, String>> rexSet = new HashSet<>(); + return project.getNamedProjects().stream().allMatch(rexSet::add); + } + + static boolean containsRexOver(LogicalProject project) { + return project.getProjects().stream().anyMatch(RexOver::containsOver); + } + + /** + * The LogicalSort is a LIMIT that should be pushed down when its fetch field is not null and its + * collation is empty. For example: sort name | head 5 should not be pushed down + * because it has a field collation. + * + * @param sort The LogicalSort to check. + * @return True if the LogicalSort is a LIMIT, false otherwise. + */ + static boolean isLogicalSortLimit(LogicalSort sort) { + return sort.fetch != null; + } + + static boolean projectContainsExpr(Project project) { + return project.getProjects().stream().anyMatch(p -> p instanceof RexCall); + } + + static boolean sortByFieldsOnly(Sort sort) { + return !sort.getCollation().getFieldCollations().isEmpty() && sort.fetch == null; + } + + /** + * Get a string representation of the argument types expressed in ExprType for error messages. + * + * @param argTypes the list of argument types as {@link RelDataType} + * @return a string in the format [type1,type2,...] representing the argument types + */ + static String getActualSignature(List<RelDataType> argTypes) { + return "[" + + argTypes.stream() + .map(OpenSearchTypeFactory::convertRelDataTypeToExprType) + .map(Objects::toString) + .collect(Collectors.joining(",")) + + "]"; + } + + /** + * Check if the RexNode contains any CorrelVariable. + * + * @param node the RexNode to check + * @return true if the RexNode contains any CorrelVariable, false otherwise + */ + static boolean containsCorrelVariable(RexNode node) { + try { + node.accept( + new RexVisitorImpl<Void>(true) { + @Override + public Void visitCorrelVariable(RexCorrelVariable correlVar) { + throw new RuntimeException("Correl found"); + } + }); + return false; + } catch (Exception e) { + return true; + } + } + + /** Adds a rel node to the top of the stack while preserving the field names and aliases.
*/ + static void replaceTop(RelBuilder relBuilder, RelNode relNode) { + try { + Method method = RelBuilder.class.getDeclaredMethod("replaceTop", RelNode.class); + method.setAccessible(true); + method.invoke(relBuilder, relNode); + } catch (Exception e) { + throw new IllegalStateException("Unable to invoke RelBuilder.replaceTop", e); + } + } } diff --git a/docs/category.json b/docs/category.json index a33cac1d17c..7ebe643373b 100644 --- a/docs/category.json +++ b/docs/category.json @@ -1,38 +1,62 @@ { "bash": [ - "user/ppl/interfaces/endpoint.rst", - "user/ppl/interfaces/protocol.rst", - "user/ppl/admin/settings.rst", "user/optimization/optimization.rst", "user/admin/settings.rst" ], - "ppl_cli": [ + "bash_calcite": [ + "user/ppl/interfaces/endpoint.rst", + "user/ppl/interfaces/protocol.rst" + ], + "sql_cli": [ + "user/dql/expressions.rst", + "user/general/comments.rst", + "user/general/datatypes.rst", + "user/general/identifiers.rst", + "user/general/values.rst", + "user/dql/basics.rst", + "user/dql/functions.rst", + "user/dql/window.rst", + "user/beyond/partiql.rst", + "user/dql/aggregations.rst", + "user/dql/complex.rst", + "user/dql/metadata.rst" + ], + "ppl_cli_calcite": [ "user/ppl/cmd/ad.rst", + "user/ppl/cmd/append.rst", + "user/ppl/cmd/bin.rst", "user/ppl/cmd/dedup.rst", "user/ppl/cmd/describe.rst", - "user/ppl/cmd/showdatasources.rst", - "user/ppl/cmd/information_schema.rst", + "user/ppl/cmd/eventstats.rst", "user/ppl/cmd/eval.rst", "user/ppl/cmd/fields.rst", "user/ppl/cmd/fillnull.rst", "user/ppl/cmd/grok.rst", "user/ppl/cmd/head.rst", + "user/ppl/cmd/join.rst", + "user/ppl/cmd/lookup.rst", "user/ppl/cmd/parse.rst", "user/ppl/cmd/patterns.rst", "user/ppl/cmd/rare.rst", + "user/ppl/cmd/regex.rst", "user/ppl/cmd/rename.rst", + "user/ppl/cmd/multisearch.rst", + "user/ppl/cmd/replace.rst", + "user/ppl/cmd/rex.rst", "user/ppl/cmd/search.rst", + "user/ppl/cmd/showdatasources.rst", "user/ppl/cmd/sort.rst", "user/ppl/cmd/stats.rst", + "user/ppl/cmd/streamstats.rst", + "user/ppl/cmd/subquery.rst", "user/ppl/cmd/syntax.rst", - "user/ppl/cmd/trendline.rst", + "user/ppl/cmd/timechart.rst", + "user/ppl/cmd/search.rst", + "user/ppl/functions/statistical.rst", "user/ppl/cmd/top.rst", + "user/ppl/cmd/trendline.rst", "user/ppl/cmd/where.rst", - "user/ppl/cmd/join.rst", - "user/ppl/cmd/lookup.rst", - "user/ppl/cmd/subquery.rst", - "user/ppl/general/identifiers.rst", - "user/ppl/general/datatypes.rst", + "user/ppl/functions/collection.rst", "user/ppl/functions/condition.rst", "user/ppl/functions/datetime.rst", "user/ppl/functions/expressions.rst", @@ -40,19 +64,12 @@ "user/ppl/functions/json.rst", "user/ppl/functions/math.rst", "user/ppl/functions/relevance.rst", - "user/ppl/functions/string.rst" + "user/ppl/functions/string.rst", + "user/ppl/functions/conversion.rst", + "user/ppl/general/datatypes.rst", + "user/ppl/general/identifiers.rst" ], - "sql_cli": [ - "user/dql/expressions.rst", - "user/general/comments.rst", - "user/general/datatypes.rst", - "user/general/identifiers.rst", - "user/general/values.rst", - "user/dql/basics.rst", - "user/dql/functions.rst", - "user/dql/window.rst", - "user/beyond/partiql.rst", - "user/dql/aggregations.rst", - "user/dql/complex.rst" + "bash_settings": [ + "user/ppl/admin/settings.rst" ] } diff --git a/docs/user/ppl/cmd/streamstats.rst b/docs/user/ppl/cmd/streamstats.rst new file mode 100644 index 00000000000..0ac18637fec --- /dev/null +++ b/docs/user/ppl/cmd/streamstats.rst @@ -0,0 +1,229 @@ +=========== +streamstats +=========== + +.. 
rubric:: Table of contents + +.. contents:: + :local: + :depth: 2 + + +Description +=========== +The ``streamstats`` command is used to calculate cumulative or rolling statistics as events are processed in order. Unlike ``stats`` or ``eventstats``, which operate on the entire dataset at once, it computes values incrementally on a per-event basis, often respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events. + +Key aspects of ``streamstats``: + +1. It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis. +2. Supports arguments such as window (for sliding window calculations) and current (to control whether the current event is included in the calculation). +3. Retains all original events and appends new fields containing the calculated statistics. +4. Particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events. + +Difference between ``stats``, ``eventstats`` and ``streamstats`` + +All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce: + +* Transformation Behavior: + * ``stats``: Transforms all events into an aggregated result table, losing the original event structure. + * ``eventstats``: Adds aggregation results as new fields to the original events without removing the event structure. + * ``streamstats``: Adds cumulative (running) aggregation results to each event as they stream through the pipeline. +* Output Format: + * ``stats``: Output contains only aggregated values. Original raw events are not preserved. + * ``eventstats``: Original events remain, with extra fields containing summary statistics. + * ``streamstats``: Original events remain, with extra fields containing running totals or cumulative statistics. +* Aggregation Scope: + * ``stats``: Based on all events in the search (or groups defined by the BY clause). + * ``eventstats``: Based on all relevant events, then the result is added back to each event in the group. + * ``streamstats``: Calculations occur progressively as each event is processed; can be scoped by window. +* Use Cases: + * ``stats``: When only aggregated results are needed (e.g., counts, averages, sums). + * ``eventstats``: When aggregated statistics are needed alongside original event data. + * ``streamstats``: When a running total or cumulative statistic is needed across event streams. + +Syntax +====== +streamstats [current=<boolean>] [window=<int>] [global=<boolean>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <function>... [by-clause] + +* function: mandatory. An aggregation function or window function. +* current: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=<boolean>. **Default:** true. +* window: optional. Specifies the number of events to use when computing the statistics. Syntax: window=<int>. **Default:** 0, which means that all previous and current events are used. +* global: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=<boolean>. **Default:** true.
+* reset_before: optional. Before streamstats calculates statistics for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("<eval-expression>")". **Default:** false. +* reset_after: optional. After streamstats calculates statistics for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. Syntax: reset_after="("<eval-expression>")". **Default:** false. +* by-clause: optional. The by clause can contain fields and expressions such as scalar functions and aggregation functions. In addition, the span clause can be used to split a specific field into buckets of equal intervals; the statistics are then aggregated per span bucket. Syntax: by [span-expression,] [field,]... **Default:** If no <by-clause> is specified, all events are processed as a single group and running statistics are computed across the entire event stream. +* span-expression: optional, at most one. Splits a field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, ``span(age, 10)`` creates 10-year age buckets, and ``span(timestamp, 1h)`` creates hourly buckets. + * Available time units: + * millisecond (ms) + * second (s) + * minute (m, case sensitive) + * hour (h) + * day (d) + * week (w) + * month (M, case sensitive) + * quarter (q) + * year (y) + +Aggregation Functions +===================== + +The streamstats command supports the following aggregation functions: + +* COUNT: Count of values +* SUM: Sum of numeric values +* AVG: Average of numeric values +* MAX: Maximum value +* MIN: Minimum value +* VAR_SAMP: Sample variance +* VAR_POP: Population variance +* STDDEV_SAMP: Sample standard deviation +* STDDEV_POP: Population standard deviation +* DISTINCT_COUNT/DC: Distinct count of values +* EARLIEST: Earliest value by timestamp +* LATEST: Latest value by timestamp + +For detailed documentation of each function, see `Aggregation Functions <../functions/aggregation.rst>`_. + +Usage +===== + +Streamstats:: + + source = table | streamstats avg(a) + source = table | streamstats current = false avg(a) + source = table | streamstats window = 5 sum(b) + source = table | streamstats current = false window = 2 max(a) + source = table | where a < 50 | streamstats count(c) + source = table | streamstats min(c), max(c) by b + source = table | streamstats count(c) as count_by by b | where count_by > 1000 + source = table | streamstats dc(field) as distinct_count + source = table | streamstats distinct_count(category) by region + source = table | streamstats current=false window=2 global=false avg(a) by b + source = table | streamstats window=2 reset_before=a>31 avg(b) + source = table | streamstats current=false reset_after=a>31 avg(b) by c + + +Example 1: Calculate the running average, sum, and count of a field by group +============================================================================ + +This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender.
+ +PPL query:: + + os> source=accounts | streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender; + fetched rows / total rows = 4/4 + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | + |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ + + +Example 2: Running maximum age over a 2-row window +================================================== + +This example calculates the running maximum age over a 2-row window, excluding the current event. + +PPL query:: + + os> source=state_country | streamstats current=false window=2 max(age) as prev_max_age + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+--------------+ + | name | country | state | month | year | age | prev_max_age | + |-------+---------+------------+-------+------+-----+--------------| + | Jake | USA | California | 4 | 2023 | 70 | null | + | Hello | USA | New York | 4 | 2023 | 30 | 70 | + | John | Canada | Ontario | 4 | 2023 | 25 | 70 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 25 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 27 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 57 | + | David | USA | Washington | 4 | 2023 | 40 | 70 | + +-------+---------+------------+-------+------+-----+--------------+ + + +Example 3: Use the global argument to calculate running statistics +================================================================== + +The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields: + +* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups. +* global=false: the window itself is created per group, meaning each group gets its own independent window. + +This example shows how to calculate the running average of age across accounts by country, using global argument. 
+ +original data:: + + +-------+---------+------------+-------+------+-----+ + | name | country | state | month | year | age | + |-------+---------+------------+-------+------+-----+ + | Jake | USA | California | 4 | 2023 | 70 | + | Hello | USA | New York | 4 | 2023 | 30 | + | John | Canada | Ontario | 4 | 2023 | 25 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | + | Jim | Canada | B.C | 4 | 2023 | 27 | + | Peter | Canada | B.C | 4 | 2023 | 57 | + | Rick | Canada | B.C | 4 | 2023 | 70 | + | David | USA | Washington | 4 | 2023 | 40 | + +-------+---------+------------+-------+------+-----+ + +* global=true: The window slides across all rows globally (following their input order), but inside each window, aggregation is still computed by country. So we process the data stream row by row to build the sliding window with size 2. We can see that David and Rick are in a window. +* global=false: Each by group (country) forms its own independent stream and window (size 2). So David and Hello are in one window for USA. This time we get running_avg 35 for David, rather than 40 when global is set true. + +PPL query:: + + os> source=state_country | streamstats window=2 global=true avg(age) as running_avg by country ; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+-------------+ + | name | country | state | month | year | age | running_avg | + |-------+---------+------------+-------+------+-----+-------------| + | Jake | USA | California | 4 | 2023 | 70 | 70.0 | + | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | + | David | USA | Washington | 4 | 2023 | 40 | 40.0 | + +-------+---------+------------+-------+------+-----+-------------+ + + os> source=state_country | streamstats window=2 global=false avg(age) as running_avg by country ; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+-------------+ + | name | country | state | month | year | age | running_avg | + |-------+---------+------------+-------+------+-----+-------------| + | Jake | USA | California | 4 | 2023 | 70 | 70.0 | + | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | + | David | USA | Washington | 4 | 2023 | 40 | 35.0 | + +-------+---------+------------+-------+------+-----+-------------+ + + +Example 4: Use the reset_before and reset_after arguments to reset statistics +============================================================================= + +This example calculates the running average of age across accounts by country, with resets applied. 
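+ +To trace the resets in the result below: in the USA group, Jake (age 70) triggers reset_before (70>34), so his avg_age is null, and Hello (age 30) is then averaged against a state holding only Jake (because current=false), giving 70.0; David (age 40) triggers reset_before again and also gets null. In the Canada group, Jane (age 20) triggers reset_after (20<25), which clears the state once her row is emitted, so Jim (age 27) starts from an empty state and gets null as well.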
+ +PPL query:: + + os> source=state_country | streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+---------+ + | name | country | state | month | year | age | avg_age | + |-------+---------+------------+-------+------+-----+---------| + | Jake | USA | California | 4 | 2023 | 70 | null | + | Hello | USA | New York | 4 | 2023 | 30 | 70.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | null | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | + | Jim | Canada | B.C | 4 | 2023 | 27 | null | + | Peter | Canada | B.C | 4 | 2023 | 57 | null | + | Rick | Canada | B.C | 4 | 2023 | 70 | null | + | David | USA | Washington | 4 | 2023 | 40 | null | + +-------+---------+------------+-------+------+-----+---------+ \ No newline at end of file diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst index bcdbdad3e94..697ec7e2c6e 100644 --- a/docs/user/ppl/index.rst +++ b/docs/user/ppl/index.rst @@ -40,59 +40,95 @@ The query start with search command and then flowing a set of command delimited - `Cross-Cluster Search `_ +* **Language Structure** + + - `Identifiers `_ + + - `Data Types `_ + * **Commands** - `Syntax `_ - `ad command `_ + - `append command `_ + + - `appendcol command `_ + + - `bin command `_ + - `dedup command `_ - `describe command `_ - - `show datasources command `_ - - `eval command `_ + - `eventstats command `_ + + - `expand command `_ + + - `explain command `_ + - `fields command `_ + - `fillnull command `_ + + - `flatten command `_ + - `grok command `_ + - `head command `_ + + - `join command `_ + - `kmeans command `_ + - `lookup command `_ + - `ml command `_ + - `multisearch command `_ + - `parse command `_ - `patterns command `_ + - `rare command `_ + - `rename command `_ + - `regex command `_ + + - `rex command `_ + - `search command `_ + - `show datasources command `_ + - `sort command `_ + - `spath command `_ + - `stats command `_ - - `trendline command `_ + - `streamstats command `_ - - `where command `_ + - `subquery (aka subsearch) command `_ - - `head command `_ + - `reverse command `_ + + - `table command `_ - - `rare command `_ + - `timechart command `_ - `top command `_ - - `metadata commands `_ - - - `(Experimental)(From 3.0.0) join command `_ - - - `(Experimental)(From 3.0.0) lookup command `_ + - `trendline command `_ - - `(Experimental)(From 3.0.0) subquery (aka subsearch) command `_ + - `replace command `_ - - `(Experimental)(From 3.1.0) eventstats command `_ + - `where command `_ * **Functions** @@ -114,15 +150,15 @@ The query start with search command and then flowing a set of command delimited - `IP Address Functions `_ -* **Optimization** + - `Collection Functions `_ - - `Optimization <../../user/optimization/optimization.rst>`_ + - `Cryptographic Functions `_ -* **Language Structure** + - `JSON Functions `_ - - `Identifiers `_ +* **Optimization** - - `Data Types `_ + - `Optimization <../../user/optimization/optimization.rst>`_ * **Limitations** diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index 0597f36b007..620e0e4a971 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -64,6 +64,7 @@ CalcitePPLCryptographicFunctionIT.class, CalcitePPLDedupIT.class, CalcitePPLEventstatsIT.class, + 
CalciteStreamstatsCommandIT.class, CalcitePPLExistsSubqueryIT.class, CalcitePPLExplainIT.class, CalcitePPLFillnullIT.class, diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index 5fab2fade3f..77f3a45cc07 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -5,11 +5,23 @@ package org.opensearch.sql.calcite.remote; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_ACCOUNT; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_BANK; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_LOGS; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_NESTED_SIMPLE; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_STRINGS; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_TIME_DATA; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WEBLOGS; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORKER; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORK_INFORMATION; import static org.opensearch.sql.util.MatcherUtils.assertJsonEqualsIgnoreId; +import static org.opensearch.sql.util.MatcherUtils.assertYamlEqualsIgnoreId; import java.io.IOException; +import java.util.Locale; import org.junit.Ignore; import org.junit.Test; +import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.ppl.ExplainIT; public class CalciteExplainIT extends ExplainIT { @@ -17,7 +29,16 @@ public class CalciteExplainIT extends ExplainIT { public void init() throws Exception { super.init(); enableCalcite(); - disallowCalciteFallback(); + setQueryBucketSize(1000); + loadIndex(Index.BANK_WITH_STRING_VALUES); + loadIndex(Index.NESTED_SIMPLE); + loadIndex(Index.TIME_TEST_DATA); + loadIndex(Index.TIME_TEST_DATA2); + loadIndex(Index.EVENTS); + loadIndex(Index.LOGS); + loadIndex(Index.WORKER); + loadIndex(Index.WORK_INFORMATION); + loadIndex(Index.WEBLOG); } @Override @@ -29,9 +50,9 @@ public void testExplainModeUnsupportedInV2() throws IOException {} public void supportSearchSargPushDown_singleRange() throws IOException { String query = "source=opensearch-sql_test_index_account | where age >= 1.0 and age < 10 | fields age"; - var result = explainQueryToString(query); - String expected = loadExpectedPlan("explain_sarg_filter_push_single_range.json"); - assertJsonEqualsIgnoreId(expected, result); + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_sarg_filter_push_single_range.yaml"); + assertYamlEqualsIgnoreId(expected, result); } // Only for Calcite @@ -48,12 +69,1495 @@ public void supportSearchSargPushDown_multiRange() throws IOException { // Only for Calcite @Test public void supportSearchSargPushDown_timeRange() throws IOException { - String expected = loadExpectedPlan("explain_sarg_filter_push_time_range.json"); + String query = + "source=opensearch-sql_test_index_bank" + + "| where birthdate >= '2016-12-08 00:00:00.000000000' " + + "and birthdate < '2018-11-09 00:00:00.000000000'"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_sarg_filter_push_time_range.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Ignore("https://github.com/opensearch-project/OpenSearch/issues/3725") + public void 
testJoinWithCriteriaAndMaxOption() throws IOException { + String query = + "source=opensearch-sql_test_index_bank | join max=1 left=l right=r on" + + " l.account_number=r.account_number opensearch-sql_test_index_bank"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_join_with_criteria_max_option.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Ignore("https://github.com/opensearch-project/OpenSearch/issues/3725") + public void testJoinWithFieldListAndMaxOption() throws IOException { + String query = + "source=opensearch-sql_test_index_bank | join type=inner max=1 account_number" + + " opensearch-sql_test_index_bank"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_join_with_fields_max_option.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Test + public void testJoinWithFieldList() throws IOException { + String query = + "source=opensearch-sql_test_index_bank | join type=outer account_number" + + " opensearch-sql_test_index_bank"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_join_with_fields.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainExistsUncorrelatedSubquery() throws IOException { + String expected = loadExpectedPlan("explain_exists_uncorrelated_subquery.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| where exists [" + + " source = %s | where name = 'Tom'" + + " ]" + + "| sort - salary" + + "| fields id, name, salary", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void testExplainExistsCorrelatedSubquery() throws IOException { + String expected = loadExpectedPlan("explain_exists_correlated_subquery.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| where exists [" + + " source = %s | where id = uid and name = 'Tom'" + + " ]" + + "| sort - salary" + + "| fields id, name, salary", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void testExplainInUncorrelatedSubquery() throws IOException { + String expected = loadExpectedPlan("explain_in_uncorrelated_subquery.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| where id in [" + + " source = %s | fields uid" + + " ]" + + "| sort - salary" + + "| fields id, name, salary", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void testExplainInCorrelatedSubquery() throws IOException { + String expected = loadExpectedPlan("explain_in_correlated_subquery.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| where name in [" + + " source = %s | where id = uid and name = 'Tom' | fields name" + + " ]" + + "| sort - salary | fields id, name, salary", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void testExplainScalarUncorrelatedSubqueryInSelect() throws IOException { + String expected = loadExpectedPlan("explain_scalar_uncorrelated_subquery_in_select.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| eval count_dept = [" + + " source = %s | stats count(name)" + + " ]" + + "| fields name, count_dept", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void 
testExplainScalarUncorrelatedSubqueryInWhere() throws IOException { + String expected = loadExpectedPlan("explain_scalar_uncorrelated_subquery_in_where.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| where id > [" + + " source = %s | stats count(name)" + + " ] + 999" + + "| fields name", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void testExplainScalarCorrelatedSubqueryInSelect() throws IOException { + String expected = loadExpectedPlan("explain_scalar_correlated_subquery_in_select.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| eval count_dept = [" + + " source = %s" + + " | where id = uid | stats count(name)" + + " ]" + + "| fields id, name, count_dept", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION))); + } + + @Test + public void testExplainScalarCorrelatedSubqueryInWhere() throws IOException { + String expected = loadExpectedPlan("explain_scalar_correlated_subquery_in_where.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source = %s" + + "| where id = [" + + " source = %s | where id = uid | stats max(uid)" + + " ]" + + "| fields id, name", + TEST_INDEX_WORKER, TEST_INDEX_WORK_INFORMATION))); + } + + // Only for Calcite + @Test + public void supportPushDownSortMergeJoin() throws IOException { + String query = + "source=opensearch-sql_test_index_bank| join left=l right=r on" + + " l.account_number=r.account_number opensearch-sql_test_index_bank"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_merge_join_sort_push.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Ignore("We've supported script push down on text field") + @Test + public void supportPartialPushDown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + // field `address` is text type without keyword subfield, so we cannot push it down. + String query = + "source=opensearch-sql_test_index_account | where (state = 'Seattle' or age < 10) and (age" + + " >= 1 and address = '880 Holmes Lane') | fields age, address"; + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_partial_filter_push.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Ignore("We've supported script push down on text field") + @Test + public void supportPartialPushDown_NoPushIfAllFailed() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + // field `address` is text type without keyword subfield, so we cannot push it down. 
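+ // (Text fields are analyzed at index time; without a keyword subfield there is no exact value to match, so the equality filter cannot be pushed down as a term query.)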
+ String query = + "source=opensearch-sql_test_index_account | where (address = '671 Bristol Street' or age <" + + " 10) and (age >= 10 or address = '880 Holmes Lane') | fields age, address"; + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_partial_filter_push2.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Test + public void testExplainIsEmpty() throws IOException { + // script pushdown + String expected = loadExpectedPlan("explain_isempty.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=opensearch-sql_test_index_account | where isempty(firstname)")); + } + + @Test + public void testExplainMultisearchBasic() throws IOException { + String query = + "| multisearch [search" + + " source=opensearch-sql_test_index_account | where age < 30 | eval age_group =" + + " 'young'] [search source=opensearch-sql_test_index_account | where age >= 30 | eval" + + " age_group = 'adult'] | stats count by age_group"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_multisearch_basic.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainMultisearchTimestampInterleaving() throws IOException { + String query = + "| multisearch " + + "[search source=opensearch-sql_test_index_time_data | where category IN ('A', 'B')] " + + "[search source=opensearch-sql_test_index_time_data2 | where category IN ('E', 'F')] " + + "| head 5"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_multisearch_timestamp.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Test + public void testExplainIsBlank() throws IOException { + // script pushdown + String expected = loadExpectedPlan("explain_isblank.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=opensearch-sql_test_index_account | where isblank(firstname)")); + } + + // Only for Calcite + @Test + public void testExplainIsEmptyOrOthers() throws IOException { + // script pushdown + String expected = loadExpectedPlan("explain_isempty_or_others.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | where gender = 'M' or isempty(firstname) or" + + " isnull(firstname)")); + } + + // Only for Calcite + @Test + public void testExplainIsNullOrOthers() throws IOException { + // pushdown should work + String expected = loadExpectedPlan("explain_isnull_or_others.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | where isnull(firstname) or gender = 'M'")); + } + + @Ignore("We've supported script push down on text field") + @Test + public void supportPartialPushDownScript() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + // field `address` is text type without keyword subfield, so we cannot push it down. + // But the second condition can be translated to script, so the second one is pushed down. 
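+ // (Here, `age - 2 = 30` can be compiled into a script query and pushed down, while the text-field equality is left to be evaluated in the engine.)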
+ String query = + "source=opensearch-sql_test_index_account | where address = '671 Bristol Street' and age -" + + " 2 = 30 | fields firstname, age, address"; + var result = explainQueryToString(query); + String expected = + loadFromFile("expectedOutput/calcite/explain_partial_filter_script_push.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testPartialPushdownFilterWithIsNull() throws IOException { + // isnull(nested_field) should not be pushed down since DSL doesn't handle it correctly, but + // name='david' can be pushed down + String query = + String.format( + Locale.ROOT, + "source=%s | where isnull(address) and name='david'", + TEST_INDEX_NESTED_SIMPLE); + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_partial_filter_isnull.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testSkipScriptEncodingOnExtendedFormat() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + "source=opensearch-sql_test_index_account | where address = '671 Bristol Street' and age -" + + " 2 = 30 | fields firstname, age, address"; + var result = explainQueryToString(query, true); + String expected = loadFromFile("expectedOutput/calcite/explain_skip_script_encoding.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Only for Calcite, as v2 gets unstable serialized string for function + @Test + public void testFilterScriptPushDownExplain() throws Exception { + super.testFilterScriptPushDownExplain(); + } + + // Only for Calcite, as v2 gets unstable serialized string for function + @Test + public void testFilterFunctionScriptPushDownExplain() throws Exception { + super.testFilterFunctionScriptPushDownExplain(); + } + + @Test + public void testFilterWithSearchCall() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_filter_with_search.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | where birthdate >= '2023-01-01 00:00:00' and birthdate < '2023-01-03" + + " 00:00:00' | stats count() by span(birthdate, 1d)", + TEST_INDEX_BANK))); + } + + @Test + public void testExplainWithReverse() throws IOException { + String result = + executeWithReplace( + "explain source=opensearch-sql_test_index_account | sort age | reverse | head 5"); + + // Verify that the plan contains a LogicalSort with fetch (from head 5) + assertTrue(result.contains("LogicalSort") && result.contains("fetch=[5]")); + + // Verify that reverse added a ROW_NUMBER and another sort (descending) + assertTrue(result.contains("ROW_NUMBER()")); + assertTrue(result.contains("dir0=[DESC]")); + } + + @Test + public void testExplainWithTimechartAvg() throws IOException { + var result = explainQueryYaml("source=events | timechart span=1m avg(cpu_usage) by host"); + String expected = loadExpectedPlan("explain_timechart.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainWithTimechartCount() throws IOException { + var result = explainQueryYaml("source=events | timechart span=1m count() by host"); + String expected = loadExpectedPlan("explain_timechart_count.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainTimechartPerSecond() throws IOException { + var result = explainQueryToString("source=events | timechart span=2m per_second(cpu_usage)"); + assertTrue( + result.contains( + "per_second(cpu_usage)=[DIVIDE(*($1, 1000.0E0), 
TIMESTAMPDIFF('MILLISECOND':VARCHAR," + + " $0, TIMESTAMPADD('MINUTE':VARCHAR, 2, $0)))]")); + assertTrue(result.contains("per_second(cpu_usage)=[SUM($0)]")); + } + + @Test + public void testExplainTimechartPerMinute() throws IOException { + var result = explainQueryToString("source=events | timechart span=2m per_minute(cpu_usage)"); + assertTrue( + result.contains( + "per_minute(cpu_usage)=[DIVIDE(*($1, 60000.0E0), TIMESTAMPDIFF('MILLISECOND':VARCHAR," + + " $0, TIMESTAMPADD('MINUTE':VARCHAR, 2, $0)))]")); + assertTrue(result.contains("per_minute(cpu_usage)=[SUM($0)]")); + } + + @Test + public void testExplainTimechartPerHour() throws IOException { + var result = explainQueryToString("source=events | timechart span=2m per_hour(cpu_usage)"); + assertTrue( + result.contains( + "per_hour(cpu_usage)=[DIVIDE(*($1, 3600000.0E0), TIMESTAMPDIFF('MILLISECOND':VARCHAR," + + " $0, TIMESTAMPADD('MINUTE':VARCHAR, 2, $0)))]")); + assertTrue(result.contains("per_hour(cpu_usage)=[SUM($0)]")); + } + + @Test + public void testExplainTimechartPerDay() throws IOException { + var result = explainQueryToString("source=events | timechart span=2m per_day(cpu_usage)"); + assertTrue( + result.contains( + "per_day(cpu_usage)=[DIVIDE(*($1, 8.64E7), TIMESTAMPDIFF('MILLISECOND':VARCHAR, $0," + + " TIMESTAMPADD('MINUTE':VARCHAR, 2, $0)))]")); + assertTrue(result.contains("per_day(cpu_usage)=[SUM($0)]")); + } + + @Test + public void noPushDownForAggOnWindow() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + "source=opensearch-sql_test_index_account | patterns address method=BRAIN | stats count()" + + " by patterns_field"; + var result = explainQueryYaml(query); + String expected = loadFromFile("expectedOutput/calcite/explain_agg_on_window.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Test + public void supportPushDownScriptOnTextField() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String result = + explainQueryYaml( + "explain source=opensearch-sql_test_index_account | where length(address) > 0 | eval" + + " address_length = length(address) | stats count() by address_length"); + String expected = loadFromFile("expectedOutput/calcite/explain_script_push_on_text.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainBinWithBins() throws IOException { + String expected = loadExpectedPlan("explain_bin_bins.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString("source=opensearch-sql_test_index_account | bin age bins=3 | head 5")); + } + + @Test + public void testExplainStatsWithBinsOnTimeField() throws IOException { + // TODO: Remove this after addressing https://github.com/opensearch-project/sql/issues/4317 + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_stats_bins_on_time.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=events | bin @timestamp bins=3 | stats count() by @timestamp")); + + expected = loadExpectedPlan("explain_stats_bins_on_time2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=events | bin @timestamp bins=3 | stats avg(cpu_usage) by @timestamp")); + } + + @Test + public void testExplainStatsWithSubAggregation() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_stats_bins_on_time_and_term.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=events | bin @timestamp bins=3 | stats bucket_nullable=false 
count() by" + + " @timestamp, region")); + + expected = loadExpectedPlan("explain_stats_bins_on_time_and_term2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=events | bin @timestamp bins=3 | stats bucket_nullable=false avg(cpu_usage) by" + + " @timestamp, region")); + } + + @Test + public void testExplainBinWithSpan() throws IOException { + String expected = loadExpectedPlan("explain_bin_span.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=opensearch-sql_test_index_account | bin age span=10 | head 5")); + } + + @Test + public void testExplainBinWithMinspan() throws IOException { + String expected = loadExpectedPlan("explain_bin_minspan.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | bin age minspan=5 | head 5")); + } + + @Test + public void testExplainBinWithStartEnd() throws IOException { + String expected = loadExpectedPlan("explain_bin_start_end.json"); assertJsonEqualsIgnoreId( expected, explainQueryToString( - "source=opensearch-sql_test_index_bank" - + "| where birthdate >= '2016-12-08 00:00:00.000000000' " - + "and birthdate < '2018-11-09 00:00:00.000000000' ")); + "source=opensearch-sql_test_index_account | bin balance start=0 end=100001 | head 5")); + } + + @Test + public void testExplainBinWithAligntime() throws IOException { + String expected = loadExpectedPlan("explain_bin_aligntime.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_time_data | bin @timestamp span=2h aligntime=latest |" + + " head 5")); + } + + @Test + public void testExplainCountEval() throws IOException { + String query = + "source=opensearch-sql_test_index_bank | stats count(eval(age > 30)) as mature_count"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_count_eval_push.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainCountEvalComplex() throws IOException { + String query = + "source=opensearch-sql_test_index_bank | stats count(eval(age > 30 and age < 50)) as" + + " mature_count"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_count_eval_complex_push.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testEventstatsDistinctCountExplain() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + "source=opensearch-sql_test_index_account | eventstats dc(state) as distinct_states"; + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_eventstats_dc.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testEventstatsDistinctCountFunctionExplain() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + "source=opensearch-sql_test_index_account | eventstats distinct_count(state) as" + + " distinct_states by gender"; + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_eventstats_distinct_count.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsDistinctCountExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats dc(state) as distinct_states"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_dc.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test 
+ public void testStreamstatsDistinctCountFunctionExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats distinct_count(state) as" + + " distinct_states by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_distinct_count.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsGlobalExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats window=2 global=true avg(age) as" + + " avg_age by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_global.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsResetExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats current=false reset_before=age>34" + + " reset_after=age<25 avg(age) as avg_age by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_reset.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + // Only for Calcite, as v2 gets unstable serialized string for function + @Test + public void testExplainOnAggregationWithSumEnhancement() throws IOException { + String expected = loadExpectedPlan("explain_agg_with_sum_enhancement.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats sum(balance), sum(balance + 100), sum(balance - 100)," + + " sum(balance * 100), sum(balance / 100) by gender", + TEST_INDEX_BANK))); + } + + @Test + public void testStatsDistinctCountApproxFunctionExplainWithPushDown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + "source=opensearch-sql_test_index_account | stats distinct_count_approx(state) as" + + " distinct_states by gender"; + var result = explainQueryToString(query); + String expected = + loadFromFile( + "expectedOutput/calcite/explain_agg_with_distinct_count_approx_enhancement.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainRegexMatchInWhereWithScriptPushdown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + String.format("source=%s | where regex_match(name, 'hello')", TEST_INDEX_STRINGS); + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_regex_match_in_where.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainRegexMatchInEvalWithOutScriptPushdown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String query = + String.format( + "source=%s |eval has_hello = regex_match(name, 'hello') | fields has_hello", + TEST_INDEX_STRINGS); + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_regex_match_in_eval.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Only for Calcite + @Test + public void testExplainOnEarliestLatest() throws IOException { + String expected = loadExpectedPlan("explain_earliest_latest.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats earliest(message) as earliest_message, latest(message) as" + + " latest_message by server", + TEST_INDEX_LOGS))); + } + + // Only for Calcite + @Test + public void testExplainOnEarliestLatestWithCustomTimeField() throws IOException { + String expected = 
loadExpectedPlan("explain_earliest_latest_custom_time.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats earliest(message, created_at) as earliest_message," + + " latest(message, created_at) as latest_message by level", + TEST_INDEX_LOGS))); + } + + // Only for Calcite + @Test + public void testExplainOnFirstLast() throws IOException { + String expected = loadExpectedPlan("explain_first_last.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats first(firstname) as first_name, last(firstname) as" + + " last_name by gender", + TEST_INDEX_BANK))); + } + + // Only for Calcite + public void testExplainOnEventstatsEarliestLatest() throws IOException { + String expected = loadExpectedPlan("explain_eventstats_earliest_latest.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + String.format( + "source=%s | eventstats earliest(message) as earliest_message, latest(message) as" + + " latest_message by server", + TEST_INDEX_LOGS))); + } + + // Only for Calcite + @Test + public void testExplainOnEventstatsEarliestLatestWithCustomTimeField() throws IOException { + String expected = loadExpectedPlan("explain_eventstats_earliest_latest_custom_time.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + String.format( + "source=%s | eventstats earliest(message, created_at) as earliest_message," + + " latest(message, created_at) as latest_message by level", + TEST_INDEX_LOGS))); + } + + // Only for Calcite + @Test + public void testExplainOnEventstatsEarliestLatestNoGroupBy() throws IOException { + String expected = loadExpectedPlan("explain_eventstats_earliest_latest_no_group.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + String.format( + "source=%s | eventstats earliest(message) as earliest_message, latest(message) as" + + " latest_message", + TEST_INDEX_LOGS))); + } + + public void testExplainOnStreamstatsEarliestLatest() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message) as earliest_message, latest(message) as" + + " latest_message by server", + TEST_INDEX_LOGS))); + } + + @Test + public void testExplainOnStreamstatsEarliestLatestWithCustomTimeField() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest_custom_time.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message, created_at) as earliest_message," + + " latest(message, created_at) as latest_message by level", + TEST_INDEX_LOGS))); + } + + @Test + public void testExplainOnStreamstatsEarliestLatestNoGroupBy() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest_no_group.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message) as earliest_message, latest(message) as" + + " latest_message", + TEST_INDEX_LOGS))); + } + + @Test + public void testListAggregationExplain() throws IOException { + String expected = loadExpectedPlan("explain_list_aggregation.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | stats list(age) as age_list")); + } + + @Test + public void testValuesAggregationExplain() throws IOException { + String 
expected = loadExpectedPlan("explain_values_aggregation.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | stats values(age) as age_values")); + } + + @Test + public void testRegexExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | regex lastname='^[A-Z][a-z]+$' | head 5"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_regex.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testRegexNegatedExplain() throws IOException { + String query = "source=opensearch-sql_test_index_account | regex lastname!='.*son$' | head 5"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_regex_negated.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testSimpleSortExpressionPushDownExplain() throws Exception { + String query = + "source=opensearch-sql_test_index_bank| eval age2 = age + 2 | sort age2 | fields age, age2"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_simple_sort_expr_push.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testSimpleSortExpressionPushDownWithOnlyExprProjected() throws Exception { + String query = + "source=opensearch-sql_test_index_bank| eval b = balance + 1 | sort b | fields b"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_simple_sort_expr_single_expr_output_push.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testRexExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | rex field=lastname \\\"(?^[A-Z])\\\" |" + + " head 5"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_rex.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testExplainAppendCommand() throws IOException { + String expected = loadExpectedPlan("explain_append_command.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + String.format( + Locale.ROOT, + "source=%s | stats count(balance) as cnt by gender | append [ source=%s | stats" + + " count() as cnt ]", + TEST_INDEX_BANK, + TEST_INDEX_BANK))); + } + + @Test + public void testMvjoinExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | eval result = mvjoin(array('a', 'b', 'c'), ',')" + + " | fields result | head 1"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_mvjoin.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testPreventLimitPushdown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + setMaxResultWindow("opensearch-sql_test_index_account", 1); + String query = "source=opensearch-sql_test_index_account | head 1 from 1"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_prevent_limit_push.yaml"); + assertYamlEqualsIgnoreId(expected, result); + resetMaxResultWindow("opensearch-sql_test_index_account"); + } + + @Test + public void testPushdownLimitIntoAggregation() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_limit_agg_pushdown.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString("source=opensearch-sql_test_index_account | stats count() by state")); + + expected = 
loadExpectedPlan("explain_limit_agg_pushdown2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count() by state | head 100")); + + expected = loadExpectedPlan("explain_limit_agg_pushdown3.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | stats count() by state | head 100 | head 10" + + " from 10 ")); + + expected = loadExpectedPlan("explain_limit_agg_pushdown4.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count() by state | sort state | head" + + " 100 | head 10 from 10 ")); + + expected = loadExpectedPlan("explain_limit_agg_pushdown_bucket_nullable1.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats bucket_nullable=false count() by" + + " state | head 100 | head 10 from 10 ")); + + expected = loadExpectedPlan("explain_limit_agg_pushdown_bucket_nullable2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats bucket_nullable=false count() by" + + " state | sort state | head 100 | head 10 from 10 ")); + + // Don't pushdown the combination of limit and sort + expected = loadExpectedPlan("explain_limit_agg_pushdown5.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | stats count() by state | sort `count()` |" + + " head 100 | head 10 from 10 ")); + } + + @Test + public void testExplainMaxOnStringField() throws IOException { + String expected = loadExpectedPlan("explain_max_string_field.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=opensearch-sql_test_index_account | stats max(firstname)")); + } + + @Test + public void testExplainMinOnStringField() throws IOException { + String expected = loadExpectedPlan("explain_min_string_field.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=opensearch-sql_test_index_account | stats min(firstname)")); + } + + @Test + @Override + public void testCountAggPushDownExplain() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + // should be optimized by hits.total.value + String expected = loadExpectedPlan("explain_count_agg_push1.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml("source=opensearch-sql_test_index_account | stats count() as cnt")); + + // should be optimized + expected = loadExpectedPlan("explain_count_agg_push2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count(lastname) as cnt")); + + // should be optimized + expected = loadExpectedPlan("explain_count_agg_push3.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | eval name = lastname | stats count(name) as" + + " cnt")); + + // should be optimized + expected = loadExpectedPlan("explain_count_agg_push4.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count() as c1, count() as c2")); + + // should be optimized + expected = loadExpectedPlan("explain_count_agg_push5.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count(lastname) as c1," + + " count(lastname) as c2")); + + // should be optimized + expected = 
loadExpectedPlan("explain_count_agg_push6.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | eval name = lastname | stats" + + " count(lastname), count(name)")); + + // should not be optimized + expected = loadExpectedPlan("explain_count_agg_push7.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count(balance + 1) as cnt")); + + // should not be optimized + expected = loadExpectedPlan("explain_count_agg_push8.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count() as c1, count(lastname) as" + + " c2")); + + // should not be optimized + expected = loadExpectedPlan("explain_count_agg_push9.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats count(firstname), count(lastname)")); + + // should not be optimized + expected = loadExpectedPlan("explain_count_agg_push10.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | eval name = lastname | stats" + + " count(firstname), count(name)")); + } + + @Test + public void testExplainCountsByAgg() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_agg_counts_by1.yaml"); + // case of only count(): doc_count works + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats count(), count() as c1 by gender", TEST_INDEX_ACCOUNT))); + + // count(FIELD) by: doc_count doesn't work + expected = loadExpectedPlan("explain_agg_counts_by2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats count(balance) as c1, count(balance) as c2 by gender", + TEST_INDEX_ACCOUNT))); + + // count(FIELD) by: doc_count doesn't work + expected = loadExpectedPlan("explain_agg_counts_by3.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | eval account_number_alias = account_number" + + " | stats count(account_number), count(account_number_alias) as c2 by gender", + TEST_INDEX_ACCOUNT))); + + // count() + count(FIELD)): doc_count doesn't work + expected = loadExpectedPlan("explain_agg_counts_by4.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats count(), count(account_number) by gender", TEST_INDEX_ACCOUNT))); + + // count(FIELD1) + count(FIELD2)) by: doc_count doesn't work + expected = loadExpectedPlan("explain_agg_counts_by5.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats count(balance), count(account_number) by gender", + TEST_INDEX_ACCOUNT))); + + // case of count(EXPRESSION) by: doc_count doesn't work + expected = loadExpectedPlan("explain_agg_counts_by6.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | eval b_1 = balance + 1" + + " | stats count(b_1), count(pow(balance, 2)) as c3 by gender", + TEST_INDEX_ACCOUNT))); + } + + @Test + public void testExplainSortOnMeasure() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_agg_sort_on_measure1.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats bucket_nullable=false count() by" + + " state | sort `count()`")); + expected = 
loadExpectedPlan("explain_agg_sort_on_measure2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats bucket_nullable=false sum(balance)" + + " as sum by state | sort - sum")); + // TODO limit should pushdown to non-composite agg + expected = loadExpectedPlan("explain_agg_sort_on_measure3.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats count() as cnt by span(birthdate, 1d) | sort - cnt", + TEST_INDEX_BANK))); + expected = loadExpectedPlan("explain_agg_sort_on_measure4.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats bucket_nullable=false sum(balance) by span(age, 5) | sort -" + + " `sum(balance)`", + TEST_INDEX_BANK))); + } + + @Test + public void testExplainSortOnMeasureMultiTerms() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_agg_sort_on_measure_multi_terms.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats bucket_nullable=false count() by" + + " gender, state | sort `count()`")); + } + + @Test + public void testExplainCompositeMultiBucketsAutoDateThenSortOnMeasureNotPushdown() + throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_multi_terms_autodate_sort_agg_measure_not_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | bin timestamp bins=3 | stats bucket_nullable=false avg(value), count()" + + " as cnt by category, value, timestamp | sort cnt", + TEST_INDEX_TIME_DATA))); + } + + @Test + public void testExplainCompositeRangeThenSortOnMeasureNotPushdown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_range_sort_agg_measure_not_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval value_range = case(value < 7000, 'small'" + + " else 'great') | stats bucket_nullable=false avg(value), count() as cnt by" + + " value_range, category | sort cnt", + TEST_INDEX_TIME_DATA))); + } + + @Test + public void testExplainCompositeAutoDateThenSortOnMeasureNotPushdown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_autodate_sort_agg_measure_not_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | bin timestamp bins=3 | stats bucket_nullable=false avg(value), count()" + + " as cnt by timestamp, category | sort cnt", + TEST_INDEX_TIME_DATA))); + } + + @Test + public void testExplainCompositeRangeAutoDateThenSortOnMeasureNotPushdown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_autodate_range_metric_sort_agg_measure_not_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | bin timestamp bins=3 | eval value_range = case(value < 7000, 'small'" + + " else 'great') | stats bucket_nullable=false avg(value), count() as cnt by" + + " timestamp, value_range, category | sort cnt", + TEST_INDEX_TIME_DATA))); + } + + @Test + public void testExplainMultipleAggregatorsWithSortOnOneMeasureNotPushDown() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = + loadExpectedPlan("explain_multiple_agg_with_sort_on_one_measure_not_push1.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats 
bucket_nullable=false count() as c," + + " sum(balance) as s by state | sort c")); + expected = loadExpectedPlan("explain_multiple_agg_with_sort_on_one_measure_not_push2.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account | stats bucket_nullable=false count() as c," + + " sum(balance) as s by state | sort c, s")); + } + + @Test + public void testExplainEvalMax() throws IOException { + String expected = loadExpectedPlan("explain_eval_max.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | eval new = max(1, 2, 3, age, 'banana')")); + } + + @Test + public void testExplainEvalMin() throws IOException { + String expected = loadExpectedPlan("explain_eval_min.json"); + assertJsonEqualsIgnoreId( + expected, + explainQueryToString( + "source=opensearch-sql_test_index_account | eval new = min(1, 2, 3, age, 'banana')")); + } + + /** + * Executes the PPL query and returns the result as a string with windows-style line breaks + * replaced with Unix-style ones. + * + * @param ppl the PPL query to execute + * @return the result of the query as a string with line breaks replaced + * @throws IOException if an error occurs during query execution + */ + private String executeWithReplace(String ppl) throws IOException { + var result = executeQueryToString(ppl); + return result.replace("\\r\\n", "\\n"); + } + + @Test + public void testStrftimeFunctionExplain() throws IOException { + // Test explain for strftime function + String query = + "source=opensearch-sql_test_index_account | eval formatted_date = strftime(1521467703," + + " '%Y-%m-%d') | fields formatted_date | head 1"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_strftime_function.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + // Script generation is not stable in v2 + @Test + public void testExplainPushDownScriptsContainingUDT() throws IOException { + assertJsonEqualsIgnoreId( + loadExpectedPlan("explain_filter_script_ip_push.json"), + explainQueryToString( + String.format( + "source=%s | where cidrmatch(host, '0.0.0.0/24') | fields host", + TEST_INDEX_WEBLOGS))); + + assertYamlEqualsIgnoreId( + loadExpectedPlan("explain_agg_script_timestamp_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval t = unix_timestamp(birthdate) | stats count() by t | sort t |" + + " head 3", + TEST_INDEX_BANK))); + + assertYamlEqualsIgnoreId( + loadExpectedPlan("explain_agg_script_udt_arg_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval t = date_add(birthdate, interval 1 day) | stats count() by" + + " span(t, 1d)", + TEST_INDEX_BANK))); + } + + @Test + public void testFillNullValueSyntaxExplain() throws IOException { + String expected = loadExpectedPlan("explain_fillnull_value_syntax.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | fields age, balance | fillnull value=0", TEST_INDEX_ACCOUNT))); + } + + @Test + public void testJoinWithPushdownSortIntoAgg() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + // PPL_JOIN_SUBSEARCH_MAXOUT!=0 will add limit before sort and then prevent sort push down. 
+ setJoinSubsearchMaxOut(0); + String expected = loadExpectedPlan("explain_join_with_agg.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | stats COUNT() by age, gender | join left=L right=R ON L.gender =" + + " R.gender [source=%s | stats COUNT() as overall_cnt by gender]", + TEST_INDEX_ACCOUNT, TEST_INDEX_ACCOUNT))); + resetJoinSubsearchMaxOut(); + } + + @Test + public void testReplaceCommandExplain() throws IOException { + String expected = loadExpectedPlan("explain_replace_command.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | replace 'IL' WITH 'Illinois' IN state | fields state", + TEST_INDEX_ACCOUNT))); + } + + @Test + public void testExplainRareCommandUseNull() throws IOException { + String expected = loadExpectedPlan("explain_rare_usenull_false.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format("source=%s | rare 2 usenull=false state by gender", TEST_INDEX_ACCOUNT))); + expected = loadExpectedPlan("explain_rare_usenull_true.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format("source=%s | rare 2 usenull=true state by gender", TEST_INDEX_ACCOUNT))); + withSettings( + Settings.Key.PPL_SYNTAX_LEGACY_PREFERRED, + "false", + () -> { + try { + assertYamlEqualsIgnoreId( + loadExpectedPlan("explain_rare_usenull_false.yaml"), + explainQueryYaml( + String.format("source=%s | rare 2 state by gender", TEST_INDEX_ACCOUNT))); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + + @Test + public void testExplainTopCommandUseNull() throws IOException { + String expected = loadExpectedPlan("explain_top_usenull_false.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format("source=%s | top 2 usenull=false state by gender", TEST_INDEX_ACCOUNT))); + expected = loadExpectedPlan("explain_top_usenull_true.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format("source=%s | top 2 usenull=true state by gender", TEST_INDEX_ACCOUNT))); + withSettings( + Settings.Key.PPL_SYNTAX_LEGACY_PREFERRED, + "false", + () -> { + try { + assertYamlEqualsIgnoreId( + loadExpectedPlan("explain_top_usenull_false.yaml"), + explainQueryYaml( + String.format("source=%s | top 2 state by gender", TEST_INDEX_ACCOUNT))); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + + // Test cases for verifying the fix of https://github.com/opensearch-project/sql/issues/4571 + @Test + public void testPushDownMinOrMaxAggOnDerivedField() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_min_max_agg_on_derived_field.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | eval balance2 = CEIL(balance/10000.0) " + + "| stats MIN(balance2), MAX(balance2)", + TEST_INDEX_ACCOUNT))); + } + + @Test + public void testCasePushdownAsRangeQueryExplain() throws IOException { + // CASE 1: Range - Metric + // 1.1 Range - Metric + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_range_metric_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30', age < 40, 'u40' else 'u100') |" + + " stats avg(age) as avg_age by age_range", + TEST_INDEX_BANK))); + + // 1.2 Range - Metric (COUNT) + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_range_count_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30', age 
>= 30 and age < 40, 'u40'" + + " else 'u100') | stats avg(age) by age_range", + TEST_INDEX_BANK))); + + // 1.3 Range - Range - Metric + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_range_range_metric_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30', age < 40, 'u40' else 'u100')," + + " balance_range = case(balance < 20000, 'medium' else 'high') | stats" + + " avg(balance) as avg_balance by age_range, balance_range", + TEST_INDEX_BANK))); + + // 1.4 Range - Metric (With null & discontinuous ranges) + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_range_metric_complex_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30', (age >= 35 and age < 40) or age" + + " >= 80, '30-40 or >=80') | stats avg(balance) by age_range", + TEST_INDEX_BANK))); + + // 1.5 Should not be pushed because the range is not closed-open + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_case_cannot_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30', age >= 30 and age <= 40, 'u40'" + + " else 'u100') | stats avg(age) as avg_age by age_range", + TEST_INDEX_BANK))); + + // 1.6 Should not be pushed as range query because the result expression is not a string + // literal. + // Range aggregation keys must be strings + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_case_num_res_cannot_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 30 else 100) | stats count() by" + + " age_range", + TEST_INDEX_BANK))); + + // CASE 2: Composite - Range - Metric + // 2.1 Composite (term) - Range - Metric + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_range_metric_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30' else 'a30') | stats avg(balance)" + + " by state, age_range", + TEST_INDEX_BANK))); + + // 2.2 Composite (date histogram) - Range - Metric + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_date_range_push.yaml"), + explainQueryYaml( + "source=opensearch-sql_test_index_time_data | eval value_range = case(value < 7000," + + " 'small' else 'large') | stats avg(value) by value_range, span(@timestamp," + + " 1h)")); + + // 2.3 Composite(2 fields) - Range - Metric (with count) + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite2_range_count_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 30, 'u30' else 'a30') | stats" + + " avg(balance), count() by age_range, state, gender", + TEST_INDEX_BANK))); + + // 2.4 Composite (2 fields) - Range - Range - Metric (with count) + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite2_range_range_count_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 35, 'u35' else 'a35'), balance_range =" + + " case(balance < 20000, 'medium' else 'high') | stats avg(balance) as" + + " avg_balance by age_range, balance_range, state", + TEST_INDEX_BANK))); + + // 2.5 Should not be pushed down as range query because case result expression is not constant + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_case_composite_cannot_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval age_range = case(age < 35, 'u35' else email) | stats avg(balance)" + + " as avg_balance by age_range, state", + TEST_INDEX_BANK))); + } + + @Test + public void testNestedAggregationsExplain() throws IOException { + // TODO: Remove after 
resolving: https://github.com/opensearch-project/sql/issues/4578 + enabledOnlyWhenPushdownIsEnabled(); + assertYamlEqualsIgnoreId( + loadExpectedPlan("agg_composite_autodate_range_metric_push.yaml"), + explainQueryYaml( + String.format( + "source=%s | bin timestamp bins=3 | eval value_range = case(value < 7000, 'small'" + + " else 'great') | stats bucket_nullable=false avg(value), count() by" + + " timestamp, value_range, category", + TEST_INDEX_TIME_DATA))); + } + + @Test + public void testTopKThenSortExplain() throws IOException { + enabledOnlyWhenPushdownIsEnabled(); + String expected = loadExpectedPlan("explain_top_k_then_sort_push.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + "source=opensearch-sql_test_index_account" + + "| sort balance" + + "| head 5 " + + "| sort age " + + "| fields age")); + } + + @Test + public void testGeoIpPushedInAgg() throws IOException { + // This explain IT verifies that externally registered UDF can be properly pushed down + assertYamlEqualsIgnoreId( + loadExpectedPlan("udf_geoip_in_agg_pushed.yaml"), + explainQueryYaml( + String.format( + "source=%s | eval info = geoip('my-datasource', host) | stats count() by info.city", + TEST_INDEX_WEBLOGS))); + } + + @Test + public void testInternalItemAccessOnStructs() throws IOException { + String expected = loadExpectedPlan("access_struct_subfield_with_item.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | eval info = geoip('dummy-datasource', host) | fields host, info," + + " info.dummy_sub_field", + TEST_INDEX_WEBLOGS))); } } diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java index 9ea53f0ef5c..a50b83cb07d 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java @@ -11,10 +11,7 @@ import java.io.IOException; import java.util.List; import org.json.JSONObject; -import org.junit.Ignore; import org.junit.jupiter.api.Test; -import org.opensearch.client.Request; -import org.opensearch.sql.legacy.TestsConstants; import org.opensearch.sql.ppl.PPLIntegTestCase; public class CalcitePPLEventstatsIT extends PPLIntegTestCase { @@ -22,15 +19,15 @@ public class CalcitePPLEventstatsIT extends PPLIntegTestCase { public void init() throws Exception { super.init(); enableCalcite(); - disallowCalciteFallback(); + loadIndex(Index.BANK); loadIndex(Index.STATE_COUNTRY); loadIndex(Index.STATE_COUNTRY_WITH_NULL); - loadIndex(Index.BANK_TWO); + loadIndex(Index.LOGS); } @Test - public void testEventstat() throws IOException { + public void testEventstats() throws IOException { JSONObject actual = executeQuery( String.format( @@ -60,7 +57,7 @@ public void testEventstat() throws IOException { } @Test - public void testEventstatWithNull() throws IOException { + public void testEventstatsWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -92,13 +89,13 @@ public void testEventstatWithNull() throws IOException { } @Test - public void testEventstatBy() throws IOException { + public void testEventstatsBy() throws IOException { JSONObject actual = executeQuery( String.format( "source=%s | eventstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by country | fields name, country, state, month, year, age, cnt," - + " avg, min, max", + + " as max by 
country | fields name, country, state, month, year, age, cnt," + + " avg, min, max", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( @@ -123,7 +120,7 @@ public void testEventstatBy() throws IOException { } @Test - public void testEventstatByWithNull() throws IOException { + public void testEventstatsByWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -172,7 +169,7 @@ public void testEventstatByWithNull() throws IOException { } @Test - public void testEventstatBySpan() throws IOException { + public void testEventstatsBySpan() throws IOException { JSONObject actual = executeQuery( String.format( @@ -190,7 +187,7 @@ public void testEventstatBySpan() throws IOException { } @Test - public void testEventstatBySpanWithNull() throws IOException { + public void testEventstatsBySpanWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -210,7 +207,7 @@ public void testEventstatBySpanWithNull() throws IOException { } @Test - public void testEventstatByMultiplePartitions1() throws IOException { + public void testEventstatsByMultiplePartitions1() throws IOException { JSONObject actual = executeQuery( String.format( @@ -228,7 +225,7 @@ public void testEventstatByMultiplePartitions1() throws IOException { } @Test - public void testEventstatByMultiplePartitions2() throws IOException { + public void testEventstatsByMultiplePartitions2() throws IOException { JSONObject actual = executeQuery( String.format( @@ -246,7 +243,7 @@ public void testEventstatByMultiplePartitions2() throws IOException { } @Test - public void testEventstatByMultiplePartitionsWithNull1() throws IOException { + public void testEventstatsByMultiplePartitionsWithNull1() throws IOException { JSONObject actual = executeQuery( String.format( @@ -266,7 +263,7 @@ public void testEventstatByMultiplePartitionsWithNull1() throws IOException { } @Test - public void testEventstatByMultiplePartitionsWithNull2() throws IOException { + public void testEventstatsByMultiplePartitionsWithNull2() throws IOException { JSONObject actual = executeQuery( String.format( @@ -300,30 +297,8 @@ public void testUnsupportedWindowFunctions() { } } - @Ignore("DC should fail in window function") - public void testDistinctCountShouldFail() throws IOException { - Request request1 = - new Request("PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/5?refresh=true"); - request1.setJsonEntity( - "{\"name\":\"Jim\",\"age\":27,\"state\":\"Ontario\",\"country\":\"Canada\",\"year\":2023,\"month\":4}"); - client().performRequest(request1); - JSONObject actual = - executeQuery( - String.format( - "source=%s | eventstats distinct_count(state) by country", - TEST_INDEX_STATE_COUNTRY)); - - verifyDataRows( - actual, - rows("John", "Canada", "Ontario", 4, 2023, 25, 3), - rows("Jane", "Canada", "Quebec", 4, 2023, 20, 3), - rows("Jim", "Canada", "Ontario", 4, 2023, 27, 3), - rows("Jake", "USA", "California", 4, 2023, 70, 2), - rows("Hello", "USA", "New York", 4, 2023, 30, 2)); - } - @Test - public void testMultipleEventstat() throws IOException { + public void testMultipleEventstats() throws IOException { JSONObject actual = executeQuery( String.format( @@ -341,7 +316,7 @@ public void testMultipleEventstat() throws IOException { } @Test - public void testMultipleEventstatWithNull() throws IOException { + public void testMultipleEventstatsWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -361,7 +336,7 @@ public void testMultipleEventstatWithNull() throws IOException { } @Test - 
public void testMultipleEventstatWithEval() throws IOException { + public void testMultipleEventstatsWithEval() throws IOException { JSONObject actual = executeQuery( String.format( @@ -381,7 +356,7 @@ public void testMultipleEventstatWithEval() throws IOException { } @Test - public void testEventstatEmptyRows() throws IOException { + public void testEventstatsEmptyRows() throws IOException { JSONObject actual = executeQuery( String.format( @@ -401,7 +376,7 @@ public void testEventstatEmptyRows() throws IOException { } @Test - public void testEventstatVariance() throws IOException { + public void testEventstatsVariance() throws IOException { JSONObject actual = executeQuery( String.format( @@ -472,7 +447,7 @@ public void testEventstatVariance() throws IOException { } @Test - public void testEventstatVarianceWithNull() throws IOException { + public void testEventstatsVarianceWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -536,7 +511,7 @@ public void testEventstatVarianceWithNull() throws IOException { } @Test - public void testEventstatVarianceBy() throws IOException { + public void testEventstatsVarianceBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -554,7 +529,7 @@ public void testEventstatVarianceBy() throws IOException { } @Test - public void testEventstatVarianceBySpan() throws IOException { + public void testEventstatsVarianceBySpan() throws IOException { JSONObject actual = executeQuery( String.format( @@ -569,7 +544,7 @@ public void testEventstatVarianceBySpan() throws IOException { } @Test - public void testEventstatVarianceWithNullBy() throws IOException { + public void testEventstatsVarianceWithNullBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -618,48 +593,168 @@ public void testEventstatVarianceWithNullBy() throws IOException { rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800)); } - @Ignore @Test - public void testEventstatEarliestAndLatest() throws IOException { + public void testEventstatsDistinctCount() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | eventstats earliest(birthdate), latest(birthdate) | head 1", - TEST_INDEX_BANK_TWO)); - verifySchema( + "source=%s | eventstats dc(state) as dc_state", TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, 4), + rows("Jake", "USA", "California", 4, 2023, 70, 4), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows("Hello", "USA", "New York", 4, 2023, 30, 4)); + } + + @Test + public void testEventstatsDistinctCountByCountry() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats dc(state) as dc_state by country | fields name, country, state, month, year, age, dc_state", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jake", "USA", "California", 4, 2023, 70, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2), + rows("Hello", "USA", 
"New York", 4, 2023, 30, 2)); + } + + @Test + public void testEventstatsDistinctCountFunction() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats distinct_count(country) as dc_country | fields name, country, state, month, year, age, dc_country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_country", "bigint")); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jake", "USA", "California", 4, 2023, 70, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2), + rows("Hello", "USA", "New York", 4, 2023, 30, 2)); + } + + @Test + public void testEventstatsDistinctCountWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats dc(state) as dc_state | fields name, country, state, month, year, age, dc_state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( actual, - schema("account_number", "bigint"), - schema("firstname", "string"), - schema("address", "string"), - schema("birthdate", "timestamp"), - schema("gender", "string"), - schema("city", "string"), - schema("lastname", "string"), - schema("balance", "bigint"), - schema("employer", "string"), + schema("name", "string"), + schema("country", "string"), schema("state", "string"), + schema("month", "int"), + schema("year", "int"), schema("age", "int"), - schema("email", "string"), - schema("male", "boolean"), - schema("earliest(birthdate)", "timestamp"), - schema("latest(birthdate)", "timestamp")); + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows(null, "Canada", null, 4, 2023, 10, 4), + rows("Kevin", null, null, 4, 2023, null, 4), + rows("John", "Canada", "Ontario", 4, 2023, 25, 4), + rows("Jake", "USA", "California", 4, 2023, 70, 4), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows("Hello", "USA", "New York", 4, 2023, 30, 4)); + } + + @Test + public void testEventstatsEarliestAndLatest() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats earliest(message), latest(message) by server", + TEST_INDEX_LOGS)); + verifySchema( + actual, + schema("created_at", "timestamp"), + schema("server", "string"), + schema("@timestamp", "timestamp"), + schema("message", "string"), + schema("level", "string"), + schema("earliest(message)", "string"), + schema("latest(message)", "string")); verifyDataRows( actual, rows( - 1, - "Amber JOHnny", - "880 Holmes Lane", - "2017-10-23 00:00:00", - "M", - "Brogan", - "Duke Willmington", - 39225, - "Pyrami", - "IL", - 32, - "amberduke@pyrami.com", - true, - "1970-01-18 20:22:32", - "2018-08-19 00:00:00")); + "2023-01-05 00:00:00", + "server1", + "2023-01-01 00:00:00", + "Database connection failed", + "ERROR", + "Database connection failed", + "High memory usage"), + rows( + "2023-01-04 00:00:00", + "server2", + "2023-01-02 00:00:00", + "Service started", + "INFO", + "Service started", + "Backup completed"), + rows( + "2023-01-03 00:00:00", + "server1", + "2023-01-03 00:00:00", + "High memory usage", + "WARN", + "Database connection failed", + "High memory usage"), + rows( + "2023-01-02 00:00:00", + "server3", + "2023-01-04 00:00:00", + "Disk space low", + "ERROR", + "Disk space low", + "Disk space low"), + rows( + "2023-01-01 00:00:00", + "server2", + "2023-01-05 00:00:00", + "Backup 
completed", + "INFO", + "Service started", + "Backup completed")); } } diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java new file mode 100644 index 00000000000..ee94c218dbb --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java @@ -0,0 +1,1095 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.legacy.TestsConstants.*; +import static org.opensearch.sql.util.MatcherUtils.*; + +import java.io.IOException; +import java.util.List; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.client.Request; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +public class CalciteStreamstatsCommandIT extends PPLIntegTestCase { + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.STATE_COUNTRY); + loadIndex(Index.STATE_COUNTRY_WITH_NULL); + loadIndex(Index.BANK_TWO); + loadIndex(Index.LOGS); + } + + @Test + public void testStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3, 41.666666666666664, 25, 70), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4, 36.25, 20, 70)); + } + + @Test + public void testStreamstatsWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3, 41.666666666666664, 25, 70), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4, 36.25, 20, 70), + rows(null, "Canada", null, 4, 2023, 10, 5, 31, 10, 70), + rows("Kevin", null, null, 4, 2023, null, 6, 31, 10, 70)); + } + + @Test + public void testStreamstatsBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + 
schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsByWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 3, 18.333333333333332, 10, 25), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + + actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 2, 10, 10, 10)); + } + + @Test + public void testStreamstatsBySpan() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsBySpanWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsByMultiplePartitions1() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as 
age_span, country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsByMultiplePartitions2() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, state", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20)); + } + + @Test + public void testStreamstatsByMultiplePartitionsWithNull1() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsByMultiplePartitionsWithNull2() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsCurrent() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current=false avg(age) as prev_avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 41.666666666666664)); + } + + @Test + public void testStreamstatsCurrentWithNUll() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current=false avg(age) as prev_avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 41.666666666666664), + rows(null, "Canada", null, 4, 2023, 10, 36.25), + rows("Kevin", null, null, 4, 2023, null, 31)); + } + + @Test + public void 
testStreamstatsWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 3 avg(age) as avg", TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 25)); + } + + @Test + public void testStreamstatsWindowWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 3 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 18.333333333333332), + rows("Kevin", null, null, 4, 2023, null, 15)); + } + + public void testStreamstatsBigWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 10 avg(age) as avg", TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 36.25)); + } + + @Test + public void testStreamstatsWindowError() { + Throwable e = + assertThrowsWithReplace( + IllegalArgumentException.class, + () -> + executeQuery( + String.format( + "source=%s | streamstats window=-1 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY))); + verifyErrorMessageContains(e, "Window size must be >= 0, but got: -1"); + } + + @Test + public void testStreamstatsCurrentAndWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current = false window = 2 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 27.5)); + } + + @Test + public void testStreamstatsCurrentAndWindowWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current = false window = 2 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 27.5), + rows(null, "Canada", null, 4, 2023, 10, 22.5), + rows("Kevin", null, null, 4, 2023, null, 15)); + } + + @Test + public void testStreamstatsGlobal() throws IOException { + final int docId = 5; + Request insertRequest = + new Request( + "PUT", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 40,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=false avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + 
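// global=false keeps an independent 2-row window per country, while global=true slides a single 2-row window over the whole stream, so only in-window rows of each country contribute. + 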
verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 35)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=true avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 40)); + + Request deleteRequest = + new Request( + "DELETE", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testStreamstatsGlobalWithNull() throws IOException { + final int docId = 7; + Request insertRequest = + new Request( + "PUT", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 40,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=false avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 35)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=true avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 40)); + + Request deleteRequest = + new Request( + "DELETE", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testStreamstatsReset() throws IOException { + final int docId = 5; + Request insertRequest = + new Request( + "PUT", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 28,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", 
"Quebec", 4, 2023, 28, 28)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + Request deleteRequest = + new Request( + "DELETE", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testStreamstatsResetWithNull() throws IOException { + final int docId = 7; + Request insertRequest = + new Request( + "PUT", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 28,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + Request deleteRequest = + new Request( + "DELETE", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testUnsupportedWindowFunctions() { + List unsupported = List.of("PERCENTILE_APPROX", "PERCENTILE"); + for (String u : unsupported) { + Throwable e = + assertThrowsWithReplace( + UnsupportedOperationException.class, + () -> + executeQuery( + String.format( + "source=%s | streamstats %s(age)", TEST_INDEX_STATE_COUNTRY, u))); + verifyErrorMessageContains(e, "Unexpected window function: " + u); + } + } + + @Test + public void testMultipleStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" + + " avg(avg_age) as avg_state_age by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20, 22.5)); + } + + @Test + public void testMultipleStreamstatsWithNull() throws IOException { + JSONObject actual = + executeQuery( + 
String.format( + "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" + + " avg(avg_age) as avg_state_age by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 10, 18.333333333333332), + rows("Kevin", null, null, 4, 2023, null, null, null)); + } + + @Test + public void testStreamstatsAndEventstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats avg(age) as avg_age| streamstats" + + " avg(age) as avg_age_stream", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 36.25, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 36.25, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 36.25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 36.25, 36.25)); + } + + @Test + public void testStreamstatsAndSort() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | sort age | streamstats window = 2 avg(age) as avg_age ", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows("John", "Canada", "Ontario", 4, 2023, 25, 22.5), + rows("Hello", "USA", "New York", 4, 2023, 30, 27.5), + rows("Jake", "USA", "California", 4, 2023, 70, 50)); + } + + @Test + public void testLeftJoinWithStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s as l | left join left=l right=r on l.country = r.country [ source=%s |" + + " streamstats window=2 avg(age) as avg_age]", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows( + "John", "Canada", "Ontario", 4, 2023, 25, "John", "Canada", "Ontario", 4, 2023, 25, + 27.5), + rows( + "John", "Canada", "Ontario", 4, 2023, 25, "Jane", "Canada", "Quebec", 4, 2023, 20, + 22.5), + rows("John", "Canada", "Ontario", 4, 2023, 25, null, "Canada", null, 4, 2023, 10, 15), + rows( + "Jane", "Canada", "Quebec", 4, 2023, 20, "John", "Canada", "Ontario", 4, 2023, 25, + 27.5), + rows( + "Jane", "Canada", "Quebec", 4, 2023, 20, "Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, null, "Canada", null, 4, 2023, 10, 15), + rows( + "Jake", "USA", "California", 4, 2023, 70, "Jake", "USA", "California", 4, 2023, 70, 70), + rows("Jake", "USA", "California", 4, 2023, 70, "Hello", "USA", "New York", 4, 2023, 30, 50), + rows("Hello", "USA", "New York", 4, 2023, 30, "Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, "Hello", "USA", "New York", 4, 2023, 30, 50)); + } + + @Test + public void testWhereInWithStreamstatsSubquery() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where country in [ source=%s | streamstats window=2 avg(age) as" + + " avg_age | where avg_age > 40 | fields country ]", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70), + rows("Hello", "USA", "New York", 4, 2023, 30)); + } + + @Test + public void testMultipleStreamstatsWithEval() throws IOException { + JSONObject actual = + executeQuery( + String.format( + 
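/*
 * Contrast exercised by testStreamstatsAndEventstats above: eventstats gives every
 * row the aggregate of its entire group (avg_age is 36.25 on all four rows), while
 * streamstats is order-sensitive and emits a running aggregate over the rows seen so
 * far (70, 50, 41.67, 36.25). That order sensitivity is also why the preceding
 * sort in testStreamstatsAndSort changes the streamstats output.
 */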
"source=%s | streamstats avg(age) as avg_age by country, state, name | eval" + + " avg_age_divide_20 = avg_age - 20 | streamstats avg(avg_age_divide_20) as" + + " avg_state_age by country, state | where avg_state_age > 0 | streamstats" + + " count(avg_state_age) as count_country_age_greater_20 by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 50, 50, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 10, 10, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 5, 5, 1)); + } + + @Test + public void testStreamstatsEmptyRows() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where name = 'non-existed' | streamstats count(), avg(age), min(age)," + + " max(age), stddev_pop(age), stddev_samp(age), var_pop(age), var_samp(age)", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyNumOfRows(actual, 0); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | where name = 'non-existed' | streamstats count(), avg(age), min(age)," + + " max(age), stddev_pop(age), stddev_samp(age), var_pop(age), var_samp(age) by" + + " country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyNumOfRows(actual2, 0); + } + + @Test + public void testStreamstatsVariance() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age)", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("stddev_pop(age)", "double"), + schema("stddev_samp(age)", "double"), + schema("var_pop(age)", "double"), + schema("var_samp(age)", "double")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows( + "John", + "Canada", + "Ontario", + 4, + 2023, + 25, + 20.138409955990955, + 24.66441431158124, + 405.55555555555566, + 608.3333333333335), + rows( + "Jane", + "Canada", + "Quebec", + 4, + 2023, + 20, + 19.803724397193573, + 22.86737122335374, + 392.1875, + 522.9166666666666)); + } + + @Test + public void testStreamstatsVarianceWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age)", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("stddev_pop(age)", "double"), + schema("stddev_samp(age)", "double"), + schema("var_pop(age)", "double"), + schema("var_samp(age)", "double")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows( + "John", + "Canada", + "Ontario", + 4, + 2023, + 25, + 20.138409955990955, + 24.66441431158124, + 405.55555555555566, + 608.3333333333335), + rows( + "Jane", + "Canada", + "Quebec", + 4, + 2023, + 20, + 19.803724397193573, + 22.86737122335374, + 392.1875, + 522.9166666666666), + rows(null, "Canada", null, 4, 2023, 10, 20.591260281974, 23.021728866442675, 424, 530), + rows("Kevin", null, null, 4, 2023, null, 
20.591260281974, 23.021728866442675, 424, 530)); + } + + @Test + public void testStreamstatsVarianceBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows("John", "Canada", "Ontario", 4, 2023, 25, 0, null, 0, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2.5, 3.5355339059327378, 6.25, 12.5)); + } + + @Test + public void testStreamstatsVarianceBySpan() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where country != 'USA' | streamstats stddev_samp(age) by span(age," + + " 10)", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 3.5355339059327378)); + } + + @Test + public void testStreamstatsVarianceWithNullBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows("John", "Canada", "Ontario", 4, 2023, 25, 0, null, 0, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2.5, 3.5355339059327378, 6.25, 12.5), + rows( + null, + "Canada", + null, + 4, + 2023, + 10, + 6.2360956446232345, + 7.6376261582597325, + 38.88888888888888, + 58.333333333333314), + rows("Kevin", null, null, 4, 2023, null, null, null, null, null)); + } + + @Test + public void testStreamstatsDistinctCount() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state", TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4)); + } + + @Test + public void testStreamstatsDistinctCountByCountry() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state by country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2)); + } + + @Test + public void testStreamstatsDistinctCountFunction() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats distinct_count(country) as dc_country", + TEST_INDEX_STATE_COUNTRY)); + + 
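/*
 * dc() and distinct_count() are aliases: per the explain fixtures added below
 * (explain_streamstats_dc.yaml and explain_streamstats_distinct_count.yaml), both
 * compile to DISTINCT_COUNT_APPROX(field) OVER (ROWS UNBOUNDED PRECEDING), an
 * approximate distinct count over the rows seen so far, which is why the result
 * column is typed bigint.
 */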
verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_country", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 1), + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2)); + } + + @Test + public void testStreamstatsDistinctCountWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows(null, "Canada", null, 4, 2023, 10, 4), + rows("Kevin", null, null, 4, 2023, null, 4)); + } + + @Test + public void testStreamstatsEarliestAndLatest() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats earliest(message), latest(message) by server", + TEST_INDEX_LOGS)); + verifySchema( + actual, + schema("created_at", "timestamp"), + schema("server", "string"), + schema("@timestamp", "timestamp"), + schema("message", "string"), + schema("level", "string"), + schema("earliest(message)", "string"), + schema("latest(message)", "string")); + verifyDataRows( + actual, + rows( + "2023-01-05 00:00:00", + "server1", + "2023-01-01 00:00:00", + "Database connection failed", + "ERROR", + "Database connection failed", + "Database connection failed"), + rows( + "2023-01-04 00:00:00", + "server2", + "2023-01-02 00:00:00", + "Service started", + "INFO", + "Service started", + "Service started"), + rows( + "2023-01-03 00:00:00", + "server1", + "2023-01-03 00:00:00", + "High memory usage", + "WARN", + "Database connection failed", + "High memory usage"), + rows( + "2023-01-02 00:00:00", + "server3", + "2023-01-04 00:00:00", + "Disk space low", + "ERROR", + "Disk space low", + "Disk space low"), + rows( + "2023-01-01 00:00:00", + "server2", + "2023-01-05 00:00:00", + "Backup completed", + "INFO", + "Service started", + "Backup completed")); + } +} diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml new file mode 100644 index 00000000000..9dd91501bf8 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + 
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml new file mode 100644 index 00000000000..32538ab17df --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..12=[{inputs}], proj#0..10=[{exprs}], distinct_states=[$t12]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml new file mode 100644 index 00000000000..cac21b929ee --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + 
LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..7=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t6], latest_message=[$t7]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$5], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {1} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml new file mode 100644 index 00000000000..f19625d85e5 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..7=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t6], latest_message=[$t7]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$5], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $0), ARG_MAX($3, $0)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], 
OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml new file mode 100644 index 00000000000..f17643ab804 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[ARG_MIN($3, $2) OVER (ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml new file mode 100644 index 00000000000..293dd785f96 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml @@ -0,0 +1,29 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(>=($17, -($cor0.__stream_seq__, 1)), <=($17, $cor0.__stream_seq__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + 
EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..11=[{inputs}], expr#12=[1], expr#13=[-($t11, $t12)], proj#0..11=[{exprs}], $f12=[$t13]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($0, $3), >=($5, $2), <=($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..1=[{inputs}], expr#2=[1], expr#3=[-($t1, $t2)], proj#0..1=[{exprs}], $f2=[$t3]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml new file mode 100644 index 00000000000..0e8ed3a3dde --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml @@ -0,0 +1,38 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$21]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17, 20}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], 
_score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(<($17, $cor0.__stream_seq__), =($20, $cor0.__seg_id__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..15=[{inputs}], expr#16=[0], expr#17=[COALESCE($t15, $t16)], expr#18=[+($t14, $t17)], proj#0..11=[{exprs}], __seg_id__=[$t18]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($12)])], window#1=[window(rows between UNBOUNDED PRECEDING and $14 PRECEDING aggs [$SUM0($13)])], constants=[[1]]) + EnumerableCalc(expr#0..11=[{inputs}], expr#12=[34], expr#13=[>($t8, $t12)], expr#14=[1], expr#15=[0], expr#16=[CASE($t13, $t14, $t15)], expr#17=[25], expr#18=[<($t8, $t17)], expr#19=[CASE($t18, $t14, $t15)], proj#0..11=[{exprs}], __reset_before_flag__=[$t16], __reset_after_flag__=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($2, $6), =($0, $3), <($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[COALESCE($t5, $t6)], expr#8=[+($t4, $t7)], proj#0..1=[{exprs}], __seg_id__=[$t8]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs 
[$SUM0($2)])], window#1=[window(rows between UNBOUNDED PRECEDING and $4 PRECEDING aggs [$SUM0($3)])], constants=[[1]]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[34], expr#4=[>($t1, $t3)], expr#5=[1], expr#6=[0], expr#7=[CASE($t4, $t5, $t6)], expr#8=[25], expr#9=[<($t1, $t8)], expr#10=[CASE($t9, $t5, $t6)], gender=[$t0], __stream_seq__=[$t2], __reset_before_flag__=[$t7], __reset_after_flag__=[$t10]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..6=[{inputs}], expr#7=[0], expr#8=[COALESCE($t6, $t7)], expr#9=[+($t5, $t8)], proj#0..2=[{exprs}], __seg_id__=[$t9]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($3)])], window#1=[window(rows between UNBOUNDED PRECEDING and $5 PRECEDING aggs [$SUM0($4)])], constants=[[1]]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[34], expr#4=[>($t1, $t3)], expr#5=[1], expr#6=[0], expr#7=[CASE($t4, $t5, $t6)], expr#8=[25], expr#9=[<($t1, $t8)], expr#10=[CASE($t9, $t5, $t6)], proj#0..2=[{exprs}], __reset_before_flag__=[$t7], __reset_after_flag__=[$t10]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml new file mode 100644 index 00000000000..6ffa5ad304c --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml @@ -0,0 +1,10 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..17=[{inputs}], proj#0..10=[{exprs}], $11=[$t17]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml new file mode 100644 index 00000000000..550cf0ea9cb --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + 
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..18=[{inputs}], proj#0..10=[{exprs}], distinct_states=[$t18]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$17], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml new file mode 100644 index 00000000000..c37fae48771 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..13=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t12], latest_message=[$t13]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {1} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml 
b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml new file mode 100644 index 00000000000..b85e4b6b7bb --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..13=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t12], latest_message=[$t13]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $0), ARG_MAX($3, $0)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml new file mode 100644 index 00000000000..79dcbca7555 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml @@ -0,0 +1,10 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[ARG_MIN($3, $2) OVER (ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..12=[{inputs}], proj#0..4=[{exprs}], $5=[$t11], $6=[$t12]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml new file mode 100644 index 00000000000..3ac52e02f55 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml @@ -0,0 +1,30 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], 
balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(>=($17, -($cor0.__stream_seq__, 1)), <=($17, $cor0.__stream_seq__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[1], expr#19=[-($t17, $t18)], proj#0..10=[{exprs}], __stream_seq__=[$t17], $f12=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($0, $3), >=($5, $2), <=($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[1], expr#19=[-($t17, $t18)], gender=[$t4], __stream_seq__=[$t17], $f12=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..17=[{inputs}], gender=[$t4], age=[$t8], $2=[$t17]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml new file mode 100644 index 00000000000..be28e9b1d8c --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml @@ -0,0 +1,38 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$21]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], 
joinType=[left], requiredColumns=[{4, 17, 20}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(<($17, $cor0.__stream_seq__), =($20, $cor0.__seg_id__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..15=[{inputs}], expr#16=[0], expr#17=[COALESCE($t15, $t16)], expr#18=[+($t14, $t17)], proj#0..11=[{exprs}], __seg_id__=[$t18]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($12)])], window#1=[window(rows between UNBOUNDED PRECEDING and $14 PRECEDING aggs [$SUM0($13)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], proj#0..10=[{exprs}], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + 
EnumerableHashJoin(condition=[AND(=($2, $6), =($0, $3), <($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[COALESCE($t5, $t6)], expr#8=[+($t4, $t7)], proj#0..1=[{exprs}], __seg_id__=[$t8]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($2)])], window#1=[window(rows between UNBOUNDED PRECEDING and $4 PRECEDING aggs [$SUM0($3)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], gender=[$t4], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..6=[{inputs}], expr#7=[0], expr#8=[COALESCE($t6, $t7)], expr#9=[+($t5, $t8)], proj#0..2=[{exprs}], __seg_id__=[$t9]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($3)])], window#1=[window(rows between UNBOUNDED PRECEDING and $5 PRECEDING aggs [$SUM0($4)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], gender=[$t4], age=[$t8], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 78f6535e543..511122fa28c 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -17,17 +17,24 @@ EXPLAIN: 'EXPLAIN'; FROM: 'FROM'; WHERE: 'WHERE'; FIELDS: 'FIELDS'; +FIELD: 'FIELD'; +TABLE: 'TABLE'; // Alias for FIELDS command RENAME: 'RENAME'; STATS: 'STATS'; EVENTSTATS: 'EVENTSTATS'; +STREAMSTATS: 'STREAMSTATS'; DEDUP: 'DEDUP'; SORT: 'SORT'; EVAL: 'EVAL'; HEAD: 'HEAD'; +BIN: 'BIN'; TOP: 'TOP'; RARE: 'RARE'; PARSE: 'PARSE'; +SPATH: 'SPATH'; REGEX: 'REGEX'; +REX: 'REX'; +SED: 'SED'; PUNCT: 'PUNCT'; GROK: 'GROK'; PATTERN: 'PATTERN'; @@ -39,6 +46,7 @@ ML: 'ML'; FILLNULL: 'FILLNULL'; FLATTEN: 'FLATTEN'; TRENDLINE: 'TRENDLINE'; +TIMECHART: 'TIMECHART'; APPENDCOL: 'APPENDCOL'; EXPAND: 'EXPAND'; SIMPLE_PATTERN: 'SIMPLE_PATTERN'; @@ -47,8 +55,11 @@ VARIABLE_COUNT_THRESHOLD: 'VARIABLE_COUNT_THRESHOLD'; FREQUENCY_THRESHOLD_PERCENTAGE: 'FREQUENCY_THRESHOLD_PERCENTAGE'; METHOD: 'METHOD'; MAX_SAMPLE_COUNT: 'MAX_SAMPLE_COUNT'; +MAX_MATCH: 'MAX_MATCH'; +OFFSET_FIELD: 'OFFSET_FIELD'; BUFFER_LIMIT: 'BUFFER_LIMIT'; LABEL: 'LABEL'; +SHOW_NUMBERED_TOKEN: 'SHOW_NUMBERED_TOKEN'; AGGREGATION: 'AGGREGATION'; //Native JOIN KEYWORDS @@ -62,23 +73,26 @@ ANTI: 'ANTI'; CROSS: 'CROSS'; LEFT_HINT: 'HINT.LEFT'; RIGHT_HINT: 'HINT.RIGHT'; -PATTERN_METHOD: 'PATTERN_METHOD'; // COMMAND ASSIST KEYWORDS AS: 'AS'; BY: 'BY'; SOURCE: 'SOURCE'; INDEX: 'INDEX'; +A: 'A'; +ASC: 'ASC'; D: 'D'; DESC: 'DESC'; DATASOURCES: 'DATASOURCES'; USING: 'USING'; WITH: 'WITH'; +VALUE: 'VALUE'; SIMPLE: 'SIMPLE'; STANDARD: 'STANDARD'; COST: 'COST'; EXTENDED: 
'EXTENDED'; OVERRIDE: 'OVERRIDE'; +OVERWRITE: 'OVERWRITE'; // SORT FIELD KEYWORDS // TODO #3180: Fix broken sort functionality @@ -97,6 +111,13 @@ DEDUP_SPLITVALUES: 'DEDUP_SPLITVALUES'; PARTITIONS: 'PARTITIONS'; ALLNUM: 'ALLNUM'; DELIM: 'DELIM'; +CURRENT: 'CURRENT'; +WINDOW: 'WINDOW'; +GLOBAL: 'GLOBAL'; +RESET_BEFORE: 'RESET_BEFORE'; +RESET_AFTER: 'RESET_AFTER'; +BUCKET_NULLABLE: 'BUCKET_NULLABLE'; +USENULL: 'USENULL'; CENTROIDS: 'CENTROIDS'; ITERATIONS: 'ITERATIONS'; DISTANCE_TYPE: 'DISTANCE_TYPE'; @@ -112,8 +133,14 @@ TIME_ZONE: 'TIME_ZONE'; TRAINING_DATA_SIZE: 'TRAINING_DATA_SIZE'; ANOMALY_SCORE_THRESHOLD: 'ANOMALY_SCORE_THRESHOLD'; APPEND: 'APPEND'; +MULTISEARCH: 'MULTISEARCH'; COUNTFIELD: 'COUNTFIELD'; SHOWCOUNT: 'SHOWCOUNT'; +LIMIT: 'LIMIT'; +USEOTHER: 'USEOTHER'; +INPUT: 'INPUT'; +OUTPUT: 'OUTPUT'; +PATH: 'PATH'; // COMPARISON FUNCTION KEYWORDS CASE: 'CASE'; @@ -132,6 +159,7 @@ XOR: 'XOR'; TRUE: 'TRUE'; FALSE: 'FALSE'; REGEXP: 'REGEXP'; +REGEX_MATCH: 'REGEX_MATCH'; // DATETIME, INTERVAL AND UNIT KEYWORDS CONVERT_TZ: 'CONVERT_TZ'; @@ -148,8 +176,8 @@ HOUR_MINUTE: 'HOUR_MINUTE'; HOUR_OF_DAY: 'HOUR_OF_DAY'; HOUR_SECOND: 'HOUR_SECOND'; INTERVAL: 'INTERVAL'; -MICROSECOND: 'MICROSECOND'; MILLISECOND: 'MILLISECOND'; +MICROSECOND: 'MICROSECOND'; MINUTE: 'MINUTE'; MINUTE_MICROSECOND: 'MINUTE_MICROSECOND'; MINUTE_OF_DAY: 'MINUTE_OF_DAY'; @@ -167,9 +195,7 @@ YEAR: 'YEAR'; YEAR_MONTH: 'YEAR_MONTH'; // DATASET TYPES -DATAMODEL: 'DATAMODEL'; LOOKUP: 'LOOKUP'; -SAVEDSEARCH: 'SAVEDSEARCH'; // CONVERTED DATA TYPES INT: 'INT'; @@ -186,6 +212,7 @@ PIPE: '|'; COMMA: ','; DOT: '.'; EQUAL: '='; +DOUBLE_EQUAL: '=='; GREATER: '>'; LESS: '<'; NOT_GREATER: '<' '='; @@ -202,10 +229,13 @@ LT_PRTHS: '('; RT_PRTHS: ')'; LT_SQR_PRTHS: '['; RT_SQR_PRTHS: ']'; +LT_CURLY: '{'; +RT_CURLY: '}'; SINGLE_QUOTE: '\''; DOUBLE_QUOTE: '"'; BACKTICK: '`'; ARROW: '->'; +fragment AT: '@'; // Operators. Bit @@ -234,6 +264,7 @@ VAR_SAMP: 'VAR_SAMP'; VAR_POP: 'VAR_POP'; STDDEV_SAMP: 'STDDEV_SAMP'; STDDEV_POP: 'STDDEV_POP'; +PERC: 'PERC'; PERCENTILE: 'PERCENTILE'; PERCENTILE_APPROX: 'PERCENTILE_APPROX'; EARLIEST: 'EARLIEST'; @@ -241,8 +272,6 @@ LATEST: 'LATEST'; TAKE: 'TAKE'; LIST: 'LIST'; VALUES: 'VALUES'; -EARLIEST_TIME: 'EARLIEST_TIME'; -LATEST_TIME: 'LATEST_TIME'; PER_DAY: 'PER_DAY'; PER_HOUR: 'PER_HOUR'; PER_MINUTE: 'PER_MINUTE'; @@ -264,6 +293,10 @@ NTH: 'NTH'; NTILE: 'NTILE'; // BASIC FUNCTIONS +PLUS_FUCTION: 'ADD'; +MINUS_FUCTION: 'SUBTRACT'; +STAR_FUNCTION: 'MULTIPLY'; +DIVIDE_FUNCTION: 'DIVIDE'; ABS: 'ABS'; CBRT: 'CBRT'; CEIL: 'CEIL'; @@ -272,12 +305,13 @@ CONV: 'CONV'; CRC32: 'CRC32'; E: 'E'; EXP: 'EXP'; +EXPM1: 'EXPM1'; FLOOR: 'FLOOR'; LN: 'LN'; LOG: 'LOG'; -LOG10: 'LOG10'; -LOG2: 'LOG2'; +LOG_WITH_BASE: ([0-9]+ ('.' [0-9]+)?)? ('LOG' | 'log') [0-9]+ ('.' 
[0-9]+)?; MOD: 'MOD'; +MODULUS: 'MODULUS'; PI: 'PI'; POSITION: 'POSITION'; POW: 'POW'; @@ -287,6 +321,8 @@ ROUND: 'ROUND'; SIGN: 'SIGN'; SQRT: 'SQRT'; TRUNCATE: 'TRUNCATE'; +RINT: 'RINT'; +SIGNUM: 'SIGNUM'; // TRIGONOMETRIC FUNCTIONS ACOS: 'ACOS'; @@ -294,10 +330,12 @@ ASIN: 'ASIN'; ATAN: 'ATAN'; ATAN2: 'ATAN2'; COS: 'COS'; +COSH: 'COSH'; COT: 'COT'; DEGREES: 'DEGREES'; RADIANS: 'RADIANS'; SIN: 'SIN'; +SINH: 'SINH'; TAN: 'TAN'; // CRYPTOGRAPHIC FUNCTIONS @@ -357,6 +395,7 @@ UTC_TIME: 'UTC_TIME'; UTC_TIMESTAMP: 'UTC_TIMESTAMP'; WEEKDAY: 'WEEKDAY'; YEARWEEK: 'YEARWEEK'; +STRFTIME: 'STRFTIME'; // TEXT FUNCTIONS SUBSTR: 'SUBSTR'; @@ -364,7 +403,6 @@ SUBSTRING: 'SUBSTRING'; LTRIM: 'LTRIM'; RTRIM: 'RTRIM'; TRIM: 'TRIM'; -TO: 'TO'; LOWER: 'LOWER'; UPPER: 'UPPER'; CONCAT: 'CONCAT'; @@ -392,6 +430,8 @@ ISBLANK: 'ISBLANK'; // COLLECTION FUNCTIONS ARRAY: 'ARRAY'; ARRAY_LENGTH: 'ARRAY_LENGTH'; +MVAPPEND: 'MVAPPEND'; +MVJOIN: 'MVJOIN'; FORALL: 'FORALL'; FILTER: 'FILTER'; TRANSFORM: 'TRANSFORM'; @@ -460,17 +500,42 @@ ZERO_TERMS_QUERY: 'ZERO_TERMS_QUERY'; // SPAN KEYWORDS SPAN: 'SPAN'; -MS: 'MS'; -S: 'S'; -M: 'M'; -H: 'H'; -W: 'W'; -Q: 'Q'; -Y: 'Y'; - +BINS: 'BINS'; +MINSPAN: 'MINSPAN'; +START: 'START'; +END: 'END'; +ALIGNTIME: 'ALIGNTIME'; +// PERCENTILE SHORTCUT FUNCTIONS +// Must precede ID to avoid conflicts with identifier matching +PERCENTILE_SHORTCUT: PERC(INTEGER_LITERAL | DECIMAL_LITERAL) | 'P'(INTEGER_LITERAL | DECIMAL_LITERAL); + +SPANLENGTH: [0-9]+ ( + 'US' |'CS'|'DS' + |'MS'|'MILLISECOND'|'MILLISECONDS' + |'S'|'SEC'|'SECS'|'SECOND'|'SECONDS' + |'MIN'|'MINS'|'MINUTE'|'MINUTES' + |'H'|'HR'|'HRS'|'HOUR'|'HOURS' + |'D'|'DAY'|'DAYS' + |'W'|'WEEK'|'WEEKS' + |'M'|'MON'|'MONTH'|'MONTHS' + |'Q'|'QTR'|'QTRS'|'QUARTER'|'QUARTERS' + |'Y'|'YR'|'YRS'|'YEAR'|'YEARS' +); + +NUMERIC_ID : DEC_DIGIT+ ID_LITERAL; // LITERALS AND VALUES //STRING_LITERAL: DQUOTA_STRING | SQUOTA_STRING | BQUOTA_STRING; +fragment WEEK_SNAP_UNIT: 'W' [0-7]; +fragment TIME_SNAP_UNIT: 'S' | 'SEC' | 'SECOND' + | 'M' | 'MIN' | 'MINUTE' + | 'H' | 'HR' | 'HOUR' | 'HOURS' + | 'D' | 'DAY' + | 'W' | 'WEEK' | WEEK_SNAP_UNIT + | 'MON' | 'MONTH' + | 'Q' | 'QTR' | 'QUARTER' + | 'Y' | 'YR' | 'YEAR'; +TIME_SNAP: AT TIME_SNAP_UNIT; ID: ID_LITERAL; CLUSTER: CLUSTER_PREFIX_LITERAL; INTEGER_LITERAL: DEC_DIGIT+; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 3b2728c677b..6b98fac02d6 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -8,22 +8,19 @@ parser grammar OpenSearchPPLParser; options { tokenVocab = OpenSearchPPLLexer; } + root : pplStatement? EOF ; // statement pplStatement - : dmlStatement - ; - -dmlStatement - : queryStatement - | explainStatement + : explainStatement + | queryStatement ; queryStatement - : pplCommands (PIPE commands)* + : (PIPE)? 
pplCommands (PIPE commands)* ; explainStatement @@ -43,26 +40,30 @@ subSearch // commands pplCommands - : searchCommand - | describeCommand + : describeCommand | showDataSourcesCommand + | searchCommand + | multisearchCommand ; commands : whereCommand | fieldsCommand + | tableCommand | joinCommand | renameCommand | statsCommand | eventstatsCommand + | streamstatsCommand | dedupCommand | sortCommand | evalCommand | headCommand - | topCommand - | rareCommand + | binCommand + | rareTopCommand | grokCommand | parseCommand + | spathCommand | patternsCommand | lookupCommand | kmeansCommand @@ -71,8 +72,14 @@ commands | fillnullCommand | trendlineCommand | appendcolCommand + | appendCommand | expandCommand | flattenCommand + | reverseCommand + | regexCommand + | timechartCommand + | rexCommand + | replaceCommand ; commandName @@ -81,14 +88,17 @@ commandName | SHOW | WHERE | FIELDS + | TABLE | JOIN | RENAME | STATS | EVENTSTATS + | STREAMSTATS | DEDUP | SORT | EVAL | HEAD + | BIN | TOP | RARE | GROK @@ -102,19 +112,67 @@ commandName | EXPAND | FLATTEN | TRENDLINE - | kmeansCommand - | adCommand - | mlCommand - | patternsCommand + | TIMECHART | EXPLAIN + | REVERSE + | REGEX + | APPEND + | MULTISEARCH + | REX + | REPLACE ; searchCommand - : (SEARCH)? fromClause # searchFrom - | (SEARCH)? fromClause logicalExpression # searchFromFilter - | (SEARCH)? logicalExpression fromClause # searchFilterFrom + : (SEARCH)? (searchExpression)* fromClause (searchExpression)* # searchFrom + ; + +searchExpression + : timeModifier # timeModifierExpression + | LT_PRTHS searchExpression RT_PRTHS # groupedExpression + | NOT searchExpression # notExpression + | searchExpression OR searchExpression # orExpression + | searchExpression AND searchExpression # andExpression + | searchTerm # termExpression + ; + +searchTerm + : searchFieldComparison # searchComparisonTerm + | searchFieldInList # searchInListTerm + | searchLiteral # searchLiteralTerm + ; + +// Unified search literal for both free text and field comparisons +searchLiteral + : numericLiteral + | booleanLiteral + | ID + | NUMERIC_ID + | stringLiteral + | searchableKeyWord ; +searchFieldComparison + : fieldExpression searchComparisonOperator searchLiteral # searchFieldCompare + ; + +searchFieldInList + : fieldExpression IN LT_PRTHS searchLiteralList RT_PRTHS # searchFieldInValues + ; + +searchLiteralList + : searchLiteral (COMMA searchLiteral)* # searchLiterals + ; + +searchComparisonOperator + : EQUAL # equals + | NOT_EQUAL # notEquals + | LESS # lessThan + | NOT_GREATER # lessOrEqual + | GREATER # greaterThan + | NOT_LESS # greaterOrEqual + ; + + describeCommand : DESCRIBE tableSourceClause ; @@ -128,27 +186,120 @@ whereCommand ; fieldsCommand - : FIELDS (PLUS | MINUS)? fieldList + : FIELDS fieldsCommandBody + ; + +// Table command - alias for fields command +tableCommand + : TABLE fieldsCommandBody + ; + +fieldsCommandBody + : (PLUS | MINUS)? wcFieldList + ; + +// Wildcard field list supporting both comma-separated and space-separated fields +wcFieldList + : selectFieldExpression (COMMA? selectFieldExpression)* ; renameCommand - : RENAME renameClasue (COMMA renameClasue)* + : RENAME renameClasue (COMMA? renameClasue)* + ; + +replaceCommand + : REPLACE replacePair (COMMA replacePair)* IN fieldList + ; + +replacePair + : pattern=stringLiteral WITH replacement=stringLiteral ; statsCommand - : STATS (PARTITIONS EQUAL partitions = integerLiteral)? (ALLNUM EQUAL allnum = booleanLiteral)? (DELIM EQUAL delim = stringLiteral)? 
statsAggTerm (COMMA statsAggTerm)* (statsByClause)? (DEDUP_SPLITVALUES EQUAL dedupsplit = booleanLiteral)? + : STATS statsArgs statsAggTerm (COMMA statsAggTerm)* (statsByClause)? (dedupSplitArg)? + ; + +statsArgs + : (partitionsArg | allnumArg | delimArg | bucketNullableArg)* + ; + +partitionsArg + : PARTITIONS EQUAL partitions = integerLiteral + ; + +allnumArg + : ALLNUM EQUAL allnum = booleanLiteral + ; + +delimArg + : DELIM EQUAL delim = stringLiteral + ; + +bucketNullableArg + : BUCKET_NULLABLE EQUAL bucket_nullable = booleanLiteral + ; + +dedupSplitArg + : DEDUP_SPLITVALUES EQUAL dedupsplit = booleanLiteral ; eventstatsCommand : EVENTSTATS eventstatsAggTerm (COMMA eventstatsAggTerm)* (statsByClause)? ; +streamstatsCommand + : STREAMSTATS streamstatsArgs streamstatsAggTerm (COMMA streamstatsAggTerm)* (statsByClause)? + ; + +streamstatsArgs + : (currentArg | windowArg | globalArg | resetBeforeArg | resetAfterArg)* + ; + +currentArg + : CURRENT EQUAL current = booleanLiteral + ; + +windowArg + : WINDOW EQUAL window = integerLiteral + ; + +globalArg + : GLOBAL EQUAL global = booleanLiteral + ; + +resetBeforeArg + : RESET_BEFORE EQUAL logicalExpression + ; + +resetAfterArg + : RESET_AFTER EQUAL logicalExpression + ; + dedupCommand : DEDUP (number = integerLiteral)? fieldList (KEEPEMPTY EQUAL keepempty = booleanLiteral)? (CONSECUTIVE EQUAL consecutive = booleanLiteral)? ; sortCommand - : SORT sortbyClause + : SORT (count = integerLiteral)? sortbyClause + ; + +reverseCommand + : REVERSE + ; + +timechartCommand + : TIMECHART timechartParameter* statsFunction (BY fieldExpression)? + ; + +timechartParameter + : LIMIT EQUAL integerLiteral + | SPAN EQUAL spanLiteral + | USEOTHER EQUAL (booleanLiteral | ident) + ; + +spanLiteral + : SPANLENGTH + | INTEGER_LITERAL ; evalCommand @@ -159,12 +310,42 @@ headCommand : HEAD (number = integerLiteral)? (FROM from = integerLiteral)? ; -topCommand - : TOP (number = integerLiteral)? (COUNTFIELD EQUAL countfield = stringLiteral)? (SHOWCOUNT EQUAL showcount = booleanLiteral)? fieldList (byClause)? +binCommand + : BIN fieldExpression binOption* (AS alias = qualifiedName)? + ; + +binOption + : SPAN EQUAL span = binSpanValue + | BINS EQUAL bins = integerLiteral + | MINSPAN EQUAL minspan = spanLiteral + | ALIGNTIME EQUAL aligntime = aligntimeValue + | START EQUAL start = numericLiteral + | END EQUAL end = numericLiteral + ; + +aligntimeValue + : EARLIEST + | LATEST + | literalValue + ; + +binSpanValue + : spanLiteral # numericSpanValue + | logSpanValue # logBasedSpanValue + ; + +logSpanValue + : LOG_WITH_BASE # logWithBaseSpan ; -rareCommand - : RARE (number = integerLiteral)? (COUNTFIELD EQUAL countfield = stringLiteral)? (SHOWCOUNT EQUAL showcount = booleanLiteral)? fieldList (byClause)? +rareTopCommand + : (TOP | RARE) (number = integerLiteral)? rareTopOption* fieldList (byClause)? + ; + +rareTopOption + : COUNTFIELD EQUAL countField = stringLiteral + | SHOWCOUNT EQUAL showCount = booleanLiteral + | USENULL EQUAL useNull = booleanLiteral ; grokCommand @@ -175,17 +356,68 @@ parseCommand : PARSE (source_field = expression) (pattern = stringLiteral) ; +spathCommand + : SPATH spathParameter* + ; + +spathParameter + : (INPUT EQUAL input = expression) + | (OUTPUT EQUAL output = expression) + | ((PATH EQUAL)? path = indexablePath) + ; + +indexablePath + : pathElement (DOT pathElement)* + ; + +pathElement + : ident pathArrayAccess? + ; + +pathArrayAccess + : LT_CURLY (INTEGER_LITERAL)? 
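+// Illustrative queries for the streamstatsCommand rule defined above (index and field names are
+// assumed examples):
+//   source=logs | streamstats window=5 avg(bytes) as avg_bytes by host
+//   source=logs | streamstats current=false reset_before=(status=500) max(latency) as peak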
RT_CURLY + ; +regexCommand + : REGEX regexExpr + ; + +regexExpr + : field=qualifiedName operator=(EQUAL | NOT_EQUAL) pattern=stringLiteral + ; + +rexCommand + : REX rexExpr + ; + +rexExpr + : FIELD EQUAL field=qualifiedName (rexOption)* pattern=stringLiteral (rexOption)* + ; + +rexOption + : MAX_MATCH EQUAL maxMatch=integerLiteral + | MODE EQUAL (EXTRACT | SED) + | OFFSET_FIELD EQUAL offsetField=qualifiedName + ; patternsMethod : PUNCT | REGEX ; patternsCommand - : PATTERNS (source_field = expression) (statsByClause)? (METHOD EQUAL method = patternMethod)? (MODE EQUAL pattern_mode = patternMode)? (MAX_SAMPLE_COUNT EQUAL max_sample_count = integerLiteral)? (BUFFER_LIMIT EQUAL buffer_limit = integerLiteral)? (NEW_FIELD EQUAL new_field = stringLiteral)? (patternsParameter)* + : PATTERNS (source_field = expression) (statsByClause)? (patternsCommandOption)* (patternsParameter)* + ; + +patternsCommandOption + : (METHOD EQUAL method = patternMethod) + | (MODE EQUAL pattern_mode = patternMode) + | (MAX_SAMPLE_COUNT EQUAL max_sample_count = integerLiteral) + | (BUFFER_LIMIT EQUAL buffer_limit = integerLiteral) + | (SHOW_NUMBERED_TOKEN EQUAL show_numbered_token = booleanLiteral) ; patternsParameter : (PATTERN EQUAL pattern = stringLiteral) + | (NEW_FIELD EQUAL new_field = stringLiteral) | (VARIABLE_COUNT_THRESHOLD EQUAL variable_count_threshold = integerLiteral) | (FREQUENCY_THRESHOLD_PERCENTAGE EQUAL frequency_threshold_percentage = decimalLiteral) ; @@ -222,8 +454,10 @@ lookupPair ; fillnullCommand - : FILLNULL fillNullWith - | FILLNULL fillNullUsing + : FILLNULL fillNullWith # fillNullWithClause + | FILLNULL fillNullUsing # fillNullUsingClause + | FILLNULL VALUE EQUAL replacement = valueExpression fieldList # fillNullValueWithFields + | FILLNULL VALUE EQUAL replacement = valueExpression # fillNullValueAllFields ; fillNullWith @@ -263,6 +497,14 @@ appendcolCommand : APPENDCOL (OVERRIDE EQUAL override = booleanLiteral)? LT_SQR_PRTHS commands (PIPE commands)* RT_SQR_PRTHS ; +appendCommand + : APPEND LT_SQR_PRTHS searchCommand? (PIPE commands)* RT_SQR_PRTHS + ; + +multisearchCommand + : MULTISEARCH (LT_SQR_PRTHS subSearch RT_SQR_PRTHS)+ + ; + kmeansCommand : KMEANS (kmeansParameter)* ; @@ -306,6 +548,8 @@ fromClause | INDEX EQUAL tableOrSubqueryClause | SOURCE EQUAL tableFunction | INDEX EQUAL tableFunction + | SOURCE EQUAL dynamicSourceClause + | INDEX EQUAL dynamicSourceClause ; tableOrSubqueryClause @@ -317,27 +561,52 @@ tableSourceClause : tableSource (COMMA tableSource)* (AS alias = qualifiedName)? ; +dynamicSourceClause + : LT_SQR_PRTHS (sourceReference | sourceFilterArg) (COMMA (sourceReference | sourceFilterArg))* RT_SQR_PRTHS + ; + +sourceReference + : (CLUSTER)? wcQualifiedName + ; + +sourceFilterArg + : ident EQUAL literalValue + | ident IN valueList + ; + // join joinCommand - : (joinType) JOIN sideAlias joinHintList? joinCriteria? right = tableOrSubqueryClause + : JOIN (joinOption)* (fieldList)? right = tableOrSubqueryClause + | sqlLikeJoinType? JOIN (joinOption)* sideAlias joinHintList? joinCriteria right = tableOrSubqueryClause ; -joinType - : INNER? +sqlLikeJoinType + : INNER | CROSS - | LEFT OUTER? + | (LEFT OUTER? | OUTER) | RIGHT OUTER? | FULL OUTER? | LEFT? SEMI | LEFT? ANTI ; +joinType + : INNER + | CROSS + | OUTER + | LEFT + | RIGHT + | FULL + | SEMI + | ANTI + ; + sideAlias : (LEFT EQUAL leftAlias = qualifiedName)? COMMA? (RIGHT EQUAL rightAlias = qualifiedName)? 
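+// Illustrative usages of the two joinCommand alternatives above (index and field names assumed):
+//   source=users | join type=left max=1 uid orders
+//   source=users | left join left=u right=o on u.uid = o.uid orders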
; joinCriteria - : ON logicalExpression + : (ON | WHERE) logicalExpression ; joinHintList @@ -349,8 +618,14 @@ hintPair | rightHintKey = RIGHT_HINT DOT ID EQUAL rightHintValue = ident #rightHint ; +joinOption + : OVERWRITE EQUAL booleanLiteral # overwriteOption + | TYPE EQUAL joinType # typeOption + | MAX EQUAL integerLiteral # maxOption + ; + renameClasue - : orignalField = wcFieldExpression AS renamedField = wcFieldExpression + : orignalField = renameFieldExpression AS renamedField = renameFieldExpression ; byClause @@ -369,7 +644,7 @@ bySpanClause ; spanClause - : SPAN LT_PRTHS fieldExpression COMMA value = literalValue (unit = timespanUnit)? RT_PRTHS + : SPAN LT_PRTHS (fieldExpression COMMA)? value = spanLiteral RT_PRTHS ; sortbyClause @@ -377,13 +652,17 @@ sortbyClause ; evalClause - : fieldExpression EQUAL expression + : fieldExpression EQUAL logicalExpression ; eventstatsAggTerm : windowFunction (AS alias = wcFieldExpression)? ; +streamstatsAggTerm + : windowFunction (AS alias = wcFieldExpression)? + ; + windowFunction : windowFunctionName LT_PRTHS functionArgs RT_PRTHS ; @@ -403,6 +682,8 @@ scalarWindowFunctionName | LAST | NTH | NTILE + | DISTINCT_COUNT + | DC ; // aggregation terms @@ -412,11 +693,15 @@ statsAggTerm // aggregation functions statsFunction - : statsFunctionName LT_PRTHS valueExpression RT_PRTHS # statsFunctionCall - | COUNT LT_PRTHS RT_PRTHS # countAllFunctionCall + : (COUNT | C) LT_PRTHS evalExpression RT_PRTHS # countEvalFunctionCall + | (COUNT | C) (LT_PRTHS RT_PRTHS)? # countAllFunctionCall + | PERCENTILE_SHORTCUT LT_PRTHS valueExpression RT_PRTHS # percentileShortcutFunctionCall | (DISTINCT_COUNT | DC | DISTINCT_COUNT_APPROX) LT_PRTHS valueExpression RT_PRTHS # distinctCountFunctionCall | takeAggFunction # takeAggFunctionCall + | valuesAggFunction # valuesAggFunctionCall | percentileApproxFunction # percentileApproxFunctionCall + | perFunction # perFunctionCall + | statsFunctionName LT_PRTHS functionArgs RT_PRTHS # statsFunctionCall ; statsFunctionName @@ -431,19 +716,31 @@ statsFunctionName | STDDEV_POP | PERCENTILE | PERCENTILE_APPROX + | MEDIAN + | LIST + | FIRST | EARLIEST | LATEST + | LAST ; takeAggFunction : TAKE LT_PRTHS fieldExpression (COMMA size = integerLiteral)? RT_PRTHS ; +valuesAggFunction + : VALUES LT_PRTHS valueExpression RT_PRTHS + ; + percentileApproxFunction : (PERCENTILE | PERCENTILE_APPROX) LT_PRTHS aggField = valueExpression COMMA percent = numericLiteral (COMMA compression = numericLiteral)? RT_PRTHS ; +perFunction + : funcName=(PER_SECOND | PER_MINUTE | PER_HOUR | PER_DAY) LT_PRTHS functionArg RT_PRTHS + ; + numericLiteral : integerLiteral | decimalLiteral @@ -451,68 +748,56 @@ numericLiteral | floatLiteral ; -// expressions -expression - : logicalExpression - | comparisonExpression - | valueExpression - ; - // predicates logicalExpression - : LT_PRTHS logicalExpression RT_PRTHS # parentheticLogicalExpr - | NOT logicalExpression # logicalNot - | left = logicalExpression (AND)? right = logicalExpression # logicalAnd + : NOT logicalExpression # logicalNot + | left = logicalExpression AND right = logicalExpression # logicalAnd | left = logicalExpression XOR right = logicalExpression # logicalXor | left = logicalExpression OR right = logicalExpression # logicalOr - | comparisonExpression # comparsion - | booleanExpression # booleanExpr - | relevanceExpression # relevanceExpr - ; - -comparisonExpression - : left = valueExpression comparisonOperator right = valueExpression # compareExpr - | valueExpression NOT? 
IN valueList # inExpr - | valueExpression NOT? BETWEEN valueExpression AND valueExpression # between + | expression # logicalExpr ; -valueExpressionList - : valueExpression - | LT_PRTHS valueExpression (COMMA valueExpression)* RT_PRTHS +expression + : valueExpression # valueExpr + | relevanceExpression # relevanceExpr + | left = expression comparisonOperator right = expression # compareExpr + | expression NOT? IN valueList # inExpr + | expression NOT? BETWEEN expression AND expression # between ; valueExpression - : left = valueExpression binaryOperator = (STAR | DIVIDE | MODULE) right = valueExpression # binaryArithmetic - | left = valueExpression binaryOperator = (PLUS | MINUS) right = valueExpression # binaryArithmetic - | primaryExpression # valueExpressionDefault - | positionFunction # positionFunctionCall - | caseFunction # caseExpr - | extractFunction # extractFunctionCall - | getFormatFunction # getFormatFunctionCall - | timestampFunction # timestampFunctionCall - | LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr - | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr - | lambda # lambdaExpr - ; - -primaryExpression + : left = valueExpression binaryOperator = (STAR | DIVIDE | MODULE) right = valueExpression # binaryArithmetic + | left = valueExpression binaryOperator = (PLUS | MINUS) right = valueExpression # binaryArithmetic + | literalValue # literalValueExpr + | functionCall # functionCallExpr + | lambda # lambdaExpr + | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr + | valueExpression NOT? IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr + | LT_PRTHS valueExpression (COMMA valueExpression)* RT_PRTHS NOT? IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr + | EXISTS LT_SQR_PRTHS subSearch RT_SQR_PRTHS # existsSubqueryExpr + | fieldExpression # fieldExpr + | LT_PRTHS logicalExpression RT_PRTHS # nestedValueExpr + ; + +evalExpression + : EVAL LT_PRTHS logicalExpression RT_PRTHS + ; + +functionCall : evalFunctionCall | dataTypeFunctionCall - | fieldExpression - | literalValue + | positionFunctionCall + | caseFunctionCall + | timestampFunctionCall + | extractFunctionCall + | getFormatFunctionCall ; -positionFunction +positionFunctionCall : positionFunctionName LT_PRTHS functionArg IN functionArg RT_PRTHS ; -booleanExpression - : booleanFunctionCall # booleanFunctionCallExpr - | valueExpressionList NOT? IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr - | EXISTS LT_SQR_PRTHS subSearch RT_SQR_PRTHS # existsSubqueryExpr - ; - -caseFunction +caseFunctionCall : CASE LT_PRTHS logicalExpression COMMA valueExpression (COMMA logicalExpression COMMA valueExpression)* (ELSE valueExpression)? RT_PRTHS ; @@ -528,7 +813,21 @@ singleFieldRelevanceFunction // Field is a list of columns multiFieldRelevanceFunction - : multiFieldRelevanceFunctionName LT_PRTHS LT_SQR_PRTHS field = relevanceFieldAndWeight (COMMA field = relevanceFieldAndWeight)* RT_SQR_PRTHS COMMA query = relevanceQuery (COMMA relevanceArg)* RT_PRTHS + : multiFieldRelevanceFunctionName LT_PRTHS (LT_SQR_PRTHS field = relevanceFieldAndWeight (COMMA field = relevanceFieldAndWeight)* RT_SQR_PRTHS COMMA)? query = relevanceQuery (COMMA relevanceArg)* RT_PRTHS + ; + +timeModifier + : (EARLIEST | LATEST) EQUAL timeModifierValue + ; + +timeModifierValue + : NOW + | NOW LT_PRTHS RT_PRTHS + | DECIMAL_LITERAL + | INTEGER_LITERAL + | stringLiteral + | TIME_SNAP + | (PLUS | MINUS) SPANLENGTH (TIME_SNAP)? 
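+// Illustrative time modifiers matched by the rules above:
+//   earliest=-7d@d   latest=now()   earliest="2024-01-01 00:00:00"   latest=@w0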
; // tables @@ -538,20 +837,19 @@ tableSource ; tableFunction - : qualifiedName LT_PRTHS functionArgs RT_PRTHS + : qualifiedName LT_PRTHS namedFunctionArgs RT_PRTHS ; // fields fieldList - : fieldExpression (COMMA fieldExpression)* - ; - -wcFieldList - : wcFieldExpression (COMMA wcFieldExpression)* + : fieldExpression ((COMMA)? fieldExpression)* ; sortField - : (PLUS | MINUS)? sortFieldExpression + : (PLUS | MINUS) sortFieldExpression (ASC | A | DESC | D) # invalidMixedSortField + | (PLUS | MINUS) sortFieldExpression # prefixSortField + | sortFieldExpression (ASC | A | DESC | D) # suffixSortField + | sortFieldExpression # defaultSortField ; sortFieldExpression @@ -570,6 +868,16 @@ wcFieldExpression : wcQualifiedName ; +selectFieldExpression + : wcQualifiedName + | STAR + ; + +renameFieldExpression + : wcQualifiedName + | STAR + ; + // functions evalFunctionCall : evalFunctionName LT_PRTHS functionArgs RT_PRTHS @@ -577,12 +885,7 @@ evalFunctionCall // cast function dataTypeFunctionCall - : CAST LT_PRTHS expression AS convertedDataType RT_PRTHS - ; - -// boolean functions -booleanFunctionCall - : conditionFunctionName LT_PRTHS functionArgs RT_PRTHS + : CAST LT_PRTHS logicalExpression AS convertedDataType RT_PRTHS ; convertedDataType @@ -610,27 +913,34 @@ evalFunctionName | positionFunctionName | cryptographicFunctionName | jsonFunctionName - | collectionFunctionName | geoipFunctionName + | collectionFunctionName ; functionArgs : (functionArg (COMMA functionArg)*)? ; +namedFunctionArgs + : (namedFunctionArg (COMMA namedFunctionArg)*)? + ; + functionArg - : (ident EQUAL)? functionArgExpression + : functionArgExpression ; +namedFunctionArg + : (ident EQUAL)? functionArgExpression + ; functionArgExpression : lambda - | expression + | logicalExpression ; lambda - : ident ARROW expression - | LT_PRTHS ident (COMMA ident)+ RT_PRTHS ARROW expression + : ident ARROW logicalExpression + | LT_PRTHS ident (COMMA ident)+ RT_PRTHS ARROW logicalExpression ; relevanceArg @@ -702,6 +1012,10 @@ relevanceArgValue mathematicalFunctionName : ABS + | PLUS_FUCTION + | MINUS_FUCTION + | STAR_FUNCTION + | DIVIDE_FUNCTION | CBRT | CEIL | CEILING @@ -709,12 +1023,13 @@ mathematicalFunctionName | CRC32 | E | EXP + | EXPM1 | FLOOR | LN | LOG - | LOG10 - | LOG2 + | LOG_WITH_BASE | MOD + | MODULUS | PI | POW | POWER @@ -723,23 +1038,31 @@ mathematicalFunctionName | SIGN | SQRT | TRUNCATE + | RINT + | SIGNUM + | SUM + | AVG + | MAX + | MIN | trigonometricFunctionName ; - +geoipFunctionName + : GEOIP + ; collectionFunctionName : ARRAY | ARRAY_LENGTH + | MVAPPEND + | MVJOIN | FORALL | EXISTS | FILTER | TRANSFORM | REDUCE ; -geoipFunctionName - : GEOIP - ; + trigonometricFunctionName : ACOS @@ -747,10 +1070,12 @@ trigonometricFunctionName | ATAN | ATAN2 | COS + | COSH | COT | DEGREES | RADIANS | SIN + | SINH | TAN ; @@ -839,9 +1164,10 @@ dateTimeFunctionName | WEEK_OF_YEAR | YEAR | YEARWEEK + | STRFTIME ; -getFormatFunction +getFormatFunctionCall : GET_FORMAT LT_PRTHS getFormatType COMMA functionArg RT_PRTHS ; @@ -852,12 +1178,13 @@ getFormatType | TIMESTAMP ; -extractFunction +extractFunctionCall : EXTRACT LT_PRTHS datetimePart FROM functionArg RT_PRTHS ; simpleDateTimePart : MICROSECOND + | MILLISECOND | SECOND | MINUTE | HOUR @@ -887,7 +1214,7 @@ datetimePart | complexDateTimePart ; -timestampFunction +timestampFunctionCall : timestampFunctionName LT_PRTHS simpleDateTimePart COMMA firstArg = functionArg COMMA secondArg = functionArg RT_PRTHS ; @@ -902,6 +1229,7 @@ conditionFunctionName | ISNULL | ISNOTNULL | CIDRMATCH + | 
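+// REGEX_MATCH below becomes callable as a condition function; illustrative usage, with the
+// two-argument signature assumed from its placement here:
+//   source=logs | where regex_match(message, 'timeout [0-9]+')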
REGEX_MATCH | JSON_VALID | ISPRESENT | ISEMPTY @@ -949,12 +1277,14 @@ positionFunctionName // operators comparisonOperator : EQUAL + | DOUBLE_EQUAL | NOT_EQUAL | LESS | NOT_LESS | GREATER | NOT_GREATER | REGEXP + | LIKE ; singleFieldRelevanceFunctionName @@ -1033,6 +1363,7 @@ timestampLiteral intervalUnit : MICROSECOND + | MILLISECOND | SECOND | MINUTE | HOUR @@ -1054,26 +1385,6 @@ intervalUnit | YEAR_MONTH ; -timespanUnit - : MS - | S - | M - | H - | D - | W - | Q - | Y - | MILLISECOND - | SECOND - | MINUTE - | HOUR - | DAY - | WEEK - | MONTH - | QUARTER - | YEAR - ; - valueList : LT_PRTHS literalValue (COMMA literalValue)* RT_PRTHS ; @@ -1114,9 +1425,14 @@ wildcard ; keywordsCanBeId + : searchableKeyWord + | IN + ; + +searchableKeyWord : D // OD SQL and ODBC special - | timespanUnit | SPAN + | SPANLENGTH | evalFunctionName | jsonFunctionName | relevanceArgName @@ -1126,54 +1442,19 @@ keywordsCanBeId | multiFieldRelevanceFunctionName | commandName | collectionFunctionName - | comparisonOperator - | dateTimeFunctionName - | textFunctionName - | mathematicalFunctionName - | positionFunctionName - | conditionFunctionName - | jsonFunctionName - | patternMethod - | patternsMethod - | geoipFunctionName - // commands - | SEARCH - | DESCRIBE - | SHOW - | FROM - | WHERE - | FIELDS - | RENAME - | STATS - | DEDUP - | SORT - | EVAL - | FILLNULL - | HEAD - | TOP - | RARE - | PARSE - | METHOD | REGEX - | PUNCT - | GROK - | PATTERN - | PATTERNS - | NEW_FIELD - | KMEANS - | AD - | ML - | TRENDLINE | explainMode + | REGEXP // commands assist keywords | CASE | ELSE - | IN | ARROW | BETWEEN | EXISTS | SOURCE | INDEX + | A + | ASC | DESC | DATASOURCES | FROM @@ -1184,10 +1465,12 @@ keywordsCanBeId | FREQUENCY_THRESHOLD_PERCENTAGE | MAX_SAMPLE_COUNT | BUFFER_LIMIT + | SHOW_NUMBERED_TOKEN | WITH | REGEX | PUNCT | USING + | VALUE | CAST | GET_FORMAT | EXTRACT @@ -1207,6 +1490,13 @@ keywordsCanBeId | PARTITIONS | ALLNUM | DELIM + | CURRENT + | WINDOW + | GLOBAL + | RESET_BEFORE + | RESET_AFTER + | BUCKET_NULLABLE + | USENULL | CENTROIDS | ITERATIONS | DISTANCE_TYPE @@ -1223,6 +1513,19 @@ keywordsCanBeId | ANOMALY_SCORE_THRESHOLD | COUNTFIELD | SHOWCOUNT + | PATH + | INPUT + | OUTPUT + | AS + | ON + | LIMIT + | OVERWRITE + | FIELD + | SED + | MAX_MATCH + | OFFSET_FIELD + | patternMethod + | patternMode // AGGREGATIONS AND WINDOW | statsFunctionName | windowFunctionName @@ -1261,4 +1564,5 @@ keywordsCanBeId | ANTI | LEFT_HINT | RIGHT_HINT + | PERCENTILE_SHORTCUT ; diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index cd102ad13a6..65323229162 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -8,18 +8,20 @@ import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; import static org.opensearch.sql.ast.dsl.AstDSL.qualifiedName; +import static org.opensearch.sql.calcite.utils.CalciteUtils.getOnlyForCalciteException; import static org.opensearch.sql.lang.PPLLangSpec.PPL_SPEC; +import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.BinCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DedupCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DescribeCommandContext; +import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DynamicSourceClauseContext; import static 
org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.EvalCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.FieldsCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.HeadCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.RenameCommandContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SearchFilterFromContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SearchFromContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SearchFromFilterContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SortCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.StatsCommandContext; +import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.TableCommandContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.TableFunctionContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.TableSourceClauseContext; import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.WhereCommandContext; @@ -30,18 +32,24 @@ import com.google.common.collect.ImmutableMap; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Optional; +import java.util.Set; import java.util.stream.Collectors; import org.antlr.v4.runtime.ParserRuleContext; import org.antlr.v4.runtime.Token; import org.antlr.v4.runtime.tree.ParseTree; import org.apache.commons.lang3.tuple.Pair; +import org.opensearch.sql.ast.EmptySourcePropagateVisitor; import org.opensearch.sql.ast.dsl.AstDSL; import org.opensearch.sql.ast.expression.Alias; import org.opensearch.sql.ast.expression.AllFieldsExcludeMeta; +import org.opensearch.sql.ast.expression.Argument; +import org.opensearch.sql.ast.expression.Argument.ArgumentMap; +import org.opensearch.sql.ast.expression.DataType; import org.opensearch.sql.ast.expression.EqualTo; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.Let; @@ -51,13 +59,22 @@ import org.opensearch.sql.ast.expression.PatternMethod; import org.opensearch.sql.ast.expression.PatternMode; import org.opensearch.sql.ast.expression.QualifiedName; +import org.opensearch.sql.ast.expression.SearchAnd; +import org.opensearch.sql.ast.expression.SearchExpression; +import org.opensearch.sql.ast.expression.SearchGroup; +import org.opensearch.sql.ast.expression.Span; +import org.opensearch.sql.ast.expression.SpanUnit; import org.opensearch.sql.ast.expression.UnresolvedArgument; import org.opensearch.sql.ast.expression.UnresolvedExpression; +import org.opensearch.sql.ast.expression.WindowFrame; import org.opensearch.sql.ast.expression.WindowFunction; import org.opensearch.sql.ast.tree.AD; import org.opensearch.sql.ast.tree.Aggregation; +import org.opensearch.sql.ast.tree.Append; import org.opensearch.sql.ast.tree.AppendCol; +import org.opensearch.sql.ast.tree.CountBin; import org.opensearch.sql.ast.tree.Dedupe; +import org.opensearch.sql.ast.tree.DefaultBin; import org.opensearch.sql.ast.tree.DescribeRelation; import org.opensearch.sql.ast.tree.Eval; import org.opensearch.sql.ast.tree.Expand; @@ -69,19 +86,33 @@ import org.opensearch.sql.ast.tree.Kmeans; import org.opensearch.sql.ast.tree.Lookup; import org.opensearch.sql.ast.tree.ML; +import org.opensearch.sql.ast.tree.MinSpanBin; +import 
org.opensearch.sql.ast.tree.Multisearch; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; import org.opensearch.sql.ast.tree.Project; +import org.opensearch.sql.ast.tree.RangeBin; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.RareTopN.CommandType; +import org.opensearch.sql.ast.tree.Regex; import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; +import org.opensearch.sql.ast.tree.Replace; +import org.opensearch.sql.ast.tree.ReplacePair; +import org.opensearch.sql.ast.tree.Reverse; +import org.opensearch.sql.ast.tree.Rex; +import org.opensearch.sql.ast.tree.SPath; +import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; +import org.opensearch.sql.ast.tree.SpanBin; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; +import org.opensearch.sql.ast.tree.Timechart; import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; import org.opensearch.sql.ast.tree.Window; +import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.common.setting.Settings.Key; import org.opensearch.sql.common.utils.StringUtils; @@ -120,6 +151,10 @@ public AstBuilder(String query, Settings settings) { this.settings = settings; } + public Settings getSettings() { + return settings; + } + @Override public UnresolvedPlan visitQueryStatement(OpenSearchPPLParser.QueryStatementContext ctx) { UnresolvedPlan pplCommand = visit(ctx.pplCommands()); @@ -139,26 +174,43 @@ public UnresolvedPlan visitSubSearch(OpenSearchPPLParser.SubSearchContext ctx) { /** Search command. */ @Override public UnresolvedPlan visitSearchFrom(SearchFromContext ctx) { - return visitFromClause(ctx.fromClause()); - } + if (ctx.searchExpression().isEmpty()) { + return visitFromClause(ctx.fromClause()); + } else { + // Build search expressions using visitor pattern + List<SearchExpression> searchExprs = + ctx.searchExpression().stream() + .map(expr -> (SearchExpression) expressionBuilder.visit(expr)) + .toList(); + // Combine multiple expressions with AND + SearchExpression combined; + if (searchExprs.size() == 1) { + combined = searchExprs.getFirst(); + } else { + // Wrap each expression in a group before combining with AND (e.g., "a=1 b=-1" becomes "(a:1) AND (b:-1)") + combined = + searchExprs.stream() + .map(SearchGroup::new) + .map(SearchExpression.class::cast) + .reduce(SearchAnd::new) + .get(); // Safe because we know size > 1 from the if condition } - - @Override - public UnresolvedPlan visitSearchFromFilter(SearchFromFilterContext ctx) { - return new Filter(internalVisitExpression(ctx.logicalExpression())) - .attach(visit(ctx.fromClause())); - } + - @Override - public UnresolvedPlan visitSearchFilterFrom(SearchFilterFromContext ctx) { - return new Filter(internalVisitExpression(ctx.logicalExpression())) - .attach(visit(ctx.fromClause())); + // Convert to query string + String queryString = combined.toQueryString(); + + // Create Search node with relation and query string + Relation relation = (Relation) visitFromClause(ctx.fromClause()); + return new Search(relation, queryString); + } } /** - * Describe command. Current logic separates table and metadata info about table by adding - * MAPPING_ODFE_SYS_TABLE as suffix.
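+    // Illustrative, referring to visitSearchFrom above: `search source=logs a=1 b=-1` combines the
+    // two comparisons into the query string "(a:1) AND (b:-1)" carried by the Search node.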
Even with the introduction of datasource and schema name in - * fully qualified table name, we do the same thing by appending MAPPING_ODFE_SYS_TABLE as syffix - * to the last part of qualified name. + * Describe command.
+ * Current logic separates table and metadata info about table by adding MAPPING_ODFE_SYS_TABLE as + * suffix. Even with the introduction of datasource and schema name in fully qualified table name, + * we do the same thing by appending MAPPING_ODFE_SYS_TABLE as suffix to the last part of + * qualified name. */ @Override public UnresolvedPlan visitDescribeCommand(DescribeCommandContext ctx) { @@ -184,21 +236,40 @@ public UnresolvedPlan visitWhereCommand(WhereCommandContext ctx) { @Override public UnresolvedPlan visitJoinCommand(OpenSearchPPLParser.JoinCommandContext ctx) { - Join.JoinType joinType = getJoinType(ctx.joinType()); - if (ctx.joinCriteria() == null) { - joinType = Join.JoinType.CROSS; + // SQL-like syntax when join criteria exist + boolean sqlLike = ctx.joinCriteria() != null; + Join.JoinType joinType = null; + if (sqlLike) { + joinType = ArgumentFactory.getJoinType(ctx.sqlLikeJoinType()); + } + List<Argument> arguments = + ctx.joinOption().stream().map(o -> (Argument) expressionBuilder.visit(o)).toList(); + Argument.ArgumentMap argumentMap = Argument.ArgumentMap.of(arguments); + if (argumentMap.get("type") != null) { + Join.JoinType joinTypeFromArgument = ArgumentFactory.getJoinType(argumentMap); + if (sqlLike && joinType != joinTypeFromArgument && ctx.sqlLikeJoinType() != null) { + throw new SemanticCheckException( + "Join type is ambiguous, remove either the join type before JOIN keyword or 'type='" + + " option."); + } + joinType = joinTypeFromArgument; + } + if (!sqlLike && argumentMap.get("type") == null) { + joinType = Join.JoinType.INNER; } + validateJoinType(joinType); + Join.JoinHint joinHint = getJoinHint(ctx.joinHintList()); - Optional<String> leftAlias = - ctx.sideAlias().leftAlias != null - ? Optional.of(internalVisitExpression(ctx.sideAlias().leftAlias).toString()) - : Optional.empty(); + Optional<String> leftAlias = Optional.empty(); Optional<String> rightAlias = Optional.empty(); + if (ctx.sideAlias() != null && ctx.sideAlias().leftAlias != null) { + leftAlias = Optional.of(internalVisitExpression(ctx.sideAlias().leftAlias).toString()); + } if (ctx.tableOrSubqueryClause().alias != null) { rightAlias = Optional.of(internalVisitExpression(ctx.tableOrSubqueryClause().alias).toString()); } - if (ctx.sideAlias().rightAlias != null) { + if (ctx.sideAlias() != null && ctx.sideAlias().rightAlias != null) { rightAlias = Optional.of(internalVisitExpression(ctx.sideAlias().rightAlias).toString()); } @@ -217,8 +288,19 @@ public UnresolvedPlan visitJoinCommand(OpenSearchPPLParser.JoinCommandContext ct ctx.joinCriteria() == null ?
Optional.empty() : Optional.of(expressionBuilder.visitJoinCriteria(ctx.joinCriteria())); + Optional<List<Field>> joinFields = Optional.empty(); + if (ctx.fieldList() != null) { + joinFields = Optional.of(getFieldList(ctx.fieldList())); + } return new Join( - projectExceptMeta(right), leftAlias, rightAlias, joinType, joinCondition, joinHint); + projectExceptMeta(right), + leftAlias, + rightAlias, + joinType, + joinCondition, + joinHint, + joinFields, + argumentMap); } private Join.JoinHint getJoinHint(OpenSearchPPLParser.JoinHintListContext ctx) { @@ -242,38 +324,72 @@ private Join.JoinHint getJoinHint(OpenSearchPPLParser.JoinHintListContext ctx) { return joinHint; } - private Join.JoinType getJoinType(OpenSearchPPLParser.JoinTypeContext ctx) { - Join.JoinType joinType; - if (ctx == null) { - joinType = Join.JoinType.INNER; - } else if (ctx.INNER() != null) { - joinType = Join.JoinType.INNER; - } else if (ctx.SEMI() != null) { - joinType = Join.JoinType.SEMI; - } else if (ctx.ANTI() != null) { - joinType = Join.JoinType.ANTI; - } else if (ctx.LEFT() != null) { - joinType = Join.JoinType.LEFT; - } else if (ctx.RIGHT() != null) { - joinType = Join.JoinType.RIGHT; - } else if (ctx.CROSS() != null) { - joinType = Join.JoinType.CROSS; - } else if (ctx.FULL() != null) { - joinType = Join.JoinType.FULL; - } else { - joinType = Join.JoinType.INNER; + private void validateJoinType(Join.JoinType joinType) { + Object config = settings.getSettingValue(Key.CALCITE_SUPPORT_ALL_JOIN_TYPES); + if (config != null && !((Boolean) config)) { + if (Join.highCostJoinTypes().contains(joinType)) { + throw new SemanticCheckException( + String.format( + "Join type %s is performance sensitive. Set %s to true to enable it.", + joinType.name(), Key.CALCITE_SUPPORT_ALL_JOIN_TYPES.getKeyValue())); + } } - return joinType; } - /** Fields command. */ @Override public UnresolvedPlan visitFieldsCommand(FieldsCommandContext ctx) { - return new Project( - ctx.fieldList().fieldExpression().stream() - .map(this::internalVisitExpression) - .collect(Collectors.toList()), - ArgumentFactory.getArgumentList(ctx)); + return buildProjectCommand(ctx.fieldsCommandBody(), ArgumentFactory.getArgumentList(ctx)); + } + + /** Table command as an alias for fields command. */ + @Override + public UnresolvedPlan visitTableCommand(TableCommandContext ctx) { + if (settings != null + && Boolean.TRUE.equals(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED))) { + // Table command uses the same structure as fields command + List<Argument> arguments = + Collections.singletonList( + ctx.fieldsCommandBody().MINUS() != null + ?
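+        // Illustrative: `table host bytes` behaves exactly like `fields host bytes`; a leading
+        // MINUS, as in `table - bytes`, flips the "exclude" argument to true.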
new Argument("exclude", new Literal(true, DataType.BOOLEAN)) + : new Argument("exclude", new Literal(false, DataType.BOOLEAN))); + return buildProjectCommand(ctx.fieldsCommandBody(), arguments); + } + throw getOnlyForCalciteException("Table command"); + } + + private UnresolvedPlan buildProjectCommand( + OpenSearchPPLParser.FieldsCommandBodyContext bodyCtx, List arguments) { + List fields = extractFieldExpressions(bodyCtx); + + // Check for enhanced field features when Calcite is explicitly disabled + if (settings != null + && Boolean.FALSE.equals(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED))) { + if (hasEnhancedFieldFeatures(bodyCtx, fields)) { + throw getOnlyForCalciteException("Enhanced fields feature"); + } + } + + return new Project(fields, arguments); + } + + private List extractFieldExpressions( + OpenSearchPPLParser.FieldsCommandBodyContext bodyCtx) { + if (bodyCtx.wcFieldList() != null) { + return processFieldExpressions(bodyCtx.wcFieldList().selectFieldExpression()); + } + return Collections.emptyList(); + } + + private List processFieldExpressions( + List fieldExpressions) { + var stream = fieldExpressions.stream().map(this::internalVisitExpression); + + if (settings != null + && Boolean.TRUE.equals(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED))) { + stream = stream.distinct(); + } + + return stream.collect(Collectors.toList()); } /** Rename command. */ @@ -289,6 +405,25 @@ public UnresolvedPlan visitRenameCommand(RenameCommandContext ctx) { .collect(Collectors.toList())); } + /** Replace command. */ + @Override + public UnresolvedPlan visitReplaceCommand(OpenSearchPPLParser.ReplaceCommandContext ctx) { + // Parse all replacement pairs + List replacePairs = + ctx.replacePair().stream().map(this::buildReplacePair).collect(Collectors.toList()); + + Set fieldList = getUniqueFieldSet(ctx.fieldList()); + + return new Replace(replacePairs, fieldList); + } + + /** Build a ReplacePair from parse context. */ + private ReplacePair buildReplacePair(OpenSearchPPLParser.ReplacePairContext ctx) { + Literal pattern = (Literal) internalVisitExpression(ctx.pattern); + Literal replacement = (Literal) internalVisitExpression(ctx.replacement); + return new ReplacePair(pattern, replacement); + } + /** Stats command. */ @Override public UnresolvedPlan visitStatsCommand(StatsCommandContext ctx) { @@ -330,10 +465,11 @@ public UnresolvedPlan visitStatsCommand(StatsCommandContext ctx) { Collections.emptyList(), groupList, span, - ArgumentFactory.getArgumentList(ctx)); + ArgumentFactory.getArgumentList(ctx, settings)); return aggregation; } + /** Eventstats command. */ public UnresolvedPlan visitEventstatsCommand(OpenSearchPPLParser.EventstatsCommandContext ctx) { ImmutableList.Builder windownFunctionListBuilder = new ImmutableList.Builder<>(); @@ -355,6 +491,92 @@ public UnresolvedPlan visitEventstatsCommand(OpenSearchPPLParser.EventstatsComma return new Window(windownFunctionListBuilder.build()); } + /** Streamstats command. */ + public UnresolvedPlan visitStreamstatsCommand(OpenSearchPPLParser.StreamstatsCommandContext ctx) { + // 1. 
Parse arguments from the streamstats command + List argExprList = ArgumentFactory.getArgumentList(ctx); + ArgumentMap arguments = ArgumentMap.of(argExprList); + + // current, window and global from ArgumentFactory + boolean current = (Boolean) arguments.get("current").getValue(); + int window = (Integer) arguments.get("window").getValue(); + boolean global = (Boolean) arguments.get("global").getValue(); + + if (window < 0) { + throw new IllegalArgumentException("Window size must be >= 0, but got: " + window); + } + + // reset_before, reset_after + UnresolvedExpression resetBeforeExpr = + Optional.ofNullable(ctx.streamstatsArgs()) + .filter(args -> args.resetBeforeArg() != null && !args.resetBeforeArg().isEmpty()) + .map(args -> expressionBuilder.visit(args.resetBeforeArg(0).logicalExpression())) + .orElse(null); + + UnresolvedExpression resetAfterExpr = + Optional.ofNullable(ctx.streamstatsArgs()) + .filter(args -> args.resetAfterArg() != null && !args.resetAfterArg().isEmpty()) + .map(args -> expressionBuilder.visit(args.resetAfterArg(0).logicalExpression())) + .orElse(null); + + // 2.1 Build a WindowFrame from the provided arguments + WindowFrame frame = buildFrameFromArgs(current, window); + // 2.2 Build groupList + List groupList = getPartitionExprList(ctx.statsByClause()); + + // 3. Build each window function in the command + ImmutableList.Builder windowFunctionListBuilder = + new ImmutableList.Builder<>(); + + for (OpenSearchPPLParser.StreamstatsAggTermContext aggCtx : ctx.streamstatsAggTerm()) { + UnresolvedExpression windowFunction = internalVisitExpression(aggCtx.windowFunction()); + if (windowFunction instanceof WindowFunction wf) { + // Attach PARTITION BY clause expressions + wf.setPartitionByList(groupList); + // Inject the frame + wf.setWindowFrame(frame); + } + String name = + aggCtx.alias == null + ? getTextInQuery(aggCtx) + : StringUtils.unquoteIdentifier(aggCtx.alias.getText()); + Alias alias = new Alias(name, windowFunction); + windowFunctionListBuilder.add(alias); + } + + // 4. Build StreamWindow AST node + return new StreamWindow( + windowFunctionListBuilder.build(), + groupList, + current, + window, + global, + resetBeforeExpr, + resetAfterExpr); + } + + private WindowFrame buildFrameFromArgs(boolean current, int window) { + // Build the frame + if (window > 0) { + if (current) { + // N-1 PRECEDING to CURRENT ROW + return WindowFrame.of( + WindowFrame.FrameType.ROWS, (window - 1) + " PRECEDING", "CURRENT ROW"); + } else { + // N PRECEDING to 1 PRECEDING + return WindowFrame.of(WindowFrame.FrameType.ROWS, window + " PRECEDING", "1 PRECEDING"); + } + } else { + // Default: running total + if (current) { + return WindowFrame.toCurrentRow(); + } else { + // Default: running total excluding current row + return WindowFrame.of(WindowFrame.FrameType.ROWS, "UNBOUNDED PRECEDING", "1 PRECEDING"); + } + } + } + /** Dedup command. */ @Override public UnresolvedPlan visitDedupCommand(DedupCommandContext ctx) { @@ -369,13 +591,174 @@ public UnresolvedPlan visitHeadCommand(HeadCommandContext ctx) { return new Head(size, from); } + /** Bin command visitor. */ + @Override + public UnresolvedPlan visitBinCommand(BinCommandContext ctx) { + UnresolvedExpression field = internalVisitExpression(ctx.fieldExpression()); + + // Handle alias from binCommand context + String alias = ctx.alias != null ? 
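+    // Frame mapping produced by buildFrameFromArgs above (restated for illustration):
+    //   window=5, current=true  -> ROWS 4 PRECEDING .. CURRENT ROW
+    //   window=5, current=false -> ROWS 5 PRECEDING .. 1 PRECEDING
+    //   window=0, current=true  -> ROWS UNBOUNDED PRECEDING .. CURRENT ROW
+    //   window=0, current=false -> ROWS UNBOUNDED PRECEDING .. 1 PRECEDING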
StringUtils.unquoteIdentifier(ctx.alias.getText()) : null; + + // Track seen parameters for duplicate detection + Set seenParams = new HashSet<>(); + + // Initialize all optional parameters + UnresolvedExpression span = null; + Integer bins = null; + UnresolvedExpression minspan = null; + UnresolvedExpression aligntime = null; + UnresolvedExpression start = null; + UnresolvedExpression end = null; + + // Process each bin option: detect duplicates and assign values in one shot + for (OpenSearchPPLParser.BinOptionContext option : ctx.binOption()) { + // SPAN parameter + if (option.span != null) { + if (!seenParams.add("SPAN")) { + throw new IllegalArgumentException("Duplicate SPAN parameter in bin command"); + } + span = internalVisitExpression(option.span); + } + + // BINS parameter + if (option.bins != null) { + if (!seenParams.add("BINS")) { + throw new IllegalArgumentException("Duplicate BINS parameter in bin command"); + } + bins = Integer.parseInt(option.bins.getText()); + } + + // MINSPAN parameter + if (option.minspan != null) { + if (!seenParams.add("MINSPAN")) { + throw new IllegalArgumentException("Duplicate MINSPAN parameter in bin command"); + } + minspan = internalVisitExpression(option.minspan); + } + + // ALIGNTIME parameter + if (option.aligntime != null) { + if (!seenParams.add("ALIGNTIME")) { + throw new IllegalArgumentException("Duplicate ALIGNTIME parameter in bin command"); + } + aligntime = + option.aligntime.EARLIEST() != null + ? org.opensearch.sql.ast.dsl.AstDSL.stringLiteral("earliest") + : option.aligntime.LATEST() != null + ? org.opensearch.sql.ast.dsl.AstDSL.stringLiteral("latest") + : internalVisitExpression(option.aligntime.literalValue()); + } + + // START parameter + if (option.start != null) { + if (!seenParams.add("START")) { + throw new IllegalArgumentException("Duplicate START parameter in bin command"); + } + start = internalVisitExpression(option.start); + } + + // END parameter + if (option.end != null) { + if (!seenParams.add("END")) { + throw new IllegalArgumentException("Duplicate END parameter in bin command"); + } + end = internalVisitExpression(option.end); + } + } + + // Create appropriate Bin subclass based on priority order (matches AstDSL.bin() logic) + if (span != null) { + // 1. SPAN (highest priority) -> SpanBin + return SpanBin.builder().field(field).span(span).aligntime(aligntime).alias(alias).build(); + } else if (minspan != null) { + // 2. MINSPAN (second priority) -> MinSpanBin + return MinSpanBin.builder() + .field(field) + .minspan(minspan) + .start(start) + .end(end) + .alias(alias) + .build(); + } else if (bins != null) { + // 3. BINS (third priority) -> CountBin + return CountBin.builder().field(field).bins(bins).start(start).end(end).alias(alias).build(); + } else if (start != null || end != null) { + // 4. START/END only (fourth priority) -> RangeBin + return RangeBin.builder().field(field).start(start).end(end).alias(alias).build(); + } else { + // 5. No parameters (default) -> DefaultBin + return DefaultBin.builder().field(field).alias(alias).build(); + } + } + /** Sort command. */ @Override public UnresolvedPlan visitSortCommand(SortCommandContext ctx) { - return new Sort( - ctx.sortbyClause().sortField().stream() + Integer count = ctx.count != null ? 
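+    // Bin option precedence implemented above (illustrative): span > minspan > bins > start/end,
+    // so `bin size span=10 bins=5` resolves to a SpanBin and the bins option is ignored.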
Math.max(0, Integer.parseInt(ctx.count.getText())) : 0; + + List sortFieldContexts = ctx.sortbyClause().sortField(); + validateSortDirectionSyntax(sortFieldContexts); + + List sortFields = + sortFieldContexts.stream() .map(sort -> (Field) internalVisitExpression(sort)) - .collect(Collectors.toList())); + .collect(Collectors.toList()); + + return new Sort(count, sortFields); + } + + private void validateSortDirectionSyntax(List sortFields) { + boolean hasPrefix = + sortFields.stream() + .anyMatch(sortField -> sortField instanceof OpenSearchPPLParser.PrefixSortFieldContext); + boolean hasSuffix = + sortFields.stream() + .anyMatch(sortField -> sortField instanceof OpenSearchPPLParser.SuffixSortFieldContext); + + if (hasPrefix && hasSuffix) { + throw new SemanticCheckException( + "Cannot mix prefix (+/-) and suffix (asc/desc) sort direction syntax in the same" + + " command."); + } + } + + /** Reverse command. */ + @Override + public UnresolvedPlan visitReverseCommand(OpenSearchPPLParser.ReverseCommandContext ctx) { + return new Reverse(); + } + + /** Timechart command. */ + @Override + public UnresolvedPlan visitTimechartCommand(OpenSearchPPLParser.TimechartCommandContext ctx) { + UnresolvedExpression binExpression = + AstDSL.span(AstDSL.referImplicitTimestampField(), AstDSL.intLiteral(1), SpanUnit.m); + Integer limit = 10; + Boolean useOther = true; + // Process timechart parameters + for (OpenSearchPPLParser.TimechartParameterContext paramCtx : ctx.timechartParameter()) { + UnresolvedExpression param = internalVisitExpression(paramCtx); + if (param instanceof Span) { + binExpression = param; + } else if (param instanceof Literal literal) { + if (DataType.BOOLEAN.equals(literal.getType())) { + useOther = (Boolean) literal.getValue(); + } else if (DataType.INTEGER.equals(literal.getType()) + || DataType.LONG.equals(literal.getType())) { + limit = (Integer) literal.getValue(); + } + } + } + + UnresolvedExpression aggregateFunction = internalVisitExpression(ctx.statsFunction()); + UnresolvedExpression byField = + ctx.fieldExpression() != null ? internalVisitExpression(ctx.fieldExpression()) : null; + + return new Timechart(null, aggregateFunction) + .span(binExpression) + .by(byField) + .limit(limit) + .useOther(useOther); } /** Eval command. */ @@ -399,26 +782,43 @@ private List getFieldList(FieldListContext ctx) { .collect(Collectors.toList()); } - /** Rare command. */ - @Override - public UnresolvedPlan visitRareCommand(OpenSearchPPLParser.RareCommandContext ctx) { - List groupList = - ctx.byClause() == null ? emptyList() : getGroupByList(ctx.byClause()); - return new RareTopN( - CommandType.RARE, - ArgumentFactory.getArgumentList(ctx), - getFieldList(ctx.fieldList()), - groupList); + private Set getUniqueFieldSet(FieldListContext ctx) { + List fields = + ctx.fieldExpression().stream() + .map(field -> (Field) internalVisitExpression(field)) + .toList(); + + Set uniqueFields = new java.util.LinkedHashSet<>(fields); + + if (uniqueFields.size() < fields.size()) { + // Find duplicates for error message + Set seen = new HashSet<>(); + Set duplicates = + fields.stream() + .map(f -> f.getField().toString()) + .filter(name -> !seen.add(name)) + .collect(Collectors.toSet()); + + throw new IllegalArgumentException( + String.format("Duplicate fields [%s] in Replace command", String.join(", ", duplicates))); + } + + return uniqueFields; } - /** Top command. */ + /** Rare and Top commands. 
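+   * Illustrative: `top 3 countfield='cnt' showcount=true host by region` and `rare status` both
+   * map here; when no number is given, the result count defaults to 10.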
*/ @Override - public UnresolvedPlan visitTopCommand(OpenSearchPPLParser.TopCommandContext ctx) { + public UnresolvedPlan visitRareTopCommand(OpenSearchPPLParser.RareTopCommandContext ctx) { List groupList = ctx.byClause() == null ? emptyList() : getGroupByList(ctx.byClause()); + Integer noOfResults = + ctx.number != null + ? (Integer) ((Literal) expressionBuilder.visitIntegerLiteral(ctx.number)).getValue() + : 10; return new RareTopN( - CommandType.TOP, - ArgumentFactory.getArgumentList(ctx), + ctx.TOP() != null ? CommandType.TOP : CommandType.RARE, + noOfResults, + ArgumentFactory.getArgumentList(ctx, settings), getFieldList(ctx.fieldList()), groupList); } @@ -447,15 +847,38 @@ public UnresolvedPlan visitParseCommand(OpenSearchPPLParser.ParseCommandContext return new Parse(ParseMethod.REGEX, sourceField, pattern, ImmutableMap.of()); } + @Override + public UnresolvedPlan visitSpathCommand(OpenSearchPPLParser.SpathCommandContext ctx) { + String inField = null; + String outField = null; + String path = null; + + for (OpenSearchPPLParser.SpathParameterContext param : ctx.spathParameter()) { + if (param.input != null) { + inField = param.input.getText(); + } + if (param.output != null) { + outField = param.output.getText(); + } + if (param.path != null) { + path = param.path.getText(); + } + } + + if (inField == null) { + throw new IllegalArgumentException("`input` parameter is required for `spath`"); + } + if (path == null) { + throw new IllegalArgumentException("`path` parameter is required for `spath`"); + } + + return new SPath(inField, outField, path); + } + @Override public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandContext ctx) { UnresolvedExpression sourceField = internalVisitExpression(ctx.source_field); ImmutableMap.Builder builder = ImmutableMap.builder(); - Literal newField = null; - if (ctx.new_field != null) { - newField = (Literal) internalVisitExpression(ctx.new_field); - builder.put("new_field", newField); - } ctx.patternsParameter() .forEach( x -> { @@ -464,32 +887,48 @@ public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandCo builder.put(argName, value); }); java.util.Map arguments = builder.build(); + + ImmutableMap.Builder cmdOptionsBuilder = ImmutableMap.builder(); + ctx.patternsCommandOption() + .forEach( + option -> { + String argName = option.children.get(0).toString(); + Literal value = (Literal) internalVisitExpression(option.children.get(2)); + cmdOptionsBuilder.put(argName, value); + }); + java.util.Map cmdOptions = cmdOptionsBuilder.build(); String patternMethod = - ctx.method != null - ? StringUtils.unquoteIdentifier(ctx.method.getText()).toLowerCase(Locale.ROOT) - : settings.getSettingValue(Key.PATTERN_METHOD).toString().toLowerCase(Locale.ROOT); + cmdOptions + .getOrDefault( + "method", AstDSL.stringLiteral(settings.getSettingValue(Key.PATTERN_METHOD))) + .toString(); String patternMode = - ctx.pattern_mode != null - ? StringUtils.unquoteIdentifier(ctx.pattern_mode.getText()).toLowerCase(Locale.ROOT) - : settings.getSettingValue(Key.PATTERN_MODE).toString().toLowerCase(Locale.ROOT); + cmdOptions + .getOrDefault("mode", AstDSL.stringLiteral(settings.getSettingValue(Key.PATTERN_MODE))) + .toString(); Literal patternMaxSampleCount = - ctx.max_sample_count != null - ? 
(Literal) internalVisitExpression(ctx.max_sample_count) - : AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT)); + cmdOptions.getOrDefault( + "max_sample_count", + AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT))); Literal patternBufferLimit = - ctx.buffer_limit != null - ? (Literal) internalVisitExpression(ctx.buffer_limit) - : AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT)); + cmdOptions.getOrDefault( + "buffer_limit", + AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT))); + Literal showNumberedToken = + cmdOptions.getOrDefault( + "show_numbered_token", + AstDSL.booleanLiteral(settings.getSettingValue(Key.PATTERN_SHOW_NUMBERED_TOKEN))); List<UnresolvedExpression> partitionByList = getPartitionExprList(ctx.statsByClause()); return new Patterns( sourceField, partitionByList, + arguments.getOrDefault("new_field", AstDSL.stringLiteral("patterns_field")).toString(), PatternMethod.valueOf(patternMethod.toUpperCase(Locale.ROOT)), PatternMode.valueOf(patternMode.toUpperCase(Locale.ROOT)), patternMaxSampleCount, patternBufferLimit, + showNumberedToken, arguments); } @@ -544,11 +983,17 @@ public UnresolvedPlan visitTableSourceClause(TableSourceClauseContext ctx) { : relation; } + @Override + public UnresolvedPlan visitDynamicSourceClause(DynamicSourceClauseContext ctx) { + throw new UnsupportedOperationException( + "Dynamic source clause with metadata filters is not supported."); + } + @Override public UnresolvedPlan visitTableFunction(TableFunctionContext ctx) { ImmutableList.Builder<UnresolvedExpression> builder = ImmutableList.builder(); - ctx.functionArgs() - .functionArg() + ctx.namedFunctionArgs() + .namedFunctionArg() .forEach( arg -> { String argName = (arg.ident() != null) ? arg.ident().getText() : null; @@ -622,7 +1067,7 @@ public UnresolvedPlan visitFillNullWith(OpenSearchPPLParser.FillNullWithContext internalVisitExpression(ctx.replacement), ctx.fieldList().fieldExpression().stream() .map(f -> (Field) internalVisitExpression(f)) - .collect(Collectors.toList())); + .toList()); } else { return FillNull.ofSameValue(internalVisitExpression(ctx.replacement), List.of()); } @@ -643,6 +1088,25 @@ public UnresolvedPlan visitFillNullUsing(OpenSearchPPLParser.FillNullUsingContex return FillNull.ofVariousValue(replacementsBuilder.build()); } + /** fillnull command - value= syntax: fillnull value= field1 field2 ...
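+   * Illustrative: `fillnull value=0 bytes status` fills only the listed fields, while
+   * `fillnull value=0` with no field list is handled by visitFillNullValueAllFields below.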
*/ + @Override + public UnresolvedPlan visitFillNullValueWithFields( + OpenSearchPPLParser.FillNullValueWithFieldsContext ctx) { + return FillNull.ofSameValue( + internalVisitExpression(ctx.replacement), + ctx.fieldList().fieldExpression().stream() + .map(f -> (Field) internalVisitExpression(f)) + .toList(), + true); + } + + /** fillnull command - value= syntax: fillnull value= */ + @Override + public UnresolvedPlan visitFillNullValueAllFields( + OpenSearchPPLParser.FillNullValueAllFieldsContext ctx) { + return FillNull.ofSameValue(internalVisitExpression(ctx.replacement), List.of(), true); + } + @Override public UnresolvedPlan visitFlattenCommand(OpenSearchPPLParser.FlattenCommandContext ctx) { Field field = (Field) internalVisitExpression(ctx.fieldExpression()); @@ -684,6 +1148,104 @@ public UnresolvedPlan visitAppendcolCommand(OpenSearchPPLParser.AppendcolCommand return new AppendCol(override, subsearch.get()); } + @Override + public UnresolvedPlan visitRegexCommand(OpenSearchPPLParser.RegexCommandContext ctx) { + UnresolvedExpression field = internalVisitExpression(ctx.regexExpr().field); + boolean negated = ctx.regexExpr().operator.getType() == OpenSearchPPLParser.NOT_EQUAL; + Literal pattern = (Literal) internalVisitExpression(ctx.regexExpr().pattern); + + return new Regex(field, negated, pattern); + } + + @Override + public UnresolvedPlan visitAppendCommand(OpenSearchPPLParser.AppendCommandContext ctx) { + UnresolvedPlan searchCommandInSubSearch = + ctx.searchCommand() != null + ? visit(ctx.searchCommand()) + : EmptySourcePropagateVisitor + .EMPTY_SOURCE; // Represents 0 row * 0 col empty input syntax + UnresolvedPlan subsearch = + ctx.commands().stream() + .map(this::visit) + .reduce(searchCommandInSubSearch, (r, e) -> e.attach(r)); + + return new Append(subsearch); + } + + @Override + public UnresolvedPlan visitMultisearchCommand(OpenSearchPPLParser.MultisearchCommandContext ctx) { + List subsearches = new ArrayList<>(); + + // Process each subsearch + for (OpenSearchPPLParser.SubSearchContext subsearchCtx : ctx.subSearch()) { + // Use the existing visitSubSearch logic + UnresolvedPlan fullSubsearch = visitSubSearch(subsearchCtx); + subsearches.add(fullSubsearch); + } + + // Validate minimum number of subsearches + if (subsearches.size() < 2) { + throw new SyntaxCheckException( + "Multisearch command requires at least two subsearches. Provided: " + subsearches.size()); + } + + return new Multisearch(subsearches); + } + + @Override + public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx) { + UnresolvedExpression field = internalVisitExpression(ctx.rexExpr().field); + Literal pattern = (Literal) internalVisitExpression(ctx.rexExpr().pattern); + Rex.RexMode mode = Rex.RexMode.EXTRACT; + Optional maxMatch = Optional.empty(); + Optional offsetField = Optional.empty(); + + for (OpenSearchPPLParser.RexOptionContext optionCtx : ctx.rexExpr().rexOption()) { + if (optionCtx.maxMatch != null) { + maxMatch = Optional.of(Integer.parseInt(optionCtx.maxMatch.getText())); + } + if (optionCtx.EXTRACT() != null) { + mode = Rex.RexMode.EXTRACT; + } + if (optionCtx.SED() != null) { + mode = Rex.RexMode.SED; + } + if (optionCtx.offsetField != null) { + offsetField = Optional.of(optionCtx.offsetField.getText()); + } + } + + if (mode == Rex.RexMode.SED && offsetField.isPresent()) { + throw new IllegalArgumentException( + "Rex command: offset_field cannot be used with mode=sed. 
" + + "The offset_field option is only supported in extract mode."); + } + + int maxMatchLimit = + (settings != null) ? settings.getSettingValue(Settings.Key.PPL_REX_MAX_MATCH_LIMIT) : 10; + + int userMaxMatch = maxMatch.orElse(1); + int effectiveMaxMatch; + + if (userMaxMatch == 0) { + effectiveMaxMatch = maxMatchLimit; + } else if (userMaxMatch > maxMatchLimit) { + throw new IllegalArgumentException( + String.format( + "Rex command max_match value (%d) exceeds the configured limit (%d). " + + "Consider using a smaller max_match value" + + (settings != null + ? " or adjust the plugins.ppl.rex.max_match.limit setting." + : "."), + userMaxMatch, + maxMatchLimit)); + } else { + effectiveMaxMatch = userMaxMatch; + } + + return new Rex(field, pattern, mode, Optional.of(effectiveMaxMatch), offsetField); + } + /** Get original text in query. */ private String getTextInQuery(ParserRuleContext ctx) { Token start = ctx.getStart(); @@ -701,13 +1263,12 @@ private String getTextInQuery(ParserRuleContext ctx) { private UnresolvedPlan projectExceptMeta(UnresolvedPlan plan) { if ((plan instanceof Project) && !((Project) plan).isExcluded()) { return plan; - } else if (plan instanceof SubqueryAlias) { - SubqueryAlias subqueryAlias = (SubqueryAlias) plan; + } else if (plan instanceof SubqueryAlias subqueryAlias) { // don't wrap subquery alias with project, wrap its child return new SubqueryAlias( subqueryAlias.getAlias(), new Project(ImmutableList.of(AllFieldsExcludeMeta.of())) - .attach(subqueryAlias.getChild().get(0))); + .attach(subqueryAlias.getChild().getFirst())); } else { return new Project(ImmutableList.of(AllFieldsExcludeMeta.of())).attach(plan); } @@ -736,4 +1297,83 @@ private List getPartitionExprList(StatsByClauseContext ctx .ifPresent(partExprListBuilder::addAll); return partExprListBuilder.build(); } + + private boolean hasEnhancedFieldFeatures( + OpenSearchPPLParser.FieldsCommandBodyContext bodyCtx, List fields) { + if (hasActualWildcards(bodyCtx)) { + return true; + } + + return hasSpaceDelimitedFields(bodyCtx); + } + + private boolean hasSpaceDelimitedFields(OpenSearchPPLParser.FieldsCommandBodyContext bodyCtx) { + if (bodyCtx.wcFieldList() == null) { + return false; + } + + String fieldsText = getTextInQuery(bodyCtx.wcFieldList()); + + // If all fields are backtick-enclosed (like eval expressions), don't treat as enhanced + if (isAllFieldsBacktickEnclosed(bodyCtx)) { + return false; + } + + if (bodyCtx.wcFieldList().selectFieldExpression().size() > 1 && !fieldsText.contains(",")) { + return true; + } + + if (fieldsText.contains(",") && hasSpacesBetweenFields(fieldsText)) { + return true; + } + + return false; + } + + private boolean hasSpacesBetweenFields(String fieldsText) { + String[] parts = fieldsText.split(","); + for (String part : parts) { + String trimmed = part.trim(); + if (trimmed.contains(" ") && trimmed.split("\\s+").length > 1) { + // If the field is backtick-enclosed, it's likely an eval expression, not space-delimited + if (!trimmed.startsWith("`") || !trimmed.endsWith("`")) { + return true; + } + } + } + return false; + } + + private boolean isAllFieldsBacktickEnclosed( + OpenSearchPPLParser.FieldsCommandBodyContext bodyCtx) { + for (var fieldExpr : bodyCtx.wcFieldList().selectFieldExpression()) { + if (fieldExpr.wcQualifiedName() != null) { + String originalText = getTextInQuery(fieldExpr.wcQualifiedName()); + if (!originalText.startsWith("`") || !originalText.endsWith("`")) { + return false; + } + } + } + return true; + } + + private boolean 
hasActualWildcards(OpenSearchPPLParser.FieldsCommandBodyContext bodyCtx) { + if (bodyCtx.wcFieldList() == null) { + return false; + } + + for (var fieldExpr : bodyCtx.wcFieldList().selectFieldExpression()) { + if (fieldExpr.STAR() != null) { + return true; + } + + if (fieldExpr.wcQualifiedName() != null) { + String originalText = getTextInQuery(fieldExpr.wcQualifiedName()); + if (originalText.contains("*") && !originalText.contains("`")) { + return true; + } + } + } + return false; + } } diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java index 40c220fece8..acf204e8030 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java @@ -5,23 +5,31 @@ package org.opensearch.sql.ppl.utils; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.BooleanLiteralContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DedupCommandContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.FieldsCommandContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.IntegerLiteralContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.RareCommandContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SortFieldContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.StatsCommandContext; -import static org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.TopCommandContext; - +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Optional; import org.antlr.v4.runtime.ParserRuleContext; import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.DataType; import org.opensearch.sql.ast.expression.Literal; +import org.opensearch.sql.ast.tree.Join; +import org.opensearch.sql.ast.tree.RareTopN; +import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.common.utils.StringUtils; +import org.opensearch.sql.exception.SemanticCheckException; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.BooleanLiteralContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DecimalLiteralContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DedupCommandContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.DefaultSortFieldContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.FieldsCommandContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.IntegerLiteralContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.PrefixSortFieldContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SortFieldContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.StreamstatsCommandContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SuffixSortFieldContext; /** Util class to get all arguments as a list from the PPL command. */ public class ArgumentFactory { @@ -34,7 +42,7 @@ public class ArgumentFactory { */ public static List<Argument> getArgumentList(FieldsCommandContext ctx) { return Collections.singletonList( - ctx.MINUS() != null + ctx.fieldsCommandBody().MINUS() != null ?
new Argument("exclude", new Literal(true, DataType.BOOLEAN)) : new Argument("exclude", new Literal(false, DataType.BOOLEAN))); } @@ -45,20 +53,60 @@ public static List getArgumentList(FieldsCommandContext ctx) { * @param ctx StatsCommandContext instance * @return the list of arguments fetched from the stats command */ - public static List getArgumentList(StatsCommandContext ctx) { + public static List getArgumentList( + OpenSearchPPLParser.StatsCommandContext ctx, Settings settings) { + OpenSearchPPLParser.StatsArgsContext ctx1 = ctx.statsArgs(); + OpenSearchPPLParser.DedupSplitArgContext ctx2 = ctx.dedupSplitArg(); + List list = + new ArrayList<>( + Arrays.asList( + ctx1.partitionsArg() != null && !ctx1.partitionsArg().isEmpty() + ? new Argument("partitions", getArgumentValue(ctx1.partitionsArg(0).partitions)) + : new Argument("partitions", Literal.ONE), + ctx1.allnumArg() != null && !ctx1.allnumArg().isEmpty() + ? new Argument("allnum", getArgumentValue(ctx1.allnumArg(0).allnum)) + : new Argument("allnum", Literal.FALSE), + ctx1.delimArg() != null && !ctx1.delimArg().isEmpty() + ? new Argument("delim", getArgumentValue(ctx1.delimArg(0).delim)) + : new Argument("delim", new Literal(" ", DataType.STRING)), + ctx1.bucketNullableArg() != null && !ctx1.bucketNullableArg().isEmpty() + ? new Argument( + Argument.BUCKET_NULLABLE, + getArgumentValue(ctx1.bucketNullableArg(0).bucket_nullable)) + : new Argument( + Argument.BUCKET_NULLABLE, + legacyPreferred(settings) ? Literal.TRUE : Literal.FALSE))); + if (ctx2 != null) { + list.add(new Argument("dedupsplit", getArgumentValue(ctx2.dedupsplit))); + } else { + list.add(new Argument("dedupsplit", Literal.FALSE)); + } + return list; + } + + private static boolean legacyPreferred(Settings settings) { + return settings == null + || settings.getSettingValue(Settings.Key.PPL_SYNTAX_LEGACY_PREFERRED) == null + || Boolean.TRUE.equals(settings.getSettingValue(Settings.Key.PPL_SYNTAX_LEGACY_PREFERRED)); + } + + /** + * Get list of {@link Argument}. + * + * @param ctx StreamstatsCommandContext instance + * @return the list of arguments fetched from the streamstats command + */ + public static List getArgumentList(StreamstatsCommandContext ctx) { return Arrays.asList( - ctx.partitions != null - ? new Argument("partitions", getArgumentValue(ctx.partitions)) - : new Argument("partitions", new Literal(1, DataType.INTEGER)), - ctx.allnum != null - ? new Argument("allnum", getArgumentValue(ctx.allnum)) - : new Argument("allnum", new Literal(false, DataType.BOOLEAN)), - ctx.delim != null - ? new Argument("delim", getArgumentValue(ctx.delim)) - : new Argument("delim", new Literal(" ", DataType.STRING)), - ctx.dedupsplit != null - ? new Argument("dedupsplit", getArgumentValue(ctx.dedupsplit)) - : new Argument("dedupsplit", new Literal(false, DataType.BOOLEAN))); + ctx.streamstatsArgs().currentArg() != null && !ctx.streamstatsArgs().currentArg().isEmpty() + ? new Argument("current", getArgumentValue(ctx.streamstatsArgs().currentArg(0).current)) + : new Argument("current", new Literal(true, DataType.BOOLEAN)), + ctx.streamstatsArgs().windowArg() != null && !ctx.streamstatsArgs().windowArg().isEmpty() + ? new Argument("window", getArgumentValue(ctx.streamstatsArgs().windowArg(0).window)) + : new Argument("window", new Literal(0, DataType.INTEGER)), + ctx.streamstatsArgs().globalArg() != null && !ctx.streamstatsArgs().globalArg().isEmpty() + ? 
new Argument("global", getArgumentValue(ctx.streamstatsArgs().globalArg(0).global)) + : new Argument("global", new Literal(true, DataType.BOOLEAN))); } /** @@ -87,57 +135,101 @@ public static List getArgumentList(DedupCommandContext ctx) { * @return the list of arguments fetched from the sort field in sort command */ public static List getArgumentList(SortFieldContext ctx) { + if (ctx instanceof PrefixSortFieldContext) { + return getArgumentList((PrefixSortFieldContext) ctx); + } else if (ctx instanceof SuffixSortFieldContext) { + return getArgumentList((SuffixSortFieldContext) ctx); + } else { + return getArgumentList((DefaultSortFieldContext) ctx); + } + } + + /** + * Get list of {@link Argument} for prefix sort field (+/- syntax). + * + * @param ctx PrefixSortFieldContext instance + * @return the list of arguments fetched from the prefix sort field + */ + public static List getArgumentList(PrefixSortFieldContext ctx) { return Arrays.asList( ctx.MINUS() != null ? new Argument("asc", new Literal(false, DataType.BOOLEAN)) : new Argument("asc", new Literal(true, DataType.BOOLEAN)), - ctx.sortFieldExpression().AUTO() != null - ? new Argument("type", new Literal("auto", DataType.STRING)) - : ctx.sortFieldExpression().IP() != null - ? new Argument("type", new Literal("ip", DataType.STRING)) - : ctx.sortFieldExpression().NUM() != null - ? new Argument("type", new Literal("num", DataType.STRING)) - : ctx.sortFieldExpression().STR() != null - ? new Argument("type", new Literal("str", DataType.STRING)) - : new Argument("type", new Literal(null, DataType.NULL))); + getTypeArgument(ctx.sortFieldExpression())); } /** - * Get list of {@link Argument}. + * Get list of {@link Argument} for suffix sort field (asc/desc syntax). * - * @param ctx TopCommandContext instance - * @return the list of arguments fetched from the top command + * @param ctx SuffixSortFieldContext instance + * @return the list of arguments fetched from the suffix sort field */ - public static List getArgumentList(TopCommandContext ctx) { + public static List getArgumentList(SuffixSortFieldContext ctx) { return Arrays.asList( - ctx.number != null - ? new Argument("noOfResults", getArgumentValue(ctx.number)) - : new Argument("noOfResults", new Literal(10, DataType.INTEGER)), - ctx.countfield != null - ? new Argument("countField", getArgumentValue(ctx.countfield)) - : new Argument("countField", new Literal("count", DataType.STRING)), - ctx.showcount != null - ? new Argument("showCount", getArgumentValue(ctx.showcount)) - : new Argument("showCount", new Literal(true, DataType.BOOLEAN))); + (ctx.DESC() != null || ctx.D() != null) + ? new Argument("asc", new Literal(false, DataType.BOOLEAN)) + : new Argument("asc", new Literal(true, DataType.BOOLEAN)), + getTypeArgument(ctx.sortFieldExpression())); + } + + /** + * Get list of {@link Argument} for default sort field (no direction specified). + * + * @param ctx DefaultSortFieldContext instance + * @return the list of arguments fetched from the default sort field + */ + public static List getArgumentList(DefaultSortFieldContext ctx) { + return Arrays.asList( + new Argument("asc", new Literal(true, DataType.BOOLEAN)), + getTypeArgument(ctx.sortFieldExpression())); + } + + /** Helper method to get type argument from sortFieldExpression. 
+ /** Helper method to get the type argument from a sortFieldExpression. */ + private static Argument getTypeArgument(OpenSearchPPLParser.SortFieldExpressionContext ctx) { + if (ctx.AUTO() != null) { + return new Argument("type", new Literal("auto", DataType.STRING)); + } else if (ctx.IP() != null) { + return new Argument("type", new Literal("ip", DataType.STRING)); + } else if (ctx.NUM() != null) { + return new Argument("type", new Literal("num", DataType.STRING)); + } else if (ctx.STR() != null) { + return new Argument("type", new Literal("str", DataType.STRING)); + } else { + return new Argument("type", new Literal(null, DataType.NULL)); + } } /** * Get list of {@link Argument}. * * @param ctx RareCommandContext instance + * @param settings Settings instance * @return the list of argument with default number of results for the rare command */ - public static List<Argument> getArgumentList(RareCommandContext ctx) { - return Arrays.asList( - ctx.number != null - ? new Argument("noOfResults", getArgumentValue(ctx.number)) - : new Argument("noOfResults", new Literal(10, DataType.INTEGER)), - ctx.countfield != null - ? new Argument("countField", getArgumentValue(ctx.countfield)) - : new Argument("countField", new Literal("count", DataType.STRING)), - ctx.showcount != null - ? new Argument("showCount", getArgumentValue(ctx.showcount)) - : new Argument("showCount", new Literal(true, DataType.BOOLEAN))); + public static List<Argument> getArgumentList( + OpenSearchPPLParser.RareTopCommandContext ctx, Settings settings) { + List<Argument> list = new ArrayList<>(); + Optional<OpenSearchPPLParser.RareTopOptionContext> opt = + ctx.rareTopOption().stream().filter(op -> op.countField != null).findFirst(); + list.add( + new Argument( + RareTopN.Option.countField.name(), + opt.isPresent() + ? getArgumentValue(opt.get().countField) + : new Literal("count", DataType.STRING))); + opt = ctx.rareTopOption().stream().filter(op -> op.showCount != null).findFirst(); + list.add( + new Argument( + RareTopN.Option.showCount.name(), + opt.isPresent() ? getArgumentValue(opt.get().showCount) : Literal.TRUE)); + opt = ctx.rareTopOption().stream().filter(op -> op.useNull != null).findFirst(); + list.add( + new Argument( + RareTopN.Option.useNull.name(), + opt.isPresent() + ? getArgumentValue(opt.get().useNull) + : legacyPreferred(settings) ? Literal.TRUE : Literal.FALSE)); + return list; } /** @@ -147,10 +239,76 @@ public static List<Argument> getArgumentList(RareCommandContext ctx) { * @return Literal */ private static Literal getArgumentValue(ParserRuleContext ctx) { - return ctx instanceof IntegerLiteralContext - ? new Literal(Integer.parseInt(ctx.getText()), DataType.INTEGER) - : ctx instanceof BooleanLiteralContext - ? new Literal(Boolean.valueOf(ctx.getText()), DataType.BOOLEAN) - : new Literal(StringUtils.unquoteText(ctx.getText()), DataType.STRING); + if (ctx instanceof IntegerLiteralContext) { + return new Literal(Integer.parseInt(ctx.getText()), DataType.INTEGER); + } else if (ctx instanceof DecimalLiteralContext) { + return new Literal(Double.parseDouble(ctx.getText()), DataType.DOUBLE); + } else if (ctx instanceof BooleanLiteralContext) { + return new Literal(Boolean.valueOf(ctx.getText()), DataType.BOOLEAN); + } else { + return new Literal(StringUtils.unquoteText(ctx.getText()), DataType.STRING); + } + }
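For reference, the literal coercion above maps each parser context to a typed Literal, falling back to an unquoted string. A minimal sketch of the four outcomes, using only the constructors shown in this patch:

    import org.opensearch.sql.ast.expression.DataType;
    import org.opensearch.sql.ast.expression.Literal;

    public class ArgumentLiteralSketch {
      public static void main(String[] args) {
        // IntegerLiteralContext -> INTEGER, DecimalLiteralContext -> DOUBLE,
        // BooleanLiteralContext -> BOOLEAN, anything else -> unquoted STRING.
        Literal i = new Literal(Integer.parseInt("5"), DataType.INTEGER);
        Literal d = new Literal(Double.parseDouble("0.5"), DataType.DOUBLE);
        Literal b = new Literal(Boolean.valueOf("true"), DataType.BOOLEAN);
        Literal s = new Literal("count", DataType.STRING);
        System.out.println(i + " " + d + " " + b + " " + s);
      }
    }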
+ /** + * Parse the join type option into an {@link Argument}. + * + * @param ctx JoinTypeContext instance + * @return Argument holding the join type name + */ + public static Argument getArgumentValue(OpenSearchPPLParser.JoinTypeContext ctx) { + Join.JoinType type = getJoinType(ctx); + return new Argument("type", new Literal(type.name(), DataType.STRING)); + } + + public static Join.JoinType getJoinType(OpenSearchPPLParser.SqlLikeJoinTypeContext ctx) { + if (ctx == null) return Join.JoinType.INNER; + if (ctx.INNER() != null) return Join.JoinType.INNER; + if (ctx.SEMI() != null) return Join.JoinType.SEMI; + if (ctx.ANTI() != null) return Join.JoinType.ANTI; + if (ctx.LEFT() != null) return Join.JoinType.LEFT; + if (ctx.RIGHT() != null) return Join.JoinType.RIGHT; + if (ctx.CROSS() != null) return Join.JoinType.CROSS; + if (ctx.FULL() != null) return Join.JoinType.FULL; + if (ctx.OUTER() != null) return Join.JoinType.LEFT; + throw new SemanticCheckException(String.format("Unsupported join type %s", ctx.getText())); + } + + public static Join.JoinType getJoinType(OpenSearchPPLParser.JoinTypeContext ctx) { + if (ctx == null) return Join.JoinType.INNER; + if (ctx.INNER() != null) return Join.JoinType.INNER; + if (ctx.SEMI() != null) return Join.JoinType.SEMI; + if (ctx.ANTI() != null) return Join.JoinType.ANTI; + if (ctx.LEFT() != null) return Join.JoinType.LEFT; + if (ctx.RIGHT() != null) return Join.JoinType.RIGHT; + if (ctx.CROSS() != null) return Join.JoinType.CROSS; + if (ctx.FULL() != null) return Join.JoinType.FULL; + if (ctx.OUTER() != null) return Join.JoinType.LEFT; + throw new SemanticCheckException(String.format("Unsupported join type %s", ctx.getText())); + } + + public static Join.JoinType getJoinType(Argument.ArgumentMap argumentMap) { + Join.JoinType joinType; + String type = argumentMap.get("type").toString(); + if (type.equalsIgnoreCase(Join.JoinType.INNER.name())) { + joinType = Join.JoinType.INNER; + } else if (type.equalsIgnoreCase(Join.JoinType.SEMI.name())) { + joinType = Join.JoinType.SEMI; + } else if (type.equalsIgnoreCase(Join.JoinType.ANTI.name())) { + joinType = Join.JoinType.ANTI; + } else if (type.equalsIgnoreCase(Join.JoinType.LEFT.name())) { + joinType = Join.JoinType.LEFT; + } else if (type.equalsIgnoreCase(Join.JoinType.RIGHT.name())) { + joinType = Join.JoinType.RIGHT; + } else if (type.equalsIgnoreCase(Join.JoinType.CROSS.name())) { + joinType = Join.JoinType.CROSS; + } else if (type.equalsIgnoreCase(Join.JoinType.FULL.name())) { + joinType = Join.JoinType.FULL; + } else if (type.equalsIgnoreCase("OUTER")) { + joinType = Join.JoinType.LEFT; + } else { + throw new SemanticCheckException(String.format("Unsupported join type %s", type)); + } + return joinType; + } } diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index 2d7143fcb36..5b599ae162c 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -53,8 +53,12 @@ import org.opensearch.sql.ast.statement.Query; import org.opensearch.sql.ast.statement.Statement; import org.opensearch.sql.ast.tree.Aggregation; +import org.opensearch.sql.ast.tree.Append; import org.opensearch.sql.ast.tree.AppendCol; +import org.opensearch.sql.ast.tree.Bin; +import org.opensearch.sql.ast.tree.CountBin; import org.opensearch.sql.ast.tree.Dedupe; +import org.opensearch.sql.ast.tree.DefaultBin; import org.opensearch.sql.ast.tree.DescribeRelation; import org.opensearch.sql.ast.tree.Eval; import
org.opensearch.sql.ast.tree.Expand; @@ -64,17 +68,30 @@ import org.opensearch.sql.ast.tree.Head; import org.opensearch.sql.ast.tree.Join; import org.opensearch.sql.ast.tree.Lookup; +import org.opensearch.sql.ast.tree.MinSpanBin; +import org.opensearch.sql.ast.tree.Multisearch; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; import org.opensearch.sql.ast.tree.Project; +import org.opensearch.sql.ast.tree.RangeBin; import org.opensearch.sql.ast.tree.RareTopN; +import org.opensearch.sql.ast.tree.Regex; import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; +import org.opensearch.sql.ast.tree.Replace; +import org.opensearch.sql.ast.tree.Reverse; +import org.opensearch.sql.ast.tree.Rex; +import org.opensearch.sql.ast.tree.SPath; +import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; +import org.opensearch.sql.ast.tree.SpanBin; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; +import org.opensearch.sql.ast.tree.Timechart; import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; +import org.opensearch.sql.ast.tree.Values; import org.opensearch.sql.ast.tree.Window; import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.common.utils.StringUtils; @@ -92,6 +109,10 @@ public class PPLQueryDataAnonymizer extends AbstractNodeVisitor private static final String MASK_LITERAL = "***"; + private static final String MASK_COLUMN = "identifier"; + + private static final String MASK_TABLE = "table"; + private final AnonymizerExpressionAnalyzer expressionAnalyzer; private final Settings settings; @@ -102,7 +123,7 @@ public PPLQueryDataAnonymizer(Settings settings) { /** * This method is used to anonymize sensitive data in PPL query. Sensitive data includes user - * data., + * data. * * @return ppl query string with all user data replace with "***" */ @@ -130,12 +151,9 @@ public String visitExplain(Explain node, String context) { @Override public String visitRelation(Relation node, String context) { if (node instanceof DescribeRelation) { - // remove the system table suffix - String systemTable = node.getTableQualifiedName().toString(); - return StringUtils.format( - "describe %s", systemTable.substring(0, systemTable.lastIndexOf('.'))); + return StringUtils.format("describe %s", MASK_TABLE); } - return StringUtils.format("source=%s", node.getTableQualifiedName().toString()); + return StringUtils.format("source=%s", MASK_TABLE); } @Override @@ -146,19 +164,46 @@ public String visitJoin(Join node, String context) { rightTableOrSubquery.startsWith("source=") ? rightTableOrSubquery.substring("source=".length()) : rightTableOrSubquery; - String joinType = node.getJoinType().name().toLowerCase(Locale.ROOT); - String leftAlias = node.getLeftAlias().map(l -> " left = " + l).orElse(""); - String rightAlias = node.getRightAlias().map(r -> " right = " + r).orElse(""); - String condition = - node.getJoinCondition().map(c -> expressionAnalyzer.analyze(c, context)).orElse("true"); - return StringUtils.format( - "%s | %s join%s%s on %s %s", left, joinType, leftAlias, rightAlias, condition, right); + Argument.ArgumentMap argumentMap = node.getArgumentMap(); + String max = + argumentMap.get("max") == null + ? 
"0" + : argumentMap.get("max").toString().toLowerCase(Locale.ROOT); + if (node.getJoinCondition().isEmpty()) { + String joinType = + argumentMap.get("type") == null + ? "inner" + : argumentMap.get("type").toString().toLowerCase(Locale.ROOT); + String overwrite = + argumentMap.get("overwrite") == null + ? "true" + : argumentMap.get("overwrite").toString().toLowerCase(Locale.ROOT); + String fieldList = + node.getJoinFields().isEmpty() + ? "" + : String.join( + ",", + node.getJoinFields().get().stream() + .map(c -> expressionAnalyzer.analyze(c, context)) + .toList()); + return StringUtils.format( + "%s | join type=%s overwrite=%s max=%s %s %s", + left, joinType, MASK_LITERAL, MASK_LITERAL, fieldList, right); + } else { + String joinType = node.getJoinType().name().toLowerCase(Locale.ROOT); + String leftAlias = node.getLeftAlias().map(l -> " left = " + MASK_COLUMN).orElse(""); + String rightAlias = node.getRightAlias().map(r -> " right = " + MASK_COLUMN).orElse(""); + String condition = + node.getJoinCondition().map(c -> expressionAnalyzer.analyze(c, context)).orElse("true"); + return StringUtils.format( + "%s | %s join max=%s%s%s on %s %s", + left, joinType, MASK_LITERAL, leftAlias, rightAlias, condition, right); + } } @Override public String visitLookup(Lookup node, String context) { String child = node.getChild().get(0).accept(this, context); - String lookupTable = ((Relation) node.getLookupRelation()).getTableQualifiedName().toString(); String mappingFields = formatFieldAlias(node.getMappingAliasMap()); String strategy = node.getOutputAliasMap().isEmpty() @@ -166,7 +211,7 @@ public String visitLookup(Lookup node, String context) { : String.format(" %s ", node.getOutputStrategy().toString().toLowerCase()); String outputFields = formatFieldAlias(node.getOutputAliasMap()); return StringUtils.format( - "%s | lookup %s %s%s%s", child, lookupTable, mappingFields, strategy, outputFields); + "%s | lookup %s %s%s%s", child, MASK_TABLE, mappingFields, strategy, outputFields); } private String formatFieldAlias(java.util.Map fieldMap) { @@ -183,15 +228,13 @@ private String formatFieldAlias(java.util.Map fieldMap) { public String visitSubqueryAlias(SubqueryAlias node, String context) { Node childNode = node.getChild().get(0); String child = childNode.accept(this, context); - if (childNode instanceof Project) { - Project project = (Project) childNode; - if (project.getProjectList().get(0) instanceof AllFields) { - childNode = childNode.getChild().get(0); - } + if (childNode instanceof Project project + && project.getProjectList().get(0) instanceof AllFields) { + childNode = childNode.getChild().get(0); } // add "[]" only if its child is not a root String format = childNode.getChild().isEmpty() ? 
"%s as %s" : "[ %s ] as %s"; - return StringUtils.format(format, child, node.getAlias()); + return StringUtils.format(format, child, MASK_COLUMN); } @Override @@ -205,6 +248,14 @@ public String visitTableFunction(TableFunction node, String context) { return StringUtils.format("source=%s(%s)", node.getFunctionName().toString(), arguments); } + @Override + public String visitSearch(Search node, String context) { + String source = node.getChild().get(0).accept(this, context); + String queryString = node.getQueryString(); + String anonymized = queryString.replaceAll(":\\S+", ":" + MASK_LITERAL); + return StringUtils.format("%s %s", source, anonymized); + } + @Override public String visitFilter(Filter node, String context) { String child = node.getChild().get(0).accept(this, context); @@ -223,12 +274,36 @@ public String visitRename(Rename node, String context) { ((Field) renameMap.getTarget()).getField().toString()); } String renames = - renameMapBuilder.build().entrySet().stream() - .map(entry -> StringUtils.format("%s as %s", entry.getKey(), entry.getValue())) + node.getRenameList().stream() + .map(entry -> StringUtils.format("%s as %s", MASK_COLUMN, MASK_COLUMN)) .collect(Collectors.joining(",")); return StringUtils.format("%s | rename %s", child, renames); } + @Override + public String visitReplace(Replace node, String context) { + // Get the child query string + String child = node.getChild().get(0).accept(this, context); + + // Build pattern/replacement pairs string + String pairs = + node.getReplacePairs().stream() + .map( + pair -> + StringUtils.format( + "%s WITH %s", + visitExpression(pair.getPattern()), visitExpression(pair.getReplacement()))) + .collect(Collectors.joining(", ")); + + // Get field list + String fieldListStr = + " IN " + + node.getFieldList().stream().map(Field::toString).collect(Collectors.joining(", ")); + + // Build the replace command string + return StringUtils.format("%s | replace %s%s", child, pairs, fieldListStr); + } + /** Build {@link LogicalAggregation}. 
*/ @Override public String visitAggregation(Aggregation node, String context) { @@ -245,6 +320,56 @@ public String visitAggregation(Aggregation node, String context) { child, String.join(" ", visitExpressionList(node.getAggExprList()), groupBy(group)).trim()); } + @Override + public String visitBin(Bin node, String context) { + String child = node.getChild().get(0).accept(this, context); + StringBuilder binCommand = new StringBuilder(); + binCommand.append(" | bin ").append(visitExpression(node.getField())); + + // Use instanceof for type-safe dispatch to access subclass-specific properties + if (node instanceof SpanBin) { + SpanBin spanBin = (SpanBin) node; + binCommand.append(" span=").append(visitExpression(spanBin.getSpan())); + if (spanBin.getAligntime() != null) { + binCommand.append(" aligntime=").append(visitExpression(spanBin.getAligntime())); + } + } else if (node instanceof MinSpanBin) { + MinSpanBin minSpanBin = (MinSpanBin) node; + binCommand.append(" minspan=").append(visitExpression(minSpanBin.getMinspan())); + if (minSpanBin.getStart() != null) { + binCommand.append(" start=").append(visitExpression(minSpanBin.getStart())); + } + if (minSpanBin.getEnd() != null) { + binCommand.append(" end=").append(visitExpression(minSpanBin.getEnd())); + } + } else if (node instanceof CountBin) { + CountBin countBin = (CountBin) node; + binCommand.append(" bins=").append(MASK_LITERAL); + if (countBin.getStart() != null) { + binCommand.append(" start=").append(visitExpression(countBin.getStart())); + } + if (countBin.getEnd() != null) { + binCommand.append(" end=").append(visitExpression(countBin.getEnd())); + } + } else if (node instanceof RangeBin) { + RangeBin rangeBin = (RangeBin) node; + if (rangeBin.getStart() != null) { + binCommand.append(" start=").append(visitExpression(rangeBin.getStart())); + } + if (rangeBin.getEnd() != null) { + binCommand.append(" end=").append(visitExpression(rangeBin.getEnd())); + } + } else if (node instanceof DefaultBin) { + // DefaultBin has no additional parameters + } + + if (node.getAlias() != null) { + binCommand.append(" as ").append(MASK_COLUMN); + } + + return StringUtils.format("%s%s", child, binCommand.toString()); + } + @Override public String visitWindow(Window node, String context) { String child = node.getChild().get(0).accept(this, context); @@ -253,19 +378,29 @@ public String visitWindow(Window node, String context) { child, String.join(" ", visitExpressionList(node.getWindowFunctionList())).trim()); } + @Override + public String visitStreamWindow(StreamWindow node, String context) { + String child = node.getChild().get(0).accept(this, context); + return StringUtils.format( + "%s | streamstats %s", + child, String.join(" ", visitExpressionList(node.getWindowFunctionList())).trim()); + } + /** Build {@link LogicalRareTopN}. 
*/ @Override public String visitRareTopN(RareTopN node, String context) { final String child = node.getChild().get(0).accept(this, context); ArgumentMap arguments = ArgumentMap.of(node.getArguments()); - Integer noOfResults = (Integer) arguments.get("noOfResults").getValue(); - String countField = (String) arguments.get("countField").getValue(); - Boolean showCount = (Boolean) arguments.get("showCount").getValue(); + Integer noOfResults = node.getNoOfResults(); + String countField = (String) arguments.get(RareTopN.Option.countField.name()).getValue(); + Boolean showCount = (Boolean) arguments.get(RareTopN.Option.showCount.name()).getValue(); + Boolean useNull = (Boolean) arguments.get(RareTopN.Option.useNull.name()).getValue(); String fields = visitFieldList(node.getFields()); String group = visitExpressionList(node.getGroupExprList()); String options = isCalciteEnabled(settings) - ? StringUtils.format("countield='%s' showcount=%s ", countField, showCount) + ? StringUtils.format( + "countield='%s' showcount=%s usenull=%s ", countField, showCount, useNull) : ""; return StringUtils.format( "%s | %s %d %s%s", @@ -309,14 +444,14 @@ public String visitEval(Eval node, String context) { } String expressions = expressionsBuilder.build().stream() - .map(pair -> StringUtils.format("%s" + "=%s", pair.getLeft(), pair.getRight())) + .map(pair -> StringUtils.format("%s" + "=%s", MASK_COLUMN, pair.getRight())) .collect(Collectors.joining(" ")); return StringUtils.format("%s | eval %s", child, expressions); } @Override public String visitExpand(Expand node, String context) { - String child = node.getChild().get(0).accept(this, context); + String child = node.getChild().getFirst().accept(this, context); String field = visitExpression(node.getField()); return StringUtils.format("%s | expand %s", child, field); @@ -327,8 +462,13 @@ public String visitExpand(Expand node, String context) { public String visitSort(Sort node, String context) { String child = node.getChild().get(0).accept(this, context); // the first options is {"count": "integer"} + Integer count = node.getCount(); String sortList = visitFieldList(node.getSortList()); - return StringUtils.format("%s | sort %s", child, sortList); + if (count != 0) { + return StringUtils.format("%s | sort %d %s", child, count, sortList); + } else { + return StringUtils.format("%s | sort %s", child, sortList); + } } /** Build {@link LogicalDedupe}. 
*/ @@ -353,6 +493,66 @@ public String visitHead(Head node, String context) { return StringUtils.format("%s | head %d", child, size); } + @Override + public String visitReverse(Reverse node, String context) { + String child = node.getChild().get(0).accept(this, context); + return StringUtils.format("%s | reverse", child); + } + + @Override + public String visitTimechart(Timechart node, String context) { + String child = node.getChild().get(0).accept(this, context); + StringBuilder timechartCommand = new StringBuilder(); + timechartCommand.append(" | timechart"); + + // Add span if present + if (node.getBinExpression() != null) { + timechartCommand.append(" span=").append(visitExpression(node.getBinExpression())); + } + + // Add limit if present + if (node.getLimit() != null) { + timechartCommand.append(" limit=").append(node.getLimit()); + } + + // Add useother if present + if (node.getUseOther() != null) { + timechartCommand.append(" useother=").append(node.getUseOther()); + } + + // Add aggregation function + timechartCommand.append(" ").append(visitExpression(node.getAggregateFunction())); + + // Add by clause if present + if (node.getByField() != null) { + timechartCommand.append(" by ").append(visitExpression(node.getByField())); + } + + return StringUtils.format("%s%s", child, timechartCommand.toString()); + } + + public String visitRex(Rex node, String context) { + String child = node.getChild().get(0).accept(this, context); + String field = visitExpression(node.getField()); + String pattern = "\"" + MASK_LITERAL + "\""; + StringBuilder command = new StringBuilder(); + + command.append( + String.format( + "%s | rex field=%s mode=%s %s", + child, field, node.getMode().toString().toLowerCase(), pattern)); + + if (node.getMaxMatch().isPresent()) { + command.append(" max_match=").append(MASK_LITERAL); + } + + if (node.getOffsetField().isPresent()) { + command.append(" offset_field=").append(MASK_COLUMN); + } + + return command.toString(); + } + @Override public String visitParse(Parse node, String context) { String child = node.getChild().get(0).accept(this, context); @@ -361,10 +561,10 @@ public String visitParse(Parse node, String context) { String commandName; switch (node.getParseMethod()) { - case PATTERNS: + case ParseMethod.PATTERNS: commandName = "patterns"; break; - case GROK: + case ParseMethod.GROK: commandName = "grok"; break; default: @@ -373,12 +573,22 @@ public String visitParse(Parse node, String context) { } return ParseMethod.PATTERNS.equals(node.getParseMethod()) && regex.isEmpty() ? StringUtils.format("%s | %s %s", child, commandName, source) - : StringUtils.format("%s | %s %s '%s'", child, commandName, source, regex); + : StringUtils.format("%s | %s %s '%s'", child, commandName, source, MASK_LITERAL); } @Override - public String visitFlatten(Flatten node, String context) { + public String visitRegex(Regex node, String context) { String child = node.getChild().get(0).accept(this, context); + String operator = node.isNegated() ? 
Regex.NOT_EQUALS_OPERATOR : Regex.EQUALS_OPERATOR; + String pattern = MASK_LITERAL; + + String field = visitExpression(node.getField()); + return StringUtils.format("%s | regex %s%s%s", child, field, operator, pattern); + } + + @Override + public String visitFlatten(Flatten node, String context) { + String child = node.getChild().getFirst().accept(this, context); String field = visitExpression(node.getField()); return StringUtils.format("%s | flatten %s", child, field); } @@ -401,6 +611,50 @@ public String visitAppendCol(AppendCol node, String context) { "%s | appendcol override=%s [%s ]", child, node.isOverride(), subsearchWithoutRelation); } + @Override + public String visitAppend(Append node, String context) { + String child = node.getChild().get(0).accept(this, context); + String subsearch = anonymizeData(node.getSubSearch()); + return StringUtils.format("%s | append [%s ]", child, subsearch); + } + + @Override + public String visitMultisearch(Multisearch node, String context) { + List anonymizedSubsearches = new ArrayList<>(); + + for (UnresolvedPlan subsearch : node.getSubsearches()) { + String anonymizedSubsearch = anonymizeData(subsearch); + anonymizedSubsearch = "search " + anonymizedSubsearch; + anonymizedSubsearch = + anonymizedSubsearch + .replaceAll("\\bsource=\\w+", "source=table") // Replace table names after source= + .replaceAll( + "\\b(?!source|fields|where|stats|head|tail|sort|eval|rename|multisearch|search|table|identifier|\\*\\*\\*)\\w+(?=\\s*[<>=!])", + "identifier") // Replace field names before operators + .replaceAll( + "\\b(?!source|fields|where|stats|head|tail|sort|eval|rename|multisearch|search|table|identifier|\\*\\*\\*)\\w+(?=\\s*,)", + "identifier") // Replace field names before commas + .replaceAll( + "fields" + + " \\+\\s*\\b(?!source|fields|where|stats|head|tail|sort|eval|rename|multisearch|search|table|identifier|\\*\\*\\*)\\w+", + "fields + identifier") // Replace field names after 'fields +' + .replaceAll( + "fields" + + " \\+\\s*identifier,\\s*\\b(?!source|fields|where|stats|head|tail|sort|eval|rename|multisearch|search|table|identifier|\\*\\*\\*)\\w+", + "fields + identifier,identifier"); // Handle multiple fields + anonymizedSubsearches.add(StringUtils.format("[%s]", anonymizedSubsearch)); + } + + return StringUtils.format("| multisearch %s", String.join(" ", anonymizedSubsearches)); + } + + @Override + public String visitValues(Values node, String context) { + // In case legacy SQL relies on it, return empty to fail open anyway. + // Don't expect it to fail the query execution. 
+ return ""; + } + private String visitFieldList(List fieldList) { return fieldList.stream().map(this::visitExpression).collect(Collectors.joining(",")); } @@ -424,28 +678,63 @@ private String visitExpression(UnresolvedExpression expression) { public String visitFillNull(FillNull node, String context) { String child = node.getChild().get(0).accept(this, context); List> fieldFills = node.getReplacementPairs(); + + // Check if using value= syntax (added in 3.4) + if (node.isUseValueSyntax()) { + if (fieldFills.isEmpty()) { + return StringUtils.format("%s | fillnull value=%s", child, MASK_LITERAL); + } + return StringUtils.format( + "%s | fillnull value=%s %s", + child, + MASK_LITERAL, + fieldFills.stream() + .map(n -> visitExpression(n.getLeft())) + .collect(Collectors.joining(" "))); + } + + // Distinguish between with...in and using based on whether all values are the same if (fieldFills.isEmpty()) { return StringUtils.format("%s | fillnull with %s", child, MASK_LITERAL); } - final UnresolvedExpression firstReplacement = fieldFills.get(0).getRight(); + final UnresolvedExpression firstReplacement = fieldFills.getFirst().getRight(); if (fieldFills.stream().allMatch(n -> firstReplacement == n.getRight())) { + // All fields use same replacement value -> with...in syntax return StringUtils.format( "%s | fillnull with %s in %s", child, MASK_LITERAL, - node.getReplacementPairs().stream() + fieldFills.stream() .map(n -> visitExpression(n.getLeft())) .collect(Collectors.joining(", "))); } else { + // Different replacement values per field -> using syntax return StringUtils.format( "%s | fillnull using %s", child, - node.getReplacementPairs().stream() + fieldFills.stream() .map(n -> StringUtils.format("%s = %s", visitExpression(n.getLeft()), MASK_LITERAL)) .collect(Collectors.joining(", "))); } } + @Override + public String visitSpath(SPath node, String context) { + String child = node.getChild().get(0).accept(this, context); + StringBuilder builder = new StringBuilder(); + builder.append(child).append(" | spath"); + if (node.getInField() != null) { + builder.append(" input=").append(MASK_COLUMN); + } + if (node.getOutField() != null) { + builder.append(" output=").append(MASK_COLUMN); + } + if (node.getPath() != null) { + builder.append(" path=").append(MASK_COLUMN); + } + return builder.toString(); + } + @Override public String visitPatterns(Patterns node, String context) { String child = node.getChild().get(0).accept(this, context); @@ -460,7 +749,7 @@ public String visitPatterns(Patterns node, String context) { builder.append(" mode=").append(node.getPatternMode().toString()); builder.append(" max_sample_count=").append(visitExpression(node.getPatternMaxSampleCount())); builder.append(" buffer_limit=").append(visitExpression(node.getPatternBufferLimit())); - builder.append(" new_field=").append(node.getAlias()); + builder.append(" new_field=").append(MASK_COLUMN); if (!node.getArguments().isEmpty()) { for (java.util.Map.Entry entry : node.getArguments().entrySet()) { builder.append( @@ -594,7 +883,7 @@ public String visitIn(In node, String context) { @Override public String visitField(Field node, String context) { - return node.getField().toString(); + return MASK_COLUMN; } @Override @@ -616,7 +905,7 @@ public String visitAlias(Alias node, String context) { @Override public String visitTrendlineComputation(Trendline.TrendlineComputation node, String context) { final String dataField = node.getDataField().accept(this, context); - final String aliasClause = " as " + node.getAlias(); + final String 
aliasClause = " as " + MASK_COLUMN; final String computationType = node.getComputationType().name().toLowerCase(Locale.ROOT); return StringUtils.format( "%s(%d, %s)%s", computationType, node.getNumberOfDataPoints(), dataField, aliasClause); @@ -645,7 +934,7 @@ public String visitExistsSubquery(ExistsSubquery node, String context) { @Override public String visitCase(Case node, String context) { StringBuilder builder = new StringBuilder(); - builder.append("cast("); + builder.append("case("); for (When when : node.getWhenClauses()) { builder.append(analyze(when.getCondition(), context)); builder.append(","); @@ -668,5 +957,11 @@ public String visitCast(Cast node, String context) { String expr = analyze(node.getExpression(), context); return StringUtils.format("cast(%s as %s)", expr, node.getConvertedType().toString()); } + + @Override + public String visitQualifiedName( + org.opensearch.sql.ast.expression.QualifiedName node, String context) { + return MASK_COLUMN; + } } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java new file mode 100644 index 00000000000..04f4c7610d9 --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java @@ -0,0 +1,189 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.test.CalciteAssert; +import org.junit.Test; + +public class CalcitePPLStreamstatsTest extends CalcitePPLAbstractTest { + + public CalcitePPLStreamstatsTest() { + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); + } + + @Test + public void testStreamstatsBy() { + String ppl = "source=EMP | streamstats max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], max(SAL)=[MAX($5) OVER" + + " (PARTITION BY $7 ROWS UNBOUNDED PRECEDING)])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (PARTITION BY `DEPTNO` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)" + + " `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t`\n" + + "ORDER BY `__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsCurrent() { + String ppl = "source=EMP | streamstats current = false max(SAL)"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[MAX($5) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String 
expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) `max(SAL)`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsWindow() { + String ppl = "source=EMP | streamstats window = 5 max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{7," + + " 8}])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalAggregate(group=[{}], max(SAL)=[MAX($5)])\n" + + " LogicalFilter(condition=[AND(>=($8, -($cor0.__stream_seq__, 4)), <=($8," + + " $cor0.__stream_seq__), =($7, $cor0.DEPTNO))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `$cor0`.`EMPNO`, `$cor0`.`ENAME`, `$cor0`.`JOB`, `$cor0`.`MGR`, `$cor0`.`HIREDATE`," + + " `$cor0`.`SAL`, `$cor0`.`COMM`, `$cor0`.`DEPTNO`, `t2`.`max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `$cor0`,\n" + + "LATERAL (SELECT MAX(`SAL`) `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t0`\n" + + "WHERE `__stream_seq__` >= `$cor0`.`__stream_seq__` - 4 AND `__stream_seq__` <=" + + " `$cor0`.`__stream_seq__` AND `DEPTNO` = `$cor0`.`DEPTNO`) `t2`\n" + + "ORDER BY `$cor0`.`__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsGlobal() { + String ppl = "source=EMP | streamstats window = 5 global= false max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], max(SAL)=[MAX($5) OVER" + + " (PARTITION BY $7 ROWS 4 PRECEDING)])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (PARTITION BY `DEPTNO` ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t`\n" + + "ORDER BY `__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + 
public void testStreamstatsReset() { + String ppl = + "source=EMP | streamstats reset_before=SAL>100 reset_after=SAL<50 avg(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], avg(SAL)=[$12])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{7, 8," + + " 11}])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], __reset_before_flag__=[$9]," + + " __reset_after_flag__=[$10], __seg_id__=[+(SUM($9) OVER (ROWS UNBOUNDED PRECEDING)," + + " COALESCE(SUM($10) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()]," + + " __reset_before_flag__=[CASE(>($5, 100), 1, 0)], __reset_after_flag__=[CASE(<($5," + + " 50), 1, 0)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalAggregate(group=[{}], avg(SAL)=[AVG($5)])\n" + + " LogicalFilter(condition=[AND(<=($8, $cor0.__stream_seq__), =($11," + + " $cor0.__seg_id__), =($7, $cor0.DEPTNO))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], __reset_before_flag__=[$9]," + + " __reset_after_flag__=[$10], __seg_id__=[+(SUM($9) OVER (ROWS UNBOUNDED PRECEDING)," + + " COALESCE(SUM($10) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3]," + + " HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER" + + " ()], __reset_before_flag__=[CASE(>($5, 100), 1, 0)]," + + " __reset_after_flag__=[CASE(<($5, 50), 1, 0)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `$cor0`.`EMPNO`, `$cor0`.`ENAME`, `$cor0`.`JOB`, `$cor0`.`MGR`, `$cor0`.`HIREDATE`," + + " `$cor0`.`SAL`, `$cor0`.`COMM`, `$cor0`.`DEPTNO`, `t4`.`avg(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `__stream_seq__`, `__reset_before_flag__`, `__reset_after_flag__`," + + " (SUM(`__reset_before_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT" + + " ROW)) + COALESCE(SUM(`__reset_after_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING), 0) `__seg_id__`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`, CASE WHEN `SAL` > 100 THEN 1 ELSE 0 END" + + " `__reset_before_flag__`, CASE WHEN `SAL` < 50 THEN 1 ELSE 0 END" + + " `__reset_after_flag__`\n" + + "FROM `scott`.`EMP`) `t`) `$cor0`,\n" + + "LATERAL (SELECT AVG(`SAL`) `avg(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `__stream_seq__`, `__reset_before_flag__`, `__reset_after_flag__`," + + " (SUM(`__reset_before_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT" + + " ROW)) + COALESCE(SUM(`__reset_after_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING), 0) `__seg_id__`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`, CASE WHEN `SAL` > 100 THEN 1 ELSE 0 END" 
+ + " `__reset_before_flag__`, CASE WHEN `SAL` < 50 THEN 1 ELSE 0 END" + + " `__reset_after_flag__`\n" + + "FROM `scott`.`EMP`) `t1`) `t2`\n" + + "WHERE `__stream_seq__` <= `$cor0`.`__stream_seq__` AND `__seg_id__` =" + + " `$cor0`.`__seg_id__` AND `DEPTNO` = `$cor0`.`DEPTNO`) `t4`\n" + + "ORDER BY `$cor0`.`__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } +} diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index b0c2eff0255..48f6c45b4c6 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -33,7 +33,7 @@ public class PPLQueryDataAnonymizerTest { @Test public void testSearchCommand() { - assertEquals("source=t | where a = ***", anonymize("search source=t a=1")); + assertEquals("source=table a:***", anonymize("search source=t a=1")); } @Test @@ -45,139 +45,302 @@ public void testTableFunctionCommand() { @Test public void testPrometheusPPLCommand() { - assertEquals( - "source=prometheus.http_requests_process", - anonymize("source=prometheus.http_requests_process")); + assertEquals("source=table", anonymize("source=prometheus.http_requests_process")); } @Test public void testWhereCommand() { - assertEquals("source=t | where a = ***", anonymize("search source=t | where a=1")); + assertEquals("source=table | where identifier = ***", anonymize("search source=t | where a=1")); } + // Fields and Table Command Tests @Test public void testFieldsCommandWithoutArguments() { - assertEquals("source=t | fields + f,g", anonymize("source=t | fields f,g")); + assertEquals( + "source=table | fields + identifier,identifier", anonymize("source=t | fields f,g")); } @Test public void testFieldsCommandWithIncludeArguments() { - assertEquals("source=t | fields + f,g", anonymize("source=t | fields + f,g")); + assertEquals( + "source=table | fields + identifier,identifier", anonymize("source=t | fields + f,g")); } @Test public void testFieldsCommandWithExcludeArguments() { - assertEquals("source=t | fields - f,g", anonymize("source=t | fields - f,g")); + assertEquals( + "source=table | fields - identifier,identifier", anonymize("source=t | fields - f,g")); + } + + @Test + public void testFieldsCommandWithWildcards() { + when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(true); + assertEquals("source=table | fields + identifier", anonymize("source=t | fields account*")); + assertEquals("source=table | fields + identifier", anonymize("source=t | fields *name")); + assertEquals("source=table | fields + identifier", anonymize("source=t | fields *a*")); + assertEquals("source=table | fields - identifier", anonymize("source=t | fields - account*")); + } + + @Test + public void testFieldsCommandWithDelimiters() { + when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(true); + assertEquals( + "source=table | fields + identifier,identifier,identifier", + anonymize("source=t | fields firstname lastname age")); + assertEquals( + "source=table | fields + identifier,identifier,identifier", + anonymize("source=t | fields firstname lastname, balance")); + assertEquals( + "source=table | fields + identifier,identifier", + anonymize("source=t | fields account*, *name")); + } + + @Test + public void testTableCommand() { + when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(true); + assertEquals( + 
"source=table | fields + identifier,identifier", anonymize("source=t | table f,g")); + assertEquals( + "source=table | fields + identifier,identifier", anonymize("source=t | table + f,g")); + assertEquals( + "source=table | fields - identifier,identifier", anonymize("source=t | table - f,g")); + assertEquals("source=table | fields + identifier", anonymize("source=t | table account*")); + assertEquals( + "source=table | fields + identifier,identifier,identifier", + anonymize("source=t | table firstname lastname age")); + } + + @Test + public void anonymizeFieldsNoArg() { + assertEquals( + "source=table | fields + identifier", + anonymize(projectWithArg(relation("t"), Collections.emptyList(), field("f")))); } @Test public void testRenameCommandWithMultiFields() { assertEquals( - "source=t | rename f as g,h as i,j as k", + "source=table | rename identifier as identifier,identifier as identifier,identifier as" + + " identifier", anonymize("source=t | rename f as g,h as i,j as k")); } + @Test + public void testRenameCommandWithWildcards() { + assertEquals( + "source=table | rename identifier as identifier", anonymize("source=t | rename f* as g*")); + } + @Test public void testStatsCommandWithByClause() { - assertEquals("source=t | stats count(a) by b", anonymize("source=t | stats count(a) by b")); + assertEquals( + "source=table | stats count(identifier) by identifier", + anonymize("source=t | stats count(a) by b")); } @Test public void testStatsCommandWithNestedFunctions() { - assertEquals("source=t | stats sum(+(a,b))", anonymize("source=t | stats sum(a+b)")); + assertEquals( + "source=table | stats sum(+(identifier,identifier))", + anonymize("source=t | stats sum(a+b)")); } @Test public void testStatsCommandWithSpanFunction() { assertEquals( - "source=t | stats count(a) by span(b, *** d),c", + "source=table | stats count(identifier) by span(identifier, *** d),identifier", anonymize("source=t | stats count(a) by span(b, 1d), c")); } @Test public void testEventstatsCommandWithByClause() { assertEquals( - "source=t | eventstats count(a) by b", anonymize("source=t | eventstats count(a) by b")); + "source=table | eventstats count(identifier) by identifier", + anonymize("source=t | eventstats count(a) by b")); } @Test public void testEventstatsCommandWithNestedFunctions() { - assertEquals("source=t | eventstats sum(+(a,b))", anonymize("source=t | eventstats sum(a+b)")); + assertEquals( + "source=table | eventstats sum(+(identifier,identifier))", + anonymize("source=t | eventstats sum(a+b)")); } @Test public void testEventstatsCommandWithSpanFunction() { assertEquals( - "source=t | eventstats count(a) by span(b, *** d),c", + "source=table | eventstats count(identifier) by span(identifier, *** d),identifier", anonymize("source=t | eventstats count(a) by span(b, 1d), c")); } + @Test + public void testStreamstatsCommandWithByClause() { + assertEquals( + "source=table | streamstats count(identifier) by identifier", + anonymize("source=t | streamstats count(a) by b")); + } + + @Test + public void testStreamstatsCommandWithWindowAndCurrent() { + assertEquals( + "source=table | streamstats max(identifier)", + anonymize("source=t | streamstats current=false window=2 max(a)")); + } + + @Test + public void testStreamstatsCommandWithNestedFunctions() { + assertEquals( + "source=table | streamstats sum(+(identifier,identifier))", + anonymize("source=t | streamstats sum(a+b)")); + } + + @Test + public void testStreamstatsCommandWithSpanFunction() { + assertEquals( + "source=table | streamstats count(identifier) 
by span(identifier, *** d),identifier", + anonymize("source=t | streamstats count(a) by span(b, 1d), c")); + } + + @Test + public void testBinCommandBasic() { + assertEquals("source=table | bin identifier span=***", anonymize("source=t | bin f span=10")); + } + + @Test + public void testBinCommandWithAllParameters() { + assertEquals( + "source=table | bin identifier span=*** aligntime=*** as identifier", + anonymize("source=t | bin f span=10 aligntime=earliest as alias")); + } + + @Test + public void testBinCommandWithCountParameters() { + assertEquals( + "source=table | bin identifier bins=*** start=*** end=*** as identifier", + anonymize("source=t | bin f bins=10 start=0 end=100 as alias")); + } + + @Test + public void testBinCommandWithMinspanParameters() { + assertEquals( + "source=table | bin identifier minspan=*** start=*** end=*** as identifier", + anonymize("source=t | bin f minspan=5 start=0 end=100 as alias")); + } + @Test public void testDedupCommand() { assertEquals( - "source=t | dedup f1,f2 1 keepempty=false consecutive=false", + "source=table | dedup identifier,identifier 1 keepempty=false consecutive=false", anonymize("source=t | dedup f1, f2")); } @Test public void testTrendlineCommand() { assertEquals( - "source=t | trendline sma(2, date) as date_alias sma(3, time) as time_alias", + "source=table | trendline sma(2, identifier) as identifier sma(3, identifier) as" + + " identifier", anonymize("source=t | trendline sma(2, date) as date_alias sma(3, time) as time_alias")); } @Test public void testHeadCommandWithNumber() { - assertEquals("source=t | head 3", anonymize("source=t | head 3")); + assertEquals("source=table | head 3", anonymize("source=t | head 3")); + } + + @Test + public void testReverseCommand() { + assertEquals("source=table | reverse", anonymize("source=t | reverse")); + } + + @Test + public void testTimechartCommand() { + assertEquals( + "source=table | timechart span=span(identifier, *** m) limit=10 useother=true count() by" + + " identifier", + anonymize("source=t | timechart count() by host")); } // todo, sort order is ignored, it doesn't impact the log analysis. 
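  // One general note before the sort tests below: the expectations in this file all
  // follow a single masking scheme -- table names become "table", field names become
  // "identifier", and literal values become "***". A minimal sketch of a visitor
  // method with that behavior (illustrative only: the body below is an assumption
  // for exposition, not the actual PPLQueryDataAnonymizer source):
  //
  //   @Override
  //   public String visitLiteral(Literal node, String context) {
  //     // every concrete value, numeric or string, collapses to the same mask
  //     return "***";
  //   }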
@Test public void testSortCommandWithOptions() { - assertEquals("source=t | sort f1,f2", anonymize("source=t | sort - f1, + f2")); + assertEquals( + "source=table | sort identifier,identifier", anonymize("source=t | sort - f1, + f2")); + } + + @Test + public void testSortCommandWithCount() { + assertEquals("source=table | sort 5 identifier", anonymize("source=t | sort 5 f1")); + } + + @Test + public void testSortCommandWithDesc() { + assertEquals("source=table | sort identifier", anonymize("source=t | sort f1 desc")); } @Test public void testEvalCommand() { - assertEquals("source=t | eval r=abs(f)", anonymize("source=t | eval r=abs(f)")); + assertEquals( + "source=table | eval identifier=abs(identifier)", anonymize("source=t | eval r=abs(f)")); + } + + @Test + public void testEvalCommandWithStrftime() { + assertEquals( + "source=table | eval identifier=strftime(identifier,***)", + anonymize("source=t | eval formatted=strftime(timestamp, '%Y-%m-%d %H:%M:%S')")); } @Test public void testFillNullSameValue() { assertEquals( - "source=t | fillnull with *** in f1, f2", + "source=table | fillnull with *** in identifier, identifier", anonymize("source=t | fillnull with 0 in f1, f2")); } @Test public void testFillNullVariousValues() { assertEquals( - "source=t | fillnull using f1 = ***, f2 = ***", + "source=table | fillnull using identifier = ***, identifier = ***", anonymize("source=t | fillnull using f1 = 0, f2 = -1")); } @Test public void testFillNullWithoutFields() { - assertEquals("source=t | fillnull with ***", anonymize("source=t | fillnull with 0")); + assertEquals("source=table | fillnull with ***", anonymize("source=t | fillnull with 0")); + } + + @Test + public void testFillNullValueSyntaxWithFields() { + assertEquals( + "source=table | fillnull value=*** identifier identifier", + anonymize("source=t | fillnull value=0 f1 f2")); + } + + @Test + public void testFillNullValueSyntaxAllFields() { + assertEquals("source=table | fillnull value=***", anonymize("source=t | fillnull value=0")); } @Test public void testRareCommandWithGroupBy() { when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(false); - assertEquals("source=t | rare 10 a by b", anonymize("source=t | rare a by b")); + assertEquals( + "source=table | rare 10 identifier by identifier", anonymize("source=t | rare a by b")); } @Test public void testTopCommandWithNAndGroupBy() { when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(false); - assertEquals("source=t | top 1 a by b", anonymize("source=t | top 1 a by b")); + assertEquals( + "source=table | top 1 identifier by identifier", anonymize("source=t | top 1 a by b")); } @Test public void testRareCommandWithGroupByWithCalcite() { when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(true); assertEquals( - "source=t | rare 10 countield='count' showcount=true a by b", + "source=table | rare 10 countield='count' showcount=true usenull=true identifier by" + + " identifier", anonymize("source=t | rare a by b")); } @@ -185,154 +348,221 @@ public void testRareCommandWithGroupByWithCalcite() { public void testTopCommandWithNAndGroupByWithCalcite() { when(settings.getSettingValue(Key.CALCITE_ENGINE_ENABLED)).thenReturn(true); assertEquals( - "source=t | top 1 countield='count' showcount=true a by b", + "source=table | top 1 countield='count' showcount=true usenull=true identifier by" + + " identifier", anonymize("source=t | top 1 a by b")); } @Test public void testAndExpression() { - assertEquals("source=t | where a = *** and b = ***", 
anonymize("source=t | where a=1 and b=2")); + assertEquals( + "source=table | where identifier = *** and identifier = ***", + anonymize("source=t | where a=1 and b=2")); } @Test public void testOrExpression() { - assertEquals("source=t | where a = *** or b = ***", anonymize("source=t | where a=1 or b=2")); + assertEquals( + "source=table | where identifier = *** or identifier = ***", + anonymize("source=t | where a=1 or b=2")); } @Test public void testXorExpression() { - assertEquals("source=t | where a = *** xor b = ***", anonymize("source=t | where a=1 xor b=2")); + assertEquals( + "source=table | where identifier = *** xor identifier = ***", + anonymize("source=t | where a=1 xor b=2")); } @Test public void testNotExpression() { - assertEquals("source=t | where not a = ***", anonymize("source=t | where not a=1 ")); + assertEquals( + "source=table | where not identifier = ***", anonymize("source=t | where not a=1 ")); } @Test public void testInExpression() { - assertEquals("source=t | where a in (***)", anonymize("source=t | where a in (1, 2, 3) ")); + assertEquals( + "source=table | where identifier in (***)", anonymize("source=t | where a in (1, 2, 3) ")); } @Test public void testQualifiedName() { - assertEquals("source=t | fields + field0", anonymize("source=t | fields field0")); + assertEquals("source=table | fields + identifier", anonymize("source=t | fields field0")); } @Test public void testDateFunction() { assertEquals( - "source=t | eval date=DATE_ADD(DATE(***),INTERVAL *** HOUR)", + "source=table | eval identifier=DATE_ADD(DATE(***),INTERVAL *** HOUR)", anonymize("source=t | eval date=DATE_ADD(DATE('2020-08-26'),INTERVAL 1 HOUR)")); } @Test public void testDescribe() { - assertEquals("describe t", anonymize("describe t")); + assertEquals("describe table", anonymize("describe t")); } @Test public void testExplain() { assertEquals( - "explain standard source=t | fields + a", anonymizeStatement("source=t | fields a", true)); + "explain standard source=table | fields + identifier", + anonymizeStatement("source=t | fields a", true)); } @Test public void testExplainCommand() { assertEquals( - "explain standard source=t | fields + a", + "explain standard source=table | fields + identifier", anonymizeStatement("explain source=t | fields a", false)); assertEquals( - "explain extended source=t | fields + a", + "explain extended source=table | fields + identifier", anonymizeStatement("explain extended source=t | fields a", false)); } @Test public void testQuery() { - assertEquals("source=t | fields + a", anonymizeStatement("source=t | fields a", false)); - } - - @Test - public void anonymizeFieldsNoArg() { assertEquals( - "source=t | fields + f", - anonymize(projectWithArg(relation("t"), Collections.emptyList(), field("f")))); + "source=table | fields + identifier", anonymizeStatement("source=t | fields a", false)); } @Test public void testBetween() { assertEquals( - "source=t | where id between *** and *** | fields + id", + "source=table | where identifier between *** and *** | fields + identifier", anonymize("source=t | where id between 1 and 2 | fields id")); assertEquals( - "source=t | where not id between *** and *** | fields + id", + "source=table | where not identifier between *** and *** | fields + identifier", anonymize("source=t | where id not between 1 and 2 | fields id")); } @Test public void testAppendcol() { assertEquals( - "source=t | stats count() by b | appendcol override=false [ stats sum(c) by b ]", + "source=table | stats count() by identifier | appendcol override=false 
[ stats" + + " sum(identifier) by identifier ]", anonymize("source=t | stats count() by b | appendcol [ stats sum(c) by b ]")); assertEquals( - "source=t | stats count() by b | appendcol override=true [ stats sum(c) by b ]", + "source=table | stats count() by identifier | appendcol override=true [ stats" + + " sum(identifier) by identifier ]", anonymize("source=t | stats count() by b | appendcol override=true [ stats sum(c) by b ]")); assertEquals( - "source=t | appendcol override=false [ where a = *** ]", + "source=table | appendcol override=false [ where identifier = *** ]", anonymize("source=t | appendcol override=false [ where a = 1 ]")); } @Test + public void testAppend() { + assertEquals( + "source=table | stats count() by identifier | append [ | stats sum(identifier) by" + + " identifier ]", + anonymize("source=t | stats count() by b | append [ | stats sum(c) by b ]")); + assertEquals( + "source=table | stats count() by identifier | append [ | stats sum(identifier) by" + + " identifier ]", + anonymize("source=t | stats count() by b | append [ | stats sum(c) by b ]")); + assertEquals( + "source=table | append [ | where identifier = *** ]", + anonymize("source=t | append [ | where a = 1 ]")); + assertEquals( + "source=table | stats count() by identifier | append [source=table | stats sum(identifier)" + + " by identifier ]", + anonymize("source=t | stats count() by b | append [source=a | stats sum(c) by b ]")); + assertEquals( + "source=table | append [source=table | where identifier = *** ]", + anonymize("source=t | append [source=b | where a = 1 ]")); + assertEquals( + "source=table | stats count() by identifier | append [source=table ]", + anonymize("source=t | stats count() by b | append [ source=a ]")); + assertEquals( + "source=table | stats count() by identifier | append [ ]", + anonymize("source=t | stats count() by b | append [ ]")); + } + + @Test + // Same as SQL, select * from a as b -> SELECT * FROM table AS identifier public void testSubqueryAlias() { - assertEquals("source=t as t1", anonymize("source=t as t1")); + assertEquals("source=table as identifier", anonymize("source=t as t1")); } @Test public void testJoin() { assertEquals( - "source=t | cross join on true s | fields + id", - anonymize("source=t | cross join s | fields id")); + "source=table | cross join max=*** on *** = *** table | fields + identifier", + anonymize("source=t | cross join on 1=1 s | fields id")); assertEquals( - "source=t | inner join on id = uid s | fields + id", + "source=table | inner join max=*** on identifier = identifier table | fields + identifier", anonymize("source=t | inner join on id = uid s | fields id")); assertEquals( - "source=t as l | inner join left = l right = r on id = uid s as r | fields + id", + "source=table as identifier | inner join max=*** left = identifier right = identifier on" + + " identifier = identifier table as identifier | fields + identifier", anonymize("source=t | join left = l right = r on id = uid s | fields id")); assertEquals( - "source=t | left join right = r on id = uid s as r | fields + id", + "source=table | left join max=*** right = identifier on identifier = identifier table as" + + " identifier | fields + identifier", anonymize("source=t | left join right = r on id = uid s | fields id")); assertEquals( - "source=t as t1 | inner join left = t1 right = t2 on id = uid s as t2 | fields + t1.id", + "source=table as identifier | inner join max=*** left = identifier right = identifier on" + + " identifier = identifier table as identifier | fields + 
identifier", anonymize("source=t as t1 | inner join on id = uid s as t2 | fields t1.id")); assertEquals( - "source=t as t1 | right join left = t1 right = t2 on t1.id = t2.id s as t2 | fields +" - + " t1.id", - anonymize("source=t as t1 | right join on t1.id = t2.id s as t2 | fields t1.id")); + "source=table as identifier | right join max=*** left = identifier right = identifier on" + + " identifier = identifier table as identifier | fields + identifier", + anonymize("source=t as t1 | right join max=0 on t1.id = t2.id s as t2 | fields t1.id")); assertEquals( - "source=t as t1 | right join left = t1 right = t2 on t1.id = t2.id [ source=s | fields + id" - + " ] as t2 | fields + t1.id", + "source=table as identifier | right join max=*** left = identifier right = identifier on" + + " identifier = identifier [ source=table | fields + identifier ] as identifier |" + + " fields + identifier", anonymize( - "source=t as t1 | right join on t1.id = t2.id [ source=s | fields id] as t2 | fields" - + " t1.id")); + "source=t as t1 | right join max=0 on t1.id = t2.id [ source=s | fields id] as t2 |" + + " fields t1.id")); + assertEquals( + "source=table | inner join max=*** on identifier = identifier table | fields + identifier", + anonymize("source=t | inner join max=2 on id = uid s | fields id")); + } + + @Test + public void testJoinWithFieldList() { + assertEquals( + "source=table | join type=inner overwrite=*** max=*** table | fields + identifier", + anonymize("source=t | join s | fields id")); + assertEquals( + "source=table | join type=inner overwrite=*** max=*** identifier table | fields +" + + " identifier", + anonymize("source=t | join id s | fields id")); + assertEquals( + "source=table | join type=left overwrite=*** max=*** identifier,identifier table | fields +" + + " identifier", + anonymize("source=t | join type=left overwrite=false id1,id2 s | fields id1")); + assertEquals( + "source=table | join type=left overwrite=*** max=*** identifier,identifier table | fields +" + + " identifier", + anonymize("source=t | join type=outer overwrite=false id1 id2 s | fields id1")); + assertEquals( + "source=table | join type=left overwrite=*** max=*** identifier,identifier table | fields +" + + " identifier", + anonymize("source=t | join type=outer max=2 id1 id2 s | fields id1")); } @Test public void testLookup() { assertEquals( - "source=EMP | lookup DEPT DEPTNO replace LOC", + "source=table | lookup table DEPTNO replace LOC", anonymize("source=EMP | lookup DEPT DEPTNO replace LOC")); assertEquals( - "source=EMP | lookup DEPT DEPTNO replace LOC as JOB", + "source=table | lookup table DEPTNO replace LOC as JOB", anonymize("source=EMP | lookup DEPT DEPTNO replace LOC as JOB")); assertEquals( - "source=EMP | lookup DEPT DEPTNO append LOC", + "source=table | lookup table DEPTNO append LOC", anonymize("source=EMP | lookup DEPT DEPTNO append LOC")); assertEquals( - "source=EMP | lookup DEPT DEPTNO append LOC as JOB", + "source=table | lookup table DEPTNO append LOC as JOB", anonymize("source=EMP | lookup DEPT DEPTNO append LOC as JOB")); - assertEquals("source=EMP | lookup DEPT DEPTNO", anonymize("source=EMP | lookup DEPT DEPTNO")); assertEquals( - "source=EMP | lookup DEPT DEPTNO as EMPNO, ID append ID, LOC as JOB, COUNTRY as COUNTRY2", + "source=table | lookup table DEPTNO", anonymize("source=EMP | lookup DEPT DEPTNO")); + assertEquals( + "source=table | lookup table DEPTNO as EMPNO, ID append ID, LOC as JOB, COUNTRY as" + + " COUNTRY2", anonymize( "source=EMP | lookup DEPT DEPTNO as EMPNO, ID append ID, LOC 
as JOB, COUNTRY as" + " COUNTRY2")); @@ -341,41 +571,47 @@ public void testLookup() { @Test public void testInSubquery() { assertEquals( - "source=t | where (id) in [ source=s | fields + uid ] | fields + id", + "source=table | where (identifier) in [ source=table | fields + identifier ] | fields +" + + " identifier", anonymize("source=t | where id in [source=s | fields uid] | fields id")); } @Test public void testExistsSubquery() { assertEquals( - "source=t | where exists [ source=s | where id = uid ] | fields + id", + "source=table | where exists [ source=table | where identifier = identifier ] | fields +" + + " identifier", anonymize("source=t | where exists [source=s | where id = uid ] | fields id")); } @Test public void testScalarSubquery() { assertEquals( - "source=t | where id = [ source=s | stats max(b) ] | fields + id", + "source=table | where identifier = [ source=table | stats max(identifier) ] | fields +" + + " identifier", anonymize("source=t | where id = [ source=s | stats max(b) ] | fields id")); assertEquals( - "source=t | eval id=[ source=s | stats max(b) ] | fields + id", + "source=table | eval identifier=[ source=table | stats max(identifier) ] | fields +" + + " identifier", anonymize("source=t | eval id = [ source=s | stats max(b) ] | fields id")); assertEquals( - "source=t | where id > [ source=s | where id = uid | stats max(b) ] | fields + id", - anonymize("source=t id > [ source=s | where id = uid | stats max(b) ] | fields id")); + "source=table | where identifier > [ source=table | where identifier = identifier | stats" + + " max(identifier) ] | fields + identifier", + anonymize( + "source=t | where id > [ source=s | where id = uid | stats max(b) ] | fields id")); } @Test public void testCaseWhen() { assertEquals( - "source=t | eval level=cast(score >= ***,***,score >= *** and score < ***,*** else ***) |" - + " fields + level", + "source=table | eval identifier=case(identifier >= ***,***,identifier >= *** and identifier" + + " < ***,*** else ***) | fields + identifier", anonymize( "source=t | eval level=CASE(score >= 90, 'A', score >= 80 AND score < 90, 'B' else 'C')" + " | fields level")); assertEquals( - "source=t | eval level=cast(score >= ***,***,score >= *** and score < ***,***) | fields +" - + " level", + "source=table | eval identifier=case(identifier >= ***,***,identifier >= *** and identifier" + + " < ***,***) | fields + identifier", anonymize( "source=t | eval level=CASE(score >= 90, 'A', score >= 80 AND score < 90, 'B')" + " | fields level")); @@ -384,33 +620,76 @@ public void testCaseWhen() { @Test public void testCast() { assertEquals( - "source=t | eval id=cast(a as INTEGER) | fields + id", + "source=table | eval identifier=cast(identifier as INTEGER) | fields + identifier", anonymize("source=t | eval id=CAST(a AS INTEGER) | fields id")); assertEquals( - "source=t | eval id=cast(*** as DOUBLE) | fields + id", + "source=table | eval identifier=cast(*** as DOUBLE) | fields + identifier", anonymize("source=t | eval id=CAST('1' AS DOUBLE) | fields id")); } @Test public void testParse() { assertEquals( - "source=t | parse email '.+@(?.+)'", + "source=table | parse identifier '***'", anonymize("source=t | parse email '.+@(?.+)'")); assertEquals( - "source=t | parse email '.+@(?.+)' | fields + email,host", + "source=table | parse identifier '***' | fields + identifier,identifier", anonymize("source=t | parse email '.+@(?.+)' | fields email, host")); } @Test public void testGrok() { assertEquals( - "source=t | grok email '.+@%{HOSTNAME:host}'", + "source=table 
| grok identifier '***'", anonymize("source=t | grok email '.+@%{HOSTNAME:host}'")); assertEquals( - "source=t | grok email '.+@%{HOSTNAME:host}' | fields + email,host", + "source=table | grok identifier '***' | fields + identifier,identifier", anonymize("source=t | grok email '.+@%{HOSTNAME:host}' | fields email, host")); } + @Test + public void testReplaceCommandSingleField() { + assertEquals( + "source=table | replace *** WITH *** IN Field(field=fieldname, fieldArgs=[])", + anonymize("source=EMP | replace \"value\" WITH \"newvalue\" IN fieldname")); + } + + @Test + public void testReplaceCommandMultipleFields() { + assertEquals( + "source=table | replace *** WITH *** IN Field(field=fieldname1, fieldArgs=[])," + + " Field(field=fieldname2, fieldArgs=[])", + anonymize("source=EMP | replace \"value\" WITH \"newvalue\" IN fieldname1, fieldname2")); + } + + @Test(expected = Exception.class) + public void testReplaceCommandWithoutInShouldFail() { + anonymize("source=EMP | replace \"value\" WITH \"newvalue\""); + } + + @Test + public void testReplaceCommandSpecialCharactersInFields() { + assertEquals( + "source=table | replace *** WITH *** IN Field(field=user.name, fieldArgs=[])," + + " Field(field=user.email, fieldArgs=[])", + anonymize("source=EMP | replace \"value\" WITH \"newvalue\" IN user.name, user.email")); + } + + @Test + public void testReplaceCommandWithWildcards() { + assertEquals( + "source=table | replace *** WITH *** IN Field(field=fieldname, fieldArgs=[])", + anonymize("source=EMP | replace \"CLERK*\" WITH \"EMPLOYEE*\" IN fieldname")); + } + + @Test + public void testReplaceCommandWithMultipleWildcards() { + assertEquals( + "source=table | replace *** WITH *** IN Field(field=fieldname1, fieldArgs=[])," + + " Field(field=fieldname2, fieldArgs=[])", + anonymize("source=EMP | replace \"*TEST*\" WITH \"*NEW*\" IN fieldname1, fieldname2")); + } + @Test public void testPatterns() { when(settings.getSettingValue(Key.PATTERN_METHOD)).thenReturn("SIMPLE_PATTERN"); @@ -418,23 +697,111 @@ public void testPatterns() { when(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT)).thenReturn(10); when(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT)).thenReturn(100000); assertEquals( - "source=t | patterns email method=SIMPLE_PATTERN mode=LABEL" - + " max_sample_count=*** buffer_limit=*** new_field=patterns_field", + "source=table | patterns identifier method=SIMPLE_PATTERN mode=LABEL" + + " max_sample_count=*** buffer_limit=*** new_field=identifier", anonymize("source=t | patterns email")); assertEquals( - "source=t | patterns email method=SIMPLE_PATTERN mode=LABEL" - + " max_sample_count=*** buffer_limit=*** new_field=patterns_field |" - + " fields + email,patterns_field", - anonymize("source=t | patterns email | fields email, patterns_field")); + "source=table | patterns identifier method=SIMPLE_PATTERN mode=LABEL" + + " max_sample_count=*** buffer_limit=*** new_field=identifier |" + + " fields + identifier,identifier", + anonymize("source=t | patterns email | fields email, identifier")); assertEquals( - "source=t | patterns email method=BRAIN mode=AGGREGATION" - + " max_sample_count=*** buffer_limit=*** new_field=patterns_field" + "source=table | patterns identifier method=BRAIN mode=AGGREGATION" + + " max_sample_count=*** buffer_limit=*** new_field=identifier" + " variable_count_threshold=***", anonymize( "source=t | patterns email method=BRAIN mode=AGGREGATION" + " variable_count_threshold=5")); } + @Test + public void testRegex() { + assertEquals( + "source=table | regex 
identifier=***", anonymize("source=t | regex fieldname='pattern'")); + assertEquals( + "source=table | regex identifier!=***", anonymize("source=t | regex fieldname!='pattern'")); + assertEquals( + "source=table | regex identifier=*** | fields + identifier", + anonymize("source=t | regex email='.*@domain.com' | fields email")); + } + + @Test + public void testRexCommand() { + when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10); + + assertEquals( + "source=table | rex field=identifier mode=extract \"***\" max_match=***", + anonymize("source=t | rex field=message \"(?[A-Z]+)\"")); + assertEquals( + "source=table | rex field=identifier mode=extract \"***\" max_match=*** | fields +" + + " identifier,identifier", + anonymize("source=table | rex field=identifier \"***\" | fields identifier, identifier")); + assertEquals( + "source=table | rex field=identifier mode=extract \"***\" max_match=***", + anonymize("source=t | rex field=name \"(?[A-Z])\" max_match=3")); + } + + @Test + public void testRexSedMode() { + when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10); + + assertEquals( + "source=table | rex field=identifier mode=sed \"***\" max_match=***", + anonymize("source=t | rex field=lastname mode=sed \"s/^[A-Z]/X/\"")); + assertEquals( + "source=table | rex field=identifier mode=sed \"***\" max_match=*** | fields + identifier", + anonymize("source=t | rex field=data mode=sed \"s/sensitive/clean/g\" | fields data")); + } + + @Test + public void testMvjoin() { + // Test mvjoin with array of strings + assertEquals( + "source=table | eval identifier=mvjoin(array(***,***,***),***) | fields + identifier", + anonymize("source=t | eval result=mvjoin(array('a', 'b', 'c'), ',') | fields result")); + } + + @Test + public void testMvappend() { + assertEquals( + "source=table | eval identifier=mvappend(identifier,***,***) | fields + identifier", + anonymize("source=t | eval result=mvappend(a, 'b', 'c') | fields result")); + } + + @Test + public void testRexWithOffsetField() { + when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10); + + assertEquals( + "source=table | rex field=identifier mode=extract \"***\" max_match=***" + + " offset_field=identifier", + anonymize("source=t | rex field=message \"(?[a-z]+)\" offset_field=pos")); + } + + @Test + public void testMultisearch() { + assertEquals( + "| multisearch [search source=table | where identifier < ***] [search" + + " source=table | where identifier >= ***]", + anonymize( + "| multisearch [search source=accounts | where age < 30] [search" + + " source=accounts | where age >= 30]")); + + assertEquals( + "| multisearch [search source=table | where identifier > ***] [search" + + " source=table | where identifier = ***]", + anonymize( + "| multisearch [search source=accounts | where balance > 20000]" + + " [search source=accounts | where state = 'CA']")); + + assertEquals( + "| multisearch [search source=table | fields + identifier,identifier] [search" + + " source=table | where identifier = ***]", + anonymize( + "| multisearch [search source=accounts | fields firstname, lastname]" + + " [search source=accounts | where age = 25]")); + } + private String anonymize(String query) { AstBuilder astBuilder = new AstBuilder(query, settings); return anonymize(astBuilder.visit(parser.parse(query))); @@ -448,10 +815,26 @@ private String anonymize(UnresolvedPlan plan) { private String anonymizeStatement(String query, boolean isExplain) { AstStatementBuilder builder = new AstStatementBuilder( - new 
AstBuilder(query), + new AstBuilder(query, settings), AstStatementBuilder.StatementBuilderContext.builder().isExplain(isExplain).build()); Statement statement = builder.visit(parser.parse(query)); PPLQueryDataAnonymizer anonymize = new PPLQueryDataAnonymizer(settings); return anonymize.anonymizeStatement(statement); } + + @Test + public void testSearchWithAbsoluteTimeRange() { + assertEquals( + "source=table (@timestamp:*** AND (@timestamp:***", + anonymize("search source=t earliest='2012-12-10 15:00:00' latest=now")); + } + + @Test + public void testSpath() { + assertEquals( + "source=table | spath input=identifier output=identifier path=identifier | fields +" + + " identifier,identifier", + anonymize( + "search source=t | spath input=json_attr output=out path=foo.bar | fields id, out")); + } } From a3a6a0b635d04ebf2a86000577bb6ce885726ad9 Mon Sep 17 00:00:00 2001 From: Xinyu Hao Date: Tue, 4 Nov 2025 16:39:04 +0800 Subject: [PATCH 2/3] backport 4297 Signed-off-by: Xinyu Hao --- .../sql/calcite/CalciteRelNodeVisitor.java | 18 ++++++++++-------- ppl/src/main/antlr/OpenSearchPPLLexer.g4 | 1 - ppl/src/main/antlr/OpenSearchPPLParser.g4 | 2 ++ .../opensearch/sql/ppl/parser/AstBuilder.java | 3 ++- .../sql/ppl/utils/PPLQueryDataAnonymizer.java | 2 +- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 73578f9a470..28b92c2d179 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -813,15 +813,15 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) { context.relBuilder.aggregate(context.relBuilder.groupKey(groupByList), aggCall); buildExpandRelNode( context.relBuilder.field(node.getAlias()), node.getAlias(), node.getAlias(), context); - flattenParsedPattern( - node.getAlias(), - context.relBuilder.field(node.getAlias()), - context, - true, - showNumberedToken); + flattenParsedPattern( + node.getAlias(), + context.relBuilder.field(node.getAlias()), + context, + true, + showNumberedToken); } } - return context.relBuilder.peek(); + return context.relBuilder.peek(); } @Override @@ -1474,7 +1474,7 @@ public RelNode visitDedupe(Dedupe node, CalcitePlanContext context) { throw new IllegalArgumentException("Number of duplicate events must be greater than 0"); } if (consecutive) { - throw new UnsupportedOperationException("Consecutive deduplication is not supported"); + throw new CalciteUnsupportedException("Consecutive deduplication is unsupported in Calcite"); } // Columns to deduplicate List dedupeFields = @@ -1900,6 +1900,8 @@ private List buildAggCallsForWindowFunctions( AggregateFunction aggFunc = new AggregateFunction(func.getFuncName(), field, rest); AggCall call = aggVisitor.analyze(new Alias(a.getName(), aggFunc), context); aggCalls.add(call); + } else { + throw new IllegalArgumentException("Unsupported window function in streamstats"); } } else { throw new IllegalArgumentException("Unsupported window function in streamstats"); diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 0342924383b..8d09afa7f46 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -406,7 +406,6 @@ SUBSTRING: 'SUBSTRING'; LTRIM: 'LTRIM'; RTRIM: 'RTRIM'; TRIM: 'TRIM'; -TO: 'TO'; LOWER: 'LOWER'; UPPER: 'UPPER'; CONCAT: 'CONCAT'; 
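Aside on the AstBuilder hunk that follows a little further below: the only non-mechanical edit there is a rewrite of Java 16+ pattern matching, "if (x instanceof WindowFunction wf)", into an explicit test-then-cast, presumably so the backport compiles on an older source level. A self-contained sketch of the equivalence (the stub class here is a stand-in for illustration, not the real org.opensearch.sql.ast.expression.WindowFunction):

    import java.util.Collections;
    import java.util.List;

    public class InstanceofBackportDemo {
      // Minimal stand-in exposing the one method the real AST node is shown to have.
      static class WindowFunction {
        void setPartitionByList(List<?> partitions) {}
      }

      public static void main(String[] args) {
        Object windowFunction = new WindowFunction();

        // Java 16+ pattern matching, as in the pre-backport code:
        //   if (windowFunction instanceof WindowFunction wf) {
        //     wf.setPartitionByList(Collections.emptyList());
        //   }

        // Pre-16 equivalent used by this patch: explicit test, then explicit cast.
        if (windowFunction instanceof WindowFunction) {
          WindowFunction wf = (WindowFunction) windowFunction;
          wf.setPartitionByList(Collections.emptyList());
        }
      }
    }

Both forms bind wf only on the branch where the runtime type check succeeds; the rewrite changes syntax, not semantics.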
diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 4a83741ac2c..fc393788808 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -1498,6 +1498,8 @@ searchableKeyWord | EXISTS | SOURCE | INDEX + | A + | ASC | DESC | DATASOURCES | FROM diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 96d21e7733d..8fde9aa4ba6 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -531,7 +531,8 @@ public UnresolvedPlan visitStreamstatsCommand(OpenSearchPPLParser.StreamstatsCom for (OpenSearchPPLParser.StreamstatsAggTermContext aggCtx : ctx.streamstatsAggTerm()) { UnresolvedExpression windowFunction = internalVisitExpression(aggCtx.windowFunction()); - if (windowFunction instanceof WindowFunction wf) { + if (windowFunction instanceof WindowFunction) { + WindowFunction wf = (WindowFunction) windowFunction; // Attach PARTITION BY clause expressions wf.setPartitionByList(groupList); // Inject the frame diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index c6a4f7c1615..9db3f851962 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -237,7 +237,7 @@ public String visitSubqueryAlias(SubqueryAlias node, String context) { } // add "[]" only if its child is not a root String format = childNode.getChild().isEmpty() ? "%s as %s" : "[ %s ] as %s"; - return StringUtils.format(format, child, node.getAlias()); + return StringUtils.format(format, child, MASK_COLUMN); } @Override From 09cd9a0e0e5ec76964ff5868f32af74cc4f0d826 Mon Sep 17 00:00:00 2001 From: Xinyu Hao Date: Tue, 4 Nov 2025 17:48:19 +0800 Subject: [PATCH 3/3] fix IT Signed-off-by: Xinyu Hao --- docs/user/ppl/cmd/streamstats.rst | 8 +- .../remote/CalciteStreamstatsCommandIT.java | 90 ++++++++++--------- 2 files changed, 52 insertions(+), 46 deletions(-) diff --git a/docs/user/ppl/cmd/streamstats.rst b/docs/user/ppl/cmd/streamstats.rst index 0ac18637fec..e82053f748f 100644 --- a/docs/user/ppl/cmd/streamstats.rst +++ b/docs/user/ppl/cmd/streamstats.rst @@ -129,7 +129,7 @@ This example calculates the running maximum age over a 2-row window, excluding t PPL query:: - os> source=state_country | streamstats current=false window=2 max(age) as prev_max_age + os> source=state_country | fields name, country, state, month, year, age | streamstats current=false window=2 max(age) as prev_max_age fetched rows / total rows = 8/8 +-------+---------+------------+-------+------+-----+--------------+ | name | country | state | month | year | age | prev_max_age | @@ -175,7 +175,7 @@ original data:: PPL query:: - os> source=state_country | streamstats window=2 global=true avg(age) as running_avg by country ; + os> source=state_country | fields name, country, state, month, year, age | streamstats window=2 global=true avg(age) as running_avg by country ; fetched rows / total rows = 8/8 +-------+---------+------------+-------+------+-----+-------------+ | name | country | state | month | year | age | running_avg | @@ -190,7 +190,7 @@ PPL query:: | David | USA | Washington | 4 | 2023 | 40 | 40.0 |
+-------+---------+------------+-------+------+-----+-------------+ - os> source=state_country | streamstats window=2 global=false avg(age) as running_avg by country ; + os> source=state_country | fields name, country, state, month, year, age | streamstats window=2 global=false avg(age) as running_avg by country ; fetched rows / total rows = 8/8 +-------+---------+------------+-------+------+-----+-------------+ | name | country | state | month | year | age | running_avg | @@ -213,7 +213,7 @@ This example calculates the running average of age across accounts by country, w PPL query:: - os> source=state_country | streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country; + os> source=state_country | fields name, country, state, month, year, age | streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country; fetched rows / total rows = 8/8 +-------+---------+------------+-------+------+-----+---------+ | name | country | state | month | year | age | avg_age | diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java index ee94c218dbb..0c899b501bc 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java @@ -32,7 +32,7 @@ public void testStreamstats() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max", + + " as max | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( @@ -62,7 +62,7 @@ public void testStreamstatsWithNull() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max", + + " as max | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifySchemaInOrder( @@ -94,7 +94,7 @@ public void testStreamstatsBy() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by country", + + " as max by country | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( @@ -124,7 +124,7 @@ public void testStreamstatsByWithNull() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by country", + + " as max by country | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifySchemaInOrder( @@ -153,7 +153,7 @@ public void testStreamstatsByWithNull() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by state", + + " as max by state | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( actual, @@ -171,7 +171,7 @@ public void testStreamstatsBySpan() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by span(age, 10) as age_span", + + " as max by span(age, 10) as age_span | fields name, 
country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -188,7 +188,7 @@ public void testStreamstatsBySpanWithNull() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by span(age, 10) as age_span", + + " as max by span(age, 10) as age_span | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -207,7 +207,7 @@ public void testStreamstatsByMultiplePartitions1() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by span(age, 10) as age_span, country", + + " as max by span(age, 10) as age_span, country | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -224,7 +224,7 @@ public void testStreamstatsByMultiplePartitions2() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by span(age, 10) as age_span, state", + + " as max by span(age, 10) as age_span, state | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -241,7 +241,7 @@ public void testStreamstatsByMultiplePartitionsWithNull1() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by span(age, 10) as age_span, country", + + " as max by span(age, 10) as age_span, country | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -260,7 +260,7 @@ public void testStreamstatsByMultiplePartitionsWithNull2() throws IOException { executeQuery( String.format( "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" - + " as max by span(age, 10) as age_span, state", + + " as max by span(age, 10) as age_span, state | fields name, country, state, month, year, age, cnt, avg, min, max", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -278,7 +278,7 @@ public void testStreamstatsCurrent() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats current=false avg(age) as prev_avg", + "source=%s | streamstats current=false avg(age) as prev_avg | fields name, country, state, month, year, age, prev_avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -294,7 +294,7 @@ public void testStreamstatsCurrentWithNUll() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats current=false avg(age) as prev_avg", + "source=%s | streamstats current=false avg(age) as prev_avg | fields name, country, state, month, year, age, prev_avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -312,7 +312,7 @@ public void testStreamstatsWindow() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window = 3 avg(age) as avg", TEST_INDEX_STATE_COUNTRY)); + "source=%s | streamstats window = 3 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( actual, @@ -327,7 +327,7 @@ public void testStreamstatsWindowWithNull() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window = 3 avg(age) as avg", + "source=%s | streamstats 
window = 3 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -344,7 +344,7 @@ public void testStreamstatsBigWindow() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window = 10 avg(age) as avg", TEST_INDEX_STATE_COUNTRY)); + "source=%s | streamstats window = 10 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( actual, @@ -372,7 +372,7 @@ public void testStreamstatsCurrentAndWindow() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats current = false window = 2 avg(age) as avg", + "source=%s | streamstats current = false window = 2 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -388,7 +388,7 @@ public void testStreamstatsCurrentAndWindowWithNull() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats current = false window = 2 avg(age) as avg", + "source=%s | streamstats current = false window = 2 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -416,7 +416,7 @@ public void testStreamstatsGlobal() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window=2 global=false avg(age) as avg by country", + "source=%s | streamstats window=2 global=false avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -430,7 +430,7 @@ public void testStreamstatsGlobal() throws IOException { JSONObject actual2 = executeQuery( String.format( - "source=%s | streamstats window=2 global=true avg(age) as avg by country", + "source=%s | streamstats window=2 global=true avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -463,7 +463,7 @@ public void testStreamstatsGlobalWithNull() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window=2 global=false avg(age) as avg by country", + "source=%s | streamstats window=2 global=false avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -479,7 +479,7 @@ public void testStreamstatsGlobalWithNull() throws IOException { JSONObject actual2 = executeQuery( String.format( - "source=%s | streamstats window=2 global=true avg(age) as avg by country", + "source=%s | streamstats window=2 global=true avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -514,7 +514,7 @@ public void testStreamstatsReset() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country", + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -528,7 +528,7 @@ public void testStreamstatsReset() throws IOException { JSONObject actual2 = executeQuery( String.format( - "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country", + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country | fields name, country, state, 
month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -561,7 +561,7 @@ public void testStreamstatsResetWithNull() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country", + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -577,7 +577,7 @@ public void testStreamstatsResetWithNull() throws IOException { JSONObject actual2 = executeQuery( String.format( - "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country", + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -618,7 +618,7 @@ public void testMultipleStreamstats() throws IOException { executeQuery( String.format( "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" - + " avg(avg_age) as avg_state_age by country", + + " avg(avg_age) as avg_state_age by country | fields name, country, state, month, year, age, avg_age, avg_state_age", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -635,7 +635,7 @@ public void testMultipleStreamstatsWithNull() throws IOException { executeQuery( String.format( "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" - + " avg(avg_age) as avg_state_age by country", + + " avg(avg_age) as avg_state_age by country | fields name, country, state, month, year, age, avg_age, avg_state_age", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -654,7 +654,7 @@ public void testStreamstatsAndEventstats() throws IOException { executeQuery( String.format( "source=%s | eventstats avg(age) as avg_age| streamstats" - + " avg(age) as avg_age_stream", + + " avg(age) as avg_age_stream | fields name, country, state, month, year, age, avg_age, avg_age_stream", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -670,7 +670,7 @@ public void testStreamstatsAndSort() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | sort age | streamstats window = 2 avg(age) as avg_age ", + "source=%s | sort age | streamstats window = 2 avg(age) as avg_age | fields name, country, state, month, year, age, avg_age", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -687,7 +687,7 @@ public void testLeftJoinWithStreamstats() throws IOException { executeQuery( String.format( "source=%s as l | left join left=l right=r on l.country = r.country [ source=%s |" - + " streamstats window=2 avg(age) as avg_age]", + + " streamstats window=2 avg(age) as avg_age] | fields l.name, l.country, l.state, l.month, l.year, l.age, r.name, r.country, r.state, r.month, r.year, r.age, avg_age", TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -718,7 +718,7 @@ public void testWhereInWithStreamstatsSubquery() throws IOException { executeQuery( String.format( "source=%s | where country in [ source=%s | streamstats window=2 avg(age) as" - + " avg_age | where avg_age > 40 | fields country ]", + + " avg_age | where avg_age > 40 | fields country ] | fields name, country, state, month, year, age", TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -735,7 +735,9 @@ public void testMultipleStreamstatsWithEval() throws IOException { "source=%s | streamstats avg(age) as avg_age by country, state, name | eval" + " avg_age_divide_20 = 
avg_age - 20 | streamstats avg(avg_age_divide_20) as" + " avg_state_age by country, state | where avg_state_age > 0 | streamstats" - + " count(avg_state_age) as count_country_age_greater_20 by country", + + " count(avg_state_age) as count_country_age_greater_20 by country | fields" + + " name, country, state, month, year, age, avg_age, avg_age_divide_20," + + " avg_state_age, count_country_age_greater_20", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -771,7 +773,8 @@ public void testStreamstatsVariance() throws IOException { executeQuery( String.format( "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," - + " var_samp(age)", + + " var_samp(age) | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( @@ -821,7 +824,8 @@ public void testStreamstatsVarianceWithNull() throws IOException { executeQuery( String.format( "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," - + " var_samp(age)", + + " var_samp(age) | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifySchemaInOrder( @@ -873,7 +877,8 @@ public void testStreamstatsVarianceBy() throws IOException { executeQuery( String.format( "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," - + " var_samp(age) by country", + + " var_samp(age) by country | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -890,7 +895,7 @@ public void testStreamstatsVarianceBySpan() throws IOException { executeQuery( String.format( "source=%s | where country != 'USA' | streamstats stddev_samp(age) by span(age," - + " 10)", + + " 10) | fields name, country, state, month, year, age, `stddev_samp(age)`", TEST_INDEX_STATE_COUNTRY)); verifyDataRows( @@ -905,7 +910,8 @@ public void testStreamstatsVarianceWithNullBy() throws IOException { executeQuery( String.format( "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," - + " var_samp(age) by country", + + " var_samp(age) by country | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifyDataRows( @@ -933,7 +939,7 @@ public void testStreamstatsDistinctCount() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats dc(state) as dc_state", TEST_INDEX_STATE_COUNTRY)); + "source=%s | streamstats dc(state) as dc_state | fields name, country, state, month, year, age, dc_state", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( actual, @@ -958,7 +964,7 @@ public void testStreamstatsDistinctCountByCountry() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats dc(state) as dc_state by country", + "source=%s | streamstats dc(state) as dc_state by country | fields name, country, state, month, year, age, dc_state", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( @@ -984,7 +990,7 @@ public void testStreamstatsDistinctCountFunction() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats distinct_count(country) as dc_country", + "source=%s | streamstats distinct_count(country) as dc_country | fields name, country, state, month, year, age, 
dc_country", TEST_INDEX_STATE_COUNTRY)); verifySchemaInOrder( @@ -1010,7 +1016,7 @@ public void testStreamstatsDistinctCountWithNull() throws IOException { JSONObject actual = executeQuery( String.format( - "source=%s | streamstats dc(state) as dc_state", + "source=%s | streamstats dc(state) as dc_state | fields name, country, state, month, year, age, dc_state", TEST_INDEX_STATE_COUNTRY_WITH_NULL)); verifySchemaInOrder(