diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java index d1dad6b61a0..2f1c0e83fa2 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java @@ -92,6 +92,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -747,6 +748,11 @@ public LogicalPlan visitTrendline(Trendline node, AnalysisContext context) { computationsAndTypes.build()); } + @Override + public LogicalPlan visitStreamWindow(StreamWindow node, AnalysisContext context) { + throw getOnlyForCalciteException("Streamstats"); + } + @Override public LogicalPlan visitFlatten(Flatten node, AnalysisContext context) { throw getOnlyForCalciteException("Flatten"); diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index f478d89eff7..f85f3ba9deb 100644 --- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -80,6 +80,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SPath; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -411,6 +412,10 @@ public T visitWindow(Window window, C context) { return visitChildren(window, context); } + public T visitStreamWindow(StreamWindow node, C context) { + return visitChildren(node, context); + } + public T visitJoin(Join node, C 
context) { return visitChildren(node, context); } diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java b/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java new file mode 100644 index 00000000000..ed7bcf10289 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java @@ -0,0 +1,71 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +@Getter +@ToString +@EqualsAndHashCode(callSuper = false) +public class StreamWindow extends UnresolvedPlan { + + private final List windowFunctionList; + private final List groupList; + private final boolean current; + private final int window; + private final boolean global; + private final UnresolvedExpression resetBefore; + private final UnresolvedExpression resetAfter; + @ToString.Exclude private UnresolvedPlan child; + + /** StreamWindow Constructor. */ + public StreamWindow( + List windowFunctionList, + List groupList, + boolean current, + int window, + boolean global, + UnresolvedExpression resetBefore, + UnresolvedExpression resetAfter) { + this.windowFunctionList = windowFunctionList; + this.groupList = groupList; + this.current = current; + this.window = window; + this.global = global; + this.resetBefore = resetBefore; + this.resetAfter = resetAfter; + } + + public boolean isCurrent() { + return current; + } + + public boolean isGlobal() { + return global; + } + + @Override + public StreamWindow attach(UnresolvedPlan child) { + this.child = child; + return this; + } + + @Override + public List getChild() { + return this.child == null ? 
ImmutableList.of() : ImmutableList.of(this.child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitStreamWindow(this, context); + } +} diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index f45764df1c4..28b92c2d179 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -17,6 +17,7 @@ import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_DEDUP; import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_MAIN; import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_RARE_TOP; +import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_STREAMSTATS; import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_SUBSEARCH; import static org.opensearch.sql.calcite.utils.PlanUtils.getRelation; import static org.opensearch.sql.calcite.utils.PlanUtils.getRexCall; @@ -132,6 +133,7 @@ import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; import org.opensearch.sql.ast.tree.SPath; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Trendline; @@ -1587,6 +1589,339 @@ private void validateFillNullTypeCompatibility( } } + @Override + public RelNode visitStreamWindow(StreamWindow node, CalcitePlanContext context) { + visitChildren(node, context); + + List groupList = node.getGroupList(); + boolean hasGroup = groupList != null && !groupList.isEmpty(); + boolean hasWindow = node.getWindow() > 0; + boolean hasReset = node.getResetBefore() != null || node.getResetAfter() != null; + + // Local helper column names + final String 
RESET_BEFORE_FLAG_COL = "__reset_before_flag__"; // flag for reset_before + final String RESET_AFTER_FLAG_COL = "__reset_after_flag__"; // flag for reset_after + final String SEGMENT_ID_COL = "__seg_id__"; // segment id + + // CASE: reset + if (hasReset) { + // 1. Build helper columns: seq, before/after flags, segment_id + RelNode leftWithSeg = buildResetHelperColumns(context, node); + + // 2. Run correlate + aggregate with reset-specific filter and cleanup + return buildStreamWindowJoinPlan( + context, + leftWithSeg, + node, + groupList, + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + SEGMENT_ID_COL, + new String[] { + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + RESET_BEFORE_FLAG_COL, + RESET_AFTER_FLAG_COL, + SEGMENT_ID_COL + }); + } + + // CASE: global=true + window>0 + has group + if (node.isGlobal() && hasWindow && hasGroup) { + // 1. Add global sequence column for sliding window + RexNode streamSeq = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(streamSeq); + RelNode left = context.relBuilder.build(); + + // 2. 
Run correlate + aggregate + return buildStreamWindowJoinPlan( + context, + left, + node, + groupList, + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + null, + new String[] {ROW_NUMBER_COLUMN_FOR_STREAMSTATS}); + } + + // Default + if (hasGroup) { + // only build sequence when there is by condition + RexNode streamSeq = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(streamSeq); + } + + List overExpressions = + node.getWindowFunctionList().stream().map(w -> rexVisitor.analyze(w, context)).collect(Collectors.toList()); + context.relBuilder.projectPlus(overExpressions); + + // resort when there is by condition + if (hasGroup) { + context.relBuilder.sort(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_STREAMSTATS)); + context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_STREAMSTATS)); + } + + return context.relBuilder.peek(); + } + + private RelNode buildStreamWindowJoinPlan( + CalcitePlanContext context, + RelNode leftWithHelpers, + StreamWindow node, + List groupList, + String seqCol, + String segmentCol, + String[] helperColsToCleanup) { + + final Holder<@Nullable RexCorrelVariable> v = Holder.empty(); + context.relBuilder.push(leftWithHelpers); + context.relBuilder.variable(v::set); + + context.relBuilder.push(leftWithHelpers); + RexNode rightSeq = context.relBuilder.field(seqCol); + RexNode outerSeq = context.relBuilder.field(v.get(), seqCol); + + RexNode filter; + if (segmentCol != null) { // reset condition + RexNode segRight = context.relBuilder.field(segmentCol); + RexNode segOuter = context.relBuilder.field(v.get(), segmentCol); + RexNode frame = buildResetFrameFilter(context, node, outerSeq, rightSeq, segOuter, segRight); + RexNode group = buildGroupFilter(context, groupList, v.get()); + filter = (group == null) ? 
frame : context.relBuilder.and(frame, group); + } else { // global + window + by condition + RexNode frame = buildFrameFilter(context, node, outerSeq, rightSeq); + RexNode group = buildGroupFilter(context, groupList, v.get()); + filter = context.relBuilder.and(frame, group); + } + context.relBuilder.filter(filter); + + // aggregate all window functions on right side + List aggCalls = buildAggCallsForWindowFunctions(node.getWindowFunctionList(), context); + context.relBuilder.aggregate(context.relBuilder.groupKey(), aggCalls); + RelNode rightAgg = context.relBuilder.build(); + + // correlate LEFT with RIGHT using seq + group fields + context.relBuilder.push(leftWithHelpers); + context.relBuilder.push(rightAgg); + List requiredLeft = buildRequiredLeft(context, seqCol, groupList); + if (segmentCol != null) { // also require seg_id for reset segmentation equality + requiredLeft = new ArrayList<>(requiredLeft); + requiredLeft.add(context.relBuilder.field(2, 0, segmentCol)); + } + context.relBuilder.correlate(JoinRelType.LEFT, v.get().id, requiredLeft); + + // resort to original order + boolean hasGroup = !groupList.isEmpty(); + // resort when 1. global + window + by condition 2.reset + by condition + if (hasGroup) { + context.relBuilder.sort(context.relBuilder.field(seqCol)); + } + + // cleanup helper columns + List cleanup = new ArrayList<>(); + for (String c : helperColsToCleanup) { + cleanup.add(context.relBuilder.field(c)); + } + context.relBuilder.projectExcept(cleanup); + return context.relBuilder.peek(); + } + + private RelNode buildResetHelperColumns(CalcitePlanContext context, StreamWindow node) { + // 1. global sequence to define order + RexNode rowNum = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(rowNum); + + // 2. before/after flags + RexNode beforePred = + (node.getResetBefore() == null) + ? 
context.relBuilder.literal(false) + : rexVisitor.analyze(node.getResetBefore(), context); + RexNode afterPred = + (node.getResetAfter() == null) + ? context.relBuilder.literal(false) + : rexVisitor.analyze(node.getResetAfter(), context); + RexNode beforeFlag = + context.relBuilder.call( + SqlStdOperatorTable.CASE, + beforePred, + context.relBuilder.literal(1), + context.relBuilder.literal(0)); + RexNode afterFlag = + context.relBuilder.call( + SqlStdOperatorTable.CASE, + afterPred, + context.relBuilder.literal(1), + context.relBuilder.literal(0)); + context.relBuilder.projectPlus(context.relBuilder.alias(beforeFlag, "__reset_before_flag__")); + context.relBuilder.projectPlus(context.relBuilder.alias(afterFlag, "__reset_after_flag__")); + + // 3. session id = SUM(beforeFlag) over (to current) + SUM(afterFlag) over (to 1 preceding) + RexNode sumBefore = + context + .relBuilder + .aggregateCall( + SqlStdOperatorTable.SUM, context.relBuilder.field("__reset_before_flag__")) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .toRex(); + RexNode sumAfterPrev = + context + .relBuilder + .aggregateCall( + SqlStdOperatorTable.SUM, context.relBuilder.field("__reset_after_flag__")) + .over() + .rowsBetween( + RexWindowBounds.UNBOUNDED_PRECEDING, + RexWindowBounds.preceding(context.relBuilder.literal(1))) + .toRex(); + sumBefore = + context.relBuilder.call( + SqlStdOperatorTable.COALESCE, sumBefore, context.relBuilder.literal(0)); + sumAfterPrev = + context.relBuilder.call( + SqlStdOperatorTable.COALESCE, sumAfterPrev, context.relBuilder.literal(0)); + + RexNode segId = context.relBuilder.call(SqlStdOperatorTable.PLUS, sumBefore, sumAfterPrev); + context.relBuilder.projectPlus(context.relBuilder.alias(segId, "__seg_id__")); + return context.relBuilder.build(); + } + + private RexNode buildFrameFilter( + CalcitePlanContext context, StreamWindow node, RexNode outerSeq, RexNode rightSeq) { + // window always >0 + // frame: either [outer-(w-1), outer] or [outer-w, outer-1] + if 
(node.isCurrent()) { + RexNode lower = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, + outerSeq, + context.relBuilder.literal(node.getWindow() - 1)); + return context.relBuilder.between(rightSeq, lower, outerSeq); + } else { + RexNode lower = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, outerSeq, context.relBuilder.literal(node.getWindow())); + RexNode upper = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, outerSeq, context.relBuilder.literal(1)); + return context.relBuilder.between(rightSeq, lower, upper); + } + } + + private RexNode buildResetFrameFilter( + CalcitePlanContext context, + StreamWindow node, + RexNode outerSeq, + RexNode rightSeq, + RexNode segIdOuter, + RexNode segIdRight) { + // 1. Compute sequence range (handle running window semantics when window == 0) + RexNode seqFilter; + if (node.getWindow() == 0) { + // running: current => rightSeq <= outerSeq; excluding current => rightSeq < outerSeq + seqFilter = + node.isCurrent() + ? context.relBuilder.lessThanOrEqual(rightSeq, outerSeq) + : context.relBuilder.lessThan(rightSeq, outerSeq); + } else { + // Reuse normal frame filter logic when window > 0 + seqFilter = buildFrameFilter(context, node, outerSeq, rightSeq); + } + // 2. Ensure same segment (seg_id) for reset partitioning + RexNode segFilter = context.relBuilder.equals(segIdRight, segIdOuter); + // 3. 
Combine filters + return context.relBuilder.and(seqFilter, segFilter); + } + + private RexNode buildGroupFilter( + CalcitePlanContext context, List groupList, RexCorrelVariable correl) { + // build conjunctive equality filters: right.g_i = outer.g_i + if (groupList.isEmpty()) { + return null; + } + List equalsList = + groupList.stream() + .map( + expr -> { + String groupName = extractGroupFieldName(expr); + RexNode rightGroup = context.relBuilder.field(groupName); + RexNode outerGroup = context.relBuilder.field(correl, groupName); + return context.relBuilder.equals(rightGroup, outerGroup); + }) + .collect(Collectors.toList()); + return context.relBuilder.and(equalsList); + } + + private String extractGroupFieldName(UnresolvedExpression groupExpr) { + if (groupExpr instanceof Alias) { + Alias groupAlias = (Alias) groupExpr; + if (groupAlias.getDelegated() instanceof Field) { + Field groupField = (Field) groupAlias.getDelegated(); + return groupField.getField().toString(); + } + } else if (groupExpr instanceof Field) { + Field groupField = (Field) groupExpr; + return groupField.getField().toString(); + } + throw new IllegalArgumentException( + "Unsupported group expression: only field or alias(field) is supported"); + } + + private List buildAggCallsForWindowFunctions( + List windowExprs, CalcitePlanContext context) { + List aggCalls = new ArrayList<>(); + for (UnresolvedExpression expr : windowExprs) { + if (expr instanceof Alias) { + Alias a = (Alias) expr; + if (a.getDelegated() instanceof WindowFunction) { + WindowFunction wf = (WindowFunction) a.getDelegated(); + Function func = (Function) wf.getFunction(); + List args = func.getFuncArgs(); + // first argument is the input field, others are function params + UnresolvedExpression field = args.isEmpty() ? null : args.get(0); + List rest = + args.size() <= 1 ? 
List.of() : args.subList(1, args.size()); + AggregateFunction aggFunc = new AggregateFunction(func.getFuncName(), field, rest); + AggCall call = aggVisitor.analyze(new Alias(a.getName(), aggFunc), context); + aggCalls.add(call); + } else { + throw new IllegalArgumentException("Unsupported window function in streamstats"); + } + } else { + throw new IllegalArgumentException("Unsupported window function in streamstats"); + } + } + return aggCalls; + } + + private List buildRequiredLeft( + CalcitePlanContext context, String seqCol, List groupList) { + List requiredLeft = new ArrayList<>(); + // reference to left seq column + requiredLeft.add(context.relBuilder.field(2, 0, seqCol)); + for (UnresolvedExpression groupExpr : groupList) { + String groupName = extractGroupFieldName(groupExpr); + requiredLeft.add(context.relBuilder.field(2, 0, groupName)); + } + return requiredLeft; + } + @Override public RelNode visitFillNull(FillNull node, CalcitePlanContext context) { visitChildren(node, context); diff --git a/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java b/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java index 65e45a3c25e..5a6ac7d5370 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java +++ b/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java @@ -144,7 +144,10 @@ public static boolean isDependentField(RexNode node, Collection baseFie // to transform a field into such a literal if (node.getKind() == SqlKind.LITERAL) return true; if (node.getKind() == SqlKind.INPUT_REF && baseFields.contains(node)) return true; - if (node instanceof RexCall && ((RexCall) node).getOperator().isDeterministic()) { + // Use !isAggregator to rule out window functions like row_number() + if (node instanceof RexCall + && ((RexCall) node).getOperator().isDeterministic() + && !((RexCall) node).getOperator().isAggregator()) { return ((RexCall) node) 
.getOperands().stream().allMatch(op -> isDependentField(op, baseFields)); } diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java index dddd62f9955..c3cb610db83 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java @@ -65,6 +65,7 @@ public interface PlanUtils { String ROW_NUMBER_COLUMN_FOR_RARE_TOP = "_row_number_rare_top_"; String ROW_NUMBER_COLUMN_FOR_MAIN = "_row_number_main_"; String ROW_NUMBER_COLUMN_FOR_SUBSEARCH = "_row_number_subsearch_"; + String ROW_NUMBER_COLUMN_FOR_STREAMSTATS = "__stream_seq__"; static SpanUnit intervalUnitToSpanUnit(IntervalUnit unit) { SpanUnit result; diff --git a/docs/category.json b/docs/category.json index d9605598800..7ebe643373b 100644 --- a/docs/category.json +++ b/docs/category.json @@ -47,6 +47,7 @@ "user/ppl/cmd/showdatasources.rst", "user/ppl/cmd/sort.rst", "user/ppl/cmd/stats.rst", + "user/ppl/cmd/streamstats.rst", "user/ppl/cmd/subquery.rst", "user/ppl/cmd/syntax.rst", "user/ppl/cmd/timechart.rst", diff --git a/docs/user/ppl/cmd/streamstats.rst b/docs/user/ppl/cmd/streamstats.rst new file mode 100644 index 00000000000..e82053f748f --- /dev/null +++ b/docs/user/ppl/cmd/streamstats.rst @@ -0,0 +1,229 @@ +=========== +streamstats +=========== + +.. rubric:: Table of contents + +.. contents:: + :local: + :depth: 2 + + +Description +=========== +The ``streamstats`` command is used to calculate cumulative or rolling statistics as events are processed in order. Unlike ``stats`` or ``eventstats`` which operate on the entire dataset at once, it computes values incrementally on a per-event basis, often respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events. + +Key aspects of `streamstats`: + +1. 
It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis. +2. Supports arguments such as window (for sliding window calculations) and current (to control whether the current event is included in the calculation). +3. Retains all original events and appends new fields containing the calculated statistics. +4. Particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events. + +Difference between ``stats``, ``eventstats`` and ``streamstats`` + +All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce: + +* Transformation Behavior: + * ``stats``: Transforms all events into an aggregated result table, losing original event structure. + * ``eventstats``: Adds aggregation results as new fields to the original events without removing the event structure. + * ``streamstats``: Adds cumulative (running) aggregation results to each event as they stream through the pipeline. +* Output Format: + * ``stats``: Output contains only aggregated values. Original raw events are not preserved. + * ``eventstats``: Original events remain, with extra fields containing summary statistics. + * ``streamstats``: Original events remain, with extra fields containing running totals or cumulative statistics. +* Aggregation Scope: + * ``stats``: Based on all events in the search (or groups defined by BY clause). + * ``eventstats``: Based on all relevant events, then the result is added back to each event in the group. + * ``streamstats``: Calculations occur progressively as each event is processed; can be scoped by window. +* Use Cases: + * ``stats``: When only aggregated results are needed (e.g., counts, averages, sums). + * ``eventstats``: When aggregated statistics are needed alongside original event data. 
+ * ``streamstats``: When a running total or cumulative statistic is needed across event streams. + +Syntax +====== +streamstats [current=<bool>] [window=<int>] [global=<bool>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <function>... [by-clause] + +* function: mandatory. An aggregation function or window function. +* current: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=<bool>. **Default:** true. +* window: optional. Specifies the number of events to use when computing the statistics. Syntax: window=<int>. **Default:** 0, which means that all previous and current events are used. +* global: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=<bool>. **Default:** true. +* reset_before: optional. Before streamstats calculates for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("<eval-expression>")". **Default:** false. +* reset_after: optional. After streamstats calculations for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. Syntax: reset_after="("<eval-expression>")". **Default:** false. +* by-clause: optional. The by clause could be the fields and expressions like scalar functions and aggregation functions. Besides, the span clause can be used to split a specific field into buckets in the same interval, the stats then does the aggregation by these span buckets. Syntax: by [span-expression,] [field,]... 
**Default:** If no by-clause is specified, all events are processed as a single group and running statistics are computed across the entire event stream. +* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, ``span(age, 10)`` creates 10-year age buckets, ``span(timestamp, 1h)`` creates hourly buckets. + * Available time units: + * millisecond (ms) + * second (s) + * minute (m, case sensitive) + * hour (h) + * day (d) + * week (w) + * month (M, case sensitive) + * quarter (q) + * year (y) + +Aggregation Functions +===================== + +The streamstats command supports the following aggregation functions: + +* COUNT: Count of values +* SUM: Sum of numeric values +* AVG: Average of numeric values +* MAX: Maximum value +* MIN: Minimum value +* VAR_SAMP: Sample variance +* VAR_POP: Population variance +* STDDEV_SAMP: Sample standard deviation +* STDDEV_POP: Population standard deviation +* DISTINCT_COUNT/DC: Distinct count of values +* EARLIEST: Earliest value by timestamp +* LATEST: Latest value by timestamp + +For detailed documentation of each function, see `Aggregation Functions <../functions/aggregation.rst>`_. 
+ +Usage +===== + +Streamstats:: + + source = table | streamstats avg(a) + source = table | streamstats current = false avg(a) + source = table | streamstats window = 5 sum(b) + source = table | streamstats current = false window = 2 max(a) + source = table | where a < 50 | streamstats count(c) + source = table | streamstats min(c), max(c) by b + source = table | streamstats count(c) as count_by by b | where count_by > 1000 + source = table | streamstats dc(field) as distinct_count + source = table | streamstats distinct_count(category) by region + source = table | streamstats current=false window=2 global=false avg(a) by b + source = table | streamstats window=2 reset_before=a>31 avg(b) + source = table | streamstats current=false reset_after=a>31 avg(b) by c + + +Example 1: Calculate the running average, sum, and count of a field by group +============================================================================ + +This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender. 
+ +PPL query:: + + os> source=accounts | streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender; + fetched rows / total rows = 4/4 + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | + |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ + + +Example 2: Running maximum age over a 2-row window +================================================== + +This example calculates the running maximum age over a 2-row window, excluding the current event. 
+ +PPL query:: + + os> source=state_country | fields name, country, state, month, year, age | streamstats current=false window=2 max(age) as prev_max_age + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+--------------+ + | name | country | state | month | year | age | prev_max_age | + |-------+---------+------------+-------+------+-----+--------------| + | Jake | USA | California | 4 | 2023 | 70 | null | + | Hello | USA | New York | 4 | 2023 | 30 | 70 | + | John | Canada | Ontario | 4 | 2023 | 25 | 70 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 25 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 27 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 57 | + | David | USA | Washington | 4 | 2023 | 40 | 70 | + +-------+---------+------------+-------+------+-----+--------------+ + + +Example 3: Use the global argument to calculate running statistics +================================================================== + +The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields: + +* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups. +* global=false: the window itself is created per group, meaning each group gets its own independent window. + +This example shows how to calculate the running average of age across accounts by country, using global argument. 
+ +original data:: + + +-------+---------+------------+-------+------+-----+ + | name | country | state | month | year | age | + |-------+---------+------------+-------+------+-----+ + | Jake | USA | California | 4 | 2023 | 70 | + | Hello | USA | New York | 4 | 2023 | 30 | + | John | Canada | Ontario | 4 | 2023 | 25 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | + | Jim | Canada | B.C | 4 | 2023 | 27 | + | Peter | Canada | B.C | 4 | 2023 | 57 | + | Rick | Canada | B.C | 4 | 2023 | 70 | + | David | USA | Washington | 4 | 2023 | 40 | + +-------+---------+------------+-------+------+-----+ + +* global=true: The window slides across all rows globally (following their input order), but inside each window, aggregation is still computed by country. So we process the data stream row by row to build the sliding window with size 2. We can see that David and Rick are in a window. +* global=false: Each by group (country) forms its own independent stream and window (size 2). So David and Hello are in one window for USA. This time we get running_avg 35 for David, rather than 40 when global is set true. 
+ +PPL query:: + + os> source=state_country | fields name, country, state, month, year, age | streamstats window=2 global=true avg(age) as running_avg by country ; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+-------------+ + | name | country | state | month | year | age | running_avg | + |-------+---------+------------+-------+------+-----+-------------| + | Jake | USA | California | 4 | 2023 | 70 | 70.0 | + | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | + | David | USA | Washington | 4 | 2023 | 40 | 40.0 | + +-------+---------+------------+-------+------+-----+-------------+ + + os> source=state_country | fields name, country, state, month, year, age | streamstats window=2 global=false avg(age) as running_avg by country ; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+-------------+ + | name | country | state | month | year | age | running_avg | + |-------+---------+------------+-------+------+-----+-------------| + | Jake | USA | California | 4 | 2023 | 70 | 70.0 | + | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | + | David | USA | Washington | 4 | 2023 | 40 | 35.0 | + +-------+---------+------------+-------+------+-----+-------------+ + + +Example 4: Use the reset_before and reset_after arguments to reset statistics +============================================================================= + +This example calculates the running average of age across accounts by country, 
with resets applied. + +PPL query:: + + os> source=state_country | fields name, country, state, month, year, age | streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+---------+ + | name | country | state | month | year | age | avg_age | + |-------+---------+------------+-------+------+-----+---------| + | Jake | USA | California | 4 | 2023 | 70 | null | + | Hello | USA | New York | 4 | 2023 | 30 | 70.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | null | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | + | Jim | Canada | B.C | 4 | 2023 | 27 | null | + | Peter | Canada | B.C | 4 | 2023 | 57 | null | + | Rick | Canada | B.C | 4 | 2023 | 70 | null | + | David | USA | Washington | 4 | 2023 | 40 | null | + +-------+---------+------------+-------+------+-----+---------+ \ No newline at end of file diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst index 98b6c00b8f9..ebfb20fdbd9 100644 --- a/docs/user/ppl/index.rst +++ b/docs/user/ppl/index.rst @@ -77,7 +77,7 @@ The query start with search command and then flowing a set of command delimited - `grok command `_ - `head command `_ - + - `join command `_ - `kmeans command `_ @@ -110,12 +110,14 @@ The query start with search command and then flowing a set of command delimited - `stats command `_ + - `streamstats command `_ + - `subquery (aka subsearch) command `_ - `reverse command `_ - `table command `_ - + - `timechart command `_ - `top command `_ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index 69507c71aa5..15051417db1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -66,6 +66,7 @@ CalcitePPLCryptographicFunctionIT.class, 
CalcitePPLDedupIT.class, CalcitePPLEventstatsIT.class, + CalciteStreamstatsCommandIT.class, CalcitePPLExistsSubqueryIT.class, CalcitePPLExplainIT.class, CalcitePPLFillnullIT.class, diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index a5dee8aec58..6ba87f45c05 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -615,6 +615,45 @@ public void testEventstatsDistinctCountFunctionExplain() throws IOException { assertJsonEqualsIgnoreId(expected, result); } + @Test + public void testStreamstatsDistinctCountExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats dc(state) as distinct_states"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_dc.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsDistinctCountFunctionExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats distinct_count(state) as" + + " distinct_states by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_distinct_count.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsGlobalExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats window=2 global=true avg(age) as" + + " avg_age by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_global.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsResetExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats current=false 
reset_before=age>34" + + " reset_after=age<25 avg(age) as avg_age by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_reset.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + // Only for Calcite, as v2 gets unstable serialized string for function @Test public void testExplainOnAggregationWithSumEnhancement() throws IOException { @@ -740,6 +779,42 @@ public void testExplainOnEventstatsEarliestLatestNoGroupBy() throws IOException TEST_INDEX_LOGS))); } + @Test + public void testExplainOnStreamstatsEarliestLatest() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message) as earliest_message, latest(message) as" + + " latest_message by server", + TEST_INDEX_LOGS))); + } + + @Test + public void testExplainOnStreamstatsEarliestLatestWithCustomTimeField() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest_custom_time.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message, created_at) as earliest_message," + + " latest(message, created_at) as latest_message by level", + TEST_INDEX_LOGS))); + } + + @Test + public void testExplainOnStreamstatsEarliestLatestNoGroupBy() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest_no_group.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message) as earliest_message, latest(message) as" + + " latest_message", + TEST_INDEX_LOGS))); + } + @Test public void testListAggregationExplain() throws IOException { String expected = loadExpectedPlan("explain_list_aggregation.json"); diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java 
b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java index 0b66cf5a225..84f81b4bc99 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java @@ -27,7 +27,7 @@ public void init() throws Exception { } @Test - public void testEventstat() throws IOException { + public void testEventstats() throws IOException { JSONObject actual = executeQuery( String.format( @@ -57,7 +57,7 @@ public void testEventstat() throws IOException { } @Test - public void testEventstatWithNull() throws IOException { + public void testEventstatsWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -89,7 +89,7 @@ public void testEventstatWithNull() throws IOException { } @Test - public void testEventstatBy() throws IOException { + public void testEventstatsBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -120,7 +120,7 @@ public void testEventstatBy() throws IOException { } @Test - public void testEventstatByWithNull() throws IOException { + public void testEventstatsByWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -169,7 +169,7 @@ public void testEventstatByWithNull() throws IOException { } @Test - public void testEventstatBySpan() throws IOException { + public void testEventstatsBySpan() throws IOException { JSONObject actual = executeQuery( String.format( @@ -187,7 +187,7 @@ public void testEventstatBySpan() throws IOException { } @Test - public void testEventstatBySpanWithNull() throws IOException { + public void testEventstatsBySpanWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -207,7 +207,7 @@ public void testEventstatBySpanWithNull() throws IOException { } @Test - public void testEventstatByMultiplePartitions1() throws IOException { + public void testEventstatsByMultiplePartitions1() throws 
IOException { JSONObject actual = executeQuery( String.format( @@ -225,7 +225,7 @@ public void testEventstatByMultiplePartitions1() throws IOException { } @Test - public void testEventstatByMultiplePartitions2() throws IOException { + public void testEventstatsByMultiplePartitions2() throws IOException { JSONObject actual = executeQuery( String.format( @@ -243,7 +243,7 @@ public void testEventstatByMultiplePartitions2() throws IOException { } @Test - public void testEventstatByMultiplePartitionsWithNull1() throws IOException { + public void testEventstatsByMultiplePartitionsWithNull1() throws IOException { JSONObject actual = executeQuery( String.format( @@ -263,7 +263,7 @@ public void testEventstatByMultiplePartitionsWithNull1() throws IOException { } @Test - public void testEventstatByMultiplePartitionsWithNull2() throws IOException { + public void testEventstatsByMultiplePartitionsWithNull2() throws IOException { JSONObject actual = executeQuery( String.format( @@ -298,7 +298,7 @@ public void testUnsupportedWindowFunctions() { } @Test - public void testMultipleEventstat() throws IOException { + public void testMultipleEventstats() throws IOException { JSONObject actual = executeQuery( String.format( @@ -316,7 +316,7 @@ public void testMultipleEventstat() throws IOException { } @Test - public void testMultipleEventstatWithNull() throws IOException { + public void testMultipleEventstatsWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -336,7 +336,7 @@ public void testMultipleEventstatWithNull() throws IOException { } @Test - public void testMultipleEventstatWithEval() throws IOException { + public void testMultipleEventstatsWithEval() throws IOException { JSONObject actual = executeQuery( String.format( @@ -356,7 +356,7 @@ public void testMultipleEventstatWithEval() throws IOException { } @Test - public void testEventstatEmptyRows() throws IOException { + public void testEventstatsEmptyRows() throws IOException { JSONObject actual 
= executeQuery( String.format( @@ -376,7 +376,7 @@ public void testEventstatEmptyRows() throws IOException { } @Test - public void testEventstatVariance() throws IOException { + public void testEventstatsVariance() throws IOException { JSONObject actual = executeQuery( String.format( @@ -447,7 +447,7 @@ public void testEventstatVariance() throws IOException { } @Test - public void testEventstatVarianceWithNull() throws IOException { + public void testEventstatsVarianceWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -511,7 +511,7 @@ public void testEventstatVarianceWithNull() throws IOException { } @Test - public void testEventstatVarianceBy() throws IOException { + public void testEventstatsVarianceBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -529,7 +529,7 @@ public void testEventstatVarianceBy() throws IOException { } @Test - public void testEventstatVarianceBySpan() throws IOException { + public void testEventstatsVarianceBySpan() throws IOException { JSONObject actual = executeQuery( String.format( @@ -544,7 +544,7 @@ public void testEventstatVarianceBySpan() throws IOException { } @Test - public void testEventstatVarianceWithNullBy() throws IOException { + public void testEventstatsVarianceWithNullBy() throws IOException { JSONObject actual = executeQuery( String.format( diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java new file mode 100644 index 00000000000..0c899b501bc --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java @@ -0,0 +1,1101 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.legacy.TestsConstants.*; +import static org.opensearch.sql.util.MatcherUtils.*; + 
+import java.io.IOException; +import java.util.List; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.client.Request; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +public class CalciteStreamstatsCommandIT extends PPLIntegTestCase { + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.STATE_COUNTRY); + loadIndex(Index.STATE_COUNTRY_WITH_NULL); + loadIndex(Index.BANK_TWO); + loadIndex(Index.LOGS); + } + + @Test + public void testStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3, 41.666666666666664, 25, 70), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4, 36.25, 20, 70)); + } + + @Test + public void testStreamstatsWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + 
schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3, 41.666666666666664, 25, 70), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4, 36.25, 20, 70), + rows(null, "Canada", null, 4, 2023, 10, 5, 31, 10, 70), + rows("Kevin", null, null, 4, 2023, null, 6, 31, 10, 70)); + } + + @Test + public void testStreamstatsBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by country | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsByWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by country | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", 
"bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 3, 18.333333333333332, 10, 25), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + + actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by state | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 2, 10, 10, 10)); + } + + @Test + public void testStreamstatsBySpan() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsBySpanWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats 
count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsByMultiplePartitions1() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, country | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsByMultiplePartitions2() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, state | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20)); + } + + @Test + public void 
testStreamstatsByMultiplePartitionsWithNull1() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, country | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsByMultiplePartitionsWithNull2() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, state | fields name, country, state, month, year, age, cnt, avg, min, max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsCurrent() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current=false avg(age) as prev_avg | fields name, country, state, month, year, age, prev_avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 
2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 41.666666666666664)); + } + + @Test + public void testStreamstatsCurrentWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current=false avg(age) as prev_avg | fields name, country, state, month, year, age, prev_avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 41.666666666666664), + rows(null, "Canada", null, 4, 2023, 10, 36.25), + rows("Kevin", null, null, 4, 2023, null, 31)); + } + + @Test + public void testStreamstatsWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 3 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 25)); + } + + @Test + public void testStreamstatsWindowWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 3 avg(age) as avg | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 18.333333333333332), + rows("Kevin", null, null, 4, 2023, null, 15)); + } + + @Test + public 
void testStreamstatsBigWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 10 avg(age) as avg | fields name, country, state, month, year, age, avg", TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 36.25)); + } + + @Test + public void testStreamstatsWindowError() { + Throwable e = + assertThrowsWithReplace( + IllegalArgumentException.class, + () -> + executeQuery( + String.format( + "source=%s | streamstats window=-1 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY))); + verifyErrorMessageContains(e, "Window size must be >= 0, but got: -1"); + } + + @Test + public void testStreamstatsCurrentAndWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current = false window = 2 avg(age) as avg | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 27.5)); + } + + @Test + public void testStreamstatsCurrentAndWindowWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current = false window = 2 avg(age) as avg | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 27.5), + rows(null, "Canada", null, 4, 2023, 10, 22.5), + 
rows("Kevin", null, null, 4, 2023, null, 15)); + } + + @Test + public void testStreamstatsGlobal() throws IOException { + final int docId = 5; + Request insertRequest = + new Request( + "PUT", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 40,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=false avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 35)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=true avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 40)); + + Request deleteRequest = + new Request( + "DELETE", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testStreamstatsGlobalWithNull() throws IOException { + final int docId = 7; + Request insertRequest = + new Request( + "PUT", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 40,\"state\":" + + " 
\"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=false avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 35)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=true avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 40)); + + Request deleteRequest = + new Request( + "DELETE", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testStreamstatsReset() throws IOException { + final int docId = 5; + Request insertRequest = + new Request( + "PUT", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 28,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + 
executeQuery( + String.format( + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + Request deleteRequest = + new Request( + "DELETE", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testStreamstatsResetWithNull() throws IOException { + final int docId = 7; + Request insertRequest = + new Request( + "PUT", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 28,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 
30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country | fields name, country, state, month, year, age, avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + Request deleteRequest = + new Request( + "DELETE", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + client().performRequest(deleteRequest); + } + + @Test + public void testUnsupportedWindowFunctions() { + List unsupported = List.of("PERCENTILE_APPROX", "PERCENTILE"); + for (String u : unsupported) { + Throwable e = + assertThrowsWithReplace( + UnsupportedOperationException.class, + () -> + executeQuery( + String.format( + "source=%s | streamstats %s(age)", TEST_INDEX_STATE_COUNTRY, u))); + verifyErrorMessageContains(e, "Unexpected window function: " + u); + } + } + + @Test + public void testMultipleStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" + + " avg(avg_age) as avg_state_age by country | fields name, country, state, month, year, age, avg_age, avg_state_age", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 
30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20, 22.5)); + } + + @Test + public void testMultipleStreamstatsWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" + + " avg(avg_age) as avg_state_age by country | fields name, country, state, month, year, age, avg_age, avg_state_age", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 10, 18.333333333333332), + rows("Kevin", null, null, 4, 2023, null, null, null)); + } + + @Test + public void testStreamstatsAndEventstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats avg(age) as avg_age| streamstats" + + " avg(age) as avg_age_stream | fields name, country, state, month, year, age, avg_age, avg_age_stream", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 36.25, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 36.25, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 36.25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 36.25, 36.25)); + } + + @Test + public void testStreamstatsAndSort() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | sort age | streamstats window = 2 avg(age) as avg_age | fields name, country, state, month, year, age, avg_age", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows("John", "Canada", "Ontario", 4, 2023, 25, 22.5), + rows("Hello", "USA", "New York", 4, 2023, 30, 27.5), + 
rows("Jake", "USA", "California", 4, 2023, 70, 50)); + } + + @Test + public void testLeftJoinWithStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s as l | left join left=l right=r on l.country = r.country [ source=%s |" + + " streamstats window=2 avg(age) as avg_age] | fields l.name, l.country, l.state, l.month, l.year, l.age, r.name, r.country, r.state, r.month, r.year, r.age, avg_age", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows( + "John", "Canada", "Ontario", 4, 2023, 25, "John", "Canada", "Ontario", 4, 2023, 25, + 27.5), + rows( + "John", "Canada", "Ontario", 4, 2023, 25, "Jane", "Canada", "Quebec", 4, 2023, 20, + 22.5), + rows("John", "Canada", "Ontario", 4, 2023, 25, null, "Canada", null, 4, 2023, 10, 15), + rows( + "Jane", "Canada", "Quebec", 4, 2023, 20, "John", "Canada", "Ontario", 4, 2023, 25, + 27.5), + rows( + "Jane", "Canada", "Quebec", 4, 2023, 20, "Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, null, "Canada", null, 4, 2023, 10, 15), + rows( + "Jake", "USA", "California", 4, 2023, 70, "Jake", "USA", "California", 4, 2023, 70, 70), + rows("Jake", "USA", "California", 4, 2023, 70, "Hello", "USA", "New York", 4, 2023, 30, 50), + rows("Hello", "USA", "New York", 4, 2023, 30, "Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, "Hello", "USA", "New York", 4, 2023, 30, 50)); + } + + @Test + public void testWhereInWithStreamstatsSubquery() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where country in [ source=%s | streamstats window=2 avg(age) as" + + " avg_age | where avg_age > 40 | fields country ] | fields name, country, state, month, year, age", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70), + rows("Hello", "USA", 
"New York", 4, 2023, 30)); + } + + @Test + public void testMultipleStreamstatsWithEval() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by country, state, name | eval" + + " avg_age_divide_20 = avg_age - 20 | streamstats avg(avg_age_divide_20) as" + + " avg_state_age by country, state | where avg_state_age > 0 | streamstats" + + " count(avg_state_age) as count_country_age_greater_20 by country | fields" + + " name, country, state, month, year, age, avg_age, avg_age_divide_20," + + " avg_state_age, count_country_age_greater_20", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 50, 50, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 10, 10, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 5, 5, 1)); + } + + @Test + public void testStreamstatsEmptyRows() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where name = 'non-existed' | streamstats count(), avg(age), min(age)," + + " max(age), stddev_pop(age), stddev_samp(age), var_pop(age), var_samp(age)", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyNumOfRows(actual, 0); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | where name = 'non-existed' | streamstats count(), avg(age), min(age)," + + " max(age), stddev_pop(age), stddev_samp(age), var_pop(age), var_samp(age) by" + + " country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyNumOfRows(actual2, 0); + } + + @Test + public void testStreamstatsVariance() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + 
schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("stddev_pop(age)", "double"), + schema("stddev_samp(age)", "double"), + schema("var_pop(age)", "double"), + schema("var_samp(age)", "double")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows( + "John", + "Canada", + "Ontario", + 4, + 2023, + 25, + 20.138409955990955, + 24.66441431158124, + 405.55555555555566, + 608.3333333333335), + rows( + "Jane", + "Canada", + "Quebec", + 4, + 2023, + 20, + 19.803724397193573, + 22.86737122335374, + 392.1875, + 522.9166666666666)); + } + + @Test + public void testStreamstatsVarianceWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("stddev_pop(age)", "double"), + schema("stddev_samp(age)", "double"), + schema("var_pop(age)", "double"), + schema("var_samp(age)", "double")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows( + "John", + "Canada", + "Ontario", + 4, + 2023, + 25, + 20.138409955990955, + 24.66441431158124, + 405.55555555555566, + 608.3333333333335), + rows( + "Jane", + "Canada", + "Quebec", + 4, + 2023, + 20, + 19.803724397193573, + 22.86737122335374, + 392.1875, + 522.9166666666666), + rows(null, 
"Canada", null, 4, 2023, 10, 20.591260281974, 23.021728866442675, 424, 530), + rows("Kevin", null, null, 4, 2023, null, 20.591260281974, 23.021728866442675, 424, 530)); + } + + @Test + public void testStreamstatsVarianceBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) by country | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows("John", "Canada", "Ontario", 4, 2023, 25, 0, null, 0, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2.5, 3.5355339059327378, 6.25, 12.5)); + } + + @Test + public void testStreamstatsVarianceBySpan() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where country != 'USA' | streamstats stddev_samp(age) by span(age," + + " 10) | fields name, country, state, month, year, age, `stddev_samp(age)`", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 3.5355339059327378)); + } + + @Test + public void testStreamstatsVarianceWithNullBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) by country | fields name, country, state, month, year, age," + + " `stddev_pop(age)`, `stddev_samp(age)`, `var_pop(age)`, `var_samp(age)`", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + 
rows("John", "Canada", "Ontario", 4, 2023, 25, 0, null, 0, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2.5, 3.5355339059327378, 6.25, 12.5), + rows( + null, + "Canada", + null, + 4, + 2023, + 10, + 6.2360956446232345, + 7.6376261582597325, + 38.88888888888888, + 58.333333333333314), + rows("Kevin", null, null, 4, 2023, null, null, null, null, null)); + } + + @Test + public void testStreamstatsDistinctCount() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state | fields name, country, state, month, year, age, dc_state", TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4)); + } + + @Test + public void testStreamstatsDistinctCountByCountry() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state by country | fields name, country, state, month, year, age, dc_state", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2)); + } + + @Test + public void testStreamstatsDistinctCountFunction() throws IOException { + JSONObject actual = + executeQuery( 
+ String.format( + "source=%s | streamstats distinct_count(country) as dc_country | fields name, country, state, month, year, age, dc_country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_country", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 1), + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2)); + } + + @Test + public void testStreamstatsDistinctCountWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state | fields name, country, state, month, year, age, dc_state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows(null, "Canada", null, 4, 2023, 10, 4), + rows("Kevin", null, null, 4, 2023, null, 4)); + } + + @Test + public void testStreamstatsEarliestAndLatest() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats earliest(message), latest(message) by server", + TEST_INDEX_LOGS)); + verifySchema( + actual, + schema("created_at", "timestamp"), + schema("server", "string"), + schema("@timestamp", "timestamp"), + schema("message", "string"), + schema("level", "string"), + schema("earliest(message)", "string"), + schema("latest(message)", 
"string")); + verifyDataRows( + actual, + rows( + "2023-01-05 00:00:00", + "server1", + "2023-01-01 00:00:00", + "Database connection failed", + "ERROR", + "Database connection failed", + "Database connection failed"), + rows( + "2023-01-04 00:00:00", + "server2", + "2023-01-02 00:00:00", + "Service started", + "INFO", + "Service started", + "Service started"), + rows( + "2023-01-03 00:00:00", + "server1", + "2023-01-03 00:00:00", + "High memory usage", + "WARN", + "Database connection failed", + "High memory usage"), + rows( + "2023-01-02 00:00:00", + "server3", + "2023-01-04 00:00:00", + "Disk space low", + "ERROR", + "Disk space low", + "Disk space low"), + rows( + "2023-01-01 00:00:00", + "server2", + "2023-01-05 00:00:00", + "Backup completed", + "INFO", + "Service started", + "Backup completed")); + } +} diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml new file mode 100644 index 00000000000..9dd91501bf8 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], 
OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml new file mode 100644 index 00000000000..32538ab17df --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..12=[{inputs}], proj#0..10=[{exprs}], distinct_states=[$t12]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED 
PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml new file mode 100644 index 00000000000..cac21b929ee --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + 
EnumerableCalc(expr#0..7=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t6], latest_message=[$t7]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$5], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {1} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml new file mode 100644 index 00000000000..f19625d85e5 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], 
_score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..7=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t6], latest_message=[$t7]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$5], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $0), ARG_MAX($3, $0)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml new file mode 100644 index 00000000000..f17643ab804 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[ARG_MIN($3, $2) OVER (ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, 
$2)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml new file mode 100644 index 00000000000..293dd785f96 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml @@ -0,0 +1,29 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(>=($17, -($cor0.__stream_seq__, 1)), <=($17, $cor0.__stream_seq__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], 
__stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..11=[{inputs}], expr#12=[1], expr#13=[-($t11, $t12)], proj#0..11=[{exprs}], $f12=[$t13]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($0, $3), >=($5, $2), <=($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..1=[{inputs}], expr#2=[1], expr#3=[-($t1, $t2)], proj#0..1=[{exprs}], $f2=[$t3]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender"],"excludes":[]}}, requestedTotalSize=2147483647, 
pageSize=null, startFrom=0)]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml new file mode 100644 index 00000000000..0e8ed3a3dde --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml @@ -0,0 +1,38 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$21]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17, 20}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], 
__reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(<($17, $cor0.__stream_seq__), =($20, $cor0.__seg_id__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..15=[{inputs}], expr#16=[0], expr#17=[COALESCE($t15, $t16)], expr#18=[+($t14, $t17)], proj#0..11=[{exprs}], __seg_id__=[$t18]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($12)])], window#1=[window(rows between UNBOUNDED PRECEDING and $14 PRECEDING aggs [$SUM0($13)])], constants=[[1]]) + EnumerableCalc(expr#0..11=[{inputs}], expr#12=[34], expr#13=[>($t8, $t12)], expr#14=[1], 
expr#15=[0], expr#16=[CASE($t13, $t14, $t15)], expr#17=[25], expr#18=[<($t8, $t17)], expr#19=[CASE($t18, $t14, $t15)], proj#0..11=[{exprs}], __reset_before_flag__=[$t16], __reset_after_flag__=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($2, $6), =($0, $3), <($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[COALESCE($t5, $t6)], expr#8=[+($t4, $t7)], proj#0..1=[{exprs}], __seg_id__=[$t8]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($2)])], window#1=[window(rows between UNBOUNDED PRECEDING and $4 PRECEDING aggs [$SUM0($3)])], constants=[[1]]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[34], expr#4=[>($t1, $t3)], expr#5=[1], expr#6=[0], expr#7=[CASE($t4, $t5, $t6)], expr#8=[25], expr#9=[<($t1, $t8)], expr#10=[CASE($t9, $t5, $t6)], gender=[$t0], __stream_seq__=[$t2], __reset_before_flag__=[$t7], __reset_after_flag__=[$t10]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + 
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..6=[{inputs}], expr#7=[0], expr#8=[COALESCE($t6, $t7)], expr#9=[+($t5, $t8)], proj#0..2=[{exprs}], __seg_id__=[$t9]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($3)])], window#1=[window(rows between UNBOUNDED PRECEDING and $5 PRECEDING aggs [$SUM0($4)])], constants=[[1]]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[34], expr#4=[>($t1, $t3)], expr#5=[1], expr#6=[0], expr#7=[CASE($t4, $t5, $t6)], expr#8=[25], expr#9=[<($t1, $t8)], expr#10=[CASE($t9, $t5, $t6)], proj#0..2=[{exprs}], __reset_before_flag__=[$t7], __reset_after_flag__=[$t10]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml new file mode 100644 index 00000000000..6ffa5ad304c --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml @@ -0,0 +1,10 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], 
lastname=[$10], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..17=[{inputs}], proj#0..10=[{exprs}], $11=[$t17]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml new file mode 100644 index 00000000000..550cf0ea9cb --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + 
physical: | + EnumerableCalc(expr#0..18=[{inputs}], proj#0..10=[{exprs}], distinct_states=[$t18]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$17], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml new file mode 100644 index 00000000000..c37fae48771 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..13=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t12], latest_message=[$t13]) + 
EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {1} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml new file mode 100644 index 00000000000..b85e4b6b7bb --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..13=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t12], latest_message=[$t13]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + 
EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $0), ARG_MAX($3, $0)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml new file mode 100644 index 00000000000..79dcbca7555 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml @@ -0,0 +1,10 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[ARG_MIN($3, $2) OVER (ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..12=[{inputs}], proj#0..4=[{exprs}], $5=[$t11], $6=[$t12]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml new file mode 100644 index 00000000000..3ac52e02f55 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml @@ -0,0 +1,30 @@ +calcite: + logical: | + 
LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(>=($17, -($cor0.__stream_seq__, 1)), <=($17, $cor0.__stream_seq__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[1], expr#19=[-($t17, $t18)], proj#0..10=[{exprs}], __stream_seq__=[$t17], $f12=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, 
$t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($0, $3), >=($5, $2), <=($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[1], expr#19=[-($t17, $t18)], gender=[$t4], __stream_seq__=[$t17], $f12=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..17=[{inputs}], gender=[$t4], age=[$t8], $2=[$t17]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml new file mode 100644 index 00000000000..be28e9b1d8c --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml @@ -0,0 +1,38 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$21]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17, 20}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], 
__reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(<($17, $cor0.__stream_seq__), =($20, $cor0.__seg_id__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + 
EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..15=[{inputs}], expr#16=[0], expr#17=[COALESCE($t15, $t16)], expr#18=[+($t14, $t17)], proj#0..11=[{exprs}], __seg_id__=[$t18]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($12)])], window#1=[window(rows between UNBOUNDED PRECEDING and $14 PRECEDING aggs [$SUM0($13)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], proj#0..10=[{exprs}], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($2, $6), =($0, $3), <($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[COALESCE($t5, $t6)], expr#8=[+($t4, $t7)], proj#0..1=[{exprs}], __seg_id__=[$t8]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($2)])], window#1=[window(rows between UNBOUNDED PRECEDING and $4 PRECEDING aggs [$SUM0($3)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], gender=[$t4], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + 
EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..6=[{inputs}], expr#7=[0], expr#8=[COALESCE($t6, $t7)], expr#9=[+($t5, $t8)], proj#0..2=[{exprs}], __seg_id__=[$t9]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($3)])], window#1=[window(rows between UNBOUNDED PRECEDING and $5 PRECEDING aggs [$SUM0($4)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], gender=[$t4], age=[$t8], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index b3028e06b5e..8d09afa7f46 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -22,6 +22,7 @@ TABLE: 'TABLE'; // Alias for FIELDS command RENAME: 'RENAME'; STATS: 'STATS'; EVENTSTATS: 'EVENTSTATS'; +STREAMSTATS: 'STREAMSTATS'; DEDUP: 'DEDUP'; SORT: 'SORT'; EVAL: 'EVAL'; @@ -111,6 +112,11 @@ DEDUP_SPLITVALUES: 'DEDUP_SPLITVALUES'; PARTITIONS: 'PARTITIONS'; ALLNUM: 'ALLNUM'; DELIM: 'DELIM'; +CURRENT: 'CURRENT'; +WINDOW: 'WINDOW'; +GLOBAL: 'GLOBAL'; +RESET_BEFORE: 'RESET_BEFORE'; +RESET_AFTER: 'RESET_AFTER'; BUCKET_NULLABLE: 'BUCKET_NULLABLE'; USENULL: 'USENULL'; CENTROIDS: 'CENTROIDS'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index df6fd4384ff..fc393788808 100644 --- 
a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -54,6 +54,7 @@ commands | renameCommand | statsCommand | eventstatsCommand + | streamstatsCommand | dedupCommand | sortCommand | evalCommand @@ -92,6 +93,7 @@ commandName | RENAME | STATS | EVENTSTATS + | STREAMSTATS | DEDUP | SORT | EVAL @@ -249,6 +251,34 @@ eventstatsCommand : EVENTSTATS eventstatsAggTerm (COMMA eventstatsAggTerm)* (statsByClause)? ; +streamstatsCommand + : STREAMSTATS streamstatsArgs streamstatsAggTerm (COMMA streamstatsAggTerm)* (statsByClause)? + ; + +streamstatsArgs + : (currentArg | windowArg | globalArg | resetBeforeArg | resetAfterArg)* + ; + +currentArg + : CURRENT EQUAL current = booleanLiteral + ; + +windowArg + : WINDOW EQUAL window = integerLiteral + ; + +globalArg + : GLOBAL EQUAL global = booleanLiteral + ; + +resetBeforeArg + : RESET_BEFORE EQUAL logicalExpression + ; + +resetAfterArg + : RESET_AFTER EQUAL logicalExpression + ; + dedupCommand : DEDUP (number = integerLiteral)? fieldList (KEEPEMPTY EQUAL keepempty = booleanLiteral)? (CONSECUTIVE EQUAL consecutive = booleanLiteral)? ; @@ -635,6 +665,10 @@ eventstatsAggTerm : windowFunction (AS alias = wcFieldExpression)? ; +streamstatsAggTerm + : windowFunction (AS alias = wcFieldExpression)? 
+ ; + windowFunction : windowFunctionName LT_PRTHS functionArgs RT_PRTHS ; @@ -1501,6 +1535,11 @@ searchableKeyWord | PARTITIONS | ALLNUM | DELIM + | CURRENT + | WINDOW + | GLOBAL + | RESET_BEFORE + | RESET_AFTER | BUCKET_NULLABLE | USENULL | CENTROIDS diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 67cbdebde23..8fde9aa4ba6 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -45,7 +45,29 @@ import org.apache.commons.lang3.tuple.Pair; import org.opensearch.sql.ast.EmptySourcePropagateVisitor; import org.opensearch.sql.ast.dsl.AstDSL; -import org.opensearch.sql.ast.expression.*; +import org.opensearch.sql.ast.expression.Alias; +import org.opensearch.sql.ast.expression.AllFieldsExcludeMeta; +import org.opensearch.sql.ast.expression.Argument; +import org.opensearch.sql.ast.expression.Argument.ArgumentMap; +import org.opensearch.sql.ast.expression.DataType; +import org.opensearch.sql.ast.expression.EqualTo; +import org.opensearch.sql.ast.expression.Field; +import org.opensearch.sql.ast.expression.Let; +import org.opensearch.sql.ast.expression.Literal; +import org.opensearch.sql.ast.expression.Map; +import org.opensearch.sql.ast.expression.ParseMethod; +import org.opensearch.sql.ast.expression.PatternMethod; +import org.opensearch.sql.ast.expression.PatternMode; +import org.opensearch.sql.ast.expression.QualifiedName; +import org.opensearch.sql.ast.expression.SearchAnd; +import org.opensearch.sql.ast.expression.SearchExpression; +import org.opensearch.sql.ast.expression.SearchGroup; +import org.opensearch.sql.ast.expression.Span; +import org.opensearch.sql.ast.expression.SpanUnit; +import org.opensearch.sql.ast.expression.UnresolvedArgument; +import org.opensearch.sql.ast.expression.UnresolvedExpression; +import org.opensearch.sql.ast.expression.WindowFrame; +import 
org.opensearch.sql.ast.expression.WindowFunction; import org.opensearch.sql.ast.tree.AD; import org.opensearch.sql.ast.tree.Aggregation; import org.opensearch.sql.ast.tree.Append; @@ -83,6 +105,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SpanBin; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -447,6 +470,7 @@ public UnresolvedPlan visitStatsCommand(StatsCommandContext ctx) { return aggregation; } + /** Eventstats command. */ public UnresolvedPlan visitEventstatsCommand(OpenSearchPPLParser.EventstatsCommandContext ctx) { ImmutableList.Builder windownFunctionListBuilder = new ImmutableList.Builder<>(); @@ -468,6 +492,93 @@ public UnresolvedPlan visitEventstatsCommand(OpenSearchPPLParser.EventstatsComma return new Window(windownFunctionListBuilder.build()); } + /** Streamstats command. */ + public UnresolvedPlan visitStreamstatsCommand(OpenSearchPPLParser.StreamstatsCommandContext ctx) { + // 1. 
Parse arguments from the streamstats command + List argExprList = ArgumentFactory.getArgumentList(ctx); + ArgumentMap arguments = ArgumentMap.of(argExprList); + + // current, window and global from ArgumentFactory + boolean current = (Boolean) arguments.get("current").getValue(); + int window = (Integer) arguments.get("window").getValue(); + boolean global = (Boolean) arguments.get("global").getValue(); + + if (window < 0) { + throw new IllegalArgumentException("Window size must be >= 0, but got: " + window); + } + + // reset_before, reset_after + UnresolvedExpression resetBeforeExpr = + Optional.ofNullable(ctx.streamstatsArgs()) + .filter(args -> args.resetBeforeArg() != null && !args.resetBeforeArg().isEmpty()) + .map(args -> expressionBuilder.visit(args.resetBeforeArg(0).logicalExpression())) + .orElse(null); + + UnresolvedExpression resetAfterExpr = + Optional.ofNullable(ctx.streamstatsArgs()) + .filter(args -> args.resetAfterArg() != null && !args.resetAfterArg().isEmpty()) + .map(args -> expressionBuilder.visit(args.resetAfterArg(0).logicalExpression())) + .orElse(null); + + // 2.1 Build a WindowFrame from the provided arguments + WindowFrame frame = buildFrameFromArgs(current, window); + // 2.2 Build groupList + List groupList = getPartitionExprList(ctx.statsByClause()); + + // 3. Build each window function in the command + ImmutableList.Builder windowFunctionListBuilder = + new ImmutableList.Builder<>(); + + for (OpenSearchPPLParser.StreamstatsAggTermContext aggCtx : ctx.streamstatsAggTerm()) { + UnresolvedExpression windowFunction = internalVisitExpression(aggCtx.windowFunction()); + if (windowFunction instanceof WindowFunction) { + WindowFunction wf = (WindowFunction) windowFunction; + // Attach PARTITION BY clause expressions + wf.setPartitionByList(groupList); + // Inject the frame + wf.setWindowFrame(frame); + } + String name = + aggCtx.alias == null + ? 
getTextInQuery(aggCtx) + : StringUtils.unquoteIdentifier(aggCtx.alias.getText()); + Alias alias = new Alias(name, windowFunction); + windowFunctionListBuilder.add(alias); + } + + // 4. Build StreamWindow AST node + return new StreamWindow( + windowFunctionListBuilder.build(), + groupList, + current, + window, + global, + resetBeforeExpr, + resetAfterExpr); + } + + private WindowFrame buildFrameFromArgs(boolean current, int window) { + // Build the frame + if (window > 0) { + if (current) { + // N-1 PRECEDING to CURRENT ROW + return WindowFrame.of( + WindowFrame.FrameType.ROWS, (window - 1) + " PRECEDING", "CURRENT ROW"); + } else { + // N PRECEDING to 1 PRECEDING + return WindowFrame.of(WindowFrame.FrameType.ROWS, window + " PRECEDING", "1 PRECEDING"); + } + } else { + // Default: running total + if (current) { + return WindowFrame.toCurrentRow(); + } else { + // Default: running total excluding current row + return WindowFrame.of(WindowFrame.FrameType.ROWS, "UNBOUNDED PRECEDING", "1 PRECEDING"); + } + } + } + /** Dedup command. */ @Override public UnresolvedPlan visitDedupCommand(DedupCommandContext ctx) { diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java index 85481da2426..acf204e8030 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java @@ -28,6 +28,7 @@ import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.IntegerLiteralContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.PrefixSortFieldContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SortFieldContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.StreamstatsCommandContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SuffixSortFieldContext; /** Util class to get all arguments as a list from the PPL command. 
*/ @@ -89,6 +90,25 @@ private static boolean legacyPreferred(Settings settings) { || Boolean.TRUE.equals(settings.getSettingValue(Settings.Key.PPL_SYNTAX_LEGACY_PREFERRED)); } + /** + * Get list of {@link Argument}. + * + * @param ctx StreamstatsCommandContext instance + * @return the list of arguments fetched from the streamstats command + */ + public static List getArgumentList(StreamstatsCommandContext ctx) { + return Arrays.asList( + ctx.streamstatsArgs().currentArg() != null && !ctx.streamstatsArgs().currentArg().isEmpty() + ? new Argument("current", getArgumentValue(ctx.streamstatsArgs().currentArg(0).current)) + : new Argument("current", new Literal(true, DataType.BOOLEAN)), + ctx.streamstatsArgs().windowArg() != null && !ctx.streamstatsArgs().windowArg().isEmpty() + ? new Argument("window", getArgumentValue(ctx.streamstatsArgs().windowArg(0).window)) + : new Argument("window", new Literal(0, DataType.INTEGER)), + ctx.streamstatsArgs().globalArg() != null && !ctx.streamstatsArgs().globalArg().isEmpty() + ? new Argument("global", getArgumentValue(ctx.streamstatsArgs().globalArg(0).global)) + : new Argument("global", new Literal(true, DataType.BOOLEAN))); + } + /** * Get list of {@link Argument}. 
* diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index abb08b11f2d..9db3f851962 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -85,6 +85,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SpanBin; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -380,6 +381,14 @@ public String visitWindow(Window node, String context) { child, String.join(" ", visitExpressionList(node.getWindowFunctionList())).trim()); } + @Override + public String visitStreamWindow(StreamWindow node, String context) { + String child = node.getChild().get(0).accept(this, context); + return StringUtils.format( + "%s | streamstats %s", + child, String.join(" ", visitExpressionList(node.getWindowFunctionList())).trim()); + } + /** Build {@link LogicalRareTopN}. 
*/ @Override public String visitRareTopN(RareTopN node, String context) { diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java new file mode 100644 index 00000000000..04f4c7610d9 --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java @@ -0,0 +1,189 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.test.CalciteAssert; +import org.junit.Test; + +public class CalcitePPLStreamstatsTest extends CalcitePPLAbstractTest { + + public CalcitePPLStreamstatsTest() { + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); + } + + @Test + public void testStreamstatsBy() { + String ppl = "source=EMP | streamstats max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], max(SAL)=[MAX($5) OVER" + + " (PARTITION BY $7 ROWS UNBOUNDED PRECEDING)])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (PARTITION BY `DEPTNO` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)" + + " `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () 
`__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t`\n" + + "ORDER BY `__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsCurrent() { + String ppl = "source=EMP | streamstats current = false max(SAL)"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[MAX($5) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) `max(SAL)`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsWindow() { + String ppl = "source=EMP | streamstats window = 5 max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{7," + + " 8}])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalAggregate(group=[{}], max(SAL)=[MAX($5)])\n" + + " LogicalFilter(condition=[AND(>=($8, -($cor0.__stream_seq__, 4)), <=($8," + + " $cor0.__stream_seq__), =($7, $cor0.DEPTNO))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, 
expectedLogical); + + String expectedSparkSql = + "SELECT `$cor0`.`EMPNO`, `$cor0`.`ENAME`, `$cor0`.`JOB`, `$cor0`.`MGR`, `$cor0`.`HIREDATE`," + + " `$cor0`.`SAL`, `$cor0`.`COMM`, `$cor0`.`DEPTNO`, `t2`.`max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `$cor0`,\n" + + "LATERAL (SELECT MAX(`SAL`) `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t0`\n" + + "WHERE `__stream_seq__` >= `$cor0`.`__stream_seq__` - 4 AND `__stream_seq__` <=" + + " `$cor0`.`__stream_seq__` AND `DEPTNO` = `$cor0`.`DEPTNO`) `t2`\n" + + "ORDER BY `$cor0`.`__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsGlobal() { + String ppl = "source=EMP | streamstats window = 5 global= false max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], max(SAL)=[MAX($5) OVER" + + " (PARTITION BY $7 ROWS 4 PRECEDING)])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (PARTITION BY `DEPTNO` ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, 
`DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t`\n" + + "ORDER BY `__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsReset() { + String ppl = + "source=EMP | streamstats reset_before=SAL>100 reset_after=SAL<50 avg(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], avg(SAL)=[$12])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{7, 8," + + " 11}])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], __reset_before_flag__=[$9]," + + " __reset_after_flag__=[$10], __seg_id__=[+(SUM($9) OVER (ROWS UNBOUNDED PRECEDING)," + + " COALESCE(SUM($10) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()]," + + " __reset_before_flag__=[CASE(>($5, 100), 1, 0)], __reset_after_flag__=[CASE(<($5," + + " 50), 1, 0)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalAggregate(group=[{}], avg(SAL)=[AVG($5)])\n" + + " LogicalFilter(condition=[AND(<=($8, $cor0.__stream_seq__), =($11," + + " $cor0.__seg_id__), =($7, $cor0.DEPTNO))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], __reset_before_flag__=[$9]," + + " __reset_after_flag__=[$10], __seg_id__=[+(SUM($9) OVER (ROWS UNBOUNDED PRECEDING)," + + " COALESCE(SUM($10) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3]," + + " HIREDATE=[$4], SAL=[$5], COMM=[$6], 
DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER" + + " ()], __reset_before_flag__=[CASE(>($5, 100), 1, 0)]," + + " __reset_after_flag__=[CASE(<($5, 50), 1, 0)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `$cor0`.`EMPNO`, `$cor0`.`ENAME`, `$cor0`.`JOB`, `$cor0`.`MGR`, `$cor0`.`HIREDATE`," + + " `$cor0`.`SAL`, `$cor0`.`COMM`, `$cor0`.`DEPTNO`, `t4`.`avg(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `__stream_seq__`, `__reset_before_flag__`, `__reset_after_flag__`," + + " (SUM(`__reset_before_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT" + + " ROW)) + COALESCE(SUM(`__reset_after_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING), 0) `__seg_id__`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`, CASE WHEN `SAL` > 100 THEN 1 ELSE 0 END" + + " `__reset_before_flag__`, CASE WHEN `SAL` < 50 THEN 1 ELSE 0 END" + + " `__reset_after_flag__`\n" + + "FROM `scott`.`EMP`) `t`) `$cor0`,\n" + + "LATERAL (SELECT AVG(`SAL`) `avg(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `__stream_seq__`, `__reset_before_flag__`, `__reset_after_flag__`," + + " (SUM(`__reset_before_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT" + + " ROW)) + COALESCE(SUM(`__reset_after_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING), 0) `__seg_id__`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`, CASE WHEN `SAL` > 100 THEN 1 ELSE 0 END" + + " `__reset_before_flag__`, CASE WHEN `SAL` < 50 THEN 1 ELSE 0 END" + + " `__reset_after_flag__`\n" + + "FROM `scott`.`EMP`) `t1`) `t2`\n" + + "WHERE `__stream_seq__` <= `$cor0`.`__stream_seq__` AND `__seg_id__` =" + + " 
`$cor0`.`__seg_id__` AND `DEPTNO` = `$cor0`.`DEPTNO`) `t4`\n" + + "ORDER BY `$cor0`.`__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } +} diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index e5f54de544c..d87ba68ea8b 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -173,6 +173,34 @@ public void testEventstatsCommandWithSpanFunction() { anonymize("source=t | eventstats count(a) by span(b, 1d), c")); } + @Test + public void testStreamstatsCommandWithByClause() { + assertEquals( + "source=table | streamstats count(identifier) by identifier", + anonymize("source=t | streamstats count(a) by b")); + } + + @Test + public void testStreamstatsCommandWithWindowAndCurrent() { + assertEquals( + "source=table | streamstats max(identifier)", + anonymize("source=t | streamstats current=false window=2 max(a)")); + } + + @Test + public void testStreamstatsCommandWithNestedFunctions() { + assertEquals( + "source=table | streamstats sum(+(identifier,identifier))", + anonymize("source=t | streamstats sum(a+b)")); + } + + @Test + public void testStreamstatsCommandWithSpanFunction() { + assertEquals( + "source=table | streamstats count(identifier) by span(identifier, *** d),identifier", + anonymize("source=t | streamstats count(a) by span(b, 1d), c")); + } + @Test public void testBinCommandBasic() { assertEquals("source=table | bin identifier span=***", anonymize("source=t | bin f span=10"));