diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java index f4b9abe8330..e79c15e5881 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java @@ -92,6 +92,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -748,6 +749,11 @@ public LogicalPlan visitTrendline(Trendline node, AnalysisContext context) { computationsAndTypes.build()); } + @Override + public LogicalPlan visitStreamWindow(StreamWindow node, AnalysisContext context) { + throw getOnlyForCalciteException("Streamstats"); + } + @Override public LogicalPlan visitFlatten(Flatten node, AnalysisContext context) { throw getOnlyForCalciteException("Flatten"); diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index f5d2a1623b3..0dd475c5612 100644 --- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -79,6 +79,7 @@ import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -410,6 +411,10 @@ public T visitWindow(Window window, C context) { return visitChildren(window, context); } + public T visitStreamWindow(StreamWindow node, C context) { + return visitChildren(node, context); + } + public T visitJoin(Join node, C 
context) { return visitChildren(node, context); } diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java b/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java new file mode 100644 index 00000000000..ed7bcf10289 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/tree/StreamWindow.java @@ -0,0 +1,71 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +@Getter +@ToString +@EqualsAndHashCode(callSuper = false) +public class StreamWindow extends UnresolvedPlan { + + private final List windowFunctionList; + private final List groupList; + private final boolean current; + private final int window; + private final boolean global; + private final UnresolvedExpression resetBefore; + private final UnresolvedExpression resetAfter; + @ToString.Exclude private UnresolvedPlan child; + + /** StreamWindow Constructor. */ + public StreamWindow( + List windowFunctionList, + List groupList, + boolean current, + int window, + boolean global, + UnresolvedExpression resetBefore, + UnresolvedExpression resetAfter) { + this.windowFunctionList = windowFunctionList; + this.groupList = groupList; + this.current = current; + this.window = window; + this.global = global; + this.resetBefore = resetBefore; + this.resetAfter = resetAfter; + } + + public boolean isCurrent() { + return current; + } + + public boolean isGlobal() { + return global; + } + + @Override + public StreamWindow attach(UnresolvedPlan child) { + this.child = child; + return this; + } + + @Override + public List getChild() { + return this.child == null ? 
ImmutableList.of() : ImmutableList.of(this.child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitStreamWindow(this, context); + } +} diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index c6a964fce17..573a51de2a7 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -17,6 +17,7 @@ import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_DEDUP; import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_MAIN; import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_RARE_TOP; +import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_STREAMSTATS; import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_SUBSEARCH; import static org.opensearch.sql.calcite.utils.PlanUtils.getRelation; import static org.opensearch.sql.calcite.utils.PlanUtils.getRexCall; @@ -131,6 +132,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Trendline; @@ -1580,6 +1582,330 @@ private void validateFillNullTypeCompatibility( } } + @Override + public RelNode visitStreamWindow(StreamWindow node, CalcitePlanContext context) { + visitChildren(node, context); + + List groupList = node.getGroupList(); + boolean hasGroup = groupList != null && !groupList.isEmpty(); + boolean hasWindow = node.getWindow() > 0; + boolean hasReset = node.getResetBefore() != null || node.getResetAfter() != null; + + // Local helper column names + final String 
RESET_BEFORE_FLAG_COL = "__reset_before_flag__"; // flag for reset_before + final String RESET_AFTER_FLAG_COL = "__reset_after_flag__"; // flag for reset_after + final String SEGMENT_ID_COL = "__seg_id__"; // segment id + + // CASE: reset + if (hasReset) { + // 1. Build helper columns: seq, before/after flags, segment_id + RelNode leftWithSeg = buildResetHelperColumns(context, node); + + // 2. Run correlate + aggregate with reset-specific filter and cleanup + return buildStreamWindowJoinPlan( + context, + leftWithSeg, + node, + groupList, + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + SEGMENT_ID_COL, + new String[] { + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + RESET_BEFORE_FLAG_COL, + RESET_AFTER_FLAG_COL, + SEGMENT_ID_COL + }); + } + + // CASE: global=true + window>0 + has group + if (node.isGlobal() && hasWindow && hasGroup) { + // 1. Add global sequence column for sliding window + RexNode streamSeq = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(streamSeq); + RelNode left = context.relBuilder.build(); + + // 2. 
Run correlate + aggregate + return buildStreamWindowJoinPlan( + context, + left, + node, + groupList, + ROW_NUMBER_COLUMN_FOR_STREAMSTATS, + null, + new String[] {ROW_NUMBER_COLUMN_FOR_STREAMSTATS}); + } + + // Default + if (hasGroup) { + // only build sequence when there is by condition + RexNode streamSeq = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(streamSeq); + } + + List overExpressions = + node.getWindowFunctionList().stream().map(w -> rexVisitor.analyze(w, context)).toList(); + context.relBuilder.projectPlus(overExpressions); + + // resort when there is by condition + if (hasGroup) { + context.relBuilder.sort(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_STREAMSTATS)); + context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_FOR_STREAMSTATS)); + } + + return context.relBuilder.peek(); + } + + private RelNode buildStreamWindowJoinPlan( + CalcitePlanContext context, + RelNode leftWithHelpers, + StreamWindow node, + List groupList, + String seqCol, + String segmentCol, + String[] helperColsToCleanup) { + + final Holder<@Nullable RexCorrelVariable> v = Holder.empty(); + context.relBuilder.push(leftWithHelpers); + context.relBuilder.variable(v::set); + + context.relBuilder.push(leftWithHelpers); + RexNode rightSeq = context.relBuilder.field(seqCol); + RexNode outerSeq = context.relBuilder.field(v.get(), seqCol); + + RexNode filter; + if (segmentCol != null) { // reset condition + RexNode segRight = context.relBuilder.field(segmentCol); + RexNode segOuter = context.relBuilder.field(v.get(), segmentCol); + RexNode frame = buildResetFrameFilter(context, node, outerSeq, rightSeq, segOuter, segRight); + RexNode group = buildGroupFilter(context, groupList, v.get()); + filter = (group == null) ? 
frame : context.relBuilder.and(frame, group); + } else { // global + window + by condition + RexNode frame = buildFrameFilter(context, node, outerSeq, rightSeq); + RexNode group = buildGroupFilter(context, groupList, v.get()); + filter = context.relBuilder.and(frame, group); + } + context.relBuilder.filter(filter); + + // aggregate all window functions on right side + List aggCalls = buildAggCallsForWindowFunctions(node.getWindowFunctionList(), context); + context.relBuilder.aggregate(context.relBuilder.groupKey(), aggCalls); + RelNode rightAgg = context.relBuilder.build(); + + // correlate LEFT with RIGHT using seq + group fields + context.relBuilder.push(leftWithHelpers); + context.relBuilder.push(rightAgg); + List requiredLeft = buildRequiredLeft(context, seqCol, groupList); + if (segmentCol != null) { // also require seg_id for reset segmentation equality + requiredLeft = new ArrayList<>(requiredLeft); + requiredLeft.add(context.relBuilder.field(2, 0, segmentCol)); + } + context.relBuilder.correlate(JoinRelType.LEFT, v.get().id, requiredLeft); + + // resort to original order + boolean hasGroup = !groupList.isEmpty(); + // resort when 1. global + window + by condition 2.reset + by condition + if (hasGroup) { + context.relBuilder.sort(context.relBuilder.field(seqCol)); + } + + // cleanup helper columns + List cleanup = new ArrayList<>(); + for (String c : helperColsToCleanup) { + cleanup.add(context.relBuilder.field(c)); + } + context.relBuilder.projectExcept(cleanup); + return context.relBuilder.peek(); + } + + private RelNode buildResetHelperColumns(CalcitePlanContext context, StreamWindow node) { + // 1. global sequence to define order + RexNode rowNum = + context + .relBuilder + .aggregateCall(SqlStdOperatorTable.ROW_NUMBER) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .as(ROW_NUMBER_COLUMN_FOR_STREAMSTATS); + context.relBuilder.projectPlus(rowNum); + + // 2. before/after flags + RexNode beforePred = + (node.getResetBefore() == null) + ? 
context.relBuilder.literal(false) + : rexVisitor.analyze(node.getResetBefore(), context); + RexNode afterPred = + (node.getResetAfter() == null) + ? context.relBuilder.literal(false) + : rexVisitor.analyze(node.getResetAfter(), context); + RexNode beforeFlag = + context.relBuilder.call( + SqlStdOperatorTable.CASE, + beforePred, + context.relBuilder.literal(1), + context.relBuilder.literal(0)); + RexNode afterFlag = + context.relBuilder.call( + SqlStdOperatorTable.CASE, + afterPred, + context.relBuilder.literal(1), + context.relBuilder.literal(0)); + context.relBuilder.projectPlus(context.relBuilder.alias(beforeFlag, "__reset_before_flag__")); + context.relBuilder.projectPlus(context.relBuilder.alias(afterFlag, "__reset_after_flag__")); + + // 3. session id = SUM(beforeFlag) over (to current) + SUM(afterFlag) over (to 1 preceding) + RexNode sumBefore = + context + .relBuilder + .aggregateCall( + SqlStdOperatorTable.SUM, context.relBuilder.field("__reset_before_flag__")) + .over() + .rowsTo(RexWindowBounds.CURRENT_ROW) + .toRex(); + RexNode sumAfterPrev = + context + .relBuilder + .aggregateCall( + SqlStdOperatorTable.SUM, context.relBuilder.field("__reset_after_flag__")) + .over() + .rowsBetween( + RexWindowBounds.UNBOUNDED_PRECEDING, + RexWindowBounds.preceding(context.relBuilder.literal(1))) + .toRex(); + sumBefore = + context.relBuilder.call( + SqlStdOperatorTable.COALESCE, sumBefore, context.relBuilder.literal(0)); + sumAfterPrev = + context.relBuilder.call( + SqlStdOperatorTable.COALESCE, sumAfterPrev, context.relBuilder.literal(0)); + + RexNode segId = context.relBuilder.call(SqlStdOperatorTable.PLUS, sumBefore, sumAfterPrev); + context.relBuilder.projectPlus(context.relBuilder.alias(segId, "__seg_id__")); + return context.relBuilder.build(); + } + + private RexNode buildFrameFilter( + CalcitePlanContext context, StreamWindow node, RexNode outerSeq, RexNode rightSeq) { + // window always >0 + // frame: either [outer-(w-1), outer] or [outer-w, outer-1] + if 
(node.isCurrent()) { + RexNode lower = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, + outerSeq, + context.relBuilder.literal(node.getWindow() - 1)); + return context.relBuilder.between(rightSeq, lower, outerSeq); + } else { + RexNode lower = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, outerSeq, context.relBuilder.literal(node.getWindow())); + RexNode upper = + context.relBuilder.call( + SqlStdOperatorTable.MINUS, outerSeq, context.relBuilder.literal(1)); + return context.relBuilder.between(rightSeq, lower, upper); + } + } + + private RexNode buildResetFrameFilter( + CalcitePlanContext context, + StreamWindow node, + RexNode outerSeq, + RexNode rightSeq, + RexNode segIdOuter, + RexNode segIdRight) { + // 1. Compute sequence range (handle running window semantics when window == 0) + RexNode seqFilter; + if (node.getWindow() == 0) { + // running: current => rightSeq <= outerSeq; excluding current => rightSeq < outerSeq + seqFilter = + node.isCurrent() + ? context.relBuilder.lessThanOrEqual(rightSeq, outerSeq) + : context.relBuilder.lessThan(rightSeq, outerSeq); + } else { + // Reuse normal frame filter logic when window > 0 + seqFilter = buildFrameFilter(context, node, outerSeq, rightSeq); + } + // 2. Ensure same segment (seg_id) for reset partitioning + RexNode segFilter = context.relBuilder.equals(segIdRight, segIdOuter); + // 3. 
Combine filters + return context.relBuilder.and(seqFilter, segFilter); + } + + private RexNode buildGroupFilter( + CalcitePlanContext context, List groupList, RexCorrelVariable correl) { + // build conjunctive equality filters: right.g_i = outer.g_i + if (groupList.isEmpty()) { + return null; + } + List equalsList = + groupList.stream() + .map( + expr -> { + String groupName = extractGroupFieldName(expr); + RexNode rightGroup = context.relBuilder.field(groupName); + RexNode outerGroup = context.relBuilder.field(correl, groupName); + return context.relBuilder.equals(rightGroup, outerGroup); + }) + .toList(); + return context.relBuilder.and(equalsList); + } + + private String extractGroupFieldName(UnresolvedExpression groupExpr) { + if (groupExpr instanceof Alias groupAlias + && groupAlias.getDelegated() instanceof Field groupField) { + return groupField.getField().toString(); + } else if (groupExpr instanceof Field groupField) { + return groupField.getField().toString(); + } else { + throw new IllegalArgumentException( + "Unsupported group expression: only field or alias(field) is supported"); + } + } + + private List buildAggCallsForWindowFunctions( + List windowExprs, CalcitePlanContext context) { + List aggCalls = new ArrayList<>(); + for (UnresolvedExpression expr : windowExprs) { + if (expr instanceof Alias a && a.getDelegated() instanceof WindowFunction wf) { + Function func = (Function) wf.getFunction(); + List args = func.getFuncArgs(); + // first argument is the input field, others are function params + UnresolvedExpression field = args.isEmpty() ? null : args.get(0); + List rest = + args.size() <= 1 ? 
List.of() : args.subList(1, args.size()); + AggregateFunction aggFunc = new AggregateFunction(func.getFuncName(), field, rest); + AggCall call = aggVisitor.analyze(new Alias(a.getName(), aggFunc), context); + aggCalls.add(call); + } else { + throw new IllegalArgumentException("Unsupported window function in streamstats"); + } + } + return aggCalls; + } + + private List buildRequiredLeft( + CalcitePlanContext context, String seqCol, List groupList) { + List requiredLeft = new ArrayList<>(); + // reference to left seq column + requiredLeft.add(context.relBuilder.field(2, 0, seqCol)); + for (UnresolvedExpression groupExpr : groupList) { + String groupName = extractGroupFieldName(groupExpr); + requiredLeft.add(context.relBuilder.field(2, 0, groupName)); + } + return requiredLeft; + } + @Override public RelNode visitFillNull(FillNull node, CalcitePlanContext context) { visitChildren(node, context); diff --git a/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java b/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java index 8401e5c867f..f1671e0eb63 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java +++ b/core/src/main/java/org/opensearch/sql/calcite/plan/PPLAggGroupMergeRule.java @@ -144,7 +144,10 @@ public static boolean isDependentField(RexNode node, Collection baseFie // to transform a field into such a literal if (node.getKind() == SqlKind.LITERAL) return true; if (node.getKind() == SqlKind.INPUT_REF && baseFields.contains(node)) return true; - if (node instanceof RexCall && ((RexCall) node).getOperator().isDeterministic()) { + // Use !isAggregator to rule out window functions like row_number() + if (node instanceof RexCall + && ((RexCall) node).getOperator().isDeterministic() + && !((RexCall) node).getOperator().isAggregator()) { return ((RexCall) node) .getOperands().stream().allMatch(op -> isDependentField(op, baseFields)); } diff --git 
a/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java index a8975ba7c2d..fefab6d57ce 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java @@ -65,6 +65,7 @@ public interface PlanUtils { String ROW_NUMBER_COLUMN_FOR_RARE_TOP = "_row_number_rare_top_"; String ROW_NUMBER_COLUMN_FOR_MAIN = "_row_number_main_"; String ROW_NUMBER_COLUMN_FOR_SUBSEARCH = "_row_number_subsearch_"; + String ROW_NUMBER_COLUMN_FOR_STREAMSTATS = "__stream_seq__"; static SpanUnit intervalUnitToSpanUnit(IntervalUnit unit) { return switch (unit) { diff --git a/docs/category.json b/docs/category.json index d9605598800..7ebe643373b 100644 --- a/docs/category.json +++ b/docs/category.json @@ -47,6 +47,7 @@ "user/ppl/cmd/showdatasources.rst", "user/ppl/cmd/sort.rst", "user/ppl/cmd/stats.rst", + "user/ppl/cmd/streamstats.rst", "user/ppl/cmd/subquery.rst", "user/ppl/cmd/syntax.rst", "user/ppl/cmd/timechart.rst", diff --git a/docs/user/ppl/cmd/streamstats.rst b/docs/user/ppl/cmd/streamstats.rst new file mode 100644 index 00000000000..0ac18637fec --- /dev/null +++ b/docs/user/ppl/cmd/streamstats.rst @@ -0,0 +1,229 @@ +=========== +streamstats +=========== + +.. rubric:: Table of contents + +.. contents:: + :local: + :depth: 2 + + +Description +=========== +The ``streamstats`` command is used to calculate cumulative or rolling statistics as events are processed in order. Unlike ``stats`` or ``eventstats`` which operate on the entire dataset at once, it computes values incrementally on a per-event basis, often respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events. + +Key aspects of `streamstats`: + +1. 
It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis. +2. Supports arguments such as window (for sliding window calculations) and current (to control whether the current event is included in the calculation). +3. Retains all original events and appends new fields containing the calculated statistics. +4. Particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events. + +Difference between ``stats``, ``eventstats`` and ``streamstats`` + +All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce: + +* Transformation Behavior: + * ``stats``: Transforms all events into an aggregated result table, losing original event structure. + * ``eventstats``: Adds aggregation results as new fields to the original events without removing the event structure. + * ``streamstats``: Adds cumulative (running) aggregation results to each event as they stream through the pipeline. +* Output Format: + * ``stats``: Output contains only aggregated values. Original raw events are not preserved. + * ``eventstats``: Original events remain, with extra fields containing summary statistics. + * ``streamstats``: Original events remain, with extra fields containing running totals or cumulative statistics. +* Aggregation Scope: + * ``stats``: Based on all events in the search (or groups defined by BY clause). + * ``eventstats``: Based on all relevant events, then the result is added back to each event in the group. + * ``streamstats``: Calculations occur progressively as each event is processed; can be scoped by window. +* Use Cases: + * ``stats``: When only aggregated results are needed (e.g., counts, averages, sums). + * ``eventstats``: When aggregated statistics are needed alongside original event data. 
+ * ``streamstats``: When a running total or cumulative statistic is needed across event streams. + +Syntax +====== +streamstats [current=<bool>] [window=<int>] [global=<bool>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <function>... [by-clause] + +* function: mandatory. An aggregation function or window function. +* current: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=<boolean>. **Default:** true. +* window: optional. Specifies the number of events to use when computing the statistics. Syntax: window=<integer>. **Default:** 0, which means that all previous and current events are used. +* global: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=<boolean>. **Default:** true. +* reset_before: optional. Before streamstats calculates for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("<eval-expression>")". **Default:** false. +* reset_after: optional. After streamstats calculations for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. Syntax: reset_after="("<eval-expression>")". **Default:** false. +* by-clause: optional. The by clause could be the fields and expressions like scalar functions and aggregation functions. Besides, the span clause can be used to split a specific field into buckets in the same interval, the stats then does the aggregation by these span buckets. Syntax: by [span-expression,] [field,]... 
**Default:** If no by-clause is specified, all events are processed as a single group and running statistics are computed across the entire event stream. +* span-expression: optional, at most one. Splits a field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, ``span(age, 10)`` creates 10-year age buckets, ``span(timestamp, 1h)`` creates hourly buckets. + * Available time units: + * millisecond (ms) + * second (s) + * minute (m, case sensitive) + * hour (h) + * day (d) + * week (w) + * month (M, case sensitive) + * quarter (q) + * year (y) + +Aggregation Functions +===================== + +The streamstats command supports the following aggregation functions: + +* COUNT: Count of values +* SUM: Sum of numeric values +* AVG: Average of numeric values +* MAX: Maximum value +* MIN: Minimum value +* VAR_SAMP: Sample variance +* VAR_POP: Population variance +* STDDEV_SAMP: Sample standard deviation +* STDDEV_POP: Population standard deviation +* DISTINCT_COUNT/DC: Distinct count of values +* EARLIEST: Earliest value by timestamp +* LATEST: Latest value by timestamp + +For detailed documentation of each function, see `Aggregation Functions <../functions/aggregation.rst>`_. 
+ +Usage +===== + +Streamstats:: + + source = table | streamstats avg(a) + source = table | streamstats current = false avg(a) + source = table | streamstats window = 5 sum(b) + source = table | streamstats current = false window = 2 max(a) + source = table | where a < 50 | streamstats count(c) + source = table | streamstats min(c), max(c) by b + source = table | streamstats count(c) as count_by by b | where count_by > 1000 + source = table | streamstats dc(field) as distinct_count + source = table | streamstats distinct_count(category) by region + source = table | streamstats current=false window=2 global=false avg(a) by b + source = table | streamstats window=2 reset_before=a>31 avg(b) + source = table | streamstats current=false reset_after=a>31 avg(b) by c + + +Example 1: Calculate the running average, sum, and count of a field by group +============================================================================ + +This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender. 
+ +PPL query:: + + os> source=accounts | streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender; + fetched rows / total rows = 4/4 + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | + |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ + + +Example 2: Running maximum age over a 2-row window +================================================== + +This example calculates the running maximum age over a 2-row window, excluding the current event. 
+ +PPL query:: + + os> source=state_country | streamstats current=false window=2 max(age) as prev_max_age + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+--------------+ + | name | country | state | month | year | age | prev_max_age | + |-------+---------+------------+-------+------+-----+--------------| + | Jake | USA | California | 4 | 2023 | 70 | null | + | Hello | USA | New York | 4 | 2023 | 30 | 70 | + | John | Canada | Ontario | 4 | 2023 | 25 | 70 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 25 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 27 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 57 | + | David | USA | Washington | 4 | 2023 | 40 | 70 | + +-------+---------+------------+-------+------+-----+--------------+ + + +Example 3: Use the global argument to calculate running statistics +================================================================== + +The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields: + +* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups. +* global=false: the window itself is created per group, meaning each group gets its own independent window. + +This example shows how to calculate the running average of age across accounts by country, using global argument. 
+ +original data:: + + +-------+---------+------------+-------+------+-----+ + | name | country | state | month | year | age | + |-------+---------+------------+-------+------+-----+ + | Jake | USA | California | 4 | 2023 | 70 | + | Hello | USA | New York | 4 | 2023 | 30 | + | John | Canada | Ontario | 4 | 2023 | 25 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | + | Jim | Canada | B.C | 4 | 2023 | 27 | + | Peter | Canada | B.C | 4 | 2023 | 57 | + | Rick | Canada | B.C | 4 | 2023 | 70 | + | David | USA | Washington | 4 | 2023 | 40 | + +-------+---------+------------+-------+------+-----+ + +* global=true: The window slides across all rows globally (following their input order), but inside each window, aggregation is still computed by country. So we process the data stream row by row to build the sliding window with size 2. We can see that David and Rick are in a window. +* global=false: Each by group (country) forms its own independent stream and window (size 2). So David and Hello are in one window for USA. This time we get running_avg 35 for David, rather than 40 when global is set true. 
+ +PPL query:: + + os> source=state_country | streamstats window=2 global=true avg(age) as running_avg by country ; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+-------------+ + | name | country | state | month | year | age | running_avg | + |-------+---------+------------+-------+------+-----+-------------| + | Jake | USA | California | 4 | 2023 | 70 | 70.0 | + | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | + | David | USA | Washington | 4 | 2023 | 40 | 40.0 | + +-------+---------+------------+-------+------+-----+-------------+ + + os> source=state_country | streamstats window=2 global=false avg(age) as running_avg by country ; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+-------------+ + | name | country | state | month | year | age | running_avg | + |-------+---------+------------+-------+------+-----+-------------| + | Jake | USA | California | 4 | 2023 | 70 | 70.0 | + | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | + | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | + | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | + | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | + | David | USA | Washington | 4 | 2023 | 40 | 35.0 | + +-------+---------+------------+-------+------+-----+-------------+ + + +Example 4: Use the reset_before and reset_after arguments to reset statistics +============================================================================= + +This example calculates the running average of age across accounts by country, with resets applied. 
+ +PPL query:: + + os> source=state_country | streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country; + fetched rows / total rows = 8/8 + +-------+---------+------------+-------+------+-----+---------+ + | name | country | state | month | year | age | avg_age | + |-------+---------+------------+-------+------+-----+---------| + | Jake | USA | California | 4 | 2023 | 70 | null | + | Hello | USA | New York | 4 | 2023 | 30 | 70.0 | + | John | Canada | Ontario | 4 | 2023 | 25 | null | + | Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | + | Jim | Canada | B.C | 4 | 2023 | 27 | null | + | Peter | Canada | B.C | 4 | 2023 | 57 | null | + | Rick | Canada | B.C | 4 | 2023 | 70 | null | + | David | USA | Washington | 4 | 2023 | 40 | null | + +-------+---------+------------+-------+------+-----+---------+ \ No newline at end of file diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst index 17b4797df39..697ec7e2c6e 100644 --- a/docs/user/ppl/index.rst +++ b/docs/user/ppl/index.rst @@ -112,6 +112,8 @@ The query start with search command and then flowing a set of command delimited - `stats command `_ + - `streamstats command `_ + - `subquery (aka subsearch) command `_ - `reverse command `_ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index 69507c71aa5..15051417db1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -66,6 +66,7 @@ CalcitePPLCryptographicFunctionIT.class, CalcitePPLDedupIT.class, CalcitePPLEventstatsIT.class, + CalciteStreamstatsCommandIT.class, CalcitePPLExistsSubqueryIT.class, CalcitePPLExplainIT.class, CalcitePPLFillnullIT.class, diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java 
b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index cff92408d4f..77f3a45cc07 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -615,6 +615,45 @@ public void testEventstatsDistinctCountFunctionExplain() throws IOException { assertJsonEqualsIgnoreId(expected, result); } + @Test + public void testStreamstatsDistinctCountExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats dc(state) as distinct_states"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_dc.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsDistinctCountFunctionExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats distinct_count(state) as" + + " distinct_states by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_distinct_count.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsGlobalExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats window=2 global=true avg(age) as" + + " avg_age by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_global.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + + @Test + public void testStreamstatsResetExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | streamstats current=false reset_before=age>34" + + " reset_after=age<25 avg(age) as avg_age by gender"; + var result = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_streamstats_reset.yaml"); + assertYamlEqualsIgnoreId(expected, result); + } + // Only for Calcite, as v2 gets 
unstable serialized string for function @Test public void testExplainOnAggregationWithSumEnhancement() throws IOException { @@ -740,6 +779,43 @@ public void testExplainOnEventstatsEarliestLatestNoGroupBy() throws IOException TEST_INDEX_LOGS))); } + /** Checks the YAML explain plan for streamstats earliest/latest grouped by server. */ + @Test + public void testExplainOnStreamstatsEarliestLatest() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message) as earliest_message, latest(message) as" + + " latest_message by server", + TEST_INDEX_LOGS))); + } + + @Test + public void testExplainOnStreamstatsEarliestLatestWithCustomTimeField() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest_custom_time.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message, created_at) as earliest_message," + + " latest(message, created_at) as latest_message by level", + TEST_INDEX_LOGS))); + } + + @Test + public void testExplainOnStreamstatsEarliestLatestNoGroupBy() throws IOException { + String expected = loadExpectedPlan("explain_streamstats_earliest_latest_no_group.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | streamstats earliest(message) as earliest_message, latest(message) as" + + " latest_message", + TEST_INDEX_LOGS))); + } + + @Test public void testListAggregationExplain() throws IOException { String expected = loadExpectedPlan("explain_list_aggregation.json"); diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java index 59c23a0eeed..9839fff00c4 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java +++
b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java @@ -27,7 +27,7 @@ public void init() throws Exception { } @Test - public void testEventstat() throws IOException { + public void testEventstats() throws IOException { JSONObject actual = executeQuery( String.format( @@ -57,7 +57,7 @@ public void testEventstat() throws IOException { } @Test - public void testEventstatWithNull() throws IOException { + public void testEventstatsWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -89,7 +89,7 @@ public void testEventstatWithNull() throws IOException { } @Test - public void testEventstatBy() throws IOException { + public void testEventstatsBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -119,7 +119,7 @@ public void testEventstatBy() throws IOException { } @Test - public void testEventstatByWithNull() throws IOException { + public void testEventstatsByWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -166,7 +166,7 @@ public void testEventstatByWithNull() throws IOException { } @Test - public void testEventstatBySpan() throws IOException { + public void testEventstatsBySpan() throws IOException { JSONObject actual = executeQuery( String.format( @@ -183,7 +183,7 @@ public void testEventstatBySpan() throws IOException { } @Test - public void testEventstatBySpanWithNull() throws IOException { + public void testEventstatsBySpanWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -202,7 +202,7 @@ public void testEventstatBySpanWithNull() throws IOException { } @Test - public void testEventstatByMultiplePartitions1() throws IOException { + public void testEventstatsByMultiplePartitions1() throws IOException { JSONObject actual = executeQuery( String.format( @@ -219,7 +219,7 @@ public void testEventstatByMultiplePartitions1() throws IOException { } @Test - public void testEventstatByMultiplePartitions2() throws 
IOException { + public void testEventstatsByMultiplePartitions2() throws IOException { JSONObject actual = executeQuery( String.format( @@ -236,7 +236,7 @@ public void testEventstatByMultiplePartitions2() throws IOException { } @Test - public void testEventstatByMultiplePartitionsWithNull1() throws IOException { + public void testEventstatsByMultiplePartitionsWithNull1() throws IOException { JSONObject actual = executeQuery( String.format( @@ -255,7 +255,7 @@ public void testEventstatByMultiplePartitionsWithNull1() throws IOException { } @Test - public void testEventstatByMultiplePartitionsWithNull2() throws IOException { + public void testEventstatsByMultiplePartitionsWithNull2() throws IOException { JSONObject actual = executeQuery( String.format( @@ -289,7 +289,7 @@ public void testUnsupportedWindowFunctions() { } @Test - public void testMultipleEventstat() throws IOException { + public void testMultipleEventstats() throws IOException { JSONObject actual = executeQuery( String.format( @@ -306,7 +306,7 @@ public void testMultipleEventstat() throws IOException { } @Test - public void testMultipleEventstatWithNull() throws IOException { + public void testMultipleEventstatsWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -325,7 +325,7 @@ public void testMultipleEventstatWithNull() throws IOException { } @Test - public void testMultipleEventstatWithEval() throws IOException { + public void testMultipleEventstatsWithEval() throws IOException { JSONObject actual = executeQuery( String.format( @@ -343,7 +343,7 @@ public void testMultipleEventstatWithEval() throws IOException { } @Test - public void testEventstatEmptyRows() throws IOException { + public void testEventstatsEmptyRows() throws IOException { JSONObject actual = executeQuery( String.format( @@ -363,7 +363,7 @@ public void testEventstatEmptyRows() throws IOException { } @Test - public void testEventstatVariance() throws IOException { + public void testEventstatsVariance() 
throws IOException { JSONObject actual = executeQuery( String.format( @@ -433,7 +433,7 @@ public void testEventstatVariance() throws IOException { } @Test - public void testEventstatVarianceWithNull() throws IOException { + public void testEventstatsVarianceWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -496,7 +496,7 @@ public void testEventstatVarianceWithNull() throws IOException { } @Test - public void testEventstatVarianceBy() throws IOException { + public void testEventstatsVarianceBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -513,7 +513,7 @@ public void testEventstatVarianceBy() throws IOException { } @Test - public void testEventstatVarianceBySpan() throws IOException { + public void testEventstatsVarianceBySpan() throws IOException { JSONObject actual = executeQuery( String.format( @@ -527,7 +527,7 @@ public void testEventstatVarianceBySpan() throws IOException { } @Test - public void testEventstatVarianceWithNullBy() throws IOException { + public void testEventstatsVarianceWithNullBy() throws IOException { JSONObject actual = executeQuery( String.format( @@ -576,7 +576,7 @@ public void testEventstatVarianceWithNullBy() throws IOException { } @Test - public void testEventstatDistinctCount() throws IOException { + public void testEventstatsDistinctCount() throws IOException { JSONObject actual = executeQuery( String.format( @@ -601,7 +601,7 @@ public void testEventstatDistinctCount() throws IOException { } @Test - public void testEventstatDistinctCountByCountry() throws IOException { + public void testEventstatsDistinctCountByCountry() throws IOException { JSONObject actual = executeQuery( String.format( @@ -627,7 +627,7 @@ public void testEventstatDistinctCountByCountry() throws IOException { } @Test - public void testEventstatDistinctCountFunction() throws IOException { + public void testEventstatsDistinctCountFunction() throws IOException { JSONObject actual = executeQuery( 
String.format( @@ -653,7 +653,7 @@ public void testEventstatDistinctCountFunction() throws IOException { } @Test - public void testEventstatDistinctCountWithNull() throws IOException { + public void testEventstatsDistinctCountWithNull() throws IOException { JSONObject actual = executeQuery( String.format( @@ -681,7 +681,7 @@ public void testEventstatDistinctCountWithNull() throws IOException { } @Test - public void testEventstatEarliestAndLatest() throws IOException { + public void testEventstatsEarliestAndLatest() throws IOException { JSONObject actual = executeQuery( String.format( diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java new file mode 100644 index 00000000000..ee94c218dbb --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteStreamstatsCommandIT.java @@ -0,0 +1,1095 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.legacy.TestsConstants.*; +import static org.opensearch.sql.util.MatcherUtils.*; + +import java.io.IOException; +import java.util.List; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.client.Request; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +public class CalciteStreamstatsCommandIT extends PPLIntegTestCase { + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.STATE_COUNTRY); + loadIndex(Index.STATE_COUNTRY_WITH_NULL); + loadIndex(Index.BANK_TWO); + loadIndex(Index.LOGS); + } + + @Test + public void testStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( 
+ actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3, 41.666666666666664, 25, 70), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4, 36.25, 20, 70)); + } + + @Test + public void testStreamstatsWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3, 41.666666666666664, 25, 70), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4, 36.25, 20, 70), + rows(null, "Canada", null, 4, 2023, 10, 5, 31, 10, 70), + rows("Kevin", null, null, 4, 2023, null, 6, 31, 10, 70)); + } + + @Test + public void testStreamstatsBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + 
schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsByWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("cnt", "bigint"), + schema("avg", "double"), + schema("min", "int"), + schema("max", "int")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 2, 50, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 3, 18.333333333333332, 10, 25), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + + actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + 
rows("Kevin", null, null, 4, 2023, null, 2, 10, 10, 10)); + } + + @Test + public void testStreamstatsBySpan() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void testStreamstatsBySpanWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsByMultiplePartitions1() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25)); + } + + @Test + public void 
testStreamstatsByMultiplePartitions2() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, state", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20)); + } + + @Test + public void testStreamstatsByMultiplePartitionsWithNull1() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2, 22.5, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsByMultiplePartitionsWithNull2() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats count() as cnt, avg(age) as avg, min(age) as min, max(age)" + + " as max by span(age, 10) as age_span, state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 1, 30, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 1, 20, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 1, 10, 10, 10), + 
rows("Kevin", null, null, 4, 2023, null, 1, null, null, null)); + } + + @Test + public void testStreamstatsCurrent() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current=false avg(age) as prev_avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 41.666666666666664)); + } + + @Test + public void testStreamstatsCurrentWithNUll() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current=false avg(age) as prev_avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 41.666666666666664), + rows(null, "Canada", null, 4, 2023, 10, 36.25), + rows("Kevin", null, null, 4, 2023, null, 31)); + } + + @Test + public void testStreamstatsWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 3 avg(age) as avg", TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 25)); + } + + @Test + public void testStreamstatsWindowWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 3 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + 
rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 25), + rows(null, "Canada", null, 4, 2023, 10, 18.333333333333332), + rows("Kevin", null, null, 4, 2023, null, 15)); + } + + /** Checks that a window (10) wider than the asserted result set yields the cumulative average. */ + @Test + public void testStreamstatsBigWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window = 10 avg(age) as avg", TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 36.25)); + } + + @Test + public void testStreamstatsWindowError() { + Throwable e = + assertThrowsWithReplace( + IllegalArgumentException.class, + () -> + executeQuery( + String.format( + "source=%s | streamstats window=-1 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY))); + verifyErrorMessageContains(e, "Window size must be >= 0, but got: -1"); + } + + @Test + public void testStreamstatsCurrentAndWindow() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current = false window = 2 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 27.5)); + } + + @Test + public void testStreamstatsCurrentAndWindowWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats current = false window = 2 avg(age) as avg", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 70), + rows("John", "Canada", "Ontario", 4, 2023, 25, 50), + rows("Jane",
"Canada", "Quebec", 4, 2023, 20, 27.5), + rows(null, "Canada", null, 4, 2023, 10, 22.5), + rows("Kevin", null, null, 4, 2023, null, 15)); + } + + /** + * Compares global=false with global=true for a size-2 window grouped by country. A temporary + * document is inserted first and always deleted afterwards. + */ + @Test + public void testStreamstatsGlobal() throws IOException { + final int docId = 5; + Request insertRequest = + new Request( + "PUT", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 40,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + /* try/finally: a failed assertion must not leave the extra document in the index. */ + try { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=false avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 35)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=true avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 40)); + } finally { + Request deleteRequest = + new Request( + "DELETE", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + client().performRequest(deleteRequest); + } + } + + /** + * Same global=false / global=true comparison on the index that contains null values. A + * temporary document is inserted first and always deleted afterwards. + */ + @Test + public void testStreamstatsGlobalWithNull() throws IOException { + final int docId = 7; + Request insertRequest = + new Request( + "PUT", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 40,\"state\":" + + "
\"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + /* try/finally: a failed assertion must not leave the extra document in the index. */ + try { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=false avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 35)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 global=true avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 40, 40)); + } finally { + Request deleteRequest = + new Request( + "DELETE", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + client().performRequest(deleteRequest); + } + } + + /** + * Exercises reset_before=age>29 and reset_after=age>22 with a size-2 window grouped by country. + * A temporary document is inserted first and always deleted afterwards. + */ + @Test + public void testStreamstatsReset() throws IOException { + final int docId = 5; + Request insertRequest = + new Request( + "PUT", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 28,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + /* try/finally: a failed assertion must not leave the extra document in the index. */ + try { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country",
+ TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + JSONObject actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + } finally { + Request deleteRequest = + new Request( + "DELETE", String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY, docId)); + client().performRequest(deleteRequest); + } + } + + /** + * Same reset_before / reset_after checks on the index that contains null values. A temporary + * document is inserted first and always deleted afterwards. + */ + @Test + public void testStreamstatsResetWithNull() throws IOException { + final int docId = 7; + Request insertRequest = + new Request( + "PUT", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + insertRequest.setJsonEntity( + "{\"name\": \"Jay\",\"age\": 28,\"state\":" + + " \"Quebec\",\"country\": \"USA\",\"year\": 2023,\"month\":" + + " 4}\n"); + client().performRequest(insertRequest); + + /* try/finally: a failed assertion must not leave the extra document in the index. */ + try { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_before=age>29 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + + JSONObject
actual2 = + executeQuery( + String.format( + "source=%s | streamstats window=2 reset_after=age>22 avg(age) as avg by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual2, + rows("Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows(null, "Canada", null, 4, 2023, 10, 15), + rows("Kevin", null, null, 4, 2023, null, null), + rows("Jay", "USA", "Quebec", 4, 2023, 28, 28)); + } finally { + Request deleteRequest = + new Request( + "DELETE", + String.format("/%s/_doc/%d?refresh=true", TEST_INDEX_STATE_COUNTRY_WITH_NULL, docId)); + client().performRequest(deleteRequest); + } + } + + @Test + public void testUnsupportedWindowFunctions() { + List<String> unsupported = List.of("PERCENTILE_APPROX", "PERCENTILE"); + for (String u : unsupported) { + Throwable e = + assertThrowsWithReplace( + UnsupportedOperationException.class, + () -> + executeQuery( + String.format( + "source=%s | streamstats %s(age)", TEST_INDEX_STATE_COUNTRY, u))); + verifyErrorMessageContains(e, "Unexpected window function: " + u); + } + } + + @Test + public void testMultipleStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" + + " avg(avg_age) as avg_state_age by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20, 22.5)); + } + + @Test + public void testMultipleStreamstatsWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by state, country | streamstats" + + " avg(avg_age) as avg_state_age by country", +
TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 25), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20, 22.5), + rows(null, "Canada", null, 4, 2023, 10, 10, 18.333333333333332), + rows("Kevin", null, null, 4, 2023, null, null, null)); + } + + /** Pipes eventstats avg(age) into streamstats avg(age) and checks both added columns; the eventstats column is expected to be 36.25 on every row. */ @Test + public void testStreamstatsAndEventstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats avg(age) as avg_age| streamstats" + + " avg(age) as avg_age_stream", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 36.25, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, 36.25, 50), + rows("John", "Canada", "Ontario", 4, 2023, 25, 36.25, 41.666666666666664), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 36.25, 36.25)); + } + + /** Runs streamstats with window = 2 after sorting by age; each expected avg_age is the mean of the row's age and the preceding row's age. */ @Test + public void testStreamstatsAndSort() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | sort age | streamstats window = 2 avg(age) as avg_age ", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 20), + rows("John", "Canada", "Ontario", 4, 2023, 25, 22.5), + rows("Hello", "USA", "New York", 4, 2023, 30, 27.5), + rows("Jake", "USA", "California", 4, 2023, 70, 50)); + } + + /** Left-joins the index on country with a subsearch that applies streamstats window=2 avg(age) to the null-containing index, then checks the joined rows. */ @Test + public void testLeftJoinWithStreamstats() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s as l | left join left=l right=r on l.country = r.country [ source=%s |" + + " streamstats window=2 avg(age) as avg_age]", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows( + "John", "Canada", "Ontario", 4, 2023, 25, "John", "Canada", "Ontario", 4, 2023, 25, + 27.5), + rows( + "John", "Canada", "Ontario", 4, 2023, 25, "Jane", "Canada",
"Quebec", 4, 2023, 20, + 22.5), + rows("John", "Canada", "Ontario", 4, 2023, 25, null, "Canada", null, 4, 2023, 10, 15), + rows( + "Jane", "Canada", "Quebec", 4, 2023, 20, "John", "Canada", "Ontario", 4, 2023, 25, + 27.5), + rows( + "Jane", "Canada", "Quebec", 4, 2023, 20, "Jane", "Canada", "Quebec", 4, 2023, 20, 22.5), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, null, "Canada", null, 4, 2023, 10, 15), + rows( + "Jake", "USA", "California", 4, 2023, 70, "Jake", "USA", "California", 4, 2023, 70, 70), + rows("Jake", "USA", "California", 4, 2023, 70, "Hello", "USA", "New York", 4, 2023, 30, 50), + rows("Hello", "USA", "New York", 4, 2023, 30, "Jake", "USA", "California", 4, 2023, 70, 70), + rows("Hello", "USA", "New York", 4, 2023, 30, "Hello", "USA", "New York", 4, 2023, 30, 50)); + } + + /** Filters with a where-in subsearch that applies streamstats window=2 avg(age) and keeps only countries whose avg_age exceeds 40. */ @Test + public void testWhereInWithStreamstatsSubquery() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where country in [ source=%s | streamstats window=2 avg(age) as" + + " avg_age | where avg_age > 40 | fields country ]", + TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70), + rows("Hello", "USA", "New York", 4, 2023, 30)); + } + + /** Interleaves three streamstats stages with eval and where. NOTE(review): the field avg_age_divide_20 is computed as avg_age - 20, a subtraction despite its name. */ @Test + public void testMultipleStreamstatsWithEval() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats avg(age) as avg_age by country, state, name | eval" + + " avg_age_divide_20 = avg_age - 20 | streamstats avg(avg_age_divide_20) as" + + " avg_state_age by country, state | where avg_state_age > 0 | streamstats" + + " count(avg_state_age) as count_country_age_greater_20 by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 70, 50, 50, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 30, 10, 10, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 25, 5, 5, 1)); + } + + /** Checks that streamstats over a filter that matches no documents returns zero rows, with and without a by clause. */ @Test + public void
testStreamstatsEmptyRows() throws IOException { + /* First pass: several aggregations over an empty input, no grouping. */ JSONObject actual = + executeQuery( + String.format( + "source=%s | where name = 'non-existed' | streamstats count(), avg(age), min(age)," + + " max(age), stddev_pop(age), stddev_samp(age), var_pop(age), var_samp(age)", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyNumOfRows(actual, 0); + + /* Second pass: the same query grouped by country. */ JSONObject actual2 = + executeQuery( + String.format( + "source=%s | where name = 'non-existed' | streamstats count(), avg(age), min(age)," + + " max(age), stddev_pop(age), stddev_samp(age), var_pop(age), var_samp(age) by" + + " country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + verifyNumOfRows(actual2, 0); + } + + /** Checks the output schema and rows of stddev_pop, stddev_samp, var_pop and var_samp under streamstats; on the first row the sample variants are expected to be null and the population variants 0. */ @Test + public void testStreamstatsVariance() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age)", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("stddev_pop(age)", "double"), + schema("stddev_samp(age)", "double"), + schema("var_pop(age)", "double"), + schema("var_samp(age)", "double")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows( + "John", + "Canada", + "Ontario", + 4, + 2023, + 25, + 20.138409955990955, + 24.66441431158124, + 405.55555555555566, + 608.3333333333335), + rows( + "Jane", + "Canada", + "Quebec", + 4, + 2023, + 20, + 19.803724397193573, + 22.86737122335374, + 392.1875, + 522.9166666666666)); + } + + /** Runs the same four variance functions against the index variant that contains null values. */ @Test + public void testStreamstatsVarianceWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age)", +
TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("stddev_pop(age)", "double"), + schema("stddev_samp(age)", "double"), + schema("var_pop(age)", "double"), + schema("var_samp(age)", "double")); + + /* The last expected row has a null age and repeats the statistics of the row before it. */ verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows( + "John", + "Canada", + "Ontario", + 4, + 2023, + 25, + 20.138409955990955, + 24.66441431158124, + 405.55555555555566, + 608.3333333333335), + rows( + "Jane", + "Canada", + "Quebec", + 4, + 2023, + 20, + 19.803724397193573, + 22.86737122335374, + 392.1875, + 522.9166666666666), + rows(null, "Canada", null, 4, 2023, 10, 20.591260281974, 23.021728866442675, 424, 530), + rows("Kevin", null, null, 4, 2023, null, 20.591260281974, 23.021728866442675, 424, 530)); + } + + /** Runs the variance functions grouped by country; the first row of each country expects 0 for the population variants and null for the sample variants. */ @Test + public void testStreamstatsVarianceBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) by country", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows("John", "Canada", "Ontario", 4, 2023, 25, 0, null, 0, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2.5, 3.5355339059327378, 6.25, 12.5)); + } + + /** Groups stddev_samp(age) by span(age, 10) after filtering out rows whose country is USA. */ @Test + public void testStreamstatsVarianceBySpan() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | where country != 'USA' | streamstats stddev_samp(age) by span(age," + + " 10)", + TEST_INDEX_STATE_COUNTRY)); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, null), +
rows("Jane", "Canada", "Quebec", 4, 2023, 20, 3.5355339059327378)); + } + + /** Runs the variance functions grouped by country on the index variant with nulls; the row whose country and age are null expects null for all four statistics. */ @Test + public void testStreamstatsVarianceWithNullBy() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats stddev_pop(age), stddev_samp(age), var_pop(age)," + + " var_samp(age) by country", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 0, null, 0, null), + rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800), + rows("John", "Canada", "Ontario", 4, 2023, 25, 0, null, 0, null), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2.5, 3.5355339059327378, 6.25, 12.5), + rows( + null, + "Canada", + null, + 4, + 2023, + 10, + 6.2360956446232345, + 7.6376261582597325, + 38.88888888888888, + 58.333333333333314), + rows("Kevin", null, null, 4, 2023, null, null, null, null, null)); + } + + /** Checks schema and rows of dc(state): the dc_state column is typed bigint and the expected count rises by one on each of the four rows. */ @Test + public void testStreamstatsDistinctCount() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state", TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4)); + } + + /** Runs the dc(state) query grouped by country; the expected count starts again at 1 for each country. */ @Test + public void testStreamstatsDistinctCountByCountry() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state by country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year",
"int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 1), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2)); + } + + /** Uses the distinct_count function name on country and checks the bigint dc_country column and its rows. */ @Test + public void testStreamstatsDistinctCountFunction() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats distinct_count(country) as dc_country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_country", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 1), + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2)); + } + + /** Runs dc(state) on the index variant with nulls; the two rows whose state is null expect the count to stay at 4. */ @Test + public void testStreamstatsDistinctCountWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats dc(state) as dc_state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("Jake", "USA", "California", 4, 2023, 70, 1), + rows("Hello", "USA", "New York", 4, 2023, 30, 2), + rows("John", "Canada", "Ontario", 4, 2023, 25, 3), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows(null, "Canada", null, 4, 2023, 10, 4), + rows("Kevin", null, null, 4, 2023, null, 4)); + } + + /** Checks the schema and rows of earliest(message) and latest(message) grouped by server on the logs index. */ @Test + public void testStreamstatsEarliestAndLatest() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | streamstats
earliest(message), latest(message) by server", + TEST_INDEX_LOGS)); + verifySchema( + actual, + schema("created_at", "timestamp"), + schema("server", "string"), + schema("@timestamp", "timestamp"), + schema("message", "string"), + schema("level", "string"), + schema("earliest(message)", "string"), + schema("latest(message)", "string")); + verifyDataRows( + actual, + rows( + "2023-01-05 00:00:00", + "server1", + "2023-01-01 00:00:00", + "Database connection failed", + "ERROR", + "Database connection failed", + "Database connection failed"), + rows( + "2023-01-04 00:00:00", + "server2", + "2023-01-02 00:00:00", + "Service started", + "INFO", + "Service started", + "Service started"), + rows( + "2023-01-03 00:00:00", + "server1", + "2023-01-03 00:00:00", + "High memory usage", + "WARN", + "Database connection failed", + "High memory usage"), + rows( + "2023-01-02 00:00:00", + "server3", + "2023-01-04 00:00:00", + "Disk space low", + "ERROR", + "Disk space low", + "Disk space low"), + rows( + "2023-01-01 00:00:00", + "server2", + "2023-01-05 00:00:00", + "Backup completed", + "INFO", + "Service started", + "Backup completed")); + } +} diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml new file mode 100644 index 00000000000..9dd91501bf8 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_dc.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableWindow(window#0=[window(rows
between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml new file mode 100644 index 00000000000..32538ab17df --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_distinct_count.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + 
CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..12=[{inputs}], proj#0..10=[{exprs}], distinct_states=[$t12]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml new file mode 100644 index 00000000000..cac21b929ee --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (PARTITION BY $1 ROWS 
UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..7=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t6], latest_message=[$t7]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$5], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {1} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml new file mode 100644 index 00000000000..f19625d85e5 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_custom_time.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], 
_routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..7=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t6], latest_message=[$t7]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$5], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $0), ARG_MAX($3, $0)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml new file mode 100644 index 00000000000..f17643ab804 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_earliest_latest_no_group.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[ARG_MIN($3, $2) OVER (ROWS UNBOUNDED PRECEDING)], 
latest_message=[ARG_MAX($3, $2) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]], PushDownContext=[[PROJECT->[created_at, server, @timestamp, message, level]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["created_at","server","@timestamp","message","level"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml new file mode 100644 index 00000000000..293dd785f96 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_global.yaml @@ -0,0 +1,29 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(>=($17, -($cor0.__stream_seq__, 1)), <=($17, 
$cor0.__stream_seq__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..11=[{inputs}], expr#12=[1], expr#13=[-($t11, $t12)], proj#0..11=[{exprs}], $f12=[$t13]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($0, $3), >=($5, $2), <=($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..1=[{inputs}], expr#2=[1], expr#3=[-($t1, $t2)], proj#0..1=[{exprs}], $f2=[$t3]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and 
CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml new file mode 100644 index 00000000000..0e8ed3a3dde --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_streamstats_reset.yaml @@ -0,0 +1,38 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$21]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17, 20}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 
0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(<($17, $cor0.__stream_seq__), =($20, $cor0.__seg_id__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..15=[{inputs}], expr#16=[0], expr#17=[COALESCE($t15, $t16)], expr#18=[+($t14, $t17)], proj#0..11=[{exprs}], __seg_id__=[$t18]) + 
EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($12)])], window#1=[window(rows between UNBOUNDED PRECEDING and $14 PRECEDING aggs [$SUM0($13)])], constants=[[1]]) + EnumerableCalc(expr#0..11=[{inputs}], expr#12=[34], expr#13=[>($t8, $t12)], expr#14=[1], expr#15=[0], expr#16=[CASE($t13, $t14, $t15)], expr#17=[25], expr#18=[<($t8, $t17)], expr#19=[CASE($t18, $t14, $t15)], proj#0..11=[{exprs}], __reset_before_flag__=[$t16], __reset_after_flag__=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($2, $6), =($0, $3), <($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[COALESCE($t5, $t6)], expr#8=[+($t4, $t7)], proj#0..1=[{exprs}], __seg_id__=[$t8]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($2)])], window#1=[window(rows between UNBOUNDED PRECEDING and $4 PRECEDING aggs [$SUM0($3)])], constants=[[1]]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[34], expr#4=[>($t1, $t3)], expr#5=[1], expr#6=[0], expr#7=[CASE($t4, $t5, $t6)], 
expr#8=[25], expr#9=[<($t1, $t8)], expr#10=[CASE($t9, $t5, $t6)], gender=[$t0], __stream_seq__=[$t2], __reset_before_flag__=[$t7], __reset_after_flag__=[$t10]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableCalc(expr#0..6=[{inputs}], expr#7=[0], expr#8=[COALESCE($t6, $t7)], expr#9=[+($t5, $t8)], proj#0..2=[{exprs}], __seg_id__=[$t9]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($3)])], window#1=[window(rows between UNBOUNDED PRECEDING and $5 PRECEDING aggs [$SUM0($4)])], constants=[[1]]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[34], expr#4=[>($t1, $t3)], expr#5=[1], expr#6=[0], expr#7=[CASE($t4, $t5, $t6)], expr#8=[25], expr#9=[<($t1, $t8)], expr#10=[CASE($t9, $t5, $t6)], proj#0..2=[{exprs}], __reset_before_flag__=[$t7], __reset_after_flag__=[$t10]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[gender, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["gender","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml new file mode 100644 index 00000000000..6ffa5ad304c --- /dev/null +++ 
b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_dc.yaml @@ -0,0 +1,10 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..17=[{inputs}], proj#0..10=[{exprs}], $11=[$t17]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml new file mode 100644 index 00000000000..550cf0ea9cb --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_distinct_count.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], distinct_states=[DISTINCT_COUNT_APPROX($7) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(account_number=[$0], 
firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..18=[{inputs}], proj#0..10=[{exprs}], distinct_states=[$t18]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$17], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [DISTINCT_COUNT_APPROX($7)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml new file mode 100644 index 00000000000..c37fae48771 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (PARTITION BY $1 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], 
level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..13=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t12], latest_message=[$t13]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {1} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml new file mode 100644 index 00000000000..b85e4b6b7bb --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_custom_time.yaml @@ -0,0 +1,15 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[$12], latest_message=[$13]) + LogicalSort(sort0=[$11], dir0=[ASC]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], _routing=[$10], __stream_seq__=[$11], earliest_message=[ARG_MIN($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $0) OVER (PARTITION BY $4 ROWS UNBOUNDED PRECEDING)]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], _id=[$5], _index=[$6], _score=[$7], _maxscore=[$8], _sort=[$9], 
_routing=[$10], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableCalc(expr#0..13=[{inputs}], proj#0..4=[{exprs}], earliest_message=[$t12], latest_message=[$t13]) + EnumerableLimit(fetch=[10000]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableWindow(window#0=[window(partition {4} rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $0), ARG_MAX($3, $0)])]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml new file mode 100644 index 00000000000..79dcbca7555 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_earliest_latest_no_group.yaml @@ -0,0 +1,10 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(created_at=[$0], server=[$1], @timestamp=[$2], message=[$3], level=[$4], earliest_message=[ARG_MIN($3, $2) OVER (ROWS UNBOUNDED PRECEDING)], latest_message=[ARG_MAX($3, $2) OVER (ROWS UNBOUNDED PRECEDING)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..12=[{inputs}], proj#0..4=[{exprs}], $5=[$t11], $6=[$t12]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ARG_MIN($3, $2), ARG_MAX($3, $2)])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_logs]]) \ No newline at end of file diff --git 
a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml new file mode 100644 index 00000000000..3ac52e02f55 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_global.yaml @@ -0,0 +1,30 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], avg_age=[$18]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(>=($17, -($cor0.__stream_seq__, 1)), <=($17, $cor0.__stream_seq__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + 
EnumerableCalc(expr#0..17=[{inputs}], expr#18=[1], expr#19=[-($t17, $t18)], proj#0..10=[{exprs}], __stream_seq__=[$t17], $f12=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($0, $3), >=($5, $2), <=($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[1], expr#19=[-($t17, $t18)], gender=[$t4], __stream_seq__=[$t17], $f12=[$t19]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..17=[{inputs}], gender=[$t4], age=[$t8], $2=[$t17]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml new file mode 100644 index 00000000000..be28e9b1d8c --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_streamstats_reset.yaml @@ -0,0 +1,38 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], 
email=[$9], lastname=[$10], avg_age=[$21]) + LogicalSort(sort0=[$17], dir0=[ASC]) + LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{4, 17, 20}]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + LogicalAggregate(group=[{}], avg_age=[AVG($8)]) + LogicalFilter(condition=[AND(<($17, $cor0.__stream_seq__), =($20, $cor0.__seg_id__), =($4, $cor0.gender))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], __stream_seq__=[$17], __reset_before_flag__=[$18], __reset_after_flag__=[$19], __seg_id__=[+(SUM($18) OVER (ROWS UNBOUNDED PRECEDING), COALESCE(SUM($19) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], 
_routing=[$16], __stream_seq__=[ROW_NUMBER() OVER ()], __reset_before_flag__=[CASE(>($8, 34), 1, 0)], __reset_after_flag__=[CASE(<($8, 25), 1, 0)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}], avg_age=[$t16]) + EnumerableLimit(fetch=[10000]) + EnumerableHashJoin(condition=[AND(=($4, $13), =($11, $14), =($12, $15))], joinType=[left]) + EnumerableSort(sort0=[$11], dir0=[ASC]) + EnumerableCalc(expr#0..15=[{inputs}], expr#16=[0], expr#17=[COALESCE($t15, $t16)], expr#18=[+($t14, $t17)], proj#0..11=[{exprs}], __seg_id__=[$t18]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($12)])], window#1=[window(rows between UNBOUNDED PRECEDING and $14 PRECEDING aggs [$SUM0($13)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], proj#0..10=[{exprs}], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..4=[{inputs}], expr#5=[0], expr#6=[=($t4, $t5)], expr#7=[null:BIGINT], expr#8=[CASE($t6, $t7, $t3)], expr#9=[CAST($t8):DOUBLE], expr#10=[/($t9, $t4)], proj#0..2=[{exprs}], avg_age=[$t10]) + EnumerableAggregate(group=[{0, 1, 2}], agg#0=[$SUM0($4)], agg#1=[COUNT($4)]) + EnumerableHashJoin(condition=[AND(=($2, $6), =($0, $3), <($5, $1))], joinType=[inner]) + EnumerableAggregate(group=[{0, 1, 2}]) + EnumerableCalc(expr#0..5=[{inputs}], expr#6=[0], expr#7=[COALESCE($t5, $t6)], expr#8=[+($t4, $t7)], proj#0..1=[{exprs}], __seg_id__=[$t8]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and 
CURRENT ROW aggs [$SUM0($2)])], window#1=[window(rows between UNBOUNDED PRECEDING and $4 PRECEDING aggs [$SUM0($3)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], gender=[$t4], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableCalc(expr#0..6=[{inputs}], expr#7=[0], expr#8=[COALESCE($t6, $t7)], expr#9=[+($t5, $t8)], proj#0..2=[{exprs}], __seg_id__=[$t9]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [$SUM0($3)])], window#1=[window(rows between UNBOUNDED PRECEDING and $5 PRECEDING aggs [$SUM0($4)])], constants=[[1]]) + EnumerableCalc(expr#0..17=[{inputs}], expr#18=[34], expr#19=[>($t8, $t18)], expr#20=[1], expr#21=[0], expr#22=[CASE($t19, $t20, $t21)], expr#23=[25], expr#24=[<($t8, $t23)], expr#25=[CASE($t24, $t20, $t21)], gender=[$t4], age=[$t8], __stream_seq__=[$t17], __reset_before_flag__=[$t22], __reset_after_flag__=[$t25]) + EnumerableWindow(window#0=[window(rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index dac39d48397..511122fa28c 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -22,6 +22,7 @@ TABLE: 'TABLE'; // Alias for FIELDS command RENAME: 'RENAME'; STATS: 'STATS'; EVENTSTATS: 'EVENTSTATS'; +STREAMSTATS: 'STREAMSTATS'; DEDUP: 'DEDUP'; SORT: 'SORT'; EVAL: 'EVAL'; @@ -110,6 +111,11 @@ DEDUP_SPLITVALUES: 
'DEDUP_SPLITVALUES'; PARTITIONS: 'PARTITIONS'; ALLNUM: 'ALLNUM'; DELIM: 'DELIM'; +CURRENT: 'CURRENT'; +WINDOW: 'WINDOW'; +GLOBAL: 'GLOBAL'; +RESET_BEFORE: 'RESET_BEFORE'; +RESET_AFTER: 'RESET_AFTER'; BUCKET_NULLABLE: 'BUCKET_NULLABLE'; USENULL: 'USENULL'; CENTROIDS: 'CENTROIDS'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index f103c52759a..6b98fac02d6 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -54,6 +54,7 @@ commands | renameCommand | statsCommand | eventstatsCommand + | streamstatsCommand | dedupCommand | sortCommand | evalCommand @@ -92,6 +93,7 @@ commandName | RENAME | STATS | EVENTSTATS + | STREAMSTATS | DEDUP | SORT | EVAL @@ -245,6 +247,34 @@ eventstatsCommand : EVENTSTATS eventstatsAggTerm (COMMA eventstatsAggTerm)* (statsByClause)? ; +streamstatsCommand + : STREAMSTATS streamstatsArgs streamstatsAggTerm (COMMA streamstatsAggTerm)* (statsByClause)? + ; + +streamstatsArgs + : (currentArg | windowArg | globalArg | resetBeforeArg | resetAfterArg)* + ; + +currentArg + : CURRENT EQUAL current = booleanLiteral + ; + +windowArg + : WINDOW EQUAL window = integerLiteral + ; + +globalArg + : GLOBAL EQUAL global = booleanLiteral + ; + +resetBeforeArg + : RESET_BEFORE EQUAL logicalExpression + ; + +resetAfterArg + : RESET_AFTER EQUAL logicalExpression + ; + dedupCommand : DEDUP (number = integerLiteral)? fieldList (KEEPEMPTY EQUAL keepempty = booleanLiteral)? (CONSECUTIVE EQUAL consecutive = booleanLiteral)? ; @@ -629,6 +659,10 @@ eventstatsAggTerm : windowFunction (AS alias = wcFieldExpression)? ; +streamstatsAggTerm + : windowFunction (AS alias = wcFieldExpression)? 
+ ; + windowFunction : windowFunctionName LT_PRTHS functionArgs RT_PRTHS ; @@ -1456,6 +1490,11 @@ searchableKeyWord | PARTITIONS | ALLNUM | DELIM + | CURRENT + | WINDOW + | GLOBAL + | RESET_BEFORE + | RESET_AFTER | BUCKET_NULLABLE | USENULL | CENTROIDS diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 8802dcbf3c9..65323229162 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -45,7 +45,29 @@ import org.apache.commons.lang3.tuple.Pair; import org.opensearch.sql.ast.EmptySourcePropagateVisitor; import org.opensearch.sql.ast.dsl.AstDSL; -import org.opensearch.sql.ast.expression.*; +import org.opensearch.sql.ast.expression.Alias; +import org.opensearch.sql.ast.expression.AllFieldsExcludeMeta; +import org.opensearch.sql.ast.expression.Argument; +import org.opensearch.sql.ast.expression.Argument.ArgumentMap; +import org.opensearch.sql.ast.expression.DataType; +import org.opensearch.sql.ast.expression.EqualTo; +import org.opensearch.sql.ast.expression.Field; +import org.opensearch.sql.ast.expression.Let; +import org.opensearch.sql.ast.expression.Literal; +import org.opensearch.sql.ast.expression.Map; +import org.opensearch.sql.ast.expression.ParseMethod; +import org.opensearch.sql.ast.expression.PatternMethod; +import org.opensearch.sql.ast.expression.PatternMode; +import org.opensearch.sql.ast.expression.QualifiedName; +import org.opensearch.sql.ast.expression.SearchAnd; +import org.opensearch.sql.ast.expression.SearchExpression; +import org.opensearch.sql.ast.expression.SearchGroup; +import org.opensearch.sql.ast.expression.Span; +import org.opensearch.sql.ast.expression.SpanUnit; +import org.opensearch.sql.ast.expression.UnresolvedArgument; +import org.opensearch.sql.ast.expression.UnresolvedExpression; +import org.opensearch.sql.ast.expression.WindowFrame; +import 
org.opensearch.sql.ast.expression.WindowFunction; import org.opensearch.sql.ast.tree.AD; import org.opensearch.sql.ast.tree.Aggregation; import org.opensearch.sql.ast.tree.Append; @@ -83,6 +105,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SpanBin; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -446,6 +469,7 @@ public UnresolvedPlan visitStatsCommand(StatsCommandContext ctx) { return aggregation; } + /** Eventstats command. */ public UnresolvedPlan visitEventstatsCommand(OpenSearchPPLParser.EventstatsCommandContext ctx) { ImmutableList.Builder windownFunctionListBuilder = new ImmutableList.Builder<>(); @@ -467,6 +491,92 @@ public UnresolvedPlan visitEventstatsCommand(OpenSearchPPLParser.EventstatsComma return new Window(windownFunctionListBuilder.build()); } + /** Streamstats command. */ + public UnresolvedPlan visitStreamstatsCommand(OpenSearchPPLParser.StreamstatsCommandContext ctx) { + // 1. 
Parse arguments from the streamstats command + List argExprList = ArgumentFactory.getArgumentList(ctx); + ArgumentMap arguments = ArgumentMap.of(argExprList); + + // current, window and global from ArgumentFactory + boolean current = (Boolean) arguments.get("current").getValue(); + int window = (Integer) arguments.get("window").getValue(); + boolean global = (Boolean) arguments.get("global").getValue(); + + if (window < 0) { + throw new IllegalArgumentException("Window size must be >= 0, but got: " + window); + } + + // reset_before, reset_after + UnresolvedExpression resetBeforeExpr = + Optional.ofNullable(ctx.streamstatsArgs()) + .filter(args -> args.resetBeforeArg() != null && !args.resetBeforeArg().isEmpty()) + .map(args -> expressionBuilder.visit(args.resetBeforeArg(0).logicalExpression())) + .orElse(null); + + UnresolvedExpression resetAfterExpr = + Optional.ofNullable(ctx.streamstatsArgs()) + .filter(args -> args.resetAfterArg() != null && !args.resetAfterArg().isEmpty()) + .map(args -> expressionBuilder.visit(args.resetAfterArg(0).logicalExpression())) + .orElse(null); + + // 2.1 Build a WindowFrame from the provided arguments + WindowFrame frame = buildFrameFromArgs(current, window); + // 2.2 Build groupList + List groupList = getPartitionExprList(ctx.statsByClause()); + + // 3. Build each window function in the command + ImmutableList.Builder windowFunctionListBuilder = + new ImmutableList.Builder<>(); + + for (OpenSearchPPLParser.StreamstatsAggTermContext aggCtx : ctx.streamstatsAggTerm()) { + UnresolvedExpression windowFunction = internalVisitExpression(aggCtx.windowFunction()); + if (windowFunction instanceof WindowFunction wf) { + // Attach PARTITION BY clause expressions + wf.setPartitionByList(groupList); + // Inject the frame + wf.setWindowFrame(frame); + } + String name = + aggCtx.alias == null + ? 
getTextInQuery(aggCtx) + : StringUtils.unquoteIdentifier(aggCtx.alias.getText()); + Alias alias = new Alias(name, windowFunction); + windowFunctionListBuilder.add(alias); + } + + // 4. Build StreamWindow AST node + return new StreamWindow( + windowFunctionListBuilder.build(), + groupList, + current, + window, + global, + resetBeforeExpr, + resetAfterExpr); + } + + private WindowFrame buildFrameFromArgs(boolean current, int window) { + // Build the frame + if (window > 0) { + if (current) { + // N-1 PRECEDING to CURRENT ROW + return WindowFrame.of( + WindowFrame.FrameType.ROWS, (window - 1) + " PRECEDING", "CURRENT ROW"); + } else { + // N PRECEDING to 1 PRECEDING + return WindowFrame.of(WindowFrame.FrameType.ROWS, window + " PRECEDING", "1 PRECEDING"); + } + } else { + // Default: running total + if (current) { + return WindowFrame.toCurrentRow(); + } else { + // Default: running total excluding current row + return WindowFrame.of(WindowFrame.FrameType.ROWS, "UNBOUNDED PRECEDING", "1 PRECEDING"); + } + } + } + /** Dedup command. */ @Override public UnresolvedPlan visitDedupCommand(DedupCommandContext ctx) { diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java index 85481da2426..acf204e8030 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/ArgumentFactory.java @@ -28,6 +28,7 @@ import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.IntegerLiteralContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.PrefixSortFieldContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SortFieldContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.StreamstatsCommandContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SuffixSortFieldContext; /** Util class to get all arguments as a list from the PPL command. 
*/ @@ -89,6 +90,25 @@ private static boolean legacyPreferred(Settings settings) { || Boolean.TRUE.equals(settings.getSettingValue(Settings.Key.PPL_SYNTAX_LEGACY_PREFERRED)); } + /** + * Get list of {@link Argument}. + * + * @param ctx StreamstatsCommandContext instance + * @return the list of arguments fetched from the streamstats command + */ + public static List getArgumentList(StreamstatsCommandContext ctx) { + return Arrays.asList( + ctx.streamstatsArgs().currentArg() != null && !ctx.streamstatsArgs().currentArg().isEmpty() + ? new Argument("current", getArgumentValue(ctx.streamstatsArgs().currentArg(0).current)) + : new Argument("current", new Literal(true, DataType.BOOLEAN)), + ctx.streamstatsArgs().windowArg() != null && !ctx.streamstatsArgs().windowArg().isEmpty() + ? new Argument("window", getArgumentValue(ctx.streamstatsArgs().windowArg(0).window)) + : new Argument("window", new Literal(0, DataType.INTEGER)), + ctx.streamstatsArgs().globalArg() != null && !ctx.streamstatsArgs().globalArg().isEmpty() + ? new Argument("global", getArgumentValue(ctx.streamstatsArgs().globalArg(0).global)) + : new Argument("global", new Literal(true, DataType.BOOLEAN))); + } + /** * Get list of {@link Argument}. 
* diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index f8c935175d0..5b599ae162c 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -85,6 +85,7 @@ import org.opensearch.sql.ast.tree.Search; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SpanBin; +import org.opensearch.sql.ast.tree.StreamWindow; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Timechart; @@ -377,6 +378,14 @@ public String visitWindow(Window node, String context) { child, String.join(" ", visitExpressionList(node.getWindowFunctionList())).trim()); } + @Override + public String visitStreamWindow(StreamWindow node, String context) { + String child = node.getChild().get(0).accept(this, context); + return StringUtils.format( + "%s | streamstats %s", + child, String.join(" ", visitExpressionList(node.getWindowFunctionList())).trim()); + } + /** Build {@link LogicalRareTopN}. 
*/ @Override public String visitRareTopN(RareTopN node, String context) { diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java new file mode 100644 index 00000000000..04f4c7610d9 --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStreamstatsTest.java @@ -0,0 +1,189 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.test.CalciteAssert; +import org.junit.Test; + +public class CalcitePPLStreamstatsTest extends CalcitePPLAbstractTest { + + public CalcitePPLStreamstatsTest() { + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); + } + + @Test + public void testStreamstatsBy() { + String ppl = "source=EMP | streamstats max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], max(SAL)=[MAX($5) OVER" + + " (PARTITION BY $7 ROWS UNBOUNDED PRECEDING)])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (PARTITION BY `DEPTNO` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)" + + " `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () 
`__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t`\n" + + "ORDER BY `__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsCurrent() { + String ppl = "source=EMP | streamstats current = false max(SAL)"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[MAX($5) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) `max(SAL)`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsWindow() { + String ppl = "source=EMP | streamstats window = 5 max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{7," + + " 8}])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalAggregate(group=[{}], max(SAL)=[MAX($5)])\n" + + " LogicalFilter(condition=[AND(>=($8, -($cor0.__stream_seq__, 4)), <=($8," + + " $cor0.__stream_seq__), =($7, $cor0.DEPTNO))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, 
expectedLogical); + + String expectedSparkSql = + "SELECT `$cor0`.`EMPNO`, `$cor0`.`ENAME`, `$cor0`.`JOB`, `$cor0`.`MGR`, `$cor0`.`HIREDATE`," + + " `$cor0`.`SAL`, `$cor0`.`COMM`, `$cor0`.`DEPTNO`, `t2`.`max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `$cor0`,\n" + + "LATERAL (SELECT MAX(`SAL`) `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t0`\n" + + "WHERE `__stream_seq__` >= `$cor0`.`__stream_seq__` - 4 AND `__stream_seq__` <=" + + " `$cor0`.`__stream_seq__` AND `DEPTNO` = `$cor0`.`DEPTNO`) `t2`\n" + + "ORDER BY `$cor0`.`__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsGlobal() { + String ppl = "source=EMP | streamstats window = 5 global= false max(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], max(SAL)=[$9])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], max(SAL)=[MAX($5) OVER" + + " (PARTITION BY $7 ROWS 4 PRECEDING)])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`, MAX(`SAL`)" + + " OVER (PARTITION BY `DEPTNO` ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) `max(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, 
`DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`\n" + + "FROM `scott`.`EMP`) `t`\n" + + "ORDER BY `__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testStreamstatsReset() { + String ppl = + "source=EMP | streamstats reset_before=SAL>100 reset_after=SAL<50 avg(SAL) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], avg(SAL)=[$12])\n" + + " LogicalSort(sort0=[$8], dir0=[ASC])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{7, 8," + + " 11}])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], __reset_before_flag__=[$9]," + + " __reset_after_flag__=[$10], __seg_id__=[+(SUM($9) OVER (ROWS UNBOUNDED PRECEDING)," + + " COALESCE(SUM($10) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER ()]," + + " __reset_before_flag__=[CASE(>($5, 100), 1, 0)], __reset_after_flag__=[CASE(<($5," + + " 50), 1, 0)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " LogicalAggregate(group=[{}], avg(SAL)=[AVG($5)])\n" + + " LogicalFilter(condition=[AND(<=($8, $cor0.__stream_seq__), =($11," + + " $cor0.__seg_id__), =($7, $cor0.DEPTNO))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], __stream_seq__=[$8], __reset_before_flag__=[$9]," + + " __reset_after_flag__=[$10], __seg_id__=[+(SUM($9) OVER (ROWS UNBOUNDED PRECEDING)," + + " COALESCE(SUM($10) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0))])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3]," + + " HIREDATE=[$4], SAL=[$5], COMM=[$6], 
DEPTNO=[$7], __stream_seq__=[ROW_NUMBER() OVER" + + " ()], __reset_before_flag__=[CASE(>($5, 100), 1, 0)]," + + " __reset_after_flag__=[CASE(<($5, 50), 1, 0)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `$cor0`.`EMPNO`, `$cor0`.`ENAME`, `$cor0`.`JOB`, `$cor0`.`MGR`, `$cor0`.`HIREDATE`," + + " `$cor0`.`SAL`, `$cor0`.`COMM`, `$cor0`.`DEPTNO`, `t4`.`avg(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `__stream_seq__`, `__reset_before_flag__`, `__reset_after_flag__`," + + " (SUM(`__reset_before_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT" + + " ROW)) + COALESCE(SUM(`__reset_after_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING), 0) `__seg_id__`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`, CASE WHEN `SAL` > 100 THEN 1 ELSE 0 END" + + " `__reset_before_flag__`, CASE WHEN `SAL` < 50 THEN 1 ELSE 0 END" + + " `__reset_after_flag__`\n" + + "FROM `scott`.`EMP`) `t`) `$cor0`,\n" + + "LATERAL (SELECT AVG(`SAL`) `avg(SAL)`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `__stream_seq__`, `__reset_before_flag__`, `__reset_after_flag__`," + + " (SUM(`__reset_before_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT" + + " ROW)) + COALESCE(SUM(`__reset_after_flag__`) OVER (ROWS BETWEEN UNBOUNDED PRECEDING" + + " AND 1 PRECEDING), 0) `__seg_id__`\n" + + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " ROW_NUMBER() OVER () `__stream_seq__`, CASE WHEN `SAL` > 100 THEN 1 ELSE 0 END" + + " `__reset_before_flag__`, CASE WHEN `SAL` < 50 THEN 1 ELSE 0 END" + + " `__reset_after_flag__`\n" + + "FROM `scott`.`EMP`) `t1`) `t2`\n" + + "WHERE `__stream_seq__` <= `$cor0`.`__stream_seq__` AND `__seg_id__` =" + + " 
`$cor0`.`__seg_id__` AND `DEPTNO` = `$cor0`.`DEPTNO`) `t4`\n" + + "ORDER BY `$cor0`.`__stream_seq__` NULLS LAST"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } +} diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 2f18db5c995..48f6c45b4c6 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -173,6 +173,34 @@ public void testEventstatsCommandWithSpanFunction() { anonymize("source=t | eventstats count(a) by span(b, 1d), c")); } + @Test + public void testStreamstatsCommandWithByClause() { + assertEquals( + "source=table | streamstats count(identifier) by identifier", + anonymize("source=t | streamstats count(a) by b")); + } + + @Test + public void testStreamstatsCommandWithWindowAndCurrent() { + assertEquals( + "source=table | streamstats max(identifier)", + anonymize("source=t | streamstats current=false window=2 max(a)")); + } + + @Test + public void testStreamstatsCommandWithNestedFunctions() { + assertEquals( + "source=table | streamstats sum(+(identifier,identifier))", + anonymize("source=t | streamstats sum(a+b)")); + } + + @Test + public void testStreamstatsCommandWithSpanFunction() { + assertEquals( + "source=table | streamstats count(identifier) by span(identifier, *** d),identifier", + anonymize("source=t | streamstats count(a) by span(b, 1d), c")); + } + @Test public void testBinCommandBasic() { assertEquals("source=table | bin identifier span=***", anonymize("source=t | bin f span=10"));