From 8e5e5f1c0470ef15b6b0163ba2716383830f55d2 Mon Sep 17 00:00:00 2001 From: Kyle Hounslow Date: Mon, 8 Dec 2025 15:58:53 -0800 Subject: [PATCH 1/9] Export PPL Command docs from SQL plugin repo. Signed-off-by: Kyle Hounslow --- .gitignore | 5 + _dashboards/management/S3-data-source.md | 6 +- _sql-and-ppl/ppl/cmd/ad.md | 123 ++ _sql-and-ppl/ppl/cmd/addcoltotals.md | 84 + _sql-and-ppl/ppl/cmd/addtotals.md | 108 ++ _sql-and-ppl/ppl/cmd/append.md | 68 + _sql-and-ppl/ppl/cmd/appendcol.md | 123 ++ _sql-and-ppl/ppl/cmd/appendpipe.md | 74 + _sql-and-ppl/ppl/cmd/bin.md | 423 +++++ _sql-and-ppl/ppl/cmd/chart.md | 190 ++ _sql-and-ppl/ppl/cmd/dedup.md | 128 ++ _sql-and-ppl/ppl/cmd/describe.md | 71 + _sql-and-ppl/ppl/cmd/eval.md | 127 ++ _sql-and-ppl/ppl/cmd/eventstats.md | 164 ++ _sql-and-ppl/ppl/cmd/expand.md | 57 + _sql-and-ppl/ppl/cmd/explain.md | 200 ++ _sql-and-ppl/ppl/cmd/fields.md | 224 +++ _sql-and-ppl/ppl/cmd/fillnull.md | 169 ++ _sql-and-ppl/ppl/cmd/flatten.md | 101 + _sql-and-ppl/ppl/cmd/grok.md | 87 + _sql-and-ppl/ppl/cmd/head.md | 85 + _sql-and-ppl/ppl/cmd/index.md | 15 + _sql-and-ppl/ppl/cmd/join.md | 216 +++ _sql-and-ppl/ppl/cmd/kmeans.md | 45 + _sql-and-ppl/ppl/cmd/lookup.md | 357 ++++ _sql-and-ppl/ppl/cmd/ml.md | 151 ++ _sql-and-ppl/ppl/cmd/multisearch.md | 151 ++ _sql-and-ppl/ppl/cmd/parse.md | 134 ++ _sql-and-ppl/ppl/cmd/patterns.md | 244 +++ _sql-and-ppl/ppl/cmd/rare.md | 138 ++ _sql-and-ppl/ppl/cmd/regex.md | 147 ++ _sql-and-ppl/ppl/cmd/rename.md | 138 ++ _sql-and-ppl/ppl/cmd/replace.md | 301 +++ _sql-and-ppl/ppl/cmd/reverse.md | 129 ++ _sql-and-ppl/ppl/cmd/rex.md | 277 +++ _sql-and-ppl/ppl/cmd/search.md | 653 +++++++ _sql-and-ppl/ppl/cmd/showdatasources.md | 39 + _sql-and-ppl/ppl/cmd/sort.md | 234 +++ _sql-and-ppl/ppl/cmd/spath.md | 108 ++ _sql-and-ppl/ppl/cmd/stats.md | 464 +++++ _sql-and-ppl/ppl/cmd/streamstats.md | 266 +++ _sql-and-ppl/ppl/cmd/subquery.md | 226 +++ _sql-and-ppl/ppl/cmd/syntax.md | 83 + _sql-and-ppl/ppl/cmd/table.md | 44 + _sql-and-ppl/ppl/cmd/timechart.md | 353 ++++ _sql-and-ppl/ppl/cmd/top.md | 152 ++ _sql-and-ppl/ppl/cmd/trendline.md | 112 ++ _sql-and-ppl/ppl/cmd/where.md | 188 ++ _sql-and-ppl/ppl/functions.md | 2133 ---------------------- 49 files changed, 7979 insertions(+), 2136 deletions(-) create mode 100644 _sql-and-ppl/ppl/cmd/ad.md create mode 100644 _sql-and-ppl/ppl/cmd/addcoltotals.md create mode 100644 _sql-and-ppl/ppl/cmd/addtotals.md create mode 100644 _sql-and-ppl/ppl/cmd/append.md create mode 100644 _sql-and-ppl/ppl/cmd/appendcol.md create mode 100644 _sql-and-ppl/ppl/cmd/appendpipe.md create mode 100644 _sql-and-ppl/ppl/cmd/bin.md create mode 100644 _sql-and-ppl/ppl/cmd/chart.md create mode 100644 _sql-and-ppl/ppl/cmd/dedup.md create mode 100644 _sql-and-ppl/ppl/cmd/describe.md create mode 100644 _sql-and-ppl/ppl/cmd/eval.md create mode 100644 _sql-and-ppl/ppl/cmd/eventstats.md create mode 100644 _sql-and-ppl/ppl/cmd/expand.md create mode 100644 _sql-and-ppl/ppl/cmd/explain.md create mode 100644 _sql-and-ppl/ppl/cmd/fields.md create mode 100644 _sql-and-ppl/ppl/cmd/fillnull.md create mode 100644 _sql-and-ppl/ppl/cmd/flatten.md create mode 100644 _sql-and-ppl/ppl/cmd/grok.md create mode 100644 _sql-and-ppl/ppl/cmd/head.md create mode 100644 _sql-and-ppl/ppl/cmd/index.md create mode 100644 _sql-and-ppl/ppl/cmd/join.md create mode 100644 _sql-and-ppl/ppl/cmd/kmeans.md create mode 100644 _sql-and-ppl/ppl/cmd/lookup.md create mode 100644 _sql-and-ppl/ppl/cmd/ml.md create mode 100644 _sql-and-ppl/ppl/cmd/multisearch.md create mode 100644 
_sql-and-ppl/ppl/cmd/parse.md create mode 100644 _sql-and-ppl/ppl/cmd/patterns.md create mode 100644 _sql-and-ppl/ppl/cmd/rare.md create mode 100644 _sql-and-ppl/ppl/cmd/regex.md create mode 100644 _sql-and-ppl/ppl/cmd/rename.md create mode 100644 _sql-and-ppl/ppl/cmd/replace.md create mode 100644 _sql-and-ppl/ppl/cmd/reverse.md create mode 100644 _sql-and-ppl/ppl/cmd/rex.md create mode 100644 _sql-and-ppl/ppl/cmd/search.md create mode 100644 _sql-and-ppl/ppl/cmd/showdatasources.md create mode 100644 _sql-and-ppl/ppl/cmd/sort.md create mode 100644 _sql-and-ppl/ppl/cmd/spath.md create mode 100644 _sql-and-ppl/ppl/cmd/stats.md create mode 100644 _sql-and-ppl/ppl/cmd/streamstats.md create mode 100644 _sql-and-ppl/ppl/cmd/subquery.md create mode 100644 _sql-and-ppl/ppl/cmd/syntax.md create mode 100644 _sql-and-ppl/ppl/cmd/table.md create mode 100644 _sql-and-ppl/ppl/cmd/timechart.md create mode 100644 _sql-and-ppl/ppl/cmd/top.md create mode 100644 _sql-and-ppl/ppl/cmd/trendline.md create mode 100644 _sql-and-ppl/ppl/cmd/where.md delete mode 100644 _sql-and-ppl/ppl/functions.md diff --git a/.gitignore b/.gitignore index 09b607173de..bd30add4848 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,8 @@ Gemfile.lock .jekyll-cache .project vendor/bundle +node_modules +.vscode +.ruby-version +cdk* +.dev* diff --git a/_dashboards/management/S3-data-source.md b/_dashboards/management/S3-data-source.md index 1a7cc579b08..cd2eaced383 100644 --- a/_dashboards/management/S3-data-source.md +++ b/_dashboards/management/S3-data-source.md @@ -16,9 +16,9 @@ You can connect OpenSearch to your Amazon Simple Storage Service (Amazon S3) dat Before connecting a data source, verify that the following requirements are met: -- You have access to Amazon S3 and the [AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.rst#id2). +- You have access to Amazon S3 and the [AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.md#id2). - You have access to OpenSearch and OpenSearch Dashboards. -- You have an understanding of OpenSearch data source and connector concepts. See the [developer documentation](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/datasources.rst#introduction) for more information. +- You have an understanding of OpenSearch data source and connector concepts. See the [developer documentation](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/datasources.md#introduction) for more information. ## Connect your data source @@ -46,5 +46,5 @@ This feature is currently under development, including the data integration func - Learn about [querying your data in Data Explorer]({{site.url}}{{site.baseurl}}/dashboards/management/query-data-source/) through OpenSearch Dashboards. - Learn about [optimizing the query performance of your external data sources]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/), such as Amazon S3, through Query Workbench. -- Learn about [Amazon S3 and AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.rst) and the APIS used with Amazon S3 data sources, including configuration settings and query examples. 
+- Learn about [Amazon S3 and AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.md) and the APIs used with Amazon S3 data sources, including configuration settings and query examples.
- Learn about [managing your indexes]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index/) through OpenSearch Dashboards.
diff --git a/_sql-and-ppl/ppl/cmd/ad.md b/_sql-and-ppl/ppl/cmd/ad.md
new file mode 100644
index 00000000000..23b8f80c0b5
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/ad.md
@@ -0,0 +1,123 @@
---
layout: default
title: "ad"
parent: "Commands"
grand_parent: "PPL"
nav_order: 1
---
# ad (deprecated by ml command)


The `ad` command applies the Random Cut Forest (RCF) algorithm from the ml-commons plugin to the search results returned by a PPL command. Based on the input, the command uses one of two RCF variants: fixed-in-time RCF for processing time-series data and batch RCF for processing non-time-series data.

## Syntax

The following sections describe the syntax for each RCF algorithm type.

## Fixed-in-time RCF for time-series data

`ad time_field [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [anomaly_rate] [date_format] [time_zone] [category_field]`
* `number_of_trees`: optional. Number of trees in the forest. **Default:** 30.
* `shingle_size`: optional. A shingle is a consecutive sequence of the most recent records. **Default:** 8.
* `sample_size`: optional. The sample size used by stream samplers in this forest. **Default:** 256.
* `output_after`: optional. The number of points required by stream samplers before results are returned. **Default:** 32.
* `time_decay`: optional. The decay factor used by stream samplers in this forest. **Default:** 0.0001.
* `anomaly_rate`: optional. The anomaly rate. **Default:** 0.005.
* `time_field`: mandatory. Specifies the time field for RCF to use as time-series data.
* `date_format`: optional. Used for formatting `time_field`. **Default:** "yyyy-MM-dd HH:mm:ss".
* `time_zone`: optional. Used for setting the time zone for `time_field`. **Default:** "UTC".
* `category_field`: optional. Specifies the category field used to group inputs. Each category will be independently predicted.


## Batch RCF for non-time-series data

`ad [number_of_trees] [sample_size] [output_after] [training_data_size] [anomaly_score_threshold] [category_field]`
* `number_of_trees`: optional. Number of trees in the forest. **Default:** 30.
* `sample_size`: optional. Number of random samples given to each tree from the training dataset. **Default:** 256.
* `output_after`: optional. The number of points required by stream samplers before results are returned. **Default:** 32.
* `training_data_size`: optional. **Default:** size of your training dataset.
* `anomaly_score_threshold`: optional. The threshold of the anomaly score. **Default:** 1.0.
* `category_field`: optional. Specifies the category field used to group inputs. Each category will be independently predicted.


## Example 1: Detecting events in New York City from taxi ridership data with time-series data

This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data.
+
```sql
source=nyc_taxi
| fields value, timestamp
| AD time_field='timestamp'
| where value=10844.0
```
{% include copy.html %}

Expected output:

| value | timestamp | score | anomaly_grade |
| --- | --- | --- | --- |
| 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 |


## Example 2: Detecting events in New York City from taxi ridership data with time-series data independently for each category

This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values.

```sql
source=nyc_taxi
| fields category, value, timestamp
| AD time_field='timestamp' category_field='category'
| where value=10844.0 or value=6526.0
```
{% include copy.html %}

Expected output:

| category | value | timestamp | score | anomaly_grade |
| --- | --- | --- | --- | --- |
| night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 |
| day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 |


## Example 3: Detecting events in New York City from taxi ridership data with non-time-series data

This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data.

```sql
source=nyc_taxi
| fields value
| AD
| where value=10844.0
```
{% include copy.html %}

Expected output:

| value | score | anomalous |
| --- | --- | --- |
| 10844.0 | 0.0 | False |


## Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently for each category

This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values.

```sql
source=nyc_taxi
| fields category, value
| AD category_field='category'
| where value=10844.0 or value=6526.0
```
{% include copy.html %}

Expected output:

| category | value | score | anomalous |
| --- | --- | --- | --- |
| night | 10844.0 | 0.0 | False |
| day | 6526.0 | 0.0 | False |


## Limitations

The `ad` command only works when `plugins.calcite.enabled` is set to `false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/addcoltotals.md b/_sql-and-ppl/ppl/cmd/addcoltotals.md
new file mode 100644
index 00000000000..57f7d83a1e6
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/addcoltotals.md
@@ -0,0 +1,84 @@
---
layout: default
title: "addcoltotals"
parent: "Commands"
grand_parent: "PPL"
nav_order: 2
---
# addcoltotals



The `addcoltotals` command computes the sum of each column and adds a summary event at the end to show the total of each column. This command works the same way as the `addtotals` command with the `row=false` and `col=true` options. This is useful for creating summary reports with subtotals or grand totals. The `addcoltotals` command only sums numeric fields (integers, floats, doubles). Non-numeric fields are ignored, whether they appear in the specified field list or no field list is specified.

## Syntax

Use the following syntax:

`addcoltotals [field-list] [label=<string>] [labelfield=<field>]`

- `field-list`: Optional. Comma-separated list of numeric fields to sum. If not specified, all numeric fields are summed.
- `labelfield=<field>`: Optional. Field in which to place the label. If it specifies a field that does not exist, the field is added and the label is shown in it on the summary event row.
- `label=<string>`: Optional. Custom text for the label placed in `labelfield` on the totals row. Default is "Total".
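
When no field list is given, every numeric field is summed. As a minimal hypothetical sketch (a query against the same `accounts` index used in the examples below), the following sums all numeric columns and adds a new `Total` field to hold the default label on the summary row:

```sql
source=accounts
| fields age, balance
| addcoltotals labelfield='Total'
```
{% include copy.html %}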
+
## Example 1: Basic example

The following example PPL query shows how to use `addcoltotals` to place the label in an existing field.

```sql
source=accounts
| fields firstname, balance
| head 3
| addcoltotals labelfield='firstname'
```
{% include copy.html %}

Expected output:

| firstname | balance |
| --- | --- |
| Amber | 39225 |
| Hattie | 5686 |
| Nanette | 32838 |
| Total | 77749 |

## Example 2: Adding column totals and adding a summary event with label specified

The following example PPL query shows how to use `addcoltotals` to add totals after a `stats` command, labeling the final summary event 'Sum'. Because the field specified by `labelfield` does not match an existing field, a new field is added to hold the label.

```sql
source=accounts
| stats count() by gender
| addcoltotals `count()` label='Sum' labelfield='Total'
```
{% include copy.html %}

Expected output:

| count() | gender | Total |
| --- | --- | --- |
| 1 | F | null |
| 3 | M | null |
| 4 | null | Sum |

## Example 3: With all options

The following example PPL query shows how to use `addcoltotals` with all options set.

```sql
source=accounts
| where age > 30
| stats avg(balance) as avg_balance, count() as count by state
| head 3
| addcoltotals avg_balance, count label='Sum' labelfield='Column Total'
```
{% include copy.html %}

Expected output:

| avg_balance | count | state | Column Total |
| --- | --- | --- | --- |
| 39225.0 | 1 | IL | null |
| 4180.0 | 1 | MD | null |
| 5686.0 | 1 | TN | null |
| 49091.0 | 3 | null | Sum |
diff --git a/_sql-and-ppl/ppl/cmd/addtotals.md b/_sql-and-ppl/ppl/cmd/addtotals.md
new file mode 100644
index 00000000000..6cfbf873dc8
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/addtotals.md
@@ -0,0 +1,108 @@
---
layout: default
title: "addtotals"
parent: "Commands"
grand_parent: "PPL"
nav_order: 3
---
# addtotals



The `addtotals` command computes the sum of numeric fields and appends a row with the totals to the result. The command can also add row totals and add a field to store row totals. This is useful for creating summary reports with subtotals or grand totals. The `addtotals` command only sums numeric fields (integers, floats, doubles). Non-numeric fields are ignored, whether they appear in the specified field list or no field list is specified.

## Syntax

Use the following syntax:

`addtotals [field-list] [label=<string>] [labelfield=<field>] [row=<bool>] [col=<bool>] [fieldname=<field>]`

- `field-list`: Optional. Comma-separated list of numeric fields to sum. If not specified, all numeric fields are summed.
- `row=<bool>`: Optional. Calculates the total of each row and adds a new field with the total. Default is true.
- `col=<bool>`: Optional. Calculates the total of each column and adds a new event at the end of all events with the total. Default is false.
- `labelfield=<field>`: Optional. Field in which to place the label. If it specifies a field that does not exist, the field is added and the label is shown in it on the summary event row. This is applicable when `col=true`.
- `label=<string>`: Optional. Custom text for the label placed in `labelfield` on the totals row. Default is "Total". This is applicable when `col=true`. It has no effect when the `labelfield` and `fieldname` parameters have the same value.
- `fieldname=<field>`: Optional. Name of the new field added to each row to store that row's total. This is applicable when `row=true`.
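
Because `row=true` by default, running `addtotals` with only a `fieldname` adds per-row totals without appending a summary row. The following is a minimal sketch (a hypothetical query against the same `accounts` index used in the examples below):

```sql
source=accounts
| fields balance, age
| addtotals fieldname='row_total'
```
{% include copy.html %}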
+
## Example 1: Basic example

The following example PPL query shows how to use `addtotals` to place the label in an existing field.

```sql
source=accounts
| head 3
| fields firstname, balance
| addtotals col=true labelfield='firstname' label='Total'
```
{% include copy.html %}

Expected output:

| firstname | balance | Total |
| --- | --- | --- |
| Amber | 39225 | 39225 |
| Hattie | 5686 | 5686 |
| Nanette | 32838 | 32838 |
| Total | 77749 | null |

## Example 2: Adding column totals and adding a summary event with label specified

The following example PPL query shows how to use `addtotals` to add column totals, labeling the final summary event 'Sum'. Because the field specified by `labelfield` does not match an existing field, a new field is added to hold the label.

```sql
source=accounts
| fields account_number, firstname , balance , age
| addtotals col=true row=false label='Sum' labelfield='Total'
```
{% include copy.html %}

Expected output:

| account_number | firstname | balance | age | Total |
| --- | --- | --- | --- | --- |
| 1 | Amber | 39225 | 32 | null |
| 6 | Hattie | 5686 | 36 | null |
| 13 | Nanette | 32838 | 28 | null |
| 18 | Dale | 4180 | 33 | null |
| 38 | null | 81929 | 129 | Sum |

If `row=true` in the preceding example, the column added for column totals and the column added for row totals conflict because both use the field 'Total'. In that case, the output shows null instead of 'Sum' as the final event's label, because the column is numeric and cannot hold a string value.

```sql
source=accounts
| fields account_number, firstname , balance , age
| addtotals col=true row=true label='Sum' labelfield='Total'
```
{% include copy.html %}

Expected output:

| account_number | firstname | balance | age | Total |
| --- | --- | --- | --- | --- |
| 1 | Amber | 39225 | 32 | 39258 |
| 6 | Hattie | 5686 | 36 | 5728 |
| 13 | Nanette | 32838 | 28 | 32879 |
| 18 | Dale | 4180 | 33 | 4231 |
| 38 | null | 81929 | 129 | null |

## Example 3: With all options

The following example PPL query shows how to use `addtotals` with all options set.

```sql
source=accounts
| where age > 30
| stats avg(balance) as avg_balance, count() as count by state
| head 3
| addtotals avg_balance, count row=true col=true fieldname='Row Total' label='Sum' labelfield='Column Total'
```
{% include copy.html %}

Expected output:

| avg_balance | count | state | Row Total | Column Total |
| --- | --- | --- | --- | --- |
| 39225.0 | 1 | IL | 39226.0 | null |
| 4180.0 | 1 | MD | 4181.0 | null |
| 5686.0 | 1 | TN | 5687.0 | null |
| 49091.0 | 3 | null | null | Sum |
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/append.md b/_sql-and-ppl/ppl/cmd/append.md
new file mode 100644
index 00000000000..5ee7811a7a7
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/append.md
@@ -0,0 +1,68 @@
---
layout: default
title: "append"
parent: "Commands"
grand_parent: "PPL"
nav_order: 4
---
# append


The `append` command appends the result of a sub-search and attaches it as additional rows to the bottom of the input search results (the main search).

The command aligns columns with the same field names and types. For column fields that differ between the main search and sub-search, NULL values are filled in the respective rows.

## Syntax

Use the following syntax:

`append <sub-search>`
* `sub-search`: mandatory. Executes PPL commands as a secondary search.
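
For instance, the following minimal sketch (a hypothetical query against the `accounts` index used throughout these examples) appends an overall count row below the per-gender counts; the `cnt` column aligns by name and type, and `gender` is null in the appended row:

```sql
source=accounts
| stats count() as cnt by gender
| append [ source=accounts | stats count() as cnt ]
```
{% include copy.html %}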
+
## Limitations

* **Schema Compatibility**: When fields with the same name exist between the main search and sub-search but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with `eval` or using `fields` to select non-conflicting columns).


## Example 1: Append rows from a count aggregation to existing search results

The following example appends rows from "count by gender" to "sum by gender, state".

```sql
source=accounts
| stats sum(age) by gender, state
| sort -`sum(age)`
| head 5
| append [ source=accounts | stats count(age) by gender ]
```
{% include copy.html %}

Expected output:

| sum(age) | gender | state | count(age) |
| --- | --- | --- | --- |
| 36 | M | TN | null |
| 33 | M | MD | null |
| 32 | M | IL | null |
| 28 | F | VA | null |
| null | F | null | 1 |
| null | M | null | 3 |


## Example 2: Append rows with merged column names

The following example appends rows from "sum by gender" to "sum by gender, state" with a merged column for the field with the same name and type.

```sql
source=accounts
| stats sum(age) as sum by gender, state
| sort -sum
| head 5
| append [ source=accounts | stats sum(age) as sum by gender ]
```
{% include copy.html %}

Expected output:

| sum | gender | state |
| --- | --- | --- |
| 36 | M | TN |
| 33 | M | MD |
| 32 | M | IL |
| 28 | F | VA |
| 28 | F | null |
| 101 | M | null |

\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/appendcol.md b/_sql-and-ppl/ppl/cmd/appendcol.md
new file mode 100644
index 00000000000..e8f8e208001
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/appendcol.md
@@ -0,0 +1,123 @@
---
layout: default
title: "appendcol"
parent: "Commands"
grand_parent: "PPL"
nav_order: 5
---
# appendcol


The `appendcol` command appends the result of a sub-search and attaches it alongside the input search results (the main search).

## Syntax

Use the following syntax:

`appendcol [override=<bool>] <sub-search>`
* `override=<bool>`: optional. Specifies whether columns from the main search results should be overwritten in the case of a column name conflict. **Default:** false.
* `sub-search`: mandatory. Executes PPL commands as a secondary search. The sub-search uses the same data specified in the source clause of the main search results as its input.


## Example 1: Append a count aggregation to existing search results

This example appends "count by gender" to "sum by gender, state".

```sql
source=accounts
| stats sum(age) by gender, state
| appendcol [ stats count(age) by gender ]
| head 10
```
{% include copy.html %}

Expected output:

| gender | state | sum(age) | count(age) |
| --- | --- | --- | --- |
| F | AK | 317 | 493 |
| F | AL | 397 | 507 |
| F | AR | 229 | NULL |
| F | AZ | 238 | NULL |
| F | CA | 282 | NULL |
| F | CO | 217 | NULL |
| F | CT | 147 | NULL |
| F | DC | 358 | NULL |
| F | DE | 101 | NULL |
| F | FL | 310 | NULL |


## Example 2: Append a count aggregation to existing search results with override option

This example appends "count by gender" to "sum by gender, state" with the override option.
+
```sql
source=accounts
| stats sum(age) by gender, state
| appendcol override=true [ stats count(age) by gender ]
| head 10
```
{% include copy.html %}

Expected output:

| gender | state | sum(age) | count(age) |
| --- | --- | --- | --- |
| F | AK | 317 | 493 |
| M | AL | 397 | 507 |
| F | AR | 229 | NULL |
| F | AZ | 238 | NULL |
| F | CA | 282 | NULL |
| F | CO | 217 | NULL |
| F | CT | 147 | NULL |
| F | DC | 358 | NULL |
| F | DE | 101 | NULL |
| F | FL | 310 | NULL |


## Example 3: Append multiple sub-search results

The following example PPL query shows how to use `appendcol` to chain multiple appendcol commands to add columns from different sub-searches.

```sql
source=employees
| fields name, dept, age
| appendcol [ stats avg(age) as avg_age ]
| appendcol [ stats max(age) as max_age ]
```
{% include copy.html %}

Expected output:

| name | dept | age | avg_age | max_age |
| --- | --- | --- | --- | --- |
| Lisa | Sales | 35 | 31.2222222222222 | 38 |
| Fred | Engineering | 28 | NULL | NULL |
| Paul | Engineering | 23 | NULL | NULL |
| Evan | Sales | 38 | NULL | NULL |
| Chloe | Engineering | 25 | NULL | NULL |
| Tom | Engineering | 33 | NULL | NULL |
| Alex | Sales | 33 | NULL | NULL |
| Jane | Marketing | 28 | NULL | NULL |
| Jeff | Marketing | 38 | NULL | NULL |


## Example 4: Override case of column name conflict

The following example PPL query demonstrates how to use `appendcol` with the override option when column names conflict between the main search and sub-search.

```sql
source=employees
| stats avg(age) as agg by dept
| appendcol override=true [ stats max(age) as agg by dept ]
```
{% include copy.html %}

Expected output:

| agg | dept |
| --- | --- |
| 38 | Sales |
| 38 | Engineering |
| 38 | Marketing |

\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/appendpipe.md b/_sql-and-ppl/ppl/cmd/appendpipe.md
new file mode 100644
index 00000000000..8893be0ee2f
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/appendpipe.md
@@ -0,0 +1,74 @@
---
layout: default
title: "appendpipe"
parent: "Commands"
grand_parent: "PPL"
nav_order: 6
---
# appendpipe


The `appendpipe` command appends the result of the subpipeline to the search results. Unlike a subsearch, the subpipeline is not run first. The subpipeline is run when the search reaches the appendpipe command.
The command aligns columns with the same field names and types. For column fields that differ between the main search and subpipeline, NULL values are filled in the respective rows.

## Syntax

Use the following syntax:

`appendpipe [<subpipeline>]`
* `subpipeline`: mandatory. A list of commands that are applied to the search results from the commands that occur in the search before the `appendpipe` command.


## Example 1: Append rows from a total count to existing search results

This example appends rows computed by the subpipeline ("total by gender") to the existing results ("part by gender, state").

```sql
source=accounts
| stats sum(age) as part by gender, state
| sort -part
| head 5
| appendpipe [ stats sum(part) as total by gender ]
```
{% include copy.html %}

Expected output:

| part | gender | state | total |
| --- | --- | --- | --- |
| 36 | M | TN | null |
| 33 | M | MD | null |
| 32 | M | IL | null |
| 28 | F | VA | null |
| null | F | null | 28 |
| null | M | null | 101 |


## Example 2: Append rows with merged column names

This example appends rows from "total by gender" to "total by gender, state", merging the columns with the same field name and type.
+
```sql
source=accounts
| stats sum(age) as total by gender, state
| sort -total
| head 5
| appendpipe [ stats sum(total) as total by gender ]
```
{% include copy.html %}

Expected output:

| total | gender | state |
| --- | --- | --- |
| 36 | M | TN |
| 33 | M | MD |
| 32 | M | IL |
| 28 | F | VA |
| 28 | F | null |
| 101 | M | null |


## Limitations

* **Schema Compatibility**: As with the `append` command, when fields with the same name exist between the main search and subpipeline but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with `eval` or using `fields` to select non-conflicting columns).
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/bin.md b/_sql-and-ppl/ppl/cmd/bin.md
new file mode 100644
index 00000000000..02dd7356560
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/bin.md
@@ -0,0 +1,423 @@
---
layout: default
title: "bin"
parent: "Commands"
grand_parent: "PPL"
nav_order: 7
---
# bin


The `bin` command groups numeric values into buckets of equal intervals, making it useful for creating histograms and analyzing data distribution. It takes a numeric or time-based field and generates a new field with values that represent the lower bound of each bucket.

## Syntax

Use the following syntax:

`bin <field> [span=<interval>] [minspan=<interval>] [bins=<count>] [aligntime=(earliest | latest | <timestamp>)] [start=<num>] [end=<num>]`
* `field`: mandatory. The field to bin. Accepts numeric or time-based fields.
* `span`: optional. The interval size for each bin. Cannot be used with the bins or minspan parameters.
  * Supports numeric (e.g., `1000`), logarithmic (e.g., `log10`, `2log10`), and time intervals
  * Available time units:
    * microsecond (us)
    * millisecond (ms)
    * centisecond (cs)
    * decisecond (ds)
    * second (s, sec, secs, second, seconds)
    * minute (m, min, mins, minute, minutes)
    * hour (h, hr, hrs, hour, hours)
    * day (d, day, days)
    * month (M, mon, month, months)
* `minspan`: optional. The minimum interval size for automatic span calculation. Cannot be used with the span or bins parameters.
* `bins`: optional. The maximum number of equal-width bins to create. Cannot be used with the span or minspan parameters. The bins parameter must be between 2 and 50000 (inclusive).

  **Limitation**: The bins parameter on timestamp fields has the following requirements:

  1. **Pushdown must be enabled**: Controlled by `plugins.calcite.pushdown.enabled` (enabled by default). When pushdown is disabled, use the `span` parameter instead (e.g., `bin @timestamp span=5m`).
  2. **Timestamp field must be used as an aggregation bucket**: The binned timestamp field must be used in a `stats` aggregation (e.g., `source=events | bin @timestamp bins=3 | stats count() by @timestamp`). Using bins on timestamp fields outside of aggregation buckets is not supported.
* `aligntime`: optional. Align the bin times for time-based fields. Valid only for time-based discretization. Options:
  * earliest: Align bins to the earliest timestamp in the data
  * latest: Align bins to the latest timestamp in the data
  * `<timestamp>`: Align bins to a specific epoch time value or time modifier expression
* `start`: optional. The starting value for the binning range. **Default:** minimum field value.
* `end`: optional. The ending value for the binning range. **Default:** maximum field value.
+ +**Parameter Behavior** +When multiple parameters are specified, priority order is: span > minspan > bins > start/end > default. +**Special Behaviors:** +* Logarithmic span (`log10`, `2log10`, etc.) creates logarithmic bin boundaries instead of linear +* Daily/monthly spans automatically align to calendar boundaries and return date strings (YYYY-MM-DD) instead of timestamps +* aligntime parameter only applies to time spans excluding days/months +* start/end parameters expand the range (never shrink) and affect bin width calculation + + +## Example 1: Basic numeric span + +```sql +source=accounts +| bin age span=10 +| fields age, account_number +| head 3 +``` +{% include copy.html %} + +Expected output: + +| age | account_number | +| --- | --- | +| 30-40 | 1 | +| 30-40 | 6 | +| 20-30 | 13 | + + +## Example 2: Large numeric span + +```sql +source=accounts +| bin balance span=25000 +| fields balance +| head 2 +``` +{% include copy.html %} + +Expected output: + +| balance | +| --- | +| 25000-50000 | +| 0-25000 | + + +## Example 3: Logarithmic span (log10) + +```sql +source=accounts +| bin balance span=log10 +| fields balance +| head 2 +``` +{% include copy.html %} + +Expected output: + +| balance | +| --- | +| 10000.0-100000.0 | +| 1000.0-10000.0 | + + +## Example 4: Logarithmic span with coefficient + +```sql +source=accounts +| bin balance span=2log10 +| fields balance +| head 3 +``` +{% include copy.html %} + +Expected output: + +| balance | +| --- | +| 20000.0-200000.0 | +| 2000.0-20000.0 | +| 20000.0-200000.0 | + + +## Example 5: Basic bins parameter + +```sql +source=time_test +| bin value bins=5 +| fields value +| head 3 +``` +{% include copy.html %} + +Expected output: + +| value | +| --- | +| 8000-9000 | +| 7000-8000 | +| 9000-10000 | + + +## Example 6: Low bin count + +```sql +source=accounts +| bin age bins=2 +| fields age +| head 1 +``` +{% include copy.html %} + +Expected output: + +| age | +| --- | +| 30-40 | + + +## Example 7: High bin count + +```sql +source=accounts +| bin age bins=21 +| fields age, account_number +| head 3 +``` +{% include copy.html %} + +Expected output: + +| age | account_number | +| --- | --- | +| 32-33 | 1 | +| 36-37 | 6 | +| 28-29 | 13 | + + +## Example 8: Basic minspan + +```sql +source=accounts +| bin age minspan=5 +| fields age, account_number +| head 3 +``` +{% include copy.html %} + +Expected output: + +| age | account_number | +| --- | --- | +| 30-40 | 1 | +| 30-40 | 6 | +| 20-30 | 13 | + + +## Example 9: Large minspan + +```sql +source=accounts +| bin age minspan=101 +| fields age +| head 1 +``` +{% include copy.html %} + +Expected output: + +| age | +| --- | +| 0-1000 | + + +## Example 10: Start and end range + +```sql +source=accounts +| bin age start=0 end=101 +| fields age +| head 1 +``` +{% include copy.html %} + +Expected output: + +| age | +| --- | +| 0-100 | + + +## Example 11: Large end range + +```sql +source=accounts +| bin balance start=0 end=100001 +| fields balance +| head 1 +``` +{% include copy.html %} + +Expected output: + +| balance | +| --- | +| 0-100000 | + + +## Example 12: Span with start/end + +```sql +source=accounts +| bin age span=1 start=25 end=35 +| fields age +| head 6 +``` +{% include copy.html %} + +Expected output: + +| age | +| --- | +| 32-33 | +| 36-37 | +| 28-29 | +| 33-34 | + + +## Example 13: Hour span + +```sql +source=time_test +| bin @timestamp span=1h +| fields @timestamp, value +| head 3 +``` +{% include copy.html %} + +Expected output: + +| @timestamp | value | +| --- | --- | +| 2025-07-28 00:00:00 | 
8945 |
| 2025-07-28 01:00:00 | 7623 |
| 2025-07-28 02:00:00 | 9187 |


## Example 14: Minute span

```sql
source=time_test
| bin @timestamp span=45minute
| fields @timestamp, value
| head 3
```
{% include copy.html %}

Expected output:

| @timestamp | value |
| --- | --- |
| 2025-07-28 00:00:00 | 8945 |
| 2025-07-28 01:30:00 | 7623 |
| 2025-07-28 02:15:00 | 9187 |


## Example 15: Second span

```sql
source=time_test
| bin @timestamp span=30seconds
| fields @timestamp, value
| head 3
```
{% include copy.html %}

Expected output:

| @timestamp | value |
| --- | --- |
| 2025-07-28 00:15:30 | 8945 |
| 2025-07-28 01:42:00 | 7623 |
| 2025-07-28 02:28:30 | 9187 |


## Example 16: Daily span

```sql
source=time_test
| bin @timestamp span=7day
| fields @timestamp, value
| head 3
```
{% include copy.html %}

Expected output:

| @timestamp | value |
| --- | --- |
| 2025-07-24 00:00:00 | 8945 |
| 2025-07-24 00:00:00 | 7623 |
| 2025-07-24 00:00:00 | 9187 |


## Example 17: Aligntime with time modifier

```sql
source=time_test
| bin @timestamp span=2h aligntime='@d+3h'
| fields @timestamp, value
| head 3
```
{% include copy.html %}

Expected output:

| @timestamp | value |
| --- | --- |
| 2025-07-27 23:00:00 | 8945 |
| 2025-07-28 01:00:00 | 7623 |
| 2025-07-28 01:00:00 | 9187 |


## Example 18: Aligntime with epoch timestamp

```sql
source=time_test
| bin @timestamp span=2h aligntime=1500000000
| fields @timestamp, value
| head 3
```
{% include copy.html %}

Expected output:

| @timestamp | value |
| --- | --- |
| 2025-07-27 22:40:00 | 8945 |
| 2025-07-28 00:40:00 | 7623 |
| 2025-07-28 00:40:00 | 9187 |


## Example 19: Default behavior (no parameters)

```sql
source=accounts
| bin age
| fields age, account_number
| head 3
```
{% include copy.html %}

Expected output:

| age | account_number |
| --- | --- |
| 32.0-33.0 | 1 |
| 36.0-37.0 | 6 |
| 28.0-29.0 | 13 |


## Example 20: Binning with string fields

```sql
source=accounts
| eval age_str = CAST(age AS STRING)
| bin age_str bins=3
| stats count() by age_str
| sort age_str
```
{% include copy.html %}

Expected output:

| count() | age_str |
| --- | --- |
| 1 | 20-30 |
| 3 | 30-40 |

\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/chart.md b/_sql-and-ppl/ppl/cmd/chart.md
new file mode 100644
index 00000000000..4b435959afb
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/chart.md
@@ -0,0 +1,190 @@
---
layout: default
title: "chart"
parent: "Commands"
grand_parent: "PPL"
nav_order: 8
---
# chart


The `chart` command transforms search results by applying a statistical aggregation function and optionally grouping the data by one or two fields. The results are suitable for visualization as a two-dimensional chart when grouping by two fields, where unique values in the second group key can be pivoted to column names.

## Syntax

Use the following syntax:

`chart [limit=(top|bottom)<int>] [useother=<bool>] [usenull=<bool>] [nullstr=<string>] [otherstr=<string>] <aggregation_function> [by <row-split>[, <column-split>]] | [over <row-split>] [by <column-split>]`
* `limit`: optional. Specifies the number of categories to display when using column split. Each unique value in the column split field represents a category. **Default:** top10.
  * Syntax: `limit=(top|bottom)<int>` or `limit=<int>` (defaults to top)
  * When `limit=K` is set, the top or bottom K categories from the column split field are retained; the remaining categories are grouped into an "OTHER" category if `useother` is not set to false.
  * Set limit to 0 to show all categories without any limit.
  * Use `limit=topK` or `limit=bottomK` to specify whether to retain the top or bottom K column categories. The ranking is based on the sum of aggregated values for each column category. For example, `chart limit=top3 count() by region, product` keeps the 3 products with the highest total counts across all regions. If not specified, top is used by default.
  * Only applies when a column split is present (grouping by two fields, or using `over ... by ...`).
* `useother`: optional. Controls whether to create an "OTHER" category for categories beyond the limit. **Default:** true
  * When set to false, only the top/bottom N categories (based on limit) are shown without an "OTHER" category.
  * When set to true, categories beyond the limit are grouped into an "OTHER" category.
  * Only applies when using column split and when there are more categories than the limit.
* `usenull`: optional. Controls whether to group events without a column split (i.e. whose column split is null) into a separate "NULL" category. **Default:** true
  * `usenull` only applies to the column split.
  * The row split should always be a non-null value. Documents with null values in the row split are ignored.
  * When `usenull=false`, events with a null column split are excluded from results.
  * When `usenull=true`, events with a null column split are grouped into a separate "NULL" category.
* `nullstr`: optional. Specifies the category name for rows that do not contain the column split value. **Default:** "NULL"
  * Only applies when `usenull` is set to true.
* `otherstr`: optional. Specifies the category name for the "OTHER" category. **Default:** "OTHER"
  * Only applies when `useother` is set to true and there are values beyond the limit.
* `aggregation_function`: mandatory. The aggregation function to apply to the data.
  * Currently, only a single aggregation function is supported.
  * Available functions: aggregation functions supported by the stats command.
* `by`: optional. Groups the results by either one field (row split) or two fields (row split and column split).
  * `limit`, `useother`, and `usenull` apply to the column split.
  * Results are returned as individual rows for each combination.
  * If not specified, the aggregation is performed across all documents.
* `over ... by ...`: optional. Alternative syntax for grouping by multiple fields.
  * `over <row-split> by <column-split>` groups the results by both fields.
  * Using `over` alone on one field is equivalent to `by <field>`.


## Notes

* The fields generated by column splitting are converted to strings so that they are compatible with `nullstr` and `otherstr` and can be used as column names once pivoted.
* Documents with null values in fields used by the aggregation function are excluded from aggregation. For example, in `chart avg(balance) over deptno, group`, documents where `balance` is null are excluded from the average calculation.
* The aggregation metric appears as the last column in the result. Result columns are ordered as: [row-split] [column-split] [aggregation-metrics].


## Example 1: Basic aggregation without grouping

This example calculates the average balance across all accounts.

```sql
source=accounts
| chart avg(balance)
```
{% include copy.html %}

Expected output:

| avg(balance) |
| --- |
| 20482.25 |


## Example 2: Group by single field

This example calculates the count of accounts grouped by gender.
+
```sql
source=accounts
| chart count() by gender
```
{% include copy.html %}

Expected output:

| gender | count() |
| --- | --- |
| F | 1 |
| M | 3 |


## Example 3: Using over and by for multiple field grouping

The following example PPL query shows how to use `chart` to calculate the average balance grouped by both the gender and age fields. Note that the age column in the result is converted to string type.

```sql
source=accounts
| chart avg(balance) over gender by age
```
{% include copy.html %}

Expected output:

| gender | age | avg(balance) |
| --- | --- | --- |
| F | 28 | 32838.0 |
| M | 32 | 39225.0 |
| M | 33 | 4180.0 |
| M | 36 | 5686.0 |


## Example 4: Using basic limit functionality

This example limits the results to show only the top 1 age group. Note that the age column in the result is converted to string type.

```sql
source=accounts
| chart limit=1 count() over gender by age
```
{% include copy.html %}

Expected output:

| gender | age | count() |
| --- | --- | --- |
| F | OTHER | 1 |
| M | 33 | 1 |
| M | OTHER | 2 |


## Example 5: Using limit with other parameters

The following example PPL query shows how to use `chart` with the limit, useother, and custom otherstr parameters.

```sql
source=accounts
| chart limit=top1 useother=true otherstr='minor_gender' count() over state by gender
```
{% include copy.html %}

Expected output:

| state | gender | count() |
| --- | --- | --- |
| IL | M | 1 |
| MD | M | 1 |
| TN | M | 1 |
| VA | minor_gender | 1 |


## Example 6: Using null parameters

The following example PPL query shows how to use `chart` with the usenull and custom nullstr parameters.

```sql
source=accounts
| chart usenull=true nullstr='employer not specified' count() over firstname by employer
```
{% include copy.html %}

Expected output:

| firstname | employer | count() |
| --- | --- | --- |
| Amber | Pyrami | 1 |
| Dale | employer not specified | 1 |
| Hattie | Netagy | 1 |
| Nanette | Quility | 1 |


## Example 7: Using chart command with span

The following example PPL query demonstrates how to use `chart` with span for grouping age ranges.

```sql
source=accounts
| chart max(balance) by age span=10, gender
```
{% include copy.html %}

Expected output:

| age | gender | max(balance) |
| --- | --- | --- |
| 20 | F | 32838 |
| 30 | M | 39225 |


## Limitations

* Only a single aggregation function is supported per chart command.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/dedup.md b/_sql-and-ppl/ppl/cmd/dedup.md
new file mode 100644
index 00000000000..4a95b2dd92c
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/dedup.md
@@ -0,0 +1,128 @@
---
layout: default
title: "dedup"
parent: "Commands"
grand_parent: "PPL"
nav_order: 9
---
# dedup


The `dedup` command removes duplicate documents, defined by the specified fields, from the search result.

## Syntax

Use the following syntax:

`dedup [int] <field-list> [keepempty=<bool>] [consecutive=<bool>]`
* `int`: optional. The `dedup` command retains multiple events for each combination when you specify `<int>`. The number for `<int>` must be greater than 0. All other duplicates are removed from the results. **Default:** 1
* `keepempty`: optional. If set to true, keeps documents in which any field in the field list has a NULL value or is MISSING. **Default:** false.
* `consecutive`: optional. If set to true, removes only events with duplicate combinations of values that are consecutive. **Default:** false.
+* `field-list`: mandatory. The comma-delimited field list. At least one field is required.


## Example 1: Dedup by one field

The following example PPL query shows how to use `dedup` to remove duplicate documents based on the `gender` field:

```sql
source=accounts
| dedup gender
| fields account_number, gender
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | gender |
| --- | --- |
| 1 | M |
| 13 | F |


## Example 2: Keep two duplicate documents

The following example PPL query shows how to use `dedup` to remove duplicate documents based on the `gender` field while keeping two duplicates:

```sql
source=accounts
| dedup 2 gender
| fields account_number, gender
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | gender |
| --- | --- |
| 1 | M |
| 6 | M |
| 13 | F |


## Example 3: Keep or ignore empty fields by default

The following example PPL query shows how to use `dedup` to remove duplicate documents while keeping documents with null values in the specified field:

```sql
source=accounts
| dedup email keepempty=true
| fields account_number, email
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | email |
| --- | --- |
| 1 | amberduke@pyrami.com |
| 6 | hattiebond@netagy.com |
| 13 | null |
| 18 | daleadams@boink.com |

The following example PPL query shows how to use `dedup` to remove duplicate documents while ignoring documents with empty values in the specified field:

```sql
source=accounts
| dedup email
| fields account_number, email
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | email |
| --- | --- |
| 1 | amberduke@pyrami.com |
| 6 | hattiebond@netagy.com |
| 18 | daleadams@boink.com |


## Example 4: Dedup in consecutive documents

The following example PPL query shows how to use `dedup` to remove duplicate consecutive documents:

```sql
source=accounts
| dedup gender consecutive=true
| fields account_number, gender
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | gender |
| --- | --- |
| 1 | M |
| 13 | F |
| 18 | M |


## Limitations

The `dedup` command with `consecutive=true` only works when `plugins.calcite.enabled` is `false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/describe.md b/_sql-and-ppl/ppl/cmd/describe.md
new file mode 100644
index 00000000000..d3bf02a5112
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/describe.md
@@ -0,0 +1,71 @@
---
layout: default
title: "describe"
parent: "Commands"
grand_parent: "PPL"
nav_order: 10
---
# describe


The `describe` command queries the metadata of an index. The `describe` command can only be used as the first command in a PPL query.

## Syntax

Use the following syntax:

`describe [dataSource.][schema.]<tablename>`
* `dataSource`: optional. If `dataSource` is not provided, it resolves to the OpenSearch data source.
* `schema`: optional. If `schema` is not provided, it resolves to the default schema.
* `tablename`: mandatory. The `describe` command must specify which table to query.


## Example 1: Fetch all the metadata

This example describes the accounts index.
+
```sql
describe accounts
```
{% include copy.html %}

Expected output:

| TABLE_CAT | TABLE_SCHEM | TABLE_NAME | COLUMN_NAME | DATA_TYPE | TYPE_NAME | COLUMN_SIZE | BUFFER_LENGTH | DECIMAL_DIGITS | NUM_PREC_RADIX | NULLABLE | REMARKS | COLUMN_DEF | SQL_DATA_TYPE | SQL_DATETIME_SUB | CHAR_OCTET_LENGTH | ORDINAL_POSITION | IS_NULLABLE | SCOPE_CATALOG | SCOPE_SCHEMA | SCOPE_TABLE | SOURCE_DATA_TYPE | IS_AUTOINCREMENT | IS_GENERATEDCOLUMN |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| docTestCluster | null | accounts | account_number | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 0 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | firstname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 1 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | address | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 2 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | balance | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 3 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | gender | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 4 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | city | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 5 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | employer | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 6 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | state | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 7 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | age | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 8 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | email | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 9 | | null | null | null | null | NO | |
| docTestCluster | null | accounts | lastname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 10 | | null | null | null | null | NO | |


## Example 2: Fetch metadata with condition and filter

This example retrieves columns with type bigint in the accounts index.

```sql
describe accounts
| where TYPE_NAME="bigint"
| fields COLUMN_NAME
```
{% include copy.html %}

Expected output:

| COLUMN_NAME |
| --- |
| account_number |
| balance |
| age |


## Example 3: Fetch metadata for table in Prometheus datasource

See [Fetch metadata for table in Prometheus datasource]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/admin/datasources/) for more context.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/eval.md b/_sql-and-ppl/ppl/cmd/eval.md
new file mode 100644
index 00000000000..8addfb25b5d
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/eval.md
@@ -0,0 +1,127 @@
---
layout: default
title: "eval"
parent: "Commands"
grand_parent: "PPL"
nav_order: 11
---
# eval


The `eval` command evaluates the expression and appends the result to the search result.

## Syntax

Use the following syntax:

`eval <field> = <expression> ["," <field> = <expression>]...`
* `field`: mandatory.
If the field name does not exist, a new field is added. If the field name already exists, it will be overridden.
* `expression`: mandatory. Any expression supported by the system.


## Example 1: Create a new field

The following example PPL query shows how to use `eval` to create a new field for each document. In this example, the new field is `doubleAge`.

```sql
source=accounts
| eval doubleAge = age * 2
| fields age, doubleAge
```
{% include copy.html %}

Expected output:

| age | doubleAge |
| --- | --- |
| 32 | 64 |
| 36 | 72 |
| 28 | 56 |
| 33 | 66 |


## Example 2: Override an existing field

The following example PPL query shows how to use `eval` to override an existing field. In this example, the existing field `age` is overridden by the `age` field plus 1.

```sql
source=accounts
| eval age = age + 1
| fields age
```
{% include copy.html %}

Expected output:

| age |
| --- |
| 33 |
| 37 |
| 29 |
| 34 |


## Example 3: Create a new field with field defined in eval

The following example PPL query shows how to use `eval` to create a new field based on fields defined in the `eval` expression. In this example, the new field `ddAge` is the evaluation result of the `doubleAge` field multiplied by 2. `doubleAge` is defined in the `eval` command.

```sql
source=accounts
| eval doubleAge = age * 2, ddAge = doubleAge * 2
| fields age, doubleAge, ddAge
```
{% include copy.html %}

Expected output:

| age | doubleAge | ddAge |
| --- | --- | --- |
| 32 | 64 | 128 |
| 36 | 72 | 144 |
| 28 | 56 | 112 |
| 33 | 66 | 132 |


## Example 4: String concatenation

The following example PPL query shows how to use the `+` operator for string concatenation. You can concatenate string literals and field values.

```sql
source=accounts
| eval greeting = 'Hello ' + firstname
| fields firstname, greeting
```
{% include copy.html %}

Expected output:

| firstname | greeting |
| --- | --- |
| Amber | Hello Amber |
| Hattie | Hello Hattie |
| Nanette | Hello Nanette |
| Dale | Hello Dale |


## Example 5: Multiple string concatenation with type casting

The following example PPL query shows multiple concatenations with type casting from numeric to string.

```sql
source=accounts
| eval full_info = 'Name: ' + firstname + ', Age: ' + CAST(age AS STRING)
| fields firstname, age, full_info
```
{% include copy.html %}

Expected output:

| firstname | age | full_info |
| --- | --- | --- |
| Amber | 32 | Name: Amber, Age: 32 |
| Hattie | 36 | Name: Hattie, Age: 36 |
| Nanette | 28 | Name: Nanette, Age: 28 |
| Dale | 33 | Name: Dale, Age: 33 |


## Limitations

The `eval` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/eventstats.md b/_sql-and-ppl/ppl/cmd/eventstats.md
new file mode 100644
index 00000000000..8429c344b6f
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/eventstats.md
@@ -0,0 +1,164 @@
---
layout: default
title: "eventstats"
parent: "Commands"
grand_parent: "PPL"
nav_order: 12
---
# eventstats


The `eventstats` command enriches your event data with calculated summary statistics. It operates by analyzing specified fields within your events, computing various statistical measures, and then appending these results as new fields to each original event.
Key aspects of `eventstats`:
1.
It performs calculations across the entire search results or within defined groups.
2. The original events remain intact, with new fields added to contain the statistical results.
3. The command is particularly useful for comparative analysis, identifying outliers, or providing additional context to individual events.

**Difference between `stats` and `eventstats`**
The `stats` and `eventstats` commands are both used for calculating statistics, but they have some key differences in how they operate and what they produce:
* Output Format
  * `stats`: Produces a summary table with only the calculated statistics.
  * `eventstats`: Adds the calculated statistics as new fields to the existing events, preserving the original data.
* Event Retention
  * `stats`: Reduces the search results to only the statistical summary, discarding individual events.
  * `eventstats`: Retains all original events and adds new fields with the calculated statistics.
* Use Cases
  * `stats`: Best for creating summary reports or dashboards. Often used as a final command to summarize results.
  * `eventstats`: Useful when you need to enrich events with statistical context for further analysis or filtering. It can be used mid-search to add statistics that can be used in subsequent commands.


## Syntax

Use the following syntax:

`eventstats [bucket_nullable=bool] <function>... [by-clause]`
* `function`: mandatory. An aggregation function or window function.
* `bucket_nullable`: optional. Controls whether the eventstats command considers null buckets as a valid group in group-by aggregations. When set to `false`, it does not treat null group-by values as a distinct group during aggregation. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
  * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true`
  * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false`
* `by-clause`: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** aggregation over the entire search results.
* `span-expression`: optional, at most one. Splits a field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, `span(age, 10)` creates 10-year age buckets, and `span(timestamp, 1h)` creates hourly buckets.
  * Available time units:
    * millisecond (ms)
    * second (s)
    * minute (m, case sensitive)
    * hour (h)
    * day (d)
    * week (w)
    * month (M, case sensitive)
    * quarter (q)
    * year (y)

## Aggregation functions

The eventstats command supports the following aggregation functions:
* `COUNT`: Count of values
* `SUM`: Sum of numeric values
* `AVG`: Average of numeric values
* `MAX`: Maximum value
* `MIN`: Minimum value
* `VAR_SAMP`: Sample variance
* `VAR_POP`: Population variance
* `STDDEV_SAMP`: Sample standard deviation
* `STDDEV_POP`: Population standard deviation
* `DISTINCT_COUNT`/`DC`: Distinct count of values
* `EARLIEST`: Earliest value by timestamp
* `LATEST`: Latest value by timestamp

For detailed documentation of each function, see [Aggregation Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/aggregations/).
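
As a hedged sketch of the time-based span syntax described above (assuming a hypothetical `events` index with a `@timestamp` field), the following counts events within hourly buckets and attaches that count to every event:

```sql
source=events
| eventstats count() as hourly_events by span(@timestamp, 1h)
```
{% include copy.html %}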

## Usage

Eventstats

```sql
source = table | eventstats avg(a)
source = table | where a < 50 | eventstats count(c)
source = table | eventstats min(c), max(c) by b
source = table | eventstats count(c) as count_by by b | where count_by > 1000
source = table | eventstats dc(field) as distinct_count
source = table | eventstats distinct_count(category) by region
```
{% include copy.html %}


## Example 1: Calculate the average, sum, and count of a field by group

The following example PPL query shows how to use `eventstats` to calculate the average age, sum of age, and count of events for all accounts grouped by gender.

```sql
source=accounts
| fields account_number, gender, age
| eventstats avg(age), sum(age), count() by gender
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | gender | age | avg(age) | sum(age) | count() |
| --- | --- | --- | --- | --- | --- |
| 1 | M | 32 | 33.666666666666664 | 101 | 3 |
| 6 | M | 36 | 33.666666666666664 | 101 | 3 |
| 13 | F | 28 | 28.0 | 28 | 1 |
| 18 | M | 33 | 33.666666666666664 | 101 | 3 |


## Example 2: Calculate the count by gender and span

The following example PPL query shows how to use `eventstats` to count events by age intervals of 5 years, grouped by gender.

```sql
source=accounts
| fields account_number, gender, age
| eventstats count() as cnt by span(age, 5) as age_span, gender
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | gender | age | cnt |
| --- | --- | --- | --- |
| 1 | M | 32 | 2 |
| 6 | M | 36 | 1 |
| 13 | F | 28 | 1 |
| 18 | M | 33 | 2 |


## Example 3: Null buckets handling

The following example PPL queries show how the `bucket_nullable` option affects grouping when the group-by field contains null values. With `bucket_nullable=false`, rows with a null `employer` value are not treated as a group, so their `cnt` is null:

```sql
source=accounts
| eventstats bucket_nullable=false count() as cnt by employer
| fields account_number, firstname, employer, cnt
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | firstname | employer | cnt |
| --- | --- | --- | --- |
| 1 | Amber | Pyrami | 1 |
| 6 | Hattie | Netagy | 1 |
| 13 | Nanette | Quility | 1 |
| 18 | Dale | null | null |

With `bucket_nullable=true`, null values form their own group:

```sql
source=accounts
| eventstats bucket_nullable=true count() as cnt by employer
| fields account_number, firstname, employer, cnt
| sort account_number
```
{% include copy.html %}

Expected output:

| account_number | firstname | employer | cnt |
| --- | --- | --- | --- |
| 1 | Amber | Pyrami | 1 |
| 6 | Hattie | Netagy | 1 |
| 13 | Nanette | Quility | 1 |
| 18 | Dale | null | 1 |
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/expand.md b/_sql-and-ppl/ppl/cmd/expand.md
new file mode 100644
index 00000000000..b87881c172c
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/expand.md
@@ -0,0 +1,57 @@
---
layout: default
title: "expand"
parent: "Commands"
grand_parent: "PPL"
nav_order: 13
---
# expand


The `expand` command transforms a single document with a nested array field into multiple documents, each containing one element from the array. All other fields in the original document are duplicated across the resulting documents.
Key aspects of `expand`:
* It generates one row per element in the specified array field.
* The specified array field is converted into individual rows.
* If an alias is provided, the expanded values appear under the alias instead of the original field name.
* If the specified field is an empty array, the row is retained with the expanded field set to null.


## Syntax

Use the following syntax:

`expand <field> [as alias]`
* `field`: mandatory.
The field to be expanded (exploded). Currently only nested arrays are supported.
* `alias`: optional. The name to use instead of the original field name.


## Example 1: Expand address field with an alias

Given a dataset `migration` with the following data:

```text
{"name":"abbas","age":24,"address":[{"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}}]}
{"name":"chen","age":32,"address":[{"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}},{"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}}]}
```

The following query expands the `address` field and renames it to `addr`:

```sql
source=migration
| expand address as addr
```
{% include copy.html %}

Expected output:

| name | age | addr |
| --- | --- | --- |
| abbas | 24 | {"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}} |
| chen | 32 | {"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}} |
| chen | 32 | {"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}} |


## Limitations

* The `expand` command currently only supports nested arrays. Primitive fields storing arrays are not supported. For example, a string field storing an array of strings cannot be expanded with the current implementation.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/explain.md b/_sql-and-ppl/ppl/cmd/explain.md
new file mode 100644
index 00000000000..06b28b15ea6
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/explain.md
@@ -0,0 +1,200 @@
---
layout: default
title: "explain"
parent: "Commands"
grand_parent: "PPL"
nav_order: 14
---
# explain


The `explain` command displays the execution plan of a query and is often used for query translation and troubleshooting. The `explain` command can only be used as the first command in a PPL query.

## Syntax

Use the following syntax:

`explain [<mode>] <queryStatement>`
* `mode`: optional. There are four explain modes: `simple`, `standard`, `cost`, and `extended`. **Default:** `standard`.
  * standard: The default mode. Displays the logical and physical plans with pushdown information (DSL).
  * simple: Displays the logical plan tree without attributes.
  * cost: Displays the standard information plus plan cost attributes.
  * extended: Displays the standard information plus generated code.
* `queryStatement`: mandatory. A PPL query to explain.


## Example 1: Explain a PPL query in v2 engine

When Calcite is disabled (`plugins.calcite.enabled=false`), explaining a PPL query returns the physical plan of the v2 engine together with its pushdown information.
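If you want to reproduce the v2 output below, Calcite can be switched off first. The following is a sketch that assumes the same `_plugins/_query/settings` endpoint used for other PPL settings in this documentation; `plugins.calcite.enabled` is the setting named above:

```bash
curl -sS -H 'Content-Type: application/json' \
-X PUT localhost:9200/_plugins/_query/settings \
-d '{"transient" : {"plugins.calcite.enabled" : "false"}}'
```
{% include copy.html %}

With the setting disabled, the following query returns the v2 physical plan.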
+ +```sql +explain source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` +{% include copy.html %} + +Explain: + +```json +{ + "root": { + "name": "ProjectOperator", + "description": { + "fields": "[count(), country]" + }, + "children": [ + { + "name": "OpenSearchIndexScan", + "description": { + "request": """OpenSearchQueryRequest(indexName=state_country, sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"should":[{"term":{"country":{"value":"USA","boost":1.0}}},{"term":{"country":{"value":"England","boost":1.0}}}],"adjust_pure_negative":true,"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, pitId=null, cursorKeepAlive=null, searchAfter=null, searchResponse=null)""" + }, + "children": [] + } + ] + } +} +``` + + +## Example 2: Explain a PPL query in v3 engine + +When Calcite is enabled (`plugins.calcite.enabled=true`), explaining a PPL query will get its logical and physical plan of v3 engine and pushdown information. + +```sql +explain source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` +{% include copy.html %} + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]) + LogicalAggregate(group=[{1}], count()=[COUNT()]) + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#53:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/input=RelSubset#43,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) +""" + } +} +``` + + +## Example 3: Explain a PPL query with simple mode + +When Calcite is enabled (`plugins.calcite.enabled=true`), you can explain a PPL query with the "simple" mode. + +```sql +explain simple source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` +{% include copy.html %} + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject + LogicalAggregate + LogicalFilter + CalciteLogicalIndexScan +""" + } +} +``` + + +## Example 4: Explain a PPL query with cost mode + +When Calcite is enabled (`plugins.calcite.enabled=true`), you can explain a PPL query with the "cost" mode. 
+ +```sql +explain cost source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` +{% include copy.html %} + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]): rowcount = 2.5, cumulative cost = {130.3125 rows, 206.0 cpu, 0.0 io}, id = 75 + LogicalAggregate(group=[{1}], count()=[COUNT()]): rowcount = 2.5, cumulative cost = {127.8125 rows, 201.0 cpu, 0.0 io}, id = 74 + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]): rowcount = 25.0, cumulative cost = {125.0 rows, 201.0 cpu, 0.0 io}, id = 73 + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 72 +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]): rowcount = 100.0, cumulative cost = {200.0 rows, 501.0 cpu, 0.0 io}, id = 138 + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#125:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/input=RelSubset#115,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 133 +""" + } +} +``` + + +## Example 5: Explain a PPL query with extended mode + +```sql +explain extended source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` +{% include copy.html %} + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]) + LogicalAggregate(group=[{1}], count()=[COUNT()]) + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#193:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/input=RelSubset#183,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) +""", + "extended": """public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) { + final org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan v1stashed = (org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan) root.get("v1stashed"); + final org.apache.calcite.linq4j.Enumerable _inputEnumerable = v1stashed.scan(); + return new org.apache.calcite.linq4j.AbstractEnumerable(){ + public 
org.apache.calcite.linq4j.Enumerator enumerator() {
      return new org.apache.calcite.linq4j.Enumerator(){
          public final org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator();
          public void reset() {
            inputEnumerator.reset();
          }
          public boolean moveNext() {
            return inputEnumerator.moveNext();
          }
          public void close() {
            inputEnumerator.close();
          }
          public Object current() {
            final Object[] current = (Object[]) inputEnumerator.current();
            final Object input_value = current[1];
            final Object input_value0 = current[0];
            return new Object[] {
                input_value,
                input_value0};
          }
        };
    }
  };
}
public Class getElementType() {
  return java.lang.Object[].class;
}
"""
  }
}
```
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/fields.md b/_sql-and-ppl/ppl/cmd/fields.md
new file mode 100644
index 00000000000..55c426aafea
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/fields.md
@@ -0,0 +1,224 @@
---
layout: default
title: "fields"
parent: "Commands"
grand_parent: "PPL"
nav_order: 15
---
# fields


The `fields` command specifies the fields that should be included in or excluded from the search results.

## Syntax

Use the following syntax:

`fields [+|-] <field-list>`
* `+|-`: optional. If the plus (+) is used, only the fields specified in the field list will be included. If the minus (-) is used, all the fields specified in the field list will be excluded. **Default:** `+`.
* `field-list`: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns.


## Example 1: Select specified fields from the search result

The following example PPL query shows how to retrieve the `account_number`, `firstname`, and `lastname` fields from the search results:

```sql
source=accounts
| fields account_number, firstname, lastname
```
{% include copy.html %}

Expected output:

| account_number | firstname | lastname |
| --- | --- | --- |
| 1 | Amber | Duke |
| 6 | Hattie | Bond |
| 13 | Nanette | Bates |
| 18 | Dale | Adams |


## Example 2: Remove specified fields from the search results

The following example PPL query shows how to remove the `account_number` field from the search results:

```sql
source=accounts
| fields account_number, firstname, lastname
| fields - account_number
```
{% include copy.html %}

Expected output:

| firstname | lastname |
| --- | --- |
| Amber | Duke |
| Hattie | Bond |
| Nanette | Bates |
| Dale | Adams |


## Example 3: Space-delimited field selection

Fields can be specified using spaces instead of commas, providing a more concise syntax.
**Syntax**: `fields field1 field2 field3`

```sql
source=accounts
| fields firstname lastname age
```
{% include copy.html %}

Expected output:

| firstname | lastname | age |
| --- | --- | --- |
| Amber | Duke | 32 |
| Hattie | Bond | 36 |
| Nanette | Bates | 28 |
| Dale | Adams | 33 |


## Example 4: Prefix wildcard pattern

Select fields starting with a pattern using prefix wildcards.

```sql
source=accounts
| fields account*
```
{% include copy.html %}

Expected output:

| account_number |
| --- |
| 1 |
| 6 |
| 13 |
| 18 |


## Example 5: Suffix wildcard pattern

Select fields ending with a pattern using suffix wildcards.

```sql
source=accounts
| fields *name
```
{% include copy.html %}

Expected output:

| firstname | lastname |
| --- | --- |
| Amber | Duke |
| Hattie | Bond |
| Nanette | Bates |
| Dale | Adams |


## Example 6: Contains wildcard pattern

Select fields containing a pattern using contains wildcards.

```sql
source=accounts
| fields *a*
| head 1
```
{% include copy.html %}

Expected output:

| account_number | firstname | address | balance | state | age | email | lastname |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Amber | 880 Holmes Lane | 39225 | IL | 32 | amberduke@pyrami.com | Duke |


## Example 7: Mixed delimiter syntax

Combine spaces and commas for flexible field specification.

```sql
source=accounts
| fields firstname, account* *name
```
{% include copy.html %}

Expected output:

| firstname | account_number | lastname |
| --- | --- | --- |
| Amber | 1 | Duke |
| Hattie | 6 | Bond |
| Nanette | 13 | Bates |
| Dale | 18 | Adams |


## Example 8: Field deduplication

Automatically prevents duplicate columns when wildcards expand to already specified fields.

```sql
source=accounts
| fields firstname, *name
```
{% include copy.html %}

Expected output:

| firstname | lastname |
| --- | --- |
| Amber | Duke |
| Hattie | Bond |
| Nanette | Bates |
| Dale | Adams |

Note: Even though `firstname` is explicitly specified and would also match `*name`, it appears only once due to automatic deduplication.

## Example 9: Full wildcard selection

Select all available fields using `*` or `` `*` ``. This selects all fields defined in the index schema, including fields that may contain null values.

```sql
source=accounts
| fields `*`
| head 1
```
{% include copy.html %}

Expected output:

| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |

Note: The `*` wildcard selects fields based on the index schema, not on data content. Fields with null values are included in the result set. Use backticks (`` `*` ``) if the plain `*` doesn't return all expected fields.

## Example 10: Wildcard exclusion

Remove fields using wildcard patterns with the minus (-) operator.

```sql
source=accounts
| fields - *name
```
{% include copy.html %}

Expected output:

| account_number | address | balance | gender | city | employer | state | age | email |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com |
| 6 | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com |
| 13 | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null |
| 18 | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com |


## See also

- [table]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/table/) - Alias command with identical functionality
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/fillnull.md b/_sql-and-ppl/ppl/cmd/fillnull.md
new file mode 100644
index 00000000000..f5432897cc0
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/fillnull.md
@@ -0,0 +1,169 @@
---
layout: default
title: "fillnull"
parent: "Commands"
grand_parent: "PPL"
nav_order: 16
---
# fillnull


The `fillnull` command fills null values with the provided value in one or more fields in the search results.

## Syntax

Use one of the following syntax options:

`fillnull with <replacement> [in <field-list>]`
`fillnull using <field> = <replacement> [, <field> = <replacement>]`
`fillnull value=<replacement> [<field-list>]`
* `replacement`: mandatory. The value used to replace null values.
* `field-list`: optional. List of fields to apply the replacement to. It can be comma-delimited (with the `with` or `using` syntax) or space-delimited (with the `value=` syntax). **Default:** all fields.
* `field`: mandatory when using the `using` syntax. Individual field name to assign a specific replacement value.
* **Syntax variations**
  * `with <replacement> in <field-list>` - Apply the same value to the specified fields
  * `using <field> = <replacement>, ...` - Apply different values to different fields
  * `value=<replacement> [<field-list>]` - Alternative syntax with an optional space-delimited field list


## Example 1: Replace null values with a specified value on one field

The following example PPL query shows how to use `fillnull` to replace null values in the `email` field with an empty string (`''`).

```sql
source=accounts
| fields email, employer
| fillnull with '' in email
```
{% include copy.html %}

Expected output:

| email | employer |
| --- | --- |
| amberduke@pyrami.com | Pyrami |
| hattiebond@netagy.com | Netagy |
| | Quility |
| daleadams@boink.com | null |


## Example 2: Replace null values with a specified value on multiple fields

The following example PPL query shows how to use `fillnull` to replace null values in both the `email` and `employer` fields with the same replacement value (`''`).

```sql
source=accounts
| fields email, employer
| fillnull with '' in email, employer
```
{% include copy.html %}

Expected output:

| email | employer |
| --- | --- |
| amberduke@pyrami.com | Pyrami |
| hattiebond@netagy.com | Netagy |
| | Quility |
| daleadams@boink.com | |


## Example 3: Replace null values with a specified value on all fields

The following example PPL query shows how to use `fillnull` to replace null values in all fields when no field list is specified.
+ +```sql +source=accounts +| fields email, employer +| fillnull with '' +``` +{% include copy.html %} + +Expected output: + +| email | employer | +| --- | --- | +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| | Quility | +| daleadams@boink.com | | + + +## Example 4: Replace null values with multiple specified values on multiple fields + +The following example PPL query shows how to use `fillnull` with different replacement values for different fields using the 'using' syntax. + +```sql +source=accounts +| fields email, employer +| fillnull using email = '', employer = '' +``` +{% include copy.html %} + +Expected output: + +| email | employer | +| --- | --- | +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| | Quility | +| daleadams@boink.com | | + + +## Example 5: Replace null with specified value on specific fields (value= syntax) + +The following example PPL query shows how to use `fillnull` with the alternative 'value=' syntax to replace null values in specific fields. + +```sql +source=accounts +| fields email, employer +| fillnull value="" email employer +``` +{% include copy.html %} + +Expected output: + +| email | employer | +| --- | --- | +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| | Quility | +| daleadams@boink.com | | + + +## Example 6: Replace null with specified value on all fields (value= syntax) + +When no field list is specified, the replacement applies to all fields in the result. + +```sql +source=accounts +| fields email, employer +| fillnull value='' +``` +{% include copy.html %} + +Expected output: + +| email | employer | +| --- | --- | +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| | Quility | +| daleadams@boink.com | | + + +## Limitations + +* The `fillnull` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node. +* When applying the same value to all fields without specifying field names, all fields must be the same type. For mixed types, use separate fillnull commands or explicitly specify fields. +* The replacement value type must match ALL field types in the field list. When applying the same value to multiple fields, all fields must be the same type (all strings or all numeric). + + **Example:** + +```sql + # This FAILS - same value for mixed-type fields + source=accounts | fillnull value=0 firstname, age + # ERROR: fillnull failed: replacement value type INTEGER is not compatible with field 'firstname' (type: VARCHAR). The replacement value type must match the field type. +``` +{% include copy.html %} + \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/flatten.md b/_sql-and-ppl/ppl/cmd/flatten.md new file mode 100644 index 00000000000..fd4e8a9f579 --- /dev/null +++ b/_sql-and-ppl/ppl/cmd/flatten.md @@ -0,0 +1,101 @@ +--- +layout: default +title: "flatten" +parent: "Commands" +grand_parent: "PPL" +nav_order: 17 +--- +# flatten + + +The `flatten` command flattens a struct or an object field into separate fields in a document. + +The flattened fields will be ordered **lexicographically** by their original key names in the struct. For example, if the struct has keys `b`, `c` and `Z`, the flattened fields will be ordered as `Z`, `b`, `c`. +Note that `flatten` should not be applied to arrays. Use the `expand` command to expand an array field into multiple rows instead. 
However, since an array can be stored in a non-array field in OpenSearch, when flattening a field storing a nested array, only the first element of the array is flattened.

## Syntax

Use the following syntax:

`flatten <field> [as (<alias-list>)]`
* `field`: mandatory. The field to be flattened. Only object and nested fields are supported.
* `alias-list`: optional. The names to use instead of the original key names. Names are separated by commas. It is advised to put the alias list in parentheses if there is more than one alias. The length must match the number of keys in the struct field. The provided alias names **must** follow the lexicographical order of the corresponding original keys in the struct.


## Example: Flatten an object field with aliases

The following example PPL query shows how to use `flatten` to flatten a `message` object field and use aliases to rename the flattened fields.
Given the following index `my-index`

```text
{"message":{"info":"a","author":"e","dayOfWeek":1},"myNum":1}
{"message":{"info":"b","author":"f","dayOfWeek":2},"myNum":2}
```

with the following mapping:

```json
{
  "mappings": {
    "properties": {
      "message": {
        "type": "object",
        "properties": {
          "info": {
            "type": "keyword",
            "index": "true"
          },
          "author": {
            "type": "keyword",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            },
            "index": "true"
          },
          "dayOfWeek": {
            "type": "long"
          }
        }
      },
      "myNum": {
        "type": "long"
      }
    }
  }
}
```

The following query flattens the `message` field and renames the keys to `creator, dow, info`:

```sql
source=my-index
| flatten message as (creator, dow, info)
```
{% include copy.html %}

Expected output:

| message | myNum | creator | dow | info |
| --- | --- | --- | --- | --- |
| {"info":"a","author":"e","dayOfWeek":1} | 1 | e | 1 | a |
| {"info":"b","author":"f","dayOfWeek":2} | 2 | f | 2 | b |


## Limitations

* The `flatten` command may not work as expected when its flattened fields are invisible. For example, in the query `source=my-index | fields message | flatten message`, the `flatten message` command doesn't work because flattened fields such as `message.info` and `message.author` are invisible after the `fields message` command. As an alternative, you can use `source=my-index | flatten message`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/grok.md b/_sql-and-ppl/ppl/cmd/grok.md
new file mode 100644
index 00000000000..a1f5872ee57
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/grok.md
@@ -0,0 +1,87 @@
---
layout: default
title: "grok"
parent: "Commands"
grand_parent: "PPL"
nav_order: 18
---
# grok


The `grok` command parses a text field with a grok pattern and appends the results to the search results.

## Syntax

Use the following syntax:

`grok <field> <pattern>`
* `field`: mandatory. The field must be a text field.
* `pattern`: mandatory. The grok pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field.


## Example 1: Create a new field

The following example PPL query shows how to use `grok` to create a new field, `host`, for each document. `host` will be the hostname after `@` in the `email` field. Parsing a null field will return an empty string.

```sql
source=accounts
| grok email '.+@%{HOSTNAME:host}'
| fields email, host
```
{% include copy.html %}

Expected output:

| email | host |
| --- | --- |
| amberduke@pyrami.com | pyrami.com |
| hattiebond@netagy.com | netagy.com |
| null | |
| daleadams@boink.com | boink.com |


## Example 2: Override the existing field

The following example PPL query shows how to use `grok` to override the existing `address` field with the street number removed.

```sql
source=accounts
| grok address '%{NUMBER} %{GREEDYDATA:address}'
| fields address
```
{% include copy.html %}

Expected output:

| address |
| --- |
| Holmes Lane |
| Bristol Street |
| Madison Street |
| Hutchinson Court |


## Example 3: Using grok to parse logs

The following example PPL query shows how to use `grok` to parse raw logs.

```sql
source=apache
| grok message '%{COMMONAPACHELOG}'
| fields COMMONAPACHELOG, timestamp, response, bytes
```
{% include copy.html %}

Expected output:

| COMMONAPACHELOG | timestamp | response | bytes |
| --- | --- | --- | --- |
| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | 28/Sep/2022:10:15:57 -0700 | 404 | 19927 |
| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | 28/Sep/2022:10:15:57 -0700 | 100 | 28722 |
| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | 28/Sep/2022:10:15:57 -0700 | 401 | 27439 |
| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | 28/Sep/2022:10:15:57 -0700 | 301 | 9481 |


## Limitations

The `grok` command has the same limitations as the `parse` command. See [parse limitations]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/parse#limitations) for details.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/head.md b/_sql-and-ppl/ppl/cmd/head.md
new file mode 100644
index 00000000000..7cb8a442487
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/head.md
@@ -0,0 +1,85 @@
---
layout: default
title: "head"
parent: "Commands"
grand_parent: "PPL"
nav_order: 19
---
# head


The `head` command returns the first N lines from a search result.

## Syntax

Use the following syntax:

`head [<size>] [from <offset>]`
* `size`: optional integer. The number of results you want to return. **Default:** 10
* `offset`: optional integer after `from`. The number of results to skip. **Default:** 0


## Example 1: Get the first 10 results

The following example PPL query shows how to use `head` to return the first 10 search results:

```sql
source=accounts
| fields firstname, age
| head
```
{% include copy.html %}

Expected output:

| firstname | age |
| --- | --- |
| Amber | 32 |
| Hattie | 36 |
| Nanette | 28 |
| Dale | 33 |


## Example 2: Get first N results

The following example PPL query shows how to use `head` to get a specified number of search results. In this example, N is equal to 3:

```sql
source=accounts
| fields firstname, age
| head 3
```
{% include copy.html %}

Expected output:

| firstname | age |
| --- | --- |
| Amber | 32 |
| Hattie | 36 |
| Nanette | 28 |


## Example 3: Get the first N results after offset M

The following example PPL query shows how to get the first 3 results after offset 1 from the `accounts` index.

```sql
source=accounts
| fields firstname, age
| head 3 from 1
```
{% include copy.html %}

Expected output:

| firstname | age |
| --- | --- |
| Hattie | 36 |
| Nanette | 28 |
| Dale | 33 |


## Limitations

The `head` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/index.md b/_sql-and-ppl/ppl/cmd/index.md
new file mode 100644
index 00000000000..aaa3fb7c88b
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/index.md
@@ -0,0 +1,15 @@
---
layout: default
title: "Commands"
parent: "PPL"
nav_order: 20
has_children: true
redirect_from:
  - /search-plugins/sql/ppl/functions/
  - /observability-plugin/ppl/commands/
  - /search-plugins/ppl/commands/
  - /search-plugins/ppl/functions/
  - /sql-and-ppl/ppl/functions/
---
# Commands
PPL supports most common [SQL functions]({{site.url}}{{site.baseurl}}/search-plugins/sql/functions/), including [relevance search]({{site.url}}{{site.baseurl}}/search-plugins/sql/full-text/), but also introduces several more functions (called _commands_), which are available in PPL only.
diff --git a/_sql-and-ppl/ppl/cmd/join.md b/_sql-and-ppl/ppl/cmd/join.md
new file mode 100644
index 00000000000..d285620039d
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/join.md
@@ -0,0 +1,216 @@
---
layout: default
title: "join"
parent: "Commands"
grand_parent: "PPL"
nav_order: 21
---
# join


The `join` command combines two datasets. The left side can be an index or the results of piped commands; the right side can be either an index or a subsearch.

## Syntax

The `join` command supports basic and extended syntax options.

### Basic syntax

`[joinType] join [leftAlias] [rightAlias] (on | where) <joinCriteria> <right-dataset>`
* `joinType`: optional. The type of join to perform. Options: `left`, `semi`, `anti`, and the performance-sensitive types `right`, `full`, and `cross`. **Default:** `inner`.
* `leftAlias`: optional. The subsearch alias to use with the left join side, to avoid ambiguous naming. Pattern: `left = <leftAlias>`
* `rightAlias`: optional. The subsearch alias to use with the right join side, to avoid ambiguous naming. Pattern: `right = <rightAlias>`
* `joinCriteria`: mandatory. Any comparison expression. Must follow the `on` or `where` keyword.
* `right-dataset`: mandatory. The right dataset can be either an `index` or a `subsearch`, with or without an alias.

### Extended syntax

`join [type=<joinType>] [overwrite=<bool>] [max=n] (<join-field-list> | [leftAlias] [rightAlias] (on | where) <joinCriteria>) <right-dataset>`
* `type`: optional. Join type using the extended syntax. Options: `left`, `outer` (alias of `left`), `semi`, `anti`, and the performance-sensitive types `right`, `full`, and `cross`. **Default:** `inner`.
* `overwrite`: optional boolean. Only works with `join-field-list`. Specifies whether duplicate-named fields from the right dataset should replace the corresponding fields in the main search results. **Default:** `true`.
* `max`: optional integer. Controls how many subsearch results can be joined against each row in the main search. **Default:** 0 (unlimited).
* `join-field-list`: optional. The fields used to build the join criteria. The join field list must exist on both sides. If not specified, all fields common to both sides are used as join keys.
* `leftAlias`: optional. Same as basic syntax when used with extended syntax.
* `rightAlias`: optional. Same as basic syntax when used with extended syntax.
* `joinCriteria`: mandatory.
Same as basic syntax when used with extended syntax.
* `right-dataset`: mandatory. Same as basic syntax.


## Configuration

The following settings configure the `join` command behavior.

### plugins.ppl.join.subsearch_maxout

Sets the maximum number of rows from the subsearch that can be joined against. The default value is `50000`. A value of `0` indicates that the restriction is unlimited.
The following example changes `plugins.ppl.join.subsearch_maxout` to `5000`:

```bash
curl -sS -H 'Content-Type: application/json' \
-X PUT localhost:9200/_plugins/_query/settings \
-d '{"persistent" : {"plugins.ppl.join.subsearch_maxout" : "5000"}}'
```
{% include copy.html %}

```json
{
  "acknowledged": true,
  "persistent": {
    "plugins": {
      "ppl": {
        "join": {
          "subsearch_maxout": "5000"
        }
      }
    }
  },
  "transient": {}
}
```


## Usage

Basic join syntax:

```
source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | inner join left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | cross join left = l right = r on 1=1 table2
source = table1 | left semi join left = l right = r on l.a = r.a table2
source = table1 | left anti join left = l right = r on l.a = r.a table2
source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ]
source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c
source = table1 | inner join on a = c table2 | fields a, b, c, d
source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a
source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a
source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a
```

Extended syntax with options:

```
source = table1 | join type=outer left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | join type=left left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | join type=inner max=1 left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c
source = table1 | join a table2 | fields a, b, c
source = table1 | join a, b table2 | fields a, b, c
source = table1 | join type=outer a b table2 | fields a, b, c
source = table1 | join type=inner max=1 a, b table2 | fields a, b, c
source = table1 | join type=left overwrite=false max=0 a, b [source=table2 | rename d as b] | fields a, b, c
```


## Example 1: Two indexes join

The following example PPL query shows how to use `join` to join two indexes using the basic join syntax.

```sql
source = state_country
| inner join left=a right=b ON a.name = b.name occupation
| stats avg(salary) by span(age, 10) as age_span, b.country
```
{% include copy.html %}

Expected output:

| avg(salary) | age_span | b.country |
| --- | --- | --- |
| 120000.0 | 40 | USA |
| 105000.0 | 20 | Canada |
| 0.0 | 40 | Canada |
| 70000.0 | 30 | USA |
| 100000.0 | 70 | England |


## Example 2: Join with subsearch

The following example PPL query shows how to use `join` to join with a subsearch using the basic join syntax.

```sql
source = state_country as a
| where country = 'USA' OR country = 'England'
| left join ON a.name = b.name [ source = occupation
| where salary > 0
| fields name, country, salary
| sort salary
| head 3 ] as b
| stats avg(salary) by span(age, 10) as age_span, b.country
```
{% include copy.html %}

Expected output:

| avg(salary) | age_span | b.country |
| --- | --- | --- |
| null | 40 | null |
| 70000.0 | 30 | USA |
| 100000.0 | 70 | England |


## Example 3: Join with field list

The following example PPL query shows how to use `join` with the extended syntax and a field list.

```sql
source = state_country
| where country = 'USA' OR country = 'England'
| join type=left overwrite=true name [ source = occupation
| where salary > 0
| fields name, country, salary
| sort salary
| head 3 ]
| stats avg(salary) by span(age, 10) as age_span, country
```
{% include copy.html %}

Expected output:

| avg(salary) | age_span | country |
| --- | --- | --- |
| null | 40 | null |
| 70000.0 | 30 | USA |
| 100000.0 | 70 | England |


## Example 4: Join with options

The following example PPL query shows how to use `join` with the extended syntax and additional options.

```sql
source = state_country
| join type=inner overwrite=false max=1 name occupation
| stats avg(salary) by span(age, 10) as age_span, country
```
{% include copy.html %}

Expected output:

| avg(salary) | age_span | country |
| --- | --- | --- |
| 120000.0 | 40 | USA |
| 100000.0 | 70 | USA |
| 105000.0 | 20 | Canada |
| 70000.0 | 30 | USA |


## Limitations

For the basic syntax, fields in the left and right outputs may share the same name. For example, with the join criteria `ON t1.id = t2.id`, the name `id` is ambiguous in the output. To avoid this ambiguity, the ambiguous output fields are renamed to `<alias>.id`, or to `<tableName>.id` if no alias exists.

Assume `table1` and `table2` contain only the field `id`. The following PPL queries produce the outputs shown:

| Query | Output |
| --- | --- |
| source=table1 \| join left=t1 right=t2 on t1.id=t2.id table2 \| eval a = 1 | t1.id, t2.id, a |
| source=table1 \| join on table1.id=table2.id table2 \| eval a = 1 | table1.id, table2.id, a |
| source=table1 \| join on table1.id=t2.id table2 as t2 \| eval a = 1 | table1.id, t2.id, a |
| source=table1 \| join right=tt on table1.id=t2.id [ source=table2 as t2 \| eval b = id ] \| eval a = 1 | table1.id, tt.id, tt.b, a |

For the extended syntax (join with a field list), duplicate-named fields in the output are deduplicated, and the surviving fields are determined by the value of the `overwrite` option.
Join types `inner`, `left`, `outer` (alias of `left`), `semi`, and `anti` are supported by default. `right`, `full`, and `cross` are performance-sensitive join types that are disabled by default. Set the config `plugins.calcite.all_join_types.allowed = true` to enable them.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/kmeans.md b/_sql-and-ppl/ppl/cmd/kmeans.md
new file mode 100644
index 00000000000..2ac063e2188
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/kmeans.md
@@ -0,0 +1,45 @@
---
layout: default
title: "kmeans"
parent: "Commands"
grand_parent: "PPL"
nav_order: 22
---
# kmeans (deprecated by ml command)


The `kmeans` command applies the kmeans algorithm in the ml-commons plugin to the search results returned by a PPL command.

## Syntax

Use the following syntax:

`kmeans [centroids=<int>] [iterations=<int>] [distance_type=<string>]`
* `centroids`: optional. The number of clusters you want to group your data points into. **Default:** 2.
* `iterations`: optional. Number of iterations. **Default:** 10.
* `distance_type`: optional. The distance type can be `COSINE`, `L1`, or `EUCLIDEAN`. **Default:** `EUCLIDEAN`.


## Example: Clustering of iris dataset

The following example PPL query shows how to use `kmeans` to classify three Iris species (Iris setosa, Iris virginica, and Iris versicolor) based on the combination of four features measured for each sample: the length and the width of the sepals and petals.

```sql
source=iris_data
| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm
| kmeans centroids=3
```
{% include copy.html %}

Expected output:

| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID |
| --- | --- | --- | --- | --- |
| 5.1 | 3.5 | 1.4 | 0.2 | 1 |
| 5.6 | 3.0 | 4.1 | 1.3 | 0 |
| 6.7 | 2.5 | 5.8 | 1.8 | 2 |


## Limitations

The `kmeans` command only works with `plugins.calcite.enabled=false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/lookup.md b/_sql-and-ppl/ppl/cmd/lookup.md
new file mode 100644
index 00000000000..460b8c37fc4
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/lookup.md
@@ -0,0 +1,357 @@
---
layout: default
title: "lookup"
parent: "Commands"
grand_parent: "PPL"
nav_order: 23
---
# lookup


The `lookup` command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend the fields of an index with values from a dimension table and append or replace values when the lookup condition is matched. As an alternative to the `join` command, the `lookup` command is more suitable for enriching source data with a static dataset.

## Syntax

Use the following syntax:

`lookup <lookupIndex> (<lookupMappingField> [as <sourceMappingField>])... [(replace | append) (<inputField> [as <outputField>])...]`
* `lookupIndex`: mandatory. The name of the lookup index (dimension table).
* `lookupMappingField`: mandatory. A mapping key in `lookupIndex`, analogous to a join key from the right table. You can specify multiple comma-delimited `lookupMappingField` values.
* `sourceMappingField`: optional. A mapping key from the source (left side), analogous to a join key from the left side. If not specified, defaults to `lookupMappingField`.
* `inputField`: optional. A field in `lookupIndex` whose matched values are applied to the result output. You can specify multiple comma-delimited `inputField` values. If not specified, all fields except `lookupMappingField` from `lookupIndex` are applied to the result output.
* `outputField`: optional. A field of the output. You can specify zero or multiple `outputField` values. If `outputField` has an existing field name in the source query, its values will be replaced or appended using matched values from `inputField`. If the field specified in `outputField` is a new field, the replace strategy adds the new field to the results, but the append strategy fails.
* `replace` \| `append`: optional. The output strategies. If `replace`, matched values in the `lookupIndex` field overwrite the values in the result. If `append`, matched values in the `lookupIndex` field only fill in the missing values in the result. **Default:** replace.
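To make the two strategies concrete, the following minimal sketch uses the `worker` and `work_information` indexes from the examples below; the semantics follow the descriptions above, and the exact results depend on your data. With `replace`, a matched `department` value from `work_information` overwrites whatever the result already holds:

```sql
source = worker
| lookup work_information uid as id replace department
```
{% include copy.html %}

Switching the keyword to `append` keeps existing values and only fills in the ones that are missing:

```sql
source = worker
| lookup work_information uid as id append department
```
{% include copy.html %}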
+ + +## Usage + +Lookup + +``` +source = table1 | lookup table2 id +source = table1 | lookup table2 id, name +source = table1 | lookup table2 id as cid, name +source = table1 | lookup table2 id as cid, name replace dept as department +source = table1 | lookup table2 id as cid, name replace dept as department, city as location +source = table1 | lookup table2 id as cid, name append dept as department +source = table1 | lookup table2 id as cid, name append dept as department, city as location +``` + + +## Example 1: Replace strategy + +The following example PPL query shows how to use `lookup` with the REPLACE strategy to overwrite existing values. + +```bash +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information uid AS id REPLACE department + | fields id, name, occupation, country, salary, department + """ +}' +``` +{% include copy.html %} + +Result set + +```json +{ + "schema": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "occupation", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "salary", + "type": "integer" + }, + { + "name": "department", + "type": "string" + } + ], + "datarows": [ + [ + 1000, + "Jake", + "Engineer", + "England", + 100000, + "IT" + ], + [ + 1001, + "Hello", + "Artist", + "USA", + 70000, + null + ], + [ + 1002, + "John", + "Doctor", + "Canada", + 120000, + "DATA" + ], + [ + 1003, + "David", + "Doctor", + null, + 120000, + "HR" + ], + [ + 1004, + "David", + null, + "Canada", + 0, + null + ], + [ + 1005, + "Jane", + "Scientist", + "Canada", + 90000, + "DATA" + ] + ], + "total": 6, + "size": 6 +} +``` + + +## Example 2: Append strategy + +The following example PPL query shows how to use `lookup` with the APPEND strategy to fill missing values only. + +```bash +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information uid AS id APPEND department + | fields id, name, occupation, country, salary, department + """ +}' +``` +{% include copy.html %} + + +## Example 3: No inputField specified + +The following example PPL query shows how to use `lookup` without specifying inputField, which applies all fields from the lookup index. 
+ +```bash +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information uid AS id, name + | fields id, name, occupation, country, salary, department + """ +}' +``` +{% include copy.html %} + +Result set + +```json +{ + "schema": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "salary", + "type": "integer" + }, + { + "name": "department", + "type": "string" + }, + { + "name": "occupation", + "type": "string" + } + ], + "datarows": [ + [ + 1000, + "Jake", + "England", + 100000, + "IT", + "Engineer" + ], + [ + 1001, + "Hello", + "USA", + 70000, + null, + null + ], + [ + 1002, + "John", + "Canada", + 120000, + "DATA", + "Scientist" + ], + [ + 1003, + "David", + null, + 120000, + "HR", + "Doctor" + ], + [ + 1004, + "David", + "Canada", + 0, + null, + null + ], + [ + 1005, + "Jane", + "Canada", + 90000, + "DATA", + "Engineer" + ] + ], + "total": 6, + "size": 6 +} +``` + + +## Example 4: OutputField as a new field + +The following example PPL query shows how to use `lookup` with outputField as a new field name. + +```bash +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information name REPLACE occupation AS new_col + | fields id, name, occupation, country, salary, new_col + """ +}' +``` +{% include copy.html %} + +Result set + +```json +{ + "schema": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "occupation", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "salary", + "type": "integer" + }, + { + "name": "new_col", + "type": "string" + } + ], + "datarows": [ + [ + 1003, + "David", + "Doctor", + null, + 120000, + "Doctor" + ], + [ + 1004, + "David", + null, + "Canada", + 0, + "Doctor" + ], + [ + 1001, + "Hello", + "Artist", + "USA", + 70000, + null + ], + [ + 1000, + "Jake", + "Engineer", + "England", + 100000, + "Engineer" + ], + [ + 1005, + "Jane", + "Scientist", + "Canada", + 90000, + "Engineer" + ], + [ + 1002, + "John", + "Doctor", + "Canada", + 120000, + "Scientist" + ] + ], + "total": 6, + "size": 6 +} +``` \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/ml.md b/_sql-and-ppl/ppl/cmd/ml.md new file mode 100644 index 00000000000..57478caac80 --- /dev/null +++ b/_sql-and-ppl/ppl/cmd/ml.md @@ -0,0 +1,151 @@ +--- +layout: default +title: "ml" +parent: "Commands" +grand_parent: "PPL" +nav_order: 24 +--- +# ml + + +The `ml` command trains, predicts, or trains and predicts on any algorithm in the ml-commons plugin on the search results returned by a PPL command. + +## Syntax + +The `ml` command supports different syntax options depending on the algorithm. + +## AD - Fixed in time RCF for time-series data + +`ml action='train' algorithm='rcf' ` +* `number_of_trees`: optional integer. Number of trees in the forest. **Default:** 30. +* `shingle_size`: optional integer. A shingle is a consecutive sequence of the most recent records. **Default:** 8. +* `sample_size`: optional integer. The sample size used by stream samplers in this forest. **Default:** 256. +* `output_after`: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32. +* `time_decay`: optional double. The decay factor used by stream samplers in this forest. **Default:** 0.0001. 
* `anomaly_rate`: optional double. The anomaly rate. **Default:** 0.005.
* `time_field`: mandatory string. Specifies the time field for RCF to use as time-series data.
* `date_format`: optional string. Specifies the format of the `time_field` field. **Default:** "yyyy-MM-dd HH:mm:ss".
* `time_zone`: optional string. Specifies the time zone for the `time_field` field. **Default:** UTC.
* `category_field`: optional string. Specifies the category field used to group inputs. Each category will be independently predicted.


## AD - Batch RCF for non-time-series data

`ml action='train' algorithm='rcf' <parameters>`
* `number_of_trees`: optional integer. Number of trees in the forest. **Default:** 30.
* `sample_size`: optional integer. Number of random samples given to each tree from the training dataset. **Default:** 256.
* `output_after`: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32.
* `training_data_size`: optional integer. **Default:** the size of your training dataset.
* `anomaly_score_threshold`: optional double. The threshold of the anomaly score. **Default:** 1.0.
* `category_field`: optional string. Specifies the category field used to group inputs. Each category will be independently predicted.


## KMEANS

`ml action='train' algorithm='kmeans' <parameters>`
* `centroids`: optional integer. The number of clusters you want to group your data points into. **Default:** 2.
* `iterations`: optional integer. Number of iterations. **Default:** 10.
* `distance_type`: optional string. The distance type can be `COSINE`, `L1`, or `EUCLIDEAN`. **Default:** `EUCLIDEAN`.


## Example 1: Detecting events in New York City from taxi ridership data with time-series data

This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data.

```sql
source=nyc_taxi
| fields value, timestamp
| ml action='train' algorithm='rcf' time_field='timestamp'
| where value=10844.0
```
{% include copy.html %}

Expected output:

| value | timestamp | score | anomaly_grade |
| --- | --- | --- | --- |
| 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 |


## Example 2: Detecting events in New York City from taxi ridership data with time-series data independently for each category

This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values.

```sql
source=nyc_taxi
| fields category, value, timestamp
| ml action='train' algorithm='rcf' time_field='timestamp' category_field='category'
| where value=10844.0 or value=6526.0
```
{% include copy.html %}

Expected output:

| category | value | timestamp | score | anomaly_grade |
| --- | --- | --- | --- | --- |
| night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 |
| day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 |


## Example 3: Detecting events in New York City from taxi ridership data with non-time-series data

This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data.

```sql
source=nyc_taxi
| fields value
| ml action='train' algorithm='rcf'
| where value=10844.0
```
{% include copy.html %}

Expected output:

| value | score | anomalous |
| --- | --- | --- |
| 10844.0 | 0.0 | False |


## Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently for each category

This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values.

```sql
source=nyc_taxi
| fields category, value
| ml action='train' algorithm='rcf' category_field='category'
| where value=10844.0 or value=6526.0
```
{% include copy.html %}

Expected output:

| category | value | score | anomalous |
| --- | --- | --- | --- |
| night | 10844.0 | 0.0 | False |
| day | 6526.0 | 0.0 | False |


## Example 5: KMEANS - Clustering of iris dataset

The following example PPL query shows how to use `ml` with KMEANS to classify three Iris species (Iris setosa, Iris virginica, and Iris versicolor) based on the combination of four features measured for each sample: the length and the width of the sepals and petals.

```sql
source=iris_data
| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm
| ml action='train' algorithm='kmeans' centroids=3
```
{% include copy.html %}

Expected output:

| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID |
| --- | --- | --- | --- | --- |
| 5.1 | 3.5 | 1.4 | 0.2 | 1 |
| 5.6 | 3.0 | 4.1 | 1.3 | 0 |
| 6.7 | 2.5 | 5.8 | 1.8 | 2 |


## Limitations

The `ml` command only works with `plugins.calcite.enabled=false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/multisearch.md b/_sql-and-ppl/ppl/cmd/multisearch.md
new file mode 100644
index 00000000000..de1d0a52251
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/multisearch.md
@@ -0,0 +1,151 @@
---
layout: default
title: "multisearch"
parent: "Commands"
grand_parent: "PPL"
nav_order: 25
---
# multisearch


The `multisearch` command runs multiple subsearches and merges their results together. The command allows you to combine data from different queries on the same or different sources and optionally apply subsequent processing to the combined search results.
Key aspects of `multisearch`:
1. Combines results from multiple search operations into a single result set.
2. Each subsearch can have different filtering criteria, data transformations, and field selections.
3. Results are merged and can be further processed with aggregations, sorting, and other PPL commands.
4. Particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria.
5. Supports timestamp-based result interleaving when working with time-series data.

Use Cases:
* **Comparative Analysis**: Compare metrics across different segments, regions, or time periods
* **Success Rate Monitoring**: Calculate success rates by comparing successful operations to total operations
* **Multi-source Data Combination**: Merge data from different indexes or apply different filters to the same source
* **A/B Testing Analysis**: Combine results from different test groups for comparison
* **Time-series Data Merging**: Interleave events from multiple sources based on timestamps


## Syntax

Use the following syntax:

`multisearch <subsearch1> <subsearch2> ...`
* `subsearch1, subsearch2, ...`: mandatory. At least two subsearches are required.
+* `result-processing`: optional. Commands applied to the merged results after the multisearch operation, such as `stats`, `sort`, and `head` (see the aggregation sketch after Example 3).
+
+
+## Usage
+
+Basic multisearch:
+
+```
+| multisearch [search source=table | where condition1] [search source=table | where condition2]
+| multisearch [search source=index1 | fields field1, field2] [search source=index2 | fields field1, field2]
+| multisearch [search source=table | where status="success"] [search source=table | where status="error"]
+```
+
+
+## Example 1: Basic age group analysis
+
+This example combines young and adult customers into a single result set for further analysis.
+
+```sql
+| multisearch [search source=accounts
+| where age < 30
+| eval age_group = "young"
+| fields firstname, age, age_group] [search source=accounts
+| where age >= 30
+| eval age_group = "adult"
+| fields firstname, age, age_group]
+| sort age
+```
+{% include copy.html %}
+
+Expected output:
+
+| firstname | age | age_group |
+| --- | --- | --- |
+| Nanette | 28 | young |
+| Amber | 32 | adult |
+| Dale | 33 | adult |
+| Hattie | 36 | adult |
+
+
+## Example 2: Success rate pattern
+
+This example combines high-balance accounts and regular accounts for comparison analysis.
+
+```sql
+| multisearch [search source=accounts
+| where balance > 20000
+| eval query_type = "high_balance"
+| fields firstname, balance, query_type] [search source=accounts
+| where balance > 0 AND balance <= 20000
+| eval query_type = "regular"
+| fields firstname, balance, query_type]
+| sort balance desc
+```
+{% include copy.html %}
+
+Expected output:
+
+| firstname | balance | query_type |
+| --- | --- | --- |
+| Amber | 39225 | high_balance |
+| Nanette | 32838 | high_balance |
+| Hattie | 5686 | regular |
+| Dale | 4180 | regular |
+
+
+## Example 3: Timestamp interleaving
+
+This example combines time-series data from multiple sources with automatic timestamp-based ordering.
+
+```sql
+| multisearch [search source=time_data
+| where category IN ("A", "B")] [search source=time_data2
+| where category IN ("E", "F")]
+| fields @timestamp, category, value, timestamp
+| head 5
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | category | value | timestamp |
+| --- | --- | --- | --- |
+| 2025-08-01 04:00:00 | E | 2001 | 2025-08-01 04:00:00 |
+| 2025-08-01 03:47:41 | A | 8762 | 2025-08-01 03:47:41 |
+| 2025-08-01 02:30:00 | F | 2002 | 2025-08-01 02:30:00 |
+| 2025-08-01 01:14:11 | B | 9015 | 2025-08-01 01:14:11 |
+| 2025-08-01 01:00:00 | E | 2003 | 2025-08-01 01:00:00 |
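+
+
+Merged results can also be piped into the `result-processing` commands noted in the syntax section, such as `stats`. The following is a sketch only and is not output-verified; it reuses the age-group subsearches from Example 1 and counts the merged rows per group (the four sample accounts would yield one young row and three adult rows).
+
+```sql
+| multisearch [search source=accounts
+| where age < 30
+| eval age_group = "young"
+| fields firstname, age, age_group] [search source=accounts
+| where age >= 30
+| eval age_group = "adult"
+| fields firstname, age, age_group]
+| stats count() by age_group
+```
+{% include copy.html %}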
+
+
+## Example 4: Type compatibility - missing fields
+
+The following example PPL query demonstrates how missing fields are handled with NULL insertion.
+
+```sql
+| multisearch [search source=accounts
+| where age < 30
+| eval young_flag = "yes"
+| fields firstname, age, young_flag] [search source=accounts
+| where age >= 30
+| fields firstname, age]
+| sort age
+```
+{% include copy.html %}
+
+Expected output:
+
+| firstname | age | young_flag |
+| --- | --- | --- |
+| Nanette | 28 | yes |
+| Amber | 32 | null |
+| Dale | 33 | null |
+| Hattie | 36 | null |
+
+
+## Limitations
+
+* **Minimum Subsearches**: At least two subsearches must be specified.
+* **Schema Compatibility**: When fields with the same name exist across subsearches but have incompatible types, the system automatically resolves conflicts by renaming the conflicting fields. The first occurrence retains the original name, while subsequent conflicting fields are renamed with a numeric suffix (e.g., `age` becomes `age0`, `age1`, etc.). This ensures all data is preserved while maintaining schema consistency.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/parse.md b/_sql-and-ppl/ppl/cmd/parse.md
new file mode 100644
index 00000000000..692c96abbfd
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/parse.md
@@ -0,0 +1,134 @@
+---
+layout: default
+title: "parse"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 26
+---
+# parse
+
+
+The `parse` command extracts information from a text field using a regular expression and adds it to the search result.
+
+## Syntax
+
+Use the following syntax:
+
+`parse <field> <pattern>`
+* `field`: mandatory. The field must be a text field.
+* `pattern`: mandatory. The regular expression pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field.
+
+
+## Regular expression
+
+The regular expression pattern is used to match the whole text field of each document with the Java regex engine. Each named capture group in the expression becomes a new `STRING` field.
+
+## Example 1: Create a new field
+
+The following example PPL query shows how to create the new field `host` for each document. `host` becomes the hostname after the @ symbol in the `email` field. Parsing a null field returns an empty string.
+
+```sql
+source=accounts
+| parse email '.+@(?<host>.+)'
+| fields email, host
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | host |
+| --- | --- |
+| amberduke@pyrami.com | pyrami.com |
+| hattiebond@netagy.com | netagy.com |
+| null | |
+| daleadams@boink.com | boink.com |
+
+
+## Example 2: Override an existing field
+
+The following example PPL query shows how to override the existing `address` field while excluding the street number:
+
+```sql
+source=accounts
+| parse address '\d+ (?<street>.+)'
+| fields address
+```
+{% include copy.html %}
+
+Expected output:
+
+| address |
+| --- |
+| Holmes Lane |
+| Bristol Street |
+| Madison Street |
+| Hutchinson Court |
+
+
+## Example 3: Filter and sort by casted parsed field
+
+The following example PPL query shows how to sort street numbers that are higher than 500 in the `address` field.
+
+```sql
+source=accounts
+| parse address '(?<streetNumber>\d+) (?<street>.+)'
+| where cast(streetNumber as int) > 500
+| sort num(streetNumber)
+| fields streetNumber, street
+```
+{% include copy.html %}
+
+Expected output:
+
+| streetNumber | street |
+| --- | --- |
+| 671 | Bristol Street |
+| 789 | Madison Street |
+| 880 | Holmes Lane |
+
+
+## Limitations
+
+The `parse` command has the following limitations:
+- Fields defined by parse cannot be parsed again.
+
+The following command will not work:
+
+```
+source=accounts | parse address '\d+ (?<street>.+)' | parse street '\w+ (?<road>\w+)' ;
+```
+
+- Fields defined by parse cannot be overridden with other commands.
+
+`where` will not match any documents since `street` cannot be overridden:
+
+```
+source=accounts | parse address '\d+ (?<street>.+)' | eval street='1' | where street='1' ;
+```
+
+- The text field used by parse cannot be overridden.
+
+`street` will not be successfully parsed since `address` is overridden:
+
+```
+source=accounts | parse address '\d+ (?<street>.+)' | eval address='1' ;
+```
+
+- Fields defined by parse cannot be filtered/sorted after using them in the `stats` command.
+
+`where` in the following command will not work:
+
+```
+source=accounts | parse email '.+@(?<host>.+)' | stats avg(age) by host | where host=pyrami.com ;
+```
+
+- Fields defined by parse will not appear in the final result unless the original source field is included in the `fields` command.
+
+For example, the following query will not display the parsed field `host` unless the source field `email` is also explicitly included:
+
+```
+source=accounts | parse email '.+@(?<host>.+)' | fields email, host ;
+```
+
+- Named capture groups must start with a letter and contain only letters and digits.
+
+ For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/patterns.md b/_sql-and-ppl/ppl/cmd/patterns.md
new file mode 100644
index 00000000000..e2fddedc05d
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/patterns.md
@@ -0,0 +1,244 @@
+---
+layout: default
+title: "patterns"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 27
+---
+# patterns
+
+
+The `patterns` command extracts log patterns from a text field and appends the results to the search results. Grouping logs by their patterns makes it easier to aggregate stats from large volumes of log data for analysis and troubleshooting.
+The `patterns` command lets users select among log parsing algorithms to achieve high pattern grouping accuracy. Two pattern methods are supported: `simple_pattern` and `brain`.
+The `simple_pattern` algorithm is a regex-based parsing method, whereas the `brain` algorithm groups logs automatically with high accuracy while preserving semantic meaning.
+The `patterns` command supports two modes: `label` and `aggregation`. `label` mode returns individual pattern labels. `aggregation` mode returns aggregated results on the target field.
+By default, the Calcite engine labels the variables with the '\<*\>' placeholder.
If the `show_numbered_token` option is turned on, the Calcite engine's `label` mode labels both the text pattern and the variable tokens in a map. In `aggregation` mode, it also outputs the labeled pattern along with the variable tokens per pattern. In that case, the variable placeholders use the format '\<token1\>', '\<token2\>', and so on instead of '\<\*\>'.
+
+## Syntax
+
+Use the following syntax:
+
+`patterns <field> [by byClause...] [method=simple_pattern | brain] [mode=label | aggregation] [max_sample_count=integer] [buffer_limit=integer] [show_numbered_token=boolean] [new_field=<new-field-name>] (algorithm parameters...)`
+* `field`: mandatory. The text field to analyze for patterns.
+* `byClause`: optional. Fields or scalar functions used to group logs for labeling/aggregation (a grouped-pattern sketch appears after Brain Example 4).
+* `method`: optional. Algorithm choice: `simple_pattern` or `brain`. **Default:** `simple_pattern`.
+* `mode`: optional. Output mode: `label` or `aggregation`. **Default:** `label`.
+* `max_sample_count`: optional. Max sample logs returned per pattern in aggregation mode. **Default:** 10.
+* `buffer_limit`: optional. Safeguard parameter for the `brain` algorithm to limit internal temporary buffer size (min: 50,000). **Default:** 100,000.
+* `show_numbered_token`: optional. The flag to turn on the numbered token output format. **Default:** false.
+* `new_field`: optional. Alias of the output pattern field. **Default:** "patterns_field".
+* algorithm parameters: optional. Algorithm-specific tuning:
+  * `simple_pattern`: Define the regex through "pattern".
+  * `brain`: Adjust sensitivity with `variable_count_threshold` and `frequency_threshold_percentage`.
+    * `variable_count_threshold`: optional integer. Words are split by spaces. The algorithm counts how many distinct words appear at a specific position in the initial log groups. Adjusting this threshold determines the sensitivity for identifying constant words. **Default:** 5.
+    * `frequency_threshold_percentage`: optional double. Brain selects the log pattern based on the longest word combination. This parameter sets the lower bound of frequency for ignoring low-frequency words (see the sketch after Brain Example 1). **Default:** 0.3.
+
+
+## Change the default pattern method
+
+To override the default pattern parameters, users can run the following command:
+
+```
+ PUT _cluster/settings
+ {
+   "persistent": {
+     "plugins.ppl.pattern.method": "brain",
+     "plugins.ppl.pattern.mode": "aggregation",
+     "plugins.ppl.pattern.max.sample.count": 5,
+     "plugins.ppl.pattern.buffer.limit": 50000,
+     "plugins.ppl.pattern.show.numbered.token": true
+   }
+ }
+```
+
+
+## Simple pattern example 1: Create the new field
+
+The following example PPL query shows how to use `patterns` to extract patterns in `email` for each document. Parsing a null field returns an empty string.
+
+```sql
+source=accounts
+| patterns email method=simple_pattern
+| fields email, patterns_field
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | patterns_field |
+| --- | --- |
+| amberduke@pyrami.com | <*>@<*>.<*> |
+| hattiebond@netagy.com | <*>@<*>.<*> |
+| null | |
+| daleadams@boink.com | <*>@<*>.<*> |
+
+
+## Simple pattern example 2: Extract log patterns
+
+The following example PPL query shows how to use `patterns` to extract patterns from a raw log field using the default patterns.
+ +```sql +source=apache +| patterns message method=simple_pattern +| fields message, patterns_field +``` +{% include copy.html %} + +Expected output: + +| message | patterns_field | +| --- | --- | +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | + + +## Simple pattern example 3: Extract log patterns with custom regex pattern + +The following example PPL query shows how to use `patterns` to extract patterns from a raw log field using user defined patterns. + +```sql +source=apache +| patterns message method=simple_pattern new_field='no_numbers' pattern='[0-9]' +| fields message, no_numbers +``` +{% include copy.html %} + +Expected output: + +| message | no_numbers | +| --- | --- | +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*><*><*>.<*><*>.<*>.<*><*> - upton<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "HEAD /e-business/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*><*><*>.<*><*>.<*><*><*>.<*> - pouros<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*><*><*>.<*><*><*>.<*><*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "PATCH /strategize/out-of-the-box HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*><*><*>.<*><*><*>.<*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "POST /users HTTP/<*>.<*>" <*><*><*> <*><*><*><*> | + + +## Simple pattern example 4: Return log patterns aggregation result + +The following example PPL query shows how to use `patterns` to get aggregated results from a raw log field. 
+
+```sql
+source=apache
+| patterns message method=simple_pattern mode=aggregation
+| fields patterns_field, pattern_count, sample_logs
+```
+{% include copy.html %}
+
+Expected output:
+
+| patterns_field | pattern_count | sample_logs |
+| --- | --- | --- |
+| <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | 1 | [210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] |
+| <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | 1 | [118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439] |
+| <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927] |
+| <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722] |
+
+
+## Simple pattern example 5: Return log patterns aggregation result with detected variable tokens
+
+The following example PPL query shows how to use `patterns` to get aggregated results with detected variable tokens. With the `show_numbered_token` option enabled, the output includes the numbered variable tokens detected in the pattern field.
+
+```sql
+source=apache
+| patterns message method=simple_pattern mode=aggregation show_numbered_token=true
+| fields patterns_field, pattern_count, tokens
+| head 1
+```
+{% include copy.html %}
+
+Expected output:
+
+| patterns_field | pattern_count | tokens |
+| --- | --- | --- |
+| <token1>.<token2>.<token3>.<token4> - - [<token5>/<token6>/<token7>:<token8>:<token9>:<token10> -<token11>] "<token12> /<token13> <token14>/<token15>.<token16>" <token17> <token18> | 1 | {'<token14>': ['HTTP'], '<token13>': ['users'], '<token15>': ['1'], '<token16>': ['1'], '<token18>': ['9481'], '<token17>': ['301'], '<token5>': ['28'], '<token4>': ['104'], '<token7>': ['2022'], '<token6>': ['Sep'], '<token3>': ['15'], '<token8>': ['10'], '<token10>': ['57'], '<token1>': ['210'], '<token12>': ['POST'], '<token9>': ['15'], '<token11>': ['0700'], '<token2>': ['204']} |
+
+
+## Brain Example 1: Extract log patterns
+
+The following example PPL query shows how to use `patterns` to extract semantically meaningful log patterns from a raw log field using the brain algorithm. The default variable count threshold is 5.
+
+```sql
+source=apache
+| patterns message method=brain
+| fields message, patterns_field
+```
+{% include copy.html %}
+
+Expected output:
+
+| message | patterns_field |
+| --- | --- |
+| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "HEAD /e-business/mindshare HTTP/<*>" 404 <*> |
+| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>" 100 <*> |
+| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "PATCH /strategize/out-of-the-box HTTP/<*>" 401 <*> |
+| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*>" 301 <*> |
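+
+
+The `frequency_threshold_percentage` parameter listed in the syntax section can be tuned in the same way as the variable count threshold. The following is a sketch only and is not output-verified; raising the threshold to 0.5 makes the algorithm ignore more low-frequency words, and the resulting grouping depends on the data.
+
+```sql
+source=apache
+| patterns message method=brain frequency_threshold_percentage=0.5
+| fields message, patterns_field
+```
+{% include copy.html %}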
+
+
+## Brain Example 2: Extract log patterns with custom parameters
+
+The following example PPL query shows how to use `patterns` to extract semantically meaningful log patterns from a raw log field using custom parameters of the brain algorithm.
+
+```sql
+source=apache
+| patterns message method=brain variable_count_threshold=2
+| fields message, patterns_field
+```
+{% include copy.html %}
+
+Expected output:
+
+| message | patterns_field |
+| --- | --- |
+| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> |
+| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> |
+| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> |
+| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> |
+
+
+## Brain Example 3: Return log patterns aggregation result
+
+The following example PPL query shows how to use `patterns` to get aggregated results from a raw log field using the brain algorithm.
+
+```sql
+source=apache
+| patterns message method=brain mode=aggregation variable_count_threshold=2
+| fields patterns_field, pattern_count, sample_logs
+```
+{% include copy.html %}
+
+Expected output:
+
+| patterns_field | pattern_count | sample_logs |
+| --- | --- | --- |
+| <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | 4 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927,127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722,118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439,210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] |
+
+
+## Brain Example 4: Return log patterns aggregation result with detected variable tokens
+
+The following example PPL query shows how to use `patterns` to get aggregated results with detected variable tokens using the brain algorithm.
+
+With the `show_numbered_token` option enabled, the output includes the numbered variable tokens detected in the pattern field.
+
+```sql
+source=apache
+| patterns message method=brain mode=aggregation show_numbered_token=true variable_count_threshold=2
+| fields patterns_field, pattern_count, tokens
+```
+{% include copy.html %}
+
+Expected output:
+
+| patterns_field | pattern_count | tokens |
+| --- | --- | --- |
+| <token1> - <token2> [<token3>/Sep/<token4>:<token5>:<token6>:<token7> <token8>] <token9> <token10> HTTP/<token11>" <token12> <token13> | 4 | {'<token13>': ['19927', '28722', '27439', '9481'], '<token5>': ['10', '10', '10', '10'], '<token4>': ['2022', '2022', '2022', '2022'], '<token7>': ['57', '57', '57', '57'], '<token6>': ['15', '15', '15', '15'], '<token9>': ['"HEAD', '"GET', '"PATCH', '"POST'], '<token8>': ['-0700', '-0700', '-0700', '-0700'], '<token10>': ['/e-business/mindshare', '/architectures/convergence/niches/mindshare', '/strategize/out-of-the-box', '/users'], '<token1>': ['177.95.8.74', '127.45.152.6', '118.223.210.10... |
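+
+
+The `by` clause from the syntax section composes with either method. The following is a sketch only and is not output-verified; it groups pattern extraction of the `email` field in the `accounts` index by the existing `gender` field, so each gender gets its own pattern aggregation.
+
+```sql
+source=accounts
+| patterns email by gender method=simple_pattern mode=aggregation
+| fields gender, patterns_field, pattern_count
+```
+{% include copy.html %}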
+
+
+## Limitations
+
+- The `patterns` command is currently not pushed down to OpenSearch data nodes. It only groups log patterns on the log messages returned to the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/rare.md b/_sql-and-ppl/ppl/cmd/rare.md
new file mode 100644
index 00000000000..e7c973194c5
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/rare.md
@@ -0,0 +1,138 @@
+---
+layout: default
+title: "rare"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 28
+---
+# rare
+
+
+The `rare` command finds the least common tuple of values of all fields in the field list.
+
+**Note**: A maximum of 10 results is returned for each distinct tuple of values of the group-by fields.
+
+## Syntax
+
+Use the following syntax:
+
+`rare [rare-options] <field-list> [by-clause]`
+* `field-list`: mandatory. Comma-delimited list of field names.
+* `by-clause`: optional. One or more fields to group the results by.
+* `rare-options`: optional. Options for the rare command. Supported syntax is [countfield=\<string\>] [showcount=\<bool\>] [usenull=\<bool\>] (the options are combined in the sketch after Example 5).
+* showcount=\<bool\>: optional. Whether to create a field in the output that represents the count of the tuple of values. **Default:** `true`.
+* countfield=\<string\>: optional. The name of the field that contains the count. **Default:** `'count'`.
+* usenull=\<bool\>: optional. Whether to output the null value. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`:
+  * When `plugins.ppl.syntax.legacy.preferred=true`, `usenull` defaults to `true`
+  * When `plugins.ppl.syntax.legacy.preferred=false`, `usenull` defaults to `false`
+
+
+## Example 1: Find the least common values in a field
+
+The following example PPL query shows how to use `rare` to find the least common gender of all the accounts.
+
+```sql
+source=accounts
+| rare showcount=false gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender |
+| --- |
+| F |
+| M |
+
+
+## Example 2: Find the least common values organized by gender
+
+The following example PPL query shows how to use `rare` to find the least common age of all the accounts grouped by gender.
+
+```sql
+source=accounts
+| rare showcount=false age by gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender | age |
+| --- | --- |
+| F | 28 |
+| M | 32 |
+| M | 33 |
+| M | 36 |
+
+
+## Example 3: Rare command
+
+The following example PPL query shows how to use `rare` to find the least common gender of all the accounts.
+
+```sql
+source=accounts
+| rare gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender | count |
+| --- | --- |
+| F | 1 |
+| M | 3 |
+
+
+## Example 4: Specify the count field option
+
+The following example PPL query shows how to use `rare` to specify the count field.
+
+```sql
+source=accounts
+| rare countfield='cnt' gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender | cnt |
+| --- | --- |
+| F | 1 |
+| M | 3 |
+
+
+## Example 5: Specify the usenull field option
+
+```sql
+source=accounts
+| rare usenull=false email
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | count |
+| --- | --- |
+| amberduke@pyrami.com | 1 |
+| daleadams@boink.com | 1 |
+| hattiebond@netagy.com | 1 |
+
+```sql
+source=accounts
+| rare usenull=true email
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | count |
+| --- | --- |
+| null | 1 |
+| amberduke@pyrami.com | 1 |
+| daleadams@boink.com | 1 |
+| hattiebond@netagy.com | 1 |
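+
+
+The options above can also be combined in a single call. The following is a sketch only and is not output-verified; it returns the least common ages per gender with a custom count field name, following the option placement shown in Examples 2 and 4.
+
+```sql
+source=accounts
+| rare countfield='cnt' age by gender
+```
+{% include copy.html %}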
+
+
+## Limitations
+
+The `rare` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/regex.md b/_sql-and-ppl/ppl/cmd/regex.md
new file mode 100644
index 00000000000..20340048f24
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/regex.md
@@ -0,0 +1,147 @@
+---
+layout: default
+title: "regex"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 29
+---
+# regex
+
+
+The `regex` command filters search results by matching field values against a regular expression pattern. Only documents where the specified field matches the pattern are included in the results.
+
+## Syntax
+
+Use the following syntax:
+
+`regex <field>=<pattern>`
+`regex <field>!=<pattern>`
+* `field`: mandatory. The field name to match against.
+* `pattern`: mandatory string. The regular expression pattern to match. Supports Java regex syntax, including named groups, lookahead/lookbehind, and character classes.
+* `=`: operator for positive matching (include matches)
+* `!=`: operator for negative matching (exclude matches)
+
+
+## Regular expression engine
+
+The regex command uses Java's built-in regular expression engine, which supports:
+* **Standard regex features**: Character classes, quantifiers, anchors
+* **Named capture groups**: `(?<name>pattern)` syntax
+* **Lookahead/lookbehind**: `(?=...)` and `(?<=...)` assertions
+* **Inline flags**: Case-insensitive `(?i)`, multiline `(?m)`, dotall `(?s)`, and other modes (see the sketch after Example 4)
+
+For complete documentation of Java regex patterns and available modes, see the [Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
+
+## Example 1: Basic pattern matching
+
+The following example PPL query shows how to use `regex` to filter documents where the `lastname` field matches names starting with uppercase letters.
+
+```sql
+source=accounts
+| regex lastname="^[A-Z][a-z]+$"
+| fields account_number, firstname, lastname
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | firstname | lastname |
+| --- | --- | --- |
+| 1 | Amber | Duke |
+| 6 | Hattie | Bond |
+| 13 | Nanette | Bates |
+| 18 | Dale | Adams |
+
+
+## Example 2: Negative matching
+
+The following example PPL query shows how to use `regex` to exclude documents where the `lastname` field ends with "son".
+
+```sql
+source=accounts
+| regex lastname!=".*son$"
+| fields account_number, lastname
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | lastname |
+| --- | --- |
+| 1 | Duke |
+| 6 | Bond |
+| 13 | Bates |
+| 18 | Adams |
+
+
+## Example 3: Email domain matching
+
+The following example PPL query shows how to use `regex` to filter documents by email domain patterns.
+
+```sql
+source=accounts
+| regex email="@pyrami\.com$"
+| fields account_number, email
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | email |
+| --- | --- |
+| 1 | amberduke@pyrami.com |
+
+
+## Example 4: Complex patterns with character classes
+
+The following example PPL query shows how to use `regex` with complex patterns that use character classes and quantifiers.
+
+```sql
+source=accounts | regex address="\\d{3,4}\\s+[A-Z][a-z]+\\s+(Street|Lane|Court)" | fields account_number, address
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | address |
+| --- | --- |
+| 1 | 880 Holmes Lane |
+| 6 | 671 Bristol Street |
+| 13 | 789 Madison Street |
+| 18 | 467 Hutchinson Court |
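+
+
+The inline flags listed in the engine section can make a match case-insensitive without changing the pattern body. The following is a sketch only and is not output-verified; prefixing `(?i)` should let the lowercase pattern match the uppercase `VA` value in the data, in contrast to the case-sensitive behavior demonstrated in the next example.
+
+```sql
+source=accounts
+| regex state="(?i)va"
+| fields account_number, state
+```
+{% include copy.html %}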
+
+
+## Example 5: Case-sensitive matching
+
+The following example PPL query demonstrates that regex matching is case-sensitive by default.
+
+```sql
+source=accounts
+| regex state="va"
+| fields account_number, state
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | state |
+| --- | --- |
+
+```sql
+source=accounts
+| regex state="VA"
+| fields account_number, state
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | state |
+| --- | --- |
+| 13 | VA |
+
+
+## Limitations
+
+* **Field specification required**: A field name must be specified in the regex command. Pattern-only syntax (e.g., `regex "pattern"`) is not currently supported.
+* **String fields only**: The regex command currently only supports string fields. Using it on numeric or boolean fields will result in an error.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/rename.md b/_sql-and-ppl/ppl/cmd/rename.md
new file mode 100644
index 00000000000..b3bdb9990ea
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/rename.md
@@ -0,0 +1,138 @@
+---
+layout: default
+title: "rename"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 30
+---
+# rename
+
+
+The `rename` command renames one or more fields in the search results.
+
+## Syntax
+
+Use the following syntax:
+
+`rename <source-field> AS <target-field> ["," <source-field> AS <target-field>]...`
+* `source-field`: mandatory. The name of the field you want to rename. Supports wildcard patterns using `*`.
+* `target-field`: mandatory. The new name for the field. Must contain the same number of wildcards as the source field.
+
+
+## Behavior
+
+The rename command handles non-existent fields as follows:
+* **Renaming a non-existent field to a non-existent field**: No change occurs to the search results.
+* **Renaming a non-existent field to an existing field**: The existing target field is removed from the search results.
+* **Renaming an existing field to an existing field**: The existing target field is removed and the source field is renamed to the target.
+
+
+## Example 1: Rename one field
+
+The following example PPL query shows how to use `rename` to rename one field.
+
+```sql
+source=accounts
+| rename account_number as an
+| fields an
+```
+{% include copy.html %}
+
+Expected output:
+
+| an |
+| --- |
+| 1 |
+| 6 |
+| 13 |
+| 18 |
+
+
+## Example 2: Rename multiple fields
+
+The following example PPL query shows how to use `rename` to rename multiple fields.
+
+```sql
+source=accounts
+| rename account_number as an, employer as emp
+| fields an, emp
+```
+{% include copy.html %}
+
+Expected output:
+
+| an | emp |
+| --- | --- |
+| 1 | Pyrami |
+| 6 | Netagy |
+| 13 | Quility |
+| 18 | null |
+
+
+## Example 3: Rename with wildcards
+
+The following example PPL query shows how to use `rename` to rename multiple fields using wildcard patterns.
+
+```sql
+source=accounts
+| rename *name as *_name
+| fields first_name, last_name
+```
+{% include copy.html %}
+
+Expected output:
+
+| first_name | last_name |
+| --- | --- |
+| Amber | Duke |
+| Hattie | Bond |
+| Nanette | Bates |
+| Dale | Adams |
+
+
+## Example 4: Rename with multiple wildcard patterns
+
+The following example PPL query shows how to use `rename` to rename multiple fields using multiple wildcard patterns.
+
+```sql
+source=accounts
+| rename *name as *_name, *_number as *number
+| fields first_name, last_name, accountnumber
+```
+{% include copy.html %}
+
+Expected output:
+
+| first_name | last_name | accountnumber |
+| --- | --- | --- |
+| Amber | Duke | 1 |
+| Hattie | Bond | 6 |
+| Nanette | Bates | 13 |
+| Dale | Adams | 18 |
+
+
+## Example 5: Rename existing field to existing field
+
+The following example PPL query shows how to use `rename` to rename an existing field to an existing field. The target field is removed and the source field is renamed to the target field.
+
+```sql
+source=accounts
+| rename firstname as age
+| fields age
+```
+{% include copy.html %}
+
+Expected output:
+
+| age |
+| --- |
+| Amber |
+| Hattie |
+| Nanette |
+| Dale |
+
+
+## Limitations
+
+The `rename` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
+Literal asterisk (`*`) characters in field names cannot be renamed because the asterisk is reserved for wildcard matching.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/replace.md b/_sql-and-ppl/ppl/cmd/replace.md
new file mode 100644
index 00000000000..4ec179d3109
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/replace.md
@@ -0,0 +1,301 @@
+---
+layout: default
+title: "replace"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 31
+---
+# replace
+
+
+The `replace` command replaces text in one or more fields in the search results. It supports literal string replacement and wildcard patterns using `*`.
+
+## Syntax
+
+Use the following syntax:
+
+`replace '<pattern>' WITH '<replacement>' [, '<pattern>' WITH '<replacement>']... IN <field-name>[, <field-name>]...`
+* `pattern`: mandatory. The text pattern you want to replace.
+* `replacement`: mandatory. The text you want to replace with.
+* `field-name`: mandatory. One or more field names where the replacement should occur.
+
+
+## Example 1: Replace text in one field
+
+The following example PPL query shows how to use `replace` to replace text in one field.
+
+```sql
+source=accounts
+| replace "IL" WITH "Illinois" IN state
+| fields state
+```
+{% include copy.html %}
+
+Expected output:
+
+| state |
+| --- |
+| Illinois |
+| TN |
+| VA |
+| MD |
+
+
+## Example 2: Replace text in multiple fields
+
+The following example PPL query shows how to use `replace` to replace text in multiple fields.
+
+```sql
+source=accounts
+| replace "IL" WITH "Illinois" IN state, address
+| fields state, address
+```
+{% include copy.html %}
+
+Expected output:
+
+| state | address |
+| --- | --- |
+| Illinois | 880 Holmes Lane |
+| TN | 671 Bristol Street |
+| VA | 789 Madison Street |
+| MD | 467 Hutchinson Court |
+
+
+## Example 3: Replace with other commands in a pipeline
+
+The following example PPL query shows how to use `replace` with other commands in a query pipeline.
+
+```sql
+source=accounts
+| replace "IL" WITH "Illinois" IN state
+| where age > 30
+| fields state, age
+```
+{% include copy.html %}
+
+Expected output:
+
+| state | age |
+| --- | --- |
+| Illinois | 32 |
+| TN | 36 |
+| MD | 33 |
+
+
+## Example 4: Replace with multiple pattern/replacement pairs
+
+The following example PPL query shows how to use `replace` with multiple pattern/replacement pairs in a single replace command. The replacements are applied sequentially.
+ +```sql +source=accounts +| replace "IL" WITH "Illinois", "TN" WITH "Tennessee" IN state +| fields state +``` +{% include copy.html %} + +Expected output: + +| state | +| --- | +| Illinois | +| Tennessee | +| VA | +| MD | + + +## Example 5: Pattern matching with LIKE and replace + +Since replace command only supports plain string literals, you can use LIKE command with replace for pattern matching needs. + +```sql +source=accounts +| where LIKE(address, '%Holmes%') +| replace "Holmes" WITH "HOLMES" IN address +| fields address, state, gender, age, city +``` +{% include copy.html %} + +Expected output: + +| address | state | gender | age | city | +| --- | --- | --- | --- | --- | +| 880 HOLMES Lane | IL | M | 32 | Brogan | + + +## Example 6: Wildcard suffix match + +Replace values that end with a specific pattern. The wildcard `*` matches any prefix. + +```sql +source=accounts +| replace "*IL" WITH "Illinois" IN state +| fields state +``` +{% include copy.html %} + +Expected output: + +| state | +| --- | +| Illinois | +| TN | +| VA | +| MD | + + +## Example 7: Wildcard prefix match + +Replace values that start with a specific pattern. The wildcard `*` matches any suffix. + +```sql +source=accounts +| replace "IL*" WITH "Illinois" IN state +| fields state +``` +{% include copy.html %} + +Expected output: + +| state | +| --- | +| Illinois | +| TN | +| VA | +| MD | + + +## Example 8: Wildcard capture and substitution + +Use wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement. + +```sql +source=accounts +| replace "* Lane" WITH "Lane *" IN address +| fields address +``` +{% include copy.html %} + +Expected output: + +| address | +| --- | +| Lane 880 Holmes | +| 671 Bristol Street | +| 789 Madison Street | +| 467 Hutchinson Court | + + +## Example 9: Multiple wildcards for pattern transformation + +Use multiple wildcards to transform patterns. Each wildcard in the replacement substitutes the corresponding captured value. + +```sql +source=accounts +| replace "* *" WITH "*_*" IN address +| fields address +``` +{% include copy.html %} + +Expected output: + +| address | +| --- | +| 880_Holmes Lane | +| 671_Bristol Street | +| 789_Madison Street | +| 467_Hutchinson Court | + + +## Example 10: Wildcard with zero wildcards in replacement + +When replacement has zero wildcards, all matching values are replaced with the literal replacement string. + +```sql +source=accounts +| replace "*IL*" WITH "Illinois" IN state +| fields state +``` +{% include copy.html %} + +Expected output: + +| state | +| --- | +| Illinois | +| TN | +| VA | +| MD | + + +## Example 11: Matching literal asterisks + +Use `\*` to match literal asterisk characters (`\*` = literal asterisk, `\\` = literal backslash). + +```sql +source=accounts +| eval note = 'price: *sale*' +| replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note +| fields note +``` +{% include copy.html %} + +Expected output: + +| note | +| --- | +| DISCOUNTED | +| DISCOUNTED | +| DISCOUNTED | +| DISCOUNTED | + + +## Example 12: Wildcard with no replacement wildcards + +Use wildcards in pattern but none in replacement to create a fixed output. 
+ +```sql +source=accounts +| eval test = 'prefix-value-suffix' +| replace 'prefix-*-suffix' WITH 'MATCHED' IN test +| fields test +``` +{% include copy.html %} + +Expected output: + +| test | +| --- | +| MATCHED | +| MATCHED | +| MATCHED | +| MATCHED | + + +## Example 13: Escaped asterisks with wildcards + +Combine escaped asterisks (literal) with wildcards for complex patterns. + +```sql +source=accounts +| eval label = 'file123.txt' +| replace 'file*.*' WITH '\**.*' IN label +| fields label +``` +{% include copy.html %} + +Expected output: + +| label | +| --- | +| *123.txt | +| *123.txt | +| *123.txt | +| *123.txt | + + +## Limitations + +* `Wildcards`: `*` matches zero or more characters (case-sensitive) +* Replacement wildcards must match pattern wildcard count, or be zero +* Escape sequences: `\*` (literal asterisk), `\\` (literal backslash) \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/reverse.md b/_sql-and-ppl/ppl/cmd/reverse.md new file mode 100644 index 00000000000..064b3f1e687 --- /dev/null +++ b/_sql-and-ppl/ppl/cmd/reverse.md @@ -0,0 +1,129 @@ +--- +layout: default +title: "reverse" +parent: "Commands" +grand_parent: "PPL" +nav_order: 32 +--- +# reverse + + +The `reverse` command reverses the display order of search results. The same results are returned, but in reverse order. + +## Syntax + +Use the following syntax: + +`reverse` +* No parameters: The reverse command takes no arguments or options. + + +## Note + +The `reverse` command processes the entire dataset. If applied directly to millions of records, it will consume significant memory resources on the coordinating node. Users should only apply the `reverse` command to smaller datasets, typically after aggregation operations. + +## Example 1: Basic reverse operation + +The following example PPL query shows how to use `reverse` to reverse the order of all documents. + +```sql +source=accounts +| fields account_number, age +| reverse +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | + + +## Example 2: Reverse with sort + +The following example PPL query shows how to use `reverse` to reverse results after sorting by age in ascending order, effectively giving descending order. + +```sql +source=accounts +| sort age +| fields account_number, age +| reverse +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | + + +## Example 3: Reverse with head + +The following example PPL query shows how to use `reverse` with head to get the last 2 records from the original order. + +```sql +source=accounts +| reverse +| head 2 +| fields account_number, age +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 6 | 36 | +| 18 | 33 | + + +## Example 4: Double reverse + +The following example PPL query demonstrates that applying reverse twice returns to the original order. + +```sql +source=accounts +| reverse +| reverse +| fields account_number, age +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 13 | 28 | +| 1 | 32 | +| 18 | 33 | +| 6 | 36 | + + +## Example 5: Reverse with complex pipeline + +The following example PPL query shows how to use `reverse` with filtering and field selection. 
+
+```sql
+source=accounts
+| where age > 30
+| fields account_number, age
+| reverse
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | age |
+| --- | --- |
+| 6 | 36 |
+| 18 | 33 |
+| 1 | 32 |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/rex.md b/_sql-and-ppl/ppl/cmd/rex.md
new file mode 100644
index 00000000000..6fc6267abe0
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/rex.md
@@ -0,0 +1,277 @@
+---
+layout: default
+title: "rex"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 33
+---
+# rex
+
+
+The `rex` command extracts fields from a raw text field using regular expression named capture groups.
+
+## Syntax
+
+Use the following syntax:
+
+`rex [mode=<extract|sed>] field=<field> <pattern> [max_match=<int>] [offset_field=<string>]`
+* `field`: mandatory. The string field to extract data from.
+* `pattern`: mandatory string. The regular expression pattern with named capture groups used to extract new fields. The pattern must contain at least one named capture group using `(?<name>pattern)` syntax.
+* `mode`: optional. Either `extract` or `sed`. **Default:** extract
+  * **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior.
+  * **sed mode**: Performs text substitution on the field using sed-style patterns
+    * `s/pattern/replacement/` - Replace first occurrence
+    * `s/pattern/replacement/g` - Replace all occurrences (global; see the sketch after Example 7)
+    * `s/pattern/replacement/n` - Replace only the nth occurrence (where n is a number)
+    * `y/from_chars/to_chars/` - Character-by-character transliteration
+    * Backreferences: `\1`, `\2`, etc. reference captured groups in the replacement
+* `max_match`: optional integer. Maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable through `plugins.ppl.rex.max_match.limit`). **Default:** 1.
+* `offset_field`: optional string. Field name to store the character offset positions of matches. Only available in extract mode.
+
+
+## Example 1: Basic field extraction
+
+The following example PPL query shows how to use `rex` to extract username and domain from email addresses using named capture groups. Both extracted fields are returned as string type.
+
+```sql
+source=accounts
+| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)"
+| fields email, username, domain
+| head 2
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | username | domain |
+| --- | --- | --- |
+| amberduke@pyrami.com | amberduke | pyrami |
+| hattiebond@netagy.com | hattiebond | netagy |
+
+
+## Example 2: Handling non-matching patterns
+
+The following example PPL query shows that the rex command returns all events, setting extracted fields to null for non-matching patterns. Extracted fields would be string type when matches are found.
+
+```sql
+source=accounts
+| rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)"
+| fields email, user, domain
+| head 2
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | user | domain |
+| --- | --- | --- |
+| amberduke@pyrami.com | null | null |
+| hattiebond@netagy.com | null | null |
+
+
+## Example 3: Multiple matches with max_match
+
+The following example PPL query shows how to use `rex` to extract multiple words from the address field using the max_match parameter. The extracted field is returned as an array type containing string elements.
+
+```sql
+source=accounts
+| rex field=address "(?<words>[A-Za-z]+)" max_match=2
+| fields address, words
+| head 3
+```
+{% include copy.html %}
+
+Expected output:
+
+| address | words |
+| --- | --- |
+| 880 Holmes Lane | [Holmes,Lane] |
+| 671 Bristol Street | [Bristol,Street] |
+| 789 Madison Street | [Madison,Street] |
+
+
+## Example 4: Text replacement with mode=sed
+
+The following example PPL query shows how to use `rex` to replace email domains using sed mode for text substitution. The rewritten field is returned as string type.
+
+```sql
+source=accounts
+| rex field=email mode=sed "s/@.*/@company.com/"
+| fields email
+| head 2
+```
+{% include copy.html %}
+
+Expected output:
+
+| email |
+| --- |
+| amberduke@company.com |
+| hattiebond@company.com |
+
+
+## Example 5: Using offset_field
+
+The following example PPL query shows how to use `rex` to track the character positions where matches occur. Extracted fields are string type, and the offset_field is also string type.
+
+```sql
+source=accounts
+| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" offset_field=matchpos
+| fields email, username, domain, matchpos
+| head 2
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | username | domain | matchpos |
+| --- | --- | --- | --- |
+| amberduke@pyrami.com | amberduke | pyrami | domain=10-15&username=0-8 |
+| hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 |
+
+
+## Example 6: Complex email pattern
+
+The following example PPL query shows how to use `rex` to extract comprehensive email components including the top-level domain. All extracted fields are returned as string type.
+
+```sql
+source=accounts
+| rex field=email "(?<user>[a-zA-Z0-9._%+-]+)@(?<domain>[a-zA-Z0-9.-]+)\\.(?<tld>[a-zA-Z]{2,})"
+| fields email, user, domain, tld
+| head 2
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | user | domain | tld |
+| --- | --- | --- | --- |
+| amberduke@pyrami.com | amberduke | pyrami | com |
+| hattiebond@netagy.com | hattiebond | netagy | com |
+
+
+## Example 7: Chaining multiple rex commands
+
+The following example PPL query shows how to use `rex` to extract initial letters from both first and last names. All extracted fields are returned as string type.
+
+```sql
+source=accounts
+| rex field=firstname "(?<firstinitial>^.)"
+| rex field=lastname "(?<lastinitial>^.)"
+| fields firstname, lastname, firstinitial, lastinitial
+| head 3
+```
+{% include copy.html %}
+
+Expected output:
+
+| firstname | lastname | firstinitial | lastinitial |
+| --- | --- | --- | --- |
+| Amber | Duke | A | D |
+| Hattie | Bond | H | B |
+| Nanette | Bates | N | B |
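+
+
+The `g` flag from the sed-mode list in the syntax section replaces every occurrence rather than only the first. The following is a sketch only and is not output-verified; it should mask every digit in `address`, so `880 Holmes Lane` would become `### Holmes Lane`.
+
+```sql
+source=accounts
+| rex field=address mode=sed "s/[0-9]/#/g"
+| fields address
+| head 1
+```
+{% include copy.html %}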
+
+
+## Example 8: Named capture group limitations
+
+The following example PPL query demonstrates naming restrictions for capture groups. Group names cannot contain underscores due to Java regex limitations.
+Invalid PPL query with underscores:
+
+```sql
+source=accounts
+| rex field=email "(?<user_name>[^@]+)@(?<email_domain>[^.]+)"
+| fields email, user_name, email_domain
+```
+{% include copy.html %}
+
+Expected output:
+
+```text
+{'reason': 'Invalid Query', 'details': "Invalid capture group name 'user_name'. Java regex group names must start with a letter and contain only letters and digits.", 'type': 'IllegalArgumentException'}
+Error: Query returned no data
+```
+
+Correct PPL query without underscores:
+
+```sql
+source=accounts
+| rex field=email "(?<username>[^@]+)@(?<emaildomain>[^.]+)"
+| fields email, username, emaildomain
+| head 2
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | username | emaildomain |
+| --- | --- | --- |
+| amberduke@pyrami.com | amberduke | pyrami |
+| hattiebond@netagy.com | hattiebond | netagy |
+
+
+## Example 9: Max match limit protection
+
+The following example PPL query demonstrates the max_match limit protection mechanism. When max_match=0 (unlimited) is specified, the system automatically caps it to prevent memory exhaustion.
+PPL query with max_match=0, automatically capped to the default limit of 10:
+
+```sql
+source=accounts
+| rex field=address "(?<digit>\\d*)" max_match=0
+| eval digit_count=array_length(digit)
+| fields address, digit_count
+| head 1
+```
+{% include copy.html %}
+
+Expected output:
+
+| address | digit_count |
+| --- | --- |
+| 880 Holmes Lane | 10 |
+
+PPL query exceeding the configured limit results in an error:
+
+```sql
+source=accounts
+| rex field=address "(?<digit>\\d*)" max_match=100
+| fields address, digit
+| head 1
+```
+{% include copy.html %}
+
+Expected output:
+
+```text
+{'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'}
+Error: Query returned no data
+```
+
+
+## Comparison with related commands
+
+| Feature | rex | parse |
+| --- | --- | --- |
+| Pattern Type | Java Regex | Java Regex |
+| Named Groups Required | Yes | Yes |
+| Multiple Named Groups | Yes | No |
+| Multiple Matches | Yes | No |
+| Text Substitution | Yes | No |
+| Offset Tracking | Yes | No |
+| Special Characters in Group Names | No | No |
+
+
+## Limitations
+
+**Named Capture Group Naming:**
+* Group names must start with a letter and contain only letters and digits
+* For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html)
+
+**Pattern Requirements:**
+* The pattern must contain at least one named capture group
+* Regular capture groups `(...)` without names are not allowed
+
+**Max Match Limit:**
+* The `max_match` parameter is subject to a configurable system limit to prevent memory exhaustion
+* When `max_match=0` (unlimited) is specified, it is automatically capped at the configured limit (default: 10)
+* User-specified values exceeding the configured limit will result in an error
+* Users can adjust the limit through the `plugins.ppl.rex.max_match.limit` cluster setting. Setting this limit to a large value is not recommended because it can lead to excessive memory consumption, especially with patterns that match empty strings (e.g., `\d*`, `\w*`)
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/search.md b/_sql-and-ppl/ppl/cmd/search.md
new file mode 100644
index 00000000000..133db3d6cf4
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/search.md
@@ -0,0 +1,653 @@
+---
+layout: default
+title: "search"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 34
+---
+# search
+
+
+The `search` command retrieves documents from the index and can only be used as the first command in a PPL query.
+
+## Syntax
+
+Use the following syntax:
+
+`search source=[<cluster>:]<index> [search-expression]`
+* `search`: the search keyword, which can be omitted.
+* `index`: mandatory. The search command must specify which index to query. The index name can be prefixed by "\<cluster\>:" for cross-cluster search.
+* `search-expression`: optional. Search expression that gets converted to the OpenSearch [query_string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) function, which uses [Lucene Query Syntax](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html).
+
+
+## Search expression
+
+The search expression syntax supports:
+* **Full text search**: `error` or `"error message"` - Searches the default field configured by the `index.query.default_field` setting (defaults to `*`, which searches all fields)
+* **Field-value comparisons**: `field=value`, `field!=value`, `field>value`, `field>=value`, `field<value`, `field<=value`
+* **Time modifiers**: `earliest=<time>` and `latest=<time>` - Bound results by the time field; values may be `now`, an absolute timestamp, a Unix timestamp, or `[+|-]<time_amount>[@<snap_unit>]` - Time offset from current time
+
+**Relative Time Components**:
+* **Time offset**: `+` (future) or `-` (past)
+* **Time amount**: Numeric value + time unit (`second`, `minute`, `hour`, `day`, `week`, `month`, `year`, and their variants)
+* **Snap to unit**: Optional `@<unit>` to round to the nearest unit (hour, day, month, etc.)
+
+**Examples of Time Modifier Values**:
+* `earliest=now` - From current time
+* `latest='2024-12-31 23:59:59'` - Until a specific date
+* `earliest=-7d` - From 7 days ago
+* `latest='+1d@d'` - Until tomorrow at start of day
+* `earliest='-1month@month'` - From start of previous month
+* `latest=1754020061` - Until a Unix timestamp (August 1, 2025 03:47:41 UTC)
+
+For more details on time modifiers, see the [PPL relative_timestamp documentation](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/ppl-lang/functions/ppl-datetime.md#relative_timestamp).
+**Notes:**
+* **Column name conflicts**: If your data contains columns named "earliest" or "latest", use backticks to access them as regular fields (e.g., `` `earliest`="value"``) to avoid conflicts with time modifier syntax.
+* **Time snap syntax**: Time modifiers with chained time offsets must be wrapped in quotes (e.g., `latest='+1d@month-10h'`) for proper query parsing.
+
+
+## Default field configuration
+
+When you search without specifying a field, the query runs against the default field configured by the `index.query.default_field` index setting (defaults to `*`, which searches all fields).
+You can check or modify the default field setting:
+    GET /accounts/_settings/index.query.default_field
+    PUT /accounts/_settings
+    {
+      "index.query.default_field": "firstname,lastname,email"
+    }
+
+## Field types and search behavior
+
+**Text Fields**: Full-text search, phrase search
+* `search message="error occurred" source=logs`
+* `Limitations`: Wildcards apply to terms after analysis, not to the entire field value.
+ +**Keyword Fields**: Exact matching, wildcard patterns +* `search status="ACTIVE" source=logs` +* `Limitations`: No text analysis, case-sensitive matching + +**Numeric Fields**: Range queries, exact matching, IN operator +* `search age>=18 AND balance<50000 source=accounts` +* `Limitations`: No wildcard or text search support + +**Date Fields**: Range queries, exact matching, IN operator +* `search timestamp>="2024-01-01" source=logs` +* `Limitations`: Must use index mapping date format, no wildcards + +**Boolean Fields**: true/false values only, exact matching, IN operator +* `search active=true source=users` +* `Limitations`: No wildcards or range queries + +**IP Fields**: Exact matching, CIDR notation +* `search client_ip="192.168.1.0/24" source=logs` +* `Limitations`: No wildcards for partial IP matching. For wildcard search use multi field with keyword: `search ip_address.keyword='1*' source=logs` or WHERE clause: `source=logs | where cast(ip_address as string) like '1%'` + +**Field Type Performance Tips**: + * Each field type has specific search capabilities and limitations. Using the wrong field type during ingestion impacts performance and accuracy + * For wildcard searches on non-keyword fields: Add a keyword field copy for better performance. Example: If you need wildcards on a text field, create `message.keyword` alongside `message` + + +## Cross-cluster search + +Cross-cluster search lets any node in a cluster execute search requests against other clusters. Refer to [Cross-Cluster Search]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/admin/cross_cluster_search/) for configuration. + +## Example 1: Text search + +**Basic Text Search** (unquoted single term) + +```sql +search ERROR source=otellogs +| sort @timestamp +| fields severityText, body +| head 1 +``` +{% include copy.html %} + +Expected output: + +| severityText | body | +| --- | --- | +| ERROR | Payment failed: Insufficient funds for user@example.com | + +**Phrase Search** (requires quotes for multi-word exact match) + +```sql +search "Payment failed" source=otellogs +| fields body +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + +**Implicit AND with Multiple Terms** (unquoted literals are combined with AND) + +```sql +search user email source=otellogs +| sort @timestamp +| fields body +| head 1 +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Executing SQL: SELECT * FROM users WHERE email LIKE '%@gmail.com' AND status != 'deleted' ORDER BY created_at DESC | + +Note: `search user email` is equivalent to `search user AND email`. Multiple unquoted terms are automatically combined with AND. +**Enclose in double quotes for terms which contain special characters** + +```sql +search "john.doe+newsletter@company.com" source=otellogs +| fields body +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Email notification sent to john.doe+newsletter@company.com with subject: 'Welcome! Your order #12345 is confirmed' | + +### Mixed phrase and boolean + +```sql +search "User authentication" OR OAuth2 source=otellogs +| sort @timestamp +| fields body +| head 1 +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| [2024-01-15 10:30:09] production.INFO: User authentication successful for admin@company.org using OAuth2 | + + +## Example 2: Boolean logic and operator precedence + +The following examples demonstrate boolean operators and precedence. 
+ +### Boolean operators + +```sql +search severityText="ERROR" OR severityText="FATAL" source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +Expected output: + +| severityText | +| --- | +| ERROR | +| FATAL | +| ERROR | + +```sql +search severityText="INFO" AND `resource.attributes.service.name`="cart-service" source=otellogs +| fields body +| head 1; +``` +{% include copy.html %} + +Expected output + +| body | +| --- | +| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | + +**Operator Precedence** (highest to lowest): Parentheses → NOT → OR → AND + +```sql +search severityText="ERROR" OR severityText="WARN" AND severityNumber>15 source=otellogs +| sort @timestamp +| fields severityText, severityNumber +| head 2 +``` +{% include copy.html %} + +Expected output: + +| severityText | severityNumber | +| --- | --- | +| ERROR | 17 | +| ERROR | 17 | + +The preceding expression evaluates as `(severityText="ERROR" OR severityText="WARN") AND severityNumber>15` + +## Example 3: NOT compared to != Semantics + +**!= operator** (field must exist and not equal the value) + +```sql +search employer!="Quility" source=accounts +``` +{% include copy.html %} + +Expected output: + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | + +**NOT operator** (excludes matching conditions, includes null fields) + +```sql +search NOT employer="Quility" source=accounts +``` +{% include copy.html %} + +Expected output: + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | + +**Key difference**: `!=` excludes null values, `NOT` includes them. +Dale Adams (account 18) has `employer=null`. He appears in `NOT employer="Quility"` but not in `employer!="Quility"`. + +## Example 4: Wildcards + +The following examples demonstrate wildcard pattern matching. + +### Wildcard patterns + +```sql +search severityText=ERR* source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +Expected output: + +| severityText | +| --- | +| ERROR | +| ERROR | +| ERROR2 | + +```sql +search body=user* source=otellogs +| sort @timestamp +| fields body +| head 2; +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | +| Payment failed: Insufficient funds for user@example.com | + +**Wildcard Rules**: +* `*` - Matches zero or more characters +* `?` - Matches exactly one character + +### Single character wildcard (?) + +```sql +search severityText="INFO?" 
source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +Expected output: + +| severityText | +| --- | +| INFO2 | +| INFO3 | +| INFO4 | + + +## Example 5: Range queries + +Use comparison operators (>, <, >=, <=) to filter numeric and date fields within specific ranges. Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics. + +```sql +search severityNumber>15 AND severityNumber<=20 source=otellogs +| sort @timestamp +| fields severityNumber +| head 3 +``` +{% include copy.html %} + +Expected output: + +| severityNumber | +| --- | +| 17 | +| 17 | +| 18 | + +```sql +search `attributes.payment.amount`>=1000.0 AND `attributes.payment.amount`<=2000.0 source=otellogs +| fields body; +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + + +## Example 6: Field search with Wildcards + +When searching in text or keyword fields, wildcards enable partial matching. This is particularly useful for finding records where you only know part of the value. Note that wildcards work best with keyword fields, while text fields may produce unexpected results due to tokenization. +**Partial Search in Keyword Fields** + +```sql +search employer=Py* source=accounts +| fields firstname, employer +``` +{% include copy.html %} + +Expected output: + +| firstname | employer | +| --- | --- | +| Amber | Pyrami | + +### Combining wildcards with field comparisons + +```sql +search firstname=A* AND age>30 source=accounts +| fields firstname, age, city +``` +{% include copy.html %} + +Expected output: + +| firstname | age | city | +| --- | --- | --- | +| Amber | 32 | Brogan | + +**Important Notes on Wildcard Usage**: +* **Keyword fields**: Best for wildcard searches - exact value matching with pattern support +* **Text fields**: Wildcards apply to individual tokens after analysis, not the entire field value +* **Performance**: Leading wildcards (e.g., `*@example.com`) are slower than trailing wildcards +* **Case sensitivity**: Keyword field wildcards are case-sensitive unless normalized during indexing + + +## Example 7: IN operator and field comparisons + +The IN operator efficiently checks if a field matches any value from a list. This is cleaner and more performant than chaining multiple OR conditions for the same field. +**IN Operator** + +```sql +search severityText IN ("ERROR", "WARN", "FATAL") source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +Expected output: + +| severityText | +| --- | +| ERROR | +| WARN | +| FATAL | + +### Field comparison examples + +```sql +search severityNumber=17 source=otellogs +| sort @timestamp +| fields body +| head 1 +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + +```sql +search `attributes.user.email`="user@example.com" source=otellogs +| fields body; +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + + +## Example 8: Complex expressions + +Combine multiple conditions using boolean operators and parentheses to create sophisticated search queries. 
+ +```sql +search (severityText="ERROR" OR severityText="WARN") AND severityNumber>10 source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +Expected output: + +| severityText | +| --- | +| ERROR | +| WARN | +| ERROR | + +```sql +search `attributes.user.email`="user@example.com" OR (`attributes.error.code`="INSUFFICIENT_FUNDS" AND severityNumber>15) source=otellogs +| fields body; +``` +{% include copy.html %} + +Expected output: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + + +## Example 9: Time modifiers + +Time modifiers filter search results by time range using the implicit `@timestamp` field. They support various time formats for precise temporal filtering. +**Absolute Time Filtering** + +```sql +search earliest='2024-01-15 10:30:05' latest='2024-01-15 10:30:10' source=otellogs +| fields @timestamp, severityText +``` +{% include copy.html %} + +Expected output: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:05.678901234 | FATAL | +| 2024-01-15 10:30:06.789012345 | TRACE | +| 2024-01-15 10:30:07.890123456 | ERROR | +| 2024-01-15 10:30:08.901234567 | WARN | +| 2024-01-15 10:30:09.012345678 | INFO | +| 2024-01-15 10:30:10.123456789 | TRACE2 | + +**Relative Time Filtering** (before 30 seconds ago) + +```sql +search latest=-30s source=otellogs +| sort @timestamp +| fields @timestamp, severityText +| head 3 +``` +{% include copy.html %} + +Expected output: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | +| 2024-01-15 10:30:02.345678901 | WARN | + +**Time Snapping** (before start of current minute) + +```sql +search latest='@m' source=otellogs +| fields @timestamp, severityText +| head 2 +``` +{% include copy.html %} + +Expected output: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | + +### Unix timestamp filtering + +```sql +search earliest=1705314600 latest=1705314605 source=otellogs +| fields @timestamp, severityText +``` +{% include copy.html %} + +Expected output: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | +| 2024-01-15 10:30:02.345678901 | WARN | +| 2024-01-15 10:30:03.456789012 | DEBUG | +| 2024-01-15 10:30:04.567890123 | INFO | + + +## Example 10: Special characters and Escaping + +Understand when and how to escape special characters in your search queries. There are two categories of characters that need escaping: +**Characters that must be escaped**: +* **Backslashes (\)**: Always escape as `\\` to search for literal backslash +* **Quotes (")**: Escape as `\"` when inside quoted strings + +**Wildcard characters (escape only to search literally)**: +* **Asterisk (*)**: Use as-is for wildcard, escape as `\\*` to search for literal asterisk +* **Question mark (?)**: Use as-is for wildcard, escape as `\\?` to search for literal question mark + + +| Intent | PPL syntax | Result | +|--------|------------|--------| +| Wildcard search | `field=user*` | Matches "user", "user123", "userABC" | +| Literal "user*" | `field="user\\*"` | Matches only "user*" | +| Wildcard search | `field=log?` | Matches "log1", "logA", "logs" | +| Literal "log?" | `field="log\\?"` | Matches only "log?" 
|
+
+
+**Backslash in file paths**
+
+```sql
+search `attributes.error.type`="C:\\\\Users\\\\admin" source=otellogs
+| fields `attributes.error.type`
+```
+{% include copy.html %}
+
+Expected output:
+
+| attributes.error.type |
+| --- |
+| C:\Users\admin |
+
+Note: Each backslash in the search value needs to be escaped with another backslash. When using the REST API with JSON, additional JSON escaping is required.
+
+**Quotes within strings**
+
+```sql
+search body="\"exact phrase\"" source=otellogs
+| sort @timestamp
+| fields body
+| head 1
+```
+{% include copy.html %}
+
+Expected output:
+
+| body |
+| --- |
+| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] |
+
+**Text with special characters**
+
+```sql
+search "wildcard\\* fuzzy~2" source=otellogs
+| fields body
+| head 1
+```
+{% include copy.html %}
+
+Expected output:
+
+| body |
+| --- |
+| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] |
+
+
+## Example 11: Fetch all data
+
+Retrieve all documents from an index by specifying only the source without any search conditions. This is useful for exploring small datasets or verifying data ingestion.
+
+```sql
+source=accounts
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |
+| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond |
+| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates |
+| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/showdatasources.md b/_sql-and-ppl/ppl/cmd/showdatasources.md
new file mode 100644
index 00000000000..57c1b85cf49
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/showdatasources.md
@@ -0,0 +1,39 @@
+---
+layout: default
+title: "showdatasources"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 35
+---
+# show datasources
+
+
+The `show datasources` command queries datasources configured in the PPL engine. The `show datasources` command can only be used as the first command in a PPL query.
+
+## Syntax
+
+Use the following syntax:
+
+`show datasources`
+
+## Example 1: Fetch all PROMETHEUS datasources
+
+The following example PPL query shows how to use `show datasources` to fetch all the datasources of type PROMETHEUS.
+
+```sql
+show datasources
+| where CONNECTOR_TYPE='PROMETHEUS'
+```
+{% include copy.html %}
+
+Expected output:
+
+| DATASOURCE_NAME | CONNECTOR_TYPE |
+| --- | --- |
+| my_prometheus | PROMETHEUS |
+
+
+## Limitations
+
+The `show datasources` command works only with `plugins.calcite.enabled=false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/sort.md b/_sql-and-ppl/ppl/cmd/sort.md
new file mode 100644
index 00000000000..e9d1be9f89f
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/sort.md
@@ -0,0 +1,234 @@
+---
+layout: default
+title: "sort"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 36
+---
+# sort
+
+
+The `sort` command sorts all the search results by the specified fields.
+ +## Syntax + +Use the following syntax: + +`sort [count] <[+|-] sort-field | sort-field [asc|a|desc|d]>...` +* `count`: optional. The number of results to return. Specifying a count of 0 or less than 0 returns all results. **Default:** 0. +* `[+|-]`: optional. The plus [+] stands for ascending order and NULL/MISSING first and a minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. +* `[asc|a|desc|d]`: optional. asc/a stands for ascending order and NULL/MISSING first. desc/d stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. +* `sort-field`: mandatory. The field used to sort. Can use `auto(field)`, `str(field)`, `ip(field)`, or `num(field)` to specify how to interpret field values. + +> **Note:** +> You cannot mix +/- and asc/desc in the same sort command. Choose one approach for all fields in a single sort command. +> +> + +## Example 1: Sort by one field + +The following example PPL query shows how to use `sort` to sort all documents by age field in ascending order. + +```sql +source=accounts +| sort age +| fields account_number, age +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 13 | 28 | +| 1 | 32 | +| 18 | 33 | +| 6 | 36 | + + +## Example 2: Sort by one field return all the result + +The following example PPL query shows how to use `sort` to sort all documents by age field in ascending order and return all results. + +```sql +source=accounts +| sort 0 age +| fields account_number, age +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 13 | 28 | +| 1 | 32 | +| 18 | 33 | +| 6 | 36 | + + +## Example 3: Sort by one field in descending order (using -) + +The following example PPL query shows how to use `sort` to sort all documents by age field in descending order. + +```sql +source=accounts +| sort - age +| fields account_number, age +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | + + +## Example 4: Sort by one field in descending order (using desc) + +The following example PPL query shows how to use `sort` to sort all documents by the age field in descending order using the desc keyword. + +```sql +source=accounts +| sort age desc +| fields account_number, age +``` +{% include copy.html %} + +Expected output: + +| account_number | age | +| --- | --- | +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | + + +## Example 5: Sort by multiple fields (using +/-) + +The following example PPL query shows how to use `sort` to sort all documents by gender field in ascending order and age field in descending order using +/- operators. + +```sql +source=accounts +| sort + gender, - age +| fields account_number, gender, age +``` +{% include copy.html %} + +Expected output: + +| account_number | gender | age | +| --- | --- | --- | +| 13 | F | 28 | +| 6 | M | 36 | +| 18 | M | 33 | +| 1 | M | 32 | + + +## Example 6: Sort by multiple fields (using asc/desc) + +The following example PPL query shows how to use `sort` to sort all documents by the gender field in ascending order and age field in descending order using asc/desc keywords. 
+
+```sql
+source=accounts
+| sort gender asc, age desc
+| fields account_number, gender, age
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | gender | age |
+| --- | --- | --- |
+| 13 | F | 28 |
+| 6 | M | 36 |
+| 18 | M | 33 |
+| 1 | M | 32 |
+
+
+## Example 7: Sort by a field that includes null values
+
+The following example PPL query shows how to use `sort` to sort the employer field using the default option (ascending order and null first). The result shows that the null value appears in the first row.
+
+```sql
+source=accounts
+| sort employer
+| fields employer
+```
+{% include copy.html %}
+
+Expected output:
+
+| employer |
+| --- |
+| null |
+| Netagy |
+| Pyrami |
+| Quility |
+
+
+## Example 8: Specify the number of sorted documents to return
+
+The following example PPL query shows how to use `sort` to sort all documents and return 2 documents.
+
+```sql
+source=accounts
+| sort 2 age
+| fields account_number, age
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | age |
+| --- | --- |
+| 13 | 28 |
+| 1 | 32 |
+
+
+## Example 9: Sort with desc modifier
+
+The following example PPL query shows how to use `sort` with the desc modifier to reverse the sort order.
+
+```sql
+source=accounts
+| sort age desc
+| fields account_number, age
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | age |
+| --- | --- |
+| 6 | 36 |
+| 18 | 33 |
+| 1 | 32 |
+| 13 | 28 |
+
+
+## Example 10: Sort with a specified field type
+
+The following example PPL query shows how to use `sort` with str() to sort numeric values lexicographically.
+
+```sql
+source=accounts
+| sort str(account_number)
+| fields account_number
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number |
+| --- |
+| 1 |
+| 13 |
+| 18 |
+| 6 |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/spath.md b/_sql-and-ppl/ppl/cmd/spath.md
new file mode 100644
index 00000000000..2c37c6a13f8
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/spath.md
@@ -0,0 +1,108 @@
+---
+layout: default
+title: "spath"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 37
+---
+# spath
+
+
+The `spath` command extracts fields from structured text data. It currently supports selecting from JSON data using JSON paths.
+
+## Syntax
+
+Use the following syntax:
+
+`spath input=<field> [output=<field>] [path=<path>]`
+* `input`: mandatory. The field to scan for JSON data.
+* `output`: optional. The destination field that the data will be loaded into. **Default:** value of `path`.
+* `path`: mandatory. The path of the data to load for the object. For more information about path syntax, see [json_extract]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/json#json_extract).
+
+
+## Note
+
+The `spath` command currently does not support pushdown behavior for extraction, so it can be slow on large datasets. It's generally better to index the fields needed for filtering directly instead of using `spath` to filter nested fields.
+
+## Example 1: Simple field extraction
+
+The simplest use of `spath` is extracting a single field. This example extracts `n` from the `doc_n` field of type `text`.
+
+```sql
+source=structured
+| spath input=doc_n n
+| fields doc_n n
+```
+{% include copy.html %}
+
+Expected output:
+
+| doc_n | n |
+| --- | --- |
+| {"n": 1} | 1 |
+| {"n": 2} | 2 |
+| {"n": 3} | 3 |
+
+
+## Example 2: Lists and nesting
+
+The following example PPL query demonstrates more JSON path uses, like traversing nested fields and extracting list elements.
+
+```sql
+source=structured
+| spath input=doc_list output=first_element list{0}
+| spath input=doc_list output=all_elements list{}
+| spath input=doc_list output=nested nest_out.nest_in
+| fields doc_list first_element all_elements nested
+```
+{% include copy.html %}
+
+Expected output:
+
+| doc_list | first_element | all_elements | nested |
+| --- | --- | --- | --- |
+| {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a |
+| {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a |
+| {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a |
+
+
+## Example 3: Sum of inner elements
+
+The following example PPL query shows how to use `spath` to extract an inner field and compute statistics on it, using the documents from Example 1. It also demonstrates that `spath` always returns strings for inner types.
+
+```sql
+source=structured
+| spath input=doc_n n
+| eval n=cast(n as int)
+| stats sum(n)
+| fields `sum(n)`
+```
+{% include copy.html %}
+
+Expected output:
+
+| sum(n) |
+| --- |
+| 6 |
+
+
+## Example 4: Escaped paths
+
+`spath` accepts quoted path strings, supporting any path that `json_extract` does. This includes escaping complex field names as bracketed components.
+
+```sql
+source=structured
+| spath output=a input=doc_escape "['a fancy field name']"
+| spath output=b input=doc_escape "['a.b.c']"
+| fields a b
+```
+{% include copy.html %}
+
+Expected output:
+
+| a | b |
+| --- | --- |
+| true | 0 |
+| true | 1 |
+| false | 2 |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/stats.md b/_sql-and-ppl/ppl/cmd/stats.md
new file mode 100644
index 00000000000..8228c7acefa
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/stats.md
@@ -0,0 +1,464 @@
+---
+layout: default
+title: "stats"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 38
+---
+# stats
+
+
+The `stats` command calculates aggregations over the search results.
+
+## Syntax
+
+Use the following syntax:
+
+`stats [bucket_nullable=bool] <aggregation>... [by-clause]`
+* `aggregation`: mandatory. An aggregation function.
+* `bucket_nullable`: optional. Controls whether the stats command includes null buckets in group-by aggregations. When set to `false`, the aggregation ignores records where the group-by field is null, resulting in faster performance by excluding null buckets. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
+  * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true`
+  * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false`
+* `by-clause`: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** If no by-clause is specified, the stats command returns only one row, which is the aggregation over the entire search results.
+* `span-expression`: optional, at most one. Splits a field into buckets by intervals. Syntax: span(field_expr, interval_expr). The unit of the interval expression is the natural unit by default. If the field is a date/time type field, the aggregation results always ignore the null bucket. For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets.
+  * Available time units
+    * millisecond (ms)
+    * second (s)
+    * minute (m, case sensitive)
+    * hour (h)
+    * day (d)
+    * week (w)
+    * month (M, case sensitive)
+    * quarter (q)
+    * year (y)
+
+
+## Aggregation functions
+
+The stats command supports the following aggregation functions:
+* `COUNT`/`C`: Count of values
+* `SUM`: Sum of numeric values
+* `AVG`: Average of numeric values
+* `MAX`: Maximum value
+* `MIN`: Minimum value
+* `VAR_SAMP`: Sample variance
+* `VAR_POP`: Population variance
+* `STDDEV_SAMP`: Sample standard deviation
+* `STDDEV_POP`: Population standard deviation
+* `DISTINCT_COUNT`/`DC`: Distinct count of values
+* `DISTINCT_COUNT_APPROX`: Approximate distinct count
+* `TAKE`: List of original values
+* `PERCENTILE`/`PERCENTILE_APPROX`: Percentile calculations
+* `PERC<percentile>`/`P<percentile>`: Percentile shortcut functions
+* `MEDIAN`: 50th percentile
+* `EARLIEST`: Earliest value by timestamp
+* `LATEST`: Latest value by timestamp
+* `FIRST`: First non-null value
+* `LAST`: Last non-null value
+* `LIST`: Collect all values into an array
+* `VALUES`: Collect unique values into a sorted array
+
+For detailed documentation of each function, see [Aggregation Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/aggregations/).
+
+## Limitations
+
+The following limitations apply to the `stats` command.
+
+### Bucket aggregation results may be approximate on large datasets
+
+In OpenSearch, `doc_count` values for a terms bucket aggregation may be approximate. As a result, any aggregations (such as `sum` and `avg`) on the terms bucket aggregation may also be approximate.
+For example, the following PPL query (which finds the top 10 URLs) may return an approximate result if the cardinality of `URL` is high.
+
+```sql
+source=hits
+| stats bucket_nullable=false count() as c by URL
+| sort - c
+| head 10
+```
+{% include copy.html %}
+
+This query is pushed down to a terms bucket aggregation DSL query with `"order": { "_count": "desc" }`. In OpenSearch, this terms aggregation may discard some buckets.
+
+### Sorting by ascending doc_count may produce inaccurate results
+
+Similar to the preceding PPL query, the following query (which finds the 10 rarest URLs) often produces inaccurate results.
+
+```sql
+source=hits
+| stats bucket_nullable=false count() as c by URL
+| sort + c
+| head 10
+```
+{% include copy.html %}
+
+A term that is globally infrequent might not appear as infrequent on every individual shard or might be entirely absent from the least frequent results returned by some shards. Conversely, a term that appears infrequently on one shard might be common on another. In both scenarios, rare terms can be missed during shard-level aggregation, resulting in incorrect overall results.
+
+## Example 1: Calculate the count of events
+
+The following example PPL query shows how to use `stats` to calculate the count of events in the accounts index.
+
+```sql
+source=accounts
+| stats count()
+```
+{% include copy.html %}
+
+Expected output:
+
+| count() |
+| --- |
+| 4 |
+
+
+## Example 2: Calculate the average of a field
+
+The following example PPL query shows how to use `stats` to calculate the average age of all the accounts.
+
+```sql
+source=accounts
+| stats avg(age)
+```
+{% include copy.html %}
+
+Expected output:
+
+| avg(age) |
+| --- |
+| 32.25 |
+
+
+## Example 3: Calculate the average of a field by group
+
+The following example PPL query shows how to use `stats` to calculate the average age of all the accounts, grouped by gender.
+ +```sql +source=accounts +| stats avg(age) by gender +``` +{% include copy.html %} + +Expected output: + +| avg(age) | gender | +| --- | --- | +| 28.0 | F | +| 33.666666666666664 | M | + + +## Example 4: Calculate the average, sum and count of a field by group + +The following example PPL query shows how to use `stats` to calculate the average age, sum age and count of events of all the accounts group by gender. + +```sql +source=accounts +| stats avg(age), sum(age), count() by gender +``` +{% include copy.html %} + +Expected output: + +| avg(age) | sum(age) | count() | gender | +| --- | --- | --- | --- | +| 28.0 | 28 | 1 | F | +| 33.666666666666664 | 101 | 3 | M | + + +## Example 5: Calculate the maximum of a field + +The example calculates the max age of all the accounts. + +```sql +source=accounts +| stats max(age) +``` +{% include copy.html %} + +Expected output: + +| max(age) | +| --- | +| 36 | + + +## Example 6: Calculate the maximum and minimum of a field by group + +The example calculates the max and min age values of all the accounts group by gender. + +```sql +source=accounts +| stats max(age), min(age) by gender +``` +{% include copy.html %} + +Expected output: + +| max(age) | min(age) | gender | +| --- | --- | --- | +| 28 | 28 | F | +| 36 | 32 | M | + + +## Example 7: Calculate the distinct count of a field + +To get the count of distinct values of a field, you can use `DISTINCT_COUNT` (or `DC`) function instead of `COUNT`. The example calculates both the count and the distinct count of gender field of all the accounts. + +```sql +source=accounts +| stats count(gender), distinct_count(gender) +``` +{% include copy.html %} + +Expected output: + +| count(gender) | distinct_count(gender) | +| --- | --- | +| 4 | 2 | + + +## Example 8: Calculate the count by a span + +The example gets the count of age by the interval of 10 years. + +```sql +source=accounts +| stats count(age) by span(age, 10) as age_span +``` +{% include copy.html %} + +Expected output: + +| count(age) | age_span | +| --- | --- | +| 1 | 20 | +| 3 | 30 | + + +## Example 9: Calculate the count by a gender and span + +The example gets the count of age by the interval of 10 years and group by gender. + +```sql +source=accounts +| stats count() as cnt by span(age, 5) as age_span, gender +``` +{% include copy.html %} + +Expected output: + +| cnt | age_span | gender | +| --- | --- | --- | +| 1 | 25 | F | +| 2 | 30 | M | +| 1 | 35 | M | + +Span will always be the first grouping key whatever order you specify. + +```sql +source=accounts +| stats count() as cnt by gender, span(age, 5) as age_span +``` +{% include copy.html %} + +Expected output: + +| cnt | age_span | gender | +| --- | --- | --- | +| 1 | 25 | F | +| 2 | 30 | M | +| 1 | 35 | M | + + +## Example 10: Calculate the count and get email list by a gender and span + +The example gets the count of age by the interval of 10 years and group by gender, additionally for each row get a list of at most 5 emails. + +```sql +source=accounts +| stats count() as cnt, take(email, 5) by span(age, 5) as age_span, gender +``` +{% include copy.html %} + +Expected output: + +| cnt | take(email, 5) | age_span | gender | +| --- | --- | --- | --- | +| 1 | [] | 25 | F | +| 2 | [amberduke@pyrami.com,daleadams@boink.com] | 30 | M | +| 1 | [hattiebond@netagy.com] | 35 | M | + + +## Example 11: Calculate the percentile of a field + +The following example PPL query shows how to use `stats` to calculate the percentile 90th age of all the accounts. 
+
+```sql
+source=accounts
+| stats percentile(age, 90)
+```
+{% include copy.html %}
+
+Expected output:
+
+| percentile(age, 90) |
+| --- |
+| 36 |
+
+
+## Example 12: Calculate the percentile of a field by group
+
+The following example PPL query shows how to use `stats` to calculate the 90th percentile of age across all the accounts, grouped by gender.
+
+```sql
+source=accounts
+| stats percentile(age, 90) by gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| percentile(age, 90) | gender |
+| --- | --- |
+| 28 | F |
+| 36 | M |
+
+
+## Example 13: Calculate the percentile by a gender and span
+
+The example gets the 90th percentile of age in 10-year intervals, grouped by gender.
+
+```sql
+source=accounts
+| stats percentile(age, 90) as p90 by span(age, 10) as age_span, gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| p90 | age_span | gender |
+| --- | --- | --- |
+| 28 | 20 | F |
+| 36 | 30 | M |
+
+
+## Example 14: Collect all values in a field using LIST
+
+The following example PPL query shows how to use `stats` to collect all firstname values, preserving duplicates and order.
+
+```sql
+source=accounts
+| stats list(firstname)
+```
+{% include copy.html %}
+
+Expected output:
+
+| list(firstname) |
+| --- |
+| [Amber,Hattie,Nanette,Dale] |
+
+
+## Example 15: Ignore null bucket
+
+The following example PPL query uses `bucket_nullable=false` to exclude the null email bucket from the group-by results.
+
+```sql
+source=accounts
+| stats bucket_nullable=false count() as cnt by email
+```
+{% include copy.html %}
+
+Expected output:
+
+| cnt | email |
+| --- | --- |
+| 1 | amberduke@pyrami.com |
+| 1 | daleadams@boink.com |
+| 1 | hattiebond@netagy.com |
+
+
+## Example 16: Collect unique values in a field using VALUES
+
+The following example PPL query shows how to use `stats` to collect all unique firstname values, sorted lexicographically with duplicates removed.
+
+```sql
+source=accounts
+| stats values(firstname)
+```
+{% include copy.html %}
+
+Expected output:
+
+| values(firstname) |
+| --- |
+| [Amber,Dale,Hattie,Nanette] |
+
+
+## Example 17: Span on a date/time field always ignores the null bucket
+
+Index example data:
+
+| Name | DEPTNO | birthday |
+| --- | --- | --- |
+| Alice | 1 | 2024-04-21 |
+| Bob | 2 | 2025-08-21 |
+| Jeff | null | 2025-04-22 |
+| Adam | 2 | null |
+
+```sql
+source=example
+| stats count() as cnt by span(birthday, 1y) as year
+```
+{% include copy.html %}
+
+Expected output:
+
+| cnt | year |
+| --- | --- |
+| 1 | 2024-01-01 |
+| 2 | 2025-01-01 |
+
+```sql
+source=example
+| stats count() as cnt by span(birthday, 1y) as year, DEPTNO
+```
+{% include copy.html %}
+
+Expected output:
+
+| cnt | year | DEPTNO |
+| --- | --- | --- |
+| 1 | 2024-01-01 | 1 |
+| 1 | 2025-01-01 | 2 |
+| 1 | 2025-01-01 | null |
+
+```sql
+source=example
+| stats bucket_nullable=false count() as cnt by span(birthday, 1y) as year, DEPTNO
+```
+{% include copy.html %}
+
+Expected output:
+
+| cnt | year | DEPTNO |
+| --- | --- | --- |
+| 1 | 2024-01-01 | 1 |
+| 1 | 2025-01-01 | 2 |
+
+
+## Example 18: Calculate the count by the implicit @timestamp field
+
+The following example PPL query demonstrates that if you omit the field parameter in the span function, it automatically uses the implicit `@timestamp` field.
+
+```sql
+source=big5
+| stats count() by span(1month)
+```
+{% include copy.html %}
+
+Expected output:
+
+| count() | span(1month) |
+| --- | --- |
+| 1 | 2023-01-01 00:00:00 |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/streamstats.md b/_sql-and-ppl/ppl/cmd/streamstats.md
new file mode 100644
index 00000000000..5f896cea7e9
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/streamstats.md
@@ -0,0 +1,266 @@
+---
+layout: default
+title: "streamstats"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 39
+---
+# streamstats
+
+
+The `streamstats` command calculates cumulative or rolling statistics as events are processed in order. Unlike `stats` or `eventstats`, which operate on the entire dataset at once, it computes values incrementally on a per-event basis, respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events.
+Key aspects of `streamstats`:
+1. It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis.
+2. It supports arguments such as window (for sliding window calculations) and current (to control whether the current event is included in the calculation).
+3. It retains all original events and appends new fields containing the calculated statistics.
+4. It is particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events.
+
+**Differences between `stats`, `eventstats`, and `streamstats`**
+All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce:
+* Transformation Behavior
+  * `stats`: Transforms all events into an aggregated result table, losing the original event structure.
+  * `eventstats`: Adds aggregation results as new fields to the original events without removing the event structure.
+  * `streamstats`: Adds cumulative (running) aggregation results to each event as it streams through the pipeline.
+* Output Format
+  * `stats`: Output contains only aggregated values. Original raw events are not preserved.
+  * `eventstats`: Original events remain, with extra fields containing summary statistics.
+  * `streamstats`: Original events remain, with extra fields containing running totals or cumulative statistics.
+* Aggregation Scope
+  * `stats`: Based on all events in the search (or groups defined by the BY clause).
+  * `eventstats`: Based on all relevant events, then the result is added back to each event in the group.
+  * `streamstats`: Calculations occur progressively as each event is processed; can be scoped by window.
+* Use Cases
+  * `stats`: When only aggregated results are needed (e.g., counts, averages, sums).
+  * `eventstats`: When aggregated statistics are needed alongside original event data.
+  * `streamstats`: When a running total or cumulative statistic is needed across event streams.
+
+
+## Syntax
+
+Use the following syntax:
+
+`streamstats [bucket_nullable=bool] [current=<boolean>] [window=<int>] [global=<boolean>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <function>... [by-clause]`
+* `function`: mandatory. An aggregation function or window function.
+* `bucket_nullable`: optional. Controls whether the streamstats command considers null buckets a valid group in group-by aggregations. When set to `false`, it does not treat null group-by values as a distinct group during aggregation. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
+  * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true`
+  * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false`
+* `current`: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=<boolean>. **Default:** true.
+* `window`: optional. Specifies the number of events to use when computing the statistics. Syntax: window=<int>. **Default:** 0, which means that all previous and current events are used.
+* `global`: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=<boolean>. **Default:** true.
+* `reset_before`: optional. Before streamstats calculates for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("<eval-expression>")". **Default:** false.
+* `reset_after`: optional. After streamstats calculations for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. Syntax: reset_after="("<eval-expression>")". **Default:** false.
+* `by-clause`: optional. The by clause can include fields and expressions, such as scalar functions and aggregation functions. Additionally, the span clause can be used to split a specific field into buckets of the same interval; the aggregation is then performed over these span buckets. Syntax: by [span-expression,] [field,]... **Default:** If no by-clause is specified, all events are processed as a single group and running statistics are computed across the entire event stream.
+* `span-expression`: optional, at most one. Splits a field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets.
+  * Available time units
+    * millisecond (ms)
+    * second (s)
+    * minute (m, case sensitive)
+    * hour (h)
+    * day (d)
+    * week (w)
+    * month (M, case sensitive)
+    * quarter (q)
+    * year (y)
+
+
+## Aggregation functions
+
+The streamstats command supports the following aggregation functions:
+* `COUNT`: Count of values
+* `SUM`: Sum of numeric values
+* `AVG`: Average of numeric values
+* `MAX`: Maximum value
+* `MIN`: Minimum value
+* `VAR_SAMP`: Sample variance
+* `VAR_POP`: Population variance
+* `STDDEV_SAMP`: Sample standard deviation
+* `STDDEV_POP`: Population standard deviation
+* `DISTINCT_COUNT`/`DC`: Distinct count of values
+* `EARLIEST`: Earliest value by timestamp
+* `LATEST`: Latest value by timestamp
+
+For detailed documentation of each function, see [Aggregation Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/aggregations/).
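+
+For example, `latest()` can maintain a running "last seen" value per group. The following sketch assumes a hypothetical `events` index with `@timestamp`, `host`, and `status` fields and is illustrative only:
+
+```sql
+source=events
+| streamstats latest(status) as last_status by host
+```
+{% include copy.html %}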
+ +## Usage + +Streamstats + +```sql +source = table | streamstats avg(a) +source = table | streamstats current = false avg(a) +source = table | streamstats window = 5 sum(b) +source = table | streamstats current = false window = 2 max(a) +source = table | where a < 50 | streamstats count(c) +source = table | streamstats min(c), max(c) by b +source = table | streamstats count(c) as count_by by b | where count_by > 1000 +source = table | streamstats dc(field) as distinct_count +source = table | streamstats distinct_count(category) by region +source = table | streamstats current=false window=2 global=false avg(a) by b +source = table | streamstats window=2 reset_before=a>31 avg(b) +source = table | streamstats current=false reset_after=a>31 avg(b) by c +``` +{% include copy.html %} + + +## Example 1: Calculate the running average, sum, and count of a field by group + +This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender. + +```sql +source=accounts +| streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender +``` +{% include copy.html %} + +Expected output: + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | + + +## Example 2: Running maximum age over a 2-row window + +This example calculates the running maximum age over a 2-row window, excluding the current event. + +```sql +source=state_country +| streamstats current=false window=2 max(age) as prev_max_age +``` +{% include copy.html %} + +Expected output: + +| name | country | state | month | year | age | prev_max_age | +| --- | --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | null | +| Hello | USA | New York | 4 | 2023 | 30 | 70 | +| John | Canada | Ontario | 4 | 2023 | 25 | 70 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 25 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 27 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 57 | +| David | USA | Washington | 4 | 2023 | 40 | 70 | + + +## Example 3: Use the global argument to calculate running statistics + +The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields: +* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups. +* global=false: the window itself is created per group, meaning each group gets its own independent window. + +The following example PPL query shows how to use `streamstats` to calculate the running average of age across accounts by country, using global argument. 
+Original data:
+
+| name | country | state | month | year | age |
+| --- | --- | --- | --- | --- | --- |
+| Jake | USA | California | 4 | 2023 | 70 |
+| Hello | USA | New York | 4 | 2023 | 30 |
+| John | Canada | Ontario | 4 | 2023 | 25 |
+| Jane | Canada | Quebec | 4 | 2023 | 20 |
+| Jim | Canada | B.C | 4 | 2023 | 27 |
+| Peter | Canada | B.C | 4 | 2023 | 57 |
+| Rick | Canada | B.C | 4 | 2023 | 70 |
+| David | USA | Washington | 4 | 2023 | 40 |
+
+* global=true: The window slides across all rows globally (following their input order), but inside each window the aggregation is still computed by country. Processing the data stream row by row to build the sliding window of size 2, David and Rick end up in the same window.
+* global=false: Each by group (country) forms its own independent stream and window (size 2), so David and Hello share one window for USA. This time the running_avg for David is 35, rather than 40 when global is set to true.
+
+```sql
+source=state_country
+| streamstats window=2 global=true avg(age) as running_avg by country
+```
+{% include copy.html %}
+
+Expected output:
+
+| name | country | state | month | year | age | running_avg |
+| --- | --- | --- | --- | --- | --- | --- |
+| Jake | USA | California | 4 | 2023 | 70 | 70.0 |
+| Hello | USA | New York | 4 | 2023 | 30 | 50.0 |
+| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 |
+| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 |
+| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 |
+| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 |
+| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 |
+| David | USA | Washington | 4 | 2023 | 40 | 40.0 |
+
+```sql
+source=state_country
+| streamstats window=2 global=false avg(age) as running_avg by country
+```
+{% include copy.html %}
+
+Expected output:
+
+| name | country | state | month | year | age | running_avg |
+| --- | --- | --- | --- | --- | --- | --- |
+| Jake | USA | California | 4 | 2023 | 70 | 70.0 |
+| Hello | USA | New York | 4 | 2023 | 30 | 50.0 |
+| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 |
+| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 |
+| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 |
+| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 |
+| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 |
+| David | USA | Washington | 4 | 2023 | 40 | 35.0 |
+
+
+## Example 4: Use the reset_before and reset_after arguments to reset statistics
+
+This example calculates the running average of age across accounts by country, with resets applied.
+ +```sql +source=state_country +| streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country +``` +{% include copy.html %} + +Expected output: + +| name | country | state | month | year | age | avg_age | +| --- | --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | null | +| Hello | USA | New York | 4 | 2023 | 30 | 70.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | null | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | +| Jim | Canada | B.C | 4 | 2023 | 27 | null | +| Peter | Canada | B.C | 4 | 2023 | 57 | null | +| Rick | Canada | B.C | 4 | 2023 | 70 | null | +| David | USA | Washington | 4 | 2023 | 40 | null | + + +## Example 5: Null buckets handling + +```sql +source=accounts +| streamstats bucket_nullable=false count() as cnt by employer +| fields account_number, firstname, employer, cnt +``` +{% include copy.html %} + +Expected output: + +| account_number | firstname | employer | cnt | +| --- | --- | --- | --- | +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | null | + +```sql +source=accounts +| streamstats bucket_nullable=true count() as cnt by employer +| fields account_number, firstname, employer, cnt +``` +{% include copy.html %} + +Expected output: + +| account_number | firstname | employer | cnt | +| --- | --- | --- | --- | +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | 1 | + \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/subquery.md b/_sql-and-ppl/ppl/cmd/subquery.md new file mode 100644 index 00000000000..94ca89fe869 --- /dev/null +++ b/_sql-and-ppl/ppl/cmd/subquery.md @@ -0,0 +1,226 @@ +--- +layout: default +title: "subquery" +parent: "Commands" +grand_parent: "PPL" +nav_order: 40 +--- +# subquery + + +The `subquery` command embeds one PPL query inside another, enabling complex filtering and data retrieval operations. A subquery is a nested query that executes first and returns results that are used by the outer query for filtering, comparison, or joining operations. +Subqueries are useful for: +1. Filtering data based on results from another query +2. Checking for the existence of related data +3. Performing calculations that depend on aggregated values from other tables +4. Creating complex joins with dynamic conditions + + +## Syntax + +Use the following syntax: + +`subquery: [ source=... | ... | ... ]` + +Subqueries use the same syntax as regular PPL queries but must be enclosed in square brackets. There are four main types of subqueries: + +**IN Subquery** +Tests whether a field value exists in the results of a subquery: + +```sql +where [not] in [ source=... | ... | ... ] +``` +{% include copy.html %} + +**EXISTS Subquery** +Tests whether a subquery returns any results: + +```sql +where [not] exists [ source=... | ... | ... ] +``` + +**Scalar Subquery** +Returns a single value that can be used in comparisons or calculations + +```sql +where = [ source=... | ... | ... ] +``` +{% include copy.html %} + +**Relation Subquery** +Used in join operations to provide dynamic right-side data + +```sql +| join ON condition [ source=... | ... | ... ] +``` +{% include copy.html %} + + +## Configuration + +The following settings configure the `subquery` command behavior. + +### plugins.ppl.subsearch.maxout + +The size configures the maximum of rows to return from subsearch. The default value is: `10000`. A value of `0` indicates that the restriction is unlimited. 
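+
+You can check the current value with a generic cluster settings lookup. The following sketch assumes the setting is exposed like any other cluster setting; `filter_path` only narrows the response:
+
+```bash
+curl -sS -X GET 'localhost:9200/_cluster/settings?include_defaults=true&filter_path=**.plugins.ppl.subsearch.maxout'
+```
+{% include copy.html %}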
+ +Change the subsearch.maxout to unlimited: + +```bash +curl -sS -H 'Content-Type: application/json' \ +-X PUT localhost:9200/_plugins/_query/settings \ +-d '{"persistent" : {"plugins.ppl.subsearch.maxout" : "0"}}' +``` +{% include copy.html %} + +Expected output: + +```json +{ + "acknowledged": true, + "persistent": { + "plugins": { + "ppl": { + "subsearch": { + "maxout": "-1" + } + } + } + }, + "transient": {} +} +``` + + +## Usage + +InSubquery: + +```sql +source = outer | where a in [ source = inner | fields b ] +source = outer | where (a) in [ source = inner | fields b ] +source = outer | where (a,b,c) in [ source = inner | fields d,e,f ] +source = outer | where a not in [ source = inner | fields b ] +source = outer | where (a) not in [ source = inner | fields b ] +source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ] +source = outer a in [ source = inner | fields b ] // search filtering with subquery +source = outer a not in [ source = inner | fields b ] // search filtering with subquery) +source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ] // nested +source = table1 | inner join left = l right = r on l.a = r.a AND r.a in [ source = inner | fields d ] | fields l.a, r.a, b, c //as join filter +``` +{% include copy.html %} + +ExistsSubquery: + +```sql +// Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table nested +source = outer | where exists [ source = inner | where a = c ] +source = outer | where not exists [ source = inner | where a = c ] +source = outer | where exists [ source = inner | where a = c and b = d ] +source = outer | where not exists [ source = inner | where a = c and b = d ] +source = outer exists [ source = inner | where a = c ] // search filtering with subquery +source = outer not exists [ source = inner | where a = c ] //search filtering with subquery +source = table as t1 exists [ source = table as t2 | where t1.a = t2.a ] //table alias is useful in exists subquery +source = outer | where exists [ source = inner1 | where a = c and exists [ source = nested | where c = e ] ] //nested +source = outer | where exists [ source = inner1 | where a = c | where exists [ source = nested | where c = e ] ] //nested +source = outer | where exists [ source = inner | where c > 10 ] //uncorrelated exists +source = outer | where not exists [ source = inner | where c > 10 ] //uncorrelated exists +source = outer | where exists [ source = inner ] | eval l = "nonEmpty" | fields l //special uncorrelated exists +``` +{% include copy.html %} + +ScalarSubquery: + +```sql +//Uncorrelated scalar subquery in Select +source = outer | eval m = [ source = inner | stats max(c) ] | fields m, a +source = outer | eval m = [ source = inner | stats max(c) ] + b | fields m, a +//Uncorrelated scalar subquery in Where** +source = outer | where a > [ source = inner | stats min(c) ] | fields a +//Uncorrelated scalar subquery in Search filter +source = outer a > [ source = inner | stats min(c) ] | fields a +//Correlated scalar subquery in Select +source = outer | eval m = [ source = inner | where outer.b = inner.d | stats max(c) ] | fields m, a +source = outer | eval m = [ source = inner | where b = d | stats max(c) ] | fields m, a +source = outer | eval m = [ source = inner | where outer.b > inner.d | stats max(c) ] | fields m, a +//Correlated scalar subquery in Where +source = outer | where a = [ source = inner | where outer.b = inner.d | stats max(c) ] +source = outer | 
where a = [ source = inner | where b = d | stats max(c) ] +source = outer | where [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a +//Correlated scalar subquery in Search filter +source = outer a = [ source = inner | where b = d | stats max(c) ] +source = outer [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a +//Nested scalar subquery +source = outer | where a = [ source = inner | stats max(c) | sort c ] OR b = [ source = inner | where c = 1 | stats min(d) | sort d ] +source = outer | where a = [ source = inner | where c = [ source = nested | stats max(e) by f | sort f ] | stats max(d) by c | sort c | head 1 ] +RelationSubquery +source = table1 | join left = l right = r on condition [ source = table2 | where d > 10 | head 5 ] //subquery in join right side +source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1 +``` +{% include copy.html %} + + +## Example 1: TPC-H q20 + +The following example PPL query shows a complex TPC-H query 20 implementation using nested subqueries. + +```bash +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = supplier + | join ON s_nationkey = n_nationkey nation + | where n_name = 'CANADA' + and s_suppkey in [ + source = partsupp + | where ps_partkey in [ + source = part + | where like(p_name, 'forest%') + | fields p_partkey + ] + and ps_availqty > [ + source = lineitem + | where l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date('1994-01-01') + and l_shipdate < date_add(date('1994-01-01'), interval 1 year) + | stats sum(l_quantity) as sum_l_quantity + | eval half_sum_l_quantity = 0.5 * sum_l_quantity // Stats and Eval commands can combine when issues/819 resolved + | fields half_sum_l_quantity + ] + | fields ps_suppkey + ] + """ +}' +``` +{% include copy.html %} + + +## Example 2: TPC-H q22 + +The following example PPL query shows a TPC-H query 22 implementation using EXISTS and scalar subqueries. + +```bash +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = [ + source = customer + | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > [ + source = customer + | where c_acctbal > 0.00 + and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + | stats avg(c_acctbal) + ] + and not exists [ + source = orders + | where o_custkey = c_custkey + ] + | eval cntrycode = substring(c_phone, 1, 2) + | fields cntrycode, c_acctbal + ] as custsale + | stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode + | sort cntrycode + """ +}' +``` +{% include copy.html %} diff --git a/_sql-and-ppl/ppl/cmd/syntax.md b/_sql-and-ppl/ppl/cmd/syntax.md new file mode 100644 index 00000000000..bf94ecfe793 --- /dev/null +++ b/_sql-and-ppl/ppl/cmd/syntax.md @@ -0,0 +1,83 @@ +--- +layout: default +title: "syntax" +parent: "Commands" +grand_parent: "PPL" +nav_order: 41 +--- +# PPL syntax + +Every PPL query starts with the `search` command. It specifies the index to search and retrieve documents from. + +`PPL` supports exactly one `search` command per PPL query, and it is always the first command. The word `search` can be omitted. + +Subsequent commands can follow in any order. 
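+
+For example, filtering, sorting, and projection commands can be chained after the initial `search` in whichever order fits the analysis. The following sketch uses the same `accounts` index as the examples below:
+
+```sql
+search source=accounts
+| where age > 30
+| sort - balance
+| fields firstname, balance
+```
+{% include copy.html %}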
+ + +## Syntax + +```sql +search source= [boolean-expression] +source= [boolean-expression] +``` +{% include copy.html %} + + +Field | Description | Required +:--- | :--- |:--- +`index` | Specifies the index to query. | No +`bool-expression` | Specifies an expression that evaluates to a Boolean value. | No + + +### Required arguments + +Required arguments are shown in angle brackets `< >`. + +### Optional arguments + +Optional arguments are enclosed in square brackets `[ ]`. + + +## Examples + +**Example 1: Search through accounts index** + +In the following example, the `search` command refers to an `accounts` index as the source and uses `fields` and `where` commands for the conditions: + +```sql +search source=accounts +| where age > 18 +| fields firstname, lastname +``` +{% include copy.html %} + +**Example 2: Get all documents** + +To get all documents from the `accounts` index, specify it as the `source`: + +```sql +search source=accounts; +``` +{% include copy.html %} + + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +:--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams + +**Example 3: Get documents that match a condition** + +To get all documents from the `accounts` index that either have `account_number` equal to 1 or have `gender` as `F`, use the following query: + +```sql +search source=accounts account_number=1 or gender=\"F\"; +``` +{% include copy.html %} + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +:--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | diff --git a/_sql-and-ppl/ppl/cmd/table.md b/_sql-and-ppl/ppl/cmd/table.md new file mode 100644 index 00000000000..9ddaae59f18 --- /dev/null +++ b/_sql-and-ppl/ppl/cmd/table.md @@ -0,0 +1,44 @@ +--- +layout: default +title: "table" +parent: "Commands" +grand_parent: "PPL" +nav_order: 42 +--- +# table + + +The `table` command is an alias for the [`fields`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/fields/) command and provides the same field selection capabilities. It allows you to keep or remove fields from the search results using enhanced syntax options. + +## Syntax + +Use the following syntax: + +`table [+|-] ` +* `[+|-]`: optional. If the plus (+) is used, only the fields specified in the field list will be kept. If the minus (-) is used, all the fields specified in the field list will be removed. **Default:** +. +* `field-list`: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. + + +## Example 1: Basic table command usage + +The following example PPL query shows basic field selection using the table command. 
+
+```sql
+source=accounts
+| table firstname lastname age
+```
+{% include copy.html %}
+
+Expected output:
+
+| firstname | lastname | age |
+| --- | --- | --- |
+| Amber | Duke | 32 |
+| Hattie | Bond | 36 |
+| Nanette | Bates | 28 |
+| Dale | Adams | 33 |
+
+
+## See also
+
+- [fields]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/fields/) - The command for which `table` is an alias; both provide identical functionality
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/timechart.md b/_sql-and-ppl/ppl/cmd/timechart.md
new file mode 100644
index 00000000000..fe77b78ff38
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/timechart.md
@@ -0,0 +1,353 @@
+---
+layout: default
+title: "timechart"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 43
+---
+# timechart
+
+
+The `timechart` command creates a time-based aggregation of data. It groups data by time intervals and optionally by a field, then applies an aggregation function to each group. The results are returned in an unpivoted format with separate rows for each time-field combination.
+
+## Syntax
+
+Use the following syntax:
+
+`timechart [timefield=<field>] [span=<interval>] [limit=<int>] [useother=<bool>] [usenull=<bool>] [nullstr=<string>] <aggregation_function> [by <field>]`
+* `timefield`: optional. Specifies the timestamp field to use for time interval grouping. **Default:** `@timestamp`.
+* `span`: optional. Specifies the time interval for grouping data. **Default:** 1m (1 minute).
+  * Available time units:
+    * millisecond (ms)
+    * second (s)
+    * minute (m, case sensitive)
+    * hour (h)
+    * day (d)
+    * week (w)
+    * month (M, case sensitive)
+    * quarter (q)
+    * year (y)
+* `limit`: optional. Specifies the maximum number of distinct values to display when using the "by" clause. **Default:** 10.
+  * When there are more distinct values than the limit, the additional values are grouped into an "OTHER" category if useother is not set to false.
+  * The "most distinct" values are determined by calculating the sum of the aggregation values across all time intervals for each distinct field value. The top N values with the highest sums are displayed individually, while the rest are grouped into the "OTHER" category.
+  * Set to 0 to show all distinct values without any limit (when limit=0, useother is automatically set to false).
+  * The parameters can be specified in any order before the aggregation function.
+  * Only applies when using the "by" clause to group results.
+* `useother`: optional. Controls whether to create an "OTHER" category for values beyond the limit. **Default:** true.
+  * When set to false, only the top N values (based on limit) are shown without an "OTHER" column.
+  * When set to true, values beyond the limit are grouped into an "OTHER" category.
+  * Only applies when using the "by" clause and when there are more distinct values than the limit.
+* `usenull`: optional. Controls whether NULL values are placed into a separate category in the chart. **Default:** true.
+  * When set to true, NULL values are grouped into a separate category with the label specified by nullstr.
+  * When set to false, NULL values are excluded from the results.
+* `nullstr`: optional. The display label used for NULL values when usenull is true. **Default:** "NULL".
+  * Specifies the string representation for the NULL category in the chart output.
+* `aggregation_function`: mandatory. The aggregation function to apply to each time bucket.
+  * Currently, only a single aggregation function is supported.
+  * Available functions: All aggregation functions supported by the [stats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/stats/) command, as well as the timechart-specific aggregations listed in the following sections.
+* `by`: optional. Groups the results by the specified field in addition to time intervals. If not specified, the aggregation is performed across all documents in each time interval.
+
+
+## PER_SECOND
+
+Usage: per_second(field) calculates the per-second rate for a numeric field within each time bucket.
+The calculation formula is: `per_second(field) = sum(field) / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+Return type: DOUBLE
+
+## PER_MINUTE
+
+Usage: per_minute(field) calculates the per-minute rate for a numeric field within each time bucket.
+The calculation formula is: `per_minute(field) = sum(field) * 60 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+Return type: DOUBLE
+
+## PER_HOUR
+
+Usage: per_hour(field) calculates the per-hour rate for a numeric field within each time bucket.
+The calculation formula is: `per_hour(field) = sum(field) * 3600 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+Return type: DOUBLE
+
+## PER_DAY
+
+Usage: per_day(field) calculates the per-day rate for a numeric field within each time bucket.
+The calculation formula is: `per_day(field) = sum(field) * 86400 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+Return type: DOUBLE
+
+## Notes
+
+* The `timechart` command requires a timestamp field in the data. By default, it uses the `@timestamp` field, but you can specify a different field using the `timefield` parameter.
+* Results are returned in an unpivoted format with separate rows for each time-field combination that has data.
+* Only combinations with actual data are included in the results - empty combinations are omitted rather than showing null or zero values.
+* The "top N" values for the `limit` parameter are selected based on the sum of values across all time intervals for each distinct field value.
+* When using the `limit` parameter, values beyond the limit are grouped into an "OTHER" category (unless `useother=false`).
+* Examples 6 and 7 use different datasets: Example 6 uses the `events` dataset with fewer hosts for simplicity, while Example 7 uses the `events_many_hosts` dataset with 11 distinct hosts.
+* **Null values**: Documents with null values in the "by" field are treated as a separate category and appear as null in the results.
+
+
+## Example 1: Count events by hour
+
+This example counts events for each hour and groups them by host.
+
+```sql
+source=events
+| timechart span=1h count() by host
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | host | count() |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | server1 | 4 |
+| 2023-01-01 10:00:00 | server2 | 4 |
+
+
+## Example 2: Count events by minute
+
+This example counts events for each minute and groups them by host.
+
+```sql
+source=events
+| timechart span=1m count() by host
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | host | count() |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | server1 | 1 |
+| 2023-01-01 10:05:00 | server2 | 1 |
+| 2023-01-01 10:10:00 | server1 | 1 |
+| 2023-01-01 10:15:00 | server2 | 1 |
+| 2023-01-01 10:20:00 | server1 | 1 |
+| 2023-01-01 10:25:00 | server2 | 1 |
+| 2023-01-01 10:30:00 | server1 | 1 |
+| 2023-01-01 10:35:00 | server2 | 1 |
+
+
+## Example 3: Calculate average number of packets by minute
+
+This example calculates the average number of packets for each minute without grouping by any field.
+
+```sql
+source=events
+| timechart span=1m avg(packets)
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | avg(packets) |
+| --- | --- |
+| 2023-01-01 10:00:00 | 60.0 |
+| 2023-01-01 10:05:00 | 30.0 |
+| 2023-01-01 10:10:00 | 60.0 |
+| 2023-01-01 10:15:00 | 30.0 |
+| 2023-01-01 10:20:00 | 60.0 |
+| 2023-01-01 10:25:00 | 30.0 |
+| 2023-01-01 10:30:00 | 180.0 |
+| 2023-01-01 10:35:00 | 90.0 |
+
+
+## Example 4: Calculate average number of packets by every 20 minutes and status
+
+This example calculates the average number of packets for every 20 minutes and groups them by status.
+
+```sql
+source=events
+| timechart span=20m avg(packets) by status
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | status | avg(packets) |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | active | 30.0 |
+| 2023-01-01 10:00:00 | inactive | 30.0 |
+| 2023-01-01 10:00:00 | pending | 60.0 |
+| 2023-01-01 10:00:00 | processing | 60.0 |
+| 2023-01-01 10:20:00 | cancelled | 180.0 |
+| 2023-01-01 10:20:00 | completed | 60.0 |
+| 2023-01-01 10:20:00 | inactive | 90.0 |
+| 2023-01-01 10:20:00 | pending | 30.0 |
+
+
+## Example 5: Count events by hour and category
+
+This example counts events for each hour and groups them by category.
+
+```sql
+source=events
+| timechart span=1h count() by category
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | category | count() |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | orders | 4 |
+| 2023-01-01 10:00:00 | users | 4 |
+
+
+## Example 6: Using the limit parameter with count() function
+
+When there are many distinct values in the "by" field, the timechart command displays the top values based on the limit parameter and groups the rest into an "OTHER" category.
+This query displays the top 2 hosts with the highest count values and groups the remaining hosts into an "OTHER" category.
+ +```sql +source=events +| timechart span=1m limit=2 count() by host +``` +{% include copy.html %} + +Expected output: + +| @timestamp | host | count() | +| --- | --- | --- | +| 2023-01-01 10:00:00 | server1 | 1 | +| 2023-01-01 10:05:00 | server2 | 1 | +| 2023-01-01 10:10:00 | server1 | 1 | +| 2023-01-01 10:15:00 | server2 | 1 | +| 2023-01-01 10:20:00 | server1 | 1 | +| 2023-01-01 10:25:00 | server2 | 1 | +| 2023-01-01 10:30:00 | server1 | 1 | +| 2023-01-01 10:35:00 | server2 | 1 | + + +## Example 7: Using limit=0 with count() to show all values + +To display all distinct values without any limit, set limit=0: + +```sql +source=events_many_hosts +| timechart span=1h limit=0 count() by host +``` +{% include copy.html %} + +Expected output: + +| @timestamp | host | count() | +| --- | --- | --- | +| 2024-07-01 00:00:00 | web-01 | 1 | +| 2024-07-01 00:00:00 | web-02 | 1 | +| 2024-07-01 00:00:00 | web-03 | 1 | +| 2024-07-01 00:00:00 | web-04 | 1 | +| 2024-07-01 00:00:00 | web-05 | 1 | +| 2024-07-01 00:00:00 | web-06 | 1 | +| 2024-07-01 00:00:00 | web-07 | 1 | +| 2024-07-01 00:00:00 | web-08 | 1 | +| 2024-07-01 00:00:00 | web-09 | 1 | +| 2024-07-01 00:00:00 | web-10 | 1 | +| 2024-07-01 00:00:00 | web-11 | 1 | + +This shows all 11 hosts as separate rows without an "OTHER" category. + +## Example 8: Using useother=false with count() function + +Limit to top 10 hosts without OTHER category (useother=false): + +```sql +source=events_many_hosts +| timechart span=1h useother=false count() by host +``` +{% include copy.html %} + +Expected output: + +| @timestamp | host | count() | +| --- | --- | --- | +| 2024-07-01 00:00:00 | web-01 | 1 | +| 2024-07-01 00:00:00 | web-02 | 1 | +| 2024-07-01 00:00:00 | web-03 | 1 | +| 2024-07-01 00:00:00 | web-04 | 1 | +| 2024-07-01 00:00:00 | web-05 | 1 | +| 2024-07-01 00:00:00 | web-06 | 1 | +| 2024-07-01 00:00:00 | web-07 | 1 | +| 2024-07-01 00:00:00 | web-08 | 1 | +| 2024-07-01 00:00:00 | web-09 | 1 | +| 2024-07-01 00:00:00 | web-10 | 1 | + + +## Example 9: Using limit with useother parameter and avg() function + +Limit to top 3 hosts with OTHER category (default useother=true): + +```sql +source=events_many_hosts +| timechart span=1h limit=3 avg(cpu_usage) by host +``` +{% include copy.html %} + +Expected output: + +| @timestamp | host | avg(cpu_usage) | +| --- | --- | --- | +| 2024-07-01 00:00:00 | OTHER | 41.3 | +| 2024-07-01 00:00:00 | web-03 | 55.3 | +| 2024-07-01 00:00:00 | web-07 | 48.6 | +| 2024-07-01 00:00:00 | web-09 | 67.8 | + +Limit to top 3 hosts without OTHER category (useother=false): + +```sql +source=events_many_hosts +| timechart span=1h limit=3 useother=false avg(cpu_usage) by host +``` +{% include copy.html %} + +Expected output: + +| @timestamp | host | avg(cpu_usage) | +| --- | --- | --- | +| 2024-07-01 00:00:00 | web-03 | 55.3 | +| 2024-07-01 00:00:00 | web-07 | 48.6 | +| 2024-07-01 00:00:00 | web-09 | 67.8 | + + +## Example 10: Handling null values in the "by" field + +The following example PPL query shows how null values in the "by" field are treated as a separate category. The dataset events_null has 1 entry that does not have a host field. +It is put into a separate "NULL" category because the defaults for `usenull` and `nullstr` are `true` and `"NULL"` respectively. 
+
+```sql
+source=events_null
+| timechart span=1h count() by host
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | host | count() |
+| --- | --- | --- |
+| 2024-07-01 00:00:00 | NULL | 1 |
+| 2024-07-01 00:00:00 | db-01 | 1 |
+| 2024-07-01 00:00:00 | web-01 | 2 |
+| 2024-07-01 00:00:00 | web-02 | 2 |
+
+
+## Example 11: Calculate packets per second rate
+
+This example calculates the per-second packet rate for network traffic data using the per_second() function. With `span=30m` (1,800 seconds), the first bucket for `server1` contains 180 packets in total, so per_second(packets) = 180 / 1800 = 0.1.
+
+```sql
+source=events
+| timechart span=30m per_second(packets) by host
+```
+{% include copy.html %}
+
+Expected output:
+
+| @timestamp | host | per_second(packets) |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | server1 | 0.1 |
+| 2023-01-01 10:00:00 | server2 | 0.05 |
+| 2023-01-01 10:30:00 | server1 | 0.1 |
+| 2023-01-01 10:30:00 | server2 | 0.05 |
+
+
+## Limitations
+
+* Only a single aggregation function is supported per timechart command.
+* The `bins` parameter and other bin options are not supported since the `bin` command is not implemented yet. Use the `span` parameter to control time intervals.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/top.md b/_sql-and-ppl/ppl/cmd/top.md
new file mode 100644
index 00000000000..5634a3baffe
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/top.md
@@ -0,0 +1,152 @@
+---
+layout: default
+title: "top"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 44
+---
+# top
+
+
+The `top` command finds the most common tuple of values of all fields in the field list.
+
+## Syntax
+
+Use the following syntax:
+
+`top [N] [top-options] <field-list> [by-clause]`
+* `N`: optional. The number of results to return. **Default:** 10.
+* `top-options`: optional. Options for the top command. Supported syntax is [countfield=<string>] [showcount=<bool>] [usenull=<bool>].
+  * showcount=<bool>: optional. Whether to create a field in the output that represents a count of the tuple of values. **Default:** true.
+  * countfield=<string>: optional. The name of the field that contains the count. **Default:** 'count'.
+  * usenull=<bool>: optional (since 3.4.0). Whether to include null values in the output. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
+    * When `plugins.ppl.syntax.legacy.preferred=true`, `usenull` defaults to `true`
+    * When `plugins.ppl.syntax.legacy.preferred=false`, `usenull` defaults to `false`
+* `field-list`: mandatory. Comma-delimited list of field names.
+* `by-clause`: optional. One or more fields to group the results by.
+
+
+## Example 1: Find the most common values in a field
+
+This example finds the most common gender of all the accounts.
+
+```sql
+source=accounts
+| top showcount=false gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender |
+| --- |
+| M |
+| F |
+
+
+## Example 2: Limit results to top N values
+
+This example finds the most common gender and limits results to 1 value.
+
+```sql
+source=accounts
+| top 1 showcount=false gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender |
+| --- |
+| M |
+
+
+## Example 3: Find the most common values grouped by field
+
+This example finds the most common age of all the accounts grouped by gender.
+
+```sql
+source=accounts
+| top 1 showcount=false age by gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender | age |
+| --- | --- |
+| F | 28 |
+| M | 32 |
+
+
+## Example 4: Top command with count field
+
+This example finds the most common gender of all the accounts and includes the count.
+
+```sql
+source=accounts
+| top gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender | count |
+| --- | --- |
+| M | 3 |
+| F | 1 |
+
+
+## Example 5: Specify the count field option
+
+This example specifies a custom name for the count field.
+
+```sql
+source=accounts
+| top countfield='cnt' gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| gender | cnt |
+| --- | --- |
+| M | 3 |
+| F | 1 |
+
+
+## Example 6: Specify the usenull field option
+
+This example excludes null values from the results by setting `usenull=false`.
+
+```sql
+source=accounts
+| top usenull=false email
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | count |
+| --- | --- |
+| amberduke@pyrami.com | 1 |
+| daleadams@boink.com | 1 |
+| hattiebond@netagy.com | 1 |
+
+Setting `usenull=true` includes null values in the results:
+
+```sql
+source=accounts
+| top usenull=true email
+```
+{% include copy.html %}
+
+Expected output:
+
+| email | count |
+| --- | --- |
+| null | 1 |
+| amberduke@pyrami.com | 1 |
+| daleadams@boink.com | 1 |
+| hattiebond@netagy.com | 1 |
+
+
+## Limitations
+
+The `top` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/trendline.md b/_sql-and-ppl/ppl/cmd/trendline.md
new file mode 100644
index 00000000000..5cea6544561
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/trendline.md
@@ -0,0 +1,112 @@
+---
+layout: default
+title: "trendline"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 45
+---
+# trendline
+
+
+The `trendline` command calculates moving averages of fields.
+
+## Syntax
+
+Use the following syntax:
+
+`trendline [sort <[+|-] sort-field>] (sma|wma)(<number-of-datapoints>, <field>) [as <alias>] [(sma|wma)(<number-of-datapoints>, <field>) [as <alias>]]...`
+* `[+|-]`: optional. The plus sign (+) sorts in ascending order with NULL/MISSING values first; the minus sign (-) sorts in descending order with NULL/MISSING values last. **Default:** ascending order with NULL/MISSING values first.
+* `sort-field`: mandatory when sorting is used. The field used to sort.
+* `sma|wma`: mandatory. Simple Moving Average (sma) applies equal weighting to all values; Weighted Moving Average (wma) applies greater weight to more recent values.
+* `number-of-datapoints`: mandatory. The number of datapoints to calculate the moving average (must be greater than zero).
+* `field`: mandatory. The name of the field the moving average should be calculated for.
+* `alias`: optional. The name of the resulting column containing the moving average. **Default:** the field name with the suffix `_trendline`.
+
+
+## Example 1: Calculate the simple moving average on one field
+
+The following example PPL query shows how to use `trendline` to calculate the simple moving average on one field.
+
+```sql
+source=accounts
+| trendline sma(2, account_number) as an
+| fields an
+```
+{% include copy.html %}
+
+Expected output:
+
+| an |
+| --- |
+| null |
+| 3.5 |
+| 9.5 |
+| 15.5 |
+
+
+## Example 2: Calculate the simple moving average on multiple fields
+
+The following example PPL query shows how to use `trendline` to calculate the simple moving average on multiple fields.
+
+```sql
+source=accounts
+| trendline sma(2, account_number) as an sma(2, age) as age_trend
+| fields an, age_trend
+```
+{% include copy.html %}
+
+Expected output:
+
+| an | age_trend |
+| --- | --- |
+| null | null |
+| 3.5 | 34.0 |
+| 9.5 | 32.0 |
+| 15.5 | 30.5 |
+
+
+## Example 3: Calculate the simple moving average on one field without specifying an alias
+
+The following example PPL query shows how to use `trendline` to calculate the simple moving average on one field without specifying an alias.
+
+```sql
+source=accounts
+| trendline sma(2, account_number)
+| fields account_number_trendline
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number_trendline |
+| --- |
+| null |
+| 3.5 |
+| 9.5 |
+| 15.5 |
+
+
+## Example 4: Calculate the weighted moving average on one field
+
+The following example PPL query shows how to use `trendline` to calculate the weighted moving average on one field.
+
+```sql
+source=accounts
+| trendline wma(2, account_number)
+| fields account_number_trendline
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number_trendline |
+| --- |
+| null |
+| 4.333333333333333 |
+| 10.666666666666666 |
+| 16.333333333333332 |
+
+
+## Limitations
+
+The `trendline` command requires all values in the specified `field` to be non-null. Any rows with null values in the calculation field are automatically excluded from the command's output.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/where.md b/_sql-and-ppl/ppl/cmd/where.md
new file mode 100644
index 00000000000..70b9e098d6d
--- /dev/null
+++ b/_sql-and-ppl/ppl/cmd/where.md
@@ -0,0 +1,188 @@
+---
+layout: default
+title: "where"
+parent: "Commands"
+grand_parent: "PPL"
+nav_order: 46
+---
+# where
+
+
+The `where` command filters the search results, returning only the results for which the `bool-expression` evaluates to true.
+
+## Syntax
+
+Use the following syntax:
+
+`where <bool-expression>`
+* `bool-expression`: optional. Any expression that evaluates to a Boolean value.
+
+
+## Example 1: Filter search results with condition
+
+The following example PPL query shows how to use `where` to fetch all the documents from the `accounts` index where `account_number` is 1 or `gender` is "F".
+
+```sql
+source=accounts
+| where account_number=1 or gender="F"
+| fields account_number, gender
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | gender |
+| --- | --- |
+| 1 | M |
+| 13 | F |
+
+
+## Example 2: Basic field comparison
+
+The following example PPL query shows how to use `where` to filter accounts with a balance greater than 30000.
+
+```sql
+source=accounts
+| where balance > 30000
+| fields account_number, balance
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | balance |
+| --- | --- |
+| 1 | 39225 |
+| 13 | 32838 |
+
+
+## Example 3: Pattern matching with LIKE
+
+**Pattern matching with an underscore (\_)**
+
+The following example PPL query demonstrates using LIKE with an underscore (\_) to match a single character.
+
+```sql
+source=accounts
+| where LIKE(state, 'M_')
+| fields account_number, state
+```
+{% include copy.html %}
+
+Expected output:
+
+| account_number | state |
+| --- | --- |
+| 18 | MD |
+
+**Pattern matching with a percent sign (%)**
+
+The following example PPL query demonstrates using LIKE with a percent sign (%) to match multiple characters.
+ +```sql +source=accounts +| where LIKE(state, 'V%') +| fields account_number, state +``` +{% include copy.html %} + +Expected output: + +| account_number | state | +| --- | --- | +| 13 | VA | + + +## Example 4: Multiple conditions + +The following example PPL query shows how to combine multiple conditions using AND operator. + +```sql +source=accounts +| where age > 30 AND gender = 'M' +| fields account_number, age, gender +``` +{% include copy.html %} + +Expected output: + +| account_number | age | gender | +| --- | --- | --- | +| 1 | 32 | M | +| 6 | 36 | M | +| 18 | 33 | M | + + +## Example 5: Using IN operator + +The following example PPL query demonstrates using IN operator to match multiple values. + +```sql +source=accounts +| where state IN ('IL', 'VA') +| fields account_number, state +``` +{% include copy.html %} + +Expected output: + +| account_number | state | +| --- | --- | +| 1 | IL | +| 13 | VA | + + +## Example 6: NULL Checks + +The following example PPL query shows how to filter records with NULL values. + +```sql +source=accounts +| where ISNULL(employer) +| fields account_number, employer +``` +{% include copy.html %} + +Expected output: + +| account_number | employer | +| --- | --- | +| 18 | null | + + +## Example 7: Complex conditions + +The following example PPL query demonstrates combining multiple conditions with parentheses and logical operators. + +```sql +source=accounts +| where (balance > 40000 OR age > 35) AND gender = 'M' +| fields account_number, balance, age, gender +``` +{% include copy.html %} + +Expected output: + +| account_number | balance | age | gender | +| --- | --- | --- | --- | +| 6 | 5686 | 36 | M | + + +## Example 8: NOT conditions + +The following example PPL query shows how to use NOT operator to exclude matching records. + +```sql +source=accounts +| where NOT state = 'CA' +| fields account_number, state +``` +{% include copy.html %} + +Expected output: + +| account_number | state | +| --- | --- | +| 1 | IL | +| 6 | TN | +| 13 | VA | +| 18 | MD | + diff --git a/_sql-and-ppl/ppl/functions.md b/_sql-and-ppl/ppl/functions.md deleted file mode 100644 index 147bb62cd6c..00000000000 --- a/_sql-and-ppl/ppl/functions.md +++ /dev/null @@ -1,2133 +0,0 @@ ---- -layout: default -title: Commands -parent: PPL -nav_order: 2 -redirect_from: - - /search-plugins/sql/ppl/functions/ - - /observability-plugin/ppl/commands/ - - /search-plugins/ppl/commands/ - - /search-plugins/ppl/functions/ ---- - -# Commands - -
- - Table of contents - - {: .text-delta } -- TOC -{:toc} -
- -{::options toc_levels="2..2" /} - -PPL supports most common [SQL functions]({{site.url}}{{site.baseurl}}/search-plugins/sql/functions/), including [relevance search]({{site.url}}{{site.baseurl}}/search-plugins/sql/full-text/), but also introduces several more functions (called _commands_), which are available in PPL only. - ---- - -## ad - -
- - Syntax and examples - - {: .text-delta } - -The `ad` command applies the Random Cut Forest (RCF) algorithm in the [ML Commons plugin]({{site.url}}{{site.baseurl}}/ml-commons-plugin/index/) on the search result returned by a PPL command. Based on the input, the plugin uses two types of RCF algorithms: fixed-in-time RCF for processing time-series data and batch RCF for processing non-time-series data. - -### Syntax: Fixed-in-time RCF for time-series data command - -```sql -ad -``` -{% include copy.html %} - - -The following table describes the parameters for the `ad` command when using fixed-in-time RCF for time-series data. - -Field | Description | Required -:--- | :--- | :--- -`shingle_size` | A consecutive sequence of the most recent records. The default value is 8. | No -`time_decay` | Specifies how much of the recent past to consider when computing an anomaly score. The default value is 0.001. | No -`time_field` | Specifies the time field for RCF to use as time-series data. Must be either a long value, such as the timestamp in milliseconds, or a string value in "yyyy-MM-dd HH:mm:ss".| Yes - -### Syntax: Batch RCF for non-time-series data command - -```sql -ad -``` -{% include copy.html %} - - -The following table describes the parameters for the `ad` command when using batch RCF for non-time-series data. - -Field | Description | Required -:--- | :--- | :--- -`shingle_size` | A consecutive sequence of the most recent records. The default value is 8. | No -`time_decay` | Specifies how much of the recent past to consider when computing an anomaly score. The default value is 0.001. | No - -**Example 1: Detecting events in New York City from taxi ridership data with time-series data** - -The following example trains an RCF model and uses the model to detect anomalies in the time-series ridership data: - -```sql -source=nyc_taxi | fields value, timestamp | AD time_field='timestamp' | where value=10844.0 -``` -{% include copy.html %} - - -The command returns the following results. - -value | timestamp | score | anomaly_grade -:--- | :--- | :--- | :--- -10844.0 | 1404172800000 | 0.0 | 0.0 - -**Example 2: Detecting events in New York City from taxi ridership data with non-time-series data** - -The following example uses batch RCF to detect anomalies in non-time-series data: - -```sql -source=nyc_taxi | fields value | AD | where value=10844.0 -``` -{% include copy.html %} - - -The command returns the following results. - -value | score | anomalous -:--- | :--- | :--- -10844.0 | 0.0 | false - -
- ---- - -## bin - -
- - Syntax and examples - - {: .text-delta } - -The `bin` command groups numeric values into buckets of equal intervals, making it useful for creating histograms and analyzing data distribution. It takes a numeric or time-based field and generates a new field with values that represent the lower bound of each bucket. - -### Syntax - -```sql -bin [span=] [minspan=] [bins=] [aligntime=(earliest | latest | )] [start=] [end=] -``` -{% include copy.html %} - - -The following table describes the parameters for the `bin` command. - -Field | Description | Required | Default -:--- | :--- | :--- | :--- -`field` | The field to bin. Accepts numeric or time-based fields. | Yes | N/A -`span` | The interval size for each bin. Cannot be used with bins or minspan parameters. | No | N/A -`minspan` | The minimum interval size for automatic span calculation. Cannot be used with span or bins parameters. | No | N/A -`bins` | The maximum number of equal-width bins to create. Cannot be used with span or minspan parameters. The bins parameter must be between 2 and 50000 (inclusive). | No | N/A -`aligntime` | Align the bin times for time-based fields. Valid only for time-based discretization. | No | N/A -`start` | The starting value for the bin range. | No | Minimum field value -`end` | The ending value for the bin range. | No | Maximum field value - -**Example 1: Basic numeric span** - -```sql -source=accounts | bin age span=10 | fields age, account_number | head 3; -``` -{% include copy.html %} - - -The command returns the following results. - -| age | account_number -:--- | :--- | -| 30-40 | 1 -| 30-40 | 6 -| 20-30 | 13 - -**Example 2: Logarithmic span (log10)** - -```sql -source=accounts | bin balance span=log10 | fields balance | head 2; -``` -{% include copy.html %} - - -The command returns the following results. - -| balance -:--- | -| 10000.0-1000000.0 -| 1000.0-10000.0 - -**Example 3: Basic bins parameter** - -```sql -source=time_test | bin value bins=5 | fields value | head 3; -``` -{% include copy.html %} - - -The command returns the following results. - -| value -:--- | -| 8000-9000 -| 7000-8000 -| 9000-10000 - -**Example 4: High bin count** - -```sql -source=accounts | bin age bins=21 | fields age, account_number | head 3; -``` -{% include copy.html %} - - -The command returns the following results. - -| age | account_number -:--- | :--- | -| 32-33 | 1 -| 36-37 | 6 -| 28-29 | 13 - -**Example 5: Basic minspan** - -```sql -source=accounts | bin age minspan=5 | fields age, account_number | head 3; -``` -{% include copy.html %} - - -The command returns the following results. - -| age | account_number -:--- | :--- | -| 30-40 | 1 -| 30-40 | 6 -| 20-30 | 13 - -**Example 6: Span with start/end** - -```sql -source=accounts | bin age span=1 start=25 end=35 | fields age | head 6; -``` -{% include copy.html %} - - -The command returns the following results. - -| age -:--- | -| 32-33 -| 36-37 -| 28-29 -| 33-34 - -**Example 7: Hour span** - -```sql -source=time_test | bin @timestamp span=1h | fields @timestamp, value | head 3; -``` -{% include copy.html %} - - -The command returns the following results. - -| @timestamp | value -:--- | :--- | -| 2025-07-28 00:00:00 | 8945 -| 2025-07-28 01:00:00 | 7623 -| 2025-07-28 02:00:00 | 9187 - -**Example 8: Default behavior (no parameters)** - -```sql -source=accounts | bin age | fields age, account_number | head 3; -``` -{% include copy.html %} - - -The command returns the following results. 
- -| age | account_number -:--- | :--- | -| 32.0-33.0 | 1 -| 36.0-37.0 | 6 -| 28.0-29.0 | 13 - -**Example 9: Using the `bin` command with string fields** - -```sql -source=accounts | eval age_str = CAST(age AS STRING) | bin age_str bins=3 | stats count() by age_str | sort age_str; -``` -{% include copy.html %} - - -The command returns the following results. - -| count() | age_str -:--- | :--- | -| 1 | 20-30 -| 3 | 30-40 - -
- ---- - -## dedup - -
- - Syntax and examples - - {: .text-delta } - -The `dedup` (data deduplication) command removes duplicate documents defined by a field from the search result. - -### Syntax - -```sql -dedup [int] [keepempty=] [consecutive=] -``` -{% include copy.html %} - - -The following table describes the parameters for the `dedup` command. - -Field | Description | Type | Required | Default -:--- | :--- | :--- | :--- | :--- -`int` | Retain the specified number of duplicate events for each combination. The number must be greater than 0. If you do not specify a number, only the first occurring event is kept and all other duplicates are removed from the results. | `integer` | No | 1 -`keepempty` | If true, keep the document if any field in the field list has a null value or a field missing. | `boolean` | No | False -`consecutive` | If true, remove only consecutive events with duplicate combinations of values. | `boolean` | No | False -`field-list` | Specify a comma-delimited field list. At least one field is required. | `string` | Yes | N/A - -**Example 1: Dedup by one field** - -To remove duplicate documents with the same gender, use the following command: - -```sql -search source=accounts | dedup gender | fields account_number, gender; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | gender -:--- | :--- | -1 | M -13 | F - - -**Example 2: Keep two duplicate documents** - -To keep two duplicate documents with the same gender, use the following command: - -```sql -search source=accounts | dedup 2 gender | fields account_number, gender; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | gender -:--- | :--- | -1 | M -6 | M -13 | F - -**Example 3: Keep or ignore an empty field by default** - -To keep two duplicate documents with a `null` field value, use the following command: - -```sql -search source=accounts | dedup email keepempty=true | fields account_number, email; -``` -{% include copy.html %} - - -The command returns the following results. - -account_number | email -:--- | :--- -1 | amberduke@pyrami.com -6 | hattiebond@netagy.com -13 | null -18 | daleadams@boink.com - -To remove duplicate documents with the `null` field value, use the following command: - -```sql -search source=accounts | dedup email | fields account_number, email; -``` -{% include copy.html %} - - -account_number | email -:--- | :--- -1 | amberduke@pyrami.com -6 | hattiebond@netagy.com -18 | daleadams@boink.com - -**Example 4: Dedup of consecutive documents** - -To remove duplicates of consecutive documents, use the following command: - -```sql -search source=accounts | dedup gender consecutive=true | fields account_number, gender; -``` -{% include copy.html %} - - -The command returns the following results. - -account_number | gender -:--- | :--- -1 | M -13 | F -18 | M - -### Limitations - -The `dedup` command is not rewritten to OpenSearch query domain-specific language (DSL); it is only executed on the coordinating node. - -
- ---- - -## eval - -
- - Syntax and examples - - {: .text-delta } - -The `eval` command evaluates an expression and appends its result to the search result. - -### Syntax - -```sql -eval = ["," = ]... -``` -{% include copy.html %} - - -The following table describes the parameters for the `eval` command. - -Field | Description | Required -:--- | :--- | :--- -`field` | If a field name does not exist, a new field is added. If the field name already exists, it's overwritten. | Yes -`expression` | Specify any supported expression. | Yes - -**Example 1: Create a new field** - -To create a new `doubleAge` field for each document where `doubleAge` is the result of `age` multiplied by 2, use the following command: - -```sql -search source=accounts | eval doubleAge = age * 2 | fields age, doubleAge; -``` -{% include copy.html %} - - -The command returns the following results. - -| age | doubleAge -:--- | :--- | -32 | 64 -36 | 72 -28 | 56 -33 | 66 - -**Example 2: Overwrite the existing field** - -To overwrite the `age` field with `age` plus 1, use the following command: - -```sql -search source=accounts | eval age = age + 1 | fields age; -``` -{% include copy.html %} - - -The command returns the following results. - -| age -| :--- -| 33 -| 37 -| 29 -| 34 - -**Example 3: Create a new field with a field defined with the `eval` command** - -To create a new field `ddAge` where `ddAge` is the result of `doubleAge` multiplied by 2 and `doubleAge` is defined in the `eval` command, use the following command: - -```sql -search source=accounts | eval doubleAge = age * 2, ddAge = doubleAge * 2 | fields age, doubleAge, ddAge; -``` -{% include copy.html %} - - -The command returns the following results. - -age | doubleAge | ddAge -:--- | :--- | :--- -32 | 64 | 128 -36 | 72 | 144 -28 | 56 | 112 -33 | 66 | 132 - - -### Limitations - -The `eval` command is not rewritten to OpenSearch query DSL; it is only executed on the coordinating node. - -
- ---- - -## fields - -
- - Syntax and examples - - {: .text-delta } - -Use the `fields` command to keep or remove fields from a search result. - -### Syntax - -```sql -fields [+|-] -``` -{% include copy.html %} - - -The following table describes the parameters for the `fields` command. - -Field | Description | Required | Default -:--- | :--- | :--- | :--- -`index` | Plus (+) keeps only fields specified in the field list. Minus (-) removes all fields specified in the field list. | No | + -`field-list` | Specify a comma-delimited list of fields. | Yes | No default - -**Example 1: Select specified fields from result** - -To get `account_number`, `firstname`, and `lastname` fields from a search result, use the following command: - -```sql -search source=accounts | fields account_number, firstname, lastname; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | firstname | lastname -:--- | :--- | :--- -| 1 | Amber | Duke -| 6 | Hattie | Bond -| 13 | Nanette | Bates -| 18 | Dale | Adams - -**Example 2: Remove specified fields from a search result** - -To remove the `account_number` field from the search results, use the following command: - -```sql -search source=accounts | fields account_number, firstname, lastname | fields - account_number; -``` -{% include copy.html %} - - -The command returns the following results. - -| firstname | lastname -| :--- | :--- | -| Amber | Duke -| Hattie | Bond -| Nanette | Bates -| Dale | Adams - -
- ---- - -## head - -
- - Syntax and examples - - {: .text-delta } - -Use the `head` command to return the first N number of results in a specified search order. - -### Syntax - -```sql -head [N] -``` -{% include copy.html %} - - -The following table describes the parameters for the `head` command. - -Field | Description | Required | Default -:--- | :--- | :--- | :--- -`N` | Specify the number of results to return. | No | 10 - -**Example 1: Get the first 10 results** - -To get the first 10 results, use the following command: - -```sql -search source=accounts | fields firstname, age | head; -``` -{% include copy.html %} - - -The command returns the following results. - -| firstname | age -:--- | :--- | -| Amber | 32 -| Hattie | 36 -| Nanette | 28 - -**Example 2: Get the first N results** - -To get the first two results, use the following command: - -```sql -search source=accounts | fields firstname, age | head 2; -``` -{% include copy.html %} - - -The command returns the following results. - -| firstname | age -:--- | :--- | -| Amber | 32 -| Hattie | 36 - -### Limitations - -The `head` command is not rewritten to OpenSearch query DSL; it is only executed on the coordinating node. - -
- ---- - -## join - -
- - Syntax and examples - - {: .text-delta } - -You can combine two datasets using the `join` command. The left side can be an index or results from piped commands, while the right side can be either an index or a subquery. - -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). -{: .warning} - -### Syntax - -```sql -[join-type] join [left-alias] [right-alias] on -``` -{% include copy.html %} - - -The following table describes additional requirements for the `join` command. - -Parameter | Description | Required -:--- | :--- | :--- -`join-criteria` | Any comparison expression. | Yes -`right-dataset` | Either an index or a subquery with or without an alias. | Yes - -The following table describes the parameters for the `join` command. - -Field | Description | Type | Required | Default -:--- | :--- | :--- | :--- | :--- -`join-type` | The type of join to perform. Valid values are `inner`, `left`, `right`, `full`, `cross`, `semi`, and `anti`. | `String` | No | `inner` -`left-alias` | The subquery alias to use with the left join side in order to avoid ambiguous naming. Fixed pattern: `left = ` | `String` | No | N/A -`right-alias` | The subquery alias to use with the right join side in order to avoid ambiguous naming. Fixed pattern: `right = ` | `String` | No | N/A -`join-criteria` | Any comparison expression. | `String` | Yes | N/A -`right-dataset` | Either an index or a subquery with/without an alias. | `String` | Yes | N/A - -The following examples use the `state_country` and `occupation` indexes. - -The `state_country` index contains the following data. - -| Name | Age | State | Country -:--- | :--- | :--- | :--- -| Jake | 70 | California | USA -| Hello | 30 | New York | USA -| John | 25 | Ontario | Canada -| Jane | 20 | Quebec | Canada -| Jim | 27 | B.C. | Canada -| Peter | 57 | B.C. | Canada -| Rick | 70 | B.C. | Canada -| David | 40 | Washington | USA - -The `occupation` index contains the following data. - -| Name | Occupation | Country | Salary -:--- | :--- | :--- | :--- -| Jake | Engineer | England | 100000 -| Hello | Artist | USA | 70000 -| John | Doctor | Canada | 120000 -| David | Doctor | USA | 120000 -| David | Unemployed | Canada | 0 -| Jane | Scientist | Canada | 90000 - -**Example 1: Join two indexes** - -The following example performs an inner join between two indexes: - -```sql -search source = state_country -| inner join left=a right=b ON a.name = b.name occupation -| stats avg(salary) by span(age, 10) as age_span, b.country -``` -{% include copy.html %} - - -The command returns the following results. - -avg(salary) | age_span | b.country -:--- | :--- | :--- -120000.0 | 40 | USA -105000.0 | 20 | Canada -0.0 | 40 | Canada -70000.0 | 30 | USA -100000.0 | 70 | England - -**Example 2: Join with a subsearch** - -The following example performs a left join with a subsearch: - -```sql -search source = state_country as a -| where country = 'USA' OR country = 'England' -| left join on a.name = b.name [ - source = occupation - | where salary > 0 - | fields name, country, salary - | sort salary - | head 3 - ] as b -| stats avg(salary) by span(age, 10) as age_span, b.country -``` -{% include copy.html %} - - -The command returns the following results. 
- -avg(salary) | age_span | b.country -:--- | :--- | :--- -null | 40 | null -70000.0 | 30 | USA -100000.0 | 70 | England - -### Limitations - -The `join` command works only when `plugins.calcite.enabled` is set to `true`. - -
- ---- - -## kmeans - -
- - Syntax and examples - - {: .text-delta } - -The `kmeans` command applies the ML Commons plugin's k-means algorithm to the provided PPL command's search results. - -### Syntax - -```sql -kmeans -``` -{% include copy.html %} - - -The following table describes the parameters for the `kmeans` command. - -Field | Description | Required -:--- | :--- | :--- -`cluster-number` | The number of clusters you want to group your data points into. | Yes - -**Example: Group Iris data** - -This example shows how to classify three Iris species (Iris setosa, Iris virginica, and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals: - -```sql -source=iris_data | fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm | kmeans 3 -``` -{% include copy.html %} - - -The command returns the following results. - -sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID -:--- | :--- | :--- | :--- | :--- | -| 5.1 | 3.5 | 1.4 | 0.2 | 1 -| 5.6 | 3.0 | 4.1 | 1.3 | 0 -| 6.7 | 2.5 | 5.8 | 1.8 | 2 - -
- ---- - -## lookup - -
- - Syntax and examples - - {: .text-delta } - -The `lookup` command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend index fields with values from a dimension table or append/replace values when a lookup condition is matched. As an alternative to the `join` command, the `lookup` command is more suitable for enriching the source data with a static dataset. - -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). -{: .warning} - -### Syntax - -```sql -lookup ( [as ])... [(replace | append) ( [AS ])...] -``` -{% include copy.html %} - - -The following table describes the parameters for the `lookup` command. - -Field | Description | Required | Default -:--- | :--- | :--- | :--- -`lookup-index` | The name of lookup index (dimension table). | Yes | N/A -`lookup-mapping-field`| A mapping key in the `lookup-index`, analogous to a `join` key from the right table. You can specify multiple `lookup-mapping-field` values with commas. | Yes | N/A -`source-mapping-field`| A mapping key from the source (left side), analogous to a `join` key from the left side. | No | `lookup-mapping-field` -`replace` \| `append` | The output strategies. When specifying `replace`, matched values in the `lookup-index` field overwrite the values in the results. If you specify `append`, matched values in the `lookup-index` field only append to the missing values in the results. | No | `replace` -`input-field` | A field in `lookup-index` where matched values are applied to the result output. You can specify multiple `input-field` values with commas. If you don't specify any `input-field`, all fields except `lookup-mapping-field` from `lookup-index` are matched values that are applied to the result output. | No | N/A -`output-field` | A field of output. You can specify zero or multiple `output-field` values. If you specify `output-field` with an existing field name in the source query, its values are replaced or appended by the matched values from `input-field`. If the field specified in `output-field` is a new field, an extended new field is applied to the results. | No | `input-field` - -The following examples use the `workers` and `work_information` indexes. - -The `workers` index contains the following data. - -| ID | Name | Occupation | Country | Salary -:--- | :--- | :--- | :--- | :--- -| 1000 | Jake | Engineer | England | 100000 -| 1001 | Hello | Artist | USA | 70000 -| 1002 | John | Doctor | Canada | 120000 -| 1003 | David | Doctor | N/A | 120000 -| 1004 | David | N/A | Canada | 0 -| 1005 | Jane | Scientist | Canada | 90000 - -The `work_information` index contains the following data. - -| UID | Name | Department | Occupation -:--- | :--- | :--- | :--- -| 1000 | Jake | IT | Engineer | -| 1002 | John | DATA | Scientist | -| 1003 | David | HR | Doctor | -| 1005 | Jane | DATA | Engineer | -| 1006 | Tom | SALES | Artist | - -**Example 1: Look up workers and return the corresponding department** - -The following example looks up workers and returns the corresponding department: - -```sql -source = workers | lookup work_information uid as id append department -``` -{% include copy.html %} - - -The command returns the following results. 
- -| id | name | occupation | country | salary | department -:--- | :--- | :--- | :--- | :--- | :--- -1000 | Jake | Engineer | England | 100000 | IT -1001 | Hello | Artist | USA | 70000 | Null -1002 | John | Doctor | Canada | 120000 | DATA -1003 | David | Doctor | Null | 120000 | HR -1004 | David | Null | Canada | 0 | Null -1005 | Jane | Scientist | Canada | 90000 | DATA - -**Example 2: Look up workers and replace their occupation and department** - -The following example looks up workers and replaces their occupation and department using their `work_information`: - -```sql -source = workers | lookup work_information uid as id, name -``` -{% include copy.html %} - - -The command returns the following results. - -id | name | occupation | country | salary | department -:--- | :--- | :--- | :--- | :--- | :--- -1000 | Jake | Engineer | England | 100000 | IT -1001 | Hello | null | USA | 70000 | null -1002 | John | Scientist | Canada | 120000 | DATA -1003 | David | Doctor | null | 120000 | HR -1004 | David | null | Canada | 0 | null -1005 | Jane | Engineer | Canada | 90000 | DATA - -**Example 3: Look up workers and create a new occupation field** - -The following example looks up workers and appends their occupation from `work_information` as a new field: - -```sql -source = workers | lookup work_information name replace occupation as new_occupation -``` -{% include copy.html %} - - -The command returns the following results. - -id | name | occupation | country | salary | new_occupation -:--- | :--- | :--- | :--- | :--- | :--- -1000 | Jake | Engineer | England | 100000 | Engineer -1001 | Hello | Artist | USA | 70000 | null -1002 | John | Doctor | Canada | 120000 | Scientist -1003 | David | Doctor | null | 120000 | Doctor -1004 | David | null | Canada | 0 | Doctor -1005 | Jane | Scientist | Canada | 90000 | Engineer - -### Limitations - -The `lookup` command works only when `plugins.calcite.enabled` is set to `true`. - -
- ---- - -## parse - -
- - Syntax and examples - - {: .text-delta } - -Use the `parse` command to parse a text field using a regular expression and append the result to the search result. - -### Syntax - -```sql -parse -``` -{% include copy.html %} - - -The following table describes the parameters for the `parse` command. - -Field | Description | Required -:--- | :--- | :--- -`field` | A text field. | Yes -`regular-expression` | The regular expression used to extract new fields from the given text field. If a new field name exists, it replaces the original field. | Yes - -The regular expression is used to match the whole text field of each document with the Java regex engine. Each named capture group in the expression becomes a new `STRING` field. - -**Example 1: Create a new field** - -The following example shows how to create a new field `host` for each document. `host` is the hostname after `@` in the `email` field. Parsing a null field returns an empty string: - -```sql -source=accounts | parse email '.+@(?.+)' | fields email, host ; -``` -{% include copy.html %} - - -The command returns the following results. - -| email | host -:--- | :--- | -| amberduke@pyrami.com | pyrami.com -| hattiebond@netagy.com | netagy.com -| null | null -| daleadams@boink.com | boink.com - -**Example 2: Override the existing field** - -The following example shows how to override the existing address field with the street number removed: - -```sql -source=accounts | parse address '\d+ (?
.+)' | fields address ; -``` -{% include copy.html %} - - -The command returns the following results. - -| address -:--- | -| Holmes Lane -| Bristol Street -| Madison Street -| Hutchinson Court - -**Example 3: Filter and sort by a cast-parsed field** - -The following example shows how to sort street numbers that are higher than 500 in the address field: - -```sql -source=accounts | parse address '(?\d+) (?.+)' | where cast(streetNumber as int) > 500 | sort num(streetNumber) | fields streetNumber, street ; -``` -{% include copy.html %} - - -The command returns the following results. - -| streetNumber | street -:--- | :--- | -| 671 | Bristol Street -| 789 | Madison Street -| 880 | Holmes Lane - -### Limitations - -A few limitations exist when using the `parse` command: - -- Fields defined by `parse` cannot be parsed again. For example, `source=accounts | parse address '\d+ (?.+)' | parse street '\w+ (?\w+)' ;` fails to return any expressions. -- Fields defined by `parse` cannot be overridden with other commands. For example, when entering `source=accounts | parse address '\d+ (?.+)' | eval street='1' | where street='1' ;` `where` does not match any documents since `street` cannot be overridden. -- The text field used by `parse` cannot be overridden. For example, when entering `source=accounts | parse address '\d+ (?.+)' | eval address='1' ;`, `street` is not parsed since the address is overridden. -- Fields defined by `parse` cannot be filtered/sorted after using them in the `stats` command. For example, `source=accounts | parse email '.+@(?.+)' | stats avg(age) by host | where host=pyrami.com ;` `where` does not match the domain listed. - -
- ---- - -## rare - -
- - Syntax and examples - - {: .text-delta } - -Use the `rare` command to find the least common values of all fields in a field list. A maximum of 10 results are returned for each distinct set of values of the group-by fields. - -### Syntax - -```sql -rare [by-clause] -``` -{% include copy.html %} - - -The following table describes the parameters for the `rare` command. - -Field | Description | Required -:--- | :--- | :--- -`field-list` | Specify a comma-delimited list of field names. | No -`by-clause` | Specify one or more fields to group the results by. | No - -**Example 1: Find the least common values in a field** - -To find the least common values of gender, use the following command: - -```sql -search source=accounts | rare gender; -``` -{% include copy.html %} - - -The command returns the following results. - -| gender -:--- | -| F -| M - -**Example 2: Find the least common values grouped by gender** - -To find the least common age grouped by gender, use the following command: - -```sql -search source=accounts | rare age by gender; -``` -{% include copy.html %} - - -The command returns the following results. - -| gender | age -:--- | :--- | -| F | 28 -| M | 32 -| M | 33 - -### Limitations - -The `rare` command is not rewritten to OpenSearch query DSL; it is only executed on the coordinating node. - -
- ---- - -## regex - -
- - Syntax and examples - - {: .text-delta } - -The `regex` command filters search results by matching field values against a regular expression pattern. Only documents in which the specified field matches the pattern are included in the results. - - -### Syntax - -```sql -regex = -regex != -``` -{% include copy.html %} - - -The following table describes the parameters for the `regex` command. - -Field | Description | Required -:--- | :--- | :--- -`field` | The field name to match against. | Yes -`pattern` | The regular expression pattern to match. Supports Java regex syntax, including named groups, lookahead/lookbehind, and character classes. | Yes - -**Example 1: Basic pattern matching** - -The following example shows how to filter documents where the ``lastname`` field matches names starting with uppercase letters: - -```sql -source=accounts | regex lastname="^[A-Z][a-z]+$" | fields account_number, firstname, lastname; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | firstname | lastname -:--- | :--- | :--- | -| 1 | Amber | Duke -| 6 | Hattie | Bond -| 13 | Nanette | Bates -| 18 | Dale | Adams - -**Example 2: Negative matching** - -The following example shows how to exclude documents where the ``lastname`` field ends with "son": - -```sql -source=accounts | regex lastname!=".*son$" | fields account_number, lastname; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | lastname -:--- | :--- | -| 1 | Duke -| 6 | Bond -| 13 | Bates -| 18 | Adams - -**Example 3: Email domain matching** - -The following example shows how to filter documents by email domain patterns: - -```sql -source=accounts | regex email="@pyrami\.com$" | fields account_number, email; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | email -:--- | :--- | -| 1 | amberduke@pyrami.com - -**Example 4: Complex patterns with character classes** - -The following example shows how to use complex regex patterns with character classes and quantifiers: - -```sql -source=accounts | regex address="\d{3,4}\s+[A-Z][a-z]+\s+(Street|Lane|Court)" | fields account_number, address; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | address -:--- | :--- | -| 1 | 880 Holmes Lane -| 6 | 671 Bristol Street -| 13 | 789 Madison Street -| 18 | 467 Hutchinson Court - -**Example 5: Case-sensitive matching** - -The following example demonstrates that regex matching is case-sensitive by default: - -```sql -source=accounts | regex state="va" | fields account_number, state; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | state -| :--- | :--- - -```sql -source=accounts | regex state="VA" | fields account_number, state; -``` -{% include copy.html %} - - -The command returns the following results. - -| account_number | state -:--- | :--- | -| 13 | VA - -### Limitations - -- **Field specification required**: A field name must be specified in the `regex` command. Pattern-only syntax (for example, ``regex "pattern"``) is not currently supported. -- **String fields only**: The `regex` command currently only supports string fields. Using it on numeric or Boolean fields results in an error. - -
- ---- - -## rename - -
-
-Use the `rename` command to rename one or more fields in the search result.
-
-
-### Syntax
-
-```sql
-rename <source-field> AS <target-field> ["," <source-field> AS <target-field>]...
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `rename` command.
-
-Field | Description | Required
-:--- | :--- | :---
-`source-field` | The name of the field that you want to rename. | Yes
-`target-field` | The new name for the field. | Yes
-
-**Example 1: Rename one field**
-
-To rename the `account_number` field as `an`, use the following command:
-
-```sql
-search source=accounts | rename account_number as an | fields an;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| an
-:--- |
-| 1
-| 6
-| 13
-| 18
-
-**Example 2: Rename multiple fields**
-
-To rename the `account_number` field as `an` and `employer` as `emp`, use the following command:
-
-```sql
-search source=accounts | rename account_number as an, employer as emp | fields an, emp;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| an | emp
-:--- | :--- |
-| 1 | Pyrami
-| 6 | Netagy
-| 13 | Quility
-| 18 | null
-
-### Limitations
-
-The `rename` command is not rewritten to OpenSearch query DSL; it is only executed on the coordinating node.
-
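-Renamed fields can be referenced by their new names in subsequent commands. The following query is a sketch (assuming the same `accounts` index) that renames `age` and then sorts by the new name:
-
-```sql
-search source=accounts | rename age as years | sort years | fields account_number, years;
-```
-{% include copy.html %}
-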
- ---- - -## rex - -
-
-The `rex` command extracts fields from a raw text field using regular expression named capture groups.
-
-
-### Syntax
-
-```sql
-rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] [offset_field=<field>]
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `rex` command.
-
-Field | Description | Required | Default
-:--- | :--- | :--- | :---
-`field` | The string field to extract data from. | Yes | N/A
-`pattern` | The regular expression pattern with named capture groups used to extract new fields. The pattern must contain at least one named capture group using `(?<name>pattern)` syntax. | Yes | N/A
-`mode` | Either `extract`, which creates new fields from regular expression named capture groups, or `sed`, which performs text substitution on the field using sed-style patterns. | No | `extract`
-`max_match` | The maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable using `plugins.ppl.rex.max_match.limit`). | No | 1
-`offset_field` | The field name used to store the character offset positions of matches. Only available in extract mode. | No | N/A
-
-**Example 1: Basic field extraction**
-
-The following example shows how to extract the username and domain from email addresses using named capture groups. Both extracted fields are returned as a string type:
-
-```sql
-source=accounts | rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" | fields email, username, domain | head 2;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| email | username | domain
-:--- | :--- | :--- |
-| amberduke@pyrami.com | amberduke | pyrami
-| hattiebond@netagy.com | hattiebond | netagy
-
-**Example 2: Handling non-matching patterns**
-
-The following example shows the `rex` command returning all events, setting extracted fields to null for non-matching patterns. Extracted fields are of a string type when matches are found:
-
-```sql
-source=accounts | rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)" | fields email, user, domain | head 2;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| email | user | domain
-:--- | :--- | :--- |
-| amberduke@pyrami.com | null | null
-| hattiebond@netagy.com | null | null
-
-**Example 3: Multiple matches with max_match**
-
-The following example shows how to extract multiple words from the address field using the `max_match` parameter. The extracted field is returned as an array type containing string elements:
-
-```sql
-source=accounts | rex field=address "(?<words>[A-Za-z]+)" max_match=2 | fields address, words | head 3;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| address | words
-| :--- | :--- |
-| 880 Holmes Lane | [Holmes,Lane]
-| 671 Bristol Street | [Bristol,Street]
-| 789 Madison Street | [Madison,Street]
-
-**Example 4: Text replacement with mode=sed**
-
-The following example shows how to replace email domains using sed mode for text substitution. The extracted field is returned as a string type:
-
-```sql
-source=accounts | rex field=email mode=sed "s/@.*/@company.com/" | fields email | head 2;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| email
-| :---
-| amberduke@company.com
-| hattiebond@company.com
-
-**Example 5: Using offset_field**
-
-The following example shows how to track the character positions where matches occur. Extracted fields are of a string type, and the `offset_field` is also of a string type:
-
-```sql
-source=accounts | rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" offset_field=matchpos | fields email, username, domain, matchpos | head 2;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| email | username | domain | matchpos
-:--- | :--- | :--- | :--- |
-| amberduke@pyrami.com | amberduke | pyrami | domain=10-15&username=0-8
-| hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9
-
-### Limitations
-
-**Named capture group naming**:
-
-- Group names must start with a letter and contain only letters and digits.
-- For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
-
-**Pattern requirements**:
-
-- The pattern must contain at least one named capture group.
-- Regular capture groups `(...)` without names are not allowed.
-
-**Max match limit**:
-
-- The `max_match` parameter is subject to a configurable system limit to prevent memory exhaustion.
-- When `max_match=0` (unlimited) is specified, it is automatically capped at the configured limit (default: 10).
-- User-specified values exceeding the configured limit result in an error.
-- Users can adjust the limit using the `plugins.ppl.rex.max_match.limit` cluster setting. Setting this limit to a large value is not recommended because it can lead to excessive memory consumption, especially with patterns that match empty strings (for example, `\d*`, `\w*`).
-
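-Extracted fields can feed directly into later commands. The following query is a sketch (assuming the same `accounts` index) that extracts the email domain and counts accounts per domain:
-
-```sql
-source=accounts | rex field=email "@(?<domain>[^.]+)" | stats count() by domain;
-```
-{% include copy.html %}
-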
- ---- - -## sort - -
-
-Use the `sort` command to sort search results by a specified field.
-
-
-### Syntax
-
-```sql
-sort [count] <[+|-] sort-field>...
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `sort` command.
-
-Field | Description | Required | Default
-:--- | :--- | :--- | :---
-`count` | The maximum number of results to return from the sorted result. If count=0, all results are returned. | No | 1000
-`[+|-]` | Use plus [+] to sort by ascending order and minus [-] to sort by descending order. | No | Ascending order
-`sort-field` | Specify the field that you want to sort by. | Yes | N/A
-
-**Example 1: Sort by one field**
-
-To sort all documents by the `age` field in ascending order, use the following command:
-
-```sql
-search source=accounts | sort age | fields account_number, age;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-account_number | age
-:--- | :---
-13 | 28
-1 | 32
-18 | 33
-6 | 36
-
-**Example 2: Sort by one field and return all results**
-
-To sort all documents by the `age` field in ascending order and specify `count` as 0 to return all results, use the following command:
-
-```sql
-search source=accounts | sort 0 age | fields account_number, age;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-account_number | age
-:--- | :---
-13 | 28
-1 | 32
-18 | 33
-6 | 36
-
-**Example 3: Sort by one field in descending order**
-
-To sort all documents by the `age` field in descending order, use the following command:
-
-```sql
-search source=accounts | sort - age | fields account_number, age;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-account_number | age
-:--- | :---
-6 | 36
-18 | 33
-1 | 32
-13 | 28
-
-**Example 4: Specify the number of sorted documents to return**
-
-To sort all documents by the `age` field in ascending order and specify `count` as 2 to return two results, use the following command:
-
-```sql
-search source=accounts | sort 2 age | fields account_number, age;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-account_number | age
-:--- | :---
-13 | 28
-1 | 32
-
-**Example 5: Sort by multiple fields**
-
-To sort all documents by the `gender` field in ascending order and the `age` field in descending order, use the following command:
-
-```sql
-search source=accounts | sort + gender, - age | fields account_number, gender, age;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| account_number | gender | age
-| :--- | :--- | :--- |
-| 13 | F | 28
-| 6 | M | 36
-| 18 | M | 33
-| 1 | M | 32
-
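-Combining `sort` with the `head` command is a common way to return only the top documents for a field. The following query is a sketch (assuming the same `accounts` index) that returns the single oldest account holder:
-
-```sql
-search source=accounts | sort - age | head 1 | fields account_number, age;
-```
-{% include copy.html %}
-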
- ---- - -## spath - -
-
-The `spath` command allows you to extract fields from structured text data. It currently supports selecting from JSON data using JSON paths.
-
-
-### Syntax
-
-```sql
-spath input=<field> [output=<field>] [path=]<path>
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `spath` command.
-
-Field | Description | Required | Default
-:--- | :--- | :--- | :---
-`input` | The field to scan for JSON data. | Yes | N/A
-`output` | The destination field into which the data is loaded. | No | Value of path
-`path` | The path of the data to load for the object. | Yes | N/A
-
-**Example 1: Simple field extraction**
-
-The simplest use of `spath` is extracting a single field. The following example extracts `n` from the `doc_n` field, which is of type text:
-
-```sql
-source=structured | spath input=doc_n n | fields doc_n n;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-doc_n | n
-:--- | :---
-{"n": 1} | 1
-{"n": 2} | 2
-{"n": 3} | 3
-
-**Example 2: Lists and nesting**
-
-The following example demonstrates additional JSON path use cases, such as traversing nested fields and extracting list elements:
-
-```sql
-source=structured | spath input=doc_list output=first_element list{0} | spath input=doc_list output=all_elements list{} | spath input=doc_list output=nested nest_out.nest_in | fields doc_list first_element all_elements nested;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| doc_list | first_element | all_elements | nested
-| :--- | :--- | :--- | :--- |
-| {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a
-| {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a
-| {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a
-
-**Example 3: Sum of inner elements**
-
-The following example shows how to extract an inner field and generate statistics on it, using the documents from the first example. It also demonstrates that `spath` always returns strings for inner types:
-
-```sql
-source=structured | spath input=doc_n n | eval n=cast(n as int) | stats sum(n) | fields `sum(n)`;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| sum(n)
-| :---
-| 6
-
-**Example 4: Escaped paths**
-
-The `spath` command can escape paths with strings, accepting any path that `json_extract` does. This includes escaping complex field names as array components:
-
-```sql
-source=structured | spath output=a input=doc_escape "['a fancy field name']" | spath output=b input=doc_escape "['a.b.c']" | fields a b;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| a | b
-| :--- | :---
-| true | 0
-| true | 1
-| false | 2
-
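-Values extracted by `spath` can also be used for filtering. The following query is a sketch (assuming the same `structured` index used in the preceding examples) that keeps only the documents whose extracted `n` value is greater than 1:
-
-```sql
-source=structured | spath input=doc_n n | eval n=cast(n as int) | where n > 1 | fields doc_n, n;
-```
-{% include copy.html %}
-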
- ---- - -## stats - -
-
-Use the `stats` command to aggregate data from search results.
-
-The following table lists the aggregation functions and also indicates how each one handles null or missing values.
-
-Function | NULL | MISSING
-:--- | :--- | :---
-`COUNT` | Not counted | Not counted
-`SUM` | Ignore | Ignore
-`AVG` | Ignore | Ignore
-`MAX` | Ignore | Ignore
-`MIN` | Ignore | Ignore
-
-
-### Syntax
-
-```sql
-stats <aggregation>... [by-clause]
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `stats` command.
-
-Field | Description | Required | Default
-:--- | :--- | :--- | :---
-`aggregation` | Specify a statistical aggregation function. The argument of this function must be a field. | Yes | N/A
-`by-clause` | Specify one or more fields to group the results by. If not specified, the `stats` command returns only one row, which is the aggregation over the entire result set. | No | N/A
-
-**Example 1: Calculate the average value of a field**
-
-To calculate the average `age` of all documents, use the following command:
-
-```sql
-search source=accounts | stats avg(age);
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| avg(age)
-:--- |
-| 32.25
-
-**Example 2: Calculate the average value of a field by group**
-
-To calculate the average `age` grouped by gender, use the following command:
-
-```sql
-search source=accounts | stats avg(age) by gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| gender | avg(age)
-:--- | :--- |
-| F | 28.0
-| M | 33.666666666666664
-
-**Example 3: Calculate the average and sum of a field by group**
-
-To calculate the average and sum of `age` grouped by gender, use the following command:
-
-```sql
-search source=accounts | stats avg(age), sum(age) by gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| gender | avg(age) | sum(age)
-:--- | :--- | :--- |
-| F | 28 | 28
-| M | 33.666666666666664 | 101
-
-**Example 4: Calculate the maximum value of a field**
-
-To calculate the maximum `age`, use the following command:
-
-```sql
-search source=accounts | stats max(age);
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| max(age)
-:--- |
-| 36
-
-**Example 5: Calculate the maximum and minimum value of a field by group**
-
-To calculate the maximum and minimum `age` values grouped by gender, use the following command:
-
-```sql
-search source=accounts | stats max(age), min(age) by gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| gender | min(age) | max(age)
-:--- | :--- | :--- |
-| F | 28 | 28
-| M | 32 | 36
-
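-Multiple aggregation functions can be combined in one `stats` command without a `by` clause. The following query is a sketch (assuming the same `accounts` index) that summarizes the `age` field in a single pass:
-
-```sql
-search source=accounts | stats count(age), min(age), max(age), avg(age);
-```
-{% include copy.html %}
-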
- ---- - -## timechart - -
-
-The `timechart` command creates a time-based aggregation of data. It groups data by time intervals and optionally by a field, then applies an aggregation function to each group. The results are returned in an unpivoted format with separate rows for each time-field combination.
-
-
-### Syntax
-
-```sql
-timechart [timefield=<field>] [span=<interval>] [limit=<int>] [useother=<bool>] <aggregation_function> [by <field>]
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `timechart` command.
-
-Field | Description | Required | Default
-:--- | :--- | :--- | :---
-`timefield` | The field to use for time-based grouping. Must be a timestamp field. | No | `@timestamp`
-`span` | Specifies the time interval for grouping data. | No | 1m
-`limit` | Specifies the maximum number of distinct values to display when using the "by" clause. | No | 10
-`useother` | Controls whether to create an "OTHER" category for values beyond the limit. | No | `true`
-`aggregation_function` | The aggregation function to apply to each time bucket. | Yes | N/A
-`by` | Groups the results by the specified field in addition to time intervals. If not specified, the aggregation is performed across all documents in each time interval. | No | N/A
-
-**Example 1: Count events by hour**
-
-The following example counts events for each hour and groups them by host:
-
-```sql
-source=events | timechart span=1h count() by host;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | host | count()
-:--- | :--- | :--- |
-| 2023-01-01 10:00:00 | server1 | 4
-| 2023-01-01 10:00:00 | server2 | 4
-
-**Example 2: Calculate average number of packets by minute**
-
-The following example calculates the average number of packets for each minute without grouping by any field:
-
-```sql
-source=events | timechart span=1m avg(packets);
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | avg(packets)
-:--- | :--- |
-| 2023-01-01 10:00:00 | 60.0
-| 2023-01-01 10:05:00 | 30.0
-| 2023-01-01 10:10:00 | 60.0
-| 2023-01-01 10:15:00 | 30.0
-| 2023-01-01 10:20:00 | 60.0
-| 2023-01-01 10:25:00 | 30.0
-| 2023-01-01 10:30:00 | 180.0
-| 2023-01-01 10:35:00 | 90.0
-
-**Example 3: Calculate the average number of packets for every 20 minutes by status**
-
-The following example calculates the average number of packets for every 20 minutes and groups the results by status:
-
-```sql
-source=events | timechart span=20m avg(packets) by status;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | status | avg(packets)
-:--- | :--- | :--- |
-| 2023-01-01 10:00:00 | active | 30.0
-| 2023-01-01 10:05:00 | inactive | 30.0
-| 2023-01-01 10:10:00 | pending | 60.0
-| 2023-01-01 10:15:00 | processing | 60.0
-| 2023-01-01 10:20:00 | cancelled | 180.0
-| 2023-01-01 10:25:00 | completed | 60.0
-| 2023-01-01 10:30:00 | inactive | 90.0
-| 2023-01-01 10:35:00 | pending | 30.0
-
-**Example 4: Using the limit parameter with the count() function**
-
-When there are many distinct values in the "by" field, the `timechart` command displays the top values based on the `limit` parameter and groups the rest into an "OTHER" category.
-The following query displays the top 2 hosts with the highest count values and groups the remaining hosts into an "OTHER" category:
-
-```sql
-source=events | timechart span=1m limit=2 count() by host;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | host | count()
-:--- | :--- | :--- |
-| 2023-01-01 10:00:00 | server1 | 1
-| 2023-01-01 10:05:00 | server2 | 1
-| 2023-01-01 10:10:00 | server1 | 1
-| 2023-01-01 10:15:00 | server2 | 1
-| 2023-01-01 10:20:00 | server1 | 1
-| 2023-01-01 10:25:00 | server2 | 1
-| 2023-01-01 10:30:00 | server1 | 1
-| 2023-01-01 10:35:00 | server2 | 1
-
-**Example 5: Using limit=0 with count() to show all values**
-
-To display all distinct values without any limit, set `limit=0` and use the following command:
-
-```sql
-source=events_many_hosts | timechart span=1h limit=0 count() by host;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | host | count()
-:--- | :--- | :--- |
-| 2024-07-01 00:00:00 | web-01 | 1
-| 2024-07-01 00:00:00 | web-02 | 1
-| 2024-07-01 00:00:00 | web-03 | 1
-| 2024-07-01 00:00:00 | web-04 | 1
-| 2024-07-01 00:00:00 | web-05 | 1
-| 2024-07-01 00:00:00 | web-06 | 1
-| 2024-07-01 00:00:00 | web-07 | 1
-| 2024-07-01 00:00:00 | web-08 | 1
-| 2024-07-01 00:00:00 | web-09 | 1
-| 2024-07-01 00:00:00 | web-10 | 1
-| 2024-07-01 00:00:00 | web-11 | 1
-
-**Example 6: Using useother=false with the count() function**
-
-The following example displays the top 10 hosts without the OTHER category (`useother=false`):
-
-```sql
-source=events_many_hosts | timechart span=1h useother=false count() by host;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | host | count()
-:--- | :--- | :--- |
-| 2024-07-01 00:00:00 | web-01 | 1
-| 2024-07-01 00:00:00 | web-02 | 1
-| 2024-07-01 00:00:00 | web-03 | 1
-| 2024-07-01 00:00:00 | web-04 | 1
-| 2024-07-01 00:00:00 | web-05 | 1
-| 2024-07-01 00:00:00 | web-06 | 1
-| 2024-07-01 00:00:00 | web-07 | 1
-| 2024-07-01 00:00:00 | web-08 | 1
-| 2024-07-01 00:00:00 | web-09 | 1
-| 2024-07-01 00:00:00 | web-10 | 1
-
-**Example 7: Using the limit parameter with the useother parameter and the avg() function**
-
-The following example displays the top 3 hosts with the OTHER category (default is `useother=true`):
-
-```sql
-source=events_many_hosts | timechart span=1h limit=3 avg(cpu_usage) by host;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| @timestamp | host | avg(cpu_usage)
-:--- | :--- | :--- |
-| 2024-07-01 00:00:00 | OTHER | 41.3
-| 2024-07-01 00:00:00 | web-03 | 55.3
-| 2024-07-01 00:00:00 | web-07 | 48.6
-| 2024-07-01 00:00:00 | web-09 | 67.8
-
-**Example 8: Handling null values in the "by" field**
-
-The following example shows how null values in the "by" field are treated as a separate category. The dataset `events_null` has 1 entry that does not have a host field. It is put into a separate "NULL" category because the defaults for `usenull` and `nullstr` are `true` and `"NULL"`, respectively:
-
-```sql
-source=events_null | timechart span=1h count() by host;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
- -| @timestamp | host | count() -:--- | :--- | :--- | -| 2024-07-01 00:00:00 | NULL | 1 -| 2024-07-01 00:00:00 | db-01 | 1 -| 2024-07-01 00:00:00 | web-01 | 2 -| 2024-07-01 00:00:00 | web-02 | 2 - -**Example 9: Calculate packets per second rate** - -The following example calculates the per-second packet rate for network traffic data using the `per_second()` function: - -```sql -source=events | timechart span=30m per_second(packets) by host; -``` -{% include copy.html %} - - -The command returns the following results. - -| @timestamp | host | per_second(packets) -:--- | :--- | :--- | -| 2024-07-01 00:00:00 | server1 | 0.1 -| 2024-07-01 00:00:00 | server2 | 0.05 -| 2024-07-01 00:00:00 | server1 | 0.1 -| 2024-07-01 00:00:00 | server2 | 0.05 - -### Limitations -- Only a single aggregation function is supported per `timechart` command. -- The `bins` parameter and other bin options are not supported in the `timechart` command. Use the `span` parameter to control time intervals. - -
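-The time field does not have to be `@timestamp`. The following query is a sketch (assuming a hypothetical `events` index with a timestamp field named `created_at`) that uses the `timefield` parameter to group on that field instead:
-
-```sql
-source=events | timechart timefield=created_at span=1h count();
-```
-{% include copy.html %}
-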
- ---- - -## top - -
-
-Use the `top` command to find the most common values of all fields in the field list.
-
-### Syntax
-
-```sql
-top [N] <field-list> [by-clause]
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `top` command.
-
-Field | Description | Required | Default
-:--- | :--- | :--- | :---
-`N` | Specify the number of results to return. | No | 10
-`field-list` | Specify a comma-delimited list of field names. | Yes | N/A
-`by-clause` | Specify one or more fields to group the results by. | No | N/A
-
-**Example 1: Find the most common values in a field**
-
-To find the most common genders, use the following command:
-
-```sql
-search source=accounts | top gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| gender
-:--- |
-| M
-| F
-
-**Example 2: Find the most common value in a field**
-
-To find the most common gender, use the following command:
-
-```sql
-search source=accounts | top 1 gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| gender
-:--- |
-| M
-
-**Example 3: Find the most common values grouped by gender**
-
-To find the most common age grouped by gender, use the following command:
-
-```sql
-search source=accounts | top 1 age by gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| gender | age
-:--- | :--- |
-| F | 28
-| M | 32
-
-### Limitations
-
-The `top` command is not rewritten to OpenSearch query DSL; it is only executed on the coordinating node.
-
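-Because the `top` command is executed on the coordinating node, filtering before it keeps the computation small. The following query is a sketch (assuming the same `accounts` index) that finds the two most common states among male account holders:
-
-```sql
-search source=accounts | where gender="M" | top 2 state;
-```
-{% include copy.html %}
-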
- ---- - -## where - -
-
-Use the `where` command with a Boolean expression to filter the search result. The `where` command only returns the result when the Boolean expression evaluates to `true`.
-
-### Syntax
-
-```sql
-where <boolean-expression>
-```
-{% include copy.html %}
-
-
-The following table describes the parameters for the `where` command.
-
-Field | Description | Required
-:--- | :--- | :---
-`boolean-expression` | An expression that evaluates to a Boolean value. | Yes
-
-**Example: Filter the result set with a condition**
-
-To get all documents from the `accounts` index where `account_number` is 1 or gender is `F`, use the following command:
-
-```sql
-search source=accounts | where account_number=1 or gender="F" | fields account_number, gender;
-```
-{% include copy.html %}
-
-
-The command returns the following results.
-
-| account_number | gender
-:--- | :--- |
-| 1 | M
-| 13 | F
-
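-Boolean expressions in the `where` command can combine multiple comparisons. The following query is a sketch (assuming the same `accounts` index) that filters on both a numeric and a string condition:
-
-```sql
-search source=accounts | where age >= 30 and gender = "M" | fields account_number, age, gender;
-```
-{% include copy.html %}
-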
\ No newline at end of file From dc8c7dd64bcde9e22e2ca2c166b66ad3baaee49c Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Tue, 23 Dec 2025 15:08:45 -0500 Subject: [PATCH 2/9] Doc review Signed-off-by: Fanit Kolchina --- _sql-and-ppl/ppl/cmd/ad.md | 123 --- _sql-and-ppl/ppl/cmd/addcoltotals.md | 84 -- _sql-and-ppl/ppl/cmd/addtotals.md | 108 --- _sql-and-ppl/ppl/cmd/append.md | 68 -- _sql-and-ppl/ppl/cmd/appendpipe.md | 74 -- _sql-and-ppl/ppl/cmd/chart.md | 190 ----- _sql-and-ppl/ppl/cmd/dedup.md | 128 ---- _sql-and-ppl/ppl/cmd/eval.md | 127 --- _sql-and-ppl/ppl/cmd/eventstats.md | 164 ---- _sql-and-ppl/ppl/cmd/expand.md | 57 -- _sql-and-ppl/ppl/cmd/fillnull.md | 169 ---- _sql-and-ppl/ppl/cmd/flatten.md | 101 --- _sql-and-ppl/ppl/cmd/head.md | 85 --- _sql-and-ppl/ppl/cmd/index.md | 15 - _sql-and-ppl/ppl/cmd/join.md | 216 ------ _sql-and-ppl/ppl/cmd/kmeans.md | 45 -- _sql-and-ppl/ppl/cmd/lookup.md | 357 --------- _sql-and-ppl/ppl/cmd/ml.md | 151 ---- _sql-and-ppl/ppl/cmd/multisearch.md | 151 ---- _sql-and-ppl/ppl/cmd/parse.md | 134 ---- _sql-and-ppl/ppl/cmd/rare.md | 138 ---- _sql-and-ppl/ppl/cmd/regex.md | 147 ---- _sql-and-ppl/ppl/cmd/rename.md | 138 ---- _sql-and-ppl/ppl/cmd/replace.md | 301 -------- _sql-and-ppl/ppl/cmd/reverse.md | 129 ---- _sql-and-ppl/ppl/cmd/rex.md | 277 ------- _sql-and-ppl/ppl/cmd/search.md | 653 ---------------- _sql-and-ppl/ppl/cmd/showdatasources.md | 39 - _sql-and-ppl/ppl/cmd/sort.md | 234 ------ _sql-and-ppl/ppl/cmd/spath.md | 108 --- _sql-and-ppl/ppl/cmd/stats.md | 464 ----------- _sql-and-ppl/ppl/cmd/streamstats.md | 266 ------- _sql-and-ppl/ppl/cmd/syntax.md | 83 -- _sql-and-ppl/ppl/cmd/table.md | 44 -- _sql-and-ppl/ppl/cmd/timechart.md | 353 --------- _sql-and-ppl/ppl/cmd/top.md | 152 ---- _sql-and-ppl/ppl/cmd/trendline.md | 112 --- _sql-and-ppl/ppl/cmd/where.md | 188 ----- _sql-and-ppl/ppl/commands/ad.md | 157 ++++ _sql-and-ppl/ppl/commands/addcoltotals.md | 94 +++ _sql-and-ppl/ppl/commands/addtotals.md | 120 +++ _sql-and-ppl/ppl/commands/append.md | 76 ++ .../ppl/{cmd => commands}/appendcol.md | 50 +- _sql-and-ppl/ppl/commands/appendpipe.md | 86 +++ _sql-and-ppl/ppl/{cmd => commands}/bin.md | 189 +++-- _sql-and-ppl/ppl/commands/chart.md | 181 +++++ _sql-and-ppl/ppl/commands/dedup.md | 134 ++++ .../ppl/{cmd => commands}/describe.md | 51 +- _sql-and-ppl/ppl/commands/eval.md | 136 ++++ _sql-and-ppl/ppl/commands/eventstats.md | 169 ++++ _sql-and-ppl/ppl/commands/expand.md | 68 ++ _sql-and-ppl/ppl/{cmd => commands}/explain.md | 74 +- _sql-and-ppl/ppl/{cmd => commands}/fields.md | 74 +- _sql-and-ppl/ppl/commands/fillnull.md | 180 +++++ _sql-and-ppl/ppl/commands/flatten.md | 101 +++ _sql-and-ppl/ppl/{cmd => commands}/grok.md | 49 +- _sql-and-ppl/ppl/commands/head.md | 94 +++ _sql-and-ppl/ppl/commands/index.md | 17 + _sql-and-ppl/ppl/commands/join.md | 218 ++++++ _sql-and-ppl/ppl/commands/kmeans.md | 57 ++ _sql-and-ppl/ppl/commands/lookup.md | 123 +++ _sql-and-ppl/ppl/commands/ml.md | 189 +++++ _sql-and-ppl/ppl/commands/multisearch.md | 157 ++++ _sql-and-ppl/ppl/commands/parse.md | 136 ++++ .../ppl/{cmd => commands}/patterns.md | 185 +++-- _sql-and-ppl/ppl/commands/rare.md | 147 ++++ _sql-and-ppl/ppl/commands/regex.md | 155 ++++ _sql-and-ppl/ppl/commands/rename.md | 148 ++++ _sql-and-ppl/ppl/commands/replace.md | 288 +++++++ _sql-and-ppl/ppl/commands/reverse.md | 130 ++++ _sql-and-ppl/ppl/commands/rex.md | 271 +++++++ _sql-and-ppl/ppl/commands/search.md | 721 ++++++++++++++++++ _sql-and-ppl/ppl/commands/showdatasources.md | 41 + 
_sql-and-ppl/ppl/commands/sort.md | 196 +++++ _sql-and-ppl/ppl/commands/spath.md | 117 +++ _sql-and-ppl/ppl/commands/stats.md | 477 ++++++++++++ _sql-and-ppl/ppl/commands/streamstats.md | 263 +++++++ .../ppl/{cmd => commands}/subquery.md | 258 +++---- _sql-and-ppl/ppl/{ => commands}/syntax.md | 67 +- _sql-and-ppl/ppl/commands/table.md | 52 ++ _sql-and-ppl/ppl/commands/timechart.md | 368 +++++++++ _sql-and-ppl/ppl/commands/top.md | 156 ++++ _sql-and-ppl/ppl/commands/trendline.md | 122 +++ _sql-and-ppl/ppl/commands/where.md | 199 +++++ 84 files changed, 6907 insertions(+), 6807 deletions(-) delete mode 100644 _sql-and-ppl/ppl/cmd/ad.md delete mode 100644 _sql-and-ppl/ppl/cmd/addcoltotals.md delete mode 100644 _sql-and-ppl/ppl/cmd/addtotals.md delete mode 100644 _sql-and-ppl/ppl/cmd/append.md delete mode 100644 _sql-and-ppl/ppl/cmd/appendpipe.md delete mode 100644 _sql-and-ppl/ppl/cmd/chart.md delete mode 100644 _sql-and-ppl/ppl/cmd/dedup.md delete mode 100644 _sql-and-ppl/ppl/cmd/eval.md delete mode 100644 _sql-and-ppl/ppl/cmd/eventstats.md delete mode 100644 _sql-and-ppl/ppl/cmd/expand.md delete mode 100644 _sql-and-ppl/ppl/cmd/fillnull.md delete mode 100644 _sql-and-ppl/ppl/cmd/flatten.md delete mode 100644 _sql-and-ppl/ppl/cmd/head.md delete mode 100644 _sql-and-ppl/ppl/cmd/index.md delete mode 100644 _sql-and-ppl/ppl/cmd/join.md delete mode 100644 _sql-and-ppl/ppl/cmd/kmeans.md delete mode 100644 _sql-and-ppl/ppl/cmd/lookup.md delete mode 100644 _sql-and-ppl/ppl/cmd/ml.md delete mode 100644 _sql-and-ppl/ppl/cmd/multisearch.md delete mode 100644 _sql-and-ppl/ppl/cmd/parse.md delete mode 100644 _sql-and-ppl/ppl/cmd/rare.md delete mode 100644 _sql-and-ppl/ppl/cmd/regex.md delete mode 100644 _sql-and-ppl/ppl/cmd/rename.md delete mode 100644 _sql-and-ppl/ppl/cmd/replace.md delete mode 100644 _sql-and-ppl/ppl/cmd/reverse.md delete mode 100644 _sql-and-ppl/ppl/cmd/rex.md delete mode 100644 _sql-and-ppl/ppl/cmd/search.md delete mode 100644 _sql-and-ppl/ppl/cmd/showdatasources.md delete mode 100644 _sql-and-ppl/ppl/cmd/sort.md delete mode 100644 _sql-and-ppl/ppl/cmd/spath.md delete mode 100644 _sql-and-ppl/ppl/cmd/stats.md delete mode 100644 _sql-and-ppl/ppl/cmd/streamstats.md delete mode 100644 _sql-and-ppl/ppl/cmd/syntax.md delete mode 100644 _sql-and-ppl/ppl/cmd/table.md delete mode 100644 _sql-and-ppl/ppl/cmd/timechart.md delete mode 100644 _sql-and-ppl/ppl/cmd/top.md delete mode 100644 _sql-and-ppl/ppl/cmd/trendline.md delete mode 100644 _sql-and-ppl/ppl/cmd/where.md create mode 100644 _sql-and-ppl/ppl/commands/ad.md create mode 100644 _sql-and-ppl/ppl/commands/addcoltotals.md create mode 100644 _sql-and-ppl/ppl/commands/addtotals.md create mode 100644 _sql-and-ppl/ppl/commands/append.md rename _sql-and-ppl/ppl/{cmd => commands}/appendcol.md (57%) create mode 100644 _sql-and-ppl/ppl/commands/appendpipe.md rename _sql-and-ppl/ppl/{cmd => commands}/bin.md (51%) create mode 100644 _sql-and-ppl/ppl/commands/chart.md create mode 100644 _sql-and-ppl/ppl/commands/dedup.md rename _sql-and-ppl/ppl/{cmd => commands}/describe.md (64%) create mode 100644 _sql-and-ppl/ppl/commands/eval.md create mode 100644 _sql-and-ppl/ppl/commands/eventstats.md create mode 100644 _sql-and-ppl/ppl/commands/expand.md rename _sql-and-ppl/ppl/{cmd => commands}/explain.md (59%) rename _sql-and-ppl/ppl/{cmd => commands}/fields.md (57%) create mode 100644 _sql-and-ppl/ppl/commands/fillnull.md create mode 100644 _sql-and-ppl/ppl/commands/flatten.md rename _sql-and-ppl/ppl/{cmd => commands}/grok.md (50%) create mode 100644 
_sql-and-ppl/ppl/commands/head.md
 create mode 100644 _sql-and-ppl/ppl/commands/index.md
 create mode 100644 _sql-and-ppl/ppl/commands/join.md
 create mode 100644 _sql-and-ppl/ppl/commands/kmeans.md
 create mode 100644 _sql-and-ppl/ppl/commands/lookup.md
 create mode 100644 _sql-and-ppl/ppl/commands/ml.md
 create mode 100644 _sql-and-ppl/ppl/commands/multisearch.md
 create mode 100644 _sql-and-ppl/ppl/commands/parse.md
 rename _sql-and-ppl/ppl/{cmd => commands}/patterns.md (51%)
 create mode 100644 _sql-and-ppl/ppl/commands/rare.md
 create mode 100644 _sql-and-ppl/ppl/commands/regex.md
 create mode 100644 _sql-and-ppl/ppl/commands/rename.md
 create mode 100644 _sql-and-ppl/ppl/commands/replace.md
 create mode 100644 _sql-and-ppl/ppl/commands/reverse.md
 create mode 100644 _sql-and-ppl/ppl/commands/rex.md
 create mode 100644 _sql-and-ppl/ppl/commands/search.md
 create mode 100644 _sql-and-ppl/ppl/commands/showdatasources.md
 create mode 100644 _sql-and-ppl/ppl/commands/sort.md
 create mode 100644 _sql-and-ppl/ppl/commands/spath.md
 create mode 100644 _sql-and-ppl/ppl/commands/stats.md
 create mode 100644 _sql-and-ppl/ppl/commands/streamstats.md
 rename _sql-and-ppl/ppl/{cmd => commands}/subquery.md (54%)
 rename _sql-and-ppl/ppl/{ => commands}/syntax.md (52%)
 create mode 100644 _sql-and-ppl/ppl/commands/table.md
 create mode 100644 _sql-and-ppl/ppl/commands/timechart.md
 create mode 100644 _sql-and-ppl/ppl/commands/top.md
 create mode 100644 _sql-and-ppl/ppl/commands/trendline.md
 create mode 100644 _sql-and-ppl/ppl/commands/where.md
diff --git a/_sql-and-ppl/ppl/cmd/ad.md b/_sql-and-ppl/ppl/cmd/ad.md
deleted file mode 100644
index 23b8f80c0b5..00000000000
--- a/_sql-and-ppl/ppl/cmd/ad.md
+++ /dev/null
@@ -1,123 +0,0 @@
----
-layout: default
-title: "ad"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 1
----
-# ad (deprecated by ml command)
-
-
-The `ad` command applies the Random Cut Forest (RCF) algorithm from the ml-commons plugin to the search results returned by a PPL command. Based on the input, the command uses one of two RCF algorithm types: fixed-in-time RCF for processing time-series data and batch RCF for processing non-time-series data.
-
-## Syntax
-
-The following sections describe the syntax for each RCF algorithm type.
-
-## Fixed in time RCF for time-series data
-
-`ad [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [anomaly_rate] <time_field> [date_format] [time_zone] [category_field]`
-* `number_of_trees`: optional. Number of trees in the forest. **Default:** 30.
-* `shingle_size`: optional. A shingle is a consecutive sequence of the most recent records. **Default:** 8.
-* `sample_size`: optional. The sample size used by stream samplers in this forest. **Default:** 256.
-* `output_after`: optional. The number of points required by stream samplers before results are returned. **Default:** 32.
-* `time_decay`: optional. The decay factor used by stream samplers in this forest. **Default:** 0.0001.
-* `anomaly_rate`: optional. The anomaly rate. **Default:** 0.005.
-* `time_field`: mandatory. Specifies the time field for RCF to use as time-series data.
-* `date_format`: optional. Used for formatting time_field. **Default:** "yyyy-MM-dd HH:mm:ss".
-* `time_zone`: optional. Used for setting the time zone for time_field. **Default:** "UTC".
-* `category_field`: optional. Specifies the category field used to group inputs. Each category is predicted independently.
- - -## Batch RCF for non-time-series data - -`ad [number_of_trees] [sample_size] [output_after] [training_data_size] [anomaly_score_threshold] [category_field]` -* `number_of_trees`: optional. Number of trees in the forest. **Default:** 30. -* `sample_size`: optional. Number of random samples given to each tree from the training dataset. **Default:** 256. -* `output_after`: optional. The number of points required by stream samplers before results are returned. **Default:** 32. -* `training_data_size`: optional. **Default:** size of your training dataset. -* `anomaly_score_threshold`: optional. The threshold of anomaly score. **Default:** 1.0. -* `category_field`: optional. Specifies the category field used to group inputs. Each category will be independently predicted. - - -## Example 1: Detecting events in New York City from taxi ridership data with time-series data - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data. - -```sql -source=nyc_taxi -| fields value, timestamp -| AD time_field='timestamp' -| where value=10844.0 -``` -{% include copy.html %} - -Expected output: - -| value | timestamp | score | anomaly_grade | -| --- | --- | --- | --- | -| 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | - - -## Example 2: Detecting events in New York City from taxi ridership data with time-series data independently with each category - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values. - -```sql -source=nyc_taxi -| fields category, value, timestamp -| AD time_field='timestamp' category_field='category' -| where value=10844.0 or value=6526.0 -``` -{% include copy.html %} - -Expected output: - -| category | value | timestamp | score | anomaly_grade | -| --- | --- | --- | --- | --- | -| night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | -| day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 | - - -## Example 3: Detecting events in New York City from taxi ridership data with non-time-series data - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data. - -```sql -source=nyc_taxi -| fields value -| AD -| where value=10844.0 -``` -{% include copy.html %} - -Expected output: - -| value | score | anomalous | -| --- | --- | --- | -| 10844.0 | 0.0 | False | - - -## Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently with each category - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values. - -```sql -source=nyc_taxi -| fields category, value -| AD category_field='category' -| where value=10844.0 or value=6526.0 -``` -{% include copy.html %} - -Expected output: - -| category | value | score | anomalous | -| --- | --- | --- | --- | -| night | 10844.0 | 0.0 | False | -| day | 6526.0 | 0.0 | False | - - -## Limitations - -The `ad` command can only work with `plugins.calcite.enabled=false`. 
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/addcoltotals.md b/_sql-and-ppl/ppl/cmd/addcoltotals.md
deleted file mode 100644
index 57f7d83a1e6..00000000000
--- a/_sql-and-ppl/ppl/cmd/addcoltotals.md
+++ /dev/null
@@ -1,84 +0,0 @@
----
-layout: default
-title: "addcoltotals"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 2
----
-# addcoltotals
-
-
-
-The `addcoltotals` command computes the sum of each column and adds a summary event at the end to show the total of each column. It works the same way as the `addtotals` command with the row=false and col=true options. This is useful for creating summary reports with subtotals or grand totals. The `addcoltotals` command sums only numeric fields (integers, floats, and doubles). Non-numeric fields are ignored, whether they are explicitly specified in the field list or included implicitly when no field list is specified.
-
-## Syntax
-
-Use the following syntax:
-
-`addcoltotals [field-list] [label=<string>] [labelfield=<field>]`
-
-- `field-list`: Optional. Comma-separated list of numeric fields to sum. If not specified, all numeric fields are summed.
-- `labelfield=<field>`: Optional. Field name in which to place the label. If it specifies a non-existing field, that field is added and the label is shown in it on the summary event row.
-- `label=<string>`: Optional. Custom text for the label placed in `labelfield` on the totals row. Default is "Total".
-
-## Example 1: Basic example
-
-The following example PPL query shows how to use `addcoltotals` to place the label in an existing field.
-
-```sql
-source=accounts
-| fields firstname, balance
-| head 3
-| addcoltotals labelfield='firstname'
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | balance |
-| --- | --- |
-| Amber | 39225 |
-| Hattie | 5686 |
-| Nanette | 32838 |
-| Total | 77749 |
-
-## Example 2: Adding column totals and adding a summary event with label specified
-
-The following example PPL query shows how to use `addcoltotals` to add totals after a `stats` command, labeling the final summary event 'Sum'. Because the field specified by `labelfield` does not match an existing field, a new field is added to hold the label.
-
-```sql
-source=accounts
-| stats count() by gender
-| addcoltotals `count()` label='Sum' labelfield='Total'
-```
-{% include copy.html %}
-
-Expected output:
-
-| count() | gender | Total |
-| --- | --- | --- |
-| 1 | F | null |
-| 3 | M | null |
-| 4 | null | Sum |
-
-## Example 3: With all options
-
-The following example PPL query shows how to use `addcoltotals` with all options set.
-
-```sql
-source=accounts
-| where age > 30
-| stats avg(balance) as avg_balance, count() as count by state
-| head 3
-| addcoltotals avg_balance, count label='Sum' labelfield='Column Total'
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg_balance | count | state | Column Total |
-| --- | --- | --- | --- |
-| 39225.0 | 1 | IL | null |
-| 4180.0 | 1 | MD | null |
-| 5686.0 | 1 | TN | null |
-| 49091.0 | 3 | null | Sum |
diff --git a/_sql-and-ppl/ppl/cmd/addtotals.md b/_sql-and-ppl/ppl/cmd/addtotals.md
deleted file mode 100644
index 6cfbf873dc8..00000000000
--- a/_sql-and-ppl/ppl/cmd/addtotals.md
+++ /dev/null
@@ -1,108 +0,0 @@
----
-layout: default
-title: "addtotals"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 3
----
-# addtotals
-
-
-
-The `addtotals` command computes the sum of numeric fields and appends a row with the totals to the result. The command can also add row totals and a field to store them. This is useful for creating summary reports with subtotals or grand totals. The `addtotals` command sums only numeric fields (integers, floats, and doubles). Non-numeric fields are ignored, whether they are explicitly specified in the field list or included implicitly when no field list is specified.
-
-## Syntax
-
-Use the following syntax:
-
-`addtotals [field-list] [label=<string>] [labelfield=<field>] [row=<bool>] [col=<bool>] [fieldname=<field>]`
-
-- `field-list`: Optional. Comma-separated list of numeric fields to sum. If not specified, all numeric fields are summed.
-- `row=<bool>`: Optional. Calculates the total of each row and adds a new field containing it. Default is true.
-- `col=<bool>`: Optional. Calculates the total of each column and appends a summary event containing the totals after all other events. Default is false.
-- `labelfield=<field>`: Optional. Field name in which to place the label. If it specifies a non-existing field, that field is added and the label is shown in it on the summary event row. Applicable when col=true.
-- `label=<string>`: Optional. Custom text for the label placed in `labelfield` on the totals row. Default is "Total". Applicable when col=true. This has no effect when the labelfield and fieldname parameters have the same value.
-- `fieldname=<field>`: Optional. Name of the new field that stores each row's total. Applicable when row=true.
-
-## Example 1: Basic example
-
-The following example PPL query shows how to use `addtotals` to place the label in an existing field.
-
-```sql
-source=accounts
-| head 3
-| fields firstname, balance
-| addtotals col=true labelfield='firstname' label='Total'
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | balance | Total |
-| --- | --- | --- |
-| Amber | 39225 | 39225 |
-| Hattie | 5686 | 5686 |
-| Nanette | 32838 | 32838 |
-| Total | 77749 | null |
-
-## Example 2: Adding column totals and adding a summary event with label specified
-
-The following example PPL query shows how to use `addtotals` to add totals after a `stats` command, labeling the final summary event 'Sum'. Because the field specified by `labelfield` does not match an existing field, a new field is added to hold the label.
-
-```sql
-source=accounts
-| fields account_number, firstname, balance, age
-| addtotals col=true row=false label='Sum' labelfield='Total'
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | balance | age | Total |
-| --- | --- | --- | --- | --- |
-| 1 | Amber | 39225 | 32 | null |
-| 6 | Hattie | 5686 | 36 | null |
-| 13 | Nanette | 32838 | 28 | null |
-| 18 | Dale | 4180 | 33 | null |
-| 38 | null | 81929 | 129 | Sum |
-
-If row=true is set in the preceding example, the column added for column totals and the column added for row totals conflict because both use the field 'Total'. In that case, the final summary row shows null instead of 'Sum' because the column is numeric and cannot hold a string value.
-
-```sql
-source=accounts
-| fields account_number, firstname, balance, age
-| addtotals col=true row=true label='Sum' labelfield='Total'
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | balance | age | Total |
-| --- | --- | --- | --- | --- |
-| 1 | Amber | 39225 | 32 | 39258 |
-| 6 | Hattie | 5686 | 36 | 5728 |
-| 13 | Nanette | 32838 | 28 | 32879 |
-| 18 | Dale | 4180 | 33 | 4231 |
-| 38 | null | 81929 | 129 | null |
-
-## Example 3: With all options
-
-The following example PPL query shows how to use `addtotals` with all options set.
-
-```sql
-source=accounts
-| where age > 30
-| stats avg(balance) as avg_balance, count() as count by state
-| head 3
-| addtotals avg_balance, count row=true col=true fieldname='Row Total' label='Sum' labelfield='Column Total'
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg_balance | count | state | Row Total | Column Total |
-| --- | --- | --- | --- | --- |
-| 39225.0 | 1 | IL | 39226.0 | null |
-| 4180.0 | 1 | MD | 4181.0 | null |
-| 5686.0 | 1 | TN | 5687.0 | null |
-| 49091.0 | 3 | null | null | Sum |
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/append.md b/_sql-and-ppl/ppl/cmd/append.md
deleted file mode 100644
index 5ee7811a7a7..00000000000
--- a/_sql-and-ppl/ppl/cmd/append.md
+++ /dev/null
@@ -1,68 +0,0 @@
----
-layout: default
-title: "append"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 4
----
-# append
-
-
-The `append` command appends the result of a sub-search and attaches it as additional rows to the bottom of the input search results (the main search).
-
-The command aligns columns with the same field names and types. For columns that differ between the main search and the sub-search, NULL values fill the respective rows.
-
-## Syntax
-
-Use the following syntax:
-
-`append <sub-search>`
-* `sub-search`: mandatory. Executes PPL commands as a secondary search.
-
-
-## Limitations
-
-* **Schema compatibility**: When fields with the same name exist between the main search and sub-search but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with `eval` or using `fields` to select non-conflicting columns).
-
-
-## Example 1: Append rows from a count aggregation to existing search results
-
-The following example appends rows from "count by gender" to "sum by gender, state".
-
-```sql
-source=accounts | stats sum(age) by gender, state | sort -`sum(age)` | head 5 | append [ source=accounts | stats count(age) by gender ]
-```
-{% include copy.html %}
-
-Expected output:
-
-| sum(age) | gender | state | count(age) |
-| --- | --- | --- | --- |
-| 36 | M | TN | null |
-| 33 | M | MD | null |
-| 32 | M | IL | null |
-| 28 | F | VA | null |
-| null | F | null | 1 |
-| null | M | null | 3 |
-
-
-## Example 2: Append rows with merged column names
-
-The following example appends rows from "sum by gender" to "sum by gender, state" with a merged column for the shared field name and type.
-
-```sql
-source=accounts | stats sum(age) as sum by gender, state | sort -sum | head 5 | append [ source=accounts | stats sum(age) as sum by gender ]
-```
-{% include copy.html %}
-
-Expected output:
-
-| sum | gender | state |
-| --- | --- | --- |
-| 36 | M | TN |
-| 33 | M | MD |
-| 32 | M | IL |
-| 28 | F | VA |
-| 28 | F | null |
-| 101 | M | null |
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/appendpipe.md b/_sql-and-ppl/ppl/cmd/appendpipe.md
deleted file mode 100644
index 8893be0ee2f..00000000000
--- a/_sql-and-ppl/ppl/cmd/appendpipe.md
+++ /dev/null
@@ -1,74 +0,0 @@
----
-layout: default
-title: "appendpipe"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 6
----
-# appendpipe
-
-
-The `appendpipe` command appends the result of the subpipeline to the search results. Unlike a subsearch, the subpipeline is not run first; it runs when the search reaches the `appendpipe` command.
-The command aligns columns with the same field names and types. For columns that differ between the main search and the subpipeline, NULL values fill the respective rows.
-
-## Syntax
-
-Use the following syntax:
-
-`appendpipe [<subpipeline>]`
-* `subpipeline`: mandatory. A list of commands that are applied to the search results from the commands that occur in the search before the `appendpipe` command.
-
-
-## Example 1: Append rows from a total count to existing search results
-
-This example appends per-gender totals of `part` to the "sum by gender, state" results. Because the subpipeline outputs a new field name (`total`), a new column is added and NULL values fill the rows that do not contain it.
-
-```sql
-source=accounts
-| stats sum(age) as part by gender, state
-| sort -part
-| head 5
-| appendpipe [ stats sum(part) as total by gender ]
-```
-{% include copy.html %}
-
-Expected output:
-
-| part | gender | state | total |
-| --- | --- | --- | --- |
-| 36 | M | TN | null |
-| 33 | M | MD | null |
-| 32 | M | IL | null |
-| 28 | F | VA | null |
-| null | F | null | 28 |
-| null | M | null | 101 |
-
-
-## Example 2: Append rows with merged column names
-
-This example appends per-gender totals to the "sum by gender, state" results. Because the subpipeline reuses the existing field name `total`, the appended rows are merged into that column.
-
-```sql
-source=accounts
-| stats sum(age) as total by gender, state
-| sort -total
-| head 5
-| appendpipe [ stats sum(total) as total by gender ]
-```
-{% include copy.html %}
-
-Expected output:
-
-| total | gender | state |
-| --- | --- | --- |
-| 36 | M | TN |
-| 33 | M | MD |
-| 32 | M | IL |
-| 28 | F | VA |
-| 28 | F | null |
-| 101 | M | null |
-
-
-## Limitations
-
-* **Schema compatibility**: As with the `append` command, when fields with the same name exist between the main search and the subpipeline but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with `eval` or using `fields` to select non-conflicting columns).
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/chart.md b/_sql-and-ppl/ppl/cmd/chart.md
deleted file mode 100644
index 4b435959afb..00000000000
--- a/_sql-and-ppl/ppl/cmd/chart.md
+++ /dev/null
@@ -1,190 +0,0 @@
----
-layout: default
-title: "chart"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 8
----
-# chart
-
-
-The `chart` command transforms search results by applying a statistical aggregation function and optionally grouping the data by one or two fields. The results are suitable for visualization as a two-dimensional chart when grouping by two fields, where unique values in the second group key can be pivoted to column names.
-
-## Syntax
-
-Use the following syntax:
-
-`chart [limit=(top|bottom)<int>] [useother=<bool>] [usenull=<bool>] [nullstr=<string>] [otherstr=<string>] <aggregation_function> [by <field> [, <field>]] | [over <field>] [by <field>]`
-* `limit`: optional. Specifies the number of categories to display when using column split. Each unique value in the column split field represents a category. **Default:** top10.
- * Syntax: `limit=(top|bottom)<int>` or `limit=<int>` (defaults to top)
- * When `limit=K` is set, the top or bottom K categories from the column split field are retained; the remaining categories are grouped into an "OTHER" category if `useother` is not set to false.
- * Set limit to 0 to show all categories without any limit.
- * Use `limit=topK` or `limit=bottomK` to specify whether to retain the top or bottom K column categories. The ranking is based on the sum of aggregated values for each column category. For example, `chart limit=top3 count() by region, product` keeps the 3 products with the highest total counts across all regions. If not specified, top is used by default.
- * Only applies when a column split is present (i.e. when grouping by two fields or when the `over ... by ...` form is used).
-* `useother`: optional. Controls whether to create an "OTHER" category for categories beyond the limit. **Default:** true
- * When set to false, only the top/bottom N categories (based on limit) are shown without an "OTHER" category.
- * When set to true, categories beyond the limit are grouped into an "OTHER" category.
- * Only applies when using column split and when there are more categories than the limit.
-* `usenull`: optional. Controls whether to group events without a column split (i.e. whose column split is null) into a separate "NULL" category. **Default:** true
- * `usenull` applies only to the column split.
- * The row split must be a non-null value. Documents with null values in the row split are ignored.
- * When `usenull=false`, events with a null column split are excluded from results.
- * When `usenull=true`, events with a null column split are grouped into a separate "NULL" category.
-* `nullstr`: optional. Specifies the category name for rows that do not contain the column split value. **Default:** "NULL"
- * Only applies when `usenull` is set to true.
-* `otherstr`: optional. Specifies the category name for the "OTHER" category. **Default:** "OTHER"
- * Only applies when `useother` is set to true and there are values beyond the limit.
-* `aggregation_function`: mandatory. The aggregation function to apply to the data.
- * Currently, only a single aggregation function is supported.
- * Available functions: aggregation functions supported by the stats command.
-* `by`: optional. Groups the results by either one field (row split) or two fields (row split and column split).
- * `limit`, `useother`, and `usenull` apply to the column split.
- * Results are returned as individual rows for each combination.
- * If not specified, the aggregation is performed across all documents.
-* over...by...: optional. Alternative syntax for grouping by multiple fields.
- * `over <field> by <field>` groups the results by both fields.
- * Using `over` alone on one field is equivalent to `by <field>`.
-
-
-## Notes
-
-* The fields generated by column splitting are converted to strings so that they are compatible with `nullstr` and `otherstr` and can be used as column names once pivoted.
-* Documents with null values in fields used by the aggregation function are excluded from aggregation. For example, in `chart avg(balance) over deptno by group`, documents where `balance` is null are excluded from the average calculation.
-* The aggregation metric appears as the last column in the result. Result columns are ordered as: [row-split] [column-split] [aggregation-metrics].
-
-
-## Example 1: Basic aggregation without grouping
-
-This example calculates the average balance across all accounts.
-
-```sql
-source=accounts
-| chart avg(balance)
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(balance) |
-| --- |
-| 20482.25 |
-
-
-## Example 2: Group by single field
-
-This example calculates the count of accounts grouped by gender.
-
-```sql
-source=accounts
-| chart count() by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | count() |
-| --- | --- |
-| F | 1 |
-| M | 3 |
-
-
-## Example 3: Using over and by for multiple field grouping
-
-The following example PPL query shows how to use `chart` to calculate the average balance grouped by both the gender and age fields. Note that the age column in the result is converted to string type.
-
-```sql
-source=accounts
-| chart avg(balance) over gender by age
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | age | avg(balance) |
-| --- | --- | --- |
-| F | 28 | 32838.0 |
-| M | 32 | 39225.0 |
-| M | 33 | 4180.0 |
-| M | 36 | 5686.0 |
-
-
-## Example 4: Using basic limit functionality
-
-This example limits the results to show only the top 1 age group. Note that the age column in the result is converted to string type.
-
-```sql
-source=accounts
-| chart limit=1 count() over gender by age
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | age | count() |
-| --- | --- | --- |
-| F | OTHER | 1 |
-| M | 33 | 1 |
-| M | OTHER | 2 |
-
-
-## Example 5: Using limit with other parameters
-
-The following example PPL query shows how to use `chart` with the limit, useother, and custom otherstr parameters.
-
-```sql
-source=accounts
-| chart limit=top1 useother=true otherstr='minor_gender' count() over state by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| state | gender | count() |
-| --- | --- | --- |
-| IL | M | 1 |
-| MD | M | 1 |
-| TN | M | 1 |
-| VA | minor_gender | 1 |
-
-
-## Example 6: Using null parameters
-
-The following example PPL query shows how to use `chart` with the usenull and custom nullstr parameters.
-
-```sql
-source=accounts
-| chart usenull=true nullstr='employer not specified' count() over firstname by employer
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | employer | count() |
-| --- | --- | --- |
-| Amber | Pyrami | 1 |
-| Dale | employer not specified | 1 |
-| Hattie | Netagy | 1 |
-| Nanette | Quility | 1 |
-
-
-## Example 7: Using chart command with span
-
-The following example PPL query demonstrates how to use `chart` with span for grouping age ranges.
-
-```sql
-source=accounts
-| chart max(balance) by age span=10, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| age | gender | max(balance) |
-| --- | --- | --- |
-| 20 | F | 32838 |
-| 30 | M | 39225 |
-
-
-## Limitations
-
-* Only a single aggregation function is supported per chart command.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/dedup.md b/_sql-and-ppl/ppl/cmd/dedup.md
deleted file mode 100644
index 4a95b2dd92c..00000000000
--- a/_sql-and-ppl/ppl/cmd/dedup.md
+++ /dev/null
@@ -1,128 +0,0 @@
----
-layout: default
-title: "dedup"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 9
----
-# dedup
-
-
-The `dedup` command removes duplicate documents defined by specified fields from the search result.
-
-## Syntax
-
-Use the following syntax:
-
-`dedup [int] <field-list> [keepempty=<bool>] [consecutive=<bool>]`
-* `int`: optional. The `dedup` command retains multiple events for each combination when you specify `<int>`. The number for `<int>` must be greater than 0. All other duplicates are removed from the results. **Default:** 1
-* `keepempty`: optional. If set to true, keeps documents in which any field in the field list has a NULL value or is MISSING. **Default:** false.
-* `consecutive`: optional. If set to true, removes only events with duplicate combinations of values that are consecutive. **Default:** false.
-* `field-list`: mandatory. The comma-delimited field list. At least one field is required.
-
-
-## Example 1: Dedup by one field
-
-The following example PPL query shows how to use `dedup` to remove duplicate documents based on the `gender` field:
-
-```sql
-source=accounts
-| dedup gender
-| fields account_number, gender
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | gender |
-| --- | --- |
-| 1 | M |
-| 13 | F |
-
-
-## Example 2: Keep two duplicate documents
-
-The following example PPL query shows how to use `dedup` to remove duplicate documents based on the `gender` field while keeping two duplicates:
-
-```sql
-source=accounts
-| dedup 2 gender
-| fields account_number, gender
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | gender |
-| --- | --- |
-| 1 | M |
-| 6 | M |
-| 13 | F |
-
-
-## Example 3: Keep or ignore empty fields
-
-The following example PPL query shows how to use `dedup` to remove duplicate documents while keeping documents with null values in the specified field:
-
-```sql
-source=accounts
-| dedup email keepempty=true
-| fields account_number, email
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | email |
-| --- | --- |
-| 1 | amberduke@pyrami.com |
-| 6 | hattiebond@netagy.com |
-| 13 | null |
-| 18 | daleadams@boink.com |
-
-The following example PPL query shows how to use `dedup` to remove duplicate documents while ignoring documents with empty values in the specified field (the default behavior):
-
-```sql
-source=accounts
-| dedup email
-| fields account_number, email
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | email |
-| --- | --- |
-| 1 | amberduke@pyrami.com |
-| 6 | hattiebond@netagy.com |
-| 18 | daleadams@boink.com |
-
-
-## Example 4: Dedup consecutive documents
-
-The following example PPL query shows how to use `dedup` to remove duplicate consecutive documents:
-
-```sql
-source=accounts
-| dedup gender consecutive=true
-| fields account_number, gender
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | gender |
-| --- | --- |
-| 1 | M |
-| 13 | F |
-| 18 | M |
-
-
-## Limitations
-
-The `dedup` command with `consecutive=true` only works when `plugins.calcite.enabled=false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/eval.md b/_sql-and-ppl/ppl/cmd/eval.md
deleted file mode 100644
index 8addfb25b5d..00000000000
--- a/_sql-and-ppl/ppl/cmd/eval.md
+++ /dev/null
@@ -1,127 +0,0 @@
----
-layout: default
-title: "eval"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 11
----
-# eval
-
-
-The `eval` command evaluates the expression and appends the result to the search result.
-
-## Syntax
-
-Use the following syntax:
-
-`eval <field> = <expression> ["," <field> = <expression>]...`
-* `field`: mandatory. If the field name does not exist, a new field is added. If the field name already exists, it is overridden.
-* `expression`: mandatory. Any expression supported by the system.
-
-
-## Example 1: Create a new field
-
-The following example PPL query shows how to use `eval` to create a new field for each document. In this example, the new field is `doubleAge`.
-
-```sql
-source=accounts
-| eval doubleAge = age * 2
-| fields age, doubleAge
-```
-{% include copy.html %}
-
-Expected output:
-
-| age | doubleAge |
-| --- | --- |
-| 32 | 64 |
-| 36 | 72 |
-| 28 | 56 |
-| 33 | 66 |
-
-
-## Example 2: Override an existing field
-
-The following example PPL query shows how to use `eval` to override an existing field. 
In this example, the existing field `age` is overridden by the `age` field plus 1. - -```sql -source=accounts -| eval age = age + 1 -| fields age -``` -{% include copy.html %} - -Expected output: - -| age | -| --- | -| 33 | -| 37 | -| 29 | -| 34 | - - -## Example 3: Create a new field with field defined in eval - -The following example PPL query shows how to use `eval` to create a new field based on the fields defined in the `eval` expression. In this example, the new field `ddAge` is the evaluation result of the `doubleAge` field multiplied by 2. `doubleAge` is defined in the `eval` command. - -```sql -source=accounts -| eval doubleAge = age * 2, ddAge = doubleAge * 2 -| fields age, doubleAge, ddAge -``` -{% include copy.html %} - -Expected output: - -| age | doubleAge | ddAge | -| --- | --- | --- | -| 32 | 64 | 128 | -| 36 | 72 | 144 | -| 28 | 56 | 112 | -| 33 | 66 | 132 | - - -## Example 4: String concatenation - -The following example PPL query shows using the `+` operator for string concatenation. You can concatenate string literals and field values. - -```sql -source=accounts -| eval greeting = 'Hello ' + firstname -| fields firstname, greeting -``` -{% include copy.html %} - -Expected output: - -| firstname | greeting | -| --- | --- | -| Amber | Hello Amber | -| Hattie | Hello Hattie | -| Nanette | Hello Nanette | -| Dale | Hello Dale | - - -## Example 5: Multiple string concatenation with type casting - -The following example PPL query shows multiple concatenations with type casting from numeric to string. - -```sql -source=accounts | eval full_info = 'Name: ' + firstname + ', Age: ' + CAST(age AS STRING) | fields firstname, age, full_info -``` -{% include copy.html %} - -Expected output: - -| firstname | age | full_info | -| --- | --- | --- | -| Amber | 32 | Name: Amber, Age: 32 | -| Hattie | 36 | Name: Hattie, Age: 36 | -| Nanette | 28 | Name: Nanette, Age: 28 | -| Dale | 33 | Name: Dale, Age: 33 | - - -## Limitations - -The `eval` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/eventstats.md b/_sql-and-ppl/ppl/cmd/eventstats.md deleted file mode 100644 index 8429c344b6f..00000000000 --- a/_sql-and-ppl/ppl/cmd/eventstats.md +++ /dev/null @@ -1,164 +0,0 @@ ---- -layout: default -title: "eventstats" -parent: "Commands" -grand_parent: "PPL" -nav_order: 12 ---- -# eventstats - - -The `eventstats` command enriches your event data with calculated summary statistics. It operates by analyzing specified fields within your events, computing various statistical measures, and then appending these results as new fields to each original event. -Key aspects of `eventstats`: -1. It performs calculations across the entire search results or within defined groups. -2. The original events remain intact, with new fields added to contain the statistical results. -3. The command is particularly useful for comparative analysis, identifying outliers, or providing additional context to individual events. - -Difference between `stats` and `eventstats` -The `stats` and `eventstats` commands are both used for calculating statistics, but they have some key differences in how they operate and what they produce: -* Output Format - * `stats`: Produces a summary table with only the calculated statistics. - * `eventstats`: Adds the calculated statistics as new fields to the existing events, preserving the original data. 
-* Event Retention
- * `stats`: Reduces the search results to only the statistical summary, discarding individual events.
- * `eventstats`: Retains all original events and adds new fields with the calculated statistics.
-* Use Cases
- * `stats`: Best for creating summary reports or dashboards. Often used as a final command to summarize results.
- * `eventstats`: Useful when you need to enrich events with statistical context for further analysis or filtering. It can be used mid-search to add statistics that can be used in subsequent commands.
-
-
-## Syntax
-
-Use the following syntax:
-
-`eventstats [bucket_nullable=bool] <function>... [by-clause]`
-* `function`: mandatory. An aggregation function or window function.
-* `bucket_nullable`: optional. Controls whether the eventstats command considers null buckets as a valid group in group-by aggregations. When set to `false`, it does not treat null group-by values as a distinct group during aggregation. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
- * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true`
- * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false`
-* `by-clause`: optional. Groups results by specified fields or expressions. Syntax: `by [span-expression,] [field,]...` **Default:** aggregation over the entire search results.
-* `span-expression`: optional, at most one. Splits a field into buckets by intervals. Syntax: `span(field_expr, interval_expr)`. For example, `span(age, 10)` creates 10-year age buckets, and `span(timestamp, 1h)` creates hourly buckets.
- * Available time units:
- * millisecond (ms)
- * second (s)
- * minute (m, case sensitive)
- * hour (h)
- * day (d)
- * week (w)
- * month (M, case sensitive)
- * quarter (q)
- * year (y)
-
-## Aggregation functions
-
-The eventstats command supports the following aggregation functions:
-* `COUNT`: Count of values
-* `SUM`: Sum of numeric values
-* `AVG`: Average of numeric values
-* `MAX`: Maximum value
-* `MIN`: Minimum value
-* `VAR_SAMP`: Sample variance
-* `VAR_POP`: Population variance
-* `STDDEV_SAMP`: Sample standard deviation
-* `STDDEV_POP`: Population standard deviation
-* `DISTINCT_COUNT`/`DC`: Distinct count of values
-* `EARLIEST`: Earliest value by timestamp
-* `LATEST`: Latest value by timestamp
-
-For detailed documentation of each function, see [Aggregation Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/aggregations/).
-
-## Usage
-
-Eventstats
-
-```sql
-source = table | eventstats avg(a)
-source = table | where a < 50 | eventstats count(c)
-source = table | eventstats min(c), max(c) by b
-source = table | eventstats count(c) as count_by by b | where count_by > 1000
-source = table | eventstats dc(field) as distinct_count
-source = table | eventstats distinct_count(category) by region
-```
-{% include copy.html %}
-
-
-## Example 1: Calculate the average, sum and count of a field by group
-
-The following example PPL query shows how to use `eventstats` to calculate the average age, sum of age, and count of events for all accounts grouped by gender.
-
-```sql
-source=accounts
-| fields account_number, gender, age
-| eventstats avg(age), sum(age), count() by gender
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | gender | age | avg(age) | sum(age) | count() |
-| --- | --- | --- | --- | --- | --- |
-| 1 | M | 32 | 33.666666666666664 | 101 | 3 |
-| 6 | M | 36 | 33.666666666666664 | 101 | 3 |
-| 13 | F | 28 | 28.0 | 28 | 1 |
-| 18 | M | 33 | 33.666666666666664 | 101 | 3 |
-
-
-## Example 2: Calculate the count by gender and span
-
-The following example PPL query shows how to use `eventstats` to count events by age intervals of 5 years, grouped by gender.
-
-```sql
-source=accounts
-| fields account_number, gender, age
-| eventstats count() as cnt by span(age, 5) as age_span, gender
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | gender | age | cnt |
-| --- | --- | --- | --- |
-| 1 | M | 32 | 2 |
-| 6 | M | 36 | 1 |
-| 13 | F | 28 | 1 |
-| 18 | M | 33 | 2 |
-
-
-## Example 3: Null buckets handling
-
-The following example PPL queries show how `bucket_nullable` controls whether documents with a null `employer` value form their own group.
-
-```sql
-source=accounts
-| eventstats bucket_nullable=false count() as cnt by employer
-| fields account_number, firstname, employer, cnt
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | employer | cnt |
-| --- | --- | --- | --- |
-| 1 | Amber | Pyrami | 1 |
-| 6 | Hattie | Netagy | 1 |
-| 13 | Nanette | Quility | 1 |
-| 18 | Dale | null | null |
-
-```sql
-source=accounts
-| eventstats bucket_nullable=true count() as cnt by employer
-| fields account_number, firstname, employer, cnt
-| sort account_number
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | employer | cnt |
-| --- | --- | --- | --- |
-| 1 | Amber | Pyrami | 1 |
-| 6 | Hattie | Netagy | 1 |
-| 13 | Nanette | Quility | 1 |
-| 18 | Dale | null | 1 |
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/expand.md b/_sql-and-ppl/ppl/cmd/expand.md
deleted file mode 100644
index b87881c172c..00000000000
--- a/_sql-and-ppl/ppl/cmd/expand.md
+++ /dev/null
@@ -1,57 +0,0 @@
----
-layout: default
-title: "expand"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 13
----
-# expand
-
-
-The `expand` command transforms a single document with a nested array field into multiple documents, each containing one element from the array. All other fields in the original document are duplicated across the resulting documents.
-Key aspects of `expand`:
-* It generates one row per element in the specified array field.
-* The specified array field is converted into individual rows.
-* If an alias is provided, the expanded values appear under the alias instead of the original field name.
-* If the specified field is an empty array, the row is retained with the expanded field set to null.
-
-
-## Syntax
-
-Use the following syntax:
-
-`expand <field> [as <alias>]`
-* `field`: mandatory. The field to be expanded (exploded). Currently only nested arrays are supported.
-* `alias`: optional. The name to use instead of the original field name.
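-
-As a minimal sketch, omitting the alias keeps the expanded values under the original field name (`orders` and its nested array field `items` are hypothetical here):
-
-```sql
-source=orders
-| expand items
-```
-{% include copy.html %}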
-
-
-## Example 1: Expand address field with an alias
-
-Given a dataset `migration` with the following data:
-
-```text
-{"name":"abbas","age":24,"address":[{"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}}]}
-{"name":"chen","age":32,"address":[{"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}},{"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}}]}
-
-```
-
-The following query expands the `address` field and renames it to `addr`:
-
-```sql
-source=migration
-| expand address as addr
-```
-{% include copy.html %}
-
-Expected output:
-
-| name | age | addr |
-| --- | --- | --- |
-| abbas | 24 | {"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}} |
-| chen | 32 | {"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}} |
-| chen | 32 | {"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}} |
-
-
-## Limitations
-
-* The `expand` command currently only supports nested arrays. Primitive fields storing arrays are not supported. For example, a string field storing an array of strings cannot be expanded with the current implementation.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/fillnull.md b/_sql-and-ppl/ppl/cmd/fillnull.md
deleted file mode 100644
index f5432897cc0..00000000000
--- a/_sql-and-ppl/ppl/cmd/fillnull.md
+++ /dev/null
@@ -1,169 +0,0 @@
----
-layout: default
-title: "fillnull"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 16
----
-# fillnull
-
-
-The `fillnull` command fills null values with the provided value in one or more fields in the search results.
-
-## Syntax
-
-Use one of the following syntax options:
-
-`fillnull with <replacement> [in <field-list>]`
-`fillnull using <field> = <replacement> [, <field> = <replacement>]`
-`fillnull value=<replacement> [<field-list>]`
-* `replacement`: mandatory. The value used to replace null values.
-* `field-list`: optional. List of fields to apply the replacement to. It can be comma-delimited (with `with` or `using` syntax) or space-delimited (with `value=` syntax). **Default:** all fields.
-* `field`: mandatory when using `using` syntax. Individual field name to assign a specific replacement value.
-* **Syntax variations**
- * `with <replacement> in <field-list>` - Apply the same value to the specified fields
- * `using <field> = <replacement>, ...` - Apply different values to different fields
- * `value=<replacement> [<field-list>]` - Alternative syntax with an optional space-delimited field list
-
-
-## Example 1: Replace null values with a specified value on one field
-
-The following example PPL query shows how to use `fillnull` to replace null values in the `email` field with an empty string (`''`).
-
-```sql
-source=accounts
-| fields email, employer
-| fillnull with '' in email
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | employer |
-| --- | --- |
-| amberduke@pyrami.com | Pyrami |
-| hattiebond@netagy.com | Netagy |
-| | Quility |
-| daleadams@boink.com | null |
-
-
-## Example 2: Replace null values with a specified value on multiple fields
-
-The following example PPL query shows how to use `fillnull` to replace null values in both the `email` and `employer` fields with the same empty-string replacement value (`''`).
- -```sql -source=accounts -| fields email, employer -| fillnull with '' in email, employer -``` -{% include copy.html %} - -Expected output: - -| email | employer | -| --- | --- | -| amberduke@pyrami.com | Pyrami | -| hattiebond@netagy.com | Netagy | -| | Quility | -| daleadams@boink.com | | - - -## Example 3: Replace null values with a specified value on all fields - -The following example PPL query shows how to use `fillnull` to replace null values in all fields when no field list is specified. - -```sql -source=accounts -| fields email, employer -| fillnull with '' -``` -{% include copy.html %} - -Expected output: - -| email | employer | -| --- | --- | -| amberduke@pyrami.com | Pyrami | -| hattiebond@netagy.com | Netagy | -| | Quility | -| daleadams@boink.com | | - - -## Example 4: Replace null values with multiple specified values on multiple fields - -The following example PPL query shows how to use `fillnull` with different replacement values for different fields using the 'using' syntax. - -```sql -source=accounts -| fields email, employer -| fillnull using email = '', employer = '' -``` -{% include copy.html %} - -Expected output: - -| email | employer | -| --- | --- | -| amberduke@pyrami.com | Pyrami | -| hattiebond@netagy.com | Netagy | -| | Quility | -| daleadams@boink.com | | - - -## Example 5: Replace null with specified value on specific fields (value= syntax) - -The following example PPL query shows how to use `fillnull` with the alternative 'value=' syntax to replace null values in specific fields. - -```sql -source=accounts -| fields email, employer -| fillnull value="" email employer -``` -{% include copy.html %} - -Expected output: - -| email | employer | -| --- | --- | -| amberduke@pyrami.com | Pyrami | -| hattiebond@netagy.com | Netagy | -| | Quility | -| daleadams@boink.com | | - - -## Example 6: Replace null with specified value on all fields (value= syntax) - -When no field list is specified, the replacement applies to all fields in the result. - -```sql -source=accounts -| fields email, employer -| fillnull value='' -``` -{% include copy.html %} - -Expected output: - -| email | employer | -| --- | --- | -| amberduke@pyrami.com | Pyrami | -| hattiebond@netagy.com | Netagy | -| | Quility | -| daleadams@boink.com | | - - -## Limitations - -* The `fillnull` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node. -* When applying the same value to all fields without specifying field names, all fields must be the same type. For mixed types, use separate fillnull commands or explicitly specify fields. -* The replacement value type must match ALL field types in the field list. When applying the same value to multiple fields, all fields must be the same type (all strings or all numeric). - - **Example:** - -```sql - # This FAILS - same value for mixed-type fields - source=accounts | fillnull value=0 firstname, age - # ERROR: fillnull failed: replacement value type INTEGER is not compatible with field 'firstname' (type: VARCHAR). The replacement value type must match the field type. 
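- # A possible workaround (sketch): fill each field type separately,
- # e.g. a string default for firstname and a numeric default for age
- source=accounts | fillnull value='unknown' firstname | fillnull value=0 age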
-```
-{% include copy.html %}
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/flatten.md b/_sql-and-ppl/ppl/cmd/flatten.md
deleted file mode 100644
index fd4e8a9f579..00000000000
--- a/_sql-and-ppl/ppl/cmd/flatten.md
+++ /dev/null
@@ -1,101 +0,0 @@
----
-layout: default
-title: "flatten"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 17
----
-# flatten
-
-
-The `flatten` command flattens a struct or an object field into separate fields in a document.
-
-The flattened fields will be ordered **lexicographically** by their original key names in the struct. For example, if the struct has keys `b`, `c` and `Z`, the flattened fields will be ordered as `Z`, `b`, `c`.
-Note that `flatten` should not be applied to arrays. Use the `expand` command to expand an array field into multiple rows instead. However, since an array can be stored in a non-array field in OpenSearch, when flattening a field storing a nested array, only the first element of the array will be flattened.
-
-## Syntax
-
-Use the following syntax:
-
-`flatten <field> [as (<alias-list>)]`
-* `field`: mandatory. The field to be flattened. Only object and nested fields are supported.
-* `alias-list`: optional. The names to use instead of the original key names. Names are separated by commas. It is advised to put the alias-list in parentheses if there is more than one alias. The length must match the number of keys in the struct field. The provided alias names **must** follow the lexicographical order of the corresponding original keys in the struct.
-
-
-## Example: Flatten an object field with aliases
-
-The following example PPL query shows how to use `flatten` to flatten a message object field and use aliases to rename the flattened fields.
-Given the following index `my-index`:
-
-```text
- {"message":{"info":"a","author":"e","dayOfWeek":1},"myNum":1}
- {"message":{"info":"b","author":"f","dayOfWeek":2},"myNum":2}
-
-```
-
-with the following mapping:
-
-```json
- {
-   "mappings": {
-     "properties": {
-       "message": {
-         "type": "object",
-         "properties": {
-           "info": {
-             "type": "keyword",
-             "index": "true"
-           },
-           "author": {
-             "type": "keyword",
-             "fields": {
-               "keyword": {
-                 "type": "keyword",
-                 "ignore_above": 256
-               }
-             },
-             "index": "true"
-           },
-           "dayOfWeek": {
-             "type": "long"
-           }
-         }
-       },
-       "myNum": {
-         "type": "long"
-       }
-     }
-   }
- }
-
-
-```
-
-The following query flattens the `message` field and renames the keys to `creator, dow, info`:
-
-```sql
-source=my-index
-| flatten message as (creator, dow, info)
-```
-{% include copy.html %}
-
-Expected output:
-
-| message | myNum | creator | dow | info |
-| --- | --- | --- | --- | --- |
-| {"info":"a","author":"e","dayOfWeek":1} | 1 | e | 1 | a |
-| {"info":"b","author":"f","dayOfWeek":2} | 2 | f | 2 | b |
-
-
-## Limitations
-
-* The `flatten` command may not work as expected when the fields to be flattened are not visible in the current result set. For example, in the query `source=my-index | fields message | flatten message`, the `flatten message` command does not work because flattened fields such as `message.info` and `message.author` are no longer visible after the `fields message` command. As an alternative, use `source=my-index | flatten message`.
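-
-A minimal sketch of the working pattern on the same `my-index` data: flatten first, then trim columns (this assumes the flattened keys keep their original names when no alias list is given):
-
-```sql
-source=my-index
-| flatten message
-| fields myNum, info, author
-```
-{% include copy.html %}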
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/head.md b/_sql-and-ppl/ppl/cmd/head.md
deleted file mode 100644
index 7cb8a442487..00000000000
--- a/_sql-and-ppl/ppl/cmd/head.md
+++ /dev/null
@@ -1,85 +0,0 @@
----
-layout: default
-title: "head"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 19
----
-# head
-
-
-The `head` command returns the first N results from a search result.
-
-## Syntax
-
-Use the following syntax:
-
-`head [<size>] [from <offset>]`
-* `size`: optional integer. The number of results you want to return. **Default:** 10
-* `offset`: optional integer after `from`. The number of results to skip. **Default:** 0
-
-
-## Example 1: Get the first 10 results
-
-The following example PPL query shows how to use `head` to return the first 10 search results:
-
-```sql
-source=accounts
-| fields firstname, age
-| head
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | age |
-| --- | --- |
-| Amber | 32 |
-| Hattie | 36 |
-| Nanette | 28 |
-| Dale | 33 |
-
-
-## Example 2: Get the first N results
-
-The following example PPL query shows how to use `head` to get a specified number of search results. In this example, N is equal to 3:
-
-```sql
-source=accounts
-| fields firstname, age
-| head 3
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | age |
-| --- | --- |
-| Amber | 32 |
-| Hattie | 36 |
-| Nanette | 28 |
-
-
-## Example 3: Get the first N results after offset M
-
-The following example PPL query shows how to get the first 3 results after an offset of 1 from the `accounts` index.
-
-```sql
-source=accounts
-| fields firstname, age
-| head 3 from 1
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | age |
-| --- | --- |
-| Hattie | 36 |
-| Nanette | 28 |
-| Dale | 33 |
-
-
-## Limitations
-
-The `head` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/index.md b/_sql-and-ppl/ppl/cmd/index.md
deleted file mode 100644
index aaa3fb7c88b..00000000000
--- a/_sql-and-ppl/ppl/cmd/index.md
+++ /dev/null
@@ -1,15 +0,0 @@
----
-layout: default
-title: "Commands"
-parent: "PPL"
-nav_order: 20
-has_children: true
-redirect_from:
- - /search-plugins/sql/ppl/functions/
- - /observability-plugin/ppl/commands/
- - /search-plugins/ppl/commands/
- - /search-plugins/ppl/functions/
- - /sql-and-ppl/ppl/functions/
----
-# Commands
-PPL supports most common [SQL functions]({{site.url}}{{site.baseurl}}/search-plugins/sql/functions/), including [relevance search]({{site.url}}{{site.baseurl}}/search-plugins/sql/full-text/), but also introduces several more functions (called _commands_), which are available in PPL only.
diff --git a/_sql-and-ppl/ppl/cmd/join.md b/_sql-and-ppl/ppl/cmd/join.md
deleted file mode 100644
index d285620039d..00000000000
--- a/_sql-and-ppl/ppl/cmd/join.md
+++ /dev/null
@@ -1,216 +0,0 @@
----
-layout: default
-title: "join"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 21
----
-# join
-
-
-The `join` command combines two datasets. The left side can be an index or the results of piped commands; the right side can be either an index or a subsearch.
-
-## Syntax
-
-The `join` command supports basic and extended syntax options.
-
-### Basic syntax
-
-`[joinType] join [leftAlias] [rightAlias] (on | where) <joinCriteria> <right-dataset>`
-* `joinType`: optional. The type of join to perform. Options: `left`, `semi`, `anti`, and performance-sensitive types `right`, `full`, `cross`. **Default:** `inner`.
-* `leftAlias`: optional. The subsearch alias to use with the left join side, to avoid ambiguous naming. Pattern: `left = <leftAlias>`
-* `rightAlias`: optional. The subsearch alias to use with the right join side, to avoid ambiguous naming. Pattern: `right = <rightAlias>`
-* `joinCriteria`: mandatory. Any comparison expression. Must follow the `on` or `where` keyword.
-* `right-dataset`: mandatory. The right dataset can be either an `index` or a `subsearch`, with or without an alias.
-
-### Extended syntax
-
-`join [type=<joinType>] [overwrite=<bool>] [max=n] (<join-field-list> | [leftAlias] [rightAlias] (on | where) <joinCriteria>) <right-dataset>`
-* `type`: optional. Join type using extended syntax. Options: `left`, `outer` (alias of `left`), `semi`, `anti`, and performance-sensitive types `right`, `full`, `cross`. **Default:** `inner`.
-* `overwrite`: optional boolean. Only works with `join-field-list`. Specifies whether duplicate-named fields from the right dataset should replace the corresponding fields in the main search results. **Default:** `true`.
-* `max`: optional integer. Controls how many subsearch results can be joined against each row in the main search. **Default:** 0 (unlimited).
-* `join-field-list`: optional. The fields used to build the join criteria. The join field list must exist on both sides. If not specified, all fields common to both sides are used as join keys.
-* `leftAlias`: optional. Same as basic syntax when used with extended syntax.
-* `rightAlias`: optional. Same as basic syntax when used with extended syntax.
-* `joinCriteria`: mandatory. Same as basic syntax when used with extended syntax.
-* `right-dataset`: mandatory. Same as basic syntax.
-
-
-## Configuration
-
-The following settings configure the `join` command behavior.
-
-### plugins.ppl.join.subsearch_maxout
-
-This setting configures the maximum number of rows from the subsearch to join against. The default value is `50000`. A value of `0` indicates that the restriction is unlimited.
-Change the join.subsearch_maxout to 5000 - -```bash -curl -sS -H 'Content-Type: application/json' \ --X PUT localhost:9200/_plugins/_query/settings \ --d '{"persistent" : {"plugins.ppl.join.subsearch_maxout" : "5000"}}' -``` -{% include copy.html %} - -```json -{ - "acknowledged": true, - "persistent": { - "plugins": { - "ppl": { - "join": { - "subsearch_maxout": "5000" - } - } - } - }, - "transient": {} -} -``` - - -## Usage - -Basic join syntax: - -``` -source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | inner join left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | cross join left = l right = r on 1=1 table2 -source = table1 | left semi join left = l right = r on l.a = r.a table2 -source = table1 | left anti join left = l right = r on l.a = r.a table2 -source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] -source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c -source = table1 | inner join on a = c table2 | fields a, b, c, d -source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a -source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a -source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a -``` - -Extended syntax with options: - -``` -source = table1 | join type=outer left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | join type=left left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | join type=inner max=1 left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c -source = table1 | join a table2 | fields a, b, c -source = table1 | join a, b table2 | fields a, b, c -source = table1 | join type=outer a b table2 | fields a, b, c -source = table1 | join type=inner max=1 a, b table2 | fields a, b, c -source = table1 | join type=left overwrite=false max=0 a, b [source=table2 | rename d as b] | fields a, b, c -``` - - -## Example 1: Two indexes join - -The following example PPL query shows how to use `join` to join two indexes using the basic join syntax. - -```sql -source = state_country -| inner join left=a right=b ON a.name = b.name occupation -| stats avg(salary) by span(age, 10) as age_span, b.country -``` -{% include copy.html %} - -Expected output: - -| avg(salary) | age_span | b.country | -| --- | --- | --- | -| 120000.0 | 40 | USA | -| 105000.0 | 20 | Canada | -| 0.0 | 40 | Canada | -| 70000.0 | 30 | USA | -| 100000.0 | 70 | England | - - -## Example 2: Join with subsearch - -The following example PPL query shows how to use `join` to join with a subsearch using the basic join syntax. 
-
-```sql
-source = state_country as a
-| where country = 'USA' OR country = 'England'
-| left join ON a.name = b.name [ source = occupation
-| where salary > 0
-| fields name, country, salary
-| sort salary
-| head 3 ] as b
-| stats avg(salary) by span(age, 10) as age_span, b.country
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(salary) | age_span | b.country |
-| --- | --- | --- |
-| null | 40 | null |
-| 70000.0 | 30 | USA |
-| 100000.0 | 70 | England |
-
-
-## Example 3: Join with field list
-
-The following example PPL query shows how to use `join` with the extended syntax and a field list.
-
-```sql
-source = state_country
-| where country = 'USA' OR country = 'England'
-| join type=left overwrite=true name [ source = occupation
-| where salary > 0
-| fields name, country, salary
-| sort salary
-| head 3 ]
-| stats avg(salary) by span(age, 10) as age_span, country
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(salary) | age_span | country |
-| --- | --- | --- |
-| null | 40 | null |
-| 70000.0 | 30 | USA |
-| 100000.0 | 70 | England |
-
-
-## Example 4: Join with options
-
-The following example PPL query shows how to use `join` with the extended syntax and additional options.
-
-```sql
-source = state_country
-| join type=inner overwrite=false max=1 name occupation
-| stats avg(salary) by span(age, 10) as age_span, country
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(salary) | age_span | country |
-| --- | --- | --- |
-| 120000.0 | 40 | USA |
-| 100000.0 | 70 | USA |
-| 105000.0 | 20 | Canada |
-| 70000.0 | 30 | USA |
-
-
-## Limitations
-
-For the basic syntax, fields in the left and right outputs can have the same name. For example, with the join criteria
-`ON t1.id = t2.id`, the name `id` is ambiguous in the output. To avoid this ambiguity, the ambiguous
-fields in the output are renamed to `<alias>.id`, or to `<tableName>.id` if no alias exists.
-
-Assuming `table1` and `table2` contain only the field `id`, the following PPL queries produce these outputs:
-
-| Query | Output |
-| --- | --- |
-| source=table1 \| join left=t1 right=t2 on t1.id=t2.id table2 \| eval a = 1 | t1.id, t2.id, a |
-| source=table1 \| join on table1.id=table2.id table2 \| eval a = 1 | table1.id, table2.id, a |
-| source=table1 \| join on table1.id=t2.id table2 as t2 \| eval a = 1 | table1.id, t2.id, a |
-| source=table1 \| join right=tt on table1.id=t2.id [ source=table2 as t2 \| eval b = id ] \| eval a = 1 | table1.id, tt.id, tt.b, a |
-
-For the extended syntax (join with a field list), duplicate-named fields in the output are deduplicated, and the surviving fields are determined by the value of the `overwrite` option.
-Join types `inner`, `left`, `outer` (alias of `left`), `semi` and `anti` are supported by default. `right`, `full`, and `cross` are performance-sensitive join types that are disabled by default. Set the config `plugins.calcite.all_join_types.allowed = true` to enable them.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/kmeans.md b/_sql-and-ppl/ppl/cmd/kmeans.md
deleted file mode 100644
index 2ac063e2188..00000000000
--- a/_sql-and-ppl/ppl/cmd/kmeans.md
+++ /dev/null
@@ -1,45 +0,0 @@
----
-layout: default
-title: "kmeans"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 22
----
-# kmeans (deprecated by ml command)
-
-
-The `kmeans` command applies the kmeans algorithm in the ml-commons plugin on the search results returned by a PPL command.
-
-## Syntax
-
-Use the following syntax:
-
-`kmeans [centroids=<int>] [iterations=<int>] [distance_type=<string>]`
-* `centroids`: optional. The number of clusters you want to group your data points into. **Default:** 2.
-* `iterations`: optional. Number of iterations. **Default:** 10.
-* `distance_type`: optional. The distance type can be COSINE, L1, or EUCLIDEAN. **Default:** EUCLIDEAN.
-
-
-## Example: Clustering of iris dataset
-
-The following example PPL query shows how to use `kmeans` to classify three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals.
-
-```sql
-source=iris_data
-| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm
-| kmeans centroids=3
-```
-{% include copy.html %}
-
-Expected output:
-
-| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID |
-| --- | --- | --- | --- | --- |
-| 5.1 | 3.5 | 1.4 | 0.2 | 1 |
-| 5.6 | 3.0 | 4.1 | 1.3 | 0 |
-| 6.7 | 2.5 | 5.8 | 1.8 | 2 |
-
-
-## Limitations
-
-The `kmeans` command only works when `plugins.calcite.enabled=false`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/lookup.md b/_sql-and-ppl/ppl/cmd/lookup.md
deleted file mode 100644
index 460b8c37fc4..00000000000
--- a/_sql-and-ppl/ppl/cmd/lookup.md
+++ /dev/null
@@ -1,357 +0,0 @@
----
-layout: default
-title: "lookup"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 23
----
-# lookup
-
-
-The `lookup` command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend the fields of an index with values from a dimension table, appending or replacing values when the lookup condition is matched. As an alternative to the `join` command, the `lookup` command is more suitable for enriching source data with a static dataset.
-
-## Syntax
-
-Use the following syntax:
-
-`lookup <lookupIndex> (<lookupMappingField> [as <sourceMappingField>])... [(replace | append) (<inputField> [as <outputField>])...]`
-* `lookupIndex`: mandatory. The name of the lookup index (dimension table).
-* `lookupMappingField`: mandatory. A mapping key in `lookupIndex`, analogous to a join key from the right table. You can specify multiple `lookupMappingField` values, comma-delimited.
-* `sourceMappingField`: optional. A mapping key from the source (left side), analogous to a join key from the left side. If not specified, defaults to `lookupMappingField`.
-* `inputField`: optional. A field in `lookupIndex` where matched values are applied to the result output. You can specify multiple `inputField` values, comma-delimited. If not specified, all fields except `lookupMappingField` from `lookupIndex` are applied to the result output.
-* `outputField`: optional. A field of the output. You can specify zero or multiple `outputField` values. If `outputField` has an existing field name in the source query, its values are replaced or appended by matched values from `inputField`. If the field specified in `outputField` is a new field, in the replace strategy a new field is added to the results, but this fails in the append strategy.
-* `replace | append`: optional. The output strategies. If replace, matched values in the `lookupIndex` field overwrite the values in the result. If append, matched values in the `lookupIndex` field only fill in the missing values in the result. **Default:** replace.
- - -## Usage - -Lookup - -``` -source = table1 | lookup table2 id -source = table1 | lookup table2 id, name -source = table1 | lookup table2 id as cid, name -source = table1 | lookup table2 id as cid, name replace dept as department -source = table1 | lookup table2 id as cid, name replace dept as department, city as location -source = table1 | lookup table2 id as cid, name append dept as department -source = table1 | lookup table2 id as cid, name append dept as department, city as location -``` - - -## Example 1: Replace strategy - -The following example PPL query shows how to use `lookup` with the REPLACE strategy to overwrite existing values. - -```bash -curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information uid AS id REPLACE department - | fields id, name, occupation, country, salary, department - """ -}' -``` -{% include copy.html %} - -Result set - -```json -{ - "schema": [ - { - "name": "id", - "type": "integer" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "occupation", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "salary", - "type": "integer" - }, - { - "name": "department", - "type": "string" - } - ], - "datarows": [ - [ - 1000, - "Jake", - "Engineer", - "England", - 100000, - "IT" - ], - [ - 1001, - "Hello", - "Artist", - "USA", - 70000, - null - ], - [ - 1002, - "John", - "Doctor", - "Canada", - 120000, - "DATA" - ], - [ - 1003, - "David", - "Doctor", - null, - 120000, - "HR" - ], - [ - 1004, - "David", - null, - "Canada", - 0, - null - ], - [ - 1005, - "Jane", - "Scientist", - "Canada", - 90000, - "DATA" - ] - ], - "total": 6, - "size": 6 -} -``` - - -## Example 2: Append strategy - -The following example PPL query shows how to use `lookup` with the APPEND strategy to fill missing values only. - -```bash -curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information uid AS id APPEND department - | fields id, name, occupation, country, salary, department - """ -}' -``` -{% include copy.html %} - - -## Example 3: No inputField specified - -The following example PPL query shows how to use `lookup` without specifying inputField, which applies all fields from the lookup index. 
- -```bash -curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information uid AS id, name - | fields id, name, occupation, country, salary, department - """ -}' -``` -{% include copy.html %} - -Result set - -```json -{ - "schema": [ - { - "name": "id", - "type": "integer" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "salary", - "type": "integer" - }, - { - "name": "department", - "type": "string" - }, - { - "name": "occupation", - "type": "string" - } - ], - "datarows": [ - [ - 1000, - "Jake", - "England", - 100000, - "IT", - "Engineer" - ], - [ - 1001, - "Hello", - "USA", - 70000, - null, - null - ], - [ - 1002, - "John", - "Canada", - 120000, - "DATA", - "Scientist" - ], - [ - 1003, - "David", - null, - 120000, - "HR", - "Doctor" - ], - [ - 1004, - "David", - "Canada", - 0, - null, - null - ], - [ - 1005, - "Jane", - "Canada", - 90000, - "DATA", - "Engineer" - ] - ], - "total": 6, - "size": 6 -} -``` - - -## Example 4: OutputField as a new field - -The following example PPL query shows how to use `lookup` with outputField as a new field name. - -```bash -curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information name REPLACE occupation AS new_col - | fields id, name, occupation, country, salary, new_col - """ -}' -``` -{% include copy.html %} - -Result set - -```json -{ - "schema": [ - { - "name": "id", - "type": "integer" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "occupation", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "salary", - "type": "integer" - }, - { - "name": "new_col", - "type": "string" - } - ], - "datarows": [ - [ - 1003, - "David", - "Doctor", - null, - 120000, - "Doctor" - ], - [ - 1004, - "David", - null, - "Canada", - 0, - "Doctor" - ], - [ - 1001, - "Hello", - "Artist", - "USA", - 70000, - null - ], - [ - 1000, - "Jake", - "Engineer", - "England", - 100000, - "Engineer" - ], - [ - 1005, - "Jane", - "Scientist", - "Canada", - 90000, - "Engineer" - ], - [ - 1002, - "John", - "Doctor", - "Canada", - 120000, - "Scientist" - ] - ], - "total": 6, - "size": 6 -} -``` \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/ml.md b/_sql-and-ppl/ppl/cmd/ml.md deleted file mode 100644 index 57478caac80..00000000000 --- a/_sql-and-ppl/ppl/cmd/ml.md +++ /dev/null @@ -1,151 +0,0 @@ ---- -layout: default -title: "ml" -parent: "Commands" -grand_parent: "PPL" -nav_order: 24 ---- -# ml - - -The `ml` command trains, predicts, or trains and predicts on any algorithm in the ml-commons plugin on the search results returned by a PPL command. - -## Syntax - -The `ml` command supports different syntax options depending on the algorithm. - -## AD - Fixed in time RCF for time-series data - -`ml action='train' algorithm='rcf' ` -* `number_of_trees`: optional integer. Number of trees in the forest. **Default:** 30. -* `shingle_size`: optional integer. A shingle is a consecutive sequence of the most recent records. **Default:** 8. -* `sample_size`: optional integer. The sample size used by stream samplers in this forest. **Default:** 256. -* `output_after`: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32. -* `time_decay`: optional double. The decay factor used by stream samplers in this forest. **Default:** 0.0001. 
-* `anomaly_rate`: optional double. The anomaly rate. **Default:** 0.005. -* `time_field`: mandatory string. It specifies the time field for RCF to use as time-series data. -* `date_format`: optional string. It's used for formatting time_field field. **Default:** "yyyy-MM-dd HH:mm:ss". -* `time_zone`: optional string. It's used for setting time zone for time_field field. **Default:** UTC. -* `category_field`: optional string. It specifies the category field used to group inputs. Each category will be independently predicted. - - -## AD - Batch RCF for non-time-series data: - -`ml action='train' algorithm='rcf' ` -* `number_of_trees`: optional integer. Number of trees in the forest. **Default:** 30. -* `sample_size`: optional integer. Number of random samples given to each tree from the training dataset. **Default:** 256. -* `output_after`: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32. -* `training_data_size`: optional integer. **Default:** size of your training dataset. -* `anomaly_score_threshold`: optional double. The threshold of anomaly score. **Default:** 1.0. -* `category_field`: optional string. It specifies the category field used to group inputs. Each category will be independently predicted. - - -## KMEANS: - -`ml action='train' algorithm='kmeans' ` -* `centroids`: optional integer. The number of clusters you want to group your data points into. **Default:** 2. -* `iterations`: optional integer. Number of iterations. **Default:** 10. -* `distance_type`: optional string. The distance type can be COSINE, L1, or EUCLIDEAN. **Default:** EUCLIDEAN. - - -## Example 1: Detecting events in New York City from taxi ridership data with time-series data - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data. - -```sql -source=nyc_taxi -| fields value, timestamp -| ml action='train' algorithm='rcf' time_field='timestamp' -| where value=10844.0 -``` -{% include copy.html %} - -Expected output: - -| value | timestamp | score | anomaly_grade | -| --- | --- | --- | --- | -| 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | - - -## Example 2: Detecting events in New York City from taxi ridership data with time-series data independently with each category - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values. - -```sql -source=nyc_taxi -| fields category, value, timestamp -| ml action='train' algorithm='rcf' time_field='timestamp' category_field='category' -| where value=10844.0 or value=6526.0 -``` -{% include copy.html %} - -Expected output: - -| category | value | timestamp | score | anomaly_grade | -| --- | --- | --- | --- | --- | -| night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | -| day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 | - - -## Example 3: Detecting events in New York City from taxi ridership data with non-time-series data - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data. 
- -```sql -source=nyc_taxi -| fields value -| ml action='train' algorithm='rcf' -| where value=10844.0 -``` -{% include copy.html %} - -Expected output: - -| value | score | anomalous | -| --- | --- | --- | -| 10844.0 | 0.0 | False | - - -## Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently with each category - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values. - -```sql -source=nyc_taxi -| fields category, value -| ml action='train' algorithm='rcf' category_field='category' -| where value=10844.0 or value=6526.0 -``` -{% include copy.html %} - -Expected output: - -| category | value | score | anomalous | -| --- | --- | --- | --- | -| night | 10844.0 | 0.0 | False | -| day | 6526.0 | 0.0 | False | - - -## Example 5: KMEANS - Clustering of iris dataset - -The following example PPL query shows how to use `ml` with KMEANS to classify three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals. - -```sql -source=iris_data -| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm -| ml action='train' algorithm='kmeans' centroids=3 -``` -{% include copy.html %} - -Expected output: - -| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID | -| --- | --- | --- | --- | --- | -| 5.1 | 3.5 | 1.4 | 0.2 | 1 | -| 5.6 | 3.0 | 4.1 | 1.3 | 0 | -| 6.7 | 2.5 | 5.8 | 1.8 | 2 | - - -## Limitations - -The `ml` command can only work with `plugins.calcite.enabled=false`. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/multisearch.md b/_sql-and-ppl/ppl/cmd/multisearch.md deleted file mode 100644 index de1d0a52251..00000000000 --- a/_sql-and-ppl/ppl/cmd/multisearch.md +++ /dev/null @@ -1,151 +0,0 @@ ---- -layout: default -title: "multisearch" -parent: "Commands" -grand_parent: "PPL" -nav_order: 25 ---- -# multisearch - - -The `multisearch` command runs multiple search subsearches and merges their results together. The command allows you to combine data from different queries on the same or different sources, and optionally apply subsequent processing to the combined search results. -Key aspects of `multisearch`: -1. Combines results from multiple search operations into a single result set. -2. Each subsearch can have different filtering criteria, data transformations, and field selections. -3. Results are merged and can be further processed with aggregations, sorting, and other PPL commands. -4. Particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria. -5. Supports timestamp-based result interleaving when working with time-series data. - -Use Cases: -* **Comparative Analysis**: Compare metrics across different segments, regions, or time periods -* **Success Rate Monitoring**: Calculate success rates by comparing successful compared to total operations -* **Multi-source Data Combination**: Merge data from different indexes or apply different filters to the same source -* **A/B Testing Analysis**: Combine results from different test groups for comparison -* **Time-series Data Merging**: Interleave events from multiple sources based on timestamps - - -## Syntax - -Use the following syntax: - -`multisearch ...` -* subsearch1, subsearch2, ...: mandatory. At least two subsearches required. 
Each subsearch must be enclosed in square brackets and start with the `search` keyword. Format: `[search source=index | commands...]`. All PPL commands are supported within subsearches. -* `result-processing`: optional. Commands applied to the merged results after the multisearch operation, such as `stats`, `sort`, `head`, etc. - - -## Usage - -Basic multisearch - -``` -| multisearch [search source=table | where condition1] [search source=table | where condition2] -| multisearch [search source=index1 | fields field1, field2] [search source=index2 | fields field1, field2] -| multisearch [search source=table | where status="success"] [search source=table | where status="error"] -``` - - -## Example 1: Basic age group analysis - -This example combines young and adult customers into a single result set for further analysis. - -```sql -| multisearch [search source=accounts -| where age < 30 -| eval age_group = "young" -| fields firstname, age, age_group] [search source=accounts -| where age >= 30 -| eval age_group = "adult" -| fields firstname, age, age_group] -| sort age -``` -{% include copy.html %} - -Expected output: - -| firstname | age | age_group | -| --- | --- | --- | -| Nanette | 28 | young | -| Amber | 32 | adult | -| Dale | 33 | adult | -| Hattie | 36 | adult | - - -## Example 2: Success rate Pattern - -This example combines high-balance and all valid accounts for comparison analysis. - -```sql -| multisearch [search source=accounts -| where balance > 20000 -| eval query_type = "high_balance" -| fields firstname, balance, query_type] [search source=accounts -| where balance > 0 AND balance <= 20000 -| eval query_type = "regular" -| fields firstname, balance, query_type] -| sort balance desc -``` -{% include copy.html %} - -Expected output: - -| firstname | balance | query_type | -| --- | --- | --- | -| Amber | 39225 | high_balance | -| Nanette | 32838 | high_balance | -| Hattie | 5686 | regular | -| Dale | 4180 | regular | - - -## Example 3: Timestamp interleaving - -This example combines time-series data from multiple sources with automatic timestamp-based ordering. - -```sql -| multisearch [search source=time_data -| where category IN ("A", "B")] [search source=time_data2 -| where category IN ("E", "F")] -| fields @timestamp, category, value, timestamp -| head 5 -``` -{% include copy.html %} - -Expected output: - -| @timestamp | category | value | timestamp | -| --- | --- | --- | --- | -| 2025-08-01 04:00:00 | E | 2001 | 2025-08-01 04:00:00 | -| 2025-08-01 03:47:41 | A | 8762 | 2025-08-01 03:47:41 | -| 2025-08-01 02:30:00 | F | 2002 | 2025-08-01 02:30:00 | -| 2025-08-01 01:14:11 | B | 9015 | 2025-08-01 01:14:11 | -| 2025-08-01 01:00:00 | E | 2003 | 2025-08-01 01:00:00 | - - -## Example 4: Type compatibility - missing fields - -The following example PPL query demonstrates how missing fields are handled with NULL insertion. 
-
-```sql
-| multisearch [search source=accounts
-| where age < 30
-| eval young_flag = "yes"
-| fields firstname, age, young_flag] [search source=accounts
-| where age >= 30
-| fields firstname, age]
-| sort age
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | age | young_flag |
-| --- | --- | --- |
-| Nanette | 28 | yes |
-| Amber | 32 | null |
-| Dale | 33 | null |
-| Hattie | 36 | null |
-
-
-## Limitations
-
-* **Minimum Subsearches**: At least two subsearches must be specified
-* **Schema Compatibility**: When fields with the same name exist across subsearches but have incompatible types, the system automatically resolves conflicts by renaming the conflicting fields. The first occurrence retains the original name, while subsequent conflicting fields are renamed with a numeric suffix (e.g., `age` becomes `age0`, `age1`, etc.). This ensures all data is preserved while maintaining schema consistency.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/parse.md b/_sql-and-ppl/ppl/cmd/parse.md
deleted file mode 100644
index 692c96abbfd..00000000000
--- a/_sql-and-ppl/ppl/cmd/parse.md
+++ /dev/null
@@ -1,134 +0,0 @@
----
-layout: default
-title: "parse"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 26
----
-# parse
-
-
-The `parse` command extracts information from a text field using a regular expression and adds it to the search result.
-
-## Syntax
-
-Use the following syntax:
-
-`parse <field> <pattern>`
-* `field`: mandatory. The field must be a text field.
-* `pattern`: mandatory. The regular expression pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field.
-
-
-## Regular expression
-The regular expression pattern is used to match the whole text field of each document with the Java regex engine. Each named capture group in the expression will become a new `STRING` field.
-
-## Example 1: Create a new field
-
-The following example PPL query shows how to create the new field `host` for each document. `host` becomes the hostname after the @ symbol in the `email` field. Parsing a null field returns an empty string.
-
-```sql
-source=accounts
-| parse email '.+@(?<host>.+)'
-| fields email, host
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | host |
-| --- | --- |
-| amberduke@pyrami.com | pyrami.com |
-| hattiebond@netagy.com | netagy.com |
-| null | |
-| daleadams@boink.com | boink.com |
-
-
-## Example 2: Override an existing field
-
-The following example PPL query shows how to override the existing `address` field while excluding the street number:
-
-```sql
-source=accounts
-| parse address '\d+ (?<address>.+)'
-| fields address
-```
-{% include copy.html %}
-
-Expected output:
-
-| address |
-| --- |
-| Holmes Lane |
-| Bristol Street |
-| Madison Street |
-| Hutchinson Court |
-
-
-## Example 3: Filter and sort by casted parsed field
-
-The following example PPL query shows how to sort street numbers that are higher than 500 in the `address` field.
-
-```sql
-source=accounts
-| parse address '(?<streetNumber>\d+) (?<street>.+)'
-| where cast(streetNumber as int) > 500
-| sort num(streetNumber)
-| fields streetNumber, street
-```
-{% include copy.html %}
-
-Expected output:
-
-| streetNumber | street |
-| --- | --- |
-| 671 | Bristol Street |
-| 789 | Madison Street |
-| 880 | Holmes Lane |
-
-
-## Limitations
-
-There are a few limitations with the parse command:
-- Fields defined by parse cannot be parsed again.
-
-The following command will not work:
-
-```
-source=accounts | parse address '\d+ (?<street>.+)' | parse street '\w+ (?<road>\w+)' ;
-```
-
-- Fields defined by parse cannot be overridden with other commands.
-
-`where` will not match any documents since `street` cannot be overridden:
-
-```
-source=accounts | parse address '\d+ (?<street>.+)' | eval street='1' | where street='1' ;
-```
-
-- The text field used by parse cannot be overridden.
-
-`street` will not be successfully parsed since `address` is overridden:
-
-```
-source=accounts | parse address '\d+ (?<street>.+)' | eval address='1' ;
-```
-
-- Fields defined by parse cannot be filtered/sorted after using them in the `stats` command.
-
-`where` in the following command will not work:
-
-```
-source=accounts | parse email '.+@(?<host>.+)' | stats avg(age) by host | where host=pyrami.com ;
-```
-
-- Fields defined by parse will not appear in the final result unless the original source field is included in the `fields` command.
-
-For example, the following query will not display the parsed field `host` unless the source field `email` is also explicitly included:
-
-```
-source=accounts | parse email '.+@(?<host>.+)' | fields email, host ;
-```
-
-- A named capture group must start with a letter and contain only letters and digits.
-
-For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/rare.md b/_sql-and-ppl/ppl/cmd/rare.md
deleted file mode 100644
index e7c973194c5..00000000000
--- a/_sql-and-ppl/ppl/cmd/rare.md
+++ /dev/null
@@ -1,138 +0,0 @@
----
-layout: default
-title: "rare"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 28
----
-# rare
-
-
-The `rare` command finds the least common tuple of values of all fields in the field list.
-
-**Note**: A maximum of 10 results is returned for each distinct tuple of values of the group-by fields.
-
-## Syntax
-
-Use the following syntax:
-
-`rare [rare-options] <field-list> [by-clause]`
-* `field-list`: mandatory. Comma-delimited list of field names.
-* `by-clause`: optional. One or more fields to group the results by.
-* `rare-options`: optional. Options for the rare command. Supported syntax is [countfield=<string>] [showcount=<bool>].
-* `showcount=<bool>`: optional. Whether to create a field in the output that represents a count of the tuple of values. **Default:** `true`.
-* `countfield=<string>`: optional. The name of the field that contains the count. **Default:** `'count'`.
-* `usenull=<bool>`: optional. Whether to output the null value. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`:
- * When `plugins.ppl.syntax.legacy.preferred=true`, `usenull` defaults to `true`
- * When `plugins.ppl.syntax.legacy.preferred=false`, `usenull` defaults to `false`
-
-
-## Example 1: Find the least common values in a field
-
-The following example PPL query shows how to use `rare` to find the least common gender of all the accounts.
-
-```sql
-source=accounts
-| rare showcount=false gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender |
-| --- |
-| F |
-| M |
-
-
-## Example 2: Find the least common values organized by gender
-
-The following example PPL query shows how to use `rare` to find the least common age of all the accounts grouped by gender.
-
-```sql
-source=accounts
-| rare showcount=false age by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | age |
-| --- | --- |
-| F | 28 |
-| M | 32 |
-| M | 33 |
-| M | 36 |
-
-
-## Example 3: Rare command
-
-The following example PPL query shows how to use `rare` to find the least common gender of all the accounts.
-
-```sql
-source=accounts
-| rare gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | count |
-| --- | --- |
-| F | 1 |
-| M | 3 |
-
-
-## Example 4: Specify the count field option
-
-The following example PPL query shows how to use `rare` to specify the count field.
-
-```sql
-source=accounts
-| rare countfield='cnt' gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | cnt |
-| --- | --- |
-| F | 1 |
-| M | 3 |
-
-
-## Example 5: Specify the usenull field option
-
-The following example PPL queries show how the `usenull` option controls whether null values appear in the results.
-
-```sql
-source=accounts
-| rare usenull=false email
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | count |
-| --- | --- |
-| amberduke@pyrami.com | 1 |
-| daleadams@boink.com | 1 |
-| hattiebond@netagy.com | 1 |
-
-```sql
-source=accounts
-| rare usenull=true email
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | count |
-| --- | --- |
-| null | 1 |
-| amberduke@pyrami.com | 1 |
-| daleadams@boink.com | 1 |
-| hattiebond@netagy.com | 1 |
-
-
-## Limitations
-
-The `rare` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/regex.md b/_sql-and-ppl/ppl/cmd/regex.md
deleted file mode 100644
index 20340048f24..00000000000
--- a/_sql-and-ppl/ppl/cmd/regex.md
+++ /dev/null
@@ -1,147 +0,0 @@
----
-layout: default
-title: "regex"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 29
----
-# regex
-
-
-The `regex` command filters search results by matching field values against a regular expression pattern. Only documents where the specified field matches the pattern are included in the results.
-
-## Syntax
-
-Use the following syntax:
-
-`regex <field> = <pattern>`
-`regex <field> != <pattern>`
-* `field`: mandatory. The field name to match against.
-* `pattern`: mandatory string. The regular expression pattern to match. Supports Java regex syntax including named groups, lookahead/lookbehind, and character classes.
-* = : operator for positive matching (include matches) -* != : operator for negative matching (exclude matches) - - -## Regular expression engine - -The regex command uses Java's built-in regular expression engine, which supports: -* **Standard regex features**: Character classes, quantifiers, anchors -* **Named capture groups**: `(?pattern)` syntax -* **Lookahead/lookbehind**: `(?=...)` and `(?<=...)` assertions -* **Inline flags**: Case-insensitive `(?i)`, multiline `(?m)`, dotall `(?s)`, and other modes - -For complete documentation of Java regex patterns and available modes, see the [Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). - -## Example 1: Basic pattern matching - -The following example PPL query shows how to use `regex` to filter documents where the `lastname` field matches names starting with uppercase letters. - -```sql -source=accounts -| regex lastname="^[A-Z][a-z]+$" -| fields account_number, firstname, lastname -``` -{% include copy.html %} - -Expected output: - -| account_number | firstname | lastname | -| --- | --- | --- | -| 1 | Amber | Duke | -| 6 | Hattie | Bond | -| 13 | Nanette | Bates | -| 18 | Dale | Adams | - - -## Example 2: Negative matching - -The following example PPL query shows how to use `regex` to exclude documents where the `lastname` field ends with "son". - -```sql -source=accounts -| regex lastname!=".*son$" -| fields account_number, lastname -``` -{% include copy.html %} - -Expected output: - -| account_number | lastname | -| --- | --- | -| 1 | Duke | -| 6 | Bond | -| 13 | Bates | -| 18 | Adams | - - -## Example 3: Email domain matching - -The following example PPL query shows how to use `regex` to filter documents by email domain patterns. - -```sql -source=accounts -| regex email="@pyrami\.com$" -| fields account_number, email -``` -{% include copy.html %} - -Expected output: - -| account_number | email | -| --- | --- | -| 1 | amberduke@pyrami.com | - - -## Example 4: Complex patterns with character classes - -The following example PPL query shows how to use `regex` with complex regex patterns with character classes and quantifiers. - -```sql -source=accounts | regex address="\\d{3,4}\\s+[A-Z][a-z]+\\s+(Street|Lane|Court)" | fields account_number, address -``` -{% include copy.html %} - -Expected output: - -| account_number | address | -| --- | --- | -| 1 | 880 Holmes Lane | -| 6 | 671 Bristol Street | -| 13 | 789 Madison Street | -| 18 | 467 Hutchinson Court | - - -## Example 5: Case-sensitive matching - -The following example PPL query demonstrates that regex matching is case-sensitive by default. - -```sql -source=accounts -| regex state="va" -| fields account_number, state -``` -{% include copy.html %} - -Expected output: - -| account_number | state | -| --- | --- | - -```sql -source=accounts -| regex state="VA" -| fields account_number, state -``` -{% include copy.html %} - -Expected output: - -| account_number | state | -| --- | --- | -| 13 | VA | - - -## Limitations - -* **Field specification required**: A field name must be specified in the regex command. Pattern-only syntax (e.g., `regex "pattern"`) is not currently supported -* **String fields only**: The regex command currently only supports string fields. 
Using it on numeric or boolean fields will result in an error.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/rename.md b/_sql-and-ppl/ppl/cmd/rename.md
deleted file mode 100644
index b3bdb9990ea..00000000000
--- a/_sql-and-ppl/ppl/cmd/rename.md
+++ /dev/null
@@ -1,138 +0,0 @@
----
-layout: default
-title: "rename"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 30
----
-# rename
-
-
-The `rename` command renames one or more fields in the search results.
-
-## Syntax
-
-Use the following syntax:
-
-`rename <source-field> AS <target-field> ["," <source-field> AS <target-field>]...`
-* `source-field`: mandatory. The name of the field you want to rename. Supports wildcard patterns using `*`.
-* `target-field`: mandatory. The new name for the field. Must contain the same number of wildcards as the source field.
-
-
-## Behavior
-
-The rename command handles non-existent fields as follows:
-* **Renaming a non-existent field to a non-existent field**: No change occurs to the search results.
-* **Renaming a non-existent field to an existing field**: The existing target field is removed from the search results.
-* **Renaming an existing field to an existing field**: The existing target field is removed and the source field is renamed to the target.
-
-
-## Example 1: Rename one field
-
-The following example PPL query shows how to use `rename` to rename one field.
-
-```sql
-source=accounts
-| rename account_number as an
-| fields an
-```
-{% include copy.html %}
-
-Expected output:
-
-| an |
-| --- |
-| 1 |
-| 6 |
-| 13 |
-| 18 |
-
-
-## Example 2: Rename multiple fields
-
-The following example PPL query shows how to use `rename` to rename multiple fields.
-
-```sql
-source=accounts
-| rename account_number as an, employer as emp
-| fields an, emp
-```
-{% include copy.html %}
-
-Expected output:
-
-| an | emp |
-| --- | --- |
-| 1 | Pyrami |
-| 6 | Netagy |
-| 13 | Quility |
-| 18 | null |
-
-
-## Example 3: Rename with wildcards
-
-The following example PPL query shows how to use `rename` to rename multiple fields using wildcard patterns.
-
-```sql
-source=accounts
-| rename *name as *_name
-| fields first_name, last_name
-```
-{% include copy.html %}
-
-Expected output:
-
-| first_name | last_name |
-| --- | --- |
-| Amber | Duke |
-| Hattie | Bond |
-| Nanette | Bates |
-| Dale | Adams |
-
-
-## Example 4: Rename with multiple wildcard patterns
-
-The following example PPL query shows how to use `rename` to rename multiple fields using multiple wildcard patterns.
-
-```sql
-source=accounts
-| rename *name as *_name, *_number as *number
-| fields first_name, last_name, accountnumber
-```
-{% include copy.html %}
-
-Expected output:
-
-| first_name | last_name | accountnumber |
-| --- | --- | --- |
-| Amber | Duke | 1 |
-| Hattie | Bond | 6 |
-| Nanette | Bates | 13 |
-| Dale | Adams | 18 |
-
-
-## Example 5: Rename existing field to existing field
-
-The following example PPL query shows how to use `rename` to rename an existing field to an existing field. The existing target field is removed, and the source field is renamed to the target.
-
-```sql
-source=accounts
-| rename firstname as age
-| fields age
-```
-{% include copy.html %}
-
-Expected output:
-
-| age |
-| --- |
-| Amber |
-| Hattie |
-| Nanette |
-| Dale |
-
-
-## Limitations
-
-The `rename` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
-Literal asterisk (`*`) characters in field names cannot be renamed because the asterisk is reserved for wildcard matching.
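-
-The following is a minimal sketch illustrating the behavior rules described earlier. The source field name `nonexistent` is hypothetical and assumed to be absent from the `accounts` index; based on the rules above, the existing `employer` target field should be removed from the results:
-
-```sql
-source=accounts
-| rename nonexistent as employer
-| fields account_number, firstname
-```
-{% include copy.html %}
-
-After this command, `employer` is no longer available to later commands in the pipeline, while `account_number` and `firstname` are returned unchanged.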
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/replace.md b/_sql-and-ppl/ppl/cmd/replace.md
deleted file mode 100644
index 4ec179d3109..00000000000
--- a/_sql-and-ppl/ppl/cmd/replace.md
+++ /dev/null
@@ -1,301 +0,0 @@
----
-layout: default
-title: "replace"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 31
----
-# replace
-
-
-The `replace` command replaces text in one or more fields in the search results. It supports literal string replacement and wildcard patterns using `*`.
-
-## Syntax
-
-Use the following syntax:
-
-`replace '<pattern>' WITH '<replacement>' [, '<pattern>' WITH '<replacement>']... IN <field-name>[, <field-name>]...`
-* `pattern`: mandatory. The text pattern you want to replace.
-* `replacement`: mandatory. The text you want to replace with.
-* `field-name`: mandatory. One or more field names where the replacement should occur.
-
-
-## Example 1: Replace text in one field
-
-The following example PPL query shows how to use `replace` to replace text in one field.
-
-```sql
-source=accounts
-| replace "IL" WITH "Illinois" IN state
-| fields state
-```
-{% include copy.html %}
-
-Expected output:
-
-| state |
-| --- |
-| Illinois |
-| TN |
-| VA |
-| MD |
-
-
-## Example 2: Replace text in multiple fields
-
-The following example PPL query shows how to use `replace` to replace text in multiple fields.
-
-```sql
-source=accounts
-| replace "IL" WITH "Illinois" IN state, address
-| fields state, address
-```
-{% include copy.html %}
-
-Expected output:
-
-| state | address |
-| --- | --- |
-| Illinois | 880 Holmes Lane |
-| TN | 671 Bristol Street |
-| VA | 789 Madison Street |
-| MD | 467 Hutchinson Court |
-
-
-## Example 3: Replace with other commands in a pipeline
-
-The following example PPL query shows how to use `replace` with other commands in a query pipeline.
-
-```sql
-source=accounts
-| replace "IL" WITH "Illinois" IN state
-| where age > 30
-| fields state, age
-```
-{% include copy.html %}
-
-Expected output:
-
-| state | age |
-| --- | --- |
-| Illinois | 32 |
-| TN | 36 |
-| MD | 33 |
-
-
-## Example 4: Replace with multiple pattern/replacement pairs
-
-The following example PPL query shows how to use `replace` with multiple pattern/replacement pairs in a single replace command. The replacements are applied sequentially.
-
-```sql
-source=accounts
-| replace "IL" WITH "Illinois", "TN" WITH "Tennessee" IN state
-| fields state
-```
-{% include copy.html %}
-
-Expected output:
-
-| state |
-| --- |
-| Illinois |
-| Tennessee |
-| VA |
-| MD |
-
-
-## Example 5: Pattern matching with LIKE and replace
-
-Because the `replace` command does not support full regular expressions, you can use the `LIKE` function together with `replace` for more flexible pattern matching.
-
-```sql
-source=accounts
-| where LIKE(address, '%Holmes%')
-| replace "Holmes" WITH "HOLMES" IN address
-| fields address, state, gender, age, city
-```
-{% include copy.html %}
-
-Expected output:
-
-| address | state | gender | age | city |
-| --- | --- | --- | --- | --- |
-| 880 HOLMES Lane | IL | M | 32 | Brogan |
-
-
-## Example 6: Wildcard suffix match
-
-Replace values that end with a specific pattern. The wildcard `*` matches any prefix.
-
-```sql
-source=accounts
-| replace "*IL" WITH "Illinois" IN state
-| fields state
-```
-{% include copy.html %}
-
-Expected output:
-
-| state |
-| --- |
-| Illinois |
-| TN |
-| VA |
-| MD |
-
-
-## Example 7: Wildcard prefix match
-
-Replace values that start with a specific pattern. The wildcard `*` matches any suffix.
- -```sql -source=accounts -| replace "IL*" WITH "Illinois" IN state -| fields state -``` -{% include copy.html %} - -Expected output: - -| state | -| --- | -| Illinois | -| TN | -| VA | -| MD | - - -## Example 8: Wildcard capture and substitution - -Use wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement. - -```sql -source=accounts -| replace "* Lane" WITH "Lane *" IN address -| fields address -``` -{% include copy.html %} - -Expected output: - -| address | -| --- | -| Lane 880 Holmes | -| 671 Bristol Street | -| 789 Madison Street | -| 467 Hutchinson Court | - - -## Example 9: Multiple wildcards for pattern transformation - -Use multiple wildcards to transform patterns. Each wildcard in the replacement substitutes the corresponding captured value. - -```sql -source=accounts -| replace "* *" WITH "*_*" IN address -| fields address -``` -{% include copy.html %} - -Expected output: - -| address | -| --- | -| 880_Holmes Lane | -| 671_Bristol Street | -| 789_Madison Street | -| 467_Hutchinson Court | - - -## Example 10: Wildcard with zero wildcards in replacement - -When replacement has zero wildcards, all matching values are replaced with the literal replacement string. - -```sql -source=accounts -| replace "*IL*" WITH "Illinois" IN state -| fields state -``` -{% include copy.html %} - -Expected output: - -| state | -| --- | -| Illinois | -| TN | -| VA | -| MD | - - -## Example 11: Matching literal asterisks - -Use `\*` to match literal asterisk characters (`\*` = literal asterisk, `\\` = literal backslash). - -```sql -source=accounts -| eval note = 'price: *sale*' -| replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note -| fields note -``` -{% include copy.html %} - -Expected output: - -| note | -| --- | -| DISCOUNTED | -| DISCOUNTED | -| DISCOUNTED | -| DISCOUNTED | - - -## Example 12: Wildcard with no replacement wildcards - -Use wildcards in pattern but none in replacement to create a fixed output. - -```sql -source=accounts -| eval test = 'prefix-value-suffix' -| replace 'prefix-*-suffix' WITH 'MATCHED' IN test -| fields test -``` -{% include copy.html %} - -Expected output: - -| test | -| --- | -| MATCHED | -| MATCHED | -| MATCHED | -| MATCHED | - - -## Example 13: Escaped asterisks with wildcards - -Combine escaped asterisks (literal) with wildcards for complex patterns. - -```sql -source=accounts -| eval label = 'file123.txt' -| replace 'file*.*' WITH '\**.*' IN label -| fields label -``` -{% include copy.html %} - -Expected output: - -| label | -| --- | -| *123.txt | -| *123.txt | -| *123.txt | -| *123.txt | - - -## Limitations - -* `Wildcards`: `*` matches zero or more characters (case-sensitive) -* Replacement wildcards must match pattern wildcard count, or be zero -* Escape sequences: `\*` (literal asterisk), `\\` (literal backslash) \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/reverse.md b/_sql-and-ppl/ppl/cmd/reverse.md deleted file mode 100644 index 064b3f1e687..00000000000 --- a/_sql-and-ppl/ppl/cmd/reverse.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -layout: default -title: "reverse" -parent: "Commands" -grand_parent: "PPL" -nav_order: 32 ---- -# reverse - - -The `reverse` command reverses the display order of search results. The same results are returned, but in reverse order. - -## Syntax - -Use the following syntax: - -`reverse` -* No parameters: The reverse command takes no arguments or options. - - -## Note - -The `reverse` command processes the entire dataset. 
If applied directly to millions of records, it will consume significant memory resources on the coordinating node. Users should apply the `reverse` command only to smaller datasets, typically after aggregation operations.
-
-## Example 1: Basic reverse operation
-
-The following example PPL query shows how to use `reverse` to reverse the order of all documents.
-
-```sql
-source=accounts
-| fields account_number, age
-| reverse
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | age |
-| --- | --- |
-| 6 | 36 |
-| 18 | 33 |
-| 1 | 32 |
-| 13 | 28 |
-
-
-## Example 2: Reverse with sort
-
-The following example PPL query shows how to use `reverse` to reverse results after sorting by age in ascending order, effectively giving descending order.
-
-```sql
-source=accounts
-| sort age
-| fields account_number, age
-| reverse
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | age |
-| --- | --- |
-| 6 | 36 |
-| 18 | 33 |
-| 1 | 32 |
-| 13 | 28 |
-
-
-## Example 3: Reverse with head
-
-The following example PPL query shows how to use `reverse` with `head` to get the last 2 records from the original order.
-
-```sql
-source=accounts
-| reverse
-| head 2
-| fields account_number, age
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | age |
-| --- | --- |
-| 6 | 36 |
-| 18 | 33 |
-
-
-## Example 4: Double reverse
-
-The following example PPL query demonstrates that applying `reverse` twice returns the original order.
-
-```sql
-source=accounts
-| reverse
-| reverse
-| fields account_number, age
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | age |
-| --- | --- |
-| 13 | 28 |
-| 1 | 32 |
-| 18 | 33 |
-| 6 | 36 |
-
-
-## Example 5: Reverse with complex pipeline
-
-The following example PPL query shows how to use `reverse` with filtering and field selection.
-
-```sql
-source=accounts
-| where age > 30
-| fields account_number, age
-| reverse
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | age |
-| --- | --- |
-| 6 | 36 |
-| 18 | 33 |
-| 1 | 32 |
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/rex.md b/_sql-and-ppl/ppl/cmd/rex.md
deleted file mode 100644
index 6fc6267abe0..00000000000
--- a/_sql-and-ppl/ppl/cmd/rex.md
+++ /dev/null
@@ -1,277 +0,0 @@
----
-layout: default
-title: "rex"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 33
----
-# rex
-
-
-The `rex` command extracts fields from a raw text field using regular expression named capture groups.
-
-## Syntax
-
-Use the following syntax:
-
-`rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] [offset_field=<string>]`
-* `field`: mandatory. The field must be a string field to extract data from.
-* `pattern`: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using `(?<name>pattern)` syntax.
-* `mode`: optional. Either `extract` or `sed`. **Default:** extract
- * **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior.
- * **sed mode**: Performs text substitution on the field using sed-style patterns
-   * `s/pattern/replacement/` - Replace first occurrence
-   * `s/pattern/replacement/g` - Replace all occurrences (global)
-   * `s/pattern/replacement/n` - Replace only the nth occurrence (where n is a number)
-   * `y/from_chars/to_chars/` - Character-by-character transliteration
-   * Backreferences: `\1`, `\2`, etc. reference captured groups in the replacement
-* `max_match`: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable through `plugins.ppl.rex.max_match.limit`).
-* `offset_field`: optional string. Field name to store the character offset positions of matches. Only available in extract mode.
-
-
-## Example 1: Basic field extraction
-
-The following example PPL query shows how to use `rex` to extract username and domain from email addresses using named capture groups. Both extracted fields are returned as string type.
-
-```sql
-source=accounts
-| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)"
-| fields email, username, domain
-| head 2
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | username | domain |
-| --- | --- | --- |
-| amberduke@pyrami.com | amberduke | pyrami |
-| hattiebond@netagy.com | hattiebond | netagy |
-
-
-## Example 2: Handling non-matching patterns
-
-The following example PPL query shows that the rex command returns all events, setting extracted fields to null for non-matching patterns. Extracted fields are string type when matches are found.
-
-```sql
-source=accounts
-| rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)"
-| fields email, user, domain
-| head 2
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | user | domain |
-| --- | --- | --- |
-| amberduke@pyrami.com | null | null |
-| hattiebond@netagy.com | null | null |
-
-
-## Example 3: Multiple matches with max_match
-
-The following example PPL query shows how to use `rex` to extract multiple words from the address field using the max_match parameter. The extracted field is returned as an array type containing string elements.
-
-```sql
-source=accounts
-| rex field=address "(?<words>[A-Za-z]+)" max_match=2
-| fields address, words
-| head 3
-```
-{% include copy.html %}
-
-Expected output:
-
-| address | words |
-| --- | --- |
-| 880 Holmes Lane | [Holmes,Lane] |
-| 671 Bristol Street | [Bristol,Street] |
-| 789 Madison Street | [Madison,Street] |
-
-
-## Example 4: Text replacement with mode=sed
-
-The following example PPL query shows how to use `rex` to replace email domains using sed mode for text substitution. The resulting field is returned as string type.
-
-```sql
-source=accounts
-| rex field=email mode=sed "s/@.*/@company.com/"
-| fields email
-| head 2
-```
-{% include copy.html %}
-
-Expected output:
-
-| email |
-| --- |
-| amberduke@company.com |
-| hattiebond@company.com |
-
-
-## Example 5: Using offset_field
-
-The following example PPL query shows how to use `rex` to track the character positions where matches occur. Extracted fields are string type, and the offset_field is also string type.
-
-```sql
-source=accounts
-| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" offset_field=matchpos
-| fields email, username, domain, matchpos
-| head 2
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | username | domain | matchpos |
-| --- | --- | --- | --- |
-| amberduke@pyrami.com | amberduke | pyrami | domain=10-15&username=0-8 |
-| hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 |
-
-
-## Example 6: Complex email pattern
-
-The following example PPL query shows how to use `rex` to extract comprehensive email components including the top-level domain. All extracted fields are returned as string type.
-
-```sql
-source=accounts
-| rex field=email "(?<user>[a-zA-Z0-9._%+-]+)@(?<domain>[a-zA-Z0-9.-]+)\\.(?<tld>[a-zA-Z]{2,})"
-| fields email, user, domain, tld
-| head 2
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | user | domain | tld |
-| --- | --- | --- | --- |
-| amberduke@pyrami.com | amberduke | pyrami | com |
-| hattiebond@netagy.com | hattiebond | netagy | com |
-
-
-## Example 7: Chaining multiple rex commands
-
-The following example PPL query shows how to use `rex` to extract initial letters from both first and last names. All extracted fields are returned as string type.
-
-```sql
-source=accounts
-| rex field=firstname "(?<firstinitial>^.)"
-| rex field=lastname "(?<lastinitial>^.)"
-| fields firstname, lastname, firstinitial, lastinitial
-| head 3
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | lastname | firstinitial | lastinitial |
-| --- | --- | --- | --- |
-| Amber | Duke | A | D |
-| Hattie | Bond | H | B |
-| Nanette | Bates | N | B |
-
-
-## Example 8: Named capture group limitations
-
-The following example PPL query demonstrates naming restrictions for capture groups. Group names cannot contain underscores due to Java regex limitations.
-The following PPL query is invalid because the group names contain underscores:
-
-```sql
-source=accounts
-| rex field=email "(?<user_name>[^@]+)@(?<email_domain>[^.]+)"
-| fields email, user_name, email_domain
-```
-{% include copy.html %}
-
-Expected output:
-
-```text
-{'reason': 'Invalid Query', 'details': "Invalid capture group name 'user_name'. Java regex group names must start with a letter and contain only letters and digits.", 'type': 'IllegalArgumentException'}
-Error: Query returned no data
-```
-
-The following PPL query is correct because the group names contain no underscores:
-
-```sql
-source=accounts
-| rex field=email "(?<username>[^@]+)@(?<emaildomain>[^.]+)"
-| fields email, username, emaildomain
-| head 2
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | username | emaildomain |
-| --- | --- | --- |
-| amberduke@pyrami.com | amberduke | pyrami |
-| hattiebond@netagy.com | hattiebond | netagy |
-
-
-## Example 9: Max match limit protection
-
-The following example PPL queries demonstrate the max_match limit protection mechanism. When max_match=0 (unlimited) is specified, the system automatically caps it to prevent memory exhaustion.
-In the following PPL query, `max_match=0` is automatically capped to the default limit of 10:
-
-```sql
-source=accounts
-| rex field=address "(?<digit>\\d*)" max_match=0
-| eval digit_count=array_length(digit)
-| fields address, digit_count
-| head 1
-```
-{% include copy.html %}
-
-Expected output:
-
-| address | digit_count |
-| --- | --- |
-| 880 Holmes Lane | 10 |
-
-The following PPL query exceeds the configured limit and results in an error:
-
-```sql
-source=accounts
-| rex field=address "(?<digit>\\d*)" max_match=100
-| fields address, digit
-| head 1
-```
-{% include copy.html %}
-
-Expected output:
-
-```text
-{'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'}
-Error: Query returned no data
-```
-
-
-## Comparison with related commands
-
-| Feature | rex | parse |
-| --- | --- | --- |
-| Pattern Type | Java Regex | Java Regex |
-| Named Groups Required | Yes | Yes |
-| Multiple Named Groups | Yes | No |
-| Multiple Matches | Yes | No |
-| Text Substitution | Yes | No |
-| Offset Tracking | Yes | No |
-| Special Characters in Group Names | No | No |
-
-
-## Limitations
-
-**Named Capture Group Naming:**
-* Group names must start with a letter and contain only letters and digits
-* For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html)
-
-**Pattern Requirements:**
-* Pattern must contain at least one named capture group
-* Regular capture groups `(...)` without names are not allowed
-
-**Max Match Limit:**
-* The `max_match` parameter is subject to a configurable system limit to prevent memory exhaustion
-* When `max_match=0` (unlimited) is specified, it is automatically capped at the configured limit (default: 10)
-* User-specified values exceeding the configured limit will result in an error
-* Users can adjust the limit through the `plugins.ppl.rex.max_match.limit` cluster setting. Setting this limit to a large value is not recommended as it can lead to excessive memory consumption, especially with patterns that match empty strings (e.g., `\d*`, `\w*`)
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/search.md b/_sql-and-ppl/ppl/cmd/search.md
deleted file mode 100644
index 133db3d6cf4..00000000000
--- a/_sql-and-ppl/ppl/cmd/search.md
+++ /dev/null
@@ -1,653 +0,0 @@
----
-layout: default
-title: "search"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 34
----
-# search
-
-
-The `search` command retrieves documents from the index. The `search` command can only be used as the first command in the PPL query.
-
-## Syntax
-
-Use the following syntax:
-
-`search source=[<cluster>:]<index> [search-expression]`
-* `search`: search keyword, which can be omitted.
-* `index`: mandatory. The search command must specify which index to query. The index name can be prefixed by "\<cluster\>:" for cross-cluster search.
-* `search-expression`: optional. Search expression that gets converted to the OpenSearch [query_string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) function, which uses [Lucene Query Syntax](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html).
-
-
-## Search expression
-
-The search expression syntax supports:
-* **Full text search**: `error` or `"error message"` - Searches the default field configured by the `index.query.default_field` setting (defaults to `*` which searches all fields)
-* **Field-value comparisons**: `field=value`, `field!=value`, `field>value`, `field>=value`, `field<value`, `field<=value`
-* **Time modifiers**: `earliest=<time_modifier>` and `latest=<time_modifier>` - Time offset from current time, filtering on the implicit `@timestamp` field. A relative time modifier has the form `[+|-]<time_amount><time_unit>[+<...>][@<snap_time_unit>]`
-
-**Relative Time Components**:
-* **Time offset**: `+` (future) or `-` (past)
-* **Time amount**: Numeric value + time unit (`second`, `minute`, `hour`, `day`, `week`, `month`, `year`, and their variants)
-* **Snap to unit**: Optional `@` to round to nearest unit (hour, day, month, etc.)
- -**Examples of Time Modifier Values**: -* `earliest=now` - From current time -* `latest='2024-12-31 23:59:59'` - Until a specific date -* `earliest=-7d` - From 7 days ago -* `latest='+1d@d'` - Until tomorrow at start of day -* `earliest='-1month@month'` - From start of previous month -* `latest=1754020061` - Until a unix timestamp (August 1, 2025 03:47:41 at UTC) - -Read more details on time modifiers in the [PPL relative_timestamp documentation](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/ppl-lang/functions/ppl-datetime.md#relative_timestamp). -**Notes:** -* **Column name conflicts**: If your data contains columns named "earliest" or "latest", use backticks to access them as regular fields (e.g., `` `earliest`="value"``) to avoid conflicts with time modifier syntax. -* **Time snap syntax**: Time modifiers with chained time offsets must be wrapped in quotes (e.g., `latest='+1d@month-10h'`) for proper query parsing. - - -## Default field configuration - -When you search without specifying a field, it searches the default field configured by the `index.query.default_field` index setting (defaults to `*` which searches all fields). -You can check or modify the default field setting - GET /accounts/_settings/index.query.default_field - PUT /accounts/_settings - { - "index.query.default_field": "firstname,lastname,email" - } - -## Field types and search behavior - -**Text Fields**: Full-text search, phrase search -* `search message="error occurred" source=logs` -* `Limitations`: Wildcards apply to terms after analysis, not entire field value. - -**Keyword Fields**: Exact matching, wildcard patterns -* `search status="ACTIVE" source=logs` -* `Limitations`: No text analysis, case-sensitive matching - -**Numeric Fields**: Range queries, exact matching, IN operator -* `search age>=18 AND balance<50000 source=accounts` -* `Limitations`: No wildcard or text search support - -**Date Fields**: Range queries, exact matching, IN operator -* `search timestamp>="2024-01-01" source=logs` -* `Limitations`: Must use index mapping date format, no wildcards - -**Boolean Fields**: true/false values only, exact matching, IN operator -* `search active=true source=users` -* `Limitations`: No wildcards or range queries - -**IP Fields**: Exact matching, CIDR notation -* `search client_ip="192.168.1.0/24" source=logs` -* `Limitations`: No wildcards for partial IP matching. For wildcard search use multi field with keyword: `search ip_address.keyword='1*' source=logs` or WHERE clause: `source=logs | where cast(ip_address as string) like '1%'` - -**Field Type Performance Tips**: - * Each field type has specific search capabilities and limitations. Using the wrong field type during ingestion impacts performance and accuracy - * For wildcard searches on non-keyword fields: Add a keyword field copy for better performance. Example: If you need wildcards on a text field, create `message.keyword` alongside `message` - - -## Cross-cluster search - -Cross-cluster search lets any node in a cluster execute search requests against other clusters. Refer to [Cross-Cluster Search]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/admin/cross_cluster_search/) for configuration. 
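-
-The following is a minimal sketch of the cross-cluster syntax described above. The cluster name `my_remote_cluster` is hypothetical and assumed to be already connected to the local cluster and to contain an `accounts` index:
-
-```sql
-search source=my_remote_cluster:accounts
-| fields firstname, lastname
-```
-{% include copy.html %}
-
-The `<cluster>:<index>` prefix is the only change from a local search; the rest of the search expression syntax works the same way.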
- -## Example 1: Text search - -**Basic Text Search** (unquoted single term) - -```sql -search ERROR source=otellogs -| sort @timestamp -| fields severityText, body -| head 1 -``` -{% include copy.html %} - -Expected output: - -| severityText | body | -| --- | --- | -| ERROR | Payment failed: Insufficient funds for user@example.com | - -**Phrase Search** (requires quotes for multi-word exact match) - -```sql -search "Payment failed" source=otellogs -| fields body -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Payment failed: Insufficient funds for user@example.com | - -**Implicit AND with Multiple Terms** (unquoted literals are combined with AND) - -```sql -search user email source=otellogs -| sort @timestamp -| fields body -| head 1 -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Executing SQL: SELECT * FROM users WHERE email LIKE '%@gmail.com' AND status != 'deleted' ORDER BY created_at DESC | - -Note: `search user email` is equivalent to `search user AND email`. Multiple unquoted terms are automatically combined with AND. -**Enclose in double quotes for terms which contain special characters** - -```sql -search "john.doe+newsletter@company.com" source=otellogs -| fields body -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Email notification sent to john.doe+newsletter@company.com with subject: 'Welcome! Your order #12345 is confirmed' | - -### Mixed phrase and boolean - -```sql -search "User authentication" OR OAuth2 source=otellogs -| sort @timestamp -| fields body -| head 1 -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| [2024-01-15 10:30:09] production.INFO: User authentication successful for admin@company.org using OAuth2 | - - -## Example 2: Boolean logic and operator precedence - -The following examples demonstrate boolean operators and precedence. 
- -### Boolean operators - -```sql -search severityText="ERROR" OR severityText="FATAL" source=otellogs -| sort @timestamp -| fields severityText -| head 3 -``` -{% include copy.html %} - -Expected output: - -| severityText | -| --- | -| ERROR | -| FATAL | -| ERROR | - -```sql -search severityText="INFO" AND `resource.attributes.service.name`="cart-service" source=otellogs -| fields body -| head 1; -``` -{% include copy.html %} - -Expected output - -| body | -| --- | -| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | - -**Operator Precedence** (highest to lowest): Parentheses → NOT → OR → AND - -```sql -search severityText="ERROR" OR severityText="WARN" AND severityNumber>15 source=otellogs -| sort @timestamp -| fields severityText, severityNumber -| head 2 -``` -{% include copy.html %} - -Expected output: - -| severityText | severityNumber | -| --- | --- | -| ERROR | 17 | -| ERROR | 17 | - -The preceding expression evaluates as `(severityText="ERROR" OR severityText="WARN") AND severityNumber>15` - -## Example 3: NOT compared to != Semantics - -**!= operator** (field must exist and not equal the value) - -```sql -search employer!="Quility" source=accounts -``` -{% include copy.html %} - -Expected output: - -| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | -| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | - -**NOT operator** (excludes matching conditions, includes null fields) - -```sql -search NOT employer="Quility" source=accounts -``` -{% include copy.html %} - -Expected output: - -| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | -| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | -| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | - -**Key difference**: `!=` excludes null values, `NOT` includes them. -Dale Adams (account 18) has `employer=null`. He appears in `NOT employer="Quility"` but not in `employer!="Quility"`. - -## Example 4: Wildcards - -The following examples demonstrate wildcard pattern matching. - -### Wildcard patterns - -```sql -search severityText=ERR* source=otellogs -| sort @timestamp -| fields severityText -| head 3 -``` -{% include copy.html %} - -Expected output: - -| severityText | -| --- | -| ERROR | -| ERROR | -| ERROR2 | - -```sql -search body=user* source=otellogs -| sort @timestamp -| fields body -| head 2; -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | -| Payment failed: Insufficient funds for user@example.com | - -**Wildcard Rules**: -* `*` - Matches zero or more characters -* `?` - Matches exactly one character - -### Single character wildcard (?) - -```sql -search severityText="INFO?" 
source=otellogs -| sort @timestamp -| fields severityText -| head 3 -``` -{% include copy.html %} - -Expected output: - -| severityText | -| --- | -| INFO2 | -| INFO3 | -| INFO4 | - - -## Example 5: Range queries - -Use comparison operators (>, <, >=, <=) to filter numeric and date fields within specific ranges. Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics. - -```sql -search severityNumber>15 AND severityNumber<=20 source=otellogs -| sort @timestamp -| fields severityNumber -| head 3 -``` -{% include copy.html %} - -Expected output: - -| severityNumber | -| --- | -| 17 | -| 17 | -| 18 | - -```sql -search `attributes.payment.amount`>=1000.0 AND `attributes.payment.amount`<=2000.0 source=otellogs -| fields body; -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Payment failed: Insufficient funds for user@example.com | - - -## Example 6: Field search with Wildcards - -When searching in text or keyword fields, wildcards enable partial matching. This is particularly useful for finding records where you only know part of the value. Note that wildcards work best with keyword fields, while text fields may produce unexpected results due to tokenization. -**Partial Search in Keyword Fields** - -```sql -search employer=Py* source=accounts -| fields firstname, employer -``` -{% include copy.html %} - -Expected output: - -| firstname | employer | -| --- | --- | -| Amber | Pyrami | - -### Combining wildcards with field comparisons - -```sql -search firstname=A* AND age>30 source=accounts -| fields firstname, age, city -``` -{% include copy.html %} - -Expected output: - -| firstname | age | city | -| --- | --- | --- | -| Amber | 32 | Brogan | - -**Important Notes on Wildcard Usage**: -* **Keyword fields**: Best for wildcard searches - exact value matching with pattern support -* **Text fields**: Wildcards apply to individual tokens after analysis, not the entire field value -* **Performance**: Leading wildcards (e.g., `*@example.com`) are slower than trailing wildcards -* **Case sensitivity**: Keyword field wildcards are case-sensitive unless normalized during indexing - - -## Example 7: IN operator and field comparisons - -The IN operator efficiently checks if a field matches any value from a list. This is cleaner and more performant than chaining multiple OR conditions for the same field. -**IN Operator** - -```sql -search severityText IN ("ERROR", "WARN", "FATAL") source=otellogs -| sort @timestamp -| fields severityText -| head 3 -``` -{% include copy.html %} - -Expected output: - -| severityText | -| --- | -| ERROR | -| WARN | -| FATAL | - -### Field comparison examples - -```sql -search severityNumber=17 source=otellogs -| sort @timestamp -| fields body -| head 1 -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Payment failed: Insufficient funds for user@example.com | - -```sql -search `attributes.user.email`="user@example.com" source=otellogs -| fields body; -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Payment failed: Insufficient funds for user@example.com | - - -## Example 8: Complex expressions - -Combine multiple conditions using boolean operators and parentheses to create sophisticated search queries. 
- -```sql -search (severityText="ERROR" OR severityText="WARN") AND severityNumber>10 source=otellogs -| sort @timestamp -| fields severityText -| head 3 -``` -{% include copy.html %} - -Expected output: - -| severityText | -| --- | -| ERROR | -| WARN | -| ERROR | - -```sql -search `attributes.user.email`="user@example.com" OR (`attributes.error.code`="INSUFFICIENT_FUNDS" AND severityNumber>15) source=otellogs -| fields body; -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Payment failed: Insufficient funds for user@example.com | - - -## Example 9: Time modifiers - -Time modifiers filter search results by time range using the implicit `@timestamp` field. They support various time formats for precise temporal filtering. -**Absolute Time Filtering** - -```sql -search earliest='2024-01-15 10:30:05' latest='2024-01-15 10:30:10' source=otellogs -| fields @timestamp, severityText -``` -{% include copy.html %} - -Expected output: - -| @timestamp | severityText | -| --- | --- | -| 2024-01-15 10:30:05.678901234 | FATAL | -| 2024-01-15 10:30:06.789012345 | TRACE | -| 2024-01-15 10:30:07.890123456 | ERROR | -| 2024-01-15 10:30:08.901234567 | WARN | -| 2024-01-15 10:30:09.012345678 | INFO | -| 2024-01-15 10:30:10.123456789 | TRACE2 | - -**Relative Time Filtering** (before 30 seconds ago) - -```sql -search latest=-30s source=otellogs -| sort @timestamp -| fields @timestamp, severityText -| head 3 -``` -{% include copy.html %} - -Expected output: - -| @timestamp | severityText | -| --- | --- | -| 2024-01-15 10:30:00.123456789 | INFO | -| 2024-01-15 10:30:01.23456789 | ERROR | -| 2024-01-15 10:30:02.345678901 | WARN | - -**Time Snapping** (before start of current minute) - -```sql -search latest='@m' source=otellogs -| fields @timestamp, severityText -| head 2 -``` -{% include copy.html %} - -Expected output: - -| @timestamp | severityText | -| --- | --- | -| 2024-01-15 10:30:00.123456789 | INFO | -| 2024-01-15 10:30:01.23456789 | ERROR | - -### Unix timestamp filtering - -```sql -search earliest=1705314600 latest=1705314605 source=otellogs -| fields @timestamp, severityText -``` -{% include copy.html %} - -Expected output: - -| @timestamp | severityText | -| --- | --- | -| 2024-01-15 10:30:00.123456789 | INFO | -| 2024-01-15 10:30:01.23456789 | ERROR | -| 2024-01-15 10:30:02.345678901 | WARN | -| 2024-01-15 10:30:03.456789012 | DEBUG | -| 2024-01-15 10:30:04.567890123 | INFO | - - -## Example 10: Special characters and Escaping - -Understand when and how to escape special characters in your search queries. There are two categories of characters that need escaping: -**Characters that must be escaped**: -* **Backslashes (\)**: Always escape as `\\` to search for literal backslash -* **Quotes (")**: Escape as `\"` when inside quoted strings - -**Wildcard characters (escape only to search literally)**: -* **Asterisk (*)**: Use as-is for wildcard, escape as `\\*` to search for literal asterisk -* **Question mark (?)**: Use as-is for wildcard, escape as `\\?` to search for literal question mark - - -| Intent | PPL syntax | Result | -|--------|------------|--------| -| Wildcard search | `field=user*` | Matches "user", "user123", "userABC" | -| Literal "user*" | `field="user\\*"` | Matches only "user*" | -| Wildcard search | `field=log?` | Matches "log1", "logA", "logs" | -| Literal "log?" | `field="log\\?"` | Matches only "log?" 
| - - -**Backslash in file paths** - -```sql -search `attributes.error.type`="C:\\\\Users\\\\admin" source=otellogs -| fields `attributes.error.type` -``` -{% include copy.html %} - -Expected output: - -| attributes.error.type | -| --- | -| C:\Users\admin | - -Note: Each backslash in the search value needs to be escaped with another backslash. When using REST API with JSON, additional JSON escaping is required. -**Quotes within strings** - -```sql -search body="\"exact phrase\"" source=otellogs -| sort @timestamp -| fields body -| head 1 -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] | - -**Text with special characters** - -```sql -search "wildcard\\* fuzzy~2" source=otellogs -| fields body -| head 1 -``` -{% include copy.html %} - -Expected output: - -| body | -| --- | -| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] | - - -## Example 11: Fetch all Data - -Retrieve all documents from an index by specifying only the source without any search conditions. This is useful for exploring small datasets or verifying data ingestion. - -```sql -source=accounts -``` -{% include copy.html %} - -Expected output: - -| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | -| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | -| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | -| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | - \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/showdatasources.md b/_sql-and-ppl/ppl/cmd/showdatasources.md deleted file mode 100644 index 57c1b85cf49..00000000000 --- a/_sql-and-ppl/ppl/cmd/showdatasources.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: default -title: "showdatasources" -parent: "Commands" -grand_parent: "PPL" -nav_order: 35 ---- -# show datasources - - -The `show datasources` command queries datasources configured in the PPL engine. The `show datasources` command can only be used as the first command in the PPL query. - -## Syntax - -Use the following syntax: - -`show datasources` - -## Example 1: Fetch all PROMETHEUS datasources - -The following example PPL query shows how to use `showdatasources` to fetch all the datasources of type prometheus. -PPL query for all PROMETHEUS DATASOURCES - -```sql -show datasources -| where CONNECTOR_TYPE='PROMETHEUS' -``` -{% include copy.html %} - -Expected output: - -| DATASOURCE_NAME | CONNECTOR_TYPE | -| --- | --- | -| my_prometheus | PROMETHEUS | - - -## Limitations - -The `show datasources` command can only work with `plugins.calcite.enabled=false`. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/sort.md b/_sql-and-ppl/ppl/cmd/sort.md deleted file mode 100644 index e9d1be9f89f..00000000000 --- a/_sql-and-ppl/ppl/cmd/sort.md +++ /dev/null @@ -1,234 +0,0 @@ ---- -layout: default -title: "sort" -parent: "Commands" -grand_parent: "PPL" -nav_order: 36 ---- -# sort - - -The `sort` command sorts all the search results by the specified fields. 
- -## Syntax - -Use the following syntax: - -`sort [count] <[+|-] sort-field | sort-field [asc|a|desc|d]>...` -* `count`: optional. The number of results to return. Specifying a count of 0 or less than 0 returns all results. **Default:** 0. -* `[+|-]`: optional. The plus [+] stands for ascending order and NULL/MISSING first and a minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. -* `[asc|a|desc|d]`: optional. asc/a stands for ascending order and NULL/MISSING first. desc/d stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. -* `sort-field`: mandatory. The field used to sort. Can use `auto(field)`, `str(field)`, `ip(field)`, or `num(field)` to specify how to interpret field values. - -> **Note:** -> You cannot mix +/- and asc/desc in the same sort command. Choose one approach for all fields in a single sort command. -> -> - -## Example 1: Sort by one field - -The following example PPL query shows how to use `sort` to sort all documents by age field in ascending order. - -```sql -source=accounts -| sort age -| fields account_number, age -``` -{% include copy.html %} - -Expected output: - -| account_number | age | -| --- | --- | -| 13 | 28 | -| 1 | 32 | -| 18 | 33 | -| 6 | 36 | - - -## Example 2: Sort by one field return all the result - -The following example PPL query shows how to use `sort` to sort all documents by age field in ascending order and return all results. - -```sql -source=accounts -| sort 0 age -| fields account_number, age -``` -{% include copy.html %} - -Expected output: - -| account_number | age | -| --- | --- | -| 13 | 28 | -| 1 | 32 | -| 18 | 33 | -| 6 | 36 | - - -## Example 3: Sort by one field in descending order (using -) - -The following example PPL query shows how to use `sort` to sort all documents by age field in descending order. - -```sql -source=accounts -| sort - age -| fields account_number, age -``` -{% include copy.html %} - -Expected output: - -| account_number | age | -| --- | --- | -| 6 | 36 | -| 18 | 33 | -| 1 | 32 | -| 13 | 28 | - - -## Example 4: Sort by one field in descending order (using desc) - -The following example PPL query shows how to use `sort` to sort all documents by the age field in descending order using the desc keyword. - -```sql -source=accounts -| sort age desc -| fields account_number, age -``` -{% include copy.html %} - -Expected output: - -| account_number | age | -| --- | --- | -| 6 | 36 | -| 18 | 33 | -| 1 | 32 | -| 13 | 28 | - - -## Example 5: Sort by multiple fields (using +/-) - -The following example PPL query shows how to use `sort` to sort all documents by gender field in ascending order and age field in descending order using +/- operators. - -```sql -source=accounts -| sort + gender, - age -| fields account_number, gender, age -``` -{% include copy.html %} - -Expected output: - -| account_number | gender | age | -| --- | --- | --- | -| 13 | F | 28 | -| 6 | M | 36 | -| 18 | M | 33 | -| 1 | M | 32 | - - -## Example 6: Sort by multiple fields (using asc/desc) - -The following example PPL query shows how to use `sort` to sort all documents by the gender field in ascending order and age field in descending order using asc/desc keywords. 
- -```sql -source=accounts -| sort gender asc, age desc -| fields account_number, gender, age -``` -{% include copy.html %} - -Expected output: - -| account_number | gender | age | -| --- | --- | --- | -| 13 | F | 28 | -| 6 | M | 36 | -| 18 | M | 33 | -| 1 | M | 32 | - - -## Example 7: Sort by field include null value - -The following example PPL query shows how to use `sort` to sort employer field by default option (ascending order and null first). The result shows that null value is in the first row. - -```sql -source=accounts -| sort employer -| fields employer -``` -{% include copy.html %} - -Expected output: - -| employer | -| --- | -| null | -| Netagy | -| Pyrami | -| Quility | - - -## Example 8: Specify the number of sorted documents to return - -The following example PPL query shows how to use `sort` to sort all documents and return 2 documents. - -```sql -source=accounts -| sort 2 age -| fields account_number, age -``` -{% include copy.html %} - -Expected output: - -| account_number | age | -| --- | --- | -| 13 | 28 | -| 1 | 32 | - - -## Example 9: Sort with desc modifier - -The following example PPL query shows how to use `sort` to sort with the desc modifier to reverse sort order. - -```sql -source=accounts -| sort age desc -| fields account_number, age -``` -{% include copy.html %} - -Expected output: - -| account_number | age | -| --- | --- | -| 6 | 36 | -| 18 | 33 | -| 1 | 32 | -| 13 | 28 | - - -## Example 10: Sort with specifying field type - -The following example PPL query shows how to use `sort` to sort with str() to sort numeric values lexicographically. - -```sql -source=accounts -| sort str(account_number) -| fields account_number -``` -{% include copy.html %} - -Expected output: - -| account_number | -| --- | -| 1 | -| 13 | -| 18 | -| 6 | - \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/spath.md b/_sql-and-ppl/ppl/cmd/spath.md deleted file mode 100644 index 2c37c6a13f8..00000000000 --- a/_sql-and-ppl/ppl/cmd/spath.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -layout: default -title: "spath" -parent: "Commands" -grand_parent: "PPL" -nav_order: 37 ---- -# spath - - -The `spath` command extracts fields from structured text data. It currently allows selecting from JSON data with JSON paths. - -## Syntax - -Use the following syntax: - -`spath input= [output=] [path=]` -* `input`: mandatory. The field to scan for JSON data. -* `output`: optional. The destination field that the data will be loaded to. **Default:** value of `path`. -* `path`: mandatory. The path of the data to load for the object. For more information about path syntax, see [json_extract]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/json#json_extract). - - -## Note - -The `spath` command currently does not support pushdown behavior for extraction. It will be slow on large datasets. It's generally better to index fields needed for filtering directly instead of using `spath` to filter nested fields. - -## Example 1: Simple field Extraction - -The simplest spath is to extract a single field. This example extracts `n` from the `doc` field of type `text`. - -```sql -source=structured -| spath input=doc_n n -| fields doc_n n -``` -{% include copy.html %} - -Expected output: - -| doc_n | n | -| --- | --- | -| {"n": 1} | 1 | -| {"n": 2} | 2 | -| {"n": 3} | 3 | - - -## Example 2: Lists and nesting - -The following example PPL query demonstrates more JSON path uses, like traversing nested fields and extracting list elements. 
- -```sql -source=structured -| spath input=doc_list output=first_element list{0} -| spath input=doc_list output=all_elements list{} -| spath input=doc_list output=nested nest_out.nest_in -| fields doc_list first_element all_elements nested -``` -{% include copy.html %} - -Expected output: - -| doc_list | first_element | all_elements | nested | -| --- | --- | --- | --- | -| {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a | -| {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a | -| {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a | - - -## Example 3: Sum of inner elements - -The following example PPL query shows how to use `spath` to extract an inner field and do statistics on it, using the docs from example 1. It also demonstrates that `spath` always returns strings for inner types. - -```sql -source=structured -| spath input=doc_n n -| eval n=cast(n as int) -| stats sum(n) -| fields `sum(n)` -``` -{% include copy.html %} - -Expected output: - -| sum(n) | -| --- | -| 6 | - - -## Example 4: Escaped paths - -`spath` can escape paths with strings to accept any path that `json_extract` does. This includes escaping complex field names as array components. - -```sql -source=structured -| spath output=a input=doc_escape "['a fancy field name']" -| spath output=b input=doc_escape "['a.b.c']" -| fields a b -``` -{% include copy.html %} - -Expected output: - -| a | b | -| --- | --- | -| true | 0 | -| true | 1 | -| false | 2 | - \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/stats.md b/_sql-and-ppl/ppl/cmd/stats.md deleted file mode 100644 index 8228c7acefa..00000000000 --- a/_sql-and-ppl/ppl/cmd/stats.md +++ /dev/null @@ -1,464 +0,0 @@ ---- -layout: default -title: "stats" -parent: "Commands" -grand_parent: "PPL" -nav_order: 38 ---- -# stats - - -The `stats` command calculates the aggregation from the search results. - -## Syntax - -Use the following syntax: - -`stats [bucket_nullable=bool] ... [by-clause]` -* `aggregation`: mandatory. An aggregation function. -* `bucket_nullable`: optional. Controls whether the stats command includes null buckets in group-by aggregations. When set to `false`, the aggregation ignores records where the group-by field is null, resulting in faster performance by excluding null bucket. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`. - * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true` - * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false` -* `by-clause`: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** If no by-clause is specified, the stats command returns only one row, which is the aggregation over the entire search results. -* `span-expression`: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). The unit of the interval expression is the natural unit by default. If the field is a date/time type field, the aggregation results always ignore null bucket. For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets. 
-  * Available time units
-    * millisecond (ms)
-    * second (s)
-    * minute (m, case sensitive)
-    * hour (h)
-    * day (d)
-    * week (w)
-    * month (M, case sensitive)
-    * quarter (q)
-    * year (y)
-
-
-## Aggregation functions
-
-The stats command supports the following aggregation functions:
-* `COUNT`/`C`: Count of values
-* `SUM`: Sum of numeric values
-* `AVG`: Average of numeric values
-* `MAX`: Maximum value
-* `MIN`: Minimum value
-* `VAR_SAMP`: Sample variance
-* `VAR_POP`: Population variance
-* `STDDEV_SAMP`: Sample standard deviation
-* `STDDEV_POP`: Population standard deviation
-* `DISTINCT_COUNT_APPROX`: Approximate distinct count
-* `TAKE`: List of original values
-* `PERCENTILE`/`PERCENTILE_APPROX`: Percentile calculations
-* `PERC<percent>`/`P<percent>`: Percentile shortcut functions
-* `MEDIAN`: 50th percentile
-* `EARLIEST`: Earliest value by timestamp
-* `LATEST`: Latest value by timestamp
-* `FIRST`: First non-null value
-* `LAST`: Last non-null value
-* `LIST`: Collect all values into array
-* `VALUES`: Collect unique values into sorted array
-
-For detailed documentation of each function, see [Aggregation Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/aggregations/).
-
-## Limitations

-The following limitations apply to the `stats` command.
-
-### Bucket aggregation results may be approximate on large datasets
-
-In OpenSearch, `doc_count` values for a terms bucket aggregation may be approximate. As a result, any aggregations (such as `sum` and `avg`) on the terms bucket aggregation may also be approximate.
-For example, the following PPL query (finding the top 10 URLs) may return an approximate result if the cardinality of `URL` is high.
-
-```sql
-source=hits
-| stats bucket_nullable=false count() as c by URL
-| sort - c
-| head 10
-```
-{% include copy.html %}
-
-This query is pushed down to a terms bucket aggregation DSL query with `"order": { "_count": "desc" }`. In OpenSearch, this terms aggregation may throw away some buckets.
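-
-For reference, this pushdown produces a terms aggregation similar to the following (a simplified sketch; the exact request generated by the plugin may differ):
-
-```json
-{
-  "size": 0,
-  "aggs": {
-    "URL": {
-      "terms": {
-        "field": "URL",
-        "size": 10,
-        "order": { "_count": "desc" }
-      }
-    }
-  }
-}
-```
-{% include copy.html %}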
-
-### Sorting by ascending doc_count may produce inaccurate results
-
-Similar to the preceding PPL query, the following query (finding the 10 rarest URLs) often produces inaccurate results.
-
-```sql
-source=hits
-| stats bucket_nullable=false count() as c by URL
-| sort + c
-| head 10
-```
-{% include copy.html %}
-
-A term that is globally infrequent might not appear as infrequent on every individual shard or might be entirely absent from the least frequent results returned by some shards. Conversely, a term that appears infrequently on one shard might be common on another. In both scenarios, rare terms can be missed during shard-level aggregation, resulting in incorrect overall results.
-
-## Example 1: Calculate the count of events
-
-The following example PPL query shows how to use `stats` to calculate the count of events in the accounts.
-
-```sql
-source=accounts
-| stats count()
-```
-{% include copy.html %}
-
-Expected output:
-
-| count() |
-| --- |
-| 4 |
-
-
-## Example 2: Calculate the average of a field
-
-The following example PPL query shows how to use `stats` to calculate the average age of all the accounts.
-
-```sql
-source=accounts
-| stats avg(age)
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(age) |
-| --- |
-| 32.25 |
-
-
-## Example 3: Calculate the average of a field by group
-
-The following example PPL query shows how to use `stats` to calculate the average age of all the accounts, grouped by gender.
-
-```sql
-source=accounts
-| stats avg(age) by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(age) | gender |
-| --- | --- |
-| 28.0 | F |
-| 33.666666666666664 | M |
-
-
-## Example 4: Calculate the average, sum, and count of a field by group
-
-The following example PPL query shows how to use `stats` to calculate the average age, the total age, and the event count for all the accounts, grouped by gender.
-
-```sql
-source=accounts
-| stats avg(age), sum(age), count() by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| avg(age) | sum(age) | count() | gender |
-| --- | --- | --- | --- |
-| 28.0 | 28 | 1 | F |
-| 33.666666666666664 | 101 | 3 | M |
-
-
-## Example 5: Calculate the maximum of a field
-
-This example calculates the maximum age of all the accounts.
-
-```sql
-source=accounts
-| stats max(age)
-```
-{% include copy.html %}
-
-Expected output:
-
-| max(age) |
-| --- |
-| 36 |
-
-
-## Example 6: Calculate the maximum and minimum of a field by group
-
-This example calculates the maximum and minimum age values of all the accounts, grouped by gender.
-
-```sql
-source=accounts
-| stats max(age), min(age) by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| max(age) | min(age) | gender |
-| --- | --- | --- |
-| 28 | 28 | F |
-| 36 | 32 | M |
-
-
-## Example 7: Calculate the distinct count of a field
-
-To get the count of distinct values of a field, you can use the `DISTINCT_COUNT` (or `DC`) function instead of `COUNT`. This example calculates both the count and the distinct count of the gender field across all the accounts.
-
-```sql
-source=accounts
-| stats count(gender), distinct_count(gender)
-```
-{% include copy.html %}
-
-Expected output:
-
-| count(gender) | distinct_count(gender) |
-| --- | --- |
-| 4 | 2 |
-
-
-## Example 8: Calculate the count by a span
-
-This example gets the count of age in 10-year intervals.
-
-```sql
-source=accounts
-| stats count(age) by span(age, 10) as age_span
-```
-{% include copy.html %}
-
-Expected output:
-
-| count(age) | age_span |
-| --- | --- |
-| 1 | 20 |
-| 3 | 30 |
-
-
-## Example 9: Calculate the count by gender and span
-
-This example gets the count of age in 5-year intervals, grouped by gender.
-
-```sql
-source=accounts
-| stats count() as cnt by span(age, 5) as age_span, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | age_span | gender |
-| --- | --- | --- |
-| 1 | 25 | F |
-| 2 | 30 | M |
-| 1 | 35 | M |
-
-The span is always the first grouping key, regardless of the order in which you specify it.
-
-```sql
-source=accounts
-| stats count() as cnt by gender, span(age, 5) as age_span
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | age_span | gender |
-| --- | --- | --- |
-| 1 | 25 | F |
-| 2 | 30 | M |
-| 1 | 35 | M |
-
-
-## Example 10: Calculate the count and get an email list by gender and span
-
-This example gets the count of age in 5-year intervals, grouped by gender, and additionally returns a list of at most 5 emails for each row.
-
-```sql
-source=accounts
-| stats count() as cnt, take(email, 5) by span(age, 5) as age_span, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | take(email, 5) | age_span | gender |
-| --- | --- | --- | --- |
-| 1 | [] | 25 | F |
-| 2 | [amberduke@pyrami.com,daleadams@boink.com] | 30 | M |
-| 1 | [hattiebond@netagy.com] | 35 | M |
-
-
-## Example 11: Calculate the percentile of a field
-
-The following example PPL query shows how to use `stats` to calculate the 90th percentile of age for all the accounts.
-
-```sql
-source=accounts
-| stats percentile(age, 90)
-```
-{% include copy.html %}
-
-Expected output:
-
-| percentile(age, 90) |
-| --- |
-| 36 |
-
-
-## Example 12: Calculate the percentile of a field by group
-
-The following example PPL query shows how to use `stats` to calculate the 90th percentile of age for all the accounts, grouped by gender.
-
-```sql
-source=accounts
-| stats percentile(age, 90) by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| percentile(age, 90) | gender |
-| --- | --- |
-| 28 | F |
-| 36 | M |
-
-
-## Example 13: Calculate the percentile by gender and span
-
-This example gets the 90th percentile of age in 10-year intervals, grouped by gender.
-
-```sql
-source=accounts
-| stats percentile(age, 90) as p90 by span(age, 10) as age_span, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| p90 | age_span | gender |
-| --- | --- | --- |
-| 28 | 20 | F |
-| 36 | 30 | M |
-
-
-## Example 14: Collect all values in a field using LIST
-
-The following example PPL query shows how to use `stats` to collect all firstname values, preserving duplicates and order.
-
-```sql
-source=accounts
-| stats list(firstname)
-```
-{% include copy.html %}
-
-Expected output:
-
-| list(firstname) |
-| --- |
-| [Amber,Hattie,Nanette,Dale] |
-
-
-## Example 15: Ignore the null bucket
-
-The following example PPL query uses `bucket_nullable=false` to exclude documents with a null `email` value from the aggregation.
-
-```sql
-source=accounts
-| stats bucket_nullable=false count() as cnt by email
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | email |
-| --- | --- |
-| 1 | amberduke@pyrami.com |
-| 1 | daleadams@boink.com |
-| 1 | hattiebond@netagy.com |
-
-
-## Example 16: Collect unique values in a field using VALUES
-
-The following example PPL query shows how to use `stats` to collect all unique firstname values, sorted lexicographically with duplicates removed.
-
-```sql
-source=accounts
-| stats values(firstname)
-```
-{% include copy.html %}
-
-Expected output:
-
-| values(firstname) |
-| --- |
-| [Amber,Dale,Hattie,Nanette] |
-
-
-## Example 17: Spans on date/time fields always ignore the null bucket
-
-Index example data:
-
-| Name | DEPTNO | birthday |
-| --- | --- | --- |
-| Alice | 1 | 2024-04-21 |
-| Bob | 2 | 2025-08-21 |
-| Jeff | null | 2025-04-22 |
-| Adam | 2 | null |
-
-```sql
-source=example
-| stats count() as cnt by span(birthday, 1y) as year
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | year |
-| --- | --- |
-| 1 | 2024-01-01 |
-| 2 | 2025-01-01 |
-
-```sql
-source=example
-| stats count() as cnt by span(birthday, 1y) as year, DEPTNO
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | year | DEPTNO |
-| --- | --- | --- |
-| 1 | 2024-01-01 | 1 |
-| 1 | 2025-01-01 | 2 |
-| 1 | 2025-01-01 | null |
-
-```sql
-source=example
-| stats bucket_nullable=false count() as cnt by span(birthday, 1y) as year, DEPTNO
-```
-{% include copy.html %}
-
-Expected output:
-
-| cnt | year | DEPTNO |
-| --- | --- | --- |
-| 1 | 2024-01-01 | 1 |
-| 1 | 2025-01-01 | 2 |
-
-
-## Example 18: Calculate the count by the implicit @timestamp field
-
-The following example PPL query demonstrates that if you omit the `field` parameter in the `span` function, the query automatically uses the implicit `@timestamp` field.
-
-```sql
-source=big5
-| stats count() by span(1month)
-```
-{% include copy.html %}
-
-Expected output:
-
-| count() | span(1month) |
-| --- | --- |
-| 1 | 2023-01-01 00:00:00 |
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/streamstats.md b/_sql-and-ppl/ppl/cmd/streamstats.md
deleted file mode 100644
index 5f896cea7e9..00000000000
--- a/_sql-and-ppl/ppl/cmd/streamstats.md
+++ /dev/null
@@ -1,266 +0,0 @@
----
-layout: default
-title: "streamstats"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 39
----
-# streamstats
-
-
-The `streamstats` command calculates cumulative or rolling statistics as events are processed in order. Unlike `stats` or `eventstats`, which operate on the entire dataset at once, it computes values incrementally on a per-event basis, respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events.
-Key aspects of `streamstats`:
-1. It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis.
-2. Supports arguments such as window (for sliding window calculations) and current (to control whether the current event is included in the calculation).
-3. Retains all original events and appends new fields containing the calculated statistics.
-4. Particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events.
-
-Differences between `stats`, `eventstats`, and `streamstats`:
-All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce:
-* Transformation Behavior
-  * `stats`: Transforms all events into an aggregated result table, losing original event structure.
-  * `eventstats`: Adds aggregation results as new fields to the original events without removing the event structure.
-  * `streamstats`: Adds cumulative (running) aggregation results to each event as they stream through the pipeline.
-* Output Format
-  * `stats`: Output contains only aggregated values. Original raw events are not preserved.
-  * `eventstats`: Original events remain, with extra fields containing summary statistics.
-  * `streamstats`: Original events remain, with extra fields containing running totals or cumulative statistics.
-* Aggregation Scope
-  * `stats`: Based on all events in the search (or groups defined by BY clause).
-  * `eventstats`: Based on all relevant events, then the result is added back to each event in the group.
-  * `streamstats`: Calculations occur progressively as each event is processed; can be scoped by window.
-* Use Cases
-  * `stats`: When only aggregated results are needed (e.g., counts, averages, sums).
-  * `eventstats`: When aggregated statistics are needed alongside original event data.
-  * `streamstats`: When a running total or cumulative statistic is needed across event streams.
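-
-To make the contrast concrete, the following sketch runs the same aggregation under each command. It assumes a hypothetical index `events` with a numeric field `a`; only the shape of the output differs:
-
-```sql
-source=events | stats avg(a)
-source=events | eventstats avg(a)
-source=events | streamstats avg(a)
-```
-{% include copy.html %}
-
-The first query returns a single aggregated row, the second returns every event with a constant `avg(a)` field attached, and the third returns every event with a running `avg(a)` computed over the events processed so far.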
-
-
-## Syntax
-
-Use the following syntax:
-
-`streamstats [bucket_nullable=bool] [current=<boolean>] [window=<int>] [global=<boolean>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <function>... [by-clause]`
-* `function`: mandatory. An aggregation function or window function.
-* `bucket_nullable`: optional. Controls whether the streamstats command considers null buckets a valid group in group-by aggregations. When set to `false`, it does not treat null group-by values as a distinct group during aggregation. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
-  * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true`
-  * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false`
-* `current`: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=<boolean>. **Default:** true.
-* `window`: optional. Specifies the number of events to use when computing the statistics. Syntax: window=<int>. **Default:** 0, which means that all previous and current events are used.
-* `global`: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=<boolean>. **Default:** true.
-* `reset_before`: optional. Before streamstats calculates for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("<eval-expression>")". **Default:** false.
-* `reset_after`: optional. After streamstats calculations for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. Syntax: reset_after="("<eval-expression>")". **Default:** false.
-* `by-clause`: optional. The by clause can contain fields and expressions, such as scalar functions and aggregation functions. You can also use the span clause to split a specific field into buckets of equal intervals; the aggregation is then performed per span bucket. Syntax: by [span-expression,] [field,]... **Default:** If no <by-clause> is specified, all events are processed as a single group and running statistics are computed across the entire event stream.
-* `span-expression`: optional, at most one. Splits a field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets.
-  * Available time units
-    * millisecond (ms)
-    * second (s)
-    * minute (m, case sensitive)
-    * hour (h)
-    * day (d)
-    * week (w)
-    * month (M, case sensitive)
-    * quarter (q)
-    * year (y)
-
-
-## Aggregation functions
-
-The streamstats command supports the following aggregation functions:
-* `COUNT`: Count of values
-* `SUM`: Sum of numeric values
-* `AVG`: Average of numeric values
-* `MAX`: Maximum value
-* `MIN`: Minimum value
-* `VAR_SAMP`: Sample variance
-* `VAR_POP`: Population variance
-* `STDDEV_SAMP`: Sample standard deviation
-* `STDDEV_POP`: Population standard deviation
-* `DISTINCT_COUNT`/`DC`: Distinct count of values
-* `EARLIEST`: Earliest value by timestamp
-* `LATEST`: Latest value by timestamp
-
-For detailed documentation of each function, see [Aggregation Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/functions/aggregations/).
-
-## Usage
-
-The following queries show common `streamstats` usage:
-
-```sql
-source = table | streamstats avg(a)
-source = table | streamstats current = false avg(a)
-source = table | streamstats window = 5 sum(b)
-source = table | streamstats current = false window = 2 max(a)
-source = table | where a < 50 | streamstats count(c)
-source = table | streamstats min(c), max(c) by b
-source = table | streamstats count(c) as count_by by b | where count_by > 1000
-source = table | streamstats dc(field) as distinct_count
-source = table | streamstats distinct_count(category) by region
-source = table | streamstats current=false window=2 global=false avg(a) by b
-source = table | streamstats window=2 reset_before=a>31 avg(b)
-source = table | streamstats current=false reset_after=a>31 avg(b) by c
-```
-{% include copy.html %}
-
-
-## Example 1: Calculate the running average, sum, and count of a field by group
-
-This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender.
-
-```sql
-source=accounts
-| streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 |
-| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 |
-| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 |
-| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 |
-
-
-## Example 2: Running maximum age over a 2-row window
-
-This example calculates the running maximum age over a 2-row window, excluding the current event.
-
-```sql
-source=state_country
-| streamstats current=false window=2 max(age) as prev_max_age
-```
-{% include copy.html %}
-
-Expected output:
-
-| name | country | state | month | year | age | prev_max_age |
-| --- | --- | --- | --- | --- | --- | --- |
-| Jake | USA | California | 4 | 2023 | 70 | null |
-| Hello | USA | New York | 4 | 2023 | 30 | 70 |
-| John | Canada | Ontario | 4 | 2023 | 25 | 70 |
-| Jane | Canada | Quebec | 4 | 2023 | 20 | 30 |
-| Jim | Canada | B.C | 4 | 2023 | 27 | 25 |
-| Peter | Canada | B.C | 4 | 2023 | 57 | 27 |
-| Rick | Canada | B.C | 4 | 2023 | 70 | 57 |
-| David | USA | Washington | 4 | 2023 | 40 | 70 |
-
-
-## Example 3: Use the global argument to calculate running statistics
-
-The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields:
-* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups.
-* global=false: the window itself is created per group, meaning each group gets its own independent window.
-
-The following example PPL query shows how to use `streamstats` to calculate the running average of age across accounts by country, using the `global` argument.
-Original data:
-
-| name | country | state | month | year | age |
-| --- | --- | --- | --- | --- | --- |
-| Jake | USA | California | 4 | 2023 | 70 |
-| Hello | USA | New York | 4 | 2023 | 30 |
-| John | Canada | Ontario | 4 | 2023 | 25 |
-| Jane | Canada | Quebec | 4 | 2023 | 20 |
-| Jim | Canada | B.C | 4 | 2023 | 27 |
-| Peter | Canada | B.C | 4 | 2023 | 57 |
-| Rick | Canada | B.C | 4 | 2023 | 70 |
-| David | USA | Washington | 4 | 2023 | 40 |
-* global=true: The window slides across all rows globally (following their input order), but inside each window, aggregation is still computed by country. The data stream is processed row by row to build a sliding window of size 2, so David and Rick fall into the same window.
-* global=false: Each by group (country) forms its own independent stream and window (size 2), so David and Hello share one window for USA. In this case, David's running_avg is 35, rather than 40 as when global is set to true.
-
-```sql
-source=state_country
-| streamstats window=2 global=true avg(age) as running_avg by country
-```
-{% include copy.html %}
-
-Expected output:
-
-| name | country | state | month | year | age | running_avg |
-| --- | --- | --- | --- | --- | --- | --- |
-| Jake | USA | California | 4 | 2023 | 70 | 70.0 |
-| Hello | USA | New York | 4 | 2023 | 30 | 50.0 |
-| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 |
-| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 |
-| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 |
-| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 |
-| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 |
-| David | USA | Washington | 4 | 2023 | 40 | 40.0 |
-
-```sql
-source=state_country
-| streamstats window=2 global=false avg(age) as running_avg by country
-```
-{% include copy.html %}
-
-Expected output:
-
-| name | country | state | month | year | age | running_avg |
-| --- | --- | --- | --- | --- | --- | --- |
-| Jake | USA | California | 4 | 2023 | 70 | 70.0 |
-| Hello | USA | New York | 4 | 2023 | 30 | 50.0 |
-| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 |
-| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 |
-| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 |
-| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 |
-| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 |
-| David | USA | Washington | 4 | 2023 | 40 | 35.0 |
-
-
-## Example 4: Use the reset_before and reset_after arguments to reset statistics
-
-This example calculates the running average of age across accounts by country, with resets applied.
-
-```sql
-source=state_country
-| streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country
-```
-{% include copy.html %}
-
-Expected output:
-
-| name | country | state | month | year | age | avg_age |
-| --- | --- | --- | --- | --- | --- | --- |
-| Jake | USA | California | 4 | 2023 | 70 | null |
-| Hello | USA | New York | 4 | 2023 | 30 | 70.0 |
-| John | Canada | Ontario | 4 | 2023 | 25 | null |
-| Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 |
-| Jim | Canada | B.C | 4 | 2023 | 27 | null |
-| Peter | Canada | B.C | 4 | 2023 | 57 | null |
-| Rick | Canada | B.C | 4 | 2023 | 70 | null |
-| David | USA | Washington | 4 | 2023 | 40 | null |
-
-
-## Example 5: Null buckets handling
-
-The following queries compare `bucket_nullable=false` and `bucket_nullable=true` when the group-by field (`employer`) contains null values.
-
-```sql
-source=accounts
-| streamstats bucket_nullable=false count() as cnt by employer
-| fields account_number, firstname, employer, cnt
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | employer | cnt |
-| --- | --- | --- | --- |
-| 1 | Amber | Pyrami | 1 |
-| 6 | Hattie | Netagy | 1 |
-| 13 | Nanette | Quility | 1 |
-| 18 | Dale | null | null |
-
-```sql
-source=accounts
-| streamstats bucket_nullable=true count() as cnt by employer
-| fields account_number, firstname, employer, cnt
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | firstname | employer | cnt |
-| --- | --- | --- | --- |
-| 1 | Amber | Pyrami | 1 |
-| 6 | Hattie | Netagy | 1 |
-| 13 | Nanette | Quility | 1 |
-| 18 | Dale | null | 1 |
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/syntax.md b/_sql-and-ppl/ppl/cmd/syntax.md
deleted file mode 100644
index bf94ecfe793..00000000000
--- a/_sql-and-ppl/ppl/cmd/syntax.md
+++ /dev/null
@@ -1,83 +0,0 @@
----
-layout: default
-title: "syntax"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 41
----
-# PPL syntax
-
-Every PPL query starts with the `search` command. It specifies the index to search and retrieve documents from.
-
-`PPL` supports exactly one `search` command per PPL query, and it is always the first command. The word `search` can be omitted.
-
-Subsequent commands can follow in any order.
-
-
-## Syntax
-
-```sql
-search source=<index> [boolean-expression]
-source=<index> [boolean-expression]
-```
-{% include copy.html %}
-
-
-Field | Description | Required
-:--- | :--- |:---
-`index` | Specifies the index to query. | No
-`boolean-expression` | Specifies an expression that evaluates to a Boolean value. | No
-
-
-### Required arguments
-
-Required arguments are shown in angle brackets `< >`.
-
-### Optional arguments
-
-Optional arguments are enclosed in square brackets `[ ]`.
-
-
-## Examples
-
-**Example 1: Search through accounts index**
-
-In the following example, the `search` command refers to an `accounts` index as the source and uses `fields` and `where` commands for the conditions:
-
-```sql
-search source=accounts
-| where age > 18
-| fields firstname, lastname
-```
-{% include copy.html %}
-
-**Example 2: Get all documents**
-
-To get all documents from the `accounts` index, specify it as the `source`:
-
-```sql
-search source=accounts;
-```
-{% include copy.html %}
-
-
-| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
-:--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---
-| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke
-| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond
-| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates
-| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams
-
-**Example 3: Get documents that match a condition**
-
-To get all documents from the `accounts` index that either have `account_number` equal to 1 or have `gender` as `F`, use the following query:
-
-```sql
-search source=accounts account_number=1 or gender="F";
-```
-{% include copy.html %}
-
-| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
-:--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---
-| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |
-| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates |
diff --git a/_sql-and-ppl/ppl/cmd/table.md b/_sql-and-ppl/ppl/cmd/table.md
deleted file mode 100644
index 9ddaae59f18..00000000000
--- a/_sql-and-ppl/ppl/cmd/table.md
+++ /dev/null
@@ -1,44 +0,0 @@
----
-layout: default
-title: "table"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 42
----
-# table
-
-
-The `table` command is an alias for the [`fields`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/fields/) command and provides the same field selection capabilities. It allows you to keep or remove fields from the search results using enhanced syntax options.
-
-## Syntax
-
-Use the following syntax:
-
-`table [+|-] <field-list>`
-* `[+|-]`: optional. If the plus (+) is used, only the fields specified in the field list will be kept. If the minus (-) is used, all the fields specified in the field list will be removed. **Default:** +.
-* `field-list`: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns.
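-
-Because the field list accepts wildcard patterns, you can select groups of related fields at once. The following sketch is illustrative (it assumes the `accounts` index used in the example below) and keeps every field whose name starts with `account`:
-
-```sql
-source=accounts
-| table account*
-```
-{% include copy.html %}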
-
-
-## Example 1: Basic table command usage
-
-The following example PPL query shows basic field selection using the table command.
-
-```sql
-source=accounts
-| table firstname lastname age
-```
-{% include copy.html %}
-
-Expected output:
-
-| firstname | lastname | age |
-| --- | --- | --- |
-| Amber | Duke | 32 |
-| Hattie | Bond | 36 |
-| Nanette | Bates | 28 |
-| Dale | Adams | 33 |
-
-
-## See also
-
-- [fields]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/fields/) - Alias command with identical functionality
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/timechart.md b/_sql-and-ppl/ppl/cmd/timechart.md
deleted file mode 100644
index fe77b78ff38..00000000000
--- a/_sql-and-ppl/ppl/cmd/timechart.md
+++ /dev/null
@@ -1,353 +0,0 @@
----
-layout: default
-title: "timechart"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 43
----
-# timechart
-
-
-The `timechart` command creates a time-based aggregation of data. It groups data by time intervals and optionally by a field, then applies an aggregation function to each group. The results are returned in an unpivoted format with separate rows for each time-field combination.
-
-## Syntax
-
-Use the following syntax:
-
-`timechart [timefield=<field>] [span=<interval>] [limit=<int>] [useother=<boolean>] [usenull=<boolean>] [nullstr=<string>] <aggregation_function> [by <field>]`
-* `timefield`: optional. Specifies the timestamp field to use for time interval grouping. **Default**: `@timestamp`.
-* `span`: optional. Specifies the time interval for grouping data. **Default:** 1m (1 minute).
-  * Available time units:
-    * millisecond (ms)
-    * second (s)
-    * minute (m, case sensitive)
-    * hour (h)
-    * day (d)
-    * week (w)
-    * month (M, case sensitive)
-    * quarter (q)
-    * year (y)
-* `limit`: optional. Specifies the maximum number of distinct values to display when using the "by" clause. **Default:** 10.
-  * When there are more distinct values than the limit, the additional values are grouped into an "OTHER" category if useother is not set to false.
-  * The top values are determined by calculating the sum of the aggregation values across all time intervals for each distinct field value. The top N values with the highest sums are displayed individually, while the rest are grouped into the "OTHER" category.
-  * Set to 0 to show all distinct values without any limit (when limit=0, useother is automatically set to false).
-  * The parameters can be specified in any order before the aggregation function.
-  * Only applies when using the "by" clause to group results.
-* `useother`: optional. Controls whether to create an "OTHER" category for values beyond the limit. **Default:** true.
-  * When set to false, only the top N values (based on limit) are shown without an "OTHER" column.
-  * When set to true, values beyond the limit are grouped into an "OTHER" category.
-  * Only applies when using the "by" clause and when there are more distinct values than the limit.
-* `usenull`: optional. Controls whether NULL values are placed into a separate category in the chart. **Default:** true.
-  * When set to true, NULL values are grouped into a separate category with the label specified by nullstr.
-  * When set to false, NULL values are excluded from the results.
-* `nullstr`: optional. The display label used for NULL values when usenull is true. **Default:** "NULL".
-  * Specifies the string representation for the NULL category in the chart output.
-* `aggregation_function`: mandatory. The aggregation function to apply to each time bucket.
-  * Currently, only a single aggregation function is supported.
-  * Available functions: All aggregation functions supported by the [stats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/stats/) command, as well as the timechart-specific aggregations listed in the following section.
-* `by`: optional. Groups the results by the specified field in addition to time intervals. If not specified, the aggregation is performed across all documents in each time interval.
-
-
-## PER_SECOND
-
-Usage: `per_second(field)` calculates the per-second rate for a numeric field within each time bucket.
-The calculation formula is: `per_second(field) = sum(field) / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
-Return type: DOUBLE
-
-## PER_MINUTE
-
-Usage: `per_minute(field)` calculates the per-minute rate for a numeric field within each time bucket.
-The calculation formula is: `per_minute(field) = sum(field) * 60 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
-Return type: DOUBLE
-
-## PER_HOUR
-
-Usage: `per_hour(field)` calculates the per-hour rate for a numeric field within each time bucket.
-The calculation formula is: `per_hour(field) = sum(field) * 3600 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
-Return type: DOUBLE
-
-## PER_DAY
-
-Usage: `per_day(field)` calculates the per-day rate for a numeric field within each time bucket.
-The calculation formula is: `per_day(field) = sum(field) * 86400 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
-Return type: DOUBLE
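-
-As a worked check of these formulas (an illustrative sketch; the numbers assume the same `events` data used in Example 11 later on this page): with `span=30m`, each bucket covers 1,800 seconds, so a bucket whose `packets` values sum to 180 yields `per_second(packets) = 180 / 1800 = 0.1`, and the equivalent per-minute rate is `per_minute(packets) = 180 * 60 / 1800 = 6.0`:
-
-```sql
-source=events
-| timechart span=30m per_minute(packets) by host
-```
-{% include copy.html %}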
-
-## Notes
-
-* The `timechart` command requires a timestamp field in the data. By default, it uses the `@timestamp` field, but you can specify a different field using the `timefield` parameter.
-* Results are returned in an unpivoted format with separate rows for each time-field combination that has data.
-* Only combinations with actual data are included in the results; empty combinations are omitted rather than shown as null or zero values.
-* The "top N" values for the `limit` parameter are selected based on the sum of values across all time intervals for each distinct field value.
-* When using the `limit` parameter, values beyond the limit are grouped into an "OTHER" category (unless `useother=false`).
-* Examples 6 and 7 use different datasets: Example 6 uses the `events` dataset with fewer hosts for simplicity, while Example 7 uses the `events_many_hosts` dataset with 11 distinct hosts.
-* **Null values**: Documents with null values in the "by" field are treated as a separate category and appear as null in the results.
-
-
-## Example 1: Count events by hour
-
-This example counts events for each hour and groups them by host.
-
-```sql
-source=events
-| timechart span=1h count() by host
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | host | count() |
-| --- | --- | --- |
-| 2023-01-01 10:00:00 | server1 | 4 |
-| 2023-01-01 10:00:00 | server2 | 4 |
-
-
-## Example 2: Count events by minute
-
-This example counts events for each minute and groups them by host.
-
-```sql
-source=events
-| timechart span=1m count() by host
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | host | count() |
-| --- | --- | --- |
-| 2023-01-01 10:00:00 | server1 | 1 |
-| 2023-01-01 10:05:00 | server2 | 1 |
-| 2023-01-01 10:10:00 | server1 | 1 |
-| 2023-01-01 10:15:00 | server2 | 1 |
-| 2023-01-01 10:20:00 | server1 | 1 |
-| 2023-01-01 10:25:00 | server2 | 1 |
-| 2023-01-01 10:30:00 | server1 | 1 |
-| 2023-01-01 10:35:00 | server2 | 1 |
-
-
-## Example 3: Calculate the average number of packets by minute
-
-This example calculates the average number of packets for each minute without grouping by any field.
-
-```sql
-source=events
-| timechart span=1m avg(packets)
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | avg(packets) |
-| --- | --- |
-| 2023-01-01 10:00:00 | 60.0 |
-| 2023-01-01 10:05:00 | 30.0 |
-| 2023-01-01 10:10:00 | 60.0 |
-| 2023-01-01 10:15:00 | 30.0 |
-| 2023-01-01 10:20:00 | 60.0 |
-| 2023-01-01 10:25:00 | 30.0 |
-| 2023-01-01 10:30:00 | 180.0 |
-| 2023-01-01 10:35:00 | 90.0 |
-
-
-## Example 4: Calculate the average number of packets every 20 minutes by status
-
-This example calculates the average number of packets for every 20 minutes and groups them by status.
-
-```sql
-source=events
-| timechart span=20m avg(packets) by status
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | status | avg(packets) |
-| --- | --- | --- |
-| 2023-01-01 10:00:00 | active | 30.0 |
-| 2023-01-01 10:00:00 | inactive | 30.0 |
-| 2023-01-01 10:00:00 | pending | 60.0 |
-| 2023-01-01 10:00:00 | processing | 60.0 |
-| 2023-01-01 10:20:00 | cancelled | 180.0 |
-| 2023-01-01 10:20:00 | completed | 60.0 |
-| 2023-01-01 10:20:00 | inactive | 90.0 |
-| 2023-01-01 10:20:00 | pending | 30.0 |
-
-
-## Example 5: Count events by hour and category
-
-This example counts events for each hour and groups them by category.
-
-```sql
-source=events
-| timechart span=1h count() by category
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | category | count() |
-| --- | --- | --- |
-| 2023-01-01 10:00:00 | orders | 4 |
-| 2023-01-01 10:00:00 | users | 4 |
-
-
-## Example 6: Using the limit parameter with count() function
-
-When there are many distinct values in the "by" field, the timechart command displays the top values based on the limit parameter and groups the rest into an "OTHER" category.
-This query displays the top 2 hosts with the highest count values and groups the remaining hosts into an "OTHER" category.
- -```sql -source=events -| timechart span=1m limit=2 count() by host -``` -{% include copy.html %} - -Expected output: - -| @timestamp | host | count() | -| --- | --- | --- | -| 2023-01-01 10:00:00 | server1 | 1 | -| 2023-01-01 10:05:00 | server2 | 1 | -| 2023-01-01 10:10:00 | server1 | 1 | -| 2023-01-01 10:15:00 | server2 | 1 | -| 2023-01-01 10:20:00 | server1 | 1 | -| 2023-01-01 10:25:00 | server2 | 1 | -| 2023-01-01 10:30:00 | server1 | 1 | -| 2023-01-01 10:35:00 | server2 | 1 | - - -## Example 7: Using limit=0 with count() to show all values - -To display all distinct values without any limit, set limit=0: - -```sql -source=events_many_hosts -| timechart span=1h limit=0 count() by host -``` -{% include copy.html %} - -Expected output: - -| @timestamp | host | count() | -| --- | --- | --- | -| 2024-07-01 00:00:00 | web-01 | 1 | -| 2024-07-01 00:00:00 | web-02 | 1 | -| 2024-07-01 00:00:00 | web-03 | 1 | -| 2024-07-01 00:00:00 | web-04 | 1 | -| 2024-07-01 00:00:00 | web-05 | 1 | -| 2024-07-01 00:00:00 | web-06 | 1 | -| 2024-07-01 00:00:00 | web-07 | 1 | -| 2024-07-01 00:00:00 | web-08 | 1 | -| 2024-07-01 00:00:00 | web-09 | 1 | -| 2024-07-01 00:00:00 | web-10 | 1 | -| 2024-07-01 00:00:00 | web-11 | 1 | - -This shows all 11 hosts as separate rows without an "OTHER" category. - -## Example 8: Using useother=false with count() function - -Limit to top 10 hosts without OTHER category (useother=false): - -```sql -source=events_many_hosts -| timechart span=1h useother=false count() by host -``` -{% include copy.html %} - -Expected output: - -| @timestamp | host | count() | -| --- | --- | --- | -| 2024-07-01 00:00:00 | web-01 | 1 | -| 2024-07-01 00:00:00 | web-02 | 1 | -| 2024-07-01 00:00:00 | web-03 | 1 | -| 2024-07-01 00:00:00 | web-04 | 1 | -| 2024-07-01 00:00:00 | web-05 | 1 | -| 2024-07-01 00:00:00 | web-06 | 1 | -| 2024-07-01 00:00:00 | web-07 | 1 | -| 2024-07-01 00:00:00 | web-08 | 1 | -| 2024-07-01 00:00:00 | web-09 | 1 | -| 2024-07-01 00:00:00 | web-10 | 1 | - - -## Example 9: Using limit with useother parameter and avg() function - -Limit to top 3 hosts with OTHER category (default useother=true): - -```sql -source=events_many_hosts -| timechart span=1h limit=3 avg(cpu_usage) by host -``` -{% include copy.html %} - -Expected output: - -| @timestamp | host | avg(cpu_usage) | -| --- | --- | --- | -| 2024-07-01 00:00:00 | OTHER | 41.3 | -| 2024-07-01 00:00:00 | web-03 | 55.3 | -| 2024-07-01 00:00:00 | web-07 | 48.6 | -| 2024-07-01 00:00:00 | web-09 | 67.8 | - -Limit to top 3 hosts without OTHER category (useother=false): - -```sql -source=events_many_hosts -| timechart span=1h limit=3 useother=false avg(cpu_usage) by host -``` -{% include copy.html %} - -Expected output: - -| @timestamp | host | avg(cpu_usage) | -| --- | --- | --- | -| 2024-07-01 00:00:00 | web-03 | 55.3 | -| 2024-07-01 00:00:00 | web-07 | 48.6 | -| 2024-07-01 00:00:00 | web-09 | 67.8 | - - -## Example 10: Handling null values in the "by" field - -The following example PPL query shows how null values in the "by" field are treated as a separate category. The dataset events_null has 1 entry that does not have a host field. -It is put into a separate "NULL" category because the defaults for `usenull` and `nullstr` are `true` and `"NULL"` respectively. 
-
-```sql
-source=events_null
-| timechart span=1h count() by host
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | host | count() |
-| --- | --- | --- |
-| 2024-07-01 00:00:00 | NULL | 1 |
-| 2024-07-01 00:00:00 | db-01 | 1 |
-| 2024-07-01 00:00:00 | web-01 | 2 |
-| 2024-07-01 00:00:00 | web-02 | 2 |
-
-
-## Example 11: Calculate the packets-per-second rate
-
-This example calculates the per-second packet rate for network traffic data using the `per_second()` function.
-
-```sql
-source=events
-| timechart span=30m per_second(packets) by host
-```
-{% include copy.html %}
-
-Expected output:
-
-| @timestamp | host | per_second(packets) |
-| --- | --- | --- |
-| 2023-01-01 10:00:00 | server1 | 0.1 |
-| 2023-01-01 10:00:00 | server2 | 0.05 |
-| 2023-01-01 10:30:00 | server1 | 0.1 |
-| 2023-01-01 10:30:00 | server2 | 0.05 |
-
-
-## Limitations
-
-* Only a single aggregation function is supported per timechart command.
-* The `bins` parameter and other bin options are not supported since the `bin` command is not implemented yet. Use the `span` parameter to control time intervals.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/top.md b/_sql-and-ppl/ppl/cmd/top.md
deleted file mode 100644
index 5634a3baffe..00000000000
--- a/_sql-and-ppl/ppl/cmd/top.md
+++ /dev/null
@@ -1,152 +0,0 @@
----
-layout: default
-title: "top"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 44
----
-# top
-
-
-The `top` command finds the most common tuple of values of all fields in the field list.
-
-## Syntax
-
-Use the following syntax:
-
-`top [N] [top-options] <field-list> [by-clause]`
-* `N`: optional. The number of results to return. **Default:** 10.
-* `top-options`: optional. Options for the top command. Supported syntax is [countfield=<string>] [showcount=<bool>] [usenull=<bool>].
-  * showcount=<bool>: optional. Whether to create a field in the output that represents the count of the tuple of values. **Default:** true.
-  * countfield=<string>: optional. The name of the field that contains the count. **Default:** 'count'.
-  * usenull=<bool>: optional (since 3.4.0). Whether to output the null value. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
-    * When `plugins.ppl.syntax.legacy.preferred=true`, `usenull` defaults to `true`
-    * When `plugins.ppl.syntax.legacy.preferred=false`, `usenull` defaults to `false`
-* `field-list`: mandatory. A comma-delimited list of field names.
-* `by-clause`: optional. One or more fields to group the results by.
-
-
-## Example 1: Find the most common values in a field
-
-This example finds the most common gender of all the accounts.
-
-```sql
-source=accounts
-| top showcount=false gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender |
-| --- |
-| M |
-| F |
-
-
-## Example 2: Limit results to top N values
-
-This example finds the most common gender and limits results to 1 value.
-
-```sql
-source=accounts
-| top 1 showcount=false gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender |
-| --- |
-| M |
-
-
-## Example 3: Find the most common values grouped by field
-
-This example finds the most common age of all the accounts grouped by gender.
-
-```sql
-source=accounts
-| top 1 showcount=false age by gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | age |
-| --- | --- |
-| F | 28 |
-| M | 32 |
-
-
-## Example 4: Top command with count field
-
-This example finds the most common gender of all the accounts and includes the count.
-
-```sql
-source=accounts
-| top gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | count |
-| --- | --- |
-| M | 3 |
-| F | 1 |
-
-
-## Example 5: Specify the count field option
-
-This example specifies a custom name for the count field.
-
-```sql
-source=accounts
-| top countfield='cnt' gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| gender | cnt |
-| --- | --- |
-| M | 3 |
-| F | 1 |
-
-
-## Example 6: Specify the usenull field option
-
-The following queries show how the `usenull` option controls whether null values appear in the results.
-
-```sql
-source=accounts
-| top usenull=false email
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | count |
-| --- | --- |
-| amberduke@pyrami.com | 1 |
-| daleadams@boink.com | 1 |
-| hattiebond@netagy.com | 1 |
-
-```sql
-source=accounts
-| top usenull=true email
-```
-{% include copy.html %}
-
-Expected output:
-
-| email | count |
-| --- | --- |
-| null | 1 |
-| amberduke@pyrami.com | 1 |
-| daleadams@boink.com | 1 |
-| hattiebond@netagy.com | 1 |
-
-
-## Limitations
-
-The `top` command is not rewritten to [query domain-specific language (DSL)](https://opensearch.org/docs/latest/query-dsl/index/). It is only run on the coordinating node.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/trendline.md b/_sql-and-ppl/ppl/cmd/trendline.md
deleted file mode 100644
index 5cea6544561..00000000000
--- a/_sql-and-ppl/ppl/cmd/trendline.md
+++ /dev/null
@@ -1,112 +0,0 @@
----
-layout: default
-title: "trendline"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 45
----
-# trendline
-
-
-The `trendline` command calculates moving averages of fields.
-
-## Syntax
-
-Use the following syntax:
-
-`trendline [sort <[+|-] sort-field>] (sma|wma)(<number-of-datapoints>, <field>) [as <alias>] [(sma|wma)(<number-of-datapoints>, <field>) [as <alias>]]...`
-* `[+|-]`: optional. The plus [+] stands for ascending order and NULL/MISSING first and a minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first.
-* `sort-field`: mandatory when sorting is used. The field used to sort.
-* `sma|wma`: mandatory. Simple Moving Average (sma) applies equal weighting to all values; Weighted Moving Average (wma) applies greater weight to more recent values.
-* `number-of-datapoints`: mandatory. The number of datapoints to calculate the moving average (must be greater than zero).
-* `field`: mandatory. The name of the field the moving average should be calculated for.
-* `alias`: optional. The name of the resulting column containing the moving average. **Default:** the field name suffixed with "_trendline".
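-
-The two averages weight datapoints differently. For two datapoints v1 and v2, where v2 is the more recent, sma(2) computes (v1 + v2) / 2, while wma(2) computes (1*v1 + 2*v2) / 3. For example, for account numbers 1 and 6, sma(2) yields (1 + 6) / 2 = 3.5 and wma(2) yields (1 + 12) / 3 ≈ 4.33, matching the outputs in the examples that follow. The following sketch computes both side by side:
-
-```sql
-source=accounts
-| trendline sma(2, account_number) as sma2 wma(2, account_number) as wma2
-| fields account_number, sma2, wma2
-```
-{% include copy.html %}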
-
-
-## Example 1: Calculate the simple moving average on one field
-
-The following example PPL query shows how to use `trendline` to calculate the simple moving average on one field.
-
-```sql
-source=accounts
-| trendline sma(2, account_number) as an
-| fields an
-```
-{% include copy.html %}
-
-Expected output:
-
-| an |
-| --- |
-| null |
-| 3.5 |
-| 9.5 |
-| 15.5 |
-
-
-## Example 2: Calculate the simple moving average on multiple fields
-
-The following example PPL query shows how to use `trendline` to calculate the simple moving average on multiple fields.
-
-```sql
-source=accounts
-| trendline sma(2, account_number) as an sma(2, age) as age_trend
-| fields an, age_trend
-```
-{% include copy.html %}
-
-Expected output:
-
-| an | age_trend |
-| --- | --- |
-| null | null |
-| 3.5 | 34.0 |
-| 9.5 | 32.0 |
-| 15.5 | 30.5 |
-
-
-## Example 3: Calculate the simple moving average on one field without specifying an alias
-
-The following example PPL query shows how to use `trendline` to calculate the simple moving average on one field without specifying an alias. The resulting column name defaults to the field name suffixed with "_trendline".
-
-```sql
-source=accounts
-| trendline sma(2, account_number)
-| fields account_number_trendline
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number_trendline |
-| --- |
-| null |
-| 3.5 |
-| 9.5 |
-| 15.5 |
-
-
-## Example 4: Calculate the weighted moving average on one field
-
-The following example PPL query shows how to use `trendline` to calculate the weighted moving average on one field.
-
-```sql
-source=accounts
-| trendline wma(2, account_number)
-| fields account_number_trendline
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number_trendline |
-| --- |
-| null |
-| 4.333333333333333 |
-| 10.666666666666666 |
-| 16.333333333333332 |
-
-
-## Limitations
-
-The `trendline` command requires all values in the specified `field` to be non-null. Rows with null values in the calculation field are automatically excluded from the command's output.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/where.md b/_sql-and-ppl/ppl/cmd/where.md
deleted file mode 100644
index 70b9e098d6d..00000000000
--- a/_sql-and-ppl/ppl/cmd/where.md
+++ /dev/null
@@ -1,188 +0,0 @@
----
-layout: default
-title: "where"
-parent: "Commands"
-grand_parent: "PPL"
-nav_order: 46
----
-# where
-
-
-The `where` command filters the search results, returning results only when the bool-expression evaluates to true.
-
-## Syntax
-
-Use the following syntax:
-
-`where <bool-expression>`
-* `bool-expression`: optional. Any expression that evaluates to a Boolean value.
-
-
-## Example 1: Filter search results with condition
-
-The following example PPL query shows how to use `where` to fetch all the documents from the accounts index where account_number is 1 or gender is "F".
-
-```sql
-source=accounts
-| where account_number=1 or gender="F"
-| fields account_number, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | gender |
-| --- | --- |
-| 1 | M |
-| 13 | F |
-
-
-## Example 2: Basic field comparison
-
-The following example PPL query shows how to use `where` to filter accounts with a balance greater than 30000.
-
-```sql
-source=accounts
-| where balance > 30000
-| fields account_number, balance
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | balance |
-| --- | --- |
-| 1 | 39225 |
-| 13 | 32838 |
-
-
-## Example 3: Pattern matching with LIKE
-
-Pattern matching with underscore (_)
-The following example PPL query demonstrates using LIKE with an underscore (_) to match a single character.
-
-```sql
-source=accounts
-| where LIKE(state, 'M_')
-| fields account_number, state
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | state |
-| --- | --- |
-| 18 | MD |
-
-Pattern matching with percent (%)
-The following example PPL query demonstrates using LIKE with a percent sign (%) to match multiple characters.
-
-```sql
-source=accounts
-| where LIKE(state, 'V%')
-| fields account_number, state
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | state |
-| --- | --- |
-| 13 | VA |
-
-
-## Example 4: Multiple conditions
-
-The following example PPL query shows how to combine multiple conditions using the AND operator.
-
-```sql
-source=accounts
-| where age > 30 AND gender = 'M'
-| fields account_number, age, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | age | gender |
-| --- | --- | --- |
-| 1 | 32 | M |
-| 6 | 36 | M |
-| 18 | 33 | M |
-
-
-## Example 5: Using the IN operator
-
-The following example PPL query demonstrates using the IN operator to match multiple values.
-
-```sql
-source=accounts
-| where state IN ('IL', 'VA')
-| fields account_number, state
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | state |
-| --- | --- |
-| 1 | IL |
-| 13 | VA |
-
-
-## Example 6: NULL checks
-
-The following example PPL query shows how to filter records with NULL values.
-
-```sql
-source=accounts
-| where ISNULL(employer)
-| fields account_number, employer
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | employer |
-| --- | --- |
-| 18 | null |
-
-
-## Example 7: Complex conditions
-
-The following example PPL query demonstrates combining multiple conditions with parentheses and logical operators.
-
-```sql
-source=accounts
-| where (balance > 40000 OR age > 35) AND gender = 'M'
-| fields account_number, balance, age, gender
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | balance | age | gender |
-| --- | --- | --- | --- |
-| 6 | 5686 | 36 | M |
-
-
-## Example 8: NOT conditions
-
-The following example PPL query shows how to use the NOT operator to exclude matching records.
-
-```sql
-source=accounts
-| where NOT state = 'CA'
-| fields account_number, state
-```
-{% include copy.html %}
-
-Expected output:
-
-| account_number | state |
-| --- | --- |
-| 1 | IL |
-| 6 | TN |
-| 13 | VA |
-| 18 | MD |
-
diff --git a/_sql-and-ppl/ppl/commands/ad.md b/_sql-and-ppl/ppl/commands/ad.md
new file mode 100644
index 00000000000..95452cdd6f5
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/ad.md
@@ -0,0 +1,157 @@
---
layout: default
title: ad
parent: Commands
grand_parent: PPL
nav_order: 1
---

# ad (Deprecated)

The `ad` command is deprecated in favor of the [`ml` command]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/ml/).
{: .warning}

The `ad` command applies the Random Cut Forest (RCF) algorithm from the ML Commons plugin to the search results returned by a PPL command. The command provides two anomaly detection approaches:

- [Anomaly detection for time-series data](#anomaly-detection-for-time-series-data) using the fixed in time RCF algorithm.
- [Anomaly detection for non-time-series data](#anomaly-detection-for-non-time-series-data) using the batch RCF algorithm.

To use the `ad` command, `plugins.calcite.enabled` must be set to `false`.
{: .note}

## Syntax

The `ad` command has two different syntax variants depending on the algorithm type.

### Anomaly detection for time-series data

Use this syntax to detect anomalies in time-series data. This method uses the fixed in time RCF algorithm, which is optimized for sequential data patterns.

The fixed in time RCF `ad` command has the following syntax:

```sql
ad time_field=<field> [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [anomaly_rate] [date_format] [time_zone] [category_field]
```

### Parameters

The fixed in time RCF algorithm supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `time_field` | Required | The time field for RCF to use as time-series data. |
| `number_of_trees` | Optional | The number of trees in the forest. Default is `30`. |
| `shingle_size` | Optional | The number of records in a shingle. A shingle is a consecutive sequence of the most recent records. Default is `8`. |
| `sample_size` | Optional | The sample size used by the stream samplers in this forest. Default is `256`. |
| `output_after` | Optional | The number of points required by the stream samplers before results are returned. Default is `32`. |
| `time_decay` | Optional | The decay factor used by the stream samplers in this forest. Default is `0.0001`. |
| `anomaly_rate` | Optional | The anomaly rate. Default is `0.005`. |
| `date_format` | Optional | The format used for the `time_field` field. Default is `yyyy-MM-dd HH:mm:ss`. |
| `time_zone` | Optional | The time zone for the `time_field` field. Default is `UTC`. |
| `category_field` | Optional | The category field used to group input values. The predict operation is applied to each category independently. |


### Anomaly detection for non-time-series data

Use this syntax to detect anomalies in data where the order doesn't matter. This method uses the batch RCF algorithm, which is optimized for independent data points.

The batch RCF `ad` command has the following syntax:

```sql
ad [number_of_trees] [sample_size] [output_after] [training_data_size] [anomaly_score_threshold] [category_field]
```

### Parameters

The batch RCF algorithm supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `number_of_trees` | Optional | The number of trees in the forest. Default is `30`. |
| `sample_size` | Optional | The number of random samples provided to each tree from the training dataset. Default is `256`. |
| `output_after` | Optional | The number of points required by the stream samplers before results are returned. Default is `32`. |
| `training_data_size` | Optional | The size of the training dataset. Default is the full dataset size. |
| `anomaly_score_threshold` | Optional | The anomaly score threshold. Default is `1.0`. |
| `category_field` | Optional | The category field used to group input values. The predict operation is applied to each category independently. |


## Example 1: Detecting events in New York City taxi ridership time-series data

The following examples use the `nyc_taxi` dataset, which contains New York City taxi ridership data with fields including `value` (number of rides), `timestamp` (time of measurement), and `category` (time period classifications such as 'day' and 'night').

This example trains an RCF model and uses it to detect anomalies in time-series ridership data:

```sql
source=nyc_taxi
| fields value, timestamp
| AD time_field='timestamp'
| where value=10844.0
```
{% include copy.html %}

The query returns the following results:

| value | timestamp | score | anomaly_grade |
| --- | --- | --- | --- |
| 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 |


## Example 2: Detecting events in New York City taxi ridership time-series data by category

This example trains an RCF model and uses it to detect anomalies in time-series ridership data across multiple category values:

```sql
source=nyc_taxi
| fields category, value, timestamp
| AD time_field='timestamp' category_field='category'
| where value=10844.0 or value=6526.0
```
{% include copy.html %}

The query returns the following results:

| category | value | timestamp | score | anomaly_grade |
| --- | --- | --- | --- | --- |
| night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 |
| day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 |


## Example 3: Detecting events in New York City taxi ridership non-time-series data

This example trains an RCF model and uses it to detect anomalies in non-time-series ridership data:

```sql
source=nyc_taxi
| fields value
| AD
| where value=10844.0
```
{% include copy.html %}

The query returns the following results:

| value | score | anomalous |
| --- | --- | --- |
| 10844.0 | 0.0 | False |


## Example 4: Detecting events in New York City taxi ridership non-time-series data by category

This example trains an RCF model and uses it to detect anomalies in non-time-series ridership data across multiple category values:

```sql
source=nyc_taxi
| fields category, value
| AD category_field='category'
| where value=10844.0 or value=6526.0
```
{% include copy.html %}

The query returns the following results:

| category | value | score | anomalous |
| --- | --- | --- | --- |
| night | 10844.0 | 0.0 | False |
| day | 6526.0 | 0.0 | False |


diff --git a/_sql-and-ppl/ppl/commands/addcoltotals.md b/_sql-and-ppl/ppl/commands/addcoltotals.md
new file mode 100644
index 00000000000..57fed4cb2b7
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/addcoltotals.md
@@ -0,0 +1,94 @@
---
layout: default
title: addcoltotals
parent: Commands
grand_parent: PPL
nav_order: 2
---

# addcoltotals

The `addcoltotals` command computes the sum of each column and adds a summary row showing the total for each column. This command is equivalent to using `addtotals` with `row=false` and `col=true`, making it useful for creating summary reports with column totals.

The command only processes numeric fields (integers, floats, doubles). Non-numeric fields are ignored regardless of whether they are explicitly specified in the field list.


## Syntax

The `addcoltotals` command has the following syntax:

```sql
addcoltotals [field-list] [label=<string>] [labelfield=<string>]
```

## Parameters

The `addcoltotals` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `field-list` | Optional | A comma-separated list of numeric fields to add. By default, all numeric fields are added. |
| `labelfield` | Optional | The field in which the label is placed. If the field does not exist, it is created, and the label is shown in the summary row (last row) of the new field. |
| `label` | Optional | The text that appears in the summary row (last row) to identify the computed totals.
+
+## Example 1: Basic example
+
+The following query places the label in an existing field:
+
+```sql
+source=accounts
+| fields firstname, balance
+| head 3
+| addcoltotals labelfield='firstname'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | balance |
+| --- | --- |
+| Amber | 39225 |
+| Hattie | 5686 |
+| Nanette | 32838 |
+| Total | 77749 |
+
+## Example 2: Adding column totals with a custom summary label
+
+The following query adds totals after a `stats` command, where the final summary event label is `Sum`. It also creates a new field specified by `labelfield` because this field does not exist in the data:
+
+```sql
+source=accounts
+| stats count() by gender
+| addcoltotals `count()` label='Sum' labelfield='Total'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| count() | gender | Total |
+| --- | --- | --- |
+| 1 | F | null |
+| 3 | M | null |
+| 4 | null | Sum |
+
+## Example 3: Using all options
+
+The following query uses the `addcoltotals` command with all options set:
+
+```sql
+source=accounts
+| where age > 30
+| stats avg(balance) as avg_balance, count() as count by state
+| head 3
+| addcoltotals avg_balance, count label='Sum' labelfield='Column Total'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| avg_balance | count | state | Column Total |
+| --- | --- | --- | --- |
+| 39225.0 | 1 | IL | null |
+| 4180.0 | 1 | MD | null |
+| 5686.0 | 1 | TN | null |
+| 49091.0 | 3 | null | Sum |
diff --git a/_sql-and-ppl/ppl/commands/addtotals.md b/_sql-and-ppl/ppl/commands/addtotals.md
new file mode 100644
index 00000000000..e841ca4d8c8
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/addtotals.md
@@ -0,0 +1,120 @@
+---
+layout: default
+title: addtotals
+parent: Commands
+grand_parent: PPL
+nav_order: 3
+---
+
+# addtotals
+
+The `addtotals` command computes the sum of numeric fields and can create both column totals (summary row) and row totals (new field). This command is useful for creating summary reports with subtotals or grand totals.
+
+The command only processes numeric fields (integers, floats, doubles). Non-numeric fields are ignored regardless of whether they are explicitly specified in the field list.
+
+
+## Syntax
+
+The `addtotals` command has the following syntax:
+
+```sql
+addtotals [<field-list>] [label=<string>] [labelfield=<field>] [row=<boolean>] [col=<boolean>] [fieldname=<field>]
+```
+
+## Parameters
+
+The `addtotals` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field-list>` | Optional | A comma-separated list of numeric fields to add. By default, all numeric fields are added. |
+| `row` | Optional | Calculates the total of each row and adds a new field to store the row total. Default is `true`. |
+| `col` | Optional | Calculates the total of each column and adds a summary event at the end with the column totals. Default is `false`. |
+| `labelfield` | Optional | The field in which the label is placed. If the field does not exist, it is created, and the label is shown in the summary row (last row) of the new field. Applicable when `col=true`. |
+| `label` | Optional | The text that appears in the summary row (last row) to identify the computed totals. When used with `labelfield`, this text is placed in the specified field in the summary row. Default is `Total`. Applicable when `col=true`. This parameter has no effect when the `labelfield` and `fieldname` parameters specify the same field name. |
+| `fieldname` | Optional | The field used to store row totals. Applicable when `row=true`. |
+
+## Example 1: Basic example
+
+The following query places the label in an existing field:
+
+```sql
+source=accounts
+| head 3
+| fields firstname, balance
+| addtotals col=true labelfield='firstname' label='Total'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | balance | Total |
+| --- | --- | --- |
+| Amber | 39225 | 39225 |
+| Hattie | 5686 | 5686 |
+| Nanette | 32838 | 32838 |
+| Total | 77749 | null |
+
+## Example 2: Adding column totals with a custom summary label
+
+The following query adds totals after a `stats` command, with the final summary event labeled `Sum`. It also creates a new field specified by `labelfield` because the field does not exist in the data:
+
+
+```sql
+source=accounts
+| fields account_number, firstname, balance, age
+| addtotals col=true row=false label='Sum' labelfield='Total'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | firstname | balance | age | Total |
+| --- | --- | --- | --- | --- |
+| 1 | Amber | 39225 | 32 | null |
+| 6 | Hattie | 5686 | 36 | null |
+| 13 | Nanette | 32838 | 28 | null |
+| 18 | Dale | 4180 | 33 | null |
+| 38 | null | 81929 | 129 | Sum |
+
+If you set `row=true` in the preceding example, both row totals and column totals try to use the same field name (`Total`), creating a conflict. When this happens, the summary row label displays as `null` instead of `Sum` because the field becomes numeric (for row totals) and cannot display string values:
+
+
+```sql
+source=accounts
+| fields account_number, firstname, balance, age
+| addtotals col=true row=true label='Sum' labelfield='Total'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | firstname | balance | age | Total |
+| --- | --- | --- | --- | --- |
+| 1 | Amber | 39225 | 32 | 39258 |
+| 6 | Hattie | 5686 | 36 | 5728 |
+| 13 | Nanette | 32838 | 28 | 32879 |
+| 18 | Dale | 4180 | 33 | 4231 |
+| 38 | null | 81929 | 129 | null |
+
+## Example 3: Using all options
+
+The following query uses the `addtotals` command with all options set:
+
+```sql
+source=accounts
+| where age > 30
+| stats avg(balance) as avg_balance, count() as count by state
+| head 3
+| addtotals avg_balance, count row=true col=true fieldname='Row Total' label='Sum' labelfield='Column Total'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| avg_balance | count | state | Row Total | Column Total |
+| --- | --- | --- | --- | --- |
+| 39225.0 | 1 | IL | 39226.0 | null |
+| 4180.0 | 1 | MD | 4181.0 | null |
+| 5686.0 | 1 | TN | 5687.0 | null |
+| 49091.0 | 3 | null | null | Sum |
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/append.md b/_sql-and-ppl/ppl/commands/append.md
new file mode 100644
index 00000000000..20c39f63a62
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/append.md
@@ -0,0 +1,76 @@
+---
+layout: default
+title: append
+parent: Commands
+grand_parent: PPL
+nav_order: 4
+---
+
+# append
+
+The `append` command appends the results of a subsearch as additional rows to the end of the input search results (the main search).
+
+The command aligns columns that have the same field names and types. For columns that exist in only the main search or only the subsearch, `NULL` values are inserted in the missing fields for the respective rows.
+
+## Syntax
+
+The `append` command has the following syntax:
+
+```sql
+append <subsearch>
+```
+
+## Parameters
+
+The `append` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<subsearch>` | Required | Executes PPL commands as a secondary search. |
+
+## Example 1: Append rows from a count aggregation to existing search results
+
+The following query appends rows from `count by gender` to `sum by gender, state`:
+
+```sql
+source=accounts | stats sum(age) by gender, state | sort -`sum(age)` | head 5 | append [ source=accounts | stats count(age) by gender ]
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| sum(age) | gender | state | count(age) |
+| --- | --- | --- | --- |
+| 36 | M | TN | null |
+| 33 | M | MD | null |
+| 32 | M | IL | null |
+| 28 | F | VA | null |
+| null | F | null | 1 |
+| null | M | null | 3 |
+
+
+## Example 2: Append rows with merged column names
+
+The following query appends rows from `sum by gender` to `sum by gender, state`, merging columns that have the same field name and type:
+
+```sql
+source=accounts | stats sum(age) as sum by gender, state | sort -sum | head 5 | append [ source=accounts | stats sum(age) as sum by gender ]
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| sum | gender | state |
+| --- | --- | --- |
+| 36 | M | TN |
+| 33 | M | MD |
+| 32 | M | IL |
+| 28 | F | VA |
+| 28 | F | null |
+| 101 | M | null |
+
+## Limitations
+
+The `append` command has the following limitations:
+
+* **Schema compatibility**: When fields with the same name exist in both the main search and the subsearch but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name share the same data type. Alternatively, use different field names. You can rename the conflicting fields using `eval`, or select non-conflicting columns using `fields`.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/appendcol.md b/_sql-and-ppl/ppl/commands/appendcol.md
similarity index 57%
rename from _sql-and-ppl/ppl/cmd/appendcol.md
rename to _sql-and-ppl/ppl/commands/appendcol.md
index e8f8e208001..1a9e32b196f 100644
--- a/_sql-and-ppl/ppl/cmd/appendcol.md
+++ b/_sql-and-ppl/ppl/commands/appendcol.md
@@ -1,27 +1,37 @@
 ---
 layout: default
-title: "appendcol"
-parent: "Commands"
-grand_parent: "PPL"
+title: appendcol
+parent: Commands
+grand_parent: PPL
 nav_order: 5
 ---
 
-# appendcol
+# appendcol
 
-The `appendcol` command appends the result of a sub-search and attaches it alongside the input search results (the main search).
+The `appendcol` command appends the result of a subsearch as additional columns to the input search results (the main search).
 
 ## Syntax
 
-Use the following syntax:
+The `appendcol` command has the following syntax:
+
+```sql
+appendcol [override=<boolean>] <subsearch>
+```
+
+## Parameters
 
-`appendcol [override=<boolean>] <sub-search>`
-* override=<boolean>: optional. Boolean field to specify should result from main-result be overwritten in the case of column name conflict. **Default:** false.
-* `sub-search`: mandatory. Executes PPL commands as a secondary search. The sub-search uses the same data specified in the source clause of the main search results as its input.
+The `appendcol` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<subsearch>` | Required | Executes PPL commands as a secondary search. The subsearch uses the data specified in the `source` clause of the main search results as its input. |
+| `override` | Optional | Specifies whether the results of the main search should be overwritten when column names conflict. Default is `false`. |
+
 
 ## Example 1: Append a count aggregation to existing search results
 
-This example appends "count by gender" to "sum by gender, state".
+This example appends `count by gender` to `sum by gender, state`:
 
 ```sql
 source=accounts
@@ -31,7 +41,7 @@ source=accounts
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | gender | state | sum(age) | count(age) |
 | --- | --- | --- | --- |
@@ -47,9 +57,9 @@ Expected output:
 | F | FL | 310 | NULL |
 
 
-## Example 2: Append a count aggregation to existing search results with override option
+## Example 2: Append a count aggregation to existing search results, overriding the main search results
 
-This example appends "count by gender" to "sum by gender, state" with override option.
+This example appends `count by gender` to `sum by gender, state` and overrides the main search results:
 
 ```sql
 source=accounts
@@ -59,7 +69,7 @@ source=accounts
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | gender | state | sum(age) | count(age) |
 | --- | --- | --- | --- |
@@ -75,9 +85,9 @@ Expected output:
 | F | FL | 310 | NULL |
 
 
-## Example 3: Append multiple sub-search results
+## Example 3: Append multiple subsearch results
 
-The following example PPL query shows how to use `appendcol` to chain multiple appendcol commands to add columns from different sub-searches.
+The following query chains multiple `appendcol` commands to add columns from different subsearches:
 
 ```sql
 source=employees
@@ -87,7 +97,7 @@ source=employees
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | name | dept | age | avg_age | max_age |
 | --- | --- | --- | --- | --- |
@@ -102,9 +112,9 @@ Expected output:
 | Jeff | Marketing | 38 | NULL | NULL |
 
 
-## Example 4: Override case of column name conflict
+## Example 4: Resolve column name conflicts using the override parameter
 
-The following example PPL query demonstrates how to use `appendcol` with the override option when column names conflict between main search and sub-search.
+The following query shows how to use `appendcol` with the `override` option when column names in the main search and subsearch conflict:
 
 ```sql
 source=employees
@@ -113,7 +123,7 @@ source=employees
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | agg | dept |
 | --- | --- |
diff --git a/_sql-and-ppl/ppl/commands/appendpipe.md b/_sql-and-ppl/ppl/commands/appendpipe.md
new file mode 100644
index 00000000000..77b1259847c
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/appendpipe.md
@@ -0,0 +1,86 @@
+---
+layout: default
+title: appendpipe
+parent: Commands
+grand_parent: PPL
+nav_order: 6
+---
+
+# appendpipe
+
+The `appendpipe` command appends the results of a subpipeline to the search results. Unlike a subsearch, the subpipeline is not executed first; it runs only when the search reaches the `appendpipe` command.
+
+The command aligns columns that have the same field names and types. For columns that exist in only the main search or only the subpipeline, `NULL` values are inserted in the missing fields for the respective rows.
+
+## Syntax
+
+The `appendpipe` command has the following syntax:
+
+```sql
+appendpipe [<subpipeline>]
+```
+
+## Parameters
+
+The `appendpipe` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<subpipeline>` | Required | A list of commands applied to the search results produced by the commands that precede the `appendpipe` command. |
+
+
+## Example 1: Append rows from a total count to existing search results
+
+This example appends rows from `total by gender` to `sum by gender, state`:
+
+```sql
+source=accounts
+| stats sum(age) as part by gender, state
+| sort -part
+| head 5
+| appendpipe [ stats sum(part) as total by gender ]
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| part | gender | state | total |
+| --- | --- | --- | --- |
+| 36 | M | TN | null |
+| 33 | M | MD | null |
+| 32 | M | IL | null |
+| 28 | F | VA | null |
+| null | F | null | 28 |
+| null | M | null | 101 |
+
+
+## Example 2: Append rows with merged column names
+
+This example appends rows from `total by gender` to `sum by gender, state`, merging columns that have the same field name and type:
+
+```sql
+source=accounts
+| stats sum(age) as total by gender, state
+| sort -total
+| head 5
+| appendpipe [ stats sum(total) as total by gender ]
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| total | gender | state |
+| --- | --- | --- |
+| 36 | M | TN |
+| 33 | M | MD |
+| 32 | M | IL |
+| 28 | F | VA |
+| 28 | F | null |
+| 101 | M | null |
+
+
+## Limitations
+
+The `appendpipe` command has the following limitations:
+
+* **Schema compatibility**: When fields with the same name exist in both the main search and the subpipeline but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name share the same data type. Alternatively, use different field names. You can rename the conflicting fields using `eval`, or select non-conflicting columns using `fields`.
diff --git a/_sql-and-ppl/ppl/cmd/bin.md b/_sql-and-ppl/ppl/commands/bin.md
similarity index 51%
rename from _sql-and-ppl/ppl/cmd/bin.md
rename to _sql-and-ppl/ppl/commands/bin.md
index 02dd7356560..7737b4c5f52 100644
--- a/_sql-and-ppl/ppl/cmd/bin.md
+++ b/_sql-and-ppl/ppl/commands/bin.md
@@ -1,58 +1,82 @@
 ---
 layout: default
-title: "bin"
-parent: "Commands"
-grand_parent: "PPL"
+title: bin
+parent: Commands
+grand_parent: PPL
 nav_order: 7
 ---
 
-# bin
+# bin
 
-The `bin` command groups numeric values into buckets of equal intervals, making it useful for creating histograms and analyzing data distribution. It takes a numeric or time-based field and generates a new field with values that represent the lower bound of each bucket.
+The `bin` command groups numeric values into buckets of equal intervals, which is useful for creating histograms and analyzing data distribution. It accepts a numeric or time-based field and generates a new field containing values that represent the lower bound of each bucket.
 
 ## Syntax
 
-Use the following syntax:
-
-`bin <field> [span=<interval>] [minspan=<interval>] [bins=<count>] [aligntime=(earliest | latest | <time>)] [start=<num>] [end=<num>]`
-* `field`: mandatory. The field to bin. Accepts numeric or time-based fields.
-* `span`: optional. The interval size for each bin. Cannot be used with bins or minspan parameters.
- * Supports numeric (e.g., `1000`), logarithmic (e.g., `log10`, `2log10`), and time intervals
- * Available time units:
- * microsecond (us)
- * millisecond (ms)
- * centisecond (cs)
- * decisecond (ds)
- * second (s, sec, secs, second, seconds)
- * minute (m, min, mins, minute, minutes)
- * hour (h, hr, hrs, hour, hours)
- * day (d, day, days)
- * month (M, mon, month, months)
-* `minspan`: optional. The minimum interval size for automatic span calculation. Cannot be used with span or bins parameters.
-* `bins`: optional. The maximum number of equal-width bins to create. Cannot be used with span or minspan parameters. The bins parameter must be between 2 and 50000 (inclusive).
-
-  **Limitation**: The bins parameter on timestamp fields has the following requirements:
-
-  1. **Pushdown must be enabled**: Controlled by ``plugins.calcite.pushdown.enabled`` (enabled by default). When pushdown is disabled, use the ``span`` parameter instead (e.g., ``bin @timestamp span=5m``).
-  2. **Timestamp field must be used as an aggregation bucket**: The binned timestamp field must be used in a ``stats`` aggregation (e.g., ``source=events | bin @timestamp bins=3 | stats count() by @timestamp``). Using bins on timestamp fields outside of aggregation buckets is not supported.
-* `aligntime`: optional. Align the bin times for time-based fields. Valid only for time-based discretization. Options:
- * earliest: Align bins to the earliest timestamp in the data
- * latest: Align bins to the latest timestamp in the data
- * <time>: Align bins to a specific epoch time value or time modifier expression
-* `start`: optional. The starting value for binning range. **Default:** minimum field value.
-* `end`: optional. The ending value for binning range. **Default:** maximum field value.
-
-**Parameter Behavior**
-When multiple parameters are specified, priority order is: span > minspan > bins > start/end > default.
-**Special Behaviors:**
-* Logarithmic span (`log10`, `2log10`, etc.) creates logarithmic bin boundaries instead of linear
-* Daily/monthly spans automatically align to calendar boundaries and return date strings (YYYY-MM-DD) instead of timestamps
-* aligntime parameter only applies to time spans excluding days/months
-* start/end parameters expand the range (never shrink) and affect bin width calculation
+The `bin` command has the following syntax:
+
+```sql
+bin <field> [span=<interval>] [minspan=<interval>] [bins=<count>] [aligntime=(earliest | latest | <time>)] [start=<num>] [end=<num>]
+```
+
+## Parameters
+
+The `bin` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field>` | Required | The field to group into buckets. Accepts numeric or time-based fields. |
+| `span` | Optional | The interval size for each bin. Cannot be used with the `bins` or `minspan` parameters. Supports numeric, logarithmic (`log10`, `2log10`), and time intervals. See [Time units](#time-units). |
+| `minspan` | Optional | The minimum interval size for automatic span calculation. Cannot be used with the `span` or `bins` parameters. |
+| `bins` | Optional | The maximum number of equal-width bins to create. Must be between `2` and `50000` (inclusive). Cannot be used with the `span` or `minspan` parameters. See [The bins parameter for timestamp fields](#the-bins-parameter-for-timestamp-fields). |
+| `aligntime` | Optional | Aligns the bin times for time-based fields. Valid only for time-based discretization. Valid values are `earliest`, `latest`, or a specific time. See [Align time options](#align-time-options). |
+| `start` | Optional | The starting value of the interval range. Default is the minimum value of the field. |
+| `end` | Optional | The ending value of the interval range. Default is the maximum value of the field. |
+
+### The bins parameter for timestamp fields
+
+The `bins` parameter for timestamp fields has the following requirements:
+
+- **Pushdown must be enabled**: Enable pushdown by setting `plugins.calcite.pushdown.enabled` to `true` (enabled by default). If pushdown is disabled, use the `span` parameter instead (for example, `bin @timestamp span=5m`).
+- **Timestamp field must be used as an aggregation bucket**: The binned timestamp field must be included in a `stats` aggregation (for example, `source=events | bin @timestamp bins=3 | stats count() by @timestamp`). Using `bins` on timestamp fields outside of aggregation buckets is not supported.
+
+
+### Time units
+
+The following time units are available for the `span` parameter:
+
+* Microseconds (`us`)
+* Milliseconds (`ms`)
+* Centiseconds (`cs`)
+* Deciseconds (`ds`)
+* Seconds (`s`, `sec`, `secs`, `second`, or `seconds`)
+* Minutes (`m`, `min`, `mins`, `minute`, or `minutes`)
+* Hours (`h`, `hr`, `hrs`, `hour`, or `hours`)
+* Days (`d`, `day`, or `days`)
+* Months (`M`, `mon`, `month`, or `months`)
+
+### Align time options
+
+The following options are available for the `aligntime` parameter:
+
+* `earliest` -- Align bins to the earliest timestamp in the data.
+* `latest` -- Align bins to the latest timestamp in the data.
+* `<time>` -- Align bins to a specific epoch time value or time modifier expression.
+
+### Parameter behavior
+
+When multiple parameters are specified, the priority order is `span` > `minspan` > `bins` > `start`/`end` > default.
+
+### Special parameter types
+
+The `bin` command has the following special handling for certain parameter types:
+
+* Logarithmic spans (for example, `log10` or `2log10`) create logarithmic bin boundaries instead of linear ones.
+* Daily or monthly spans automatically align to calendar boundaries and return date strings (`YYYY-MM-DD`) instead of timestamps.
+* The `aligntime` parameter applies only to time spans shorter than a day (excluding daily or monthly spans).
+* The `start` and `end` parameters expand the range (they never reduce it) and affect bin width calculations.
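+
+For instance, to anchor two-hour buckets to the first timestamp in your data rather than to the epoch, you can combine `span` with `aligntime=earliest`. The following is a minimal sketch against the same `time_test` index used in the examples below; the resulting bucket boundaries depend on your data:
+
+```sql
+source=time_test
+| bin @timestamp span=2h aligntime=earliest
+| fields @timestamp, value
+| sort @timestamp
+```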
## Example 1: Basic numeric span - + ```sql source=accounts | bin age span=10 @@ -61,7 +85,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | account_number | | --- | --- | @@ -71,7 +95,7 @@ Expected output: ## Example 2: Large numeric span - + ```sql source=accounts | bin balance span=25000 @@ -80,7 +104,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | balance | | --- | @@ -89,7 +113,7 @@ Expected output: ## Example 3: Logarithmic span (log10) - + ```sql source=accounts | bin balance span=log10 @@ -98,7 +122,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | balance | | --- | @@ -107,7 +131,7 @@ Expected output: ## Example 4: Logarithmic span with coefficient - + ```sql source=accounts | bin balance span=2log10 @@ -116,7 +140,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | balance | | --- | @@ -126,7 +150,7 @@ Expected output: ## Example 5: Basic bins parameter - + ```sql source=time_test | bin value bins=5 @@ -135,7 +159,7 @@ source=time_test ``` {% include copy.html %} -Expected output: +The query returns the following results: | value | | --- | @@ -145,7 +169,7 @@ Expected output: ## Example 6: Low bin count - + ```sql source=accounts | bin age bins=2 @@ -154,7 +178,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | | --- | @@ -162,7 +186,7 @@ Expected output: ## Example 7: High bin count - + ```sql source=accounts | bin age bins=21 @@ -171,7 +195,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | account_number | | --- | --- | @@ -181,7 +205,7 @@ Expected output: ## Example 8: Basic minspan - + ```sql source=accounts | bin age minspan=5 @@ -190,7 +214,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | account_number | | --- | --- | @@ -200,7 +224,7 @@ Expected output: ## Example 9: Large minspan - + ```sql source=accounts | bin age minspan=101 @@ -209,7 +233,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | | --- | @@ -217,7 +241,7 @@ Expected output: ## Example 10: Start and end range - + ```sql source=accounts | bin age start=0 end=101 @@ -226,7 +250,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | | --- | @@ -234,7 +258,7 @@ Expected output: ## Example 11: Large end range - + ```sql source=accounts | bin balance start=0 end=100001 @@ -243,7 +267,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | balance | | --- | @@ -251,7 +275,7 @@ Expected output: ## Example 12: Span with start/end - + ```sql source=accounts | bin age span=1 start=25 end=35 @@ -260,7 +284,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | age | | --- | @@ -271,7 +295,7 @@ Expected output: ## Example 13: Hour span - + ```sql source=time_test | bin @timestamp span=1h @@ -280,7 +304,7 @@ source=time_test ``` {% include copy.html %} -Expected output: +The query returns the following results: | @timestamp | value | | --- | --- | @@ -290,7 +314,7 @@ Expected output: ## Example 14: Minute span - + ```sql source=time_test | bin 
@timestamp span=45minute
@@ -299,7 +323,7 @@ source=time_test
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | @timestamp | value |
 | --- | --- |
@@ -309,7 +333,7 @@ Expected output:
 
 ## Example 15: Second span
 
-
+
 ```sql
 source=time_test
 | bin @timestamp span=30seconds
@@ -318,7 +342,7 @@ source=time_test
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | @timestamp | value |
 | --- | --- |
@@ -328,7 +352,7 @@ Expected output:
 
 ## Example 16: Daily span
 
-
+
 ```sql
 source=time_test
 | bin @timestamp span=7day
@@ -337,7 +361,7 @@ source=time_test
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | @timestamp | value |
 | --- | --- |
@@ -347,7 +371,7 @@ Expected output:
 
 ## Example 17: Aligntime with time modifier
 
-
+
 ```sql
 source=time_test
 | bin @timestamp span=2h aligntime='@d+3h'
@@ -356,7 +380,7 @@ source=time_test
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | @timestamp | value |
 | --- | --- |
@@ -366,7 +390,7 @@ Expected output:
 
 ## Example 18: Aligntime with epoch timestamp
 
-
+
 ```sql
 source=time_test
 | bin @timestamp span=2h aligntime=1500000000
@@ -375,7 +399,7 @@ source=time_test
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | @timestamp | value |
 | --- | --- |
@@ -385,7 +409,7 @@ Expected output:
 
 ## Example 19: Default behavior (no parameters)
 
-
+
 ```sql
 source=accounts
 | bin age
@@ -394,7 +418,7 @@ source=accounts
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | age | account_number |
 | --- | --- |
@@ -404,7 +428,7 @@ Expected output:
 
 ## Example 20: Binning with string fields
 
-
+
 ```sql
 source=accounts
 | eval age_str = CAST(age AS STRING)
@@ -414,10 +438,9 @@ source=accounts
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | count() | age_str |
 | --- | --- |
 | 1 | 20-30 |
 | 3 | 30-40 |
-
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/chart.md b/_sql-and-ppl/ppl/commands/chart.md
new file mode 100644
index 00000000000..df5c16bb04a
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/chart.md
@@ -0,0 +1,181 @@
+---
+layout: default
+title: chart
+parent: Commands
+grand_parent: PPL
+nav_order: 8
+---
+
+# chart
+
+The `chart` command transforms search results by applying a statistical aggregation function and optionally grouping the data by one or two fields. When grouped by two fields, the results are suitable for two-dimensional chart visualizations, with unique values in the second group key pivoted into column names.
+
+## Syntax
+
+The `chart` command has the following syntax:
+
+```sql
+chart [limit=(top|bottom)<int>] [useother=<bool>] [usenull=<bool>] [nullstr=<string>] [otherstr=<string>] <aggregation> [by <row-split>[, <column-split>]] | [over <row-split>] [by <column-split>]
+```
+
+## Parameters
+
+The `chart` command supports the following parameters.
+
+| Parameter | Required/Optional | Description | Default |
+| --- | --- | --- | --- |
+| `<aggregation>` | Required | The aggregation function to apply to the data. Only a single aggregation function is supported. Available functions are the aggregation functions supported by the [`stats`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/stats/) command. | N/A |
+| `by <row-split>[, <column-split>]` | Optional | Groups the results by either one field (row split) or two fields (row split and column split). The parameters `limit`, `useother`, and `usenull` apply to the column split. Results are returned as individual rows for each combination. | Aggregate across all documents |
+| `over <row-split> by <column-split>` | Optional | Alternative syntax for grouping by multiple fields. `over <row-split> by <column-split>` groups the results by both fields. Using `over` alone on one field is equivalent to `by <field>`. | N/A |
+| `limit` | Optional | The number of categories to display when using column split. `limit=N` or `limit=topN` returns the top N categories. `limit=bottomN` returns the bottom N categories. When the limit is exceeded, remaining categories are grouped into an `OTHER` category (unless `useother=false`). Set to `0` to show all categories without limit. The ranking is based on the sum of aggregated values for each column category. For example, `limit=top3` keeps the three categories with the highest total values. Only applies when grouping by two fields. | `top10` |
+| `useother` | Optional | Controls whether to create an `OTHER` category for categories beyond the `limit`. When set to `false`, only the top or bottom N categories (based on `limit`) are shown without an `OTHER` category. When set to `true`, categories beyond the `limit` are grouped into an `OTHER` category. This parameter only applies when using column split and when there are more categories than the `limit`. | `true` |
+| `usenull` | Optional | Controls whether to group documents that have null values in the column split field into a separate `NULL` category. This parameter only applies to column split. Documents with null values in the row split field are ignored; only documents with non-null values in the row split field are included in the results. When `usenull=false`, documents with null values in the column split field are excluded from the results. When `usenull=true`, documents with null values in the column split field are grouped into a separate `NULL` category. | `true` |
+| `nullstr` | Optional | Specifies the category name for documents that have null values in the column split field. This parameter only applies when `usenull` is `true`. | `NULL` |
+| `otherstr` | Optional | Specifies the category name for the `OTHER` category. This parameter only applies when `useother` is `true` and there are values beyond the `limit`. | `OTHER` |
+
+
+## Notes
+
+The following considerations apply when using the `chart` command:
+
+* Fields generated by column splitting are converted to strings. This ensures compatibility with `nullstr` and `otherstr` and allows the fields to be used as column names after pivoting.
+* Documents with null values in fields used by the aggregation function are excluded from aggregation. For example, in `chart avg(balance) over deptno, group`, documents where `balance` is null are excluded from the average calculation.
+* The aggregation metric appears as the last column in the results. Result columns are ordered as follows: `[row split] [column split] [aggregation metrics]`.
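+
+Because `over` on a single field is equivalent to `by`, the following two queries produce the same grouping. This sketch mirrors Example 2 below and is shown for illustration only:
+
+```sql
+source=accounts
+| chart count() over gender
+```
+
+```sql
+source=accounts
+| chart count() by gender
+```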
+
+## Example 1: Basic aggregation without grouping
+
+This example calculates the average balance across all accounts:
+
+```sql
+source=accounts
+| chart avg(balance)
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| avg(balance) |
+| --- |
+| 20482.25 |
+
+
+## Example 2: Group by a single field
+
+This example calculates the count of accounts grouped by gender:
+
+```sql
+source=accounts
+| chart count() by gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| gender | count() |
+| --- | --- |
+| F | 1 |
+| M | 3 |
+
+
+## Example 3: Using `over <row-split> by <column-split>` to group by multiple fields
+
+The following query calculates the average balance grouped by both the `gender` and `age` fields:

+
+```sql
+source=accounts
+| chart avg(balance) over gender by age
+```
+{% include copy.html %}
+
+The query returns the following results. The `age` column in the result is converted to the string type:
+
+| gender | age | avg(balance) |
+| --- | --- | --- |
+| F | 28 | 32838.0 |
+| M | 32 | 39225.0 |
+| M | 33 | 4180.0 |
+| M | 36 | 5686.0 |
+
+
+## Example 4: Using basic limit functionality
+
+This example limits the results to show only the single top age group:
+
+```sql
+source=accounts
+| chart limit=1 count() over gender by age
+```
+{% include copy.html %}
+
+The query returns the following results. The `age` column in the result is converted to the string type:
+
+| gender | age | count() |
+| --- | --- | --- |
+| F | OTHER | 1 |
+| M | 33 | 1 |
+| M | OTHER | 2 |
+
+
+## Example 5: Using limit with other parameters
+
+The following query uses the `chart` command with the `limit`, `useother`, and custom `otherstr` parameters:
+
+```sql
+source=accounts
+| chart limit=top1 useother=true otherstr='minor_gender' count() over state by gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| state | gender | count() |
+| --- | --- | --- |
+| IL | M | 1 |
+| MD | M | 1 |
+| TN | M | 1 |
+| VA | minor_gender | 1 |
+
+
+## Example 6: Using null parameters
+
+The following query uses the `chart` command with the `usenull` and custom `nullstr` parameters:
+
+```sql
+source=accounts
+| chart usenull=true nullstr='employer not specified' count() over firstname by employer
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | employer | count() |
+| --- | --- | --- |
+| Amber | Pyrami | 1 |
+| Dale | employer not specified | 1 |
+| Hattie | Netagy | 1 |
+| Nanette | Quility | 1 |
+
+
+## Example 7: Using span
+
+The following query uses the `chart` command with `span` for grouping age ranges:
+
+```sql
+source=accounts
+| chart max(balance) by age span=10, gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| age | gender | max(balance) |
+| --- | --- | --- |
+| 20 | F | 32838 |
+| 30 | M | 39225 |
+
+
+## Limitations
+
+The `chart` command has the following limitations:
+
+* Only a single aggregation function is supported per `chart` command.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/dedup.md b/_sql-and-ppl/ppl/commands/dedup.md
new file mode 100644
index 00000000000..43e3a7efea5
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/dedup.md
@@ -0,0 +1,134 @@
+---
+layout: default
+title: dedup
+parent: Commands
+grand_parent: PPL
+nav_order: 9
+---
+
+# dedup
+
+The `dedup` command removes duplicate documents, as defined by the specified fields, from the search results.
+
+## Syntax
+
+The `dedup` command has the following syntax:
+
+```sql
+dedup [<int>] <field-list> [keepempty=<bool>] [consecutive=<bool>]
+```
+
+## Parameters
+
+The `dedup` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field-list>` | Required | A comma-delimited list of fields to use for deduplication. At least one field is required. |
+| `<int>` | Optional | The number of duplicate documents to retain for each combination. Must be greater than `0`. Default is `1`. |
+| `keepempty` | Optional | When set to `true`, keeps documents in which any field in the field list has a `NULL` value or is missing. Default is `false`. |
+| `consecutive` | Optional | When set to `true`, removes only consecutive duplicate documents. Default is `false`. Requires the legacy SQL engine (`plugins.calcite.enabled=false`). |
+
+
+## Example 1: Remove duplicates based on a single field
+
+The following query deduplicates documents based on the `gender` field:
+
+```sql
+source=accounts
+| dedup gender
+| fields account_number, gender
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender |
+| --- | --- |
+| 1 | M |
+| 13 | F |
+
+
+## Example 2: Retain multiple duplicate documents
+
+The following query removes duplicate documents based on the `gender` field while keeping two duplicate documents:
+
+```sql
+source=accounts
+| dedup 2 gender
+| fields account_number, gender
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender |
+| --- | --- |
+| 1 | M |
+| 6 | M |
+| 13 | F |
+
+
+## Example 3: Handle documents with empty field values
+
+The following query removes duplicate documents while keeping documents with `null` values in the specified field:
+
+```sql
+source=accounts
+| dedup email keepempty=true
+| fields account_number, email
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | email |
+| --- | --- |
+| 1 | amberduke@pyrami.com |
+| 6 | hattiebond@netagy.com |
+| 13 | null |
+| 18 | daleadams@boink.com |
+
+The following query removes duplicate documents while ignoring documents with empty values in the specified field:
+
+```sql
+source=accounts
+| dedup email
+| fields account_number, email
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | email |
+| --- | --- |
+| 1 | amberduke@pyrami.com |
+| 6 | hattiebond@netagy.com |
+| 18 | daleadams@boink.com |
+
+
+## Example 4: Deduplicate consecutive documents
+
+The following query removes duplicate consecutive documents:
+
+```sql
+source=accounts
+| dedup gender consecutive=true
+| fields account_number, gender
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender |
+| --- | --- |
+| 1 | M |
+| 13 | F |
+| 18 | M |
+
+
diff --git a/_sql-and-ppl/ppl/cmd/describe.md b/_sql-and-ppl/ppl/commands/describe.md
similarity index 64%
rename from _sql-and-ppl/ppl/cmd/describe.md
rename to _sql-and-ppl/ppl/commands/describe.md
index d3bf02a5112..ceeb8734952 100644
--- a/_sql-and-ppl/ppl/cmd/describe.md
+++ b/_sql-and-ppl/ppl/commands/describe.md
@@ -1,35 +1,45 @@
 ---
 layout: default
-title: "describe"
-parent: "Commands"
-grand_parent: "PPL"
+title: describe
+parent: Commands
+grand_parent: PPL
 nav_order: 10
 ---
 
-# describe
+# describe
 
-The `describe` command queries metadata of the index. The `describe` command can only be used as the first command in the PPL query.
+The `describe` command queries index metadata. It can only be used as the first command in a PPL query.
 
 ## Syntax
 
-Use the following syntax:
+The `describe` command has the following syntax. The argument to the command is a dot-separated path to the table, consisting of an optional data source, an optional schema, and a required table name:
 
-`describe [dataSource.][schema.]<tablename>`
-* `dataSource`: optional. If dataSource is not provided, it resolves to OpenSearch dataSource.
-* `schema`: optional. If schema is not provided, it resolves to default schema.
-* `tablename`: mandatory. describe command must specify which tablename to query from.
-
+```sql
+describe [<datasource>.][<schema>.]<tablename>
+```
+
+## Parameters
+
+The `describe` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<tablename>` | Required | The table to query. |
+| `<datasource>` | Optional | The data source to use. Default is the OpenSearch data source. |
+| `<schema>` | Optional | The schema to use. Default is the default schema. |
+
+
+
-## Example 1: Fetch all the metadata
+## Example 1: Fetch all metadata
 
-This example describes the accounts index.
+This example describes the `accounts` index:
 
 ```sql
 describe accounts
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | TABLE_CAT | TABLE_SCHEM | TABLE_NAME | COLUMN_NAME | DATA_TYPE | TYPE_NAME | COLUMN_SIZE | BUFFER_LENGTH | DECIMAL_DIGITS | NUM_PREC_RADIX | NULLABLE | REMARKS | COLUMN_DEF | SQL_DATA_TYPE | SQL_DATETIME_SUB | CHAR_OCTET_LENGTH | ORDINAL_POSITION | IS_NULLABLE | SCOPE_CATALOG | SCOPE_SCHEMA | SCOPE_TABLE | SOURCE_DATA_TYPE | IS_AUTOINCREMENT | IS_GENERATEDCOLUMN |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | docTestCluster | null | accounts | lastname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 10 | | null | null | null | null | NO | |
 
-## Example 2: Fetch metadata with condition and filter
+## Example 2: Fetch metadata with a condition and filter
 
-This example retrieves columns with type bigint in the accounts index.
+This example retrieves columns of the type `bigint` from the `accounts` index:
 
 ```sql
 describe accounts
@@ -57,7 +67,7 @@ describe accounts
 ```
 {% include copy.html %}
 
-Expected output:
+The query returns the following results:
 
 | COLUMN_NAME |
 | --- |
 | account_number |
 | balance |
 | age |
 
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/eval.md b/_sql-and-ppl/ppl/commands/eval.md
new file mode 100644
index 00000000000..c3fd7ea0877
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/eval.md
@@ -0,0 +1,136 @@
+---
+layout: default
+title: eval
+parent: Commands
+grand_parent: PPL
+nav_order: 11
+---
+
+# eval
+
+The `eval` command evaluates the specified expression and appends the result of the evaluation to the search results.
+
+The `eval` command is not rewritten to [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/). It is only executed on the coordinating node.
+{: .note}
+
+## Syntax
+
+The `eval` command has the following syntax:
+
+```sql
+eval <field>=<expression> ["," <field>=<expression> ]...
+```
+
+## Parameters
+
+The `eval` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field>` | Required | The name of the field to create or update. If the field does not exist, a new field is added. If it already exists, its value is overwritten. |
+| `<expression>` | Required | The expression to evaluate. |
+
+
+## Example 1: Create a new field
+
+The following query creates a new `doubleAge` field for each document:
+
+```sql
+source=accounts
+| eval doubleAge = age * 2
+| fields age, doubleAge
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| age | doubleAge |
+| --- | --- |
+| 32 | 64 |
+| 36 | 72 |
+| 28 | 56 |
+| 33 | 66 |
+
+
+## Example 2: Override an existing field
+
+The following query overrides the `age` field by adding `1` to its value:
+
+```sql
+source=accounts
+| eval age = age + 1
+| fields age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| age |
+| --- |
+| 33 |
+| 37 |
+| 29 |
+| 34 |
+
+
+## Example 3: Create a new field using a field defined in eval
+
+The following query creates a new field based on another field defined in the same `eval` expression. In this example, the new `ddAge` field is calculated by multiplying the `doubleAge` field by `2`. The `doubleAge` field itself is defined earlier in the `eval` command:
+
+```sql
+source=accounts
+| eval doubleAge = age * 2, ddAge = doubleAge * 2
+| fields age, doubleAge, ddAge
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| age | doubleAge | ddAge |
+| --- | --- | --- |
+| 32 | 64 | 128 |
+| 36 | 72 | 144 |
+| 28 | 56 | 112 |
+| 33 | 66 | 132 |
+
+
+## Example 4: String concatenation
+
+The following query uses the `+` operator for string concatenation. You can concatenate string literals and field values as follows:
+
+```sql
+source=accounts
+| eval greeting = 'Hello ' + firstname
+| fields firstname, greeting
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | greeting |
+| --- | --- |
+| Amber | Hello Amber |
+| Hattie | Hello Hattie |
+| Nanette | Hello Nanette |
+| Dale | Hello Dale |
+
+
+## Example 5: Multiple string concatenation with type casting
+
+The following query performs multiple concatenation operations, including type casting from numeric values to strings:
+
+```sql
+source=accounts | eval full_info = 'Name: ' + firstname + ', Age: ' + CAST(age AS STRING) | fields firstname, age, full_info
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | age | full_info |
+| --- | --- | --- |
+| Amber | 32 | Name: Amber, Age: 32 |
+| Hattie | 36 | Name: Hattie, Age: 36 |
+| Nanette | 28 | Name: Nanette, Age: 28 |
+| Dale | 33 | Name: Dale, Age: 33 |
+
+
diff --git a/_sql-and-ppl/ppl/commands/eventstats.md b/_sql-and-ppl/ppl/commands/eventstats.md
new file mode 100644
index 00000000000..33e81f6c6fb
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/eventstats.md
@@ -0,0 +1,169 @@
+---
+layout: default
+title: eventstats
+parent: Commands
+grand_parent: PPL
+nav_order: 12
+---
+
+# eventstats
+
+The `eventstats` command enriches your event data with calculated summary statistics. It analyzes the specified fields within your events, computes various statistical measures, and then appends these results as new fields to each original event.
+
+The `eventstats` command operates in the following way:
+
+1. It performs calculations across the entire search results or within defined groups.
+2. The original events remain intact, with new fields added to contain the statistical results.
+3. The command is particularly useful for comparative analysis, identifying outliers, and providing additional context to individual events.
+
+## Comparing `stats` and `eventstats`
+
+For a comprehensive comparison of the `stats`, `eventstats`, and `streamstats` commands, including their differences in transformation behavior, output format, aggregation scope, and use cases, see [Comparing stats, eventstats, and streamstats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/streamstats/#comparing-stats-eventstats-and-streamstats).
+
+## Syntax
+
+The `eventstats` command has the following syntax:
+
+```sql
+eventstats [bucket_nullable=<bool>] <function>... [<by-clause>]
+```
+
+The following are examples of the `eventstats` command syntax:
+
+```sql
+source = table | eventstats avg(a)
+source = table | where a < 50 | eventstats count(c)
+source = table | eventstats min(c), max(c) by b
+source = table | eventstats count(c) as count_by by b | where count_by > 1000
+source = table | eventstats dc(field) as distinct_count
+source = table | eventstats distinct_count(category) by region
+```
+
+## Parameters
+
+The `eventstats` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<function>` | Required | An aggregation function or window function. |
+| `bucket_nullable` | Optional | Controls whether the `eventstats` command considers `null` buckets as a valid group in group-by aggregations. When set to `false`, it does not treat `null` group-by values as a distinct group during aggregation. Default is determined by `plugins.ppl.syntax.legacy.preferred`. |
+| `<by-clause>` | Optional | Groups results by specified fields or expressions. Syntax: `by [span-expression,] [field,]...`. Default is aggregating over the entire search results. |
+| `<span-expression>` | Optional | Splits a field into buckets by intervals (at most one). Syntax: `span(field_expr, interval_expr)`. For example, `span(age, 10)` creates 10-year age buckets and `span(timestamp, 1h)` creates hourly buckets. |
+
+### Time units
+
+The following time units are available for span expressions:
+
+* Milliseconds (`ms`)
+* Seconds (`s`)
+* Minutes (`m`, case sensitive)
+* Hours (`h`)
+* Days (`d`)
+* Weeks (`w`)
+* Months (`M`, case sensitive)
+* Quarters (`q`)
+* Years (`y`)
+
+## Aggregation functions
+
+The `eventstats` command supports the following aggregation functions:
+
+* `COUNT` -- Count of values
+* `SUM` -- Sum of numeric values
+* `AVG` -- Average of numeric values
+* `MAX` -- Maximum value
+* `MIN` -- Minimum value
+* `VAR_SAMP` -- Sample variance
+* `VAR_POP` -- Population variance
+* `STDDEV_SAMP` -- Sample standard deviation
+* `STDDEV_POP` -- Population standard deviation
+* `DISTINCT_COUNT`/`DC` -- Distinct count of values
+* `EARLIEST` -- Earliest value by timestamp
+* `LATEST` -- Latest value by timestamp
+
+For detailed documentation of each function, see [Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/functions/#aggregate).
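+
+As an illustration of a time-based span, the following minimal sketch counts events in hourly buckets while keeping each original event. It assumes a timestamp field named `@timestamp`, as in the `time_test` index used elsewhere in this documentation; the results depend on your data:
+
+```sql
+source=time_test
+| eventstats count() as hourly_count by span(@timestamp, 1h)
+```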
+
+## Example 1: Calculate the average, sum, and count of a field by group
+
+The following query calculates the average age, sum of age, and count of events for all accounts grouped by gender:
+
+```sql
+source=accounts
+| fields account_number, gender, age
+| eventstats avg(age), sum(age), count() by gender
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender | age | avg(age) | sum(age) | count() |
+| --- | --- | --- | --- | --- | --- |
+| 1 | M | 32 | 33.666666666666664 | 101 | 3 |
+| 6 | M | 36 | 33.666666666666664 | 101 | 3 |
+| 13 | F | 28 | 28.0 | 28 | 1 |
+| 18 | M | 33 | 33.666666666666664 | 101 | 3 |
+
+
+## Example 2: Calculate the count by gender and span
+
+The following query counts events by age intervals of 5 years, grouped by gender:
+
+```sql
+source=accounts
+| fields account_number, gender, age
+| eventstats count() as cnt by span(age, 5) as age_span, gender
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender | age | cnt |
+| --- | --- | --- | --- |
+| 1 | M | 32 | 2 |
+| 6 | M | 36 | 1 |
+| 13 | F | 28 | 1 |
+| 18 | M | 33 | 2 |
+
+
+## Example 3: Null bucket handling
+
+The following query uses the `eventstats` command with `bucket_nullable=false` to exclude null values from the group-by aggregation:
+
+```sql
+source=accounts
+| eventstats bucket_nullable=false count() as cnt by employer
+| fields account_number, firstname, employer, cnt
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | firstname | employer | cnt |
+| --- | --- | --- | --- |
+| 1 | Amber | Pyrami | 1 |
+| 6 | Hattie | Netagy | 1 |
+| 13 | Nanette | Quility | 1 |
+| 18 | Dale | null | null |
+
+The following query uses the `eventstats` command with `bucket_nullable=true` to include null values in the group-by aggregation:
+
+```sql
+source=accounts
+| eventstats bucket_nullable=true count() as cnt by employer
+| fields account_number, firstname, employer, cnt
+| sort account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | firstname | employer | cnt |
+| --- | --- | --- | --- |
+| 1 | Amber | Pyrami | 1 |
+| 6 | Hattie | Netagy | 1 |
+| 13 | Nanette | Quility | 1 |
+| 18 | Dale | null | 1 |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/expand.md b/_sql-and-ppl/ppl/commands/expand.md
new file mode 100644
index 00000000000..d149f2cf91e
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/expand.md
@@ -0,0 +1,68 @@
+---
+layout: default
+title: expand
+parent: Commands
+grand_parent: PPL
+nav_order: 13
+---
+
+# expand
+
+The `expand` command transforms a single document with a nested array field into multiple documents, each containing one element of the array. All other fields in the original document are duplicated across the resulting documents.
+
+The `expand` command operates in the following way:
+
+* It generates one row per element in the specified array field.
+* The specified array field is converted into individual rows.
+* If an alias is provided, the expanded values appear under the alias instead of the original field name.
+* If the specified field is an empty array, the row is retained with the expanded field set to `null`.
+
+## Syntax
+
+The `expand` command has the following syntax:
+
+```sql
+expand <field> [as <alias>]
+```
+
+## Parameters
+
+The `expand` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field>` | Required | The field to be expanded. Only nested arrays are supported. |
+| `<alias>` | Optional | The name to use in place of the original field name. |
+
+
+## Example: Expand an address field using an alias
+
+Given a `migration` dataset with the following data:
+
+```json
+{"name":"abbas","age":24,"address":[{"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}}]}
+{"name":"chen","age":32,"address":[{"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}},{"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}}]}
+```
+
+The following query expands the `address` field and renames it to `addr`:
+
+```sql
+source=migration
+| expand address as addr
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| name | age | addr |
+| --- | --- | --- |
+| abbas | 24 | {"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}} |
+| chen | 32 | {"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}} |
+| chen | 32 | {"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}} |
+
+
+## Limitations
+
+The `expand` command has the following limitations:
+
+* The `expand` command only supports nested arrays. Primitive fields storing arrays are not supported. For example, a string field storing an array of strings cannot be expanded.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/cmd/explain.md b/_sql-and-ppl/ppl/commands/explain.md
similarity index 59%
rename from _sql-and-ppl/ppl/cmd/explain.md
rename to _sql-and-ppl/ppl/commands/explain.md
index 06b28b15ea6..b069d060d2a 100644
--- a/_sql-and-ppl/ppl/cmd/explain.md
+++ b/_sql-and-ppl/ppl/commands/explain.md
@@ -1,31 +1,35 @@
 ---
 layout: default
-title: "explain"
-parent: "Commands"
-grand_parent: "PPL"
+title: explain
+parent: Commands
+grand_parent: PPL
 nav_order: 14
 ---
 
-# explain
+# explain
 
 The `explain` command displays the execution plan of a query, which is often used for query translation and troubleshooting. The `explain` command can only be used as the first command in the PPL query.
 
 ## Syntax
 
-Use the following syntax:
+The `explain` command has the following syntax:
 
-`explain queryStatement`
-* `mode`: optional. There are 4 explain modes: "simple", "standard", "cost", "extended". **Default:** standard.
- * standard: The default mode. Display logical and physical plan with pushdown information (DSL).
- * simple: Display the logical plan tree without attributes.
- * cost: Display the standard information plus plan cost attributes.
- * extended: Display the standard information plus generated code.
-* `queryStatement`: mandatory. A PPL query to explain.
-
+```sql
+explain [<mode>] <queryStatement>
+```
+
+## Parameters
+
+The `explain` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<queryStatement>` | Required | A PPL query to explain. |
+| `<mode>` | Optional | The explain mode. Valid values are:
- `standard`: Displays the logical and physical plan along with pushdown information (DSL). Available in both v2 and v3 engines.
- `simple`: Displays the logical plan tree without attributes. Requires v3 engine (`plugins.calcite.enabled` = `true`).
- `cost`: Displays the standard information plus plan cost attributes. Requires v3 engine (`plugins.calcite.enabled` = `true`).
- `extended`: Displays the standard information plus the generated code. Requires v3 engine (`plugins.calcite.enabled` = `true`).

Default is `standard`. | + +## Example 1: Explain a PPL query in the v2 engine + +When Apache Calcite is disabled (`plugins.calcite.enabled` is set to `false`), `explain` obtains its physical plan and pushdown information from the v2 engine: ```sql explain source=state_country @@ -34,7 +38,7 @@ explain source=state_country ``` {% include copy.html %} -Explain: +The query returns the following results: ```json { @@ -57,9 +61,9 @@ Explain: ``` -## Example 2: Explain a PPL query in v3 engine +## Example 2: Explain a PPL query in the v3 engine -When Calcite is enabled (`plugins.calcite.enabled=true`), explaining a PPL query will get its logical and physical plan of v3 engine and pushdown information. +When Apache Calcite is enabled (`plugins.calcite.enabled` is set to `true`), `explain` obtains its logical and physical plan and pushdown information from the v3 engine: ```sql explain source=state_country @@ -68,8 +72,8 @@ explain source=state_country ``` {% include copy.html %} -Explain - +The query returns the following results: + ```json { "calcite": { @@ -79,16 +83,16 @@ Explain CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) """, "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) - CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#53:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/input=RelSubset#43,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#53:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/input=RelSubset#43,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) """ } } ``` -## Example 3: Explain a PPL query with simple mode +## Example 3: Explain a PPL query in the simple mode -When Calcite is enabled (`plugins.calcite.enabled=true`), you can explain a PPL query with the "simple" mode. +The following query uses the `explain` command in the `simple` mode to show a simplified logical plan tree: ```sql explain simple source=state_country @@ -97,7 +101,7 @@ explain simple source=state_country ``` {% include copy.html %} -Explain +The query returns the following results: ```json { @@ -112,9 +116,9 @@ Explain ``` -## Example 4: Explain a PPL query with cost mode +## Example 4: Explain a PPL query in the cost mode -When Calcite is enabled (`plugins.calcite.enabled=true`), you can explain a PPL query with the "cost" mode. 
+The following query uses the `explain` command in the `cost` mode to show plan cost attributes: ```sql explain cost source=state_country @@ -123,8 +127,8 @@ explain cost source=state_country ``` {% include copy.html %} -Explain - +The query returns the following results: + ```json { "calcite": { @@ -134,15 +138,17 @@ Explain CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 72 """, "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]): rowcount = 100.0, cumulative cost = {200.0 rows, 501.0 cpu, 0.0 io}, id = 138 - CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#125:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/input=RelSubset#115,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 133 + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#125:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/input=RelSubset#115,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 133 """ } } ``` -## Example 5: Explain a PPL query with extended mode - +## Example 5: Explain a PPL query in the extended mode + +The following query uses the `explain` command in the `extended` mode to show the generated code: + ```sql explain extended source=state_country | where country = 'USA' OR country = 'England' @@ -150,8 +156,8 @@ explain extended source=state_country ``` {% include copy.html %} -Explain - +The query returns the following results: + ```json { "calcite": { @@ -161,7 +167,7 @@ Explain CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) """, "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) - CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#193:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/input=RelSubset#183,group={1},count()=COUNT())], 
OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#193:LogicalAggregate.NONE.[]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/input=RelSubset#183,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) """, "extended": """public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) { final org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan v1stashed = (org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan) root.get("v1stashed"); diff --git a/_sql-and-ppl/ppl/cmd/fields.md b/_sql-and-ppl/ppl/commands/fields.md similarity index 57% rename from _sql-and-ppl/ppl/cmd/fields.md rename to _sql-and-ppl/ppl/commands/fields.md index 55c426aafea..55e05b052b8 100644 --- a/_sql-and-ppl/ppl/cmd/fields.md +++ b/_sql-and-ppl/ppl/commands/fields.md @@ -1,27 +1,36 @@ --- layout: default -title: "fields" -parent: "Commands" -grand_parent: "PPL" +title: fields +parent: Commands +grand_parent: PPL nav_order: 15 --- -# fields +# fields The `fields` command specifies the fields that should be included in or excluded from the search results. ## Syntax -Use the following syntax: +The `fields` command has the following syntax: + +```sql +fields [+|-] +``` + +## Parameters + +The `fields` command supports the following parameters. -`fields [+|-] ` -* `+|-`: optional. If the plus (+) is used, only the fields specified in the field list will be included. If the minus (-) is used, all the fields specified in the field list will be excluded. **Default:** `+`. -* `field-list`: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Required | A comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. | +| `[+|-]` | Optional | If the plus sign (`+`) is used, only the fields specified in the `field-list` are included. If the minus sign (`-`) is used, all fields specified in the `field-list` are excluded. Default is `+`. 
| ## Example 1: Select specified fields from the search result -The following example PPL query shows how to retrieve the `account_number`, `firstname`, and `lastname` fields from the search results: +The following query shows how to retrieve the `account_number`, `firstname`, and `lastname` fields from the search results: ```sql source=accounts @@ -29,7 +38,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | account_number | firstname | lastname | | --- | --- | --- | @@ -41,7 +50,7 @@ Expected output: ## Example 2: Remove specified fields from the search results -The following example PPL query shows how to remove the `account_number` field from the search results: +The following query shows how to remove the `account_number` field from the search results: ```sql source=accounts @@ -50,7 +59,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | firstname | lastname | | --- | --- | @@ -62,8 +71,7 @@ Expected output: ## Example 3: Space-delimited field selection -Fields can be specified using spaces instead of commas, providing a more concise syntax. -**Syntax**: `fields field1 field2 field3` +Fields can be specified using spaces instead of commas, providing a more concise syntax: ```sql source=accounts @@ -71,7 +79,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | firstname | lastname | age | | --- | --- | --- | @@ -83,7 +91,7 @@ Expected output: ## Example 4: Prefix wildcard pattern -Select fields starting with a pattern using prefix wildcards. +The following query selects fields starting with a pattern using prefix wildcards: ```sql source=accounts @@ -91,7 +99,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | account_number | | --- | @@ -103,7 +111,7 @@ Expected output: ## Example 5: Suffix wildcard pattern -Select fields ending with a pattern using suffix wildcards. +The following query selects fields ending with a pattern using suffix wildcards: ```sql source=accounts @@ -111,7 +119,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | firstname | lastname | | --- | --- | @@ -123,7 +131,7 @@ Expected output: ## Example 6: Contains wildcard pattern -Select fields containing a pattern using contains wildcards. +The following query selects fields containing a pattern using contains wildcards: ```sql source=accounts @@ -132,7 +140,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | account_number | firstname | address | balance | state | age | email | lastname | | --- | --- | --- | --- | --- | --- | --- | --- | @@ -141,7 +149,7 @@ Expected output: ## Example 7: Mixed delimiter syntax -Combine spaces and commas for flexible field specification. +The following query combines spaces and commas for flexible field specification: ```sql source=accounts @@ -149,7 +157,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | firstname | account_number | lastname | | --- | --- | --- | @@ -161,7 +169,7 @@ Expected output: ## Example 8: Field deduplication -Automatically prevents duplicate columns when wildcards expand to already specified fields. 
+The following query automatically prevents duplicate columns when wildcards expand to already specified fields: ```sql source=accounts @@ -169,7 +177,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results. Even though `firstname` is explicitly specified and also matches `*name`, it appears only once because of automatic deduplication: | firstname | lastname | | --- | --- | @@ -177,12 +185,10 @@ Expected output: | Hattie | Bond | | Nanette | Bates | | Dale | Adams | - -Note: Even though `firstname` is explicitly specified and would also match `*name`, it appears only once due to automatic deduplication. ## Example 9: Full wildcard selection -Select all available fields using `*` or `` `*` ``. This selects all fields defined in the index schema, including fields that may contain null values. +The following query selects all available fields using `*` or `` `*` ``. This expression selects all fields defined in the index schema, including fields that may contain null values. The `*` wildcard selects fields based on the index schema, not on the data content, so fields with null values are included in the result set. Use backticks (`` `*` ``) if the plain `*` does not return all expected fields: ```sql source=accounts @@ -191,17 +197,15 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | - -Note: The `*` wildcard selects fields based on the index schema, not on data content. Fields with null values are included in the result set. Use backticks `` `*` ` if the plain `*`` doesn't return all expected fields. ## Example 10: Wildcard exclusion -Remove fields using wildcard patterns with the minus (-) operator. +The following query removes fields using wildcard patterns containing the minus (`-`) operator: ```sql source=accounts @@ -209,7 +213,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | account_number | address | balance | gender | city | employer | state | age | email | | --- | --- | --- | --- | --- | --- | --- | --- | --- | @@ -219,6 +223,6 @@ Expected output: | 18 | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | -## See also +## Related documentation -- [table]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/table/) - Alias command with identical functionality \ No newline at end of file +- [`table`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/table/) - An alias command with identical functionality \ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/fillnull.md b/_sql-and-ppl/ppl/commands/fillnull.md new file mode 100644 index 00000000000..9fc98a533bf --- /dev/null +++ b/_sql-and-ppl/ppl/commands/fillnull.md @@ -0,0 +1,180 @@ +--- +layout: default +title: fillnull +parent: Commands +grand_parent: PPL +nav_order: 16 +--- + +# fillnull + +The `fillnull` command replaces `null` values in one or more fields of the search results with a specified value. + +The `fillnull` command is not rewritten to [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/). It is only executed on the coordinating node. 
+{: .note}
+
+## Syntax
+
+The `fillnull` command has the following syntax:
+
+```sql
+fillnull with <replacement> [in <field-list>]
+fillnull using <field> = <replacement> [, <field> = <replacement>]
+fillnull value=<replacement> [<field-list>]
+```
+
+The following syntax variations are available:
+
+* `with <replacement> in <field-list>` -- Applies the same value to the specified fields.
+* `using <field> = <replacement>, ...` -- Applies different values to different fields.
+* `value=<replacement> [<field-list>]` -- An alternative syntax with an optional space-delimited field list.
+
+## Parameters
+
+The `fillnull` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<replacement>` | Required | The value that replaces null values. |
+| `<field>` | Required (with `using` syntax) | The name of the field to which a specific replacement value is applied. |
+| `<field-list>` | Optional | A list of fields in which null values are replaced. You can specify the list as comma-delimited (using `with` or `using` syntax) or space-delimited (using `value=` syntax). By default, all fields are processed. |
+
+## Example 1: Replace null values in a single field with a specified value
+
+The following query replaces null values in the `email` field with `<not found>`:
+
+```sql
+source=accounts
+| fields email, employer
+| fillnull with '<not found>' in email
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | employer |
+| --- | --- |
+| amberduke@pyrami.com | Pyrami |
+| hattiebond@netagy.com | Netagy |
+| \<not found\> | Quility |
+| daleadams@boink.com | null |
+
+
+## Example 2: Replace null values in multiple fields with a specified value
+
+The following query replaces null values in both the `email` and `employer` fields with `<not found>`:
+
+```sql
+source=accounts
+| fields email, employer
+| fillnull with '<not found>' in email, employer
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | employer |
+| --- | --- |
+| amberduke@pyrami.com | Pyrami |
+| hattiebond@netagy.com | Netagy |
+| \<not found\> | Quility |
+| daleadams@boink.com | \<not found\> |
+
+
+## Example 3: Replace null values in all fields with a specified value
+
+The following query replaces null values in all fields when no `field-list` is specified:
+
+```sql
+source=accounts
+| fields email, employer
+| fillnull with '<not found>'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | employer |
+| --- | --- |
+| amberduke@pyrami.com | Pyrami |
+| hattiebond@netagy.com | Netagy |
+| \<not found\> | Quility |
+| daleadams@boink.com | \<not found\> |
+
+
+## Example 4: Replace null values in multiple fields with different specified values
+
+The following query shows how to use the `fillnull` command with different replacement values for multiple fields using the `using` syntax:
+
+```sql
+source=accounts
+| fields email, employer
+| fillnull using email = '<not found>', employer = '<no employer>'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | employer |
+| --- | --- |
+| amberduke@pyrami.com | Pyrami |
+| hattiebond@netagy.com | Netagy |
+| \<not found\> | Quility |
+| daleadams@boink.com | \<no employer\> |
+
+
+## Example 5: Replace null values in specific fields using the `value=` syntax
+
+The following query shows how to use the `fillnull` command with the `value=` syntax to replace null values in specific fields:
+
+```sql
+source=accounts
+| fields email, employer
+| fillnull value="<not found>" email employer
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | employer |
+| --- | --- |
+| amberduke@pyrami.com | Pyrami |
+| hattiebond@netagy.com | Netagy |
+| \<not found\> | Quility |
+| daleadams@boink.com | \<not found\> |
+
+
+## Example 6: Replace null values in all fields using the `value=` syntax
+
+When no `field-list` is specified, the replacement applies to all fields in the result:
+
+```sql
+source=accounts
+| fields email, employer
+| fillnull value='<not found>'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | employer |
+| --- | --- |
+| amberduke@pyrami.com | Pyrami |
+| hattiebond@netagy.com | Netagy |
+| \<not found\> | Quility |
+| daleadams@boink.com | \<not found\> |
+
+
+## Limitations
+
+The `fillnull` command has the following limitations:
+
+* When applying the same value to all fields without specifying field names, all fields must be the same type. For mixed types, use separate `fillnull` commands or explicitly specify fields.
+* The replacement value type must match all field types in the field list. When applying the same value to multiple fields, all fields must be the same type (all strings or all numeric). The following query shows the error that occurs when this rule is violated:
+
+  ```sql
+  # This FAILS - same value for mixed-type fields
+  source=accounts | fillnull value=0 firstname, age
+  # ERROR: fillnull failed: replacement value type INTEGER is not compatible with field 'firstname' (type: VARCHAR). The replacement value type must match the field type.
+  ```
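+
+As a workaround for the mixed-type restriction, chain separate `fillnull` commands, one per type. The following sketch is illustrative; the replacement values `'unknown'` and `0` are arbitrary placeholders rather than values defined by the command:
+
+```sql
+source=accounts
+| fillnull value='unknown' firstname
+| fillnull value=0 age
+```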
+ 
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/flatten.md b/_sql-and-ppl/ppl/commands/flatten.md
new file mode 100644
index 00000000000..142d1fde558
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/flatten.md
@@ -0,0 +1,101 @@
+---
+layout: default
+title: flatten
+parent: Commands
+grand_parent: PPL
+nav_order: 17
+---
+
+# flatten
+
+The `flatten` command converts a struct or object field into individual fields within a document.
+
+The resulting flattened fields are ordered lexicographically by their original key names. For example, if a struct contains the keys `b`, `c`, and `Z`, the flattened fields are ordered as `Z`, `b`, `c`.
+
+`flatten` should not be applied to arrays. To expand an array field into multiple rows, use the `expand` command. Note that arrays can be stored in non-array fields in OpenSearch; when flattening a field that contains a nested array, only the first element of the array is flattened.
+{: .important}
+
+## Syntax
+
+The `flatten` command has the following syntax:
+
+```sql
+flatten <field> [as (<alias-list>)]
+```
+
+## Parameters
+
+The `flatten` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field>` | Required | The field to be flattened. Only object and nested fields are supported. |
+| `<alias-list>` | Optional | A list of names to use instead of the original key names, separated by commas. If specifying more than one alias, enclose the list in parentheses. The number of aliases must match the number of keys in the struct, and the aliases must follow the lexicographical order of the corresponding original keys.
| + + +## Example: Flatten an object field using aliases + +Given the following index `my-index`: + +```json + {"message":{"info":"a","author":"e","dayOfWeek":1},"myNum":1} + {"message":{"info":"b","author":"f","dayOfWeek":2},"myNum":2} +``` + +with the following mapping: + +```json + { + "mappings": { + "properties": { + "message": { + "type": "object", + "properties": { + "info": { + "type": "keyword", + "index": "true" + }, + "author": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + }, + "index": "true" + }, + "dayOfWeek": { + "type": "long" + } + } + }, + "myNum": { + "type": "long" + } + } + } + } +``` + +The following query flattens a `message` object field and uses aliases to rename the flattened fields to `creator, dow, info`: + +```sql +source=my-index +| flatten message as (creator, dow, info) +``` +{% include copy.html %} + +The query returns the following results: + +| message | myNum | creator | dow | info | +| --- | --- | --- | --- | --- | +| {"info":"a","author":"e","dayOfWeek":1} | 1 | e | 1 | a | +| {"info":"b","author":"f","dayOfWeek":2} | 2 | f | 2 | b | + + +## Limitations + +The `flatten` command has the following limitations: + +* The `flatten` command may not function as expected if the fields to be flattened are not visible. For example, in the query `source=my-index | fields message | flatten message`, the `flatten message` command fails to execute as expected because some flattened fields, such as `message.info` and `message.author`, are hidden after the `fields message` command. As an alternative, use `source=my-index | flatten message`. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/grok.md b/_sql-and-ppl/ppl/commands/grok.md similarity index 50% rename from _sql-and-ppl/ppl/cmd/grok.md rename to _sql-and-ppl/ppl/commands/grok.md index a1f5872ee57..da2dcaf2320 100644 --- a/_sql-and-ppl/ppl/cmd/grok.md +++ b/_sql-and-ppl/ppl/commands/grok.md @@ -1,27 +1,36 @@ --- layout: default -title: "grok" -parent: "Commands" -grand_parent: "PPL" +title: grok +parent: Commands +grand_parent: PPL nav_order: 18 --- -# grok +# grok -The `grok` command parses a text field with a grok pattern and appends the results to the search results. +The `grok` command parses a text field using a Grok pattern and appends the extracted results to the search results. ## Syntax -Use the following syntax: +The `grok` command has the following syntax: + +```sql +grok +``` + +## Parameters -`grok ` -* `field`: mandatory. The field must be a text field. -* `pattern`: mandatory. The grok pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field. +The `grok` command supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Required | The text field to parse. | +| `` | Required | The Grok pattern used to extract new fields from the specified text field. If a new field name already exists, it overwrites the original field. | -## Example 1: Create the new field +## Example 1: Create a new field -The following example PPL query shows how to use `grok` to create new field `host` for each document. `host` will be the hostname after `@` in `email` field. Parsing a null field will return an empty string. +The following query shows how to use the `grok` command to create a new field, `host`, for each document. The `host` field captures the hostname following `@` in the `email` field. 
Parsing a null field returns an empty string: ```sql source=accounts @@ -30,7 +39,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | email | host | | --- | --- | @@ -40,9 +49,9 @@ Expected output: | daleadams@boink.com | boink.com | -## Example 2: Override the existing field +## Example 2: Override an existing field -The following example PPL query shows how to use `grok` to override the existing `address` field with street number removed. +The following query shows how to use the `grok` command to override the existing `address` field, removing the street number: ```sql source=accounts @@ -51,7 +60,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | address | | --- | @@ -63,7 +72,7 @@ Expected output: ## Example 3: Using grok to parse logs -The following example PPL query shows how to use `grok` to parse raw logs. +The following query parses raw logs: ```sql source=apache @@ -72,7 +81,7 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | COMMONAPACHELOG | timestamp | response | bytes | | --- | --- | --- | --- | @@ -82,6 +91,8 @@ Expected output: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | 28/Sep/2022:10:15:57 -0700 | 301 | 9481 | -## Limitations +## Limitations + +The `grok` command has the following limitations: -The grok command has the same limitations as the parse command, see [parse limitations]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/cmd/parse#limitations) for details. \ No newline at end of file +* The `grok` command has the same [limitations]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/parse#limitations) as the `parse` command. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/head.md b/_sql-and-ppl/ppl/commands/head.md new file mode 100644 index 00000000000..26b9e4f010f --- /dev/null +++ b/_sql-and-ppl/ppl/commands/head.md @@ -0,0 +1,94 @@ +--- +layout: default +title: head +parent: Commands +grand_parent: PPL +nav_order: 19 +--- + +# head + +The `head` command returns the first N lines from a search result. + +The `head` command is not rewritten to [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/index/). It is only executed on the coordinating node. +{: .note} + +## Syntax + +The `head` command has the following syntax: + +```sql +head [] [from ] +``` + +## Parameters + +The `head` command supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Optional | The number of results to return. Must be an integer. Default is `10`. | +| `` | Optional | The number of results to skip (used with the `from` keyword). Must be an integer. Default is `0`. 
| + + +## Example 1: Retrieve the first set of results using the default size + +The following query returns the default number of search results (10): + +```sql +source=accounts +| fields firstname, age +| head +``` +{% include copy.html %} + +The query returns the following results: + +| firstname | age | +| --- | --- | +| Amber | 32 | +| Hattie | 36 | +| Nanette | 28 | +| Dale | 33 | + + +## Example 2: Retrieve a specified number of results + +The following query returns the first 3 search results: + +```sql +source=accounts +| fields firstname, age +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| firstname | age | +| --- | --- | +| Amber | 32 | +| Hattie | 36 | +| Nanette | 28 | + + +## Example 3: Retrieve the first N results after an offset M + +The following query demonstrates how to retrieve the first 3 results starting with the second result from the `accounts` index: + +```sql +source=accounts +| fields firstname, age +| head 3 from 1 +``` +{% include copy.html %} + +The query returns the following results: + +| firstname | age | +| --- | --- | +| Hattie | 36 | +| Nanette | 28 | +| Dale | 33 | + + diff --git a/_sql-and-ppl/ppl/commands/index.md b/_sql-and-ppl/ppl/commands/index.md new file mode 100644 index 00000000000..1941752b36d --- /dev/null +++ b/_sql-and-ppl/ppl/commands/index.md @@ -0,0 +1,17 @@ +--- +layout: default +title: Commands +parent: PPL +nav_order: 20 +has_children: true +redirect_from: + - /search-plugins/sql/ppl/functions/ + - /observability-plugin/ppl/commands/ + - /search-plugins/ppl/commands/ + - /search-plugins/ppl/functions/ + - /sql-and-ppl/ppl/functions/ +--- + +# Commands + +PPL supports most common [SQL functions](https://docs.opensearch.org/latest/search-plugins/sql/functions/), including [relevance search](https://docs.opensearch.org/latest/search-plugins/sql/full-text/), but also introduces several more functions (called _commands_), which are available in PPL only. diff --git a/_sql-and-ppl/ppl/commands/join.md b/_sql-and-ppl/ppl/commands/join.md new file mode 100644 index 00000000000..dd757e75d19 --- /dev/null +++ b/_sql-and-ppl/ppl/commands/join.md @@ -0,0 +1,218 @@ +--- +layout: default +title: join +parent: Commands +grand_parent: PPL +nav_order: 21 +--- + +# join + +The `join` command combines two datasets. The left side can be an index or the results of piped commands, while the right side can be either an index or a subsearch. + +## Syntax + +The `join` command supports basic and extended syntax options. + +### Basic syntax + +```sql +[joinType] join [left = ] [right = ] (on | where) +``` + +When using aliases, `left` must appear before `right`. 
+{: .note} + +The following are examples of the basic `join` command syntax: + +```sql +source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | inner join left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | cross join left = l right = r on 1=1 table2 +source = table1 | left semi join left = l right = r on l.a = r.a table2 +source = table1 | left anti join left = l right = r on l.a = r.a table2 +source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] +source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c +source = table1 | inner join on a = c table2 | fields a, b, c, d +source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a +source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a +source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a +``` + +#### Basic syntax parameters + +The basic `join` syntax supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. | +| `` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. | +| `joinType` | Optional | The type of join to perform. Valid values are `left`, `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. | +| `left` | Optional | An alias for the left dataset (typically a subsearch), used to avoid ambiguous field names. Specify as `left = `. | +| `right` | Optional | An alias for the right dataset (typically, a subsearch), used to avoid ambiguous field names. Specify as `right = `. | + +### Extended syntax + +```sql +join [type=] [overwrite=] [max=n] ( | [left = ] [right = ] (on | where) ) +``` + +The following are examples of the extended `join` command syntax: + +```sql +source = table1 | join type=outer left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | join type=left left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | join type=inner max=1 left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | join a table2 | fields a, b, c +source = table1 | join a, b table2 | fields a, b, c +source = table1 | join type=outer a b table2 | fields a, b, c +source = table1 | join type=inner max=1 a, b table2 | fields a, b, c +source = table1 | join type=left overwrite=false max=0 a, b [source=table2 | rename d as b] | fields a, b, c +``` + +#### Extended syntax parameters + +The extended `join` syntax supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. | +| `` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. | +| `type` | Optional | The join type when using extended syntax. 
Valid values are `left`, `outer` (same as `left`), `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. | +| `` | Optional | A list of fields used to build the join criteria. These fields must exist in both datasets. If not specified, all fields common to both datasets are used as join keys. | +| `overwrite` | Optional | Applicable only when `join-field-list` is specified. Specifies whether fields from the right dataset with duplicate names should replace corresponding fields in the main search results. Default is `true`. | +| `max` | Optional | The maximum number of subsearch results to join with each row in the main search. Default is `0` (unlimited). | +| `left` | Optional | An alias for the left dataset (typically a subsearch), used to avoid ambiguous field names. Specify as `left = `. | +| `right` | Optional | An alias for the right dataset (typically, a subsearch), used to avoid ambiguous field names. Specify as `right = `. | + + +## Configuration + +The `join` command behavior is configured using the `plugins.ppl.join.subsearch_maxout` setting, which specifies the maximum number of rows from the subsearch to join against. Default is `50000`. A value of `0` indicates that the restriction is unlimited. + +To update the setting, send the following request: + +```json +PUT /_plugins/_query/settings +{ + "persistent": { + "plugins.ppl.join.subsearch_maxout": "5000" + } +} +``` +{% include copy-curl.html %} + +## Example 1: Join two indexes + +The following query uses the basic `join` syntax to join two indexes: + +```sql +source = state_country +| inner join left=a right=b ON a.name = b.name occupation +| stats avg(salary) by span(age, 10) as age_span, b.country +``` +{% include copy.html %} + +The query returns the following results: + +| avg(salary) | age_span | b.country | +| --- | --- | --- | +| 120000.0 | 40 | USA | +| 105000.0 | 20 | Canada | +| 0.0 | 40 | Canada | +| 70000.0 | 30 | USA | +| 100000.0 | 70 | England | + + +## Example 2: Join with subsearch + +The following query combines a dataset with a subsearch using the basic `join` syntax: + +```sql +source = state_country as a +| where country = 'USA' OR country = 'England' +| left join ON a.name = b.name [ source = occupation +| where salary > 0 +| fields name, country, salary +| sort salary +| head 3 ] as b +| stats avg(salary) by span(age, 10) as age_span, b.country +``` +{% include copy.html %} + +The query returns the following results: + +| avg(salary) | age_span | b.country | +| --- | --- | --- | +| null | 40 | null | +| 70000.0 | 30 | USA | +| 100000.0 | 70 | England | + + +## Example 3: Join using a field list + +The following query uses the extended syntax and specifies a list of fields for the join criteria: + +```sql +source = state_country +| where country = 'USA' OR country = 'England' +| join type=left overwrite=true name [ source = occupation +| where salary > 0 +| fields name, country, salary +| sort salary +| head 3 ] +| stats avg(salary) by span(age, 10) as age_span, country +``` +{% include copy.html %} + +The query returns the following results: + +| avg(salary) | age_span | country | +| --- | --- | --- | +| null | 40 | null | +| 70000.0 | 30 | USA | +| 100000.0 | 70 | England | + + +## Example 4: Join with additional options + +The following query uses the extended syntax and optional parameters for more control over the join operation: + +```sql +source = state_country +| join type=inner overwrite=false max=1 name occupation +| stats avg(salary) by span(age, 
10) as age_span, country +``` +{% include copy.html %} + +The query returns the following results: + +| avg(salary) | age_span | country | +| --- | --- | --- | +| 120000.0 | 40 | USA | +| 100000.0 | 70 | USA | +| 105000.0 | 20 | Canada | +| 70000.0 | 30 | USA | + + +## Limitations + +The `join` command has the following limitations: + +* **Field name ambiguity in basic syntax** – When fields from the left and right datasets share the same name, the field names in the output are ambiguous. To resolve this, conflicting fields are renamed to `.id`, or `.id` if no alias is specified. + + The following table demonstrates how field name conflicts are resolved when both `table1` and `table2` contain a field named `id`. + + | Query | Output | + | --- | --- | + | `source=table1 | join left=t1 right=t2 on t1.id=t2.id table2 | eval a = 1` | `t1.id, t2.id, a` | + | `source=table1 | join on table1.id=table2.id table2 | eval a = 1` | `table1.id, table2.id, a` | + | `source=table1 | join on table1.id=t2.id table2 as t2 | eval a = 1` | `table1.id, t2.id, a` | + | `source=table1 | join right=tt on table1.id=t2.id [ source=table2 as t2 | eval b = id ] | eval a = 1` | `table1.id, tt.id, tt.b, a` | + +* **Field deduplication in extended syntax** – When using the extended syntax with a field list, duplicate field names in the output are deduplicated according to the `overwrite` option. + +* **Join type availability** – The join types `inner`, `left`, `outer` (alias of `left`), `semi`, and `anti` are enabled by default. The performance-sensitive join types `right`, `full`, and `cross` are disabled by default. To enable these types, set `plugins.calcite.all_join_types.allowed` to `true`. diff --git a/_sql-and-ppl/ppl/commands/kmeans.md b/_sql-and-ppl/ppl/commands/kmeans.md new file mode 100644 index 00000000000..b1167c8ff85 --- /dev/null +++ b/_sql-and-ppl/ppl/commands/kmeans.md @@ -0,0 +1,57 @@ +--- +layout: default +title: kmeans +parent: Commands +grand_parent: PPL +nav_order: 22 +--- + +# kmeans (Deprecated) + +The `kmeans` command is deprecated in favor of the [`ml` command]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/ml/). +{: .warning} + +The `kmeans` command applies the k-means algorithm in the ml-commons plugin on the search results returned by a PPL command. + +To use the `kmeans` command, `plugins.calcite.enabled` must be set to `false`. +{: .note} + +## Syntax + +The `kmeans` command has the following syntax: + +```sql +kmeans +``` + +## Parameters + +The `kmeans` command supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Optional | The number of clusters you want to group your data points into. Default is `2`. | +| `` | Optional | The number of iterations. Default is `10`. | +| `` | Optional | The distance type. Valid values are `COSINE`, `L1`, and `EUCLIDEAN`. Default is `EUCLIDEAN`. 
|
+
+
+## Example: Clustering of the Iris dataset
+
+The following query classifies three Iris species (Iris setosa, Iris virginica, and Iris versicolor) based on the combination of four features measured from each sample (the lengths and widths of sepals and petals):
+
+```sql
+source=iris_data
+| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm
+| kmeans centroids=3
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID |
+| --- | --- | --- | --- | --- |
+| 5.1 | 3.5 | 1.4 | 0.2 | 1 |
+| 5.6 | 3.0 | 4.1 | 1.3 | 0 |
+| 6.7 | 2.5 | 5.8 | 1.8 | 2 |
+
+
diff --git a/_sql-and-ppl/ppl/commands/lookup.md b/_sql-and-ppl/ppl/commands/lookup.md
new file mode 100644
index 00000000000..d22cd582c36
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/lookup.md
@@ -0,0 +1,123 @@
+---
+layout: default
+title: lookup
+parent: Commands
+grand_parent: PPL
+nav_order: 23
+---
+
+# lookup
+
+The `lookup` command enriches search data by adding or replacing values from a lookup index (dimension table). It allows you to extend fields in your index with values from a dimension table, appending or replacing values when the lookup condition matches. Compared with the `join` command, `lookup` is better suited for enriching source data with a static dataset.
+
+## Syntax
+
+The `lookup` command has the following syntax:
+
+```sql
+lookup <lookupIndex> (<lookupMappingField> [as <sourceMappingField>])... [(replace | append) (<inputField> [as <outputField>])...]
+```
+
+The following are examples of the `lookup` command syntax:
+
+```sql
+source = table1 | lookup table2 id
+source = table1 | lookup table2 id, name
+source = table1 | lookup table2 id as cid, name
+source = table1 | lookup table2 id as cid, name replace dept as department
+source = table1 | lookup table2 id as cid, name replace dept as department, city as location
+source = table1 | lookup table2 id as cid, name append dept as department
+source = table1 | lookup table2 id as cid, name append dept as department, city as location
+```
+
+## Parameters
+
+The `lookup` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<lookupIndex>` | Required | The name of the lookup index (dimension table). |
+| `<lookupMappingField>` | Required | A key in the lookup index used for matching, similar to a join key in the right table. Specify multiple fields as a comma-separated list. |
+| `<sourceMappingField>` | Optional | A key from the source data (left side) used for matching, similar to a join key in the left table. Default is `lookupMappingField`. |
+| `<inputField>` | Optional | A field in the lookup index whose matched values are applied to the results (output). Specify multiple fields as a comma-separated list. If not specified, all fields except `lookupMappingField` from the lookup index are applied to the results. |
+| `<outputField>` | Optional | The name of the field in the results (output) in which matched values are placed. Specify multiple fields as a comma-separated list. If the `outputField` specifies an existing field in the source query, its values are replaced or appended with matched values from the `inputField`. If the field specified in the `outputField` is not an existing field, a new field is added to the results when using `replace` or the operation fails when using `append`. |
+| `(replace \| append)` | Optional | Specifies how matched values are applied to the output. `replace` overwrites existing values with matched values from the lookup index.
`append` fills only missing values in the results with matched values from the lookup index. Default is `replace`. | + +## Example 1: Replace existing values + +The following query uses the `lookup` command with the `replace` strategy to overwrite existing values: + +```sql +source = worker + | LOOKUP work_information uid AS id REPLACE department + | fields id, name, occupation, country, salary, department +``` +{% include copy.html %} + +The query returns the following results: + +| id | name | occupation | country | salary | department | +| --- | --- | --- | --- | --- | --- | +| 1000 | Jake | Engineer | England | 100000 | IT | +| 1001 | Hello | Artist | USA | 70000 | null | +| 1002 | John | Doctor | Canada | 120000 | DATA | +| 1003 | David | Doctor | null | 120000 | HR | +| 1004 | David | null | Canada | 0 | null | +| 1005 | Jane | Scientist | Canada | 90000 | DATA | + + +## Example 2: Append missing values + +The following query uses the `lookup` command with the `append` strategy to append missing values only: + +```sql +source = worker + | LOOKUP work_information uid AS id APPEND department + | fields id, name, occupation, country, salary, department +``` +{% include copy.html %} + + +## Example 3: No input field specified + +The following query uses the `lookup` command without specifying an `inputField`, which adds all fields from the lookup index to the results: + +```sql + source = worker + | LOOKUP work_information uid AS id, name + | fields id, name, occupation, country, salary, department +``` +{% include copy.html %} + +The query returns the following results: + +| id | name | country | salary | department | occupation | +| --- | --- | --- | --- | --- | --- | +| 1000 | Jake | England | 100000 | IT | Engineer | +| 1001 | Hello | USA | 70000 | null | null | +| 1002 | John | Canada | 120000 | DATA | Scientist | +| 1003 | David | null | 120000 | HR | Doctor | +| 1004 | David | Canada | 0 | null | null | +| 1005 | Jane | Canada | 90000 | DATA | Engineer | + +## Example 4: Add matched values to a new field + +The following query places matched values into a new field specified by `outputField`: + +```sql + source = worker + | LOOKUP work_information name REPLACE occupation AS new_col + | fields id, name, occupation, country, salary, new_col +``` +{% include copy.html %} + +The query returns the following results: + +| id | name | occupation | country | salary | new_col | +| --- | --- | --- | --- | --- | --- | +| 1003 | David | Doctor | null | 120000 | Doctor | +| 1004 | David | null | Canada | 0 | Doctor | +| 1001 | Hello | Artist | USA | 70000 | null | +| 1000 | Jake | Engineer | England | 100000 | Engineer | +| 1005 | Jane | Scientist | Canada | 90000 | Engineer | +| 1002 | John | Doctor | Canada | 120000 | Scientist | \ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/ml.md b/_sql-and-ppl/ppl/commands/ml.md new file mode 100644 index 00000000000..e0bc25ff52d --- /dev/null +++ b/_sql-and-ppl/ppl/commands/ml.md @@ -0,0 +1,189 @@ +--- +layout: default +title: ml +parent: Commands +grand_parent: PPL +nav_order: 24 +--- + +# ml + +The `ml` command applies machine learning algorithms from the ML Commons plugin to the search results returned by a PPL command. It supports various machine learning operations, including anomaly detection and clustering. The command can perform train, predict, or combined train-and-predict operations, depending on the algorithm and specified action. + +To use the `ml` command, `plugins.calcite.enabled` must be set to `false`. 
+{: .note} + +The `ml` command supports the following algorithms: + +- **Random Cut Forest (RCF)** for anomaly detection, with support for both time-series and non-time-series data + +- **K-means** for clustering data points into groups + +## Syntax + +The `ml` command supports different syntax options, depending on the algorithm. + +### Anomaly detection for time-series data + +Use this syntax to detect anomalies in time-series data. This method uses the Random Cut Forest (RCF) algorithm optimized for sequential data patterns: + +```sql +ml action='train' algorithm='rcf' +``` + +### Parameters + +The fixed in time RCF algorithm supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `number_of_trees` | Optional | The number of trees in the forest. Default is `30`. | +| `shingle_size` | Optional | The number of records in a shingle. A shingle is a consecutive sequence of the most recent records. Default is `8`. | +| `sample_size` | Optional | The sample size used by the stream samplers in this forest. Default is `256`. | +| `output_after` | Optional | The number of points required by the stream samplers before results are returned. Default is `32`. | +| `time_decay` | Optional | The decay factor used by the stream samplers in this forest. Default is `0.0001`. | +| `anomaly_rate` | Optional | The anomaly rate. Default is `0.005`. | +| `time_field` | Required | The time field for RCF to use as time-series data. | +| `date_format` | Optional | The format for the `time_field`. Default is `yyyy-MM-dd HH:mm:ss`. | +| `time_zone` | Optional | The time zone for the `time_field`. Default is `UTC`. | +| `category_field` | Optional | The category field used to group input values. The predict operation is applied to each category independently. | + +### Anomaly detection for non-time-series data + +Use this syntax to detect anomalies in data where the order doesn't matter. This method uses the Random Cut Forest (RCF) algorithm optimized for independent data points: + +```sql +ml action='train' algorithm='rcf' +``` + +### Parameters + +The batch RCF algorithm supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `number_of_trees` | Optional | The number of trees in the forest. Default is `30`. | +| `sample_size` | Optional | The number of random samples provided to each tree from the training dataset. Default is `256`. | +| `output_after` | Optional | The number of points required by the stream samplers before results are returned. Default is `32`. | +| `training_data_size` | Optional | The size of the training dataset. Default is the full dataset size. | +| `anomaly_score_threshold` | Optional | The anomaly score threshold. Default is `1.0`. | +| `category_field` | Optional | The category field used to group input values. The predict operation is applied to each category independently. | + + +### K-means clustering + +Use this syntax to group data points into clusters based on similarity: + +```sql +ml action='train' algorithm='kmeans' +``` + +### Parameters + +The k-means clustering algorithm supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `centroids` | Optional | The number of clusters you want to group your data points into. Default is `2`. | +| `iterations` | Optional | The number of iterations. Default is `10`. | +| `distance_type` | Optional | The distance type. Valid values are `COSINE`, `L1`, and `EUCLIDEAN`. 
Default is `EUCLIDEAN`. | + + +## Example 1: Time-series anomaly detection + +This example trains an RCF model and uses it to detect anomalies in time-series ridership data: + +```sql +source=nyc_taxi +| fields value, timestamp +| ml action='train' algorithm='rcf' time_field='timestamp' +| where value=10844.0 +``` +{% include copy.html %} + +The query returns the following results: + +| value | timestamp | score | anomaly_grade | +| --- | --- | --- | --- | +| 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | + + +## Example 2: Time-series anomaly detection by category + +This example trains an RCF model and uses it to detect anomalies in time-series ridership data across multiple category values: + +```sql +source=nyc_taxi +| fields category, value, timestamp +| ml action='train' algorithm='rcf' time_field='timestamp' category_field='category' +| where value=10844.0 or value=6526.0 +``` +{% include copy.html %} + +The query returns the following results: + +| category | value | timestamp | score | anomaly_grade | +| --- | --- | --- | --- | --- | +| night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | +| day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 | + + +## Example 3: Non-time-series anomaly detection + +This example trains an RCF model and uses it to detect anomalies in non-time-series ridership data: + +```sql +source=nyc_taxi +| fields value +| ml action='train' algorithm='rcf' +| where value=10844.0 +``` +{% include copy.html %} + +The query returns the following results: + +| value | score | anomalous | +| --- | --- | --- | +| 10844.0 | 0.0 | False | + + +## Example 4: Non-time-series anomaly detection by category + +This example trains an RCF model and uses it to detect anomalies in non-time-series ridership data across multiple category values: + +```sql +source=nyc_taxi +| fields category, value +| ml action='train' algorithm='rcf' category_field='category' +| where value=10844.0 or value=6526.0 +``` +{% include copy.html %} + +The query returns the following results: + +| category | value | score | anomalous | +| --- | --- | --- | --- | +| night | 10844.0 | 0.0 | False | +| day | 6526.0 | 0.0 | False | + + +## Example 5: K-means clustering of the Iris dataset + +This example uses k-means clustering to classify three Iris species (Iris setosa, Iris virginica, and Iris versicolor) based on the combination of four features measured from each sample (the lengths and widths of sepals and petals): + +```sql +source=iris_data +| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm +| ml action='train' algorithm='kmeans' centroids=3 +``` +{% include copy.html %} + +The query returns the following results: + +| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID | +| --- | --- | --- | --- | --- | +| 5.1 | 3.5 | 1.4 | 0.2 | 1 | +| 5.6 | 3.0 | 4.1 | 1.3 | 0 | +| 6.7 | 2.5 | 5.8 | 1.8 | 2 | + + diff --git a/_sql-and-ppl/ppl/commands/multisearch.md b/_sql-and-ppl/ppl/commands/multisearch.md new file mode 100644 index 00000000000..2988f922e01 --- /dev/null +++ b/_sql-and-ppl/ppl/commands/multisearch.md @@ -0,0 +1,157 @@ +--- +layout: default +title: multisearch +parent: Commands +grand_parent: PPL +nav_order: 25 +--- + +# multisearch + + +The `multisearch` command runs multiple subsearches and merges their results. It allows you to combine data from different queries on the same or different sources. You can optionally apply subsequent processing, such as aggregation or sorting, to the combined results. 
Each subsearch can have different filtering criteria, data transformations, and field selections.
+
+The `multisearch` command is particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria. The command supports timestamp-based result interleaving when working with time-series data.
+
+Use `multisearch` for the following purposes:
+
+* **Comparative analysis**: Compare metrics across different segments, regions, or time periods.
+* **Success rate monitoring**: Calculate success rates by comparing successful to total operations.
+* **Multi-source data combination**: Merge data from different indexes or apply different filters to the same source.
+* **A/B testing analysis**: Combine results from different test groups for comparison.
+* **Time-series data merging**: Interleave events from multiple sources based on timestamps.
+
+
+## Syntax
+
+The `multisearch` command has the following syntax:
+
+```sql
+multisearch <subsearch1> <subsearch2> [<subsearch3> ...]
+```
+
+The following are examples of the `multisearch` command syntax:
+
+```sql
+| multisearch [search source=table | where condition1] [search source=table | where condition2]
+| multisearch [search source=index1 | fields field1, field2] [search source=index2 | fields field1, field2]
+| multisearch [search source=table | where status="success"] [search source=table | where status="error"]
+```
+
+## Parameters
+
+The `multisearch` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<subsearch>` | Required | At least two subsearches are required. Each subsearch must be enclosed in square brackets and start with the `search` keyword (`[search source=index \| <commands>]`). All PPL commands are supported within subsearches. |
+| `<subsequent-commands>` | Optional | Commands applied to the merged results after the multisearch operation (for example, `stats`, `sort`, or `head`). |
+
+## Example 1: Combining age groups for demographic analysis
+
+This example demonstrates how to merge customers from different age segments into a unified dataset. It combines `young` and `adult` customers into a single result set and adds categorization labels for further analysis:
+
+```sql
+| multisearch [search source=accounts
+| where age < 30
+| eval age_group = "young"
+| fields firstname, age, age_group] [search source=accounts
+| where age >= 30
+| eval age_group = "adult"
+| fields firstname, age, age_group]
+| sort age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | age | age_group |
+| --- | --- | --- |
+| Nanette | 28 | young |
+| Amber | 32 | adult |
+| Dale | 33 | adult |
+| Hattie | 36 | adult |
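+
+The same pattern supports the success rate monitoring use case listed earlier. The following sketch is illustrative only; it assumes a hypothetical `requests` index with a `status` field and applies `stats` as a subsequent processing command to count each bucket:
+
+```sql
+| multisearch [search source=requests
+| where status = "success"
+| eval bucket = "successful"] [search source=requests
+| eval bucket = "total"]
+| stats count() as cnt by bucket
+```
+
+Comparing the `successful` count with the `total` count yields the success rate.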
## Syntax

The `multisearch` command has the following syntax:

```sql
multisearch <subsearch1> <subsearch2> [<subsearch3> ...]
```

The following are examples of the `multisearch` command syntax:

```sql
| multisearch [search source=table | where condition1] [search source=table | where condition2]
| multisearch [search source=index1 | fields field1, field2] [search source=index2 | fields field1, field2]
| multisearch [search source=table | where status="success"] [search source=table | where status="error"]
```

## Parameters

The `multisearch` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<subsearch>` | Required | At least two subsearches are required. Each subsearch must be enclosed in square brackets and start with the `search` keyword (`[search source=index \| <commands>]`). All PPL commands are supported within subsearches. |
| `<subsequent-commands>` | Optional | Commands applied to the merged results after the multisearch operation (for example, `stats`, `sort`, or `head`). |

## Example 1: Combining age groups for demographic analysis

This example demonstrates how to merge customers from different age segments into a unified dataset. It combines `young` and `adult` customers into a single result set and adds categorization labels for further analysis:

```sql
| multisearch [search source=accounts
| where age < 30
| eval age_group = "young"
| fields firstname, age, age_group] [search source=accounts
| where age >= 30
| eval age_group = "adult"
| fields firstname, age, age_group]
| sort age
```
{% include copy.html %}

The query returns the following results:

| firstname | age | age_group |
| --- | --- | --- |
| Nanette | 28 | young |
| Amber | 32 | adult |
| Dale | 33 | adult |
| Hattie | 36 | adult |

## Example 2: Segmenting accounts by balance tier

This example demonstrates how to create account segments based on balance thresholds for comparative analysis. It separates `high_balance` accounts from `regular` accounts and labels them for easy comparison:

```sql
| multisearch [search source=accounts
| where balance > 20000
| eval query_type = "high_balance"
| fields firstname, balance, query_type] [search source=accounts
| where balance > 0 AND balance <= 20000
| eval query_type = "regular"
| fields firstname, balance, query_type]
| sort balance desc
```
{% include copy.html %}

The query returns the following results:

| firstname | balance | query_type |
| --- | --- | --- |
| Amber | 39225 | high_balance |
| Nanette | 32838 | high_balance |
| Hattie | 5686 | regular |
| Dale | 4180 | regular |

## Example 3: Merging time-series data from multiple sources

This example demonstrates how to combine time-series data from different sources while maintaining chronological order. The results are automatically sorted by timestamp to create a unified timeline:

```sql
| multisearch [search source=time_data
| where category IN ("A", "B")] [search source=time_data2
| where category IN ("E", "F")]
| fields @timestamp, category, value, timestamp
| head 5
```
{% include copy.html %}

The query returns the following results:

| @timestamp | category | value | timestamp |
| --- | --- | --- | --- |
| 2025-08-01 04:00:00 | E | 2001 | 2025-08-01 04:00:00 |
| 2025-08-01 03:47:41 | A | 8762 | 2025-08-01 03:47:41 |
| 2025-08-01 02:30:00 | F | 2002 | 2025-08-01 02:30:00 |
| 2025-08-01 01:14:11 | B | 9015 | 2025-08-01 01:14:11 |
| 2025-08-01 01:00:00 | E | 2003 | 2025-08-01 01:00:00 |

## Example 4: Handling missing fields across subsearches

This example demonstrates how `multisearch` handles schema differences when subsearches return different fields. When one subsearch includes a field that others don't have, missing values are automatically filled with null values:

```sql
| multisearch [search source=accounts
| where age < 30
| eval young_flag = "yes"
| fields firstname, age, young_flag] [search source=accounts
| where age >= 30
| fields firstname, age]
| sort age
```
{% include copy.html %}

The query returns the following results:

| firstname | age | young_flag |
| --- | --- | --- |
| Nanette | 28 | yes |
| Amber | 32 | null |
| Dale | 33 | null |
| Hattie | 36 | null |

## Limitations

The `multisearch` command has the following limitations:

* At least two subsearches must be specified.
* When fields with the same name exist across subsearches but have incompatible types, the system automatically resolves conflicts by renaming the conflicting fields. The first occurrence retains the original name, while subsequent conflicting fields are renamed using a numeric suffix (for example, `age` becomes `age0`, `age1`, and so on). This ensures that all data is preserved while maintaining schema consistency. A sketch of this behavior follows this list.
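The following is a minimal sketch of the type-conflict renaming, assuming a hypothetical second index, `accounts_archive`, in which `age` is mapped as a string rather than a number:

```sql
| multisearch [search source=accounts
| fields firstname, age] [search source=accounts_archive
| fields firstname, age]
```

Because the two `age` fields have incompatible types, the merged schema keeps the numeric `age` column from the first subsearch and renames the conflicting column from the second subsearch to `age0`.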
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/parse.md b/_sql-and-ppl/ppl/commands/parse.md
new file mode 100644
index 00000000000..6d9d83d99ec
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/parse.md
@@ -0,0 +1,136 @@
---
layout: default
title: parse
parent: Commands
grand_parent: PPL
nav_order: 26
---

# parse

The `parse` command extracts information from a text field using a regular expression and adds the extracted information to the search results. It uses Java regex patterns. For more information, see the [Java regular expression documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).

## rex and parse commands compared

The `rex` and `parse` commands both extract information from text fields using Java regular expressions with named capture groups. To compare the capabilities of the `rex` and `parse` commands, see the [`rex` command documentation]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/rex/).

## Syntax

The `parse` command has the following syntax:

```sql
parse <field> <pattern>
```

## Parameters

The `parse` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<field>` | Required | The text field to parse. |
| `<pattern>` | Required | The regular expression pattern used to extract new fields from the specified text field. If a field with the same name already exists, its values are replaced. |

## Regular expression

The regular expression pattern is used to match the whole text field of each document based on the [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). Each named capture group in the expression becomes a new `STRING` field.

## Example 1: Create a new field

The following query extracts the hostname from email addresses. The regex pattern `.+@(?<host>.+)` captures all characters after the `@` symbol and creates a new `host` field. When parsing a null field, the result is an empty string:

```sql
source=accounts
| parse email '.+@(?<host>.+)'
| fields email, host
```
{% include copy.html %}

The query returns the following results:

| email | host |
| --- | --- |
| amberduke@pyrami.com | pyrami.com |
| hattiebond@netagy.com | netagy.com |
| null |  |
| daleadams@boink.com | boink.com |

## Example 2: Override an existing field

The following query replaces the `address` field with only the street name, removing the street number. The regex pattern `\d+ (?<address>.+)` matches digits followed by a space, then captures the remaining text as the new `address` value:

```sql
source=accounts
| parse address '\d+ (?<address>.+)'
| fields address
```
{% include copy.html %}

The query returns the following results:

| address |
| --- |
| Holmes Lane |
| Bristol Street |
| Madison Street |
| Hutchinson Court |

## Example 3: Parse, filter, and sort address components

The following query extracts street numbers and names from addresses, then filters for street numbers greater than 500 and sorts them numerically. The regex pattern `(?<streetNumber>\d+) (?<street>.+)` captures the numeric part as `streetNumber` and the remaining text as `street`:

```sql
source=accounts
| parse address '(?<streetNumber>\d+) (?<street>.+)'
| where cast(streetNumber as int) > 500
| sort num(streetNumber)
| fields streetNumber, street
```
{% include copy.html %}

The query returns the following results:

| streetNumber | street |
| --- | --- |
| 671 | Bristol Street |
| 789 | Madison Street |
| 880 | Holmes Lane |

## Limitations

The `parse` command has the following limitations:

- Fields created by the `parse` command cannot be parsed again. For example, the following command does not function as intended:

  ```sql
  source=accounts | parse address '\d+ (?<street>.+)' | parse street '\w+ (?<road>\w+)'
  ```

- Fields created by the `parse` command cannot be overridden by other commands. For example, in the following query, the `where` clause does not match any documents because `street` cannot be overridden:

  ```sql
  source=accounts | parse address '\d+ (?<street>.+)' | eval street='1' | where street='1'
  ```

- The source text field used by the `parse` command cannot be overridden. For example, in the following query, the `street` field is not parsed correctly because `address` is overridden:

  ```sql
  source=accounts | parse address '\d+ (?<street>.+)' | eval address='1'
  ```

- Fields created by the `parse` command cannot be filtered or sorted after they are used in the `stats` command. For example, in the following query, the `where` clause does not function as intended:

  ```sql
  source=accounts | parse email '.+@(?<host>.+)' | stats avg(age) by host | where host=pyrami.com
  ```

- Fields created by the `parse` command do not appear in the final results unless the original source field is included in the `fields` command. For example, the following query does not return the parsed field `host` unless the source field `email` is explicitly included:

  ```sql
  source=accounts | parse email '.+@(?<host>.+)' | fields email, host
  ```

diff --git a/_sql-and-ppl/ppl/cmd/patterns.md b/_sql-and-ppl/ppl/commands/patterns.md
similarity index 51%
rename from _sql-and-ppl/ppl/cmd/patterns.md
rename to _sql-and-ppl/ppl/commands/patterns.md
index e2fddedc05d..6b4c9ad1315 100644
--- a/_sql-and-ppl/ppl/cmd/patterns.md
+++ b/_sql-and-ppl/ppl/commands/patterns.md
@@ -1,60 +1,104 @@
---
layout: default
-title: "patterns"
-parent: "Commands"
-grand_parent: "PPL"
+title: patterns
+parent: Commands
+grand_parent: PPL
nav_order: 27
---

# patterns

The `patterns` command extracts log patterns from a text field and appends the results to the search results. Grouping logs by pattern simplifies aggregating statistics from large volumes of log data for analysis and troubleshooting. You can choose from the following log-parsing methods to achieve high pattern-grouping accuracy:

* `simple_pattern`: A parsing method that uses [Java regular expressions](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
* `brain`: An automatic log-grouping method that provides high grouping accuracy while preserving semantic meaning.
+ +The `patterns` command supports the following modes: + +* `label`: Returns individual pattern labels. +* `aggregation`: Returns aggregated results for the target field. + +The command identifies variable parts of log messages (such as timestamps, numbers, IP addresses, and unique identifiers) and replaces them with `<*>` placeholders to create reusable patterns. For example, email addresses like `amberduke@pyrami.com` and `hattiebond@netagy.com` are replaced with the pattern `<*>@<*>.<*>`. -The `patterns` command extracts log patterns from a text field and appends the results to the search results. Grouping logs by their patterns makes it easier to aggregate stats from large volumes of log data for analysis and troubleshooting. -`patterns` command allows users to select different log parsing algorithms to get high log pattern grouping accuracy. Two pattern methods are supported: `simple_pattern` and `brain`. -`simple_pattern` algorithm is basically a regex parsing method compared to `brain` algorithm is an automatic log grouping algorithm with high grouping accuracy and keeps semantic meaning. -`patterns` command supports two modes: `label` and `aggregation`. `label` mode returns individual pattern labels. `aggregation` mode returns aggregated results on target field. -Calcite engine by default labels the variables with '\<*\>' placeholder. If `show_numbered_token` option is turned on, Calcite engine's `label` mode not only labels pattern of text but also labels variable tokens in map. In `aggregation` mode, it will also output labeled pattern as well as variable tokens per pattern. The variable placeholder is in the format of '' instead of '<\*>'. +The `patterns` command is not executed on OpenSearch data nodes. It only groups log patterns from log messages that have been returned to the coordinator node. +{: .note} ## Syntax -Use the following syntax: +The `patterns` command supports the following syntax options. -`patterns [by byClause...] [method=simple_pattern | brain] [mode=label | aggregation] [max_sample_count=integer] [buffer_limit=integer] [show_numbered_token=boolean] [new_field=] (algorithm parameters...)` -* `field`: mandatory. The text field to analyze for patterns. -* `byClause`: optional. Fields or scalar functions used to group logs for labeling/aggregation. -* `method`: optional. Algorithm choice: `simple_pattern` or `brain`. **Default:** `simple_pattern`. -* `mode`: optional. Output mode: `label` or `aggregation`. **Default:** `label`. -* `max_sample_count`: optional. Max sample logs returned per pattern in aggregation mode. **Default:** 10. -* `buffer_limit`: optional. Safeguard parameter for `brain` algorithm to limit internal temporary buffer size (min: 50,000). **Default:** 100,000. -* `show_numbered_token`: optional. The flag to turn on numbered token output format. **Default:** false. -* `new_field`: optional. Alias of the output pattern field. **Default:** "patterns_field". -* algorithm parameters: optional. Algorithm-specific tuning: - * `simple_pattern`: Define regex through "pattern". - * `brain`: Adjust sensitivity with variable_count_threshold and frequency_threshold_percentage. - * `variable_count_threshold`: optional integer. Words are split by space. Algorithm counts how many distinct words are at specific position in initial log groups. Adjusting this threshold can determine the sensitivity of constant words. **Default:** 5. - * `frequency_threshold_percentage`: optional double. Brain's log pattern is selected based on longest word combination. 
This sets the lower bound of frequency to ignore low frequency words. **Default:** 0.3.

-## Change the default pattern method
-
-To override default pattern parameters, users can run following command
-
-```
- PUT _cluster/settings
- {
-   "persistent": {
-     "plugins.ppl.pattern.method": "brain",
-     "plugins.ppl.pattern.mode": "aggregation",
-     "plugins.ppl.pattern.max.sample.count": 5,
-     "plugins.ppl.pattern.buffer.limit": 50000,
-     "plugins.ppl.pattern.show.numbered.token": true
-   }
- }
-```

### Simple pattern method syntax

The `patterns` command with a `simple_pattern` method has the following syntax:

```sql
patterns <field> [by <byClause>] [method=simple_pattern] [mode=label | aggregation] [max_sample_count=integer] [show_numbered_token=boolean] [new_field=<new_field>] [pattern=<pattern>]
```

### Brain method syntax

The `patterns` command with a `brain` method has the following syntax:

```sql
patterns <field> [by <byClause>] [method=brain] [mode=label | aggregation] [max_sample_count=integer] [buffer_limit=integer] [show_numbered_token=boolean] [new_field=<new_field>] [variable_count_threshold=integer] [frequency_threshold_percentage=decimal]
```

## Parameters

The `patterns` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<field>` | Required | The text field that is analyzed to extract log patterns. |
| `<byClause>` | Optional | The fields or scalar functions used to group logs before labeling or aggregation. |
| `method` | Optional | The pattern extraction method to use. Valid values are `simple_pattern` and `brain`. Default is `simple_pattern`. |
| `mode` | Optional | The output mode of the command. Valid values are `label` and `aggregation`. Default is `label`. |
| `max_sample_count` | Optional | The maximum number of sample log entries returned per pattern in `aggregation` mode. Default is `10`. |
| `buffer_limit` | Optional | A safeguard setting for the `brain` method that limits the size of its internal temporary buffer. Minimum is `50000`. Default is `100000`. |
| `show_numbered_token` | Optional | Enables numbered token placeholders in the output instead of the default wildcard token. See [Placeholder behavior](#placeholder-behavior). Default is `false`. |
| `<new_field>` | Optional | An alias for the output field that contains the extracted pattern. Default is `patterns_field`. |

The `simple_pattern` method accepts the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<pattern>` | Optional | A custom [Java regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) pattern that identifies characters or sequences to replace with `<*>` placeholders. When not specified, the method uses a default pattern that automatically removes alphanumeric characters and replaces variable parts with `<*>` placeholders while preserving structural elements. |

The `brain` method accepts the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `variable_count_threshold` | Optional | Controls the algorithm sensitivity to detecting constant words by counting distinct words at specific positions in the initial log groups. Default is `5`. |
| `frequency_threshold_percentage` | Optional | Sets the minimum word frequency percentage threshold. Words with frequencies below this value are ignored. The `brain` algorithm selects log patterns based on the longest word combination. Default is `0.3`.
| + +## Placeholder behavior + +By default, the Apache Calcite engine labels variables using the `<*>` placeholder. If the `show_numbered_token` option is enabled, the Calcite engine's `label` mode not only labels the text pattern but also assigns numbered placeholders to variable tokens. In `aggregation` mode, it outputs both the labeled pattern and the variable tokens for each pattern. In this case, variable placeholders use the format `` instead of `<*>`. + +## Changing the default pattern method + +To override default pattern parameters, run the following command: +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ppl.pattern.method": "brain", + "plugins.ppl.pattern.mode": "aggregation", + "plugins.ppl.pattern.max.sample.count": 5, + "plugins.ppl.pattern.buffer.limit": 50000, + "plugins.ppl.pattern.show.numbered.token": true } +} ``` +{% include copy-curl.html %} +## Simple pattern examples -## Simple pattern example 1: Create the new field +The following are examples of using the `simple_pattern` method. -The following example PPL query shows how to use `patterns` to extract patterns in `email` for each document. Parsing a null field will return an empty string. +### Example 1: Create a new field + +The following query extracts patterns from the `email` field for each document. If the `email` field is `null`, the command returns an empty string: ```sql source=accounts @@ -63,7 +107,7 @@ source=accounts ``` {% include copy.html %} -Expected output: +The query returns the following results: | email | patterns_field | | --- | --- | @@ -73,9 +117,9 @@ Expected output: | daleadams@boink.com | <*>@<*>.<*> | -## Simple pattern example 2: Extract log patterns +### Example 2: Extract log patterns -The following example PPL query shows how to use `patterns` to extract patterns from a raw log field using the default patterns. +The following query extracts default patterns from a raw log field: ```sql source=apache @@ -84,7 +128,7 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | message | patterns_field | | --- | --- | @@ -94,9 +138,9 @@ Expected output: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | -## Simple pattern example 3: Extract log patterns with custom regex pattern +### Example 3: Extract log patterns using a custom regex pattern -The following example PPL query shows how to use `patterns` to extract patterns from a raw log field using user defined patterns. +The following query extracts patterns from a raw log field using a custom pattern: ```sql source=apache @@ -105,7 +149,7 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | message | no_numbers | | --- | --- | @@ -115,9 +159,9 @@ Expected output: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*><*><*>.<*><*><*>.<*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "POST /users HTTP/<*>.<*>" <*><*><*> <*><*><*><*> | -## Simple pattern example 4: Return log patterns aggregation result +### Example 4: Return log pattern aggregation result -The following example PPL query shows how to use `patterns` to get aggregated results from a raw log field. 
+The following query aggregates patterns extracted from a raw log field: ```sql source=apache @@ -126,7 +170,7 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | patterns_field | pattern_count | sample_logs | | --- | --- | --- | @@ -136,13 +180,10 @@ Expected output: | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722] | -## Simple pattern example 5: Return log patterns aggregation result with detected variable tokens - -The following example PPL query shows how to use `patterns` to get aggregated results with detected variable tokens. +### Example 5: Return aggregated log patterns with detected variable tokens -## Configuration +The following query returns aggregated results with detected variable tokens. When the `show_numbered_token` option is enabled, the pattern output uses numbered placeholders (for example, ``, ``) and returns a mapping of each placeholder to the values that it represents: -With option `show_numbered_token` enabled, the output can detect numbered variable tokens from the pattern field. ```sql source=apache @@ -152,16 +193,20 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | patterns_field | pattern_count | tokens | | --- | --- | --- | | ... - - [//::: -] " / /." | 1 | {'': ['HTTP'], '': ['users'], '': ['1'], '': ['1'], '': ['9481'], '': ['301'], '': ['28'], '': ['104'], '': ['2022'], '': ['Sep'], '': ['15'], '': ['10'], '': ['57'], '': ['210'], '': ['POST'], '': ['15'], '': ['0700'], '': ['204']} | - -## Brain Example 1: Extract log patterns -The following example PPL query shows how to use `patterns` to extract semantic meaningful log patterns from a raw log field using the brain algorithm. The default variable count threshold is 5. +## Brain pattern examples + +The following are examples of using the `brain` method. + +### Example 1: Extract log patterns + +The following query extracts semantically meaningful log patterns from a raw log field using the `brain` algorithm. This query uses the default `variable_count_threshold` value of `5`: ```sql source=apache @@ -170,7 +215,7 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | message | patterns_field | | --- | --- | @@ -180,9 +225,9 @@ Expected output: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*>" 301 <*> | -## Brain Example 2: Extract log patterns with custom parameters +### Example 2: Extract log patterns using custom parameters -The following example PPL query shows how to use `patterns` to extract semantic meaningful log patterns from a raw log field using custom parameters of the brain algorithm. 
+The following query extracts semantically meaningful log patterns from a raw log field using custom parameters of the `brain` algorithm: ```sql source=apache @@ -191,7 +236,7 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | message | patterns_field | | --- | --- | @@ -201,9 +246,9 @@ Expected output: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | -## Brain Example 3: Return log patterns aggregation result +### Example 3: Return log pattern aggregation result -The following example PPL query shows how to use `patterns` to get aggregated results from a raw log field using the brain algorithm. +The following query aggregates patterns extracted from a raw log field using the `brain` algorithm: ```sql source=apache @@ -212,18 +257,16 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | patterns_field | pattern_count | sample_logs | | --- | --- | --- | | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | 4 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927,127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722,118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439,210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | -## Brain Example 4: Return log patterns aggregation result with detected variable tokens +### Example 4: Return aggregated log patterns with detected variable tokens -The following example PPL query shows how to use `patterns` to get aggregated results with detected variable tokens using the brain algorithm. - -With option `show_numbered_token` enabled, the output can detect numbered variable tokens from the pattern field. +The following query returns aggregated results with detected variable tokens using the `brain` method. When the `show_numbered_token` option is enabled, the pattern output uses numbered placeholders (for example, ``, ``) and returns a mapping of each placeholder to the values that it represents: ```sql source=apache @@ -232,13 +275,11 @@ source=apache ``` {% include copy.html %} -Expected output: +The query returns the following results: | patterns_field | pattern_count | tokens | | --- | --- | --- | | - [/Sep/::: ] HTTP/" | 4 | {'': ['19927', '28722', '27439', '9481'], '': ['10', '10', '10', '10'], '': ['2022', '2022', '2022', '2022'], '': ['57', '57', '57', '57'], '': ['15', '15', '15', '15'], '': ['"HEAD', '"GET', '"PATCH', '"POST'], '': ['-0700', '-0700', '-0700', '-0700'], '': ['/e-business/mindshare', '/architectures/convergence/niches/mindshare', '/strategize/out-of-the-box', '/users'], '': ['177.95.8.74', '127.45.152.6', '118.223.210.10... | -## Limitations - -- Patterns command is not pushed down to OpenSearch data node for now. It will only group log patterns on log messages returned to coordinator node. 
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/rare.md b/_sql-and-ppl/ppl/commands/rare.md
new file mode 100644
index 00000000000..900160c0e6f
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/rare.md
@@ -0,0 +1,147 @@
---
layout: default
title: rare
parent: Commands
grand_parent: PPL
nav_order: 28
---

# rare

The `rare` command identifies the least common combination of values across all fields specified in the field list.

The command returns up to 10 results for each distinct combination of values in the group-by fields.
{: .note}

The `rare` command is not rewritten to [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/). It is only executed on the coordinating node.
{: .note}

## Syntax

The `rare` command has the following syntax:

```sql
rare [rare-options] <field-list> [by-clause]
```

## Parameters

The `rare` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<field-list>` | Required | A comma-delimited list of field names. |
| `<by-clause>` | Optional | One or more fields to group the results by. |
| `rare-options` | Optional | Additional options for controlling output:<br>- `showcount`: Whether to create a field in the output containing the frequency count for each combination of values. Default is `true`.<br>- `countfield`: The name of the field that contains the count. Default is `count`.<br>- `usenull`: Whether to output null values. Default is the value of `plugins.ppl.syntax.legacy.preferred`. |

## Example 1: Find the least common values without showing counts

The following query uses the `rare` command with `showcount=false` to find the least common gender without displaying frequency counts:

```sql
source=accounts
| rare showcount=false gender
```
{% include copy.html %}

The query returns the following results:

| gender |
| --- |
| F |
| M |

## Example 2: Find the least common values grouped by field

The following query uses the `rare` command with a `by` clause to find the least common age values grouped by gender:

```sql
source=accounts
| rare showcount=false age by gender
```
{% include copy.html %}

The query returns the following results:

| gender | age |
| --- | --- |
| F | 28 |
| M | 32 |
| M | 33 |
| M | 36 |

## Example 3: Find the least common values with frequency counts

The following query uses the `rare` command with default settings to find the least common gender values and display their frequency counts:

```sql
source=accounts
| rare gender
```
{% include copy.html %}

The query returns the following results:

| gender | count |
| --- | --- |
| F | 1 |
| M | 3 |

## Example 4: Customize the count field name

The following query uses the `rare` command with the `countfield` parameter to specify a custom name for the frequency count field:

```sql
source=accounts
| rare countfield='cnt' gender
```
{% include copy.html %}

The query returns the following results:

| gender | cnt |
| --- | --- |
| F | 1 |
| M | 3 |

## Example 5: Specify null value handling

The following query uses the `rare` command with `usenull=false` to exclude null values from the results:

```sql
source=accounts
| rare usenull=false email
```
{% include copy.html %}

The query returns the following results:

| email | count |
| --- | --- |
| amberduke@pyrami.com | 1 |
| daleadams@boink.com | 1 |
| hattiebond@netagy.com | 1 |

The following query uses `usenull=true` to include null values in the results:

```sql
source=accounts
| rare usenull=true email
```
{% include copy.html %}

The query returns the following results:

| email | count |
| --- | --- |
| null | 1 |
| amberduke@pyrami.com | 1 |
| daleadams@boink.com | 1 |
| hattiebond@netagy.com | 1 |

diff --git a/_sql-and-ppl/ppl/commands/regex.md b/_sql-and-ppl/ppl/commands/regex.md
new file mode 100644
index 00000000000..6cdeed53e48
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/regex.md
@@ -0,0 +1,155 @@
---
layout: default
title: regex
parent: Commands
grand_parent: PPL
nav_order: 29
---

# regex

The `regex` command filters search results by matching field values against a regular expression pattern. Only documents in which the specified field matches the pattern are included in the results.

## Syntax

The `regex` command has the following syntax:

```sql
regex <field>=<pattern>
regex <field>!=<pattern>
```

The following operators are supported:

* `=` - Positive matching (include matches).
* `!=` - Negative matching (exclude matches).

The `regex` command uses Java's built-in regular expression engine, which supports:

* **Standard regex features**: Character classes, quantifiers, anchors.
* **Named capture groups**: `(?<name>pattern)` syntax.
* **Lookahead/lookbehind**: `(?=...)` and `(?<=...)` assertions.
* **Inline flags**: Case-insensitive `(?i)`, multiline `(?m)`, dotall `(?s)`, and other modes, as shown in the sketch after this list.
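The following is a minimal sketch of an inline flag, assuming the same `accounts` index used in the examples below. The `(?i)` flag makes the match case insensitive, so the lowercase pattern should also match the uppercase state value `VA`:

```sql
source=accounts
| regex state="(?i)va"
| fields account_number, state
```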
## Parameters

The `regex` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<field>` | Required | The field name to match against. |
| `<pattern>` | Required | The regular expression pattern to match. Supports [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). |

## Example 1: Basic pattern matching

The following query uses the `regex` command to return any document in which the `lastname` field starts with an uppercase letter:

```sql
source=accounts
| regex lastname="^[A-Z][a-z]+$"
| fields account_number, firstname, lastname
```
{% include copy.html %}

The query returns the following results:

| account_number | firstname | lastname |
| --- | --- | --- |
| 1 | Amber | Duke |
| 6 | Hattie | Bond |
| 13 | Nanette | Bates |
| 18 | Dale | Adams |

## Example 2: Negative matching

The following query excludes documents in which the `lastname` field ends with `ms`:

```sql
source=accounts
| regex lastname!=".*ms$"
| fields account_number, lastname
```
{% include copy.html %}

The query returns the following results:

| account_number | lastname |
| --- | --- |
| 1 | Duke |
| 6 | Bond |
| 13 | Bates |

## Example 3: Email domain matching

The following query filters documents by email domain patterns:

```sql
source=accounts
| regex email="@pyrami\.com$"
| fields account_number, email
```
{% include copy.html %}

The query returns the following results:

| account_number | email |
| --- | --- |
| 1 | amberduke@pyrami.com |

## Example 4: Complex patterns with character classes

The following query uses complex regex patterns with character classes and quantifiers:

```sql
source=accounts
| regex address="\\d{3,4}\\s+[A-Z][a-z]+\\s+(Street|Lane|Court)"
| fields account_number, address
```
{% include copy.html %}

The query returns the following results:

| account_number | address |
| --- | --- |
| 1 | 880 Holmes Lane |
| 6 | 671 Bristol Street |
| 13 | 789 Madison Street |
| 18 | 467 Hutchinson Court |

## Example 5: Case-sensitive matching

By default, regex matching is case sensitive. The following query searches for the lowercase state name `va`:

```sql
source=accounts
| regex state="va"
| fields account_number, state
```
{% include copy.html %}

The query returns no results because the regex pattern `va` (lowercase) does not match any state values in the data.

The following query searches for the uppercase state name `VA`:

```sql
source=accounts
| regex state="VA"
| fields account_number, state
```
{% include copy.html %}

The query returns the following results:

| account_number | state |
| --- | --- |
| 13 | VA |

## Limitations

The `regex` command has the following limitations:

* A field name must be specified in the `regex` command. Pattern-only syntax (for example, `regex "pattern"`) is not supported.
* The `regex` command only supports string fields. Using it on numeric or Boolean fields results in an error.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/rename.md b/_sql-and-ppl/ppl/commands/rename.md
new file mode 100644
index 00000000000..5cb74b629c9
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/rename.md
@@ -0,0 +1,148 @@
---
layout: default
title: rename
parent: Commands
grand_parent: PPL
nav_order: 30
---

# rename

The `rename` command renames one or more fields in the search results.
The `rename` command handles non-existent fields as follows:

* **Renaming a non-existent field to a non-existent field**: No change occurs to the search results.
* **Renaming a non-existent field to an existing field**: The existing target field is removed from the search results.
* **Renaming an existing field to an existing field**: The existing target field is removed and the source field is renamed to the target.

The `rename` command is not rewritten to [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/). It is only executed on the coordinating node.
{: .note}

## Syntax

The `rename` command has the following syntax:

```sql
rename <source-field> AS <target-field> ["," <source-field> AS <target-field>]...
```

## Parameters

The `rename` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<source-field>` | Required | The name of the field you want to rename. Supports wildcard patterns using `*`. |
| `<target-field>` | Required | The new name for the field. Must contain the same number of wildcards as the source. |

## Example 1: Rename a field

The following query renames one field:

```sql
source=accounts
| rename account_number as an
| fields an
```
{% include copy.html %}

The query returns the following results:

| an |
| --- |
| 1 |
| 6 |
| 13 |
| 18 |

## Example 2: Rename multiple fields

The following query renames multiple fields:

```sql
source=accounts
| rename account_number as an, employer as emp
| fields an, emp
```
{% include copy.html %}

The query returns the following results:

| an | emp |
| --- | --- |
| 1 | Pyrami |
| 6 | Netagy |
| 13 | Quility |
| 18 | null |

## Example 3: Rename fields using wildcards

The following query renames multiple fields using wildcard patterns:

```sql
source=accounts
| rename *name as *_name
| fields first_name, last_name
```
{% include copy.html %}

The query returns the following results:

| first_name | last_name |
| --- | --- |
| Amber | Duke |
| Hattie | Bond |
| Nanette | Bates |
| Dale | Adams |

## Example 4: Rename fields using multiple wildcard patterns

The following query renames multiple fields using multiple wildcard patterns:

```sql
source=accounts
| rename *name as *_name, *_number as *number
| fields first_name, last_name, accountnumber
```
{% include copy.html %}

The query returns the following results:

| first_name | last_name | accountnumber |
| --- | --- | --- |
| Amber | Duke | 1 |
| Hattie | Bond | 6 |
| Nanette | Bates | 13 |
| Dale | Adams | 18 |

## Example 5: Rename an existing field to another existing field

The following query renames an existing field to another existing field. The target field is removed and the source field is renamed to the target field:

```sql
source=accounts
| rename firstname as age
| fields age
```
{% include copy.html %}

The query returns the following results:

| age |
| --- |
| Amber |
| Hattie |
| Nanette |
| Dale |

## Limitations

The `rename` command has the following limitations:

* Literal asterisk (`*`) characters in field names cannot be replaced because the asterisk is used for wildcard matching.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/replace.md b/_sql-and-ppl/ppl/commands/replace.md
new file mode 100644
index 00000000000..187b363ac50
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/replace.md
@@ -0,0 +1,288 @@
---
layout: default
title: replace
parent: Commands
grand_parent: PPL
nav_order: 31
---

# replace

The `replace` command replaces text in one or more fields in the search results. It supports literal string replacement and wildcard patterns using `*`.

## Syntax

The `replace` command has the following syntax:

```sql
replace '<pattern>' WITH '<replacement>' [, '<pattern>' WITH '<replacement>']... IN <field-list>
```

## Parameters

The `replace` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<pattern>` | Required | The text pattern to be replaced. |
| `<replacement>` | Required | The text to use as the replacement. |
| `<field-list>` | Required | One or more fields to which the replacement should be applied. |

## Example 1: Replace text in one field

The following query replaces text in one field:

```sql
source=accounts
| replace "IL" WITH "Illinois" IN state
| fields state
```
{% include copy.html %}

The query returns the following results:

| state |
| --- |
| Illinois |
| TN |
| VA |
| MD |

## Example 2: Replace text in multiple fields

The following query replaces text in multiple fields:

```sql
source=accounts
| replace "IL" WITH "Illinois" IN state, address
| fields state, address
```
{% include copy.html %}

The query returns the following results:

| state | address |
| --- | --- |
| Illinois | 880 Holmes Lane |
| TN | 671 Bristol Street |
| VA | 789 Madison Street |
| MD | 467 Hutchinson Court |

## Example 3: Use the replace command in a pipeline

The following query uses the `replace` command with other commands in a query pipeline:

```sql
source=accounts
| replace "IL" WITH "Illinois" IN state
| where age > 30
| fields state, age
```
{% include copy.html %}

The query returns the following results:

| state | age |
| --- | --- |
| Illinois | 32 |
| TN | 36 |
| MD | 33 |

## Example 4: Replace text using multiple pattern-replacement pairs

The following query uses the `replace` command with multiple pattern and replacement pairs in a single `replace` command. The replacements are applied sequentially:

```sql
source=accounts
| replace "IL" WITH "Illinois", "TN" WITH "Tennessee" IN state
| fields state
```
{% include copy.html %}

The query returns the following results:

| state |
| --- |
| Illinois |
| Tennessee |
| VA |
| MD |
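Because the pairs are applied in order, a later pattern can match text produced by an earlier replacement. The following is a minimal sketch of this behavior, assuming the same `accounts` index; the original `IL` value should first become `TN` and then, in the second pass, `Tennessee`, alongside the original `TN` value:

```sql
source=accounts
| replace "IL" WITH "TN", "TN" WITH "Tennessee" IN state
| fields state
```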
## Example 5: Pattern matching using LIKE

The following query uses the `LIKE` function together with the `replace` command for pattern matching because the `replace` command supports only plain string literals and `*` wildcards:

```sql
source=accounts
| where LIKE(address, '%Holmes%')
| replace "Holmes" WITH "HOLMES" IN address
| fields address, state, gender, age, city
```
{% include copy.html %}

The query returns the following results:

| address | state | gender | age | city |
| --- | --- | --- | --- | --- |
| 880 HOLMES Lane | IL | M | 32 | Brogan |

## Example 6: Wildcard suffix matching

The following query shows wildcard suffix matching, in which `*` matches any characters before a specific ending pattern:

```sql
source=accounts
| replace "*IL" WITH "Illinois" IN state
| fields state
```
{% include copy.html %}

The query returns the following results:

| state |
| --- |
| Illinois |
| TN |
| VA |
| MD |

## Example 7: Wildcard prefix matching

The following query shows wildcard prefix matching, in which `*` matches any characters after a specific starting pattern:

```sql
source=accounts
| replace "IL*" WITH "Illinois" IN state
| fields state
```
{% include copy.html %}

The query returns the following results:

| state |
| --- |
| Illinois |
| TN |
| VA |
| MD |

## Example 8: Wildcard capture and substitution

The following query uses wildcards in both the pattern and the replacement to capture and reuse matched portions. The number of wildcards in the pattern and the replacement must match:

```sql
source=accounts
| replace "* Lane" WITH "Lane *" IN address
| fields address
```
{% include copy.html %}

The query returns the following results:

| address |
| --- |
| Lane 880 Holmes |
| 671 Bristol Street |
| 789 Madison Street |
| 467 Hutchinson Court |

## Example 9: Multiple wildcards for pattern transformation

The following query uses multiple wildcards to transform patterns. Each wildcard in the replacement is substituted with the corresponding captured value:

```sql
source=accounts
| replace "* *" WITH "*_*" IN address
| fields address
```
{% include copy.html %}

The query returns the following results:

| address |
| --- |
| 880_Holmes Lane |
| 671_Bristol Street |
| 789_Madison Street |
| 467_Hutchinson Court |

## Example 10: Replace any match with a fixed value

The following query shows that when the replacement contains zero wildcards, all matching values are replaced with the literal replacement string:

```sql
source=accounts
| replace "*IL*" WITH "Illinois" IN state
| fields state
```
{% include copy.html %}

The query returns the following results:

| state |
| --- |
| Illinois |
| TN |
| VA |
| MD |

## Example 11: Matching literal asterisks

Use `\*` to match literal asterisk characters and `\\` to match literal backslash characters.
The following query uses `\*`: + +```sql +source=accounts +| eval note = 'price: *sale*' +| replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note +| fields note +``` +{% include copy.html %} + +The query returns the following results: + +| note | +| --- | +| DISCOUNTED | +| DISCOUNTED | +| DISCOUNTED | +| DISCOUNTED | + +## Example 12: Replace text with literal asterisk symbols + +The following query shows how to insert literal asterisk symbols into text while using wildcards to preserve other parts of the pattern: + +```sql +source=accounts +| eval label = 'file123.txt' +| replace 'file*.*' WITH '\**.*' IN label +| fields label +``` +{% include copy.html %} + +The query returns the following results: + +| label | +| --- | +| *123.txt | +| *123.txt | +| *123.txt | +| *123.txt | + + +## Limitations + +The `replace` command has the following limitations: + +* **Wildcards**: The `*` wildcard matches zero or more characters and is case-sensitive. +* **Wildcard matching**: Replacement wildcards must match the pattern wildcard count, or be zero. +* **Escape sequences**: Use `\*` for literal asterisk and `\\` for literal backslash characters. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/reverse.md b/_sql-and-ppl/ppl/commands/reverse.md new file mode 100644 index 00000000000..14feb43bc78 --- /dev/null +++ b/_sql-and-ppl/ppl/commands/reverse.md @@ -0,0 +1,130 @@ +--- +layout: default +title: reverse +parent: Commands +grand_parent: PPL +nav_order: 32 +--- + +# reverse + +The `reverse` command reverses the display order of the search results. It returns the same results, but in the opposite order. + +The `reverse` command processes the entire dataset. If applied directly to millions of records, it consumes significant memory resources on the coordinating node. Only apply the `reverse` command to smaller datasets, typically after aggregation operations. +{: .note} + +## Syntax + +The `reverse` command has the following syntax: + +```sql +reverse +``` + +The `reverse` command takes no parameters. 
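As a minimal sketch of the recommended usage pattern noted above (aggregate first, then reverse the much smaller result set), assuming the same `accounts` index:

```sql
source=accounts
| stats count() by state
| reverse
```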
## Example 1: Basic reverse operation

The following query reverses the order of all documents in the results:

```sql
source=accounts
| fields account_number, age
| reverse
```
{% include copy.html %}

The query returns the following results:

| account_number | age |
| --- | --- |
| 6 | 36 |
| 18 | 33 |
| 1 | 32 |
| 13 | 28 |

## Example 2: Use the reverse and sort commands

The following query reverses results after sorting documents by age in ascending order, effectively implementing descending order:

```sql
source=accounts
| sort age
| fields account_number, age
| reverse
```
{% include copy.html %}

The query returns the following results:

| account_number | age |
| --- | --- |
| 6 | 36 |
| 18 | 33 |
| 1 | 32 |
| 13 | 28 |

## Example 3: Use the reverse and head commands

The following query uses the `reverse` command together with the `head` command to retrieve the last two records from the original result order:

```sql
source=accounts
| reverse
| head 2
| fields account_number, age
```
{% include copy.html %}

The query returns the following results:

| account_number | age |
| --- | --- |
| 6 | 36 |
| 18 | 33 |

## Example 4: Double reverse

The following query shows that applying `reverse` twice returns documents in the original order:

```sql
source=accounts
| reverse
| reverse
| fields account_number, age
```
{% include copy.html %}

The query returns the following results:

| account_number | age |
| --- | --- |
| 13 | 28 |
| 1 | 32 |
| 18 | 33 |
| 6 | 36 |

## Example 5: Use the reverse command with a complex pipeline

The following query uses the `reverse` command with filtering and field selection:

```sql
source=accounts
| where age > 30
| fields account_number, age
| reverse
```
{% include copy.html %}

The query returns the following results:

| account_number | age |
| --- | --- |
| 6 | 36 |
| 18 | 33 |
| 1 | 32 |

diff --git a/_sql-and-ppl/ppl/commands/rex.md b/_sql-and-ppl/ppl/commands/rex.md
new file mode 100644
index 00000000000..701e8c4390a
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/rex.md
@@ -0,0 +1,271 @@
---
layout: default
title: rex
parent: Commands
grand_parent: PPL
nav_order: 33
---

# rex

The `rex` command extracts fields from a raw text field using regular expression named capture groups. It uses Java regex patterns. For more information, see the [Java regular expression documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).

## rex and parse commands compared

The `rex` and [`parse`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/parse/) commands both extract information from text fields using Java regular expressions with named capture groups. The following table compares the capabilities of the `rex` and `parse` commands.

| Feature | `rex` | `parse` |
| --- | --- | --- |
| Pattern type | Java regex | Java regex |
| Named groups required | Yes | Yes |
| Multiple named groups | Yes | No |
| Multiple matches | Yes | No |
| Text substitution | Yes | No |
| Offset tracking | Yes | No |
| Special characters in group names | No | No |

## Syntax

The `rex` command has the following syntax:

```sql
rex [mode=<mode>] field=<field> <pattern> [max_match=<max_match>] [offset_field=<offset_field>]
```

## Parameters

The `rex` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `field` | Required | The field to extract data from. The field must be a string. |
| `<pattern>` | Required | The regular expression pattern with named capture groups used to extract new fields. The pattern must contain at least one named capture group using the `(?<name>pattern)` syntax. Group names must start with a letter and contain only letters and digits. |
| `mode` | Optional | The pattern-matching mode. Valid values are `extract` and `sed`. The `extract` mode creates new fields from regular expression named capture groups. The `sed` mode performs text substitution using sed-style patterns (supports `s/pattern/replacement/` with flags, `y/from_chars/to_chars/` transliteration, and backreferences). |
| `max_match` | Optional | The maximum number of matches to extract. If the value is greater than `1`, the extracted fields are returned as arrays. A value of `0` indicates unlimited matches; however, the effective number of matches is automatically limited by the configured maximum. The default maximum is `10` and can be configured using `plugins.ppl.rex.max_match.limit` (see the [note]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/rex/#note)). Default is `1`. |
| `offset_field` | Optional | Valid in `extract` mode only. The name of the field in which to store the character offset positions of the matches. |

You can set the `max_match` limit in the `plugins.ppl.rex.max_match.limit` cluster setting. For more information, see [SQL settings]({{site.url}}{{site.baseurl}}/sql-and-ppl/sql/settings/). Setting this limit to a large value is not recommended because it can lead to excessive memory consumption, especially with patterns that match empty strings (for example, `\d*` or `\w*`).
{: .note}

## Example 1: Basic text extraction

The following query extracts the username and domain from email addresses using named capture groups. Both extracted fields are returned as strings:

```sql
source=accounts
| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)"
| fields email, username, domain
| head 2
```
{% include copy.html %}

The query returns the following results:

| email | username | domain |
| --- | --- | --- |
| amberduke@pyrami.com | amberduke | pyrami |
| hattiebond@netagy.com | hattiebond | netagy |

## Example 2: Handle non-matching patterns

The following query shows that the `rex` command returns all events, setting extracted fields to null for non-matching patterns. When matches are found, the extracted fields are returned as strings:

```sql
source=accounts
| rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)"
| fields email, user, domain
| head 2
```
{% include copy.html %}

The query returns the following results:

| email | user | domain |
| --- | --- | --- |
| amberduke@pyrami.com | null | null |
| hattiebond@netagy.com | null | null |

## Example 3: Extract multiple words using max_match

The following query uses the `rex` command with the `max_match` parameter to extract multiple words from the `address` field. The extracted field is returned as an array of strings:

```sql
source=accounts
| rex field=address "(?<words>[A-Za-z]+)" max_match=2
| fields address, words
| head 3
```
{% include copy.html %}

The query returns the following results:

| address | words |
| --- | --- |
| 880 Holmes Lane | [Holmes,Lane] |
| 671 Bristol Street | [Bristol,Street] |
| 789 Madison Street | [Madison,Street] |

## Example 4: Replace text using sed mode

The following query uses the `rex` command in `sed` mode to replace email domains through text substitution. The extracted field is returned as a string:

```sql
source=accounts
| rex field=email mode=sed "s/@.*/@company.com/"
| fields email
| head 2
```
{% include copy.html %}

The query returns the following results:

| email |
| --- |
| amberduke@company.com |
| hattiebond@company.com |

## Example 5: Track match positions using offset_field

The following query tracks the character positions where matches occur. The extracted fields are returned as strings, and the `offset_field` is also returned as a string:

```sql
source=accounts
| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" offset_field=matchpos
| fields email, username, domain, matchpos
| head 2
```
{% include copy.html %}

The query returns the following results:

| email | username | domain | matchpos |
| --- | --- | --- | --- |
| amberduke@pyrami.com | amberduke | pyrami | domain=10-15&username=0-8 |
| hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 |

## Example 6: Extract complex email pattern

The following query extracts complete email components, including the top-level domain.
All extracted fields are returned as strings:

```sql
source=accounts
| rex field=email "(?<user>[a-zA-Z0-9._%+-]+)@(?<domain>[a-zA-Z0-9.-]+)\\.(?<tld>[a-zA-Z]{2,})"
| fields email, user, domain, tld
| head 2
```
{% include copy.html %}

The query returns the following results:

| email | user | domain | tld |
| --- | --- | --- | --- |
| amberduke@pyrami.com | amberduke | pyrami | com |
| hattiebond@netagy.com | hattiebond | netagy | com |

## Example 7: Chain multiple rex commands

The following query extracts initial letters from both first and last names. All extracted fields are returned as strings:

```sql
source=accounts
| rex field=firstname "(?<firstinitial>^.)"
| rex field=lastname "(?<lastinitial>^.)"
| fields firstname, lastname, firstinitial, lastinitial
| head 3
```
{% include copy.html %}

The query returns the following results:

| firstname | lastname | firstinitial | lastinitial |
| --- | --- | --- | --- |
| Amber | Duke | A | D |
| Hattie | Bond | H | B |
| Nanette | Bates | N | B |

## Example 8: Capture group naming restrictions

The following query shows naming restrictions for capture groups. Group names cannot contain underscores because of [Java regex](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) limitations.

**Invalid PPL query with underscores**:

```sql
source=accounts
| rex field=email "(?<user_name>[^@]+)@(?<email_domain>[^.]+)"
| fields email, user_name, email_domain
```
{% include copy.html %}

The query returns the following results:

```text
{'reason': 'Invalid Query', 'details': "Invalid capture group name 'user_name'. Java regex group names must start with a letter and contain only letters and digits.", 'type': 'IllegalArgumentException'}
Error: Query returned no data
```

**Correct PPL query without underscores**:

```sql
source=accounts
| rex field=email "(?<username>[^@]+)@(?<emaildomain>[^.]+)"
| fields email, username, emaildomain
| head 2
```
{% include copy.html %}

The query returns the following results:

| email | username | emaildomain |
| --- | --- | --- |
| amberduke@pyrami.com | amberduke | pyrami |
| hattiebond@netagy.com | hattiebond | netagy |

## Example 9: max_match limit enforcement

The following query shows the `max_match` limit protection mechanism. When `max_match` is set to `0` (unlimited), the system automatically enforces a maximum limit on the number of matches to prevent memory exhaustion.

**PPL query with `max_match=0` automatically limited to the default of 10**:

```sql
source=accounts
| rex field=address "(?<digit>\\d*)" max_match=0
| eval digit_count=array_length(digit)
| fields address, digit_count
| head 1
```
{% include copy.html %}

The query returns the following results:

| address | digit_count |
| --- | --- |
| 880 Holmes Lane | 10 |

**PPL query exceeding the configured limit results in an error**:

```sql
source=accounts
| rex field=address "(?<digit>\\d*)" max_match=100
| fields address, digit
| head 1
```
{% include copy.html %}

The query returns the following results:

```text
{'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'}
Error: Query returned no data
```
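In addition to `s///` substitution, `sed` mode supports `y/from_chars/to_chars/` transliteration, as listed in the parameters table. The following is a minimal sketch, assuming the same `accounts` index; it should replace each lowercase vowel in `firstname` with its uppercase form (for example, `Amber` becomes `AmbEr`):

```sql
source=accounts
| rex field=firstname mode=sed "y/aeiou/AEIOU/"
| fields firstname
| head 2
```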
For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).

diff --git a/_sql-and-ppl/ppl/commands/search.md b/_sql-and-ppl/ppl/commands/search.md
new file mode 100644
index 00000000000..b4b925e0253
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/search.md
@@ -0,0 +1,721 @@
---
layout: default
title: search
parent: Commands
grand_parent: PPL
nav_order: 34
---

# search

The `search` command retrieves documents from the index. The `search` command can only be used as the first command in a PPL query.

## Syntax

The `search` command has the following syntax:

```sql
search source=[<cluster>:]<index> [<search-expression>]
```

## Parameters

The `search` command supports the following parameters.

| Parameter | Required/Optional | Description |
| --- | --- | --- |
| `<index>` | Required | The index to query. The index name can be prefixed with `<cluster>:` (the remote cluster name) for cross-cluster search. |
| `<search-expression>` | Optional | A search expression that is converted to an OpenSearch [query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) query. |

## Search expression

The search expression syntax supports:

* **Full-text search**: `error` or `"error message"` -- Searches the default field configured in the `index.query.default_field` setting (default is `*`, which specifies all fields). For more information, see [Default field configuration](#default-field-configuration).
* **Field-value comparisons**: `field=value`, `field!=value`, `field>value`, `field>=value`, `field<value`, `field<=value`.
* **Boolean logic**: `AND`, `OR`, and `NOT` operators for combining conditions.
* **Time modifiers**: `earliest` and `latest` for filtering results by time range.

### Time modifiers

Time modifiers restrict results to a time range using the `earliest` and `latest` keys. The following table describes the supported value formats.

| Format | Description | Examples |
| --- | --- | --- |
| `<timestamp>` | An absolute date, time, or Unix timestamp value. | `earliest='2024-01-01 00:00:00'` |
| `[+\|-]<time_integer><time_unit>[@<time_unit>]` | A time offset relative to the current time. See [Relative time components](#relative-time-components). | `earliest=-7d`, `latest='+1d@d'` |

#### Relative time components

Relative time modifiers use multiple components that can be combined. The following table describes each component.

| Component | Syntax | Description | Examples |
| --- | --- | --- | --- |
| Time offset | `+` or `-` | Direction: `+` (future) or `-` (past) | `+7d`, `-1h` |
| Amount of time | `<time_integer><time_unit>` | Numeric value + time unit | `7d`, `1h`, `30m` |
| Round to unit | `@<time_unit>` | Round to nearest unit | `@d` (day), `@h` (hour), `@m` (minute) |

The following are examples of common time modifier patterns:

* `earliest=now` -- Start from the current time.
* `latest='2024-12-31 23:59:59'` -- End at a specific date and time.
* `earliest=-7d` -- Start from 7 days ago.
* `latest='+1d@d'` -- End at the start of tomorrow.
* `earliest='-1month@month'` -- Start from the beginning of the previous month.
* `latest=1754020061` -- End at the Unix timestamp `1754020061` (August 1, 2025, 03:47:41 UTC).

The following considerations apply when using time modifiers in the `search` command:

* **Column name conflicts**: If your data contains columns named `earliest` or `latest`, use backticks to access them as regular fields (for example, `` `earliest`="value"``) to avoid conflicts with time modifier syntax.
* **Time rounding syntax**: Time modifiers with chained time offsets must be wrapped in quotation marks (for example, `latest='+1d@month-10h'`) for proper query parsing.

## Default field configuration

When a search is performed without specifying a field, it uses the default field configured by the `index.query.default_field` index setting. By default, this is set to `*`, which searches all fields.
+
+To retrieve the default field setting, use the following request:
+
+```json
+GET /accounts/_settings/index.query.default_field
+```
+{% include copy-curl.html %}
+
+To modify the default field setting, use the following request:
+
+```json
+PUT /accounts/_settings
+{
+  "index.query.default_field": "firstname,lastname,email"
+}
+```
+{% include copy-curl.html %}
+
+## Search behavior by field type
+
+Different field types have specific search capabilities and limitations. The following table summarizes how search expressions work with each field type.
+
+| Field type | Supported operations | Example | Limitations |
+| --- | --- | --- | --- |
+| Text | Full-text search, phrase search | `search message="error occurred" source=logs` | Wildcards apply to terms after analysis, not the entire field value |
+| Keyword | Exact matching, wildcard patterns | `search status="ACTIVE" source=logs` | No text analysis; matching is case sensitive |
+| Numeric | Range queries, exact matching, `IN` operator | `search age>=18 AND balance<50000 source=accounts` | No wildcard or text search support |
+| Date | Range queries, exact matching, `IN` operator | `search timestamp>="2024-01-01" source=logs` | Must follow index mapping date format; wildcards not supported |
+| Boolean | Exact matching, `true` and `false` values, `IN` operator | `search active=true source=users` | No wildcards or range queries |
+| IP | Exact matching, CIDR notation | `search client_ip="192.168.1.0/24" source=logs` | Partial IP wildcard matching not supported. For wildcard search, use a keyword multi-field: `search ip_address.keyword='1*' source=logs` or a `where` clause: `source=logs \| where cast(ip_address as string) like '1%'` |
+
+Consider the following performance optimizations when working with different field types:
+
+* Each field type has specific search capabilities and limitations. Choosing an inappropriate field type during ingestion can negatively affect performance and query accuracy.
+* For wildcard searches on non-keyword fields, create a `keyword` subfield to improve performance. For example, for wildcard searches on a `message` field of type `text`, add a `message.keyword` field.
+
+
+
+## Example 1: Fetching all data
+
+Retrieve all documents from an index by specifying only the source without any search conditions. 
This is useful for exploring small datasets or verifying data ingestion: + +```sql +source=accounts +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | + + +## Example 2: Text search + +For basic text search, use an unquoted single term: + +```sql +search ERROR source=otellogs +| sort @timestamp +| fields severityText, body +| head 1 +``` +{% include copy.html %} + +The query returns the following results: + +| severityText | body | +| --- | --- | +| ERROR | Payment failed: Insufficient funds for user@example.com | + +Phrase search requires quotation marks for multi-word exact matching: + +```sql +search "Payment failed" source=otellogs +| fields body +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + +Multiple search terms (unquoted string literals) are automatically combined using the `AND` operator: + +```sql +search user email source=otellogs +| sort @timestamp +| fields body +| head 1 +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Executing SQL: SELECT * FROM users WHERE email LIKE '%@gmail.com' AND status != 'deleted' ORDER BY created_at DESC | + +`search user email` is equivalent to `search user AND email`. +{: .note} +Enclose terms containing special characters in double quotation marks: + +```sql +search "john.doe+newsletter@company.com" source=otellogs +| fields body +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Email notification sent to john.doe+newsletter@company.com with subject: 'Welcome! Your order #12345 is confirmed' | + +### Combined phrase and Boolean search + +Combine quoted phrases with Boolean operators for more precise searches: + +```sql +search "User authentication" OR OAuth2 source=otellogs +| sort @timestamp +| fields body +| head 1 +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| [2024-01-15 10:30:09] production.INFO: User authentication successful for admin@company.org using OAuth2 | + + +## Example 3: Boolean logic and operator precedence + +The following queries demonstrate Boolean operators and precedence. 
+
+### Boolean operators
+
+Use `OR` to match documents containing any of the specified conditions:
+
+```sql
+search severityText="ERROR" OR severityText="FATAL" source=otellogs
+| sort @timestamp
+| fields severityText
+| head 3
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| severityText |
+| --- |
+| ERROR |
+| FATAL |
+| ERROR |
+
+Combine conditions with `AND` to require all criteria to match:
+
+```sql
+search severityText="INFO" AND `resource.attributes.service.name`="cart-service" source=otellogs
+| fields body
+| head 1
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| body |
+| --- |
+| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart |
+
+### Operator precedence
+
+The operators are evaluated using the following precedence:
+
+```
+Parentheses > NOT > OR > AND
+```
+
+The following query demonstrates operator precedence:
+
+```sql
+search severityText="ERROR" OR severityText="WARN" AND severityNumber>15 source=otellogs
+| sort @timestamp
+| fields severityText, severityNumber
+| head 2
+```
+{% include copy.html %}
+
+The preceding expression is evaluated as `(severityText="ERROR" OR severityText="WARN") AND severityNumber>15`. The query returns the following results:
+
+| severityText | severityNumber |
+| --- | --- |
+| ERROR | 17 |
+| ERROR | 17 |
+
+## Example 4: NOT compared to != semantics
+
+Both the `!=` and `NOT` operators find documents in which the field value is not equal to the specified value. However, the `!=` operator excludes documents containing null or missing fields, while the `NOT` operator includes them. The following queries show this difference.
+
+**`!=` operator**
+
+Find all accounts for which the `employer` field exists and is not `Quility`:
+
+```sql
+search employer!="Quility" source=accounts
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |
+| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond |
+
+**`NOT` operator**
+
+Find all accounts that do not specify `Quility` as the employer (including those with null employer values):
+
+```sql
+search NOT employer="Quility" source=accounts
+```
+{% include copy.html %}
+
+The query returns the following results. Dale Adams appears in the search results because his `employer` field is `null`:
+
+| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |
+| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond |
+| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams |
+
+## Example 5: Range queries
+
+Use comparison operators (`>`, `<`, `>=`, and `<=`) to filter numeric and date fields within specific ranges. 
Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics: + +```sql +search severityNumber>15 AND severityNumber<=20 source=otellogs +| sort @timestamp +| fields severityNumber +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| severityNumber | +| --- | +| 17 | +| 17 | +| 18 | + +The following query filters by decimal values within a specific range: + +```sql +search `attributes.payment.amount`>=1000.0 AND `attributes.payment.amount`<=2000.0 source=otellogs +| fields body +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + + +## Example 6: Wildcards + +The following queries demonstrate wildcard pattern matching. In wildcard patterns, `*` matches zero or more characters, while `?` matches exactly one character. + +Use `*` to match any number of characters at the end of a term: + +```sql +search severityText=ERR* source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| severityText | +| --- | +| ERROR | +| ERROR | +| ERROR2 | + +Wildcard searches also work within text fields to find partial matches: + +```sql +search body=user* source=otellogs +| sort @timestamp +| fields body +| head 2 +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | +| Payment failed: Insufficient funds for user@example.com | + +Use `?` to match exactly one character in specific positions: + +```sql +search severityText="INFO?" source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| severityText | +| --- | +| INFO2 | +| INFO3 | +| INFO4 | + + +## Example 7: Wildcard patterns in field searches + +When searching in text or keyword fields, wildcards enable partial matching, which is useful when you only know part of a value. Wildcards work best on keyword fields, for which they match the exact value using patterns. Using wildcards on text fields may produce unexpected results because they apply to individual tokens after analysis, not the entire field value. Wildcards in keyword fields are case-sensitive unless normalized at indexing. + +Leading wildcards (for example, `*@example.com`) can decrease query speed compared to trailing wildcards. +{: .note} + +Find records for which you only know the beginning of a field value: + +```sql +search employer=Py* source=accounts +| fields firstname, employer +``` +{% include copy.html %} + +The query returns the following results: + +| firstname | employer | +| --- | --- | +| Amber | Pyrami | + +Combine wildcard patterns with other conditions for more precise filtering: + +```sql +search firstname=A* AND age>30 source=accounts +| fields firstname, age, city +``` +{% include copy.html %} + +The query returns the following results: + +| firstname | age | city | +| --- | --- | --- | +| Amber | 32 | Brogan | + +## Example 8: Field value matching + +The `IN` operator efficiently checks whether a field matches any value in a list, providing a more concise and more performant alternative to chaining multiple `OR` conditions on the same field. 
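+
+For example, the following `OR` chain matches the same documents as the `IN` list used in the next query:
+
+```sql
+search severityText="ERROR" OR severityText="WARN" OR severityText="FATAL" source=otellogs
+```
+{% include copy.html %}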
+ +Check whether a field matches any value from a predefined list: + +```sql +search severityText IN ("ERROR", "WARN", "FATAL") source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| severityText | +| --- | +| ERROR | +| WARN | +| FATAL | + + +Filter logs by `severityNumber` to find errors with a specific numeric severity level: + +```sql +search severityNumber=17 source=otellogs +| sort @timestamp +| fields body +| head 1 +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + +Search for logs containing a specific user email address in the attributes: + +```sql +search `attributes.user.email`="user@example.com" source=otellogs +| fields body +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + + +## Example 9: Complex expressions + +To create sophisticated search queries, combine multiple conditions using Boolean operators and parentheses. + +```sql +search (severityText="ERROR" OR severityText="WARN") AND severityNumber>10 source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| severityText | +| --- | +| ERROR | +| WARN | +| ERROR | + +Combine multiple conditions with OR and AND operators to search for logs matching either a specific user or high-severity fund errors: + +```sql +search `attributes.user.email`="user@example.com" OR (`attributes.error.code`="INSUFFICIENT_FUNDS" AND severityNumber>15) source=otellogs +| fields body +``` +{% include copy.html %} + +The query returns the following results: + +| body | +| --- | +| Payment failed: Insufficient funds for user@example.com | + + +## Example 10: Time modifiers + +Time modifiers filter search results by time range using the implicit `@timestamp` field. They support various time formats for precise temporal filtering. 
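+
+Offsets and rounding can also be combined in a single modifier. The following sketch selects events from the start of yesterday onward; because relative modifiers are evaluated against the current time, the matching events depend on when the query runs:
+
+```sql
+search earliest='-1d@d' source=otellogs
+| fields @timestamp, severityText
+```
+{% include copy.html %}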
+ +### Absolute time filtering + +Filter logs within a specific time window using absolute timestamps: + +```sql +search earliest='2024-01-15 10:30:05' latest='2024-01-15 10:30:10' source=otellogs +| fields @timestamp, severityText +``` +{% include copy.html %} + +The query returns the following results: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:05.678901234 | FATAL | +| 2024-01-15 10:30:06.789012345 | TRACE | +| 2024-01-15 10:30:07.890123456 | ERROR | +| 2024-01-15 10:30:08.901234567 | WARN | +| 2024-01-15 10:30:09.012345678 | INFO | +| 2024-01-15 10:30:10.123456789 | TRACE2 | + +### Relative time filtering + +Filter logs using relative time expressions, such as events that occurred before 30 seconds ago: + +```sql +search latest=-30s source=otellogs +| sort @timestamp +| fields @timestamp, severityText +| head 3 +``` +{% include copy.html %} + +The query returns the following results: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | +| 2024-01-15 10:30:02.345678901 | WARN | + +### Time rounding + +Use time rounding expressions to filter events relative to time boundaries, such as before the start of the current minute: + +```sql +search latest='@m' source=otellogs +| fields @timestamp, severityText +| head 2 +``` +{% include copy.html %} + +The query returns the following results: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | + +### Unix timestamp filtering + +Filter logs using Unix epoch timestamps for precise time ranges: + +```sql +search earliest=1705314600 latest=1705314605 source=otellogs +| fields @timestamp, severityText +``` +{% include copy.html %} + +The query returns the following results: + +| @timestamp | severityText | +| --- | --- | +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | +| 2024-01-15 10:30:02.345678901 | WARN | +| 2024-01-15 10:30:03.456789012 | DEBUG | +| 2024-01-15 10:30:04.567890123 | INFO | + + +## Example 11: Escaping special characters + +Special characters fall into two categories, depending on whether they must always be escaped or only when you want to search for their literal value: + +- The following characters must always be escaped to be interpreted literally: + * **Backslash (`\`)**: Escape as `\\`. + * **Quotation mark (`"`)**: Escape as `\"` when used inside a quoted string. + +- These characters act as wildcards by default and should be escaped only when you want to match them literally: + * **Asterisk (`*`)**: Use as `*` for wildcard matching; escape as `\\*` for a literal asterisk. + * **Question mark (`?`)**: Use as `?` for wildcard matching; escape as `\\?` for a literal question mark. + +The following table compares wildcard and literal character matching. + +| Intent | PPL syntax | Result | +| ---| --- | --- | +| Wildcard search | `field=user*` | Matches `user`, `user123`, `userABC` | +| Literal `user*` | `field="user\\*"` | Matches only `user*` | +| Wildcard search | `field=log?` | Matches `log1`, `logA`, `logs` | +| Literal `log?` | `field="log\\?"` | Matches only `log?`| + +### Escaping backslash characters + +Each backslash in the search value must be escaped with another backslash. 
For example, the following query searches for Windows file paths by properly escaping backslashes:
+
+```sql
+search `attributes.error.type`="C:\\\\Users\\\\admin" source=otellogs
+| fields `attributes.error.type`
+```
+
+{% include copy.html %}
+
+The query returns the following results:
+
+| attributes.error.type |
+| --- |
+| C:\Users\admin |
+
+When using the REST API with JSON, additional JSON escaping is required.
+{: .note}
+
+### Quotation marks within strings
+
+Search for text containing quotation marks by escaping them with backslashes:
+
+```sql
+search body="\"exact phrase\"" source=otellogs
+| sort @timestamp
+| fields body
+| head 1
+```
+
+{% include copy.html %}
+
+The query returns the following results:
+
+| body |
+| --- |
+| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] |
+
+### Text containing special characters
+
+Search for literal text containing wildcard characters by escaping them:
+
+```sql
+search "wildcard\\* fuzzy~2" source=otellogs
+| fields body
+| head 1
+```
+
+{% include copy.html %}
+
+The query returns the following results:
+
+| body |
+| --- |
+| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] |
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/showdatasources.md b/_sql-and-ppl/ppl/commands/showdatasources.md
new file mode 100644
index 00000000000..de287f86c5a
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/showdatasources.md
@@ -0,0 +1,41 @@
+---
+layout: default
+title: show datasources
+parent: Commands
+grand_parent: PPL
+nav_order: 35
+---
+
+# show datasources
+
+The `show datasources` command queries data sources configured in the PPL engine. The `show datasources` command can only be used as the first command in the PPL query.
+
+To use the `show datasources` command, `plugins.calcite.enabled` must be set to `false`.
+{: .note}
+
+## Syntax
+
+The `show datasources` command has the following syntax:
+
+```sql
+show datasources
+```
+
+The `show datasources` command takes no parameters.
+
+## Example 1: Fetch all Prometheus data sources
+
+The following query fetches all Prometheus data sources:
+
+```sql
+show datasources
+| where CONNECTOR_TYPE='PROMETHEUS'
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| DATASOURCE_NAME | CONNECTOR_TYPE |
+| --- | --- |
+| my_prometheus | PROMETHEUS |
+
diff --git a/_sql-and-ppl/ppl/commands/sort.md b/_sql-and-ppl/ppl/commands/sort.md
new file mode 100644
index 00000000000..ee28f389b30
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/sort.md
@@ -0,0 +1,196 @@
+---
+layout: default
+title: sort
+parent: Commands
+grand_parent: PPL
+nav_order: 36
+---
+
+# sort
+
+The `sort` command sorts the search results by the specified fields.
+
+## Syntax
+
+The `sort` command supports two syntax notations. You must use one notation consistently within a single sort command.
+
+### Prefix notation
+
+The `sort` command has the following syntax in prefix notation:
+
+```sql
+sort [<count>] [+|-]<field> [, [+|-]<field>]...
+```
+
+### Suffix notation
+
+The `sort` command has the following syntax in suffix notation:
+
+```sql
+sort [<count>] <field> [asc|desc|a|d] [, <field> [asc|desc|a|d]]...
+```
+
+## Parameters
+
+The `sort` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field>` | Required | The field used to sort. Use `auto(field)`, `str(field)`, `ip(field)`, or `num(field)` to specify how to interpret field values. Multiple fields can be specified as a comma-separated list. |
+| `<count>` | Optional | The number of results to return. A value of `0` or less returns all results. Default is `0`. |
+| `[+|-]` | Optional | **Prefix notation only.** The plus sign (`+`) specifies ascending order, and the minus sign (`-`) specifies descending order. Default is ascending order. |
+| `[asc|desc|a|d]` | Optional | **Suffix notation only.** Specifies the sort order: `asc`/`a` for ascending, `desc`/`d` for descending. Default is ascending order. |
+
+## Example 1: Sort by one field
+
+The following query sorts all documents by the `age` field in ascending order. By default, the sort command returns all results, which is equivalent to specifying `sort 0 age`:
+
+```sql
+source=accounts
+| sort age
+| fields account_number, age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | age |
+| --- | --- |
+| 13 | 28 |
+| 1 | 32 |
+| 18 | 33 |
+| 6 | 36 |
+
+
+## Example 2: Sort by one field in descending order
+
+The following query sorts all documents by the `age` field in descending order. You can use either prefix notation (`- age`) or suffix notation (`age desc`):
+
+```sql
+source=accounts
+| sort - age
+| fields account_number, age
+```
+{% include copy.html %}
+
+This query is equivalent to the following query:
+
+```sql
+source=accounts
+| sort age desc
+| fields account_number, age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | age |
+| --- | --- |
+| 6 | 36 |
+| 18 | 33 |
+| 1 | 32 |
+| 13 | 28 |
+
+
+## Example 3: Sort by multiple fields in prefix notation
+
+The following query uses prefix notation to sort all documents by the `gender` field in ascending order and the `age` field in descending order:
+
+```sql
+source=accounts
+| sort + gender, - age
+| fields account_number, gender, age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender | age |
+| --- | --- | --- |
+| 13 | F | 28 |
+| 6 | M | 36 |
+| 18 | M | 33 |
+| 1 | M | 32 |
+
+
+## Example 4: Sort by multiple fields in suffix notation
+
+The following query uses suffix notation to sort all documents by the `gender` field in ascending order and the `age` field in descending order:
+
+```sql
+source=accounts
+| sort gender asc, age desc
+| fields account_number, gender, age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | gender | age |
+| --- | --- | --- |
+| 13 | F | 28 |
+| 6 | M | 36 |
+| 18 | M | 33 |
+| 1 | M | 32 |
+
+
+## Example 5: Sort fields with null values
+
+The default ascending order lists null values first. 
The following query sorts the `employer` field in the default order:
+
+```sql
+source=accounts
+| sort employer
+| fields employer
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| employer |
+| --- |
+| null |
+| Netagy |
+| Pyrami |
+| Quility |
+
+
+## Example 6: Specify the number of sorted documents to return
+
+The following query sorts all documents and returns two documents:
+
+```sql
+source=accounts
+| sort 2 age
+| fields account_number, age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number | age |
+| --- | --- |
+| 13 | 28 |
+| 1 | 32 |
+
+
+## Example 7: Sort by specifying field type
+
+The following query uses the `sort` command with `str()` to sort numeric values lexicographically:
+
+```sql
+source=accounts
+| sort str(account_number)
+| fields account_number
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number |
+| --- |
+| 1 |
+| 13 |
+| 18 |
+| 6 |
+
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/spath.md b/_sql-and-ppl/ppl/commands/spath.md
new file mode 100644
index 00000000000..471babf7802
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/spath.md
@@ -0,0 +1,117 @@
+---
+layout: default
+title: spath
+parent: Commands
+grand_parent: PPL
+nav_order: 37
+---
+
+# spath
+
+The `spath` command extracts fields from structured text data by allowing you to select JSON values using JSON paths.
+
+The `spath` command is not executed on OpenSearch data nodes. It extracts fields from data after it has been returned to the coordinator node, which is slow on large datasets. We recommend indexing fields needed for filtering directly instead of using `spath` to filter nested fields.
+{: .note}
+
+## Syntax
+
+The `spath` command has the following syntax:
+
+```sql
+spath input=<field> [output=<field>] [path=]<path>
+```
+
+## Parameters
+
+The `spath` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `input` | Required | The field containing JSON data to parse. |
+| `output` | Optional | The destination field in which the extracted data is stored. Default is the value of `path`. |
+| `<path>` | Required | The JSON path that identifies the data to extract. |
+
+
+
+## Example 1: Basic field extraction
+
+The basic use of `spath` extracts a single field from JSON data. 
The following query extracts the `n` field from JSON objects in the `doc_n` field:
+
+```sql
+source=structured
+| spath input=doc_n n
+| fields doc_n n
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| doc_n | n |
+| --- | --- |
+| {"n": 1} | 1 |
+| {"n": 2} | 2 |
+| {"n": 3} | 3 |
+
+
+## Example 2: Lists and nesting
+
+The following query shows how to traverse nested fields and extract list elements:
+
+```sql
+source=structured
+| spath input=doc_list output=first_element list{0}
+| spath input=doc_list output=all_elements list{}
+| spath input=doc_list output=nested nest_out.nest_in
+| fields doc_list first_element all_elements nested
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| doc_list | first_element | all_elements | nested |
+| --- | --- | --- | --- |
+| {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a |
+| {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a |
+| {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a |
+
+
+## Example 3: Sum of inner elements
+
+The following query shows how to use `spath` to extract the `n` field from JSON data and calculate the sum of all extracted values:
+
+```sql
+source=structured
+| spath input=doc_n n
+| eval n=cast(n as int)
+| stats sum(n)
+| fields `sum(n)`
+```
+{% include copy.html %}
+
+The query returns the following results. The `spath` command always returns inner values as strings:
+
+| sum(n) |
+| --- |
+| 6 |
+
+
+## Example 4: Escaped paths
+
+Use quoted string syntax to access JSON field names that contain spaces, dots, or other special characters:
+
+```sql
+source=structured
+| spath output=a input=doc_escape "['a fancy field name']"
+| spath output=b input=doc_escape "['a.b.c']"
+| fields a b
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| a | b |
+| --- | --- |
+| true | 0 |
+| true | 1 |
+| false | 2 |
+
diff --git a/_sql-and-ppl/ppl/commands/stats.md b/_sql-and-ppl/ppl/commands/stats.md
new file mode 100644
index 00000000000..765a311c5ee
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/stats.md
@@ -0,0 +1,477 @@
+---
+layout: default
+title: stats
+parent: Commands
+grand_parent: PPL
+nav_order: 38
+---
+
+# stats
+
+The `stats` command calculates aggregations on the search results.
+
+## Comparing `stats`, `eventstats`, and `streamstats`
+
+For a comprehensive comparison of the `stats`, `eventstats`, and `streamstats` commands, including their differences in transformation behavior, output format, aggregation scope, and use cases, see [Comparing stats, eventstats, and streamstats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/streamstats/#comparing-stats-eventstats-and-streamstats).
+
+## Syntax
+
+The `stats` command has the following syntax:
+
+```sql
+stats [bucket_nullable=bool] <aggregation>... [by-clause]
+```
+
+## Parameters
+
+The `stats` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<aggregation>` | Required | An aggregation function. |
+| `<by-clause>` | Optional | Groups results by specified fields or expressions. Syntax: `by [span-expression,] [field,]...` If no `by-clause` is specified, the stats command returns only one row, which is the aggregation over the entire search results. |
+| `bucket_nullable` | Optional | Controls whether to include `null` buckets in group-by aggregations. When `false`, ignores records in which the group-by field is null, resulting in faster performance. Default is the value of `plugins.ppl.syntax.legacy.preferred`. |
+| `<span-expression>` | Optional | Splits a field into buckets by intervals (maximum of one). Syntax: `span(field_expr, interval_expr)`. By default, the interval uses the field's default unit. For date/time fields, aggregation results ignore null values. Examples: `span(age, 10)` creates 10-year age buckets, and `span(timestamp, 1h)` creates hourly buckets. Valid time units are millisecond (`ms`), second (`s`), minute (`m`), hour (`h`), day (`d`), week (`w`), month (`M`), quarter (`q`), year (`y`). |
+
+## Aggregation functions
+
+The stats command supports the following aggregation functions:
+
+* `COUNT`/`C` -- Count of values
+* `SUM` -- Sum of numeric values
+* `AVG` -- Average of numeric values
+* `MAX` -- Maximum value
+* `MIN` -- Minimum value
+* `VAR_SAMP` -- Sample variance
+* `VAR_POP` -- Population variance
+* `STDDEV_SAMP` -- Sample standard deviation
+* `STDDEV_POP` -- Population standard deviation
+* `DISTINCT_COUNT_APPROX` -- Approximate distinct count
+* `TAKE` -- List of original values
+* `PERCENTILE`/`PERCENTILE_APPROX` -- Percentile calculations
+* `PERC`/`P` -- Percentile shortcut functions
+* `MEDIAN` -- 50th percentile
+* `EARLIEST` -- Earliest value by timestamp
+* `LATEST` -- Latest value by timestamp
+* `FIRST` -- First non-null value
+* `LAST` -- Last non-null value
+* `LIST` -- Collect all values into array
+* `VALUES` -- Collect unique values into sorted array
+
+
+
+## Example 1: Calculate the count of events
+
+The following query calculates the count of events in the `accounts` index:
+
+```sql
+source=accounts
+| stats count()
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| count() |
+| --- |
+| 4 |
+
+
+## Example 2: Calculate the average of a field
+
+The following query calculates the average age for all accounts:
+
+```sql
+source=accounts
+| stats avg(age)
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| avg(age) |
+| --- |
+| 32.25 |
+
+
+## Example 3: Calculate the average of a field by group
+
+The following query calculates the average age for all accounts, grouped by gender:
+
+```sql
+source=accounts
+| stats avg(age) by gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| avg(age) | gender |
+| --- | --- |
+| 28.0 | F |
+| 33.666666666666664 | M |
+
+
+## Example 4: Calculate the average, sum, and count of a field by group
+
+The following query calculates the average age, sum of ages, and count of events for all accounts, grouped by gender:
+
+```sql
+source=accounts
+| stats avg(age), sum(age), count() by gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| avg(age) | sum(age) | count() | gender |
+| --- | --- | --- | --- |
+| 28.0 | 28 | 1 | F |
+| 33.666666666666664 | 101 | 3 | M |
+
+
+## Example 5: Calculate the maximum of a field
+
+The following query calculates the maximum age for all accounts:
+
+```sql
+source=accounts
+| stats max(age)
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| max(age) |
+| --- |
+| 36 |
+
+
+## Example 6: Calculate the maximum and minimum of a field by group
+
+The following query calculates the maximum and minimum ages for all accounts, grouped by gender:
+
+```sql
+source=accounts
+| stats max(age), min(age) by gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| max(age) | min(age) | gender |
+| --- | --- | --- |
+| 28 | 28 | F |
+| 36 | 32 | M |
+
+
+## Example 7: 
Calculate the distinct count of a field + +To retrieve the count of distinct values of a field, you can use `DISTINCT_COUNT` (or `DC`) function instead of `COUNT`. The following query calculates both the count and the distinct count of the `gender` field for all accounts: + +```sql +source=accounts +| stats count(gender), distinct_count(gender) +``` +{% include copy.html %} + +The query returns the following results: + +| count(gender) | distinct_count(gender) | +| --- | --- | +| 4 | 2 | + + +## Example 8: Calculate the count by a span + +The following query retrieves the count of `age` values grouped into 10-year intervals: + +```sql +source=accounts +| stats count(age) by span(age, 10) as age_span +``` +{% include copy.html %} + +The query returns the following results: + +| count(age) | age_span | +| --- | --- | +| 1 | 20 | +| 3 | 30 | + + +## Example 9: Calculate the count by a gender and span + +The following query retrieves the count of `age` grouped into 5-year intervals and broken down by `gender`: + +```sql +source=accounts +| stats count() as cnt by span(age, 5) as age_span, gender +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | age_span | gender | +| --- | --- | --- | +| 1 | 25 | F | +| 2 | 30 | M | +| 1 | 35 | M | + +The `span` expression is always treated as the first grouping key, regardless of its position in the `by` clause: + +```sql +source=accounts +| stats count() as cnt by gender, span(age, 5) as age_span +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | age_span | gender | +| --- | --- | --- | +| 1 | 25 | F | +| 2 | 30 | M | +| 1 | 35 | M | + + +## Example 10: Count and retrieve email list by gender and age span + +The following query calculates the count of `age` values grouped into 5-year intervals and by `gender`, and also returns a list of up to 5 emails for each group: + +```sql +source=accounts +| stats count() as cnt, take(email, 5) by span(age, 5) as age_span, gender +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | take(email, 5) | age_span | gender | +| --- | --- | --- | --- | +| 1 | [] | 25 | F | +| 2 | [amberduke@pyrami.com,daleadams@boink.com] | 30 | M | +| 1 | [hattiebond@netagy.com] | 35 | M | + + +## Example 11: Calculate the percentile of a field + +The following query calculates the 90th percentile of `age` for all accounts: + +```sql +source=accounts +| stats percentile(age, 90) +``` +{% include copy.html %} + +The query returns the following results: + +| percentile(age, 90) | +| --- | +| 36 | + + +## Example 12: Calculate the percentile of a field by group + +The following query calculates the 90th percentile of `age` for all accounts, grouped by `gender`: + +```sql +source=accounts +| stats percentile(age, 90) by gender +``` +{% include copy.html %} + +The query returns the following results: + +| percentile(age, 90) | gender | +| --- | --- | +| 28 | F | +| 36 | M | + + +## Example 13: Calculate the percentile by a gender and span + +The following query calculates the 90th percentile of `age`, grouped into 10-year intervals and by `gender`: + +```sql +source=accounts +| stats percentile(age, 90) as p90 by span(age, 10) as age_span, gender +``` +{% include copy.html %} + +The query returns the following results: + +| p90 | age_span | gender | +| --- | --- | --- | +| 28 | 20 | F | +| 36 | 30 | M | + + +## Example 14: Collect all values in a field using LIST + +The following query collects all `firstname` values, preserving duplicates and 
order: + +```sql +source=accounts +| stats list(firstname) +``` +{% include copy.html %} + +The query returns the following results: + +| list(firstname) | +| --- | +| [Amber,Hattie,Nanette,Dale] | + + +## Example 15: Ignore null bucket + +The following query excludes null values from grouping by setting `bucket_nullable=false`: + +```sql +source=accounts +| stats bucket_nullable=false count() as cnt by email +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | email | +| --- | --- | +| 1 | amberduke@pyrami.com | +| 1 | daleadams@boink.com | +| 1 | hattiebond@netagy.com | + + +## Example 16: Collect unique values in a field using VALUES + +The following query collects all unique `firstname` values, sorted lexicographically with duplicates removed: + +```sql +source=accounts +| stats values(firstname) +``` +{% include copy.html %} + +The query returns the following results: + +| values(firstname) | +| --- | +| [Amber,Dale,Hattie,Nanette] | + + +## Example 17: Date span grouping with null handling + +The following example uses this sample index data. + +| Name | DEPTNO | birthday | +| --- | --- | --- | +| Alice | 1 | 2024-04-21 | +| Bob | 2 | 2025-08-21 | +| Jeff | null | 2025-04-22 | +| Adam | 2 | null | + +The following query groups data by yearly spans of the `birthday` field, automatically excluding null values: + +```sql +source=example +| stats count() as cnt by span(birthday, 1y) as year +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | year | +| --- | --- | +| 1 | 2024-01-01 | +| 2 | 2025-01-01 | + +Group by both yearly spans and department number (by default, null `DEPTNO` values are included in the results): + +```sql +source=example +| stats count() as cnt by span(birthday, 1y) as year, DEPTNO +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | year | DEPTNO | +| --- | --- | --- | +| 1 | 2024-01-01 | 1 | +| 1 | 2025-01-01 | 2 | +| 1 | 2025-01-01 | null | + +Use `bucket_nullable=false` to exclude null `DEPTNO` values from the grouping: + +```sql +source=example +| stats bucket_nullable=false count() as cnt by span(birthday, 1y) as year, DEPTNO +``` +{% include copy.html %} + +The query returns the following results: + +| cnt | year | DEPTNO | +| --- | --- | --- | +| 1 | 2024-01-01 | 1 | +| 1 | 2025-01-01 | 2 | + + +## Example 18: Calculate the count by the implicit @timestamp field + +If you omit the `field` parameter in the `span` function, it automatically uses the implicit `@timestamp` field: + +```sql +source=big5 +| stats count() by span(1month) +``` +{% include copy.html %} + +The query returns the following results: + +| count() | span(1month) | +| --- | --- | +| 1 | 2023-01-01 00:00:00 | + +## Limitations + +The following limitations apply to the `stats` command. + +### Bucket aggregation results may be approximate for high-cardinality fields + +In OpenSearch, `doc_count` values for a `terms` bucket aggregation can be approximate. Thus, any aggregations (such as `sum` or `avg`) performed on those buckets may also be approximate. + +For example, the following query retrieves the top 10 URLs: + +```sql +source=hits +| stats bucket_nullable=false count() as c by URL +| sort - c +| head 10 +``` + +{% include copy.html %} + +This query is translated into a `terms` aggregation in OpenSearch with `"order": { "_count": "desc" }`. For fields with high cardinality, some buckets may be discarded, so the results may only be approximate. 
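+
+The following is a rough sketch of the `terms` aggregation that the preceding query is translated into; the exact request generated by the plugin may differ:
+
+```json
+{
+  "aggs": {
+    "URL": {
+      "terms": {
+        "field": "URL",
+        "size": 10,
+        "order": { "_count": "desc" }
+      }
+    }
+  }
+}
+```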
+
+### Sorting by `doc_count` in ascending order may produce inaccurate results
+
+When retrieving the least frequent terms for high-cardinality fields, results may be inaccurate. Shard-level aggregations can miss globally rare terms or misrepresent their frequency, causing errors in the overall results.
+
+For example, the following query retrieves the 10 least frequent URLs:
+
+```sql
+source=hits
+| stats bucket_nullable=false count() as c by URL
+| sort + c
+| head 10
+```
+
+{% include copy.html %}
+
+A globally rare term might not appear as rare on every shard or could be entirely absent from some shard results. Conversely, a term infrequent on one shard might be common on another. In both cases, shard-level approximations can cause rare terms to be missed, leading to inaccurate overall results.
diff --git a/_sql-and-ppl/ppl/commands/streamstats.md b/_sql-and-ppl/ppl/commands/streamstats.md
new file mode 100644
index 00000000000..f11b24cb494
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/streamstats.md
@@ -0,0 +1,263 @@
+---
+layout: default
+title: streamstats
+parent: Commands
+grand_parent: PPL
+nav_order: 39
+---
+
+# streamstats
+
+The `streamstats` command calculates cumulative or rolling statistics as events are processed in order. Unlike `stats` or `eventstats`, which operate on the entire dataset at once, `streamstats` processes events incrementally, making it suitable for time-series and sequence-based analysis.
+
+Key features include support for the `window` (sliding window calculations) and `current` (whether to include the current event in calculations) parameters, and specialized use cases such as identifying trends or detecting changes over sequences of events.
+
+## Comparing `stats`, `eventstats`, and `streamstats`
+
+The `stats`, `eventstats`, and `streamstats` commands can all generate aggregations such as average, sum, and maximum. However, they differ in how they operate and the results they produce. The following table summarizes these differences.
+
+| Aspect | `stats` | `eventstats` | `streamstats` |
+| --- | --- | --- | --- |
+| Transformation behavior | Transforms all events into an aggregated result table, losing original event structure | Adds aggregation results as new fields to the original events without removing the event structure | Adds cumulative (running) aggregation results to each event as they stream through the pipeline |
+| Output format | Output contains only aggregated values. Original raw events are not preserved | Original events remain, with extra fields containing summary statistics | Original events remain, with extra fields containing running totals or cumulative statistics |
+| Aggregation scope | Based on all events in the search (or groups defined by the `by` clause) | Based on all relevant events, then the result is added back to each event in the group | Calculations occur progressively as each event is processed; can be scoped by window |
+| Use cases | When only aggregated results are needed (for example, counts, averages, sums) | When aggregated statistics are needed alongside original event data | When a running total or cumulative statistic is needed across event streams |
+
+
+## Syntax
+
+The `streamstats` command has the following syntax:
+
+```sql
+streamstats [bucket_nullable=bool] [current=<boolean>] [window=<int>] [global=<boolean>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <aggregation>... [by-clause]
+```
+
+The following are examples of the `streamstats` command syntax:
+
+```sql
+source = table | streamstats avg(a)
+source = table | streamstats current = false avg(a)
+source = table | streamstats window = 5 sum(b)
+source = table | streamstats current = false window = 2 max(a)
+source = table | where a < 50 | streamstats count(c)
+source = table | streamstats min(c), max(c) by b
+source = table | streamstats count(c) as count_by by b | where count_by > 1000
+source = table | streamstats dc(field) as distinct_count
+source = table | streamstats distinct_count(category) by region
+source = table | streamstats current=false window=2 global=false avg(a) by b
+source = table | streamstats window=2 reset_before=a>31 avg(b)
+source = table | streamstats current=false reset_after=a>31 avg(b) by c
+```
+{% include copy.html %}
+
+## Parameters
+
+The `streamstats` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<aggregation>` | Required | An aggregation function or window function. |
+| `bucket_nullable` | Optional | Controls whether to consider null buckets as a valid group in group-by aggregations. When `false`, does not treat null group-by values as a distinct group during aggregation. Default is the value of `plugins.ppl.syntax.legacy.preferred`. |
+| `current` | Optional | Whether to include the current event in summary calculations. When `true`, includes the current event; when `false`, uses the field value from the previous event. Default is `true`. |
+| `window` | Optional | The number of events to use when computing statistics. Default is `0` (all previous and current events are used). |
+| `global` | Optional | Used only when `window` is specified. Determines whether to use a single window (`true`) or separate windows for each group defined by the `by` clause (`false`). When `false` and `window` is non-zero, a separate window is used for each group of values of the field specified in the `by` clause. Default is `true`. |
+| `reset_before` | Optional | Resets all accumulated statistics before `streamstats` computes the running metrics for an event when the `eval-expression` evaluates to `true`. If used with `window`, the window is also reset. Syntax: `reset_before="(<eval-expression>)"`. Default is `false`. |
+| `reset_after` | Optional | Resets all accumulated statistics after `streamstats` computes the running metrics for an event when the `eval-expression` evaluates to `true`. The expression can reference fields returned by `streamstats`. If used with `window`, the window is also reset. Syntax: `reset_after="(<eval-expression>)"`. Default is `false`. |
+| `<by-clause>` | Optional | Fields and expressions for grouping, including scalar functions and aggregation functions. The `span` clause can be used to split specific fields into buckets by intervals. Syntax: `by [span-expression,] [field,]...` If not specified, all events are processed as a single group and running statistics are computed across the entire event stream. |
+| `<span-expression>` | Optional | Splits a field into buckets by intervals (maximum of one). Syntax: `span(field_expr, interval_expr)`. By default, the interval uses the field's default unit. For date/time fields, aggregation results ignore null values. Examples: `span(age, 10)` creates 10-year age buckets, and `span(timestamp, 1h)` creates hourly buckets. Valid time units are millisecond (`ms`), second (`s`), minute (`m`), hour (`h`), day (`d`), week (`w`), month (`M`), quarter (`q`), year (`y`). 
| + + +## Aggregation functions + +The streamstats command supports the following aggregation functions: + +* `COUNT` -- Count of values +* `SUM` -- Sum of numeric values +* `AVG` -- Average of numeric values +* `MAX` -- Maximum value +* `MIN` -- Minimum value +* `VAR_SAMP` -- Sample variance +* `VAR_POP` -- Population variance +* `STDDEV_SAMP` -- Sample standard deviation +* `STDDEV_POP` -- Population standard deviation +* `DISTINCT_COUNT`/`DC` -- Distinct count of values +* `EARLIEST` -- Earliest value by timestamp +* `LATEST` -- Latest value by timestamp + + + +## Example 1: Calculate the running average, sum, and count of a field by group + +The following query calculates the running average age, running sum of age, and running count of events for all accounts, grouped by `gender`: + +```sql +source=accounts +| streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | + + +## Example 2: Calculate the running maximum over a 2-row window + +The following query calculates the running maximum age over a 2-row window, excluding the current event: + +```sql +source=state_country +| streamstats current=false window=2 max(age) as prev_max_age +``` +{% include copy.html %} + +The query returns the following results: + +| name | country | state | month | year | age | prev_max_age | +| --- | --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | null | +| Hello | USA | New York | 4 | 2023 | 30 | 70 | +| John | Canada | Ontario | 4 | 2023 | 25 | 70 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 25 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 27 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 57 | +| David | USA | Washington | 4 | 2023 | 40 | 70 | + + +## Example 3: Global vs group-specific windows + +The `global` parameter takes the following values: + +* `true`: A global window is applied across all rows, but the calculations inside the window still respect the by groups. +* `false`: The window itself is created per group, meaning each group gets its own independent window. + +The following example uses a sample index containing the following data. + +| name | country | state | month | year | age | +| --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | +| Hello | USA | New York | 4 | 2023 | 30 | +| John | Canada | Ontario | 4 | 2023 | 25 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | +| Jim | Canada | B.C | 4 | 2023 | 27 | +| Peter | Canada | B.C | 4 | 2023 | 57 | +| Rick | Canada | B.C | 4 | 2023 | 70 | +| David | USA | Washington | 4 | 2023 | 40 | + +The following examples calculate the running average of `age` across accounts by country, using a different `global` parameter. 
+ +When `global=true`, the window slides across all rows in input order, but aggregation is still computed by `country`. The sliding window size is `2`: + +```sql +source=state_country +| streamstats window=2 global=true avg(age) as running_avg by country +``` +{% include copy.html %} + +As a result, `David` and `Rick` are included in the same sliding window when computing `running_avg` across all rows globally: + +| name | country | state | month | year | age | running_avg | +| --- | --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | 70.0 | +| Hello | USA | New York | 4 | 2023 | 30 | 50.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | +| David | USA | Washington | 4 | 2023 | 40 | 40.0 | + +In contrast, when `global=false`, each `by` group forms its own independent stream and window: + +```sql +source=state_country +| streamstats window=2 global=false avg(age) as running_avg by country +``` +{% include copy.html %} + +`David` and `Hello` form a window for the `USA` group. As a result, for `David`, the `running_avg` is `35.0` instead of `40.0` in the previous case: + +| name | country | state | month | year | age | running_avg | +| --- | --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | 70.0 | +| Hello | USA | New York | 4 | 2023 | 30 | 50.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | +| David | USA | Washington | 4 | 2023 | 40 | 35.0 | + + +## Example 4: Conditional statistics reset + +The following query calculates the running average of `age` across accounts by `country`, with resets applied: + +```sql +source=state_country +| streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country +``` +{% include copy.html %} + +The query returns the following results: + +| name | country | state | month | year | age | avg_age | +| --- | --- | --- | --- | --- | --- | --- | +| Jake | USA | California | 4 | 2023 | 70 | null | +| Hello | USA | New York | 4 | 2023 | 30 | 70.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | null | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | +| Jim | Canada | B.C | 4 | 2023 | 27 | null | +| Peter | Canada | B.C | 4 | 2023 | 57 | null | +| Rick | Canada | B.C | 4 | 2023 | 70 | null | +| David | USA | Washington | 4 | 2023 | 40 | null | + + +## Example 5: Null bucket behavior + +When `bucket_nullable=false`, null values are excluded from group-by aggregations: + +```sql +source=accounts +| streamstats bucket_nullable=false count() as cnt by employer +| fields account_number, firstname, employer, cnt +``` +{% include copy.html %} + +Rows in which the `by` field is `null` are excluded from aggregation, so the `cnt` for `Dale` is `null`: + +| account_number | firstname | employer | cnt | +| --- | --- | --- | --- | +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | null | + +When `bucket_nullable=true`, null values are treated as a valid group: + +```sql +source=accounts +| streamstats bucket_nullable=true count() as cnt by employer +| fields account_number, firstname, employer, cnt +``` +{% include copy.html %} + +As 
a result, the `cnt` for `Dale` is included and calculated normally: + +| account_number | firstname | employer | cnt | +| --- | --- | --- | --- | +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | 1 | + \ No newline at end of file diff --git a/_sql-and-ppl/ppl/cmd/subquery.md b/_sql-and-ppl/ppl/commands/subquery.md similarity index 54% rename from _sql-and-ppl/ppl/cmd/subquery.md rename to _sql-and-ppl/ppl/commands/subquery.md index 94ca89fe869..04af756143d 100644 --- a/_sql-and-ppl/ppl/cmd/subquery.md +++ b/_sql-and-ppl/ppl/commands/subquery.md @@ -1,101 +1,46 @@ --- layout: default -title: "subquery" -parent: "Commands" -grand_parent: "PPL" +title: subquery +parent: Commands +grand_parent: PPL nav_order: 40 --- + # subquery +The `subquery` command allows you to embed one PPL query within another, enabling advanced filtering and data retrieval. A subquery is executed first, and its results are used by the outer query for filtering, comparison, or joining. -The `subquery` command embeds one PPL query inside another, enabling complex filtering and data retrieval operations. A subquery is a nested query that executes first and returns results that are used by the outer query for filtering, comparison, or joining operations. -Subqueries are useful for: -1. Filtering data based on results from another query -2. Checking for the existence of related data -3. Performing calculations that depend on aggregated values from other tables -4. Creating complex joins with dynamic conditions - +Common use cases for subqueries include: + +* Filtering data based on the results of another query. +* Checking for the existence of related data. +* Performing calculations that rely on aggregated values from other tables. +* Creating complex joins with dynamic conditions. ## Syntax -Use the following syntax: +The `subquery` command has the following syntax: `subquery: [ source=... | ... | ... ]` -Subqueries use the same syntax as regular PPL queries but must be enclosed in square brackets. There are four main types of subqueries: +Subqueries use the same syntax as regular PPL queries but must be enclosed in square brackets. There are four main subquery types: + +- [`IN`](#in-subquery) +- [`EXISTS`](#exists-subquery) +- [Scalar](#scalar-subquery) +- [Relation](#relation-subquery) + +### IN subquery -**IN Subquery** Tests whether a field value exists in the results of a subquery: ```sql where [not] in [ source=... | ... | ... ] ``` {% include copy.html %} - -**EXISTS Subquery** -Tests whether a subquery returns any results: - -```sql -where [not] exists [ source=... | ... | ... ] -``` - -**Scalar Subquery** -Returns a single value that can be used in comparisons or calculations - -```sql -where = [ source=... | ... | ... ] -``` -{% include copy.html %} - -**Relation Subquery** -Used in join operations to provide dynamic right-side data - -```sql -| join ON condition [ source=... | ... | ... ] -``` -{% include copy.html %} - - -## Configuration - -The following settings configure the `subquery` command behavior. - -### plugins.ppl.subsearch.maxout - -The size configures the maximum of rows to return from subsearch. The default value is: `10000`. A value of `0` indicates that the restriction is unlimited. 
- -Change the subsearch.maxout to unlimited: - -```bash -curl -sS -H 'Content-Type: application/json' \ --X PUT localhost:9200/_plugins/_query/settings \ --d '{"persistent" : {"plugins.ppl.subsearch.maxout" : "0"}}' -``` -{% include copy.html %} - -Expected output: -```json -{ - "acknowledged": true, - "persistent": { - "plugins": { - "ppl": { - "subsearch": { - "maxout": "-1" - } - } - } - }, - "transient": {} -} -``` - +The following are examples of the `IN` subquery syntax: -## Usage - -InSubquery: - ```sql source = outer | where a in [ source = inner | fields b ] source = outer | where (a) in [ source = inner | fields b ] @@ -104,14 +49,23 @@ source = outer | where a not in [ source = inner | fields b ] source = outer | where (a) not in [ source = inner | fields b ] source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ] source = outer a in [ source = inner | fields b ] // search filtering with subquery -source = outer a not in [ source = inner | fields b ] // search filtering with subquery) +source = outer a not in [ source = inner | fields b ] // search filtering with subquery source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ] // nested source = table1 | inner join left = l right = r on l.a = r.a AND r.a in [ source = inner | fields d ] | fields l.a, r.a, b, c //as join filter ``` {% include copy.html %} -ExistsSubquery: +### EXISTS subquery + +Tests whether a subquery returns any results: +```sql +where [not] exists [ source=... | ... | ... ] +``` +{% include copy.html %} + +The following are examples of the `EXISTS` subquery syntax: + ```sql // Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table nested source = outer | where exists [ source = inner | where a = c ] @@ -119,7 +73,7 @@ source = outer | where not exists [ source = inner | where a = c ] source = outer | where exists [ source = inner | where a = c and b = d ] source = outer | where not exists [ source = inner | where a = c and b = d ] source = outer exists [ source = inner | where a = c ] // search filtering with subquery -source = outer not exists [ source = inner | where a = c ] //search filtering with subquery +source = outer not exists [ source = inner | where a = c ] // search filtering with subquery source = table as t1 exists [ source = table as t2 | where t1.a = t2.a ] //table alias is useful in exists subquery source = outer | where exists [ source = inner1 | where a = c and exists [ source = nested | where c = e ] ] //nested source = outer | where exists [ source = inner1 | where a = c | where exists [ source = nested | where c = e ] ] //nested @@ -129,8 +83,17 @@ source = outer | where exists [ source = inner ] | eval l = "nonEmpty" | fields ``` {% include copy.html %} -ScalarSubquery: +### Scalar subquery + +Returns a single value that can be used in comparisons or calculations: +```sql +where = [ source=... | ... | ... 
] +``` +{% include copy.html %} + +The following are examples of the scalar subquery syntax: + ```sql //Uncorrelated scalar subquery in Select source = outer | eval m = [ source = inner | stats max(c) ] | fields m, a @@ -153,74 +116,95 @@ source = outer [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats //Nested scalar subquery source = outer | where a = [ source = inner | stats max(c) | sort c ] OR b = [ source = inner | where c = 1 | stats min(d) | sort d ] source = outer | where a = [ source = inner | where c = [ source = nested | stats max(e) by f | sort f ] | stats max(d) by c | sort c | head 1 ] -RelationSubquery +``` +{% include copy.html %} + +### Relation Subquery + +Used in `join` operations to provide dynamic right-side data: + +```sql +| join ON condition [ source=... | ... | ... ] +``` +{% include copy.html %} + +The following are examples of the relation subquery syntax: + +```sql source = table1 | join left = l right = r on condition [ source = table2 | where d > 10 | head 5 ] //subquery in join right side source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1 ``` {% include copy.html %} - -## Example 1: TPC-H q20 +## Configuration -The following example PPL query shows a complex TPC-H query 20 implementation using nested subqueries. - -```bash -curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = supplier - | join ON s_nationkey = n_nationkey nation - | where n_name = 'CANADA' - and s_suppkey in [ - source = partsupp - | where ps_partkey in [ - source = part - | where like(p_name, 'forest%') - | fields p_partkey - ] - and ps_availqty > [ - source = lineitem - | where l_partkey = ps_partkey - and l_suppkey = ps_suppkey - and l_shipdate >= date('1994-01-01') - and l_shipdate < date_add(date('1994-01-01'), interval 1 year) - | stats sum(l_quantity) as sum_l_quantity - | eval half_sum_l_quantity = 0.5 * sum_l_quantity // Stats and Eval commands can combine when issues/819 resolved - | fields half_sum_l_quantity - ] - | fields ps_suppkey - ] - """ -}' +The `subquery` command behavior is configured using the `plugins.ppl.subsearch.maxout` setting, which specifies the maximum number of rows to return from the subsearch. Default is `10000`. A value of `0` indicates that the restriction is unlimited. 
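+
+For example, with the default limit, a filtering subsearch such as the following passes at most 10,000 `customer_id` values from the inner query to the outer `in` filter. This is a minimal sketch that assumes hypothetical `orders` and `vip_customers` indexes sharing a `customer_id` field:
+
+```sql
+source = orders | where customer_id in [ source = vip_customers | fields customer_id ]
+```
+{% include copy.html %}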
+ +To update the setting, send the following request: + +```json +PUT /_plugins/_query/settings +{ + "persistent": { + "plugins.ppl.subsearch.maxout": "0" + } +} +``` +{% include copy-curl.html %} + +## Example 1: TPC-H q20 + +The following query demonstrates a complex TPC-H query 20 implementation using nested subqueries: + +```sql +source = supplier +| join ON s_nationkey = n_nationkey nation +| where n_name = 'CANADA' + and s_suppkey in [ + source = partsupp + | where ps_partkey in [ + source = part + | where like(p_name, 'forest%') + | fields p_partkey + ] + and ps_availqty > [ + source = lineitem + | where l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date('1994-01-01') + and l_shipdate < date_add(date('1994-01-01'), interval 1 year) + | stats sum(l_quantity) as sum_l_quantity + | eval half_sum_l_quantity = 0.5 * sum_l_quantity // Stats and Eval commands can combine when issues/819 resolved + | fields half_sum_l_quantity + ] + | fields ps_suppkey + ] ``` {% include copy.html %} -## Example 2: TPC-H q22 +## Example 2: TPC-H q22 -The following example PPL query shows a TPC-H query 22 implementation using EXISTS and scalar subqueries. - -```bash -curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = [ +The following query demonstrates a TPC-H query 22 implementation using `EXISTS` and scalar subqueries: + +```sql +source = [ + source = customer + | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > [ source = customer - | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') - and c_acctbal > [ - source = customer - | where c_acctbal > 0.00 - and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') - | stats avg(c_acctbal) - ] - and not exists [ - source = orders - | where o_custkey = c_custkey - ] - | eval cntrycode = substring(c_phone, 1, 2) - | fields cntrycode, c_acctbal - ] as custsale - | stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode - | sort cntrycode - """ -}' + | where c_acctbal > 0.00 + and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + | stats avg(c_acctbal) + ] + and not exists [ + source = orders + | where o_custkey = c_custkey + ] + | eval cntrycode = substring(c_phone, 1, 2) + | fields cntrycode, c_acctbal + ] as custsale +| stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode +| sort cntrycode ``` {% include copy.html %} diff --git a/_sql-and-ppl/ppl/syntax.md b/_sql-and-ppl/ppl/commands/syntax.md similarity index 52% rename from _sql-and-ppl/ppl/syntax.md rename to _sql-and-ppl/ppl/commands/syntax.md index 69ef9e9b30e..2e33df6519d 100644 --- a/_sql-and-ppl/ppl/syntax.md +++ b/_sql-and-ppl/ppl/commands/syntax.md @@ -1,10 +1,9 @@ --- layout: default -title: Syntax -parent: PPL +title: PPL syntax +parent: Commands +grand_parent: PPL nav_order: 1 -redirect_from: - - /search-plugins/sql/ppl/syntax/ --- # PPL syntax @@ -23,18 +22,62 @@ search source= [boolean-expression] source= [boolean-expression] ``` {% include copy.html %} + +## Parameters -Field | Description | Required -:--- | :--- |:--- -`index` | Specifies the index to query. | No -`bool-expression` | Specifies an expression that evaluates to a Boolean value. | No +The `search` command supports the following parameters. + +| Parameter | Required/Optional | Description | +| --- | --- | --- | +| `` | Optional | Specifies the index to query. 
|
+| `<bool-expression>` | Optional | Specifies an expression that evaluates to a Boolean value. |
+
+
+## Syntax notation conventions
+
+PPL command syntax uses the following notation conventions.
+
+### Placeholders
+
+Placeholders are shown in angle brackets (`< >`). These must be replaced with actual values.
+
+**Example**: `<field>` means you must specify an actual field name like `age` or `firstname`.
+
+### Optional elements
+
+Optional elements are enclosed in square brackets (`[ ]`). These can be omitted from the command.
+
+**Examples**:
+- `[+|-]` means the plus or minus signs are optional
+- `[<alias>]` means the alias placeholder is optional
+
+### Required choices
+
+Required choices between alternatives are shown in parentheses and are delimited with pipe separators (`(option1 | option2)`). You must choose exactly one of the specified options.
+
+**Example**: `(on | where)` means you must use either `on` or `where`, but not both.
+
+### Optional choices
+
+Optional choices between alternatives are shown in square brackets with pipe separators (`[option1 | option2]`). You can choose one of the options or omit them entirely.
+
+**Example**: `[asc | desc]` means you can specify `asc`, `desc`, or neither.
+
+### Repetition
+
+Ellipsis (`...`) indicates that the preceding element can be repeated multiple times.
+
+**Examples**:
+- `<field>...` means one or more fields without commas: `field1 field2 field3`
+- `<field>, ...` means comma-separated repetition: `field1, field2, field3`
+
## Examples

**Example 1: Search through accounts index**

-In the following example, the `search` command refers to an `accounts` index as the source and uses `fields` and `where` commands for the conditions:
+In the following query, the `search` command refers to an `accounts` index as the source and uses `fields` and `where` commands for the conditions:

```sql
search source=accounts
| where age > 18
| fields firstname, lastname
```
{% include copy.html %}

-
-In the following examples, angle brackets `< >` enclose required arguments and square brackets `[ ]` enclose optional arguments.
-{: .note }
-
-
**Example 2: Get all documents**

To get all documents from the `accounts` index, specify it as the `source`:

@@ -74,7 +112,6 @@ search source=accounts account_number=1 or gender=\"F\";
```
{% include copy.html %}

-
| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname
| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---
| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |
diff --git a/_sql-and-ppl/ppl/commands/table.md b/_sql-and-ppl/ppl/commands/table.md
new file mode 100644
index 00000000000..cfefc8334de
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/table.md
@@ -0,0 +1,52 @@
+---
+layout: default
+title: table
+parent: Commands
+grand_parent: PPL
+nav_order: 42
+---
+
+# table
+
+The `table` command is an alias for the [`fields`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/fields/) command and provides the same field selection capabilities. It allows you to keep or remove fields from the search results using enhanced syntax options.
+
+## Syntax
+
+The `table` command has the following syntax:
+
+```sql
+table [+|-] <field-list>
+```
+
+## Parameters
+
+The `table` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<field-list>` | Required | A comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. |
+| `[+|-]` | Optional | Specifies the fields to keep or remove. If the plus (+) is used, only the fields specified in the field list are kept. If the minus (-) is used, all the fields specified in the field list are removed. Default is `+`. |
+
+## Example: Basic table command usage
+
+The following query shows basic field selection using the `table` command:
+
+```sql
+source=accounts
+| table firstname lastname age
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| firstname | lastname | age |
+| --- | --- | --- |
+| Amber | Duke | 32 |
+| Hattie | Bond | 36 |
+| Nanette | Bates | 28 |
+| Dale | Adams | 33 |
+
+
+## Related documentation
+
+- [`fields`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/fields/) - An alias command with identical functionality
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/timechart.md b/_sql-and-ppl/ppl/commands/timechart.md
new file mode 100644
index 00000000000..1676b1ff157
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/timechart.md
@@ -0,0 +1,368 @@
+---
+layout: default
+title: timechart
+parent: Commands
+grand_parent: PPL
+nav_order: 43
+---
+
+# timechart
+
+The `timechart` command creates a time-based aggregation of data. It groups data by time intervals and, optionally, by a field, and then applies an aggregation function to each group. The results are returned in an unpivoted format, with separate rows for each time-field combination.
+
+## Syntax
+
+The `timechart` command has the following syntax:
+
+```sql
+timechart [timefield=<field>] [span=<interval>] [limit=<int>] [useother=<bool>] [usenull=<bool>] [nullstr=<string>] <aggregation> [by <field>]
+```
+
+## Parameters
+
+The `timechart` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `timefield` | Optional | The field to use for time-based grouping. Must be a timestamp field. Default is `@timestamp`. |
+| `span` | Optional | Specifies the time interval for grouping data. Default is `1m` (1 minute). For a complete list of supported time units, see [Time units](#time-units). |
+| `limit` | Optional | Specifies the maximum number of distinct values to display when using the `by` clause. Default is `10`. When there are more distinct values than the limit, additional values are grouped into an `OTHER` category if `useother` is not set to `false`. The "most distinct" values are determined by calculating the sum of aggregation values across all time intervals. Set to `0` to show all distinct values without any limit (when `limit=0`, `useother` is automatically set to `false`). Only applies when using the `by` clause. |
+| `useother` | Optional | Controls whether to create an `OTHER` category for values beyond the `limit`. When set to `false`, only the top N values (based on `limit`) are shown without an `OTHER` category. When set to `true`, values beyond the `limit` are grouped into an `OTHER` category. This parameter only applies when using the `by` clause and when there are more values than the `limit`. Default is `true`. |
+| `usenull` | Optional | Controls whether to group documents that have null values in the `by` field into a separate `NULL` category. When `usenull=false`, documents with null values in the `by` field are excluded from the results. When `usenull=true`, documents with null values in the `by` field are grouped into a separate `NULL` category. Default is `true`. |
+| `nullstr` | Optional | Specifies the category name for documents that have null values in the `by` field. This parameter only applies when `usenull` is `true`. Default is `"NULL"`. |
+| `<aggregation>` | Required | The aggregation function to apply to each time bucket. Only a single aggregation function is supported. Available functions: all aggregation functions supported by the [stats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/stats/) command, as well as the timechart-specific aggregation functions described below. |
+| `by` | Optional | Groups the results by the specified field in addition to time intervals. If not specified, the aggregation is performed across all documents in each time interval. |
+
+## Notes
+
+The following considerations apply when using the `timechart` command:
+
+* The `timechart` command requires a timestamp field in the data. By default, it uses the `@timestamp` field, but you can specify a different field using the `timefield` parameter.
+* Results are returned in an unpivoted format with separate rows for each time-field combination that has data.
+* Only combinations with actual data are included in the results---empty combinations are omitted rather than showing null or zero values.
+* The top N values for the `limit` parameter are selected based on the sum of values across all time intervals for each distinct field value.
+* When using the `limit` parameter, values beyond the limit are grouped into an `OTHER` category (unless `useother=false`).
+* Documents with null values in the `by` field are treated as a separate category and appear as null in the results.
+
+### Time units
+
+The following time units are available for the `span` parameter:
+
+* Milliseconds (`ms`)
+* Seconds (`s`)
+* Minutes (`m`, case sensitive)
+* Hours (`h`)
+* Days (`d`)
+* Weeks (`w`)
+* Months (`M`, case sensitive)
+* Quarters (`q`)
+* Years (`y`)
+
+## Timechart-specific aggregation functions
+
+The `timechart` command provides specialized rate-based aggregation functions that calculate values per unit of time.
+
+### per_second
+
+**Usage**: `per_second(field)` calculates the per-second rate for a numeric field within each time bucket.
+
+**Calculation formula**: `per_second(field) = sum(field) / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+**Return type**: DOUBLE
+
+### per_minute
+
+**Usage**: `per_minute(field)` calculates the per-minute rate for a numeric field within each time bucket.
+
+**Calculation formula**: `per_minute(field) = sum(field) * 60 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+**Return type**: DOUBLE
+
+### per_hour
+
+**Usage**: `per_hour(field)` calculates the per-hour rate for a numeric field within each time bucket.
+
+**Calculation formula**: `per_hour(field) = sum(field) * 3600 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+**Return type**: DOUBLE
+
+### per_day
+
+**Usage**: `per_day(field)` calculates the per-day rate for a numeric field within each time bucket.
+
+**Calculation formula**: `per_day(field) = sum(field) * 86400 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
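+
+As a rough worked example using the numbers from Example 11 below: a 30-minute bucket (`span=30m`, or 1,800 seconds) in which `sum(packets)` is `180` yields `per_second = 180 / 1800 = 0.1` and `per_day = 180 * 86400 / 1800 = 8640.0`.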
+
+**Return type**: DOUBLE
+
+## Example 1: Count events by hour
+
+The following query counts events in each hourly interval and groups the results by `host`:
+
+```sql
+source=events
+| timechart span=1h count() by host
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | host | count() |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | server1 | 4 |
+| 2023-01-01 10:00:00 | server2 | 4 |
+
+
+## Example 2: Count events by minute
+
+The following query counts events in each one-minute interval and groups the results by `host`:
+
+```sql
+source=events
+| timechart span=1m count() by host
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | host | count() |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | server1 | 1 |
+| 2023-01-01 10:05:00 | server2 | 1 |
+| 2023-01-01 10:10:00 | server1 | 1 |
+| 2023-01-01 10:15:00 | server2 | 1 |
+| 2023-01-01 10:20:00 | server1 | 1 |
+| 2023-01-01 10:25:00 | server2 | 1 |
+| 2023-01-01 10:30:00 | server1 | 1 |
+| 2023-01-01 10:35:00 | server2 | 1 |
+
+
+## Example 3: Calculate the average number of packets by minute
+
+The following query calculates the average number of packets per minute without grouping by any additional field:
+
+```sql
+source=events
+| timechart span=1m avg(packets)
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | avg(packets) |
+| --- | --- |
+| 2023-01-01 10:00:00 | 60.0 |
+| 2023-01-01 10:05:00 | 30.0 |
+| 2023-01-01 10:10:00 | 60.0 |
+| 2023-01-01 10:15:00 | 30.0 |
+| 2023-01-01 10:20:00 | 60.0 |
+| 2023-01-01 10:25:00 | 30.0 |
+| 2023-01-01 10:30:00 | 180.0 |
+| 2023-01-01 10:35:00 | 90.0 |
+
+
+## Example 4: Calculate the average number of packets every 20 minutes by status
+
+The following query calculates the average number of packets in each 20-minute interval and groups the results by `status`:
+
+```sql
+source=events
+| timechart span=20m avg(packets) by status
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | status | avg(packets) |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | active | 30.0 |
+| 2023-01-01 10:00:00 | inactive | 30.0 |
+| 2023-01-01 10:00:00 | pending | 60.0 |
+| 2023-01-01 10:00:00 | processing | 60.0 |
+| 2023-01-01 10:20:00 | cancelled | 180.0 |
+| 2023-01-01 10:20:00 | completed | 60.0 |
+| 2023-01-01 10:20:00 | inactive | 90.0 |
+| 2023-01-01 10:20:00 | pending | 30.0 |
+
+
+## Example 5: Count events by hour and category
+
+The following query counts events in each hourly interval and groups the results by `category`:
+
+```sql
+source=events
+| timechart span=1h count() by category
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | category | count() |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | orders | 4 |
+| 2023-01-01 10:00:00 | users | 4 |
+
+## Example 6: Using the limit parameter with count()
+
+This example uses the `events` dataset with fewer hosts for simplicity.
+
+When there are many distinct values in the `by` field, the `timechart` command displays only the top values according to the `limit` parameter and groups the remaining values into an `OTHER` category. 
+ +The following query displays the top `2` hosts with the highest event counts and groups all remaining hosts into an `OTHER` category: + +```sql +source=events +| timechart span=1m limit=2 count() by host +``` + +{% include copy.html %} + +The query returns the following results: + +| @timestamp | host | count() | +| --- | --- | --- | +| 2023-01-01 10:00:00 | server1 | 1 | +| 2023-01-01 10:05:00 | server2 | 1 | +| 2023-01-01 10:10:00 | server1 | 1 | +| 2023-01-01 10:15:00 | server2 | 1 | +| 2023-01-01 10:20:00 | server1 | 1 | +| 2023-01-01 10:25:00 | server2 | 1 | +| 2023-01-01 10:30:00 | server1 | 1 | +| 2023-01-01 10:35:00 | server2 | 1 | + + +## Example 7: Use limit=0 with count() to show all values + +This example uses the `events_many_hosts` dataset, which contains 11 distinct hosts. + +To display all distinct values without applying any limit, set `limit=0`: + +```sql +source=events_many_hosts +| timechart span=1h limit=0 count() by host +``` +{% include copy.html %} + +All 11 hosts are returned as separate rows without an `OTHER` category: + +| @timestamp | host | count() | +| --- | --- | --- | +| 2024-07-01 00:00:00 | web-01 | 1 | +| 2024-07-01 00:00:00 | web-02 | 1 | +| 2024-07-01 00:00:00 | web-03 | 1 | +| 2024-07-01 00:00:00 | web-04 | 1 | +| 2024-07-01 00:00:00 | web-05 | 1 | +| 2024-07-01 00:00:00 | web-06 | 1 | +| 2024-07-01 00:00:00 | web-07 | 1 | +| 2024-07-01 00:00:00 | web-08 | 1 | +| 2024-07-01 00:00:00 | web-09 | 1 | +| 2024-07-01 00:00:00 | web-10 | 1 | +| 2024-07-01 00:00:00 | web-11 | 1 | + +## Example 8: Use useother=false with count() function + +The following query limits the results to the top 10 hosts without creating an `OTHER` category by setting `useother=false`: + +```sql +source=events_many_hosts +| timechart span=1h useother=false count() by host +``` +{% include copy.html %} + +The query returns the following results: + +| @timestamp | host | count() | +| --- | --- | --- | +| 2024-07-01 00:00:00 | web-01 | 1 | +| 2024-07-01 00:00:00 | web-02 | 1 | +| 2024-07-01 00:00:00 | web-03 | 1 | +| 2024-07-01 00:00:00 | web-04 | 1 | +| 2024-07-01 00:00:00 | web-05 | 1 | +| 2024-07-01 00:00:00 | web-06 | 1 | +| 2024-07-01 00:00:00 | web-07 | 1 | +| 2024-07-01 00:00:00 | web-08 | 1 | +| 2024-07-01 00:00:00 | web-09 | 1 | +| 2024-07-01 00:00:00 | web-10 | 1 | + + +## Example 9: Use limit with useother parameter and avg() function + +The following query displays the top 3 hosts based on average `cpu_usage` per hour. 
All remaining hosts are grouped into an `OTHER` category (by default, `useother=true`):
+
+```sql
+source=events_many_hosts
+| timechart span=1h limit=3 avg(cpu_usage) by host
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | host | avg(cpu_usage) |
+| --- | --- | --- |
+| 2024-07-01 00:00:00 | OTHER | 41.3 |
+| 2024-07-01 00:00:00 | web-03 | 55.3 |
+| 2024-07-01 00:00:00 | web-07 | 48.6 |
+| 2024-07-01 00:00:00 | web-09 | 67.8 |
+
+The following query displays the top 3 hosts based on average `cpu_usage` per hour, without creating an `OTHER` category by setting `useother=false`:
+
+```sql
+source=events_many_hosts
+| timechart span=1h limit=3 useother=false avg(cpu_usage) by host
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | host | avg(cpu_usage) |
+| --- | --- | --- |
+| 2024-07-01 00:00:00 | web-03 | 55.3 |
+| 2024-07-01 00:00:00 | web-07 | 48.6 |
+| 2024-07-01 00:00:00 | web-09 | 67.8 |
+
+
+## Example 10: Handling null values in the `by` field
+
+The following query demonstrates how null values in the `by` field are treated as a separate category:
+
+```sql
+source=events_null
+| timechart span=1h count() by host
+```
+{% include copy.html %}
+
+The `events_null` dataset contains one entry without a `host` value. Because the default settings are `usenull=true` and `nullstr="NULL"`, this entry is grouped into a separate `NULL` category:
+
+| @timestamp | host | count() |
+| --- | --- | --- |
+| 2024-07-01 00:00:00 | NULL | 1 |
+| 2024-07-01 00:00:00 | db-01 | 1 |
+| 2024-07-01 00:00:00 | web-01 | 2 |
+| 2024-07-01 00:00:00 | web-02 | 2 |
+
+
+## Example 11: Calculate the per-second packet rate
+
+The following query calculates the per-second packet rate for network traffic data using the `per_second()` function:
+
+```sql
+source=events
+| timechart span=30m per_second(packets) by host
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| @timestamp | host | per_second(packets) |
+| --- | --- | --- |
+| 2023-01-01 10:00:00 | server1 | 0.1 |
+| 2023-01-01 10:00:00 | server2 | 0.05 |
+| 2023-01-01 10:30:00 | server1 | 0.1 |
+| 2023-01-01 10:30:00 | server2 | 0.05 |
+
+
+## Limitations
+
+The `timechart` command has the following limitations:
+
+* Only a single aggregation function is supported per `timechart` command.
+* The `bins` parameter and other `bin` options are not supported. To control the time intervals, use the `span` parameter.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/top.md b/_sql-and-ppl/ppl/commands/top.md
new file mode 100644
index 00000000000..8d1110088ad
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/top.md
@@ -0,0 +1,156 @@
+---
+layout: default
+title: top
+parent: Commands
+grand_parent: PPL
+nav_order: 44
+---
+
+# top {#top-command}
+
+The `top` command finds the most common combination of values across all fields specified in the field list.
+
+The `top` command is not rewritten to [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/). It is only executed on the coordinating node.
+{: .note}
+
+## Syntax
+
+The `top` command has the following syntax:
+
+```sql
+top [N] [top-options] <field-list> [by-clause]
+```
+
+## Parameters
+
+The `top` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<N>` | Optional | The number of results to return. Default is `10`. |
+| `top-options` | Optional | `showcount`: Whether to create a field in the output that represents a count of the tuple of values. Default is `true`.<br>`countfield`: The name of the field that contains the count. Default is `count`.<br>`usenull`: Whether to output `null` values. Default is the value of `plugins.ppl.syntax.legacy.preferred`. |
+| `<field-list>` | Required | A comma-delimited list of field names. |
+| `<by-clause>` | Optional | One or more fields to group the results by. |
+
+## Example 1: Display counts in the default count column
+
+The following query finds the most common gender values:
+
+```sql
+source=accounts
+| top gender
+```
+{% include copy.html %}
+
+By default, the `top` command automatically includes a `count` column showing the frequency of each value:
+
+| gender | count |
+| --- | --- |
+| M | 3 |
+| F | 1 |
+
+
+## Example 2: Find the most common values without count display
+
+The following query uses `showcount=false` to hide the `count` column in the results:
+
+```sql
+source=accounts
+| top showcount=false gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| gender |
+| --- |
+| M |
+| F |
+
+## Example 3: Rename the count column
+
+The following query uses the `countfield` parameter to specify a custom name (`cnt`) for the count column instead of the default `count`:
+
+```sql
+source=accounts
+| top countfield='cnt' gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| gender | cnt |
+| --- | --- |
+| M | 3 |
+| F | 1 |
+
+## Example 4: Limit the number of returned results
+
+The following query returns the top 1 most common gender value:
+
+```sql
+source=accounts
+| top 1 showcount=false gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| gender |
+| --- |
+| M |
+
+
+## Example 5: Group the results
+
+The following query uses the `by` clause to find the most common age within each gender group and show it separately for each gender:
+
+```sql
+source=accounts
+| top 1 showcount=false age by gender
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| gender | age |
+| --- | --- |
+| F | 28 |
+| M | 32 |
+
+## Example 6: Specify null value handling
+
+The following query specifies `usenull=false` to exclude null values:
+
+```sql
+source=accounts
+| top usenull=false email
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | count |
+| --- | --- |
+| amberduke@pyrami.com | 1 |
+| daleadams@boink.com | 1 |
+| hattiebond@netagy.com | 1 |
+
+The following query specifies `usenull=true` to include null values in the results:
+
+```sql
+source=accounts
+| top usenull=true email
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| email | count |
+| --- | --- |
+| null | 1 |
+| amberduke@pyrami.com | 1 |
+| daleadams@boink.com | 1 |
+| hattiebond@netagy.com | 1 |
+
+
diff --git a/_sql-and-ppl/ppl/commands/trendline.md b/_sql-and-ppl/ppl/commands/trendline.md
new file mode 100644
index 00000000000..bcc832f0943
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/trendline.md
@@ -0,0 +1,122 @@
+---
+layout: default
+title: trendline
+parent: Commands
+grand_parent: PPL
+nav_order: 45
+---
+
+# trendline
+
+The `trendline` command calculates moving averages of fields.
+
+## Syntax
+
+The `trendline` command has the following syntax:
+
+```sql
+trendline [sort [+|-] <sort-field>] (sma | wma)(<number-of-datapoints>, <field>) [as <alias>] [(sma | wma)(<number-of-datapoints>, <field>) [as <alias>]]...
+```
+
+## Parameters
+
+The `trendline` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `[+|-]` | Optional | The sort order for the data. `+` specifies ascending order with `NULL`/`MISSING` first, `-` specifies descending order with `NULL`/`MISSING` last. Default is `+`. |
+| `<sort-field>` | Required | The field used to sort the data. |
+| `(sma | wma)` | Required | The type of moving average to calculate. `sma` calculates a simple moving average with equal weighting for all values, while `wma` calculates a weighted moving average that gives more weight to recent values. |
+| `<number-of-datapoints>` | Required | The number of datapoints used to calculate the moving average. Must be greater than zero. |
+| `<field>` | Required | The field for which the moving average is calculated. |
+| `<alias>` | Optional | The name of the resulting column containing the moving average. Default is the `<field>` name with `_trendline` appended. |
+
+## Example 1: Calculate the simple moving average for one field
+
+The following query calculates the simple moving average for one field:
+
+```sql
+source=accounts
+| trendline sma(2, account_number) as an
+| fields an
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| an |
+| --- |
+| null |
+| 3.5 |
+| 9.5 |
+| 15.5 |
+
+
+## Example 2: Calculate the simple moving average for multiple fields
+
+The following query calculates the simple moving average for multiple fields:
+
+```sql
+source=accounts
+| trendline sma(2, account_number) as an sma(2, age) as age_trend
+| fields an, age_trend
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| an | age_trend |
+| --- | --- |
+| null | null |
+| 3.5 | 34.0 |
+| 9.5 | 32.0 |
+| 15.5 | 30.5 |
+
+
+## Example 3: Calculate the simple moving average for one field without specifying an alias
+
+The following query calculates the simple moving average for one field without specifying an alias:
+
+```sql
+source=accounts
+| trendline sma(2, account_number)
+| fields account_number_trendline
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number_trendline |
+| --- |
+| null |
+| 3.5 |
+| 9.5 |
+| 15.5 |
+
+
+## Example 4: Calculate the weighted moving average for one field
+
+The following query calculates the weighted moving average for one field:
+
+```sql
+source=accounts
+| trendline wma(2, account_number)
+| fields account_number_trendline
+```
+{% include copy.html %}
+
+The query returns the following results:
+
+| account_number_trendline |
+| --- |
+| null |
+| 4.333333333333333 |
+| 10.666666666666666 |
+| 16.333333333333332 |
+
+
+## Limitations
+
+The `trendline` command has the following limitations:
+
+* The `trendline` command requires all values in the specified `<field>` parameter to be non-null. Any rows with `null` values in this field are automatically excluded from the command's output.
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/where.md b/_sql-and-ppl/ppl/commands/where.md
new file mode 100644
index 00000000000..9bfb94f1d29
--- /dev/null
+++ b/_sql-and-ppl/ppl/commands/where.md
@@ -0,0 +1,199 @@
+---
+layout: default
+title: where
+parent: Commands
+grand_parent: PPL
+nav_order: 46
+---
+
+# where
+
+The `where` command filters the search results. It only returns results that match the specified conditions.
+
+## Syntax
+
+The `where` command has the following syntax:
+
+```sql
+where <condition>
+```
+
+## Parameters
+
+The `where` command supports the following parameters.
+
+| Parameter | Required/Optional | Description |
+| --- | --- | --- |
+| `<condition>` | Required | The condition used to filter the results. Only rows where this condition evaluates to `true` are returned.
| + +## Example 1: Filter by numeric values + +The following query returns accounts in which `balance` is greater than `30000`: + +```sql +source=accounts +| where balance > 30000 +| fields account_number, balance +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | balance | +| --- | --- | +| 1 | 39225 | +| 13 | 32838 | + + +## Example 2: Filter using combined criteria + +The following query combines multiple conditions using an `AND` operator: + +```sql +source=accounts +| where age > 30 AND gender = 'M' +| fields account_number, age, gender +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | age | gender | +| --- | --- | --- | +| 1 | 32 | M | +| 6 | 36 | M | +| 18 | 33 | M | + + +## Example 3: Filter with multiple possible values + +The following query fetches all the documents from the `accounts` index in which `account_number` is `1` or `gender` is `F`: + +```sql +source=accounts +| where account_number=1 or gender="F" +| fields account_number, gender +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | gender | +| --- | --- | +| 1 | M | +| 13 | F | + + +## Example 4: Filter by text patterns + +The `LIKE` operator enables pattern matching on string fields using wildcards. + +### Matching a single character + +The following query uses an underscore (`_`) to match a single character: + +```sql +source=accounts +| where LIKE(state, 'M_') +| fields account_number, state +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | state | +| --- | --- | +| 18 | MD | + +### Matching multiple characters + +The following query uses a percent sign (`%`) to match multiple characters: + +```sql +source=accounts +| where LIKE(state, 'V%') +| fields account_number, state +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | state | +| --- | --- | +| 13 | VA | + +## Example 5: Filter by excluding specific values + +The following query uses a `NOT` operator to exclude matching records: + +```sql +source=accounts +| where NOT state = 'CA' +| fields account_number, state +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | state | +| --- | --- | +| 1 | IL | +| 6 | TN | +| 13 | VA | +| 18 | MD | + + +## Example 6: Filter using value lists + +The following query uses an `IN` operator to match multiple values: + +```sql +source=accounts +| where state IN ('IL', 'VA') +| fields account_number, state +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | state | +| --- | --- | +| 1 | IL | +| 13 | VA | + + +## Example 7: Filter records with missing data + +The following query returns records in which the `employer` field is `null`: + +```sql +source=accounts +| where ISNULL(employer) +| fields account_number, employer +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | employer | +| --- | --- | +| 18 | null | + + +## Example 8: Filter using grouped conditions + +The following query combines multiple conditions using parentheses and logical operators: + +```sql +source=accounts +| where (balance > 40000 OR age > 35) AND gender = 'M' +| fields account_number, balance, age, gender +``` +{% include copy.html %} + +The query returns the following results: + +| account_number | balance | age | gender | +| --- | --- | --- | --- | +| 6 | 5686 | 36 | M | + \ No newline at end of file 
From 2413fdf366fa870257d38d7392d2f2d2b1952ea6 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Tue, 23 Dec 2025 15:27:37 -0500 Subject: [PATCH 3/9] Fix links Signed-off-by: Fanit Kolchina --- _sql-and-ppl/ppl/commands/rex.md | 2 +- _sql-and-ppl/ppl/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/_sql-and-ppl/ppl/commands/rex.md b/_sql-and-ppl/ppl/commands/rex.md index 701e8c4390a..93661864010 100644 --- a/_sql-and-ppl/ppl/commands/rex.md +++ b/_sql-and-ppl/ppl/commands/rex.md @@ -46,7 +46,7 @@ The `rex` command supports the following parameters.

-You can set the `max_match` limit in the `plugins.ppl.rex.max_match.limit` cluster setting. For more information, see [SQL settings]({{site.url}}{{site.baseurl}}/sql-and-ppl/sql/settings/). Setting this limit to a large value is not recommended because it can lead to excessive memory consumption, especially with patterns that match empty strings (for example, `\d*` or `\w*`). +You can set the `max_match` limit in the `plugins.ppl.rex.max_match.limit` cluster setting. For more information, see [SQL settings]({{site.url}}{{site.baseurl}}/sql-and-ppl/settings/). Setting this limit to a large value is not recommended because it can lead to excessive memory consumption, especially with patterns that match empty strings (for example, `\d*` or `\w*`). {: .note} ## Example 1: Basic text extraction diff --git a/_sql-and-ppl/ppl/index.md b/_sql-and-ppl/ppl/index.md index ad794a3eced..1daf1105da1 100644 --- a/_sql-and-ppl/ppl/index.md +++ b/_sql-and-ppl/ppl/index.md @@ -30,7 +30,7 @@ search source= | | | ... | {% include copy.html %} -See [Syntax]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/syntax/) for specific PPL syntax examples. +See [Syntax]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/syntax/) for specific PPL syntax examples. ## PPL commands From 4ddd4454b89432e992059b771547196bdb6de472 Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Wed, 24 Dec 2025 13:06:08 -0500 Subject: [PATCH 4/9] Apply suggestions from code review Signed-off-by: Nathan Bower --- _dashboards/management/S3-data-source.md | 2 +- _sql-and-ppl/ppl/commands/ad.md | 14 +++++++------- _sql-and-ppl/ppl/commands/addcoltotals.md | 2 +- _sql-and-ppl/ppl/commands/addtotals.md | 2 +- _sql-and-ppl/ppl/commands/append.md | 4 ++-- _sql-and-ppl/ppl/commands/appendpipe.md | 4 ++-- _sql-and-ppl/ppl/commands/bin.md | 10 +++++----- _sql-and-ppl/ppl/commands/chart.md | 10 +++++----- _sql-and-ppl/ppl/commands/describe.md | 2 +- 9 files changed, 25 insertions(+), 25 deletions(-) diff --git a/_dashboards/management/S3-data-source.md b/_dashboards/management/S3-data-source.md index cd2eaced383..97092882854 100644 --- a/_dashboards/management/S3-data-source.md +++ b/_dashboards/management/S3-data-source.md @@ -46,5 +46,5 @@ This feature is currently under development, including the data integration func - Learn about [querying your data in Data Explorer]({{site.url}}{{site.baseurl}}/dashboards/management/query-data-source/) through OpenSearch Dashboards. - Learn about [optimizing the query performance of your external data sources]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/), such as Amazon S3, through Query Workbench. -- Learn about [Amazon S3 and AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.md) and the APIS used with Amazon S3 data sources, including configuration settings and query examples. +- Learn about [Amazon S3 and AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.md) and the APIs used with Amazon S3 data sources, including configuration settings and query examples. - Learn about [managing your indexes]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index/) through OpenSearch Dashboards. 
diff --git a/_sql-and-ppl/ppl/commands/ad.md b/_sql-and-ppl/ppl/commands/ad.md index 95452cdd6f5..0c618738659 100644 --- a/_sql-and-ppl/ppl/commands/ad.md +++ b/_sql-and-ppl/ppl/commands/ad.md @@ -11,23 +11,23 @@ nav_order: 1 The `ad` command is deprecated in favor of the [`ml` command]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/ml/). {: .warning} -The `ad` command applies Random Cut Forest (RCF) algorithm in the ML Commons plugin to the search results returned by a PPL command. The command provides two anomaly detection approaches: +The `ad` command applies the Random Cut Forest (RCF) algorithm in the ML Commons plugin to the search results returned by a PPL command. The command provides two anomaly detection approaches: -- [Anomaly detection for time-series data](#anomaly-detection-for-time-series-data) using the fixed in time RCF algorithm -- [Anomaly detection for non-time-series data](#anomaly-detection-for-non-time-series-data) using the batch RCF algorithm. +- [Anomaly detection for time-series data](#anomaly-detection-for-time-series-data) using the fixed-in-time RCF algorithm +- [Anomaly detection for non-time-series data](#anomaly-detection-for-non-time-series-data) using the batch RCF algorithm To use the `ad` command, `plugins.calcite.enabled` must be set to `false`. {: .note} ## Syntax -The `ad` command has two different syntax variants depending on the algorithm type. +The `ad` command has two different syntax variants, depending on the algorithm type. ### Anomaly detection for time-series data -Use this syntax to detect anomalies in time-series data. This method uses the fixed in time RCF algorithm, which is optimized for sequential data patterns. +Use this syntax to detect anomalies in time-series data. This method uses the fixed-in-time RCF algorithm, which is optimized for sequential data patterns. -The fixed in time RCF `ad` command has the following syntax: +The fixed-in-time RCF `ad` command has the following syntax: ```sql ad [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [anomaly_rate] [date_format] [time_zone] [category_field] @@ -35,7 +35,7 @@ ad [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [a ### Parameters -The fixed in time RCF algorithm supports the following parameters. +The fixed-in-time RCF algorithm supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | diff --git a/_sql-and-ppl/ppl/commands/addcoltotals.md b/_sql-and-ppl/ppl/commands/addcoltotals.md index 57fed4cb2b7..ce07f250ea2 100644 --- a/_sql-and-ppl/ppl/commands/addcoltotals.md +++ b/_sql-and-ppl/ppl/commands/addcoltotals.md @@ -28,7 +28,7 @@ The `addcoltotals` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `` | Optional | A comma-separated list of numeric fields to add. By default, all numeric fields are added. | -| `labelfield` | Optional | The field in which the label is placed. If the field does not exist, it is created, and the label is shown in the summary row (last row) of the new field. | +| `labelfield` | Optional | The field in which the label is placed. If the field does not exist, it is created and the label is shown in the summary row (last row) of the new field. | | `label` | Optional | The text that appears in the summary row (last row) to identify the computed totals. When used with `labelfield`, this text is placed in the specified field in the summary row. Default is `Total`. 
| ### Example 1: Basic example diff --git a/_sql-and-ppl/ppl/commands/addtotals.md b/_sql-and-ppl/ppl/commands/addtotals.md index e841ca4d8c8..3e0e75393b3 100644 --- a/_sql-and-ppl/ppl/commands/addtotals.md +++ b/_sql-and-ppl/ppl/commands/addtotals.md @@ -30,7 +30,7 @@ The `addtotals` command supports the following parameters. | `` | Optional | A comma-separated list of numeric fields to add. By default, all numeric fields are added. | | `row` | Optional | Calculates the total of each row and adds a new field to store the row total. Default is `true`. | | `col` | Optional | Calculates the total of each column and adds a summary event at the end with the column totals. Default is `false`. | -| `labelfield` | Optional | The field in which the label is placed. If the field does not exist, it is created, and the label is shown in the summary row (last row) of the new field. Applicable when `col=true`. | +| `labelfield` | Optional | The field in which the label is placed. If the field does not exist, it is created and the label is shown in the summary row (last row) of the new field. Applicable when `col=true`. | | `label` | Optional | The text that appears in the summary row (last row) to identify the computed totals. When used with `labelfield`, this text is placed in the specified field in the summary row. Default is `Total`. Applicable when `col=true`. This parameter has no effect when the `labelfield` and `fieldname` parameters specify the same field name. | | `fieldname` | Optional | The field used to store row totals. Applicable when `row=true`. | diff --git a/_sql-and-ppl/ppl/commands/append.md b/_sql-and-ppl/ppl/commands/append.md index 20c39f63a62..4bcedfa7215 100644 --- a/_sql-and-ppl/ppl/commands/append.md +++ b/_sql-and-ppl/ppl/commands/append.md @@ -10,7 +10,7 @@ nav_order: 4 The `append` command appends the results of a subsearch as additional rows to the end of the input search results (the main search). -The command aligns columns that have the same field names and types. For columns that exist in only the main search or only the subsearch, `NULL` values are inserted in the missing fields for the respective rows. +The command aligns columns that have the same field names and types. For columns that exist in only the main search or subsearch, `NULL` values are inserted into the missing fields for the respective rows. ## Syntax @@ -73,4 +73,4 @@ The query returns the following results: The `append` command has the following limitations: -* **Schema compatibility**: When fields with the same name exist in both the main search and the subsearch but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name share the same data type. Alternatively, use different field names. You can rename the conflicting fields using `eval`, or select non-conflicting columns using `fields`. \ No newline at end of file +* **Schema compatibility**: When fields with the same name exist in both the main search and the subsearch but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name share the same data type. Alternatively, use different field names. You can rename the conflicting fields using `eval` or select non-conflicting columns using `fields`. 
\ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/appendpipe.md b/_sql-and-ppl/ppl/commands/appendpipe.md index 77b1259847c..444bc828ff2 100644 --- a/_sql-and-ppl/ppl/commands/appendpipe.md +++ b/_sql-and-ppl/ppl/commands/appendpipe.md @@ -10,7 +10,7 @@ nav_order: 6 The `appendpipe` command appends the results of a subpipeline to the search results. Unlike a subsearch, the subpipeline is not executed first; it runs only when the search reaches the `appendpipe` command. -The command aligns columns that have the same field names and types. For columns that exist in only the main search or only the subpipeline, `NULL` values are inserted in the missing fields for the respective rows. +The command aligns columns that have the same field names and types. For columns that exist in only the main search or subpipeline, `NULL` values are inserted into the missing fields for the respective rows. ## Syntax @@ -83,4 +83,4 @@ The query returns the following results: The `appendpipe` command has the following limitations: -* **Schema compatibility**: When fields with the same name exist in both the main search and the subpipeline but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name share the same data type. Alternatively, use different field names. You can rename the conflicting fields using `eval`, or select non-conflicting columns using `fields`. +* **Schema compatibility**: When fields with the same name exist in both the main search and the subpipeline but have incompatible types, the query fails with an error. To avoid type conflicts, ensure that fields with the same name share the same data type. Alternatively, use different field names. You can rename the conflicting fields using `eval` or select non-conflicting columns using `fields`. diff --git a/_sql-and-ppl/ppl/commands/bin.md b/_sql-and-ppl/ppl/commands/bin.md index 7737b4c5f52..d70dab06685 100644 --- a/_sql-and-ppl/ppl/commands/bin.md +++ b/_sql-and-ppl/ppl/commands/bin.md @@ -25,9 +25,9 @@ The `bin` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `` | Required | The field to group into buckets. Accepts numeric or time-based fields. | -| `span` | Optional | The interval size for each bin. Cannot be used with `bins` or `minspan` parameters. Supports numeric, logarithmic (`log10`, `2log10`), and time intervals. See [Time units](#time-units).| -| `minspan` | Optional | The minimum interval size for automatic span calculation. Cannot be used with `span` or `bins` parameters. | -| `bins` | Optional | The maximum number of equal-width bins to create. Must be between `2` and `50000` (inclusive). Cannot be used with `span` or `minspan` parameters. See [The bins parameter for timestamp fields](#the-bins-parameter-for-timestamp-fields).| +| `span` | Optional | The interval size for each bin. Cannot be used with the `bins` or `minspan` parameters. Supports numeric, logarithmic (`log10`, `2log10`), and time intervals. See [Time units](#time-units).| +| `minspan` | Optional | The minimum interval size for automatic span calculation. Cannot be used with the `span` or `bins` parameters. | +| `bins` | Optional | The maximum number of equal-width bins to create. Must be between `2` and `50000` (inclusive). Cannot be used with the `span` or `minspan` parameters. 
See [The bins parameter for timestamp fields](#the-bins-parameter-for-timestamp-fields).| | `aligntime` | Optional | Align the bin times for time-based fields. Valid only for time-based discretization. Valid values are `earliest`, `latest`, or a specific time. See [Align options](#align-time-options).| | `start` | Optional | The starting value of the interval range. Default is the minimum value of the field. | | `end` | Optional | The ending value of the interval range. Default is the maximum value of the field. | @@ -37,7 +37,7 @@ The `bin` command supports the following parameters. The `bins` parameter for timestamp fields has the following requirements: - **Pushdown must be enabled**: Enable pushdown by setting `plugins.calcite.pushdown.enabled` to `true` (enabled by default). If pushdown is disabled, use the `span` parameter instead (for example, `bin @timestamp span=5m`). -- **Timestamp field must be used as an aggregation bucket**: The binned timestamp field must be included in a `stats` aggregation (for example, `source=events | bin @timestamp bins=3 | stats count() by @timestamp`). Using `bins` on timestamp fields outside of aggregation buckets is not supported. +- **The timestamp field must be used as an aggregation bucket**: The binned timestamp field must be included in a `stats` aggregation (for example, `source=events | bin @timestamp bins=3 | stats count() by @timestamp`). Using `bins` on timestamp fields outside of aggregation buckets is not supported. ### Time units @@ -64,7 +64,7 @@ The following options are available for the `aligntime` parameter: ### Parameter behavior -When multiple parameters are specified, priority order is: `span` > `minspan` > `bins` > `start`/`end` > default. +When multiple parameters are specified, the priority order is: `span` > `minspan` > `bins` > `start`/`end` > default. ### Special parameter types diff --git a/_sql-and-ppl/ppl/commands/chart.md b/_sql-and-ppl/ppl/commands/chart.md index df5c16bb04a..417b8e989c3 100644 --- a/_sql-and-ppl/ppl/commands/chart.md +++ b/_sql-and-ppl/ppl/commands/chart.md @@ -27,7 +27,7 @@ The `chart` command supports the following parameters. | `` | Required | The aggregation function to apply to the data. Only a single aggregation function is supported. Available functions are the aggregation functions supported by the [`stats`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/stats/) command. | N/A | | `` | Optional | Groups the results by either one field (row split) or two fields (row split and column split). The parameters `limit`, `useother`, and `usenull` apply to the column split. Results are returned as individual rows for each combination. | Aggregate across all documents | | `over [] by []` | Optional | Alternative syntax for grouping by multiple fields. `over by ` groups the results by both fields. Using `over` alone on one field is equivalent to `by `. | N/A | -| `limit` | Optional | The number of categories to display when using column split. `limit=N` or `limit=topN` returns the top N categories. `limit=bottomN` returns the bottom N categories. When the limit is exceeded, remaining categories are grouped into an `OTHER` category (unless `useother=false`). Set to `0` to show all categories without limit. The ranking is based on the sum of aggregated values for each column category. For example, `limit=top3` keeps the three categories with the highest total values. Only applies when grouping by two fields. 
| `top10` | +| `limit` | Optional | The number of categories to display when using column split. `limit=N` or `limit=topN` returns the top N categories. `limit=bottomN` returns the bottom N categories. When the limit is exceeded, remaining categories are grouped into an `OTHER` category (unless `useother=false`). Set to `0` to show all categories without a limit. The ranking is based on the sum of aggregated values for each column category. For example, `limit=top3` keeps the three categories with the highest total values. Only applies when grouping by two fields. | `top10` | | `useother` | Optional | Controls whether to create an `OTHER` category for categories beyond the `limit`. When set to `false`, only the top or bottom N categories (based on `limit`) are shown without an `OTHER` category. When set to `true`, categories beyond the `limit` are grouped into an `OTHER` category. This parameter only applies when using column split and when there are more categories than the `limit`. | `true` | | `usenull` | Optional | Controls whether to group documents that have null values in the column split field into a separate `NULL` category. This parameter only applies to column split. Documents with null values in the row split field are ignored; only documents with non-null values in the row split field are included in the results. When `usenull=false`, documents with null values in the column split field are excluded from the results. When `usenull=true`, documents with null values in the column split field are grouped into a separate `NULL` category. | `true` | | `nullstr` | Optional | Specifies the category name for documents that have null values in the column split field. This parameter only applies when `usenull` is `true`. | `"NULL"` | @@ -59,7 +59,7 @@ The query returns the following results: | 20482.25 | -## Example 2: Group by single field +## Example 2: Group by a single field This example calculates the count of accounts grouped by gender: @@ -79,7 +79,7 @@ The query returns the following results: ## Example 3: Using over [] by [] to group by multiple fields -The following query calculates average balance grouped by both `gender` and `age` fields: +The following query calculates average balance grouped by both the `gender` and `age` fields: ```sql source=accounts @@ -118,7 +118,7 @@ The query returns the following results. The `age` column in the result is conve ## Example 5: Using limit with other parameters -The following query uses the `chart` command with `limit`, `useother`, and custom `otherstr` parameters: +The following query uses the `chart` command with the `limit`, `useother`, and custom `otherstr` parameters: ```sql source=accounts @@ -138,7 +138,7 @@ The query returns the following results: ## Example 6: Using null parameters -The following query uses the `chart` command with `limit`, `usenull`, and custom `nullstr` parameters: +The following query uses the `chart` command with the `limit`, `usenull`, and custom `nullstr` parameters: ```sql source=accounts diff --git a/_sql-and-ppl/ppl/commands/describe.md b/_sql-and-ppl/ppl/commands/describe.md index ceeb8734952..9adcaf42c5b 100644 --- a/_sql-and-ppl/ppl/commands/describe.md +++ b/_sql-and-ppl/ppl/commands/describe.md @@ -12,7 +12,7 @@ The `describe` command queries index metadata. The `describe` command can only b ## Syntax -The `describe` command has the following syntax. 
The argument to the command is a dot-separated path to the table, consisting of an optional data source, optional schema, and required table name: +The `describe` command has the following syntax. The argument to the command is a dot-separated path to the table consisting of an optional data source, optional schema, and required table name: ```sql describe [.][.] From 45d2107d6306183e85b5743474c3064aff6073cd Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Wed, 24 Dec 2025 13:07:55 -0500 Subject: [PATCH 5/9] Apply suggestions from code review Signed-off-by: Nathan Bower --- _sql-and-ppl/ppl/commands/eventstats.md | 8 ++++---- _sql-and-ppl/ppl/commands/explain.md | 2 +- _sql-and-ppl/ppl/commands/fields.md | 6 +++--- _sql-and-ppl/ppl/commands/fillnull.md | 16 ++++++++-------- _sql-and-ppl/ppl/commands/index.md | 2 +- _sql-and-ppl/ppl/commands/join.md | 12 ++++++------ _sql-and-ppl/ppl/commands/kmeans.md | 6 +++--- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/_sql-and-ppl/ppl/commands/eventstats.md b/_sql-and-ppl/ppl/commands/eventstats.md index 33e81f6c6fb..0120a0b4308 100644 --- a/_sql-and-ppl/ppl/commands/eventstats.md +++ b/_sql-and-ppl/ppl/commands/eventstats.md @@ -16,7 +16,7 @@ The `eventstats` command operates in the following way: 2. The original events remain intact, with new fields added to contain the statistical results. 3. The command is particularly useful for comparative analysis, identifying outliers, and providing additional context to individual events. -## Comparing `stats` and `eventstats` +## Comparing stats and eventstats For a comprehensive comparison of `stats`, `eventstats`, and `streamstats` commands, including their differences in transformation behavior, output format, aggregation scope, and use cases, see [Comparing stats, eventstats, and streamstats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/streamstats/#comparing-stats-eventstats-and-streamstats). @@ -46,9 +46,9 @@ The `eventstats` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `` | Required | An aggregation function or window function. | -| `bucket_nullable` | Optional | Controls whether the `eventstats` command considers `null` buckets as a valid group in group-by aggregations. When set to `false`, it does not treat `null` group by values as a distinct group during aggregation. Default is determined by `plugins.ppl.syntax.legacy.preferred`. | +| `bucket_nullable` | Optional | Controls whether the `eventstats` command considers `null` buckets as a valid group in group-by aggregations. When set to `false`, it does not treat `null` group-by values as a distinct group during aggregation. Default is determined by `plugins.ppl.syntax.legacy.preferred`. | | `` | Optional | Groups results by specified fields or expressions. Syntax: `by [span-expression,] [field,]...` Default is aggregating over the entire search results. | -| `` | Optional | Splits field into buckets by intervals (at most one). Syntax: `span(field_expr, interval_expr)`. For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets. | +| `` | Optional | Splits a field into buckets by intervals (at most one). Syntax: `span(field_expr, interval_expr)`. For example, `span(age, 10)` creates 10-year age buckets, while `span(timestamp, 1h)` creates hourly buckets. 
| ### Time units @@ -83,7 +83,7 @@ The `eventstats` command supports the following aggregation functions: For detailed documentation of each function, see [Functions]({{site.url}}{{site.baseurl}}/sql-and-ppl/functions/#aggregate). -## Example 1: Calculate the average, sum and count of a field by group +## Example 1: Calculate the average, sum, and count of a field by group The following query calculates the average age, sum of age, and count of events for all accounts grouped by gender: diff --git a/_sql-and-ppl/ppl/commands/explain.md b/_sql-and-ppl/ppl/commands/explain.md index b069d060d2a..025121bbb9e 100644 --- a/_sql-and-ppl/ppl/commands/explain.md +++ b/_sql-and-ppl/ppl/commands/explain.md @@ -25,7 +25,7 @@ The `explain` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `` | Required | A PPL query to explain. | -| `` | Optional | The explain mode. Valid values are:
- `standard`: Displays the logical and physical plan along with pushdown information (DSL). Available in both v2 and v3 engines.
- `simple`: Displays the logical plan tree without attributes. Requires v3 engine (`plugins.calcite.enabled` = `true`).
- `cost`: Displays the standard information plus plan cost attributes. Requires v3 engine (`plugins.calcite.enabled` = `true`).
- `extended`: Displays the standard information plus the generated code. Requires v3 engine (`plugins.calcite.enabled` = `true`).

Default is `standard`. | +| `<mode>` | Optional | The explain mode. Valid values are:
- `standard`: Displays the logical and physical plan along with pushdown information (query domain-specific language [DSL]). Available in both v2 and v3 engines.
- `simple`: Displays the logical plan tree without attributes. Requires the v3 engine (`plugins.calcite.enabled` = `true`).
- `cost`: Displays the standard information plus plan cost attributes. Requires the v3 engine (`plugins.calcite.enabled` = `true`).
- `extended`: Displays the standard information plus the generated code. Requires the v3 engine (`plugins.calcite.enabled` = `true`).

Default is `standard`. | ## Example 1: Explain a PPL query in the v2 engine diff --git a/_sql-and-ppl/ppl/commands/fields.md b/_sql-and-ppl/ppl/commands/fields.md index 55e05b052b8..4b797bb7027 100644 --- a/_sql-and-ppl/ppl/commands/fields.md +++ b/_sql-and-ppl/ppl/commands/fields.md @@ -129,9 +129,9 @@ The query returns the following results: | Dale | Adams | -## Example 6: Contains wildcard pattern +## Example 6: Wildcard pattern matching -The following query selects fields containing a pattern using contains wildcards: +The following query selects fields containing a pattern using `contains` wildcards: ```sql source=accounts @@ -225,4 +225,4 @@ The query returns the following results: ## Related documentation -- [`table`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/table/) - An alias command with identical functionality \ No newline at end of file +- [`table`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/table/) -- An alias command with identical functionality \ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/fillnull.md b/_sql-and-ppl/ppl/commands/fillnull.md index 9fc98a533bf..51564f97de3 100644 --- a/_sql-and-ppl/ppl/commands/fillnull.md +++ b/_sql-and-ppl/ppl/commands/fillnull.md @@ -25,9 +25,9 @@ fillnull value= [] The following syntax variations are available: -* `with in ` -- Apply same value to specified fields -* `using =, ...` -- Apply different values to different fields -* `value= []` -- Alternative syntax with optional space-delimited field list +* `with in ` -- Apply the same value to specified fields. +* `using =, ...` -- Apply different values to different fields. +* `value= []` -- Alternative syntax with an optional space-delimited field list. ## Parameters @@ -62,7 +62,7 @@ The query returns the following results: ## Example 2: Replace null values in multiple fields with a specified value -The following query replaces null values in both email and employer fields with `\`: +The following query replaces null values in both the `email` and `employer` fields with `\`: ```sql source=accounts @@ -123,7 +123,7 @@ The query returns the following results: | daleadams@boink.com | \ | -## Example 5: Replace null values in specific fields using the `value=` syntax +## Example 5: Replace null values in specific fields using the value= syntax The following query shows how to use the `fillnull` command with the `value=` syntax to replace null values in specific fields: @@ -144,7 +144,7 @@ The query returns the following results: | daleadams@boink.com | \ | -## Example 6: Replace null values in all fields using the `value=` syntax +## Example 6: Replace null values in all fields using the value= syntax When no `field-list` is specified, the replacement applies to all fields in the result: @@ -169,8 +169,8 @@ The query returns the following results: The `fillnull` command has the following limitations: -* When applying the same value to all fields without specifying field names, all fields must be the same type. For mixed types, use separate `fillnull` commands or explicitly specify fields. -* The replacement value type must match all field types in the field list. When applying the same value to multiple fields, all fields must be the same type (all strings or all numeric). The following query shows the error that occurs when this rule is violated: +* When applying the same value to all fields without specifying field names, all fields must be of the same type. For mixed types, use separate `fillnull` commands or explicitly specify fields. 
+* The replacement value type must match all field types in the field list. When applying the same value to multiple fields, all fields must be of the same type (all strings or all numeric). The following query shows the error that occurs when this rule is violated: ```sql # This FAILS - same value for mixed-type fields diff --git a/_sql-and-ppl/ppl/commands/index.md b/_sql-and-ppl/ppl/commands/index.md index 1941752b36d..227990127c9 100644 --- a/_sql-and-ppl/ppl/commands/index.md +++ b/_sql-and-ppl/ppl/commands/index.md @@ -14,4 +14,4 @@ redirect_from: # Commands -PPL supports most common [SQL functions](https://docs.opensearch.org/latest/search-plugins/sql/functions/), including [relevance search](https://docs.opensearch.org/latest/search-plugins/sql/full-text/), but also introduces several more functions (called _commands_), which are available in PPL only. +PPL supports most common [SQL functions](https://docs.opensearch.org/latest/search-plugins/sql/functions/), including [relevance search](https://docs.opensearch.org/latest/search-plugins/sql/full-text/), but also introduces several more functions, called _commands_, which are available in PPL only. diff --git a/_sql-and-ppl/ppl/commands/join.md b/_sql-and-ppl/ppl/commands/join.md index dd757e75d19..49ba3133e79 100644 --- a/_sql-and-ppl/ppl/commands/join.md +++ b/_sql-and-ppl/ppl/commands/join.md @@ -51,8 +51,8 @@ The basic `join` syntax supports the following parameters. | `` | Required | A comparison expression specifying how to join the datasets. Must be placed after the `on` or `where` keyword in the query. | | `` | Required | The right dataset, which can be an index or a subsearch, with or without an alias. | | `joinType` | Optional | The type of join to perform. Valid values are `left`, `semi`, `anti`, and performance-sensitive types (`right`, `full`, and `cross`). Default is `inner`. | -| `left` | Optional | An alias for the left dataset (typically a subsearch), used to avoid ambiguous field names. Specify as `left = `. | -| `right` | Optional | An alias for the right dataset (typically, a subsearch), used to avoid ambiguous field names. Specify as `right = `. | +| `left` | Optional | An alias for the left dataset (typically a subsearch) used to avoid ambiguous field names. Specify as `left = `. | +| `right` | Optional | An alias for the right dataset (typically, a subsearch) used to avoid ambiguous field names. Specify as `right = `. | ### Extended syntax @@ -85,8 +85,8 @@ The extended `join` syntax supports the following parameters. | `` | Optional | A list of fields used to build the join criteria. These fields must exist in both datasets. If not specified, all fields common to both datasets are used as join keys. | | `overwrite` | Optional | Applicable only when `join-field-list` is specified. Specifies whether fields from the right dataset with duplicate names should replace corresponding fields in the main search results. Default is `true`. | | `max` | Optional | The maximum number of subsearch results to join with each row in the main search. Default is `0` (unlimited). | -| `left` | Optional | An alias for the left dataset (typically a subsearch), used to avoid ambiguous field names. Specify as `left = `. | -| `right` | Optional | An alias for the right dataset (typically, a subsearch), used to avoid ambiguous field names. Specify as `right = `. | +| `left` | Optional | An alias for the left dataset (typically a subsearch) used to avoid ambiguous field names. Specify as `left = `. 
| +| `right` | Optional | An alias for the right dataset (typically, a subsearch) used to avoid ambiguous field names. Specify as `right = `. | ## Configuration @@ -127,7 +127,7 @@ The query returns the following results: | 100000.0 | 70 | England | -## Example 2: Join with subsearch +## Example 2: Join with a subsearch The following query combines a dataset with a subsearch using the basic `join` syntax: @@ -202,7 +202,7 @@ The query returns the following results: The `join` command has the following limitations: -* **Field name ambiguity in basic syntax** – When fields from the left and right datasets share the same name, the field names in the output are ambiguous. To resolve this, conflicting fields are renamed to `.id`, or `.id` if no alias is specified. +* **Field name ambiguity in basic syntax** – When fields from the left and right datasets share the same name, the field names in the output are ambiguous. To resolve this, conflicting fields are renamed to `.id` (or `.id` if no alias is specified). The following table demonstrates how field name conflicts are resolved when both `table1` and `table2` contain a field named `id`. diff --git a/_sql-and-ppl/ppl/commands/kmeans.md b/_sql-and-ppl/ppl/commands/kmeans.md index b1167c8ff85..c677c66f7f0 100644 --- a/_sql-and-ppl/ppl/commands/kmeans.md +++ b/_sql-and-ppl/ppl/commands/kmeans.md @@ -11,7 +11,7 @@ nav_order: 22 The `kmeans` command is deprecated in favor of the [`ml` command]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/ml/). {: .warning} -The `kmeans` command applies the k-means algorithm in the ml-commons plugin on the search results returned by a PPL command. +The `kmeans` command applies the k-means algorithm in the ML Commons plugin on the search results returned by a PPL command. To use the `kmeans` command, `plugins.calcite.enabled` must be set to `false`. {: .note} @@ -30,14 +30,14 @@ The `kmeans` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | -| `` | Optional | The number of clusters you want to group your data points into. Default is `2`. | +| `` | Optional | The number of clusters to group data points into. Default is `2`. | | `` | Optional | The number of iterations. Default is `10`. | | `` | Optional | The distance type. Valid values are `COSINE`, `L1`, and `EUCLIDEAN`. Default is `EUCLIDEAN`. 
| ## Example: Clustering of the Iris dataset -The following query classifies three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample (the lengths and widths of sepals and petals): +The following query classifies three Iris species (Iris setosa, Iris virginica, and Iris versicolor) based on the combination of four features measured from each sample (the lengths and widths of sepals and petals): ```sql source=iris_data From bd32ba2b72c0ea5bc3e4bb124ea737f39032ef29 Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Wed, 24 Dec 2025 13:09:36 -0500 Subject: [PATCH 6/9] Apply suggestions from code review Signed-off-by: Nathan Bower --- _sql-and-ppl/ppl/commands/lookup.md | 2 +- _sql-and-ppl/ppl/commands/ml.md | 10 +++++----- _sql-and-ppl/ppl/commands/multisearch.md | 4 ++-- _sql-and-ppl/ppl/commands/parse.md | 2 +- _sql-and-ppl/ppl/commands/patterns.md | 7 ++++--- _sql-and-ppl/ppl/commands/regex.md | 8 ++++---- _sql-and-ppl/ppl/commands/replace.md | 8 ++++---- _sql-and-ppl/ppl/commands/reverse.md | 6 +++--- 8 files changed, 24 insertions(+), 23 deletions(-) diff --git a/_sql-and-ppl/ppl/commands/lookup.md b/_sql-and-ppl/ppl/commands/lookup.md index d22cd582c36..f2ed435f332 100644 --- a/_sql-and-ppl/ppl/commands/lookup.md +++ b/_sql-and-ppl/ppl/commands/lookup.md @@ -40,7 +40,7 @@ The `lookup` command supports the following parameters. | `` | Required | A key in the lookup index used for matching, similar to a join key in the right table. Specify multiple fields as a comma-separated list. | | `` | Optional | A key from the source data (left side) used for matching, similar to a join key in the left table. Default is `lookupMappingField`. | | `` | Optional | A field in the lookup index whose matched values are applied to the results (output). Specify multiple fields as a comma-separated list. If not specified, all fields except `lookupMappingField` from the lookup index are applied to the results. | -| `` | Optional | The name of the field in the results (output) in which matched values are placed. Specify multiple fields as a comma-separated list. If the `outputField` specifies an existing field in the source query, its values are replaced or appended with matched values from the `inputField`. If the field specified in the `outputField` is not an existing field, a new field is added to the results when using `replace` or the operation fails when using `append`. | +| `` | Optional | The name of the field in the results (output) in which matched values are placed. Specify multiple fields as a comma-separated list. If the `outputField` specifies an existing field in the source query, its values are replaced or appended with matched values from the `inputField`. If the field specified in the `outputField` is not an existing field, a new field is added to the results when using `replace`, or the operation fails when using `append`. | | `(replace | append)` | Optional | Specifies how matched values are applied to the output. `replace` overwrites existing values with matched values from the lookup index. `append` fills only missing values in the results with matched values from the lookup index. Default is `replace`. 
| ## Example 1: Replace existing values diff --git a/_sql-and-ppl/ppl/commands/ml.md b/_sql-and-ppl/ppl/commands/ml.md index e0bc25ff52d..de9e45d69fb 100644 --- a/_sql-and-ppl/ppl/commands/ml.md +++ b/_sql-and-ppl/ppl/commands/ml.md @@ -8,7 +8,7 @@ nav_order: 24 # ml -The `ml` command applies machine learning algorithms from the ML Commons plugin to the search results returned by a PPL command. It supports various machine learning operations, including anomaly detection and clustering. The command can perform train, predict, or combined train-and-predict operations, depending on the algorithm and specified action. +The `ml` command applies machine learning (ML) algorithms from the ML Commons plugin to the search results returned by a PPL command. It supports various ML operations, including anomaly detection and clustering. The command can perform train, predict, or combined train-and-predict operations, depending on the algorithm and specified action. To use the `ml` command, `plugins.calcite.enabled` must be set to `false`. {: .note} @@ -25,7 +25,7 @@ The `ml` command supports different syntax options, depending on the algorithm. ### Anomaly detection for time-series data -Use this syntax to detect anomalies in time-series data. This method uses the Random Cut Forest (RCF) algorithm optimized for sequential data patterns: +Use this syntax to detect anomalies in time-series data. This method uses the RCF algorithm optimized for sequential data patterns: ```sql ml action='train' algorithm='rcf' @@ -33,7 +33,7 @@ ml action='train' algorithm='rcf' ### Parameters -The fixed in time RCF algorithm supports the following parameters. +The fixed-in-time RCF algorithm supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | @@ -50,7 +50,7 @@ The fixed in time RCF algorithm supports the following parameters. ### Anomaly detection for non-time-series data -Use this syntax to detect anomalies in data where the order doesn't matter. This method uses the Random Cut Forest (RCF) algorithm optimized for independent data points: +Use this syntax to detect anomalies in data where the order doesn't matter. This method uses the RCF algorithm optimized for independent data points: ```sql ml action='train' algorithm='rcf' @@ -84,7 +84,7 @@ The k-means clustering algorithm supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | -| `centroids` | Optional | The number of clusters you want to group your data points into. Default is `2`. | +| `centroids` | Optional | The number of clusters to group data points into. Default is `2`. | | `iterations` | Optional | The number of iterations. Default is `10`. | | `distance_type` | Optional | The distance type. Valid values are `COSINE`, `L1`, and `EUCLIDEAN`. Default is `EUCLIDEAN`. | diff --git a/_sql-and-ppl/ppl/commands/multisearch.md b/_sql-and-ppl/ppl/commands/multisearch.md index 2988f922e01..9dfa8cabbe7 100644 --- a/_sql-and-ppl/ppl/commands/multisearch.md +++ b/_sql-and-ppl/ppl/commands/multisearch.md @@ -11,7 +11,7 @@ nav_order: 25 The `multisearch` command runs multiple subsearches and merges their results. It allows you to combine data from different queries on the same or different sources. You can optionally apply subsequent processing, such as aggregation or sorting, to the combined results. Each subsearch can have different filtering criteria, data transformations, and field selections. 
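As a quick illustration, the following minimal sketch (it reuses the `accounts` index from the examples below; the filter values are illustrative assumptions) merges two independently filtered subsearches and then aggregates the combined rows:

```sql
| multisearch
    [search source=accounts | where age > 35]
    [search source=accounts | where gender = "F"]
| stats count() by gender
```
{% include copy.html %}

Because the merged output is a simple union, an account matching both filters contributes one row per subsearch that returns it.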
-Multisearch is particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria. The command supports timestamp-based result interleaving when working with time-series data. +Multisearch is particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria. The command supports timestamp-based result interleaving when working with time-series data. Use multisearch for: @@ -126,7 +126,7 @@ The query returns the following results: ## Example 4: Handling missing fields across subsearches -This example demonstrates how multisearch handles schema differences when subsearches return different fields. When one subsearch includes a field that others don't have, missing values are automatically filled with null values: +This example demonstrates how `multisearch` handles schema differences when subsearches return different fields. When one subsearch includes a field that others don't have, missing values are automatically filled with null values: ```sql | multisearch [search source=accounts diff --git a/_sql-and-ppl/ppl/commands/parse.md b/_sql-and-ppl/ppl/commands/parse.md index 6d9d83d99ec..00c165f1649 100644 --- a/_sql-and-ppl/ppl/commands/parse.md +++ b/_sql-and-ppl/ppl/commands/parse.md @@ -10,7 +10,7 @@ nav_order: 26 The `parse` command extracts information from a text field using a regular expression and adds the extracted information to the search results. It uses Java regex patterns. For more information, see the [Java regular expression documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -## rex and parse commands compared +## The rex and parse commands compared The `rex` and `parse` commands both extract information from text fields using Java regular expressions with named capture groups. To compare the capabilities of the `rex` and `parse` commands, see the [`rex` command documentation]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/rex/). diff --git a/_sql-and-ppl/ppl/commands/patterns.md b/_sql-and-ppl/ppl/commands/patterns.md index 6b4c9ad1315..4a7f8a3b1bc 100644 --- a/_sql-and-ppl/ppl/commands/patterns.md +++ b/_sql-and-ppl/ppl/commands/patterns.md @@ -8,7 +8,7 @@ nav_order: 27 # patterns -The `patterns` command extracts log patterns from a text field and appends the results to the search results. Grouping logs by pattern simplifies aggregating statistics from large volumes of log data for analysis and troubleshooting. You can choose from the following log-parsing methods to achieve high pattern-grouping accuracy: +The `patterns` command extracts log patterns from a text field and appends the results to the search results. Grouping logs by pattern simplifies aggregating statistics from large volumes of log data for analysis and troubleshooting. You can choose from the following log parsing methods to achieve high pattern-grouping accuracy: * `simple_pattern`: A parsing method that uses [Java regular expressions](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). * `brain`: An automatic log-grouping method that provides high grouping accuracy while preserving semantic meaning. 
@@ -78,6 +78,7 @@ By default, the Apache Calcite engine labels variables using the `<*>` placehold ## Changing the default pattern method To override default pattern parameters, run the following command: + ```json PUT _cluster/settings { @@ -159,7 +160,7 @@ The query returns the following results: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*><*><*>.<*><*><*>.<*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "POST /users HTTP/<*>.<*>" <*><*><*> <*><*><*><*> | -### Example 4: Return log pattern aggregation result +### Example 4: Return a log pattern aggregation result The following query aggregates patterns extracted from a raw log field: @@ -246,7 +247,7 @@ The query returns the following results: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | -### Example 3: Return log pattern aggregation result +### Example 3: Return a log pattern aggregation result The following query aggregates patterns extracted from a raw log field using the `brain` algorithm: diff --git a/_sql-and-ppl/ppl/commands/regex.md b/_sql-and-ppl/ppl/commands/regex.md index 6cdeed53e48..83c294a3e98 100644 --- a/_sql-and-ppl/ppl/commands/regex.md +++ b/_sql-and-ppl/ppl/commands/regex.md @@ -21,8 +21,8 @@ regex != The following operators are supported: -* `=` - Positive matching (include matches). -* `!=` - Negative matching (exclude matches). +* `=` -- Positive matching (include matches) +* `!=` -- Negative matching (exclude matches) The `regex` command uses Java's built-in regular expression engine, which supports: @@ -120,7 +120,7 @@ The query returns the following results: ## Example 5: Case-sensitive matching -By default, regex matching is case sensitive. The following query searches for lowercase state name `va`: +By default, regex matching is case sensitive. The following query searches for the lowercase state name `va`: ```sql source=accounts @@ -131,7 +131,7 @@ source=accounts The query returns no results because the regex pattern `va` (lowercase) does not match any state values in the data. -The following query searches for uppercase state name `VA`: +The following query searches for the uppercase state name `VA`: ```sql source=accounts diff --git a/_sql-and-ppl/ppl/commands/replace.md b/_sql-and-ppl/ppl/commands/replace.md index 187b363ac50..f73ae320390 100644 --- a/_sql-and-ppl/ppl/commands/replace.md +++ b/_sql-and-ppl/ppl/commands/replace.md @@ -93,7 +93,7 @@ The query returns the following results: ## Example 4: Replace text using multiple pattern-replacement pairs -The following query uses the `replace` command with multiple pattern and replacement pairs in a single replace command. The replacements are applied sequentially. +The following query uses the `replace` command with multiple pattern and replacement pairs in a single replace command. The replacements are applied sequentially: ```sql source=accounts @@ -175,7 +175,7 @@ The query returns the following results: ## Example 8: Wildcard capture and substitution -The following query uses wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement: +The following query uses wildcards in both the pattern and replacement to capture and reuse matched portions. 
The number of wildcards must match in the pattern and replacement: ```sql source=accounts @@ -283,6 +283,6 @@ The query returns the following results: The `replace` command has the following limitations: -* **Wildcards**: The `*` wildcard matches zero or more characters and is case-sensitive. -* **Wildcard matching**: Replacement wildcards must match the pattern wildcard count, or be zero. +* **Wildcards**: The `*` wildcard matches zero or more characters and is case sensitive. +* **Wildcard matching**: Replacement wildcards must match the pattern wildcard count or be zero. * **Escape sequences**: Use `\*` for literal asterisk and `\\` for literal backslash characters. \ No newline at end of file diff --git a/_sql-and-ppl/ppl/commands/reverse.md b/_sql-and-ppl/ppl/commands/reverse.md index 14feb43bc78..a8b3a9506f9 100644 --- a/_sql-and-ppl/ppl/commands/reverse.md +++ b/_sql-and-ppl/ppl/commands/reverse.md @@ -8,9 +8,9 @@ nav_order: 32 # reverse -The `reverse` command reverses the display order of the search results. It returns the same results, but in the opposite order. +The `reverse` command reverses the display order of the search results. It returns the same results but in the opposite order. -The `reverse` command processes the entire dataset. If applied directly to millions of records, it consumes significant memory resources on the coordinating node. Only apply the `reverse` command to smaller datasets, typically after aggregation operations. +The `reverse` command processes the entire dataset. If applied directly to millions of records, it consumes significant coordinating node memory resources. Only apply the `reverse` command to smaller datasets, typically after aggregation operations. {: .note} ## Syntax @@ -88,7 +88,7 @@ The query returns the following results: ## Example 4: Double reverse -The following query shows that applying reverse twice returns documents in the original order: +The following query shows that applying `reverse` twice returns documents in the original order: ```sql source=accounts From 51faa4fc5f071799b2ab597815d0ec7053b6f59f Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Wed, 24 Dec 2025 13:11:46 -0500 Subject: [PATCH 7/9] Apply suggestions from code review Signed-off-by: Nathan Bower --- _sql-and-ppl/ppl/commands/rex.md | 8 +++---- _sql-and-ppl/ppl/commands/search.md | 21 ++++++++++--------- _sql-and-ppl/ppl/commands/showdatasources.md | 2 +- _sql-and-ppl/ppl/commands/sort.md | 2 +- _sql-and-ppl/ppl/commands/spath.md | 2 +- _sql-and-ppl/ppl/commands/stats.md | 22 ++++++++++---------- 6 files changed, 29 insertions(+), 28 deletions(-) diff --git a/_sql-and-ppl/ppl/commands/rex.md b/_sql-and-ppl/ppl/commands/rex.md index 93661864010..0cc071cc182 100644 --- a/_sql-and-ppl/ppl/commands/rex.md +++ b/_sql-and-ppl/ppl/commands/rex.md @@ -10,7 +10,7 @@ nav_order: 33 The `rex` command extracts fields from a raw text field using regular expression named capture groups. It uses Java regex patterns. For more information, see the [Java regular expression documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -## rex and parse commands compared +## The rex and parse commands compared The `rex` and [`parse`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/parse/) commands both extract information from text fields using Java regular expressions with named capture groups. The following table compares the capabilities of the `rex` and `parse` commands. 
@@ -51,7 +51,7 @@ You can set the `max_match` limit in the `plugins.ppl.rex.max_match.limit` clust ## Example 1: Basic text extraction -The following query extracts username and domain from email addresses using named capture groups. Both extracted fields are returned as strings: +The following query extracts the username and domain from email addresses using named capture groups. Both extracted fields are returned as strings: ```sql source=accounts @@ -150,7 +150,7 @@ The query returns the following results: | hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 | -## Example 6: Extract complex email pattern +## Example 6: Extract a complex email pattern The following query extracts complete email components, including the top-level domain. All extracted fields are returned as strings: @@ -251,7 +251,7 @@ The query returns the following results: | --- | --- | | 880 Holmes Lane | 10 | -**PPL query exceeding the configured limit results in an error**: +**A PPL query exceeding the configured limit results in an error**: ```sql source=accounts diff --git a/_sql-and-ppl/ppl/commands/search.md b/_sql-and-ppl/ppl/commands/search.md index b4b925e0253..46000679ea0 100644 --- a/_sql-and-ppl/ppl/commands/search.md +++ b/_sql-and-ppl/ppl/commands/search.md @@ -8,7 +8,7 @@ nav_order: 34 # search -The `search` command retrieves documents from the index. The `search` command can only be used as the first command in the PPL query. +The `search` command retrieves documents from the index. The `search` command can only be used as the first command in a PPL query. ## Syntax @@ -25,7 +25,7 @@ The `search` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `` | Required | The index to query. The index name can be prefixed with `:` (the remote cluster name) for cross-cluster search. | -| `` | Optional | A search expression that is converted to OpenSearch [query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) query. | +| `` | Optional | A search expression that is converted to an OpenSearch [query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) query. | ## Search expression @@ -85,7 +85,7 @@ The following are examples of common time modifier patterns: * `earliest=-7d` -- Start from 7 days ago. * `latest='+1d@d'` -- End at the start of tomorrow. * `earliest='-1month@month'` -- Start from the beginning of the previous month. -* `latest=1754020061` -- End at a Unix timestamp `1754020061` (August 1, 2025, 03:47:41 UTC). +* `latest=1754020061` -- End at the Unix timestamp `1754020061` (August 1, 2025, 03:47:41 UTC). The following considerations apply when using time modifiers in the `search` command: @@ -207,6 +207,7 @@ The query returns the following results: `search user email` is equivalent to `search user AND email`. {: .note} + Enclose terms containing special characters in double quotation marks: ```sql @@ -326,7 +327,7 @@ The query returns the following results: **`NOT` operator** -Find all accounts that do not specify `Quility` as employer (including those with null employer values): +Find all accounts that do not specify `Quility` as the employer (including those with null employer values): ```sql search NOT employer="Quility" source=accounts @@ -343,7 +344,7 @@ The query returns the following results. Dale Adams appears in the search result ## Example 5: Range queries -Use comparison operators (`>,` `<,` `>=`, and `<=`) to filter numeric and date fields within specific ranges. 
Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics: +Use comparison operators (`>,` `<,` `>=` and `<=`) to filter numeric and date fields within specific ranges. Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics: ```sql search severityNumber>15 AND severityNumber<=20 source=otellogs @@ -436,7 +437,7 @@ The query returns the following results: ## Example 7: Wildcard patterns in field searches -When searching in text or keyword fields, wildcards enable partial matching, which is useful when you only know part of a value. Wildcards work best on keyword fields, for which they match the exact value using patterns. Using wildcards on text fields may produce unexpected results because they apply to individual tokens after analysis, not the entire field value. Wildcards in keyword fields are case-sensitive unless normalized at indexing. +When searching in text or keyword fields, wildcards enable partial matching, which is useful when you only know part of a value. Wildcards work best on keyword fields, for which they match the exact value using patterns. Using wildcards on text fields may produce unexpected results because they apply to individual tokens after analysis, not the entire field value. Wildcards in keyword fields are case sensitive unless normalized at indexing. Leading wildcards (for example, `*@example.com`) can decrease query speed compared to trailing wildcards. {: .note} @@ -525,7 +526,7 @@ The query returns the following results: ## Example 9: Complex expressions -To create sophisticated search queries, combine multiple conditions using Boolean operators and parentheses. +To create sophisticated search queries, combine multiple conditions using Boolean operators and parentheses: ```sql search (severityText="ERROR" OR severityText="WARN") AND severityNumber>10 source=otellogs @@ -543,7 +544,7 @@ The query returns the following results: | WARN | | ERROR | -Combine multiple conditions with OR and AND operators to search for logs matching either a specific user or high-severity fund errors: +Combine multiple conditions with `OR` and `AND` operators to search for logs matching either a specific user or high-severity fund errors: ```sql search `attributes.user.email`="user@example.com" OR (`attributes.error.code`="INSUFFICIENT_FUNDS" AND severityNumber>15) source=otellogs @@ -585,7 +586,7 @@ The query returns the following results: ### Relative time filtering -Filter logs using relative time expressions, such as events that occurred before 30 seconds ago: +Filter logs using relative time expressions, such as those that occurred before 30 seconds ago: ```sql search latest=-30s source=otellogs @@ -605,7 +606,7 @@ The query returns the following results: ### Time rounding -Use time rounding expressions to filter events relative to time boundaries, such as before the start of the current minute: +Use time rounding expressions to filter events relative to time boundaries, such as those before the start of the current minute: ```sql search latest='@m' source=otellogs diff --git a/_sql-and-ppl/ppl/commands/showdatasources.md b/_sql-and-ppl/ppl/commands/showdatasources.md index de287f86c5a..8de93568cdd 100644 --- a/_sql-and-ppl/ppl/commands/showdatasources.md +++ b/_sql-and-ppl/ppl/commands/showdatasources.md @@ -8,7 +8,7 @@ nav_order: 35 # show datasources -The `show datasources` command queries data sources configured in the PPL engine. 
The `show datasources` command can only be used as the first command in the PPL query. +The `show datasources` command queries data sources configured in the PPL engine. The `show datasources` command can only be used as the first command in a PPL query. To use the `show datasources` command, `plugins.calcite.enabled` must be set to `false`. {: .note} diff --git a/_sql-and-ppl/ppl/commands/sort.md b/_sql-and-ppl/ppl/commands/sort.md index ee28f389b30..32c8b8b5d81 100644 --- a/_sql-and-ppl/ppl/commands/sort.md +++ b/_sql-and-ppl/ppl/commands/sort.md @@ -12,7 +12,7 @@ The `sort` command sorts the search results by the specified fields. ## Syntax -The `sort` command supports two syntax notations. You must use one notation consistently within a single sort command. +The `sort` command supports two syntax notations. You must use one notation consistently within a single `sort` command. ### Prefix notation diff --git a/_sql-and-ppl/ppl/commands/spath.md b/_sql-and-ppl/ppl/commands/spath.md index 471babf7802..330178e5a62 100644 --- a/_sql-and-ppl/ppl/commands/spath.md +++ b/_sql-and-ppl/ppl/commands/spath.md @@ -28,7 +28,7 @@ The `spath` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `input` | Required | The field containing JSON data to parse. | -| `output` | Optional | The destination field in which the extracted data is stored. Default is the value of `path`. | +| `output` | Optional | The destination field in which the extracted data is stored. Default is the value of ``. | | `` | Required | The JSON path that identifies the data to extract. | diff --git a/_sql-and-ppl/ppl/commands/stats.md b/_sql-and-ppl/ppl/commands/stats.md index 765a311c5ee..bfbc551bb69 100644 --- a/_sql-and-ppl/ppl/commands/stats.md +++ b/_sql-and-ppl/ppl/commands/stats.md @@ -10,7 +10,7 @@ nav_order: 38 The `stats` command calculates aggregations on the search results. -## Comparing `stats`, `eventstats`, and `streamstats` +## Comparing stats, eventstats, and streamstats For a comprehensive comparison of `stats`, `eventstats`, and `streamstats` commands, including their differences in transformation behavior, output format, aggregation scope, and use cases, see [Comparing stats, eventstats, and streamstats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/streamstats/#comparing-stats-eventstats-and-streamstats). @@ -30,12 +30,12 @@ The `stats` command supports the following parameters. | --- | --- | --- | | `` | Required | An aggregation function. | | `` | Optional | Groups results by specified fields or expressions. Syntax: `by [span-expression,] [field,]...` If no `by-clause` is specified, the stats command returns only one row, which is the aggregation over the entire search results. | -| `bucket_nullable` | Optional | Controls whether to include `null` buckets in group-by aggregations. When `false`, ignores records in which the group-by field is null, resulting in faster performance. Default is the value of `plugins.ppl.syntax.legacy.preferred`. | +| `bucket_nullable` | Optional | Controls whether to include `null` buckets in group-by aggregations. When `false`, ignores records in which the `group-by` field is null, resulting in faster performance. Default is the value of `plugins.ppl.syntax.legacy.preferred`. | | `` | Optional | Splits a field into buckets by intervals (maximum of one). Syntax: `span(field_expr, interval_expr)`. By default, the interval uses the field's default unit. For date/time fields, aggregation results ignore null values. 
Examples: `span(age, 10)` creates 10-year age buckets, and `span(timestamp, 1h)` creates hourly buckets. Valid time units are millisecond (`ms`), second (`s`), minute (`m`), hour (`h`), day (`d`), week (`w`), month (`M`), quarter (`q`), year (`y`). | ## Aggregation functions -The stats command supports the following aggregation functions: +The `stats` command supports the following aggregation functions: * `COUNT`/`C` -- Count of values * `SUM` -- Sum of numeric values @@ -171,7 +171,7 @@ The query returns the following results: ## Example 7: Calculate the distinct count of a field -To retrieve the count of distinct values of a field, you can use `DISTINCT_COUNT` (or `DC`) function instead of `COUNT`. The following query calculates both the count and the distinct count of the `gender` field for all accounts: +To retrieve the count of distinct values of a field, you can use the `DISTINCT_COUNT` (or `DC`) function instead of `COUNT`. The following query calculates both the count and the distinct count of the `gender` field for all accounts: ```sql source=accounts @@ -239,9 +239,9 @@ The query returns the following results: | 1 | 35 | M | -## Example 10: Count and retrieve email list by gender and age span +## Example 10: Count and retrieve an email list by gender and age span -The following query calculates the count of `age` values grouped into 5-year intervals and by `gender`, and also returns a list of up to 5 emails for each group: +The following query calculates the count of `age` values grouped into 5-year intervals as well as by `gender` and also returns a list of up to 5 emails for each group: ```sql source=accounts @@ -295,7 +295,7 @@ The query returns the following results: ## Example 13: Calculate the percentile by a gender and span -The following query calculates the 90th percentile of `age`, grouped into 10-year intervals and by `gender`: +The following query calculates the 90th percentile of `age`, grouped into 10-year intervals as well as by `gender`: ```sql source=accounts @@ -328,7 +328,7 @@ The query returns the following results: | [Amber,Hattie,Nanette,Dale] | -## Example 15: Ignore null bucket +## Example 15: Ignore a null bucket The following query excludes null values from grouping by setting `bucket_nullable=false`: @@ -366,7 +366,7 @@ The query returns the following results: ## Example 17: Date span grouping with null handling -The following example uses this sample index data. +The following example uses this sample index data: | Name | DEPTNO | birthday | | --- | --- | --- | @@ -459,7 +459,7 @@ source=hits This query is translated into a `terms` aggregation in OpenSearch with `"order": { "_count": "desc" }`. For fields with high cardinality, some buckets may be discarded, so the results may only be approximate. -### Sorting by `doc_count` in ascending order may produce inaccurate results +### Sorting by doc_count in ascending order may produce inaccurate results When retrieving the least frequent terms for high-cardinality fields, results may be inaccurate. Shard-level aggregations can miss globally rare terms or misrepresent their frequency, causing errors in the overall results. @@ -474,4 +474,4 @@ source=hits {% include copy.html %} -A globally rare term might not appear as rare on every shard or could be entirely absent from some shard results. Conversely, a term infrequent on one shard might be common on another. In both cases, shard-level approximations can cause rare terms to be missed, leading to inaccurate overall results. 
+A globally rare term might not appear as rare on every shard or could be entirely absent from some shard results. Conversely, a term that is infrequent on one shard might be common on another. In both cases, shard-level approximations can cause rare terms to be missed, leading to inaccurate overall results. From be40eb3e2e157c683fe4376017b26e422087893c Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Wed, 24 Dec 2025 13:13:26 -0500 Subject: [PATCH 8/9] Apply suggestions from code review Signed-off-by: Nathan Bower --- _sql-and-ppl/ppl/commands/streamstats.md | 28 ++++++++++++------------ _sql-and-ppl/ppl/commands/subquery.md | 2 +- _sql-and-ppl/ppl/commands/syntax.md | 8 +++---- _sql-and-ppl/ppl/commands/table.md | 4 ++-- _sql-and-ppl/ppl/commands/timechart.md | 22 +++++++++---------- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/_sql-and-ppl/ppl/commands/streamstats.md b/_sql-and-ppl/ppl/commands/streamstats.md index f11b24cb494..99386f9166d 100644 --- a/_sql-and-ppl/ppl/commands/streamstats.md +++ b/_sql-and-ppl/ppl/commands/streamstats.md @@ -8,19 +8,19 @@ nav_order: 39 # streamstats -The `streamstats` command calculates cumulative or rolling statistics as events are processed in order. Unlike `stats` or `eventstats` which operate on the entire dataset at once, `streamstats` processes events incrementally, making it suitable for time-series and sequence-based analysis. +The `streamstats` command calculates cumulative or rolling statistics as events that are processed in order. Unlike `stats` or `eventstats`, which operate on the entire dataset at once, `streamstats` processes events incrementally, making it suitable for time-series and sequence-based analysis. -Key features include support for `window` (sliding window calculations) and `current` (whether to include the current event in calculations) parameters, and specialized use cases such as identifying trends or detecting changes over sequences of events. +Key features include support for the `window` (sliding window calculations) and `current` (whether to include the current event in calculations) parameters and specialized use cases such as identifying trends or detecting changes over sequences of events. -## Comparing `stats`, `eventstats`, and `streamstats` +## Comparing stats, eventstats, and streamstats The `stats`, `eventstats`, and `streamstats` commands can all generate aggregations such as average, sum, and maximum. However, they differ in how they operate and the results they produce. The following table summarizes these differences. | Aspect | `stats` | `eventstats` | `streamstats` | | --- | --- | --- | --- | -| Transformation behavior | Transforms all events into an aggregated result table, losing original event structure | Adds aggregation results as new fields to the original events without removing the event structure | Adds cumulative (running) aggregation results to each event as they stream through the pipeline | +| Transformation behavior | Transforms all events into an aggregated result table, losing original event structure | Adds aggregation results as new fields to the original events without removing the event structure | Adds cumulative (running) aggregation results to each event as it streams through the pipeline | | Output format | Output contains only aggregated values. 
Original raw events are not preserved | Original events remain, with extra fields containing summary statistics | Original events remain, with extra fields containing running totals or cumulative statistics | -| Aggregation scope | Based on all events in the search (or groups defined by BY clause) | Based on all relevant events, then the result is added back to each event in the group | Calculations occur progressively as each event is processed; can be scoped by window | +| Aggregation scope | Based on all events in the search (or groups defined by the `by` clause) | Based on all relevant events, then the result is added back to each event in the group | Calculations occur progressively as each event is processed; can be scoped by window | | Use cases | When only aggregated results are needed (for example, counts, averages, sums) | When aggregated statistics are needed alongside original event data | When a running total or cumulative statistic is needed across event streams | @@ -57,7 +57,7 @@ The `streamstats` command supports the following parameters. | Parameter | Required/Optional | Description | | --- | --- | --- | | `` | Required | An aggregation function or window function. | -| `bucket_nullable` | Optional | Controls whether to consider null buckets as a valid group in group-by aggregations. When `false`, does not treat null group by values as a distinct group during aggregation. Default is the value of `plugins.ppl.syntax.legacy.preferred`. | +| `bucket_nullable` | Optional | Controls whether to consider null buckets as a valid group in group-by aggregations. When `false`, does not treat null group-by values as a distinct group during aggregation. Default is the value of `plugins.ppl.syntax.legacy.preferred`. | | `current` | Optional | Whether to include the current event in summary calculations. When `true`, includes the current event; when `false`, uses the field value from the previous event. Default is `true`. | | `window` | Optional | The number of events to use when computing statistics. Default is `0` (all previous and current events are used). | | `global` | Optional | Used only when `window` is specified. Determines whether to use a single window (`true`) or separate windows for each group defined by the `by` clause (`false`). When `false` and `window` is non-zero, a separate window is used for each group of values of the field specified in the `by` clause. Default is `true`. | @@ -69,7 +69,7 @@ The `streamstats` command supports the following parameters. ## Aggregation functions -The streamstats command supports the following aggregation functions: +The `streamstats` command supports the following aggregation functions: * `COUNT` -- Count of values * `SUM` -- Sum of numeric values @@ -92,7 +92,7 @@ For detailed documentation of each function, see [Functions]({{site.url}}{{site. 
## Example 1: Calculate the running average, sum, and count of a field by group -The following query calculates the running average age, running sum of age, and running count of events for all accounts, grouped by `gender`: +The following query calculates the running average `age`, running sum of `age`, and running count of events for all accounts, grouped by `gender`: ```sql source=accounts @@ -112,7 +112,7 @@ The query returns the following results: ## Example 2: Calculate the running maximum over a 2-row window -The following query calculates the running maximum age over a 2-row window, excluding the current event: +The following query calculates the running maximum `age` over a 2-row window, excluding the current event: ```sql source=state_country @@ -134,14 +134,14 @@ The query returns the following results: | David | USA | Washington | 4 | 2023 | 40 | 70 | -## Example 3: Global vs group-specific windows +## Example 3: Global compared to group-specific windows The `global` parameter takes the following values: -* `true`: A global window is applied across all rows, but the calculations inside the window still respect the by groups. -* `false`: The window itself is created per group, meaning each group gets its own independent window. +* `true`: A global window is applied across all rows, but the calculations inside the window still respect the `by` groups. +* `false`: The window itself is created per group, meaning each group receives an independent window. -The following example uses a sample index containing the following data. +The following example uses a sample index containing the following data: | name | country | state | month | year | age | | --- | --- | --- | --- | --- | --- | @@ -177,7 +177,7 @@ As a result, `David` and `Rick` are included in the same sliding window when com | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | | David | USA | Washington | 4 | 2023 | 40 | 40.0 | -In contrast, when `global=false`, each `by` group forms its own independent stream and window: +In contrast, when `global=false`, each `by` group forms an independent stream and window: ```sql source=state_country diff --git a/_sql-and-ppl/ppl/commands/subquery.md b/_sql-and-ppl/ppl/commands/subquery.md index 04af756143d..89978052828 100644 --- a/_sql-and-ppl/ppl/commands/subquery.md +++ b/_sql-and-ppl/ppl/commands/subquery.md @@ -119,7 +119,7 @@ source = outer | where a = [ source = inner | where c = [ source = nested | sta ``` {% include copy.html %} -### Relation Subquery +### Relation subquery Used in `join` operations to provide dynamic right-side data: diff --git a/_sql-and-ppl/ppl/commands/syntax.md b/_sql-and-ppl/ppl/commands/syntax.md index 2e33df6519d..246ff9dffa0 100644 --- a/_sql-and-ppl/ppl/commands/syntax.md +++ b/_sql-and-ppl/ppl/commands/syntax.md @@ -49,8 +49,8 @@ Placeholders are shown in angle brackets (`< >`). These must be replaced with ac Optional elements are enclosed in square brackets (`[ ]`). These can be omitted from the command. **Examples**: -- `[+|-]` means the plus or minus signs are optional -- `[]` means the alias placeholder is optional +- `[+|-]` means the plus or minus signs are optional. +- `[]` means the alias placeholder is optional. ### Required choices @@ -66,7 +66,7 @@ Optional choices between alternatives are shown in square brackets with pipe sep ### Repetition -Ellipsis (`...`) indicates that the preceding element can be repeated multiple times. +An ellipsis (`...`) indicates that the preceding element can be repeated multiple times. 
 **Examples**:
 - `<field>...` means one or more fields without commas: `field1 field2 field3`
 
@@ -77,7 +77,7 @@ Ellipsis (`...`) indicates that the preceding element can be repeated multiple t
 
 **Example 1: Search through accounts index**
 
-In the following query, the `search` command refers to an `accounts` index as the source and uses `fields` and `where` commands for the conditions:
+In the following query, the `search` command refers to an `accounts` index as the source and uses the `fields` and `where` commands for the conditions:
 
 ```sql
 search source=accounts
diff --git a/_sql-and-ppl/ppl/commands/table.md b/_sql-and-ppl/ppl/commands/table.md
index cfefc8334de..f99464fe499 100644
--- a/_sql-and-ppl/ppl/commands/table.md
+++ b/_sql-and-ppl/ppl/commands/table.md
@@ -25,7 +25,7 @@ The `table` command supports the following parameters.
 
 | Parameter | Required/Optional | Description |
 | --- | --- | --- |
 | `<field-list>` | Required | A comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. |
-| `[+|-]` | Optional | Specifies the fields to keep or remove. If the plus (+) is used, only the fields specified in the field list are kept. If the minus (-) is used, all the fields specified in the field list are removed. Default is `+`. |
+| `[+|-]` | Optional | Specifies the fields to keep or remove. If the plus sign (`+`) is used, only the fields specified in the field list are kept. If the minus sign (`-`) is used, all the fields specified in the field list are removed. Default is `+`. |
 
 ## Example: Basic table command usage
 
@@ -49,4 +49,4 @@ The query returns the following results:
 
 ## Related documentation
 
-- [`fields`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/fields/) - An alias command with identical functionality
\ No newline at end of file
+- [`fields`]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/fields/) -- An alias command with identical functionality
\ No newline at end of file
diff --git a/_sql-and-ppl/ppl/commands/timechart.md b/_sql-and-ppl/ppl/commands/timechart.md
index 1676b1ff157..488011d83fb 100644
--- a/_sql-and-ppl/ppl/commands/timechart.md
+++ b/_sql-and-ppl/ppl/commands/timechart.md
@@ -26,11 +26,11 @@ The `timechart` command supports the following parameters.
 
 | --- | --- | --- |
 | `timefield` | Optional | The field to use for time-based grouping. Must be a timestamp field. Default is `@timestamp`. |
 | `span` | Optional | Specifies the time interval for grouping data. Default is `1m` (1 minute). For a complete list of supported time units, see [Time units](#time-units). |
-| `limit` | Optional | Specifies the maximum number of distinct values to display when using the "by" clause. Default is `10`. When there are more distinct values than the limit, additional values are grouped into an "OTHER" category if `useother` is not set to `false`. The "most distinct" values are determined by calculating the sum of aggregation values across all time intervals. Set to `0` to show all distinct values without any limit (when `limit=0`, `useother` is automatically set to `false`). Only applies when using the "by" clause. |
+| `limit` | Optional | Specifies the maximum number of distinct values to display when using the `by` clause. Default is `10`. When there are more distinct values than the limit, additional values are grouped into an `OTHER` category if `useother` is not set to `false`. The "most distinct" values are determined by calculating the sum of aggregation values across all time intervals. Set to `0` to show all distinct values without any limit (when `limit=0`, `useother` is automatically set to `false`). Only applies when using the `by` clause. |
 | `useother` | Optional | Controls whether to create an `OTHER` category for values beyond the `limit`. When set to `false`, only the top N values (based on `limit`) are shown without an `OTHER` category. When set to `true`, values beyond the `limit` are grouped into an `OTHER` category. This parameter only applies when using the `by` clause and when there are more values than the `limit`. Default is `true`. |
 | `usenull` | Optional | Controls whether to group documents that have null values in the `by` field into a separate `NULL` category. When `usenull=false`, documents with null values in the `by` field are excluded from the results. When `usenull=true`, documents with null values in the `by` field are grouped into a separate `NULL` category. Default is `true`. |
 | `nullstr` | Optional | Specifies the category name for documents that have null values in the `by` field. This parameter only applies when `usenull` is `true`. Default is `"NULL"`. |
-| `<aggregation>` | Required | The aggregation function to apply to each time bucket. Only a single aggregation function is supported. Available functions: All aggregation functions supported by the [stats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/stats/) command, as well as the timechart-specific aggregations. |
+| `<aggregation>` | Required | The aggregation function to apply to each time bucket. Only a single aggregation function is supported. Available functions: All aggregation functions supported by the [stats]({{site.url}}{{site.baseurl}}/sql-and-ppl/ppl/commands/stats/) command as well as the timechart-specific aggregations. |
 | `by` | Optional | Groups the results by the specified field in addition to time intervals. If not specified, the aggregation is performed across all documents in each time interval. |
 
 ## Notes
@@ -114,7 +114,7 @@ The query returns the following results:
 
 ## Example 2: Count events by minute
 
-The following query counts events in each one-minute interval and groups the results by `host`:
+The following query counts events in each 1-minute interval and groups the results by `host`:
 
 ```sql
 source=events
@@ -136,7 +136,7 @@ The query returns the following results:
 | 2023-01-01 10:35:00 | server2 | 1 |
 
 
-## Example 3: Calculate the average number of packets by minute
+## Example 3: Calculate the average number of packets per minute
 
 The following query calculates the average number of packets per minute without grouping by any additional field:
 
@@ -160,7 +160,7 @@ The query returns the following results:
 | 2023-01-01 10:35:00 | 90.0 |
 
 
-## Example 4: Calculate the average number of packets by every 20 minutes and status
+## Example 4: Calculate the average number of packets per 20 minutes and status
 
 The following query calculates the average number of packets in each 20-minute interval and groups the results by `status`:
 
@@ -186,7 +186,7 @@ The query returns the following results:
 
 ## Example 5: Count events by hour and category
 
-The following query counts events in each one-second interval and groups the results by `category`:
+The following query counts events in each 1-second interval and groups the results by `category`:
 
 ```sql
 source=events
@@ -258,7 +258,7 @@ All 11 hosts are returned as separate rows without an `OTHER` category:
 | 2024-07-01 00:00:00 | web-10 | 1 |
 | 2024-07-01 00:00:00 | web-11 | 1 |
 
-## Example 8: Use useother=false with count() function
+## Example 8: Use useother=false with the count() function
 
 The following query limits the results to the top 10 hosts without creating an `OTHER` category by setting `useother=false`:
 
@@ -284,7 +284,7 @@ The query returns the following results:
 | 2024-07-01 00:00:00 | web-10 | 1 |
 
 
-## Example 9: Use limit with useother parameter and avg() function
+## Example 9: Use the limit parameter with the useother parameter and the avg() function
 
 The following query displays the top 3 hosts based on average `cpu_usage` per hour. All remaining hosts are grouped into an `OTHER` category (by default, `useother=true`):
 
@@ -303,7 +303,7 @@ The query returns the following results:
 | 2024-07-01 00:00:00 | web-07 | 48.6 |
 | 2024-07-01 00:00:00 | web-09 | 67.8 |
 
-The following query displays the top 3 hosts based on average `cpu_usage` per hour, without creating an `OTHER` category by setting `useother=false`:
+The following query displays the top 3 hosts based on average `cpu_usage` per hour without creating an `OTHER` category by setting `useother=false`:
 
 ```sql
 source=events_many_hosts
@@ -320,7 +320,7 @@ The query returns the following results:
 | 2024-07-01 00:00:00 | web-09 | 67.8 |
 
 
-## Example 10: Handling null values in the `by` field
+## Example 10: Handling null values in the by field
 
 The following query demonstrates how null values in the `by` field are treated as a separate category:
 
@@ -340,7 +340,7 @@ The `events_null` dataset contains one entry without a `host` value. Because the
 | 2024-07-01 00:00:00 | web-02 | 2 |
 
 
-## Example 11: Calculate packets per second rate
+## Example 11: Calculate the per-second packet rate
 
 The following query calculates the per-second packet rate for network traffic data using the `per_second()` function:
 

From 27162ba0df5f91b4796c4e94e0e7f5bcb7a29104 Mon Sep 17 00:00:00 2001
From: Nathan Bower
Date: Wed, 24 Dec 2025 13:14:20 -0500
Subject: [PATCH 9/9] Apply suggestions from code review

Signed-off-by: Nathan Bower

---
 _sql-and-ppl/ppl/commands/top.md       | 4 ++--
 _sql-and-ppl/ppl/commands/trendline.md | 6 +++---
 _sql-and-ppl/ppl/commands/where.md     | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/_sql-and-ppl/ppl/commands/top.md b/_sql-and-ppl/ppl/commands/top.md
index 8d1110088ad..6e40c01d647 100644
--- a/_sql-and-ppl/ppl/commands/top.md
+++ b/_sql-and-ppl/ppl/commands/top.md
@@ -28,7 +28,7 @@ The `top` command supports the following parameters.
 
 | Parameter | Required/Optional | Description |
 | --- | --- | --- |
 | `<N>` | Optional | The number of results to return. Default is `10`. |
-| `top-options` | Optional | `showcount`: Whether to create a field in output that represents a count of the tuple of values. Default is `true`.<br>`countfield`: The name of the field that contains the count. Default is `count`.<br>`usenull`: Whether to output `null` values. Default is the value of `plugins.ppl.syntax.legacy.preferred`. |
+| `top-options` | Optional | `showcount`: Whether to create a field in the output that represents a count of the tuple of values. Default is `true`.<br>`countfield`: The name of the field that contains the count. Default is `count`.<br>`usenull`: Whether to output `null` values. Default is the value of `plugins.ppl.syntax.legacy.preferred`. |
 | `<field-list>` | Required | A comma-delimited list of field names. |
 | `<by-clause>` | Optional | One or more fields to group the results by. |
@@ -50,7 +50,7 @@ By default, the `top` command automatically includes a `count` column showing th
 | F | 1 |
 
 
-## Example 2: Find the most common values without count display
+## Example 2: Find the most common values without the count display
 
 The following query uses `showcount=false` to hide the `count` column in the results:
 
diff --git a/_sql-and-ppl/ppl/commands/trendline.md b/_sql-and-ppl/ppl/commands/trendline.md
index bcc832f0943..1dd94c7e3d0 100644
--- a/_sql-and-ppl/ppl/commands/trendline.md
+++ b/_sql-and-ppl/ppl/commands/trendline.md
@@ -26,8 +26,8 @@ The `trendline` command supports the following parameters.
 
 | --- | --- | --- |
 | `[+|-]` | Optional | The sort order for the data. `+` specifies ascending order with `NULL`/`MISSING` first, `-` specifies descending order with `NULL`/`MISSING` last. Default is `+`. |
 | `<sort-field>` | Required | The field used to sort the data. |
-| `(sma | wma)` | Required | The type of moving average to calculate. `sma` calculates simple moving average with equal weighting for all values, `wma` calculates weighted moving average with more weight to recent values. |
-| `number-of-datapoints` | Required | The number of datapoints used to calculate the moving average. Must be greater than zero. |
+| `(sma | wma)` | Required | The type of moving average to calculate. `sma` calculates the simple moving average with equal weighting for all values; `wma` calculates the weighted moving average with more weight given to recent values. |
+| `number-of-datapoints` | Required | The number of data points used to calculate the moving average. Must be greater than zero. |
 | `<field>` | Required | The field for which the moving average is calculated. |
 | `<alias>` | Optional | The name of the resulting column containing the moving average. Default is the `<field>` name with `_trendline` appended. |
@@ -75,7 +75,7 @@ The query returns the following results:
 
 ## Example 3: Calculate the simple moving average for one field without specifying an alias
 
-The following query calculates the simple moving average for one field:
+The following query calculates the simple moving average for one field without specifying an alias:
 
 ```sql
 source=accounts
diff --git a/_sql-and-ppl/ppl/commands/where.md b/_sql-and-ppl/ppl/commands/where.md
index 9bfb94f1d29..7f9b32d91db 100644
--- a/_sql-and-ppl/ppl/commands/where.md
+++ b/_sql-and-ppl/ppl/commands/where.md
@@ -24,7 +24,7 @@ The `where` command supports the following parameters.
 
 | Parameter | Required/Optional | Description |
 | --- | --- | --- |
-| `<condition>` | Required | The condition used to filter the results. Only rows where this condition evaluates to `true` are returned. |
+| `<condition>` | Required | The condition used to filter the results. Only rows in which this condition evaluates to `true` are returned. |
 
 ## Example 1: Filter by numeric values
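 
 For illustration, the following is a minimal sketch of the kind of numeric filter this example heading describes, not the example from the patched file itself. It assumes the `accounts` sample index and its `age` field, which appear elsewhere in these docs; `account_number` is an assumed field name:
 
 ```sql
 /* `accounts`, `age`, and `account_number` are assumed sample names */
 source=accounts
 | where age > 30
 | fields account_number, age
 ```
 {% include copy.html %}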