Commit 0c9391a: Update 01-infer-schema.md
1 parent ccb2355

docs/en/sql-reference/20-sql-functions/17-table-functions/01-infer-schema.md
1 file changed: +200 additions, -44 deletions
---
title: INFER_SCHEMA
---

Automatically detects the file metadata schema and retrieves the column definitions.

`infer_schema` currently supports the following file formats:

- **Parquet** - Native support for schema inference
- **CSV** - With customizable delimiters and header detection
- **NDJSON** - Newline-delimited JSON files

**Compression Support**: All formats also support compressed files with the extensions `.zip`, `.xz`, and `.zst`.

:::info File Size Limit
Each individual file has a maximum size limit of **100MB** for schema inference.
:::

:::info Schema Merging
When processing multiple files, `infer_schema` automatically merges their differing schemas:

- **Compatible types** are promoted (e.g., INT8 + INT16 → INT16)
- **Incompatible types** fall back to **VARCHAR** (e.g., INT + FLOAT → VARCHAR)
- **Missing columns** in some files are marked as **nullable**
- **New columns** from later files are added to the final schema

This ensures all files can be read using the unified schema.
:::
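The merging rules above can be modeled with a short sketch. The following Python is illustrative only: it is not Databend's implementation, and the type names, the promotion table, and the nullable handling are simplified assumptions that model just the rules listed (in practice, columns may also be nullable for other reasons).

```python
# Illustrative model of the schema-merging rules described above.
# Not Databend's implementation; types and promotion are simplified.

# Widening order for "compatible" integer types.
INT_WIDTH = {"INT8": 1, "INT16": 2, "INT32": 3, "BIGINT": 4}

def merge_types(a: str, b: str) -> str:
    if a == b:
        return a
    if a in INT_WIDTH and b in INT_WIDTH:
        # Compatible types are promoted, e.g. INT8 + INT16 -> INT16.
        return a if INT_WIDTH[a] >= INT_WIDTH[b] else b
    # Incompatible types fall back to VARCHAR, e.g. INT32 + FLOAT -> VARCHAR.
    return "VARCHAR"

def merge_schemas(schemas: list[dict]) -> dict:
    """Merge per-file schemas {column: type} into {column: (type, nullable)}."""
    merged: dict[str, str] = {}
    for schema in schemas:
        for col, typ in schema.items():
            # New columns from later files are added to the final schema.
            merged[col] = merge_types(merged[col], typ) if col in merged else typ
    # Columns missing from some files are marked nullable.
    return {col: (typ, any(col not in s for s in schemas))
            for col, typ in merged.items()}

files = [
    {"id": "INT32", "name": "VARCHAR"},                  # file1.csv
    {"id": "INT32", "name": "VARCHAR", "age": "INT32"},  # file2.csv
    {"id": "FLOAT", "name": "VARCHAR", "age": "INT32"},  # file3.csv
]
print(merge_schemas(files))
# {'id': ('VARCHAR', False), 'name': ('VARCHAR', False), 'age': ('INT32', True)}
```

The unified result is what every file is then read with, which is why incompatible columns degrade to VARCHAR rather than failing.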
## Syntax

```sql
INFER_SCHEMA(
  LOCATION => '{ internalStage | externalStage }'
  [ PATTERN => '<regex_pattern>' ]
  [ FILE_FORMAT => '<format_name>' ]
  [ MAX_RECORDS_PRE_FILE => <number> ]
  [ MAX_FILE_COUNT => <number> ]
)
```

## Parameters

| Parameter | Description | Default | Example |
|-----------|-------------|---------|---------|
| `LOCATION` | Stage location: `@<stage_name>[/<path>]` | Required | `'@my_stage/data/'` |
| `PATTERN` | File name pattern to match | All files | `'*.csv'`, `'*.parquet'` |
| `FILE_FORMAT` | File format name for parsing | Stage's format | `'csv_format'`, `'NDJSON'` |
| `MAX_RECORDS_PRE_FILE` | Maximum records to sample per file | All records | `100`, `1000` |
| `MAX_FILE_COUNT` | Maximum number of files to process | All files | `5`, `10` |
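How `PATTERN` and `MAX_FILE_COUNT` narrow the set of files considered can be sketched as follows. This is an illustrative model, assuming glob-style patterns as used in the examples; the staged file names are made up:

```python
from fnmatch import fnmatch

# Illustrative: PATTERN filters the staged file names, then
# MAX_FILE_COUNT caps how many matches are processed.
def select_files(names, pattern="*", max_file_count=None):
    matched = [n for n in names if fnmatch(n, pattern)]
    return matched if max_file_count is None else matched[:max_file_count]

staged = ["data_0.csv", "data_1.csv", "data_2.parquet", "notes.txt"]
print(select_files(staged, pattern="*.csv"))                    # ['data_0.csv', 'data_1.csv']
print(select_files(staged, pattern="*.csv", max_file_count=1))  # ['data_0.csv']
```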
## Examples

### Parquet Files

```sql
-- Create stage and export data
CREATE STAGE test_parquet;
COPY INTO @test_parquet FROM (SELECT number FROM numbers(10)) FILE_FORMAT = (TYPE = 'PARQUET');

-- Infer schema from parquet files using pattern
SELECT * FROM INFER_SCHEMA(
    location => '@test_parquet',
    pattern => '*.parquet'
);
```

Result:

```
+-------------+-----------------+----------+-----------+----------+
| column_name | type            | nullable | filenames | order_id |
+-------------+-----------------+----------+-----------+----------+
| number      | BIGINT UNSIGNED | false    | data_...  | 0        |
+-------------+-----------------+----------+-----------+----------+
```

### CSV Files

```sql
-- Create stage and export CSV data
CREATE STAGE test_csv;
COPY INTO @test_csv FROM (SELECT number FROM numbers(10)) FILE_FORMAT = (TYPE = 'CSV');

-- Create a CSV file format
CREATE FILE FORMAT csv_format TYPE = 'CSV';

-- Infer schema using pattern and file format
SELECT * FROM INFER_SCHEMA(
    location => '@test_csv',
    pattern => '*.csv',
    file_format => 'csv_format'
);
```

Result:

```
+-------------+--------+----------+-----------+----------+
| column_name | type   | nullable | filenames | order_id |
+-------------+--------+----------+-----------+----------+
| column_1    | BIGINT | true     | data_...  | 0        |
+-------------+--------+----------+-----------+----------+
```

For CSV files with headers:

```sql
-- Create a CSV file format with header support
CREATE FILE FORMAT csv_headers_format
    TYPE = 'CSV'
    field_delimiter = ','
    skip_header = 1;

-- Export data with headers
CREATE STAGE test_csv_headers;
COPY INTO @test_csv_headers FROM (
    SELECT number as user_id, 'user_' || number::string as user_name
    FROM numbers(5)
) FILE_FORMAT = (TYPE = 'CSV', output_header = true);

-- Infer schema with headers
SELECT * FROM INFER_SCHEMA(
    location => '@test_csv_headers',
    file_format => 'csv_headers_format'
);
```

Limit records for faster inference:

```sql
-- Sample only the first 5 records for schema inference
SELECT * FROM INFER_SCHEMA(
    location => '@test_csv',
    pattern => '*.csv',
    file_format => 'csv_format',
    max_records_pre_file => 5
);
```
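The trade-off behind limiting sampled records can be sketched in Python. This is an illustrative model, not Databend's inference code: inspecting only the first N values is faster, but a value that only appears later in the file can change the inferred type.

```python
# Illustrative: infer a column type from a sample of string values,
# widening BIGINT -> DOUBLE -> VARCHAR as needed.
def infer_cell(value: str) -> str:
    for typ, cast in (("BIGINT", int), ("DOUBLE", float)):
        try:
            cast(value)
            return typ
        except ValueError:
            pass
    return "VARCHAR"

def infer_column(rows, max_records=None):
    # Only the first `max_records` values are inspected, like
    # max_records_pre_file limits sampling per file.
    sample = rows if max_records is None else rows[:max_records]
    types = {infer_cell(v) for v in sample}
    if types <= {"BIGINT"}:
        return "BIGINT"
    if types <= {"BIGINT", "DOUBLE"}:
        return "DOUBLE"
    return "VARCHAR"

rows = ["1", "2", "3", "4.5", "oops"]
print(infer_column(rows, max_records=3))  # 'BIGINT'  (sample misses later values)
print(infer_column(rows))                 # 'VARCHAR' (full scan sees "oops")
```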

### NDJSON Files

```sql
-- Create stage and export NDJSON data
CREATE STAGE test_ndjson;
COPY INTO @test_ndjson FROM (SELECT number FROM numbers(10)) FILE_FORMAT = (TYPE = 'NDJSON');

-- Infer schema using pattern and NDJSON format
SELECT * FROM INFER_SCHEMA(
    location => '@test_ndjson',
    pattern => '*.ndjson',
    file_format => 'NDJSON'
);
```

Result:

```
+-------------+--------+----------+-----------+----------+
| column_name | type   | nullable | filenames | order_id |
+-------------+--------+----------+-----------+----------+
| number      | BIGINT | true     | data_...  | 0        |
+-------------+--------+----------+-----------+----------+
```

Limit records for faster inference:

```sql
-- Sample only the first 5 records for schema inference
SELECT * FROM INFER_SCHEMA(
    location => '@test_ndjson',
    pattern => '*.ndjson',
    file_format => 'NDJSON',
    max_records_pre_file => 5
);
```

### Schema Merging with Multiple Files

When files have different schemas, `infer_schema` merges them intelligently:

```sql
-- Suppose you have multiple CSV files with different schemas:
-- file1.csv: id(INT), name(VARCHAR)
-- file2.csv: id(INT), name(VARCHAR), age(INT)
-- file3.csv: id(FLOAT), name(VARCHAR), age(INT)

SELECT * FROM INFER_SCHEMA(
    location => '@my_stage/',
    pattern => '*.csv',
    file_format => 'csv_format'
);
```

The result shows the merged schema:

```
+-------------+---------+----------+-----------+----------+
| column_name | type    | nullable | filenames | order_id |
+-------------+---------+----------+-----------+----------+
| id          | VARCHAR | true     | file1,... | 0        |  -- INT + FLOAT → VARCHAR
| name        | VARCHAR | true     | file1,... | 1        |
| age         | BIGINT  | true     | file1,... | 2        |  -- missing in file1 → nullable
+-------------+---------+----------+-----------+----------+
```

### Pattern Matching and File Limits

Use pattern matching to infer the schema from multiple files:

```sql
-- Infer schema from all CSV files in the directory
SELECT * FROM INFER_SCHEMA(
    location => '@my_stage/',
    pattern => '*.csv'
);
```

Limit the number of files processed to improve performance:

```sql
-- Process only the first 5 matching files
SELECT * FROM INFER_SCHEMA(
    location => '@my_stage/',
    pattern => '*.csv',
    max_file_count => 5
);
```

### Compressed Files

`infer_schema` automatically handles compressed files:

```sql
-- Works with compressed CSV files
SELECT * FROM INFER_SCHEMA(location => '@my_stage/data.csv.zip');

-- Works with compressed NDJSON files
SELECT * FROM INFER_SCHEMA(
    location => '@my_stage/data.ndjson.xz',
    file_format => 'NDJSON',
    max_records_pre_file => 50
);
```

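Conceptually, the compression layer is peeled off the file name to reveal the inner data format. A minimal sketch of that idea, using only the compression extensions listed in the note at the top of this page (`detect_format` and the extension tables are hypothetical helpers, not Databend APIs):

```python
# Illustrative: strip a known compression suffix to find the inner format.
COMPRESSION_EXTS = (".zip", ".xz", ".zst")  # per the compression note above
FORMAT_EXTS = {".parquet": "PARQUET", ".csv": "CSV", ".ndjson": "NDJSON"}

def detect_format(filename: str) -> tuple[str, bool]:
    """Return (format, is_compressed) for a staged file name."""
    compressed = False
    for ext in COMPRESSION_EXTS:
        if filename.endswith(ext):
            filename = filename[: -len(ext)]
            compressed = True
            break
    for ext, fmt in FORMAT_EXTS.items():
        if filename.endswith(ext):
            return fmt, compressed
    return "UNKNOWN", compressed

print(detect_format("data.csv.zip"))    # ('CSV', True)
print(detect_format("data.ndjson.xz"))  # ('NDJSON', True)
```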
### Create Table from Inferred Schema

The `infer_schema` function displays the schema but does not create tables. To create a table from the inferred schema:

```sql
-- Create the table structure from the file schema
CREATE TABLE my_table AS
SELECT * FROM @my_stage/ (pattern => '*.parquet')
LIMIT 0;

-- Verify the table structure
DESC my_table;
```
