From 15cea157f11049f1db2965c67945ea9874181311 Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Thu, 23 Oct 2025 16:58:43 +0800 Subject: [PATCH 1/5] [docs] Support HDFS as Remote Storage and Lake Warehouse for Quickstart --- website/docs/quickstart/flink-iceberg.md | 140 ++++++++++++++++++----- 1 file changed, 110 insertions(+), 30 deletions(-) diff --git a/website/docs/quickstart/flink-iceberg.md b/website/docs/quickstart/flink-iceberg.md index 26d44ada42..3a1a373ce9 100644 --- a/website/docs/quickstart/flink-iceberg.md +++ b/website/docs/quickstart/flink-iceberg.md @@ -61,23 +61,55 @@ services: restart: always image: zookeeper:3.9.2 + namenode: + image: apache/hadoop:3.3.6 + hostname: namenode + user: root + command: [ "hdfs", "namenode" ] + ports: + - 9870:9870 + - 8020:8020 + environment: + ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name" + CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020 + CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp + HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020 + HDFS-SITE.XML_dfs.replication: 1 + HDFS-SITE.XML_dfs.permissions.enabled: false + HDFS-SITE.XML_dfs.datanode.address: datanode:9866 + + datanode: + image: apache/hadoop:3.3.6 + user: root + command: [ "hdfs", "datanode" ] + environment: + CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020 + CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp + HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020 + HDFS-SITE.XML_dfs.replication: 1 + HDFS-SITE.XML_dfs.permissions.enabled: false + HDFS-SITE.XML_dfs.datanode.address: datanode:9866 + depends_on: + - namenode + coordinator-server: image: fluss/fluss:$FLUSS_DOCKER_VERSION$ depends_on: - zookeeper + - namenode + - datanode environment: - | FLUSS_PROPERTIES= zookeeper.address: zookeeper:2181 bind.listeners: FLUSS://coordinator-server:9123 - remote.data.dir: /tmp/fluss/remote-data + remote.data.dir: hdfs://namenode:8020/fluss-data datalake.format: iceberg datalake.iceberg.type: hadoop - datalake.iceberg.warehouse: /tmp/iceberg + datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake volumes: - - shared-tmpfs:/tmp/iceberg - ./lib:/tmp/lib - entrypoint: ["sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer"] + entrypoint: [ "sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer" ] tablet-server: image: fluss/fluss:$FLUSS_DOCKER_VERSION$ @@ -90,13 +122,11 @@ services: zookeeper.address: zookeeper:2181 bind.listeners: FLUSS://tablet-server:9123 data.dir: /tmp/fluss/data - remote.data.dir: /tmp/fluss/remote-data - kv.snapshot.interval: 0s + remote.data.dir: hdfs://namenode:8020/fluss-data + kv.snapshot.interval: 10s datalake.format: iceberg datalake.iceberg.type: hadoop - datalake.iceberg.warehouse: /tmp/iceberg - volumes: - - shared-tmpfs:/tmp/iceberg + datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake jobmanager: image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$ @@ -107,8 +137,6 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - volumes: - - shared-tmpfs:/tmp/iceberg taskmanager: image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$ @@ -122,15 +150,6 @@ services: taskmanager.numberOfTaskSlots: 10 taskmanager.memory.process.size: 2048m taskmanager.memory.framework.off-heap.size: 256m - volumes: - - shared-tmpfs:/tmp/iceberg - -volumes: - shared-tmpfs: - driver: local - driver_opts: - type: "tmpfs" - device: "tmpfs" ``` The Docker Compose environment consists of the following containers: @@ -367,6 +386,60 @@ The following SQL 
query should return an empty result.
 SELECT * FROM fluss_customer WHERE `cust_key` = 1;
 ```
 
+### Storage
+
+You can use the following command to view the Fluss KV snapshots stored in remote storage:
+```shell
+docker compose exec namenode hdfs dfs -ls -R /fluss-data/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
+```
+
+**Sample Output:**
+```shell
+hdfs://namenode:8020/fluss-data
+└── kv
+    └── fluss
+        ├── enriched_orders-3
+        │   └── 0
+        │       ├── shared
+        │       │   ├── 0836f202-bdcd-498b-a94a-0520beb3d7ea
+        │       │   ├── afefc29f-d8d3-4cdb-a496-a6c271ddfac0
+        │       │   └── b67bd402-2ad4-4305-bd36-4fadf08a5200
+        │       └── snap-2
+        │           ├── _METADATA
+        │           ├── 02f02528-af03-4c88-980c-ec9f878d5476
+        │           ├── 7b21a889-ab06-4b74-98a5-36b542a67d0d
+        │           └── d7b699d9-6547-49fc-b579-de84cc37a167
+        ├── fluss_customer-1
+        │   └── 0
+        │       ├── shared
+        │       │   └── e537da08-ad8c-478d-9b80-505616e481b9
+        │       └── snap-0
+        │           ├── _METADATA
+        │           ├── 34a2dea2-8079-483f-b6d8-003a6e01bd3f
+        │           ├── b196e58d-7df2-4c95-b4fb-b1c3ebb4c622
+        │           └── f6ff8a01-d7dc-451d-a0e7-716392808405
+        ├── fluss_nation-2
+        │   └── 0
+        │       ├── shared
+        │       │   └── fd7c1369-fa5d-4396-9610-216f0affb213
+        │       └── snap-0
+        │           ├── _METADATA
+        │           ├── 0808acec-65d4-451c-b8fd-2225f045ad6d
+        │           ├── 181f67c8-0620-4e9a-9367-2a1c774b4abd
+        │           └── 5690e21f-3322-486c-93f5-5669d595cf34
+        └── fluss_order-0
+            └── 0
+                ├── shared
+                │   ├── 29affd1e-ca99-4cc5-a855-61d3c1b2c9a5
+                │   ├── 546a6113-0a63-4a17-964c-d8e37c32acc0
+                │   └── 6d9b6ddb-a2c1-4746-aedd-3217a9f51686
+                └── snap-2
+                    ├── _METADATA
+                    ├── 3fb7b4e9-9d66-44b6-b846-dba77514a1c7
+                    ├── 450997ac-8323-4708-a4ac-6bb2e71834a7
+                    └── f2b4b230-41e9-41dd-9227-337e324460a6
+```
+
 ## Integrate with Iceberg
 ### Start the Lakehouse Tiering Service
 To integrate with [Apache Iceberg](https://iceberg.apache.org/), you need to start the `Lakehouse Tiering Service`.
@@ -378,7 +451,7 @@ docker compose exec jobmanager \
     --fluss.bootstrap.servers coordinator-server:9123 \
     --datalake.format iceberg \
     --datalake.iceberg.type hadoop \
-    --datalake.iceberg.warehouse /tmp/iceberg
+    --datalake.iceberg.warehouse hdfs://namenode:8020/fluss-lake
 ```
 You should see a Flink Job to tier data from Fluss to Iceberg running in the [Flink Web UI](http://localhost:8083/).
@@ -501,20 +574,27 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders;
 
 You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
 
-Finally, you can use the following command to view the files stored in Iceberg:
+### Storage
+
+Finally, you can use the following command to view the files stored in the Iceberg Hadoop warehouse:
 ```shell
-docker compose exec taskmanager tree /tmp/iceberg/fluss
+docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
``` **Sample Output:** ```shell -/tmp/iceberg/fluss -└── datalake_enriched_orders - ├── data - │ └── 00000-0-abc123.parquet - └── metadata - ├── snap-1234567890123456789-1-abc123.avro - └── v1.metadata.json +hdfs://namenode:8020/fluss-lake +└── fluss + └── datalake_enriched_orders + ├── data + │ └── __bucket=0 + │ └── 00000-0-3ff95845-47af-456f-83e0-8411576cfffe-00001.parquet + └── metadata + ├── 528ae521-d683-4c5e-8dd7-779a83dd9c6f-m0.avro + ├── snap-3496049107217731071-1-528ae521-d683-4c5e-8dd7-779a83dd9c6f.avro + ├── v1.metadata.json + ├── v2.metadata.json + └── version-hint.text ``` The files adhere to Iceberg's standard format, enabling seamless querying with other engines such as [Spark](https://iceberg.apache.org/docs/latest/spark-queries/) and [Trino](https://trino.io/docs/current/connector/iceberg.html). From 68f9fdff958c1d55db30f8a89929fe0c55cef66b Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Thu, 23 Oct 2025 17:07:16 +0800 Subject: [PATCH 2/5] Remaining entries omitted for brevity --- website/docs/quickstart/flink-iceberg.md | 29 +----------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/website/docs/quickstart/flink-iceberg.md b/website/docs/quickstart/flink-iceberg.md index 3a1a373ce9..8610cee357 100644 --- a/website/docs/quickstart/flink-iceberg.md +++ b/website/docs/quickstart/flink-iceberg.md @@ -410,34 +410,7 @@ hdfs://namenode:8020/fluss-data │ ├── 7b21a889-ab06-4b74-98a5-36b542a67d0d │ └── d7b699d9-6547-49fc-b579-de84cc37a167 ├── fluss_customer-1 - │ └── 0 - │ ├── shared - │ │ └── e537da08-ad8c-478d-9b80-505616e481b9 - │ └── snap-0 - │ ├── _METADATA - │ ├── 34a2dea2-8079-483f-b6d8-003a6e01bd3f - │ ├── b196e58d-7df2-4c95-b4fb-b1c3ebb4c622 - │ └── f6ff8a01-d7dc-451d-a0e7-716392808405 - ├── fluss_nation-2 - │ └── 0 - │ ├── shared - │ │ └── fd7c1369-fa5d-4396-9610-216f0affb213 - │ └── snap-0 - │ ├── _METADATA - │ ├── 0808acec-65d4-451c-b8fd-2225f045ad6d - │ ├── 181f67c8-0620-4e9a-9367-2a1c774b4abd - │ └── 5690e21f-3322-486c-93f5-5669d595cf34 - └── fluss_order-0 - └── 0 - ├── shared - │ ├── 29affd1e-ca99-4cc5-a855-61d3c1b2c9a5 - │ ├── 546a6113-0a63-4a17-964c-d8e37c32acc0 - │ └── 6d9b6ddb-a2c1-4746-aedd-3217a9f51686 - └── snap-2 - ├── _METADATA - ├── 3fb7b4e9-9d66-44b6-b846-dba77514a1c7 - ├── 450997ac-8323-4708-a4ac-6bb2e71834a7 - └── f2b4b230-41e9-41dd-9227-337e324460a6 + ... # Remaining entries omitted for brevity ``` ## Integrate with Iceberg From e9da6d60f2bf0be82be2de8662f937c3d45d284f Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Thu, 23 Oct 2025 17:12:32 +0800 Subject: [PATCH 3/5] rename fluss remote storage --- website/docs/quickstart/flink-iceberg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/quickstart/flink-iceberg.md b/website/docs/quickstart/flink-iceberg.md index 8610cee357..fc13c76a7d 100644 --- a/website/docs/quickstart/flink-iceberg.md +++ b/website/docs/quickstart/flink-iceberg.md @@ -386,7 +386,7 @@ The following SQL query should return an empty result. 
 SELECT * FROM fluss_customer WHERE `cust_key` = 1;
 ```
 
-### Storage
+## Fluss Remote Storage
 
 You can use the following command to view the Fluss KV snapshots stored in remote storage:
 ```shell

From 5c6a0606e8b87cc2a2136d986a921a846b135849 Mon Sep 17 00:00:00 2001
From: Junbo Wang
Date: Thu, 23 Oct 2025 18:04:38 +0800
Subject: [PATCH 4/5] add paimon hadoop remote storage

---
 website/docs/quickstart/flink.md | 138 ++++++++++++++++++++++---------
 1 file changed, 97 insertions(+), 41 deletions(-)

diff --git a/website/docs/quickstart/flink.md b/website/docs/quickstart/flink.md
index ee8c47a500..16237de02a 100644
--- a/website/docs/quickstart/flink.md
+++ b/website/docs/quickstart/flink.md
@@ -37,12 +37,55 @@ cd fluss-quickstart-flink
 
 ```yaml
 services:
+  zookeeper:
+    restart: always
+    image: zookeeper:3.9.2
+
+  namenode:
+    image: apache/hadoop:3.3.6
+    hostname: namenode
+    user: root
+    command: [ "hdfs", "namenode" ]
+    ports:
+      - 9870:9870
+      - 8020:8020
+    environment:
+      ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name"
+      CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
+      CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
+      HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
+      HDFS-SITE.XML_dfs.replication: 1
+      HDFS-SITE.XML_dfs.permissions.enabled: false
+      HDFS-SITE.XML_dfs.datanode.address: datanode:9866
+    healthcheck:
+      test: ["CMD", "hdfs dfs -ls /"]
+      interval: 10s
+      timeout: 10s
+      retries: 20
+
+  datanode:
+    image: apache/hadoop:3.3.6
+    user: root
+    command: [ "hdfs", "datanode" ]
+    environment:
+      CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
+      CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
+      HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
+      HDFS-SITE.XML_dfs.replication: 1
+      HDFS-SITE.XML_dfs.permissions.enabled: false
+      HDFS-SITE.XML_dfs.datanode.address: datanode:9866
+    depends_on:
+      - namenode
+
   #begin Fluss cluster
   coordinator-server:
     image: fluss/fluss:$FLUSS_DOCKER_VERSION$
-    command: coordinatorServer
     depends_on:
-      - zookeeper
+      namenode:
+        condition: service_healthy
+      zookeeper:
+        condition: service_started
+      datanode:
+        condition: service_started
     environment:
       - |
         FLUSS_PROPERTIES=
@@ -51,9 +94,10 @@ services:
         remote.data.dir: /tmp/fluss/remote-data
         datalake.format: paimon
         datalake.paimon.metastore: filesystem
-        datalake.paimon.warehouse: /tmp/paimon
+        datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake
     volumes:
-      - shared-tmpfs:/tmp/paimon
+      - ./lib:/tmp/lib
+    entrypoint: [ "sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer" ]
   tablet-server:
     image: fluss/fluss:$FLUSS_DOCKER_VERSION$
     command: tabletServer
@@ -66,15 +110,11 @@ services:
         bind.listeners: FLUSS://tablet-server:9123
         data.dir: /tmp/fluss/data
         remote.data.dir: /tmp/fluss/remote-data
-        kv.snapshot.interval: 0s
+        kv.snapshot.interval: 30s
         datalake.format: paimon
         datalake.paimon.metastore: filesystem
-        datalake.paimon.warehouse: /tmp/paimon
-    volumes:
-      - shared-tmpfs:/tmp/paimon
-  zookeeper:
-    restart: always
-    image: zookeeper:3.9.2
+        datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake
   #end
 
   #begin Flink cluster
   jobmanager:
@@ -86,8 +126,6 @@ services:
       - |
         FLINK_PROPERTIES=
         jobmanager.rpc.address: jobmanager
-    volumes:
-      - shared-tmpfs:/tmp/paimon
   taskmanager:
     image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
     depends_on:
@@ -100,16 +138,7 @@ services:
         taskmanager.numberOfTaskSlots: 10
         taskmanager.memory.process.size: 2048m
         taskmanager.memory.framework.off-heap.size: 256m
-    volumes:
-      - shared-tmpfs:/tmp/paimon
   #end
-
-volumes:
-  shared-tmpfs:
-    driver: local
-    driver_opts:
-      type: "tmpfs"
-      device: "tmpfs"
 ```
 
 The Docker Compose environment consists of the following containers:
@@ -346,6 +375,33 @@ The following SQL query should return an empty result.
 SELECT * FROM fluss_customer WHERE `cust_key` = 1;
 ```
 
+## Fluss Remote Storage
+
+You can use the following command to view the Fluss KV snapshots stored in remote storage:
+```shell
+docker compose exec namenode hdfs dfs -ls -R /fluss-data/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
+```
+
+**Sample Output:**
+```shell
+hdfs://namenode:8020/fluss-data
+└── kv
+    └── fluss
+        ├── enriched_orders-3
+        │   └── 0
+        │       ├── shared
+        │       │   ├── 71fca534-ecca-489b-a19a-bd0538c9f9e9
+        │       │   ├── b06ef3a3-2873-470e-961f-da25582136a1
+        │       │   └── b93bad5c-00fb-4e62-8217-71b010621479
+        │       └── snap-2
+        │           ├── _METADATA
+        │           ├── 08d39726-f847-4401-8f31-4e905f2ba3f6
+        │           ├── b6a7bc2c-b5c3-4eeb-a523-b2b6fff159f3
+        │           └── e6278555-d71f-431f-954e-71bf066dd29f
+        ├── fluss_customer-1
+        ... # Remaining entries omitted for brevity
+```
+
 ## Integrate with Paimon
 ### Start the Lakehouse Tiering Service
 To integrate with [Apache Paimon](https://paimon.apache.org/), you need to start the `Lakehouse Tiering Service`.
@@ -473,30 +529,30 @@ The result looks like:
 ```
 You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
 
-Finally, you can use the following command to view the files stored in Paimon:
+### Storage
+
+Finally, you can use the following command to view the files stored in the Paimon Hadoop warehouse:
 ```shell
-docker compose exec taskmanager tree /tmp/paimon/fluss.db
+docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
 ```
 
 **Sample Output:**
 ```shell
-/tmp/paimon/fluss.db
-└── datalake_enriched_orders
-    ├── bucket-0
-    │   ├── changelog-aef1810f-85b2-4eba-8eb8-9b136dec5bdb-0.orc
-    │   └── data-aef1810f-85b2-4eba-8eb8-9b136dec5bdb-1.orc
-    ├── manifest
-    │   ├── manifest-aaa007e1-81a2-40b3-ba1f-9df4528bc402-0
-    │   ├── manifest-aaa007e1-81a2-40b3-ba1f-9df4528bc402-1
-    │   ├── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-0
-    │   ├── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-1
-    │   └── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-2
-    ├── schema
-    │   └── schema-0
-    └── snapshot
-        ├── EARLIEST
-        ├── LATEST
-        └── snapshot-1
+hdfs://namenode:8020/fluss-lake
+├── default.db
+└── fluss.db
+    └── datalake_enriched_orders
+        ├── bucket-0
+        │   └── data-02acf76d-c4cc-4bc1-9292-e64a77dfcc72-0.parquet
+        ├── manifest
+        │   ├── manifest-df5b6833-7e92-4ec9-a196-51d6fd60b1d1-0
+        │   ├── manifest-list-b683c5a2-4072-4c7a-8586-2c853de8d964-0
+        │   └── manifest-list-b683c5a2-4072-4c7a-8586-2c853de8d964-1
+        ├── schema
+        │   └── schema-0
+        └── snapshot
+            ├── LATEST
+            └── snapshot-1
 ```
 
 The files adhere to Paimon's standard format, enabling seamless querying with other engines such as [StarRocks](https://docs.starrocks.io/docs/data_source/catalog/paimon_catalog/).
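One detail worth exercising by hand before the follow-up fix below: Docker runs an exec-form (`["CMD", ...]`) healthcheck directly, without a shell, so a probe written as a single string using `&&`/`||` only works in `CMD-SHELL` form. A minimal sketch for testing the probe manually, assuming the `namenode` service from the compose file above is running:

```shell
# Run the probe the way CMD-SHELL would: one string, interpreted by a shell.
docker compose exec namenode sh -c 'hdfs dfs -test -d / && echo healthy || echo unhealthy'

# Inspect the health status and recent probe results Docker has recorded.
docker inspect --format '{{json .State.Health}}' "$(docker compose ps -q namenode)"
```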
From 333db0e2ea13e45bfd970c9f3731b0335afaf6b2 Mon Sep 17 00:00:00 2001
From: Junbo Wang
Date: Fri, 24 Oct 2025 10:30:13 +0800
Subject: [PATCH 5/5] fix

---
 website/docs/quickstart/flink-iceberg.md |  7 +++----
 website/docs/quickstart/flink.md         | 14 +++++---------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/website/docs/quickstart/flink-iceberg.md b/website/docs/quickstart/flink-iceberg.md
index fc13c76a7d..dabb0bdcf8 100644
--- a/website/docs/quickstart/flink-iceberg.md
+++ b/website/docs/quickstart/flink-iceberg.md
@@ -97,7 +97,6 @@ services:
     depends_on:
       - zookeeper
       - namenode
-      - datanode
     environment:
       - |
         FLUSS_PROPERTIES=
@@ -123,7 +122,7 @@ services:
         bind.listeners: FLUSS://tablet-server:9123
         data.dir: /tmp/fluss/data
         remote.data.dir: hdfs://namenode:8020/fluss-data
-        kv.snapshot.interval: 10s
+        kv.snapshot.interval: 30s
         datalake.format: iceberg
         datalake.iceberg.type: hadoop
         datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake
@@ -386,7 +385,7 @@ SELECT * FROM fluss_customer WHERE `cust_key` = 1;
 ```
 
-### Storage
+## Remote Storage
 
 You can use the following command to view the Fluss KV snapshots stored in remote storage:
 ```shell
@@ -547,7 +546,7 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders;
 
 You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
 
-### Storage
+### Lake Storage
 
 Finally, you can use the following command to view the files stored in the Iceberg Hadoop warehouse:
 ```shell
diff --git a/website/docs/quickstart/flink.md b/website/docs/quickstart/flink.md
index 16237de02a..18867804a9 100644
--- a/website/docs/quickstart/flink.md
+++ b/website/docs/quickstart/flink.md
@@ -57,8 +57,8 @@ services:
         HDFS-SITE.XML_dfs.permissions.enabled: false
         HDFS-SITE.XML_dfs.datanode.address: datanode:9866
     healthcheck:
-      test: ["CMD", "hdfs dfs -ls /"]
-      interval: 10s
+      test: ["CMD-SHELL", "hdfs dfs -test -d /"]
+      interval: 15s
       timeout: 10s
       retries: 20
@@ -84,8 +84,6 @@ services:
         condition: service_healthy
       zookeeper:
         condition: service_started
-      datanode:
-        condition: service_started
     environment:
       - |
         FLUSS_PROPERTIES=
@@ -95,9 +93,7 @@ services:
         datalake.format: paimon
         datalake.paimon.metastore: filesystem
         datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake
-    volumes:
-      - ./lib:/tmp/lib
-    entrypoint: [ "sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer" ]
+    command: coordinatorServer
   tablet-server:
     image: fluss/fluss:$FLUSS_DOCKER_VERSION$
     command: tabletServer
@@ -375,7 +371,7 @@ SELECT * FROM fluss_customer WHERE `cust_key` = 1;
 ```
 
-## Fluss Remote Storage
+## Remote Storage
 
 You can use the following command to view the Fluss KV snapshots stored in remote storage:
 ```shell
@@ -529,7 +525,7 @@ The result looks like:
 ```
 You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
 
-### Storage
+### Lake Storage
 
 Finally, you can use the following command to view the files stored in the Paimon Hadoop warehouse:
 ```shell