diff --git a/website/docs/quickstart/flink-iceberg.md b/website/docs/quickstart/flink-iceberg.md index 26d44ada42..dabb0bdcf8 100644 --- a/website/docs/quickstart/flink-iceberg.md +++ b/website/docs/quickstart/flink-iceberg.md @@ -61,23 +61,54 @@ services: restart: always image: zookeeper:3.9.2 + namenode: + image: apache/hadoop:3.3.6 + hostname: namenode + user: root + command: [ "hdfs", "namenode" ] + ports: + - 9870:9870 + - 8020:8020 + environment: + ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name" + CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020 + CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp + HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020 + HDFS-SITE.XML_dfs.replication: 1 + HDFS-SITE.XML_dfs.permissions.enabled: false + HDFS-SITE.XML_dfs.datanode.address: datanode:9866 + + datanode: + image: apache/hadoop:3.3.6 + user: root + command: [ "hdfs", "datanode" ] + environment: + CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020 + CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp + HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020 + HDFS-SITE.XML_dfs.replication: 1 + HDFS-SITE.XML_dfs.permissions.enabled: false + HDFS-SITE.XML_dfs.datanode.address: datanode:9866 + depends_on: + - namenode + coordinator-server: image: fluss/fluss:$FLUSS_DOCKER_VERSION$ depends_on: - zookeeper + - namenode environment: - | FLUSS_PROPERTIES= zookeeper.address: zookeeper:2181 bind.listeners: FLUSS://coordinator-server:9123 - remote.data.dir: /tmp/fluss/remote-data + remote.data.dir: hdfs://namenode:8020/fluss-data datalake.format: iceberg datalake.iceberg.type: hadoop - datalake.iceberg.warehouse: /tmp/iceberg + datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake volumes: - - shared-tmpfs:/tmp/iceberg - ./lib:/tmp/lib - entrypoint: ["sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer"] + entrypoint: [ "sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer" ] 
tablet-server: image: fluss/fluss:$FLUSS_DOCKER_VERSION$ @@ -90,13 +121,11 @@ services: zookeeper.address: zookeeper:2181 bind.listeners: FLUSS://tablet-server:9123 data.dir: /tmp/fluss/data - remote.data.dir: /tmp/fluss/remote-data - kv.snapshot.interval: 0s + remote.data.dir: hdfs://namenode:8020/fluss-data + kv.snapshot.interval: 30s datalake.format: iceberg datalake.iceberg.type: hadoop - datalake.iceberg.warehouse: /tmp/iceberg - volumes: - - shared-tmpfs:/tmp/iceberg + datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake jobmanager: image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$ @@ -107,8 +136,6 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - volumes: - - shared-tmpfs:/tmp/iceberg taskmanager: image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$ @@ -122,15 +149,6 @@ services: taskmanager.numberOfTaskSlots: 10 taskmanager.memory.process.size: 2048m taskmanager.memory.framework.off-heap.size: 256m - volumes: - - shared-tmpfs:/tmp/iceberg - -volumes: - shared-tmpfs: - driver: local - driver_opts: - type: "tmpfs" - device: "tmpfs" ``` The Docker Compose environment consists of the following containers: @@ -367,6 +385,33 @@ The following SQL query should return an empty result. SELECT * FROM fluss_customer WHERE `cust_key` = 1; ``` +## Remote Storage + +Now you can use the following command to view the Fluss KV snapshots stored in Fluss remote storage: +```shell +docker compose exec namenode hdfs dfs -ls -R /fluss-data/ | awk '{print $8}' | grep -v '^$' | tree --fromfile . 
+``` + +**Sample Output:** +```shell +hdfs://namenode:8020/fluss-data +└── kv + └── fluss + ├── enriched_orders-3 + │ └── 0 + │ ├── shared + │ │ ├── 0836f202-bdcd-498b-a94a-0520beb3d7ea + │ │ ├── afefc29f-d8d3-4cdb-a496-a6c271ddfac0 + │ │ └── b67bd402-2ad4-4305-bd36-4fadf08a5200 + │ └── snap-2 + │ ├── _METADATA + │ ├── 02f02528-af03-4c88-980c-ec9f878d5476 + │ ├── 7b21a889-ab06-4b74-98a5-36b542a67d0d + │ └── d7b699d9-6547-49fc-b579-de84cc37a167 + ├── fluss_customer-1 + ... # Remaining entries omitted for brevity +``` + ## Integrate with Iceberg ### Start the Lakehouse Tiering Service To integrate with [Apache Iceberg](https://iceberg.apache.org/), you need to start the `Lakehouse Tiering Service`. @@ -378,7 +423,7 @@ docker compose exec jobmanager \ --fluss.bootstrap.servers coordinator-server:9123 \ --datalake.format iceberg \ --datalake.iceberg.type hadoop \ - --datalake.iceberg.warehouse /tmp/iceberg + --datalake.iceberg.warehouse hdfs://namenode:8020/fluss-lake ``` You should see a Flink Job to tier data from Fluss to Iceberg running in the [Flink Web UI](http://localhost:8083/). @@ -501,20 +546,27 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders; You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time. -Finally, you can use the following command to view the files stored in Iceberg: +### Lake Storage + +Finally, you can use the following command to view the files stored in Iceberg Hadoop warehouse: ```shell -docker compose exec taskmanager tree /tmp/iceberg/fluss +docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile . 
``` **Sample Output:** ```shell -/tmp/iceberg/fluss -└── datalake_enriched_orders - ├── data - │ └── 00000-0-abc123.parquet - └── metadata - ├── snap-1234567890123456789-1-abc123.avro - └── v1.metadata.json +hdfs://namenode:8020/fluss-lake +└── fluss + └── datalake_enriched_orders + ├── data + │ └── __bucket=0 + │ └── 00000-0-3ff95845-47af-456f-83e0-8411576cfffe-00001.parquet + └── metadata + ├── 528ae521-d683-4c5e-8dd7-779a83dd9c6f-m0.avro + ├── snap-3496049107217731071-1-528ae521-d683-4c5e-8dd7-779a83dd9c6f.avro + ├── v1.metadata.json + ├── v2.metadata.json + └── version-hint.text ``` The files adhere to Iceberg's standard format, enabling seamless querying with other engines such as [Spark](https://iceberg.apache.org/docs/latest/spark-queries/) and [Trino](https://trino.io/docs/current/connector/iceberg.html). diff --git a/website/docs/quickstart/flink.md b/website/docs/quickstart/flink.md index ee8c47a500..18867804a9 100644 --- a/website/docs/quickstart/flink.md +++ b/website/docs/quickstart/flink.md @@ -37,12 +37,53 @@ cd fluss-quickstart-flink ```yaml services: + zookeeper: + restart: always + image: zookeeper:3.9.2 + namenode: + image: apache/hadoop:3.3.6 + hostname: namenode + user: root + command: [ "hdfs", "namenode" ] + ports: + - 9870:9870 + - 8020:8020 + environment: + ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name" + CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020 + CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp + HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020 + HDFS-SITE.XML_dfs.replication: 1 + HDFS-SITE.XML_dfs.permissions.enabled: false + HDFS-SITE.XML_dfs.datanode.address: datanode:9866 + healthcheck: + test: ["CMD-SHELL", "hdfs dfs -test -d / && exit 0 || exit 1"] + interval: 15s + timeout: 10s + retries: 20 + + datanode: + image: apache/hadoop:3.3.6 + user: root + command: [ "hdfs", "datanode" ] + environment: + CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020 + CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp + HDFS-SITE.XML_dfs.namenode.rpc-address: 
namenode:8020 + HDFS-SITE.XML_dfs.replication: 1 + HDFS-SITE.XML_dfs.permissions.enabled: false + HDFS-SITE.XML_dfs.datanode.address: datanode:9866 + depends_on: + - namenode + #begin Fluss cluster coordinator-server: image: fluss/fluss:$FLUSS_DOCKER_VERSION$ - command: coordinatorServer depends_on: - - zookeeper + namenode: + condition: service_healthy + zookeeper: + condition: service_started environment: - | FLUSS_PROPERTIES= @@ -51,9 +92,8 @@ services: remote.data.dir: /tmp/fluss/remote-data datalake.format: paimon datalake.paimon.metastore: filesystem - datalake.paimon.warehouse: /tmp/paimon - volumes: - - shared-tmpfs:/tmp/paimon + datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake + command: coordinatorServer tablet-server: image: fluss/fluss:$FLUSS_DOCKER_VERSION$ command: tabletServer @@ -66,15 +106,11 @@ services: bind.listeners: FLUSS://tablet-server:9123 data.dir: /tmp/fluss/data remote.data.dir: /tmp/fluss/remote-data - kv.snapshot.interval: 0s + kv.snapshot.interval: 30s datalake.format: paimon datalake.paimon.metastore: filesystem - datalake.paimon.warehouse: /tmp/paimon - volumes: - - shared-tmpfs:/tmp/paimon - zookeeper: - restart: always - image: zookeeper:3.9.2 + datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake + #end #begin Flink cluster jobmanager: @@ -86,8 +122,6 @@ services: - | FLINK_PROPERTIES= jobmanager.rpc.address: jobmanager - volumes: - - shared-tmpfs:/tmp/paimon taskmanager: image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$ depends_on: @@ -100,16 +134,7 @@ services: taskmanager.numberOfTaskSlots: 10 taskmanager.memory.process.size: 2048m taskmanager.memory.framework.off-heap.size: 256m - volumes: - - shared-tmpfs:/tmp/paimon #end - -volumes: - shared-tmpfs: - driver: local - driver_opts: - type: "tmpfs" - device: "tmpfs" ``` The Docker Compose environment consists of the following containers: @@ -346,6 +371,33 @@ The following SQL query should return an empty result. 
SELECT * FROM fluss_customer WHERE `cust_key` = 1; ``` +## Remote Storage + +Now you can use the following command to view the Fluss KV snapshots stored in Fluss remote storage: +```shell +docker compose exec namenode hdfs dfs -ls -R /fluss-data/ | awk '{print $8}' | grep -v '^$' | tree --fromfile . +``` + +**Sample Output:** +```shell +hdfs://namenode:8020/fluss-data +└── kv + └── fluss + ├── enriched_orders-3 + │ └── 0 + │ ├── shared + │ │ ├── 71fca534-ecca-489b-a19a-bd0538c9f9e9 + │ │ ├── b06ef3a3-2873-470e-961f-da25582136a1 + │ │ └── b93bad5c-00fb-4e62-8217-71b010621479 + │ └── snap-2 + │ ├── _METADATA + │ ├── 08d39726-f847-4401-8f31-4e905f2ba3f6 + │ ├── b6a7bc2c-b5c3-4eeb-a523-b2b6fff159f3 + │ └── e6278555-d71f-431f-954e-71bf066dd29f + ├── fluss_customer-1 + ... # Remaining entries omitted for brevity +``` + ## Integrate with Paimon ### Start the Lakehouse Tiering Service To integrate with [Apache Paimon](https://paimon.apache.org/), you need to start the `Lakehouse Tiering Service`. @@ -473,30 +525,30 @@ The result looks like: ``` You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time. -Finally, you can use the following command to view the files stored in Paimon: +### Lake Storage + +Finally, you can use the following command to view the files stored in the Paimon Hadoop warehouse: ```shell -docker compose exec taskmanager tree /tmp/paimon/fluss.db +docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile . 
``` **Sample Output:** ```shell -/tmp/paimon/fluss.db -└── datalake_enriched_orders - ├── bucket-0 - │ ├── changelog-aef1810f-85b2-4eba-8eb8-9b136dec5bdb-0.orc - │ └── data-aef1810f-85b2-4eba-8eb8-9b136dec5bdb-1.orc - ├── manifest - │ ├── manifest-aaa007e1-81a2-40b3-ba1f-9df4528bc402-0 - │ ├── manifest-aaa007e1-81a2-40b3-ba1f-9df4528bc402-1 - │ ├── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-0 - │ ├── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-1 - │ └── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-2 - ├── schema - │ └── schema-0 - └── snapshot - ├── EARLIEST - ├── LATEST - └── snapshot-1 +hdfs://namenode:8020/fluss-lake +├── default.db +└── fluss.db + └── datalake_enriched_orders + ├── bucket-0 + │ └── data-02acf76d-c4cc-4bc1-9292-e64a77dfcc72-0.parquet + ├── manifest + │ ├── manifest-df5b6833-7e92-4ec9-a196-51d6fd60b1d1-0 + │ ├── manifest-list-b683c5a2-4072-4c7a-8586-2c853de8d964-0 + │ └── manifest-list-b683c5a2-4072-4c7a-8586-2c853de8d964-1 + ├── schema + │ └── schema-0 + └── snapshot + ├── LATEST + └── snapshot-1 ``` The files adhere to Paimon's standard format, enabling seamless querying with other engines such as [StarRocks](https://docs.starrocks.io/docs/data_source/catalog/paimon_catalog/).