From cde36287406bb9f72e0d64ff5689a9b314e431f7 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Wed, 24 Jan 2024 16:01:53 +0545 Subject: [PATCH 01/33] Integrate Apache Iceberg jars with Rock Image --- rockcraft.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rockcraft.yaml b/rockcraft.yaml index 579f70d4..5c07a890 100644 --- a/rockcraft.yaml +++ b/rockcraft.yaml @@ -93,6 +93,8 @@ parts: overlay-script: | AWS_JAVA_SDK_BUNDLE_VERSION='1.12.540' HADOOP_AWS_VERSION='3.3.6' + ICEBERG_SPARK_RUNTIME_VERSION='3.4_2.12' + ICEBERG_VERSION='1.4.3' mkdir -p $CRAFT_PART_INSTALL/opt/spark/jars cd $CRAFT_PART_INSTALL/opt/spark/jars wget -q "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_BUNDLE_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar" @@ -119,6 +121,14 @@ parts: echo "DOWNLOAD ERROR: spark-metrics-assembly-3.4-1.0.0.jar could not be downloaded properly! Exiting...." >&2 exit 1 fi + wget -q "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" + wget -q "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar.sha1" + echo "`cat iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar.sha1` iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" | sha1sum --check + if [[ $? -ne 0 ]] + then + echo "DOWNLOAD ERROR: iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar could not be downloaded properly! Exiting...." >&2 + exit 1 + fi stage: - opt/spark/jars From 3136d86789067905d79b63b5051819d56a9c8db4 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Fri, 26 Jan 2024 14:26:43 +0545 Subject: [PATCH 02/33] Hardcode SHA sums for the downloaded jars. --- rockcraft.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/rockcraft.yaml b/rockcraft.yaml index 5c07a890..6e39ea47 100644 --- a/rockcraft.yaml +++ b/rockcraft.yaml @@ -95,19 +95,21 @@ parts: HADOOP_AWS_VERSION='3.3.6' ICEBERG_SPARK_RUNTIME_VERSION='3.4_2.12' ICEBERG_VERSION='1.4.3' + SHA1SUM_AWS_JAVA_SDK_BUNDLE_JAR='a351ebc4f81d20e0349b3c0f85f34f443a37ce9d' + SHA1SUM_HADOOP_AWS_JAR='d5e162564701848b0921b80aedef9e64435333cc' + SHA512SUM_SPARK_METRICS_ASSEMBLY_JAR='fc52ba79af46e008b1463da4c0852564f2bfce21668468b683550df1f1ff3e4f149641bbce1cffb24510569c1a441bb47f73dd2b0cff87073631c391dc248211' + SHA1SUM_ICEBERG_JAR='48d553e4e5496f731b9e0e6adb5bc0fd040cb0df' mkdir -p $CRAFT_PART_INSTALL/opt/spark/jars cd $CRAFT_PART_INSTALL/opt/spark/jars wget -q "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_BUNDLE_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar" - wget -q "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_BUNDLE_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar.sha1" - echo "`cat aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar.sha1` aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar" | sha1sum --check + echo "${SHA1SUM_AWS_JAVA_SDK_BUNDLE_JAR} aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar" | sha1sum --check if [[ $? -ne 0 ]] then echo "DOWNLOAD ERROR: aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE_VERSION}.jar could not be downloaded properly! Exiting...." 
>&2 exit 1 fi wget -q "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar" - wget -q "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar.sha1" - echo "`cat hadoop-aws-${HADOOP_AWS_VERSION}.jar.sha1` hadoop-aws-${HADOOP_AWS_VERSION}.jar" | sha1sum --check + echo "${SHA1SUM_HADOOP_AWS_JAR} hadoop-aws-${HADOOP_AWS_VERSION}.jar" | sha1sum --check if [[ $? -ne 0 ]] then echo "DOWNLOAD ERROR: hadoop-aws-${HADOOP_AWS_VERSION}.jar could not be downloaded properly! Exiting...." >&2 @@ -115,15 +117,14 @@ parts: fi wget -q "https://github.com/canonical/central-uploader/releases/download/spark-metrics-assembly-3.4-1.0.0/spark-metrics-assembly-3.4-1.0.0.jar" wget -q "https://github.com/canonical/central-uploader/releases/download/spark-metrics-assembly-3.4-1.0.0/spark-metrics-assembly-3.4-1.0.0.jar.sha512" - echo "`cat spark-metrics-assembly-3.4-1.0.0.jar.sha512`" | sha512sum --check + echo "${SHA512SUM_SPARK_METRICS_ASSEMBLY_JAR} spark-metrics-assembly-3.4-1.0.0.jar" | sha512sum --check if [[ $? -ne 0 ]] then echo "DOWNLOAD ERROR: spark-metrics-assembly-3.4-1.0.0.jar could not be downloaded properly! Exiting...." >&2 exit 1 fi wget -q "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" - wget -q "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar.sha1" - echo "`cat iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar.sha1` iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" | sha1sum --check + echo "${SHA1SUM_ICEBERG_JAR} iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" | sha1sum --check if [[ $? -ne 0 ]] then echo "DOWNLOAD ERROR: iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar could not be downloaded properly! Exiting...." 
>&2 From cc7e95fb51b91fad1ce84088f72c893c2f826886 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 10:53:33 +0545 Subject: [PATCH 03/33] Ignore build artifacts from VCS --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 723ef36f..0b26130a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ -.idea \ No newline at end of file +.idea +*.rock +*.tar +.make_cache \ No newline at end of file From cc552b8f6a1caa6afb5c16e3650ef1fbbd21c041 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 10:55:11 +0545 Subject: [PATCH 04/33] Add integration tests for Apache Iceberg integration --- CONTRIBUTING.md | 1 + Makefile | 12 ++- tests/integration/config-microk8s.sh | 1 + tests/integration/integration-tests.sh | 85 ++++++++++++++++++++- tests/integration/resources/test-iceberg.py | 39 ++++++++++ tests/integration/setup-aws-cli.sh | 13 ++++ 6 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 tests/integration/resources/test-iceberg.py create mode 100644 tests/integration/setup-aws-cli.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eba0a602..c27e3b68 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,6 +13,7 @@ cd charmed-spark-rock sudo snap install rockcraft --edge sudo snap install docker sudo snap install lxd +sudo snap install yq sudo snap install skopeo --edge --devmode ``` diff --git a/Makefile b/Makefile index 14a7fa9c..042a215c 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ _MAKE_DIR := .make_cache $(shell mkdir -p $(_MAKE_DIR)) K8S_TAG := $(_MAKE_DIR)/.k8s_tag +AWS_TAG := $(_MAKE_DIR)/.aws_tag IMAGE_NAME := $(shell yq .name rockcraft.yaml) VERSION := $(shell yq .version rockcraft.yaml) @@ -70,7 +71,7 @@ $(_TMP_OCI_TAG): $(_ROCK_OCI) touch $(_TMP_OCI_TAG) $(CHARMED_OCI_TAG): $(_TMP_OCI_TAG) - docker build - -t "$(CHARMED_OCI_FULL_NAME):$(TAG)" --build-arg BASE_IMAGE="$(_TMP_OCI_NAME):$(TAG)" < Dockerfile + docker build -t "$(CHARMED_OCI_FULL_NAME):$(TAG)" --build-arg BASE_IMAGE="$(_TMP_OCI_NAME):$(TAG)" -f Dockerfile . if [ ! 
-d "$(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)" ]; then mkdir -p "$(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)"; fi touch $(CHARMED_OCI_TAG) @@ -80,10 +81,15 @@ $(K8S_TAG): sg microk8s ./tests/integration/config-microk8s.sh @touch $(K8S_TAG) +$(AWS_TAG): $(K8S_TAG) + @echo "=== Setting up and configure AWS CLI ===" + /bin/bash ./tests/integration/setup-aws-cli.sh + @touch $(AWS_TAG) + microk8s: $(K8S_TAG) $(_MAKE_DIR)/%/$(TAG).tar: $(_MAKE_DIR)/%/$(TAG).tag - docker save $*:$(TAG) > $(_MAKE_DIR)/$*/$(TAG).tar + docker save $*:$(TAG) -o $(_MAKE_DIR)/$*/$(TAG).tar $(BASE_NAME): $(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)/$(TAG).tar @echo "=== Creating $(BASE_NAME) OCI archive ===" @@ -106,7 +112,7 @@ import: $(K8S_TAG) build microk8s ctr images import --base-name $(CHARMED_OCI_FULL_NAME):$(TAG) $(BASE_NAME) endif -tests: +tests: $(K8S_TAG) $(AWS_TAG) @echo "=== Running Integration Tests ===" /bin/bash ./tests/integration/integration-tests.sh diff --git a/tests/integration/config-microk8s.sh b/tests/integration/config-microk8s.sh index 82c6ebf7..9461fb82 100755 --- a/tests/integration/config-microk8s.sh +++ b/tests/integration/config-microk8s.sh @@ -2,3 +2,4 @@ microk8s status --wait-ready microk8s config | tee ~/.kube/config microk8s.enable dns microk8s.enable rbac +microk8s.enable minio \ No newline at end of file diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index a9267613..f2b8d9e4 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -135,7 +135,7 @@ teardown_test_pod() { run_example_job_in_pod() { SPARK_EXAMPLES_JAR_NAME="spark-examples_2.12-$(get_spark_version).jar" - PREVIOUS_JOB=$(kubectl get pods | grep driver | tail -n 1 | cut -d' ' -f1) + PREVIOUS_JOB=$(kubectl get pods -n ${NAMESPACE}| grep driver | tail -n 1 | cut -d' ' -f1) NAMESPACE=$1 USERNAME=$2 @@ -166,6 +166,82 @@ run_example_job_in_pod() { validate_pi_value $pi } +get_s3_access_key(){ + kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d +} + +get_s3_secret_key(){ + kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d +} + +get_s3_endpoint(){ + kubectl get service minio -n minio-operator -o jsonpath='{.spec.clusterIP}' +} + +create_s3_bucket(){ + S3_ENDPOINT=$(get_s3_endpoint) + BUCKET_NAME=$1 + aws --endpoint-url "http://$S3_ENDPOINT" s3api create-bucket --bucket "$BUCKET_NAME" + echo "Created S3 bucket ${BUCKET_NAME}" +} + +copy_file_to_s3_bucket(){ + BUCKET_NAME=$1 + FILE_PATH=$2 + BASE_NAME=$(basename "$FILE_PATH") + aws s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" + echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}" +} + +run_iceberg_example_in_pod(){ + create_s3_bucket spark + copy_file_to_s3_bucket spark ./tests/integration/resources/test-iceberg.py + + NAMESPACE="tests" + USERNAME="spark" + NUM_ROWS_TO_INSERT="4" + PREVIOUS_DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) + + kubectl exec testpod -- env UU="$USERNAME" NN="$NAMESPACE" IM="$(spark_image)" \ + /bin/bash -c 'spark-client.spark-submit' \ + --username $UU --namespace $NN \ + --conf spark.kubernetes.driver.request.cores=100m \ + --conf spark.kubernetes.executor.request.cores=100m \ + --conf spark.kubernetes.container.image=$IM \ + --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \ + --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \ + --conf 
spark.hadoop.fs.s3a.path.style.access=true \ + --conf spark.hadoop.fs.s3a.access.key=$(get_s3_access_key) \ + --conf spark.hadoop.fs.s3a.secret.key=$(get_s3_secret_key) \ + --conf spark.jars.ivy=/tmp \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=s3a://spark/warehouse \ + --conf spark.sql.defaultCatalog=local + s3a://spark/test-iceberg.py -n ${NUM_ROWS_TO_INSERT} + + DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) + + if [[ "${PREVIOUS_DRIVER_PODS_COUNT}" == "${DRIVER_PODS_COUNT}" ]] + then + echo "ERROR: Sample job has not run!" + exit 1 + fi + + DRIVER_POD_ID=$(kubectl get pods -n ${NAMESPACE} | grep test-iceberg-.*-driver | tail -n 1 | cut -d' ' -f1) + OUTPUT_LOG_LINE=$(kubectl logs ${DRIVER_POD_ID} -n ${NAMESPACE} | grep 'Number of rows inserted:' ) + NUM_ROWS_INSERTED=$(echo $OUTPUT_LOG_LINE | rev | cut -d' ' -f1 | rev) + + if [ "${NUM_ROWS_INSERTED}" != "${NUM_ROWS_TO_INSERT}" ]; then + echo "ERROR: ${NUM_ROWS_TO_INSERT} were supposed to be inserted. Found ${NUM_ROWS_INSERTED} rows. Aborting with exit code 1." + exit 1 + fi + +} + run_example_job_in_pod_with_pod_templates() { SPARK_EXAMPLES_JAR_NAME="spark-examples_2.12-$(get_spark_version).jar" @@ -425,6 +501,13 @@ echo -e "RUN EXAMPLE JOB WITH ERRORS" echo -e "########################################" (setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod + +echo -e "##################################" +echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES" +echo -e "##################################" + +(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod + echo -e "##################################" echo -e "TEARDOWN TEST POD" echo -e "##################################" diff --git a/tests/integration/resources/test-iceberg.py b/tests/integration/resources/test-iceberg.py new file mode 100644 index 00000000..d8b34aca --- /dev/null +++ b/tests/integration/resources/test-iceberg.py @@ -0,0 +1,39 @@ +import argparse +import random + +from pyspark.sql import SparkSession +from pyspark.sql.types import LongType, StructType, StructField + +parser = argparse.ArgumentParser("TestIceberg") +parser.add_argument("--num_rows", "-n", type=int) +args = parser.parse_args() +num_rows = args.num_rows + +spark = SparkSession\ + .builder\ + .appName("IcebergExample")\ + .getOrCreate() + + +schema = StructType([ + StructField("row_id", LongType(), True), + StructField("row_val", LongType(), True) +]) + +# df = spark.createDataFrame([], schema) +# df.writeTo("demo.foo.bar").create() + +# schema = spark.table("demo.nyc.taxis").schema +data = [] +for idx in range(num_rows): + row = (idx + 1, random.randint(1, 100)) + data.append(row) + +df = spark.createDataFrame(data, schema) +# df.writeTo("demo.nyc.taxis").append() +df.writeTo("demo.foo.bar").create() + + +df = spark.table("demo.foo.bar") +count = df.count() +print(f"Number of rows inserted: {count}") \ No newline at end of file diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh new file mode 100644 index 00000000..c56cbda4 --- /dev/null +++ 
b/tests/integration/setup-aws-cli.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Install AWS CLI +sudo snap install aws-cli --classic + +# Get Access key and secret key from MinIO +ACCESS_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d) +SECRET_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d) +S3_BUCKET="spark" + +# Configure AWS CLI credentials +aws configure set aws_access_key_id $ACCESS_KEY +aws configure set aws_secret_access_key $SECRET_KEY From c682abd684d6825243aa53f84be65bb0bbfe38dc Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 10:53:33 +0545 Subject: [PATCH 05/33] Ignore build artifacts from VCS --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 723ef36f..0b26130a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ -.idea \ No newline at end of file +.idea +*.rock +*.tar +.make_cache \ No newline at end of file From f735866f13cb3991f0b7e258636a8efb95585156 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 10:55:11 +0545 Subject: [PATCH 06/33] Add integration tests for Apache Iceberg integration --- CONTRIBUTING.md | 1 + Makefile | 12 ++- tests/integration/config-microk8s.sh | 1 + tests/integration/integration-tests.sh | 85 ++++++++++++++++++++- tests/integration/resources/test-iceberg.py | 39 ++++++++++ tests/integration/setup-aws-cli.sh | 13 ++++ 6 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 tests/integration/resources/test-iceberg.py create mode 100644 tests/integration/setup-aws-cli.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eba0a602..c27e3b68 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,6 +13,7 @@ cd charmed-spark-rock sudo snap install rockcraft --edge sudo snap install docker sudo snap install lxd +sudo snap install yq sudo snap install skopeo --edge --devmode ``` diff --git a/Makefile b/Makefile index 14a7fa9c..042a215c 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ _MAKE_DIR := .make_cache $(shell mkdir -p $(_MAKE_DIR)) K8S_TAG := $(_MAKE_DIR)/.k8s_tag +AWS_TAG := $(_MAKE_DIR)/.aws_tag IMAGE_NAME := $(shell yq .name rockcraft.yaml) VERSION := $(shell yq .version rockcraft.yaml) @@ -70,7 +71,7 @@ $(_TMP_OCI_TAG): $(_ROCK_OCI) touch $(_TMP_OCI_TAG) $(CHARMED_OCI_TAG): $(_TMP_OCI_TAG) - docker build - -t "$(CHARMED_OCI_FULL_NAME):$(TAG)" --build-arg BASE_IMAGE="$(_TMP_OCI_NAME):$(TAG)" < Dockerfile + docker build -t "$(CHARMED_OCI_FULL_NAME):$(TAG)" --build-arg BASE_IMAGE="$(_TMP_OCI_NAME):$(TAG)" -f Dockerfile . if [ ! 
-d "$(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)" ]; then mkdir -p "$(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)"; fi touch $(CHARMED_OCI_TAG) @@ -80,10 +81,15 @@ $(K8S_TAG): sg microk8s ./tests/integration/config-microk8s.sh @touch $(K8S_TAG) +$(AWS_TAG): $(K8S_TAG) + @echo "=== Setting up and configure AWS CLI ===" + /bin/bash ./tests/integration/setup-aws-cli.sh + @touch $(AWS_TAG) + microk8s: $(K8S_TAG) $(_MAKE_DIR)/%/$(TAG).tar: $(_MAKE_DIR)/%/$(TAG).tag - docker save $*:$(TAG) > $(_MAKE_DIR)/$*/$(TAG).tar + docker save $*:$(TAG) -o $(_MAKE_DIR)/$*/$(TAG).tar $(BASE_NAME): $(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)/$(TAG).tar @echo "=== Creating $(BASE_NAME) OCI archive ===" @@ -106,7 +112,7 @@ import: $(K8S_TAG) build microk8s ctr images import --base-name $(CHARMED_OCI_FULL_NAME):$(TAG) $(BASE_NAME) endif -tests: +tests: $(K8S_TAG) $(AWS_TAG) @echo "=== Running Integration Tests ===" /bin/bash ./tests/integration/integration-tests.sh diff --git a/tests/integration/config-microk8s.sh b/tests/integration/config-microk8s.sh index 82c6ebf7..9461fb82 100755 --- a/tests/integration/config-microk8s.sh +++ b/tests/integration/config-microk8s.sh @@ -2,3 +2,4 @@ microk8s status --wait-ready microk8s config | tee ~/.kube/config microk8s.enable dns microk8s.enable rbac +microk8s.enable minio \ No newline at end of file diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index a9267613..f2b8d9e4 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -135,7 +135,7 @@ teardown_test_pod() { run_example_job_in_pod() { SPARK_EXAMPLES_JAR_NAME="spark-examples_2.12-$(get_spark_version).jar" - PREVIOUS_JOB=$(kubectl get pods | grep driver | tail -n 1 | cut -d' ' -f1) + PREVIOUS_JOB=$(kubectl get pods -n ${NAMESPACE}| grep driver | tail -n 1 | cut -d' ' -f1) NAMESPACE=$1 USERNAME=$2 @@ -166,6 +166,82 @@ run_example_job_in_pod() { validate_pi_value $pi } +get_s3_access_key(){ + kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d +} + +get_s3_secret_key(){ + kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d +} + +get_s3_endpoint(){ + kubectl get service minio -n minio-operator -o jsonpath='{.spec.clusterIP}' +} + +create_s3_bucket(){ + S3_ENDPOINT=$(get_s3_endpoint) + BUCKET_NAME=$1 + aws --endpoint-url "http://$S3_ENDPOINT" s3api create-bucket --bucket "$BUCKET_NAME" + echo "Created S3 bucket ${BUCKET_NAME}" +} + +copy_file_to_s3_bucket(){ + BUCKET_NAME=$1 + FILE_PATH=$2 + BASE_NAME=$(basename "$FILE_PATH") + aws s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" + echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}" +} + +run_iceberg_example_in_pod(){ + create_s3_bucket spark + copy_file_to_s3_bucket spark ./tests/integration/resources/test-iceberg.py + + NAMESPACE="tests" + USERNAME="spark" + NUM_ROWS_TO_INSERT="4" + PREVIOUS_DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) + + kubectl exec testpod -- env UU="$USERNAME" NN="$NAMESPACE" IM="$(spark_image)" \ + /bin/bash -c 'spark-client.spark-submit' \ + --username $UU --namespace $NN \ + --conf spark.kubernetes.driver.request.cores=100m \ + --conf spark.kubernetes.executor.request.cores=100m \ + --conf spark.kubernetes.container.image=$IM \ + --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \ + --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \ + --conf 
spark.hadoop.fs.s3a.path.style.access=true \ + --conf spark.hadoop.fs.s3a.access.key=$(get_s3_access_key) \ + --conf spark.hadoop.fs.s3a.secret.key=$(get_s3_secret_key) \ + --conf spark.jars.ivy=/tmp \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=s3a://spark/warehouse \ + --conf spark.sql.defaultCatalog=local + s3a://spark/test-iceberg.py -n ${NUM_ROWS_TO_INSERT} + + DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) + + if [[ "${PREVIOUS_DRIVER_PODS_COUNT}" == "${DRIVER_PODS_COUNT}" ]] + then + echo "ERROR: Sample job has not run!" + exit 1 + fi + + DRIVER_POD_ID=$(kubectl get pods -n ${NAMESPACE} | grep test-iceberg-.*-driver | tail -n 1 | cut -d' ' -f1) + OUTPUT_LOG_LINE=$(kubectl logs ${DRIVER_POD_ID} -n ${NAMESPACE} | grep 'Number of rows inserted:' ) + NUM_ROWS_INSERTED=$(echo $OUTPUT_LOG_LINE | rev | cut -d' ' -f1 | rev) + + if [ "${NUM_ROWS_INSERTED}" != "${NUM_ROWS_TO_INSERT}" ]; then + echo "ERROR: ${NUM_ROWS_TO_INSERT} were supposed to be inserted. Found ${NUM_ROWS_INSERTED} rows. Aborting with exit code 1." + exit 1 + fi + +} + run_example_job_in_pod_with_pod_templates() { SPARK_EXAMPLES_JAR_NAME="spark-examples_2.12-$(get_spark_version).jar" @@ -425,6 +501,13 @@ echo -e "RUN EXAMPLE JOB WITH ERRORS" echo -e "########################################" (setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod + +echo -e "##################################" +echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES" +echo -e "##################################" + +(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod + echo -e "##################################" echo -e "TEARDOWN TEST POD" echo -e "##################################" diff --git a/tests/integration/resources/test-iceberg.py b/tests/integration/resources/test-iceberg.py new file mode 100644 index 00000000..d8b34aca --- /dev/null +++ b/tests/integration/resources/test-iceberg.py @@ -0,0 +1,39 @@ +import argparse +import random + +from pyspark.sql import SparkSession +from pyspark.sql.types import LongType, StructType, StructField + +parser = argparse.ArgumentParser("TestIceberg") +parser.add_argument("--num_rows", "-n", type=int) +args = parser.parse_args() +num_rows = args.num_rows + +spark = SparkSession\ + .builder\ + .appName("IcebergExample")\ + .getOrCreate() + + +schema = StructType([ + StructField("row_id", LongType(), True), + StructField("row_val", LongType(), True) +]) + +# df = spark.createDataFrame([], schema) +# df.writeTo("demo.foo.bar").create() + +# schema = spark.table("demo.nyc.taxis").schema +data = [] +for idx in range(num_rows): + row = (idx + 1, random.randint(1, 100)) + data.append(row) + +df = spark.createDataFrame(data, schema) +# df.writeTo("demo.nyc.taxis").append() +df.writeTo("demo.foo.bar").create() + + +df = spark.table("demo.foo.bar") +count = df.count() +print(f"Number of rows inserted: {count}") \ No newline at end of file diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh new file mode 100644 index 00000000..c56cbda4 --- /dev/null +++ 
b/tests/integration/setup-aws-cli.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Install AWS CLI +sudo snap install aws-cli --classic + +# Get Access key and secret key from MinIO +ACCESS_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d) +SECRET_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d) +S3_BUCKET="spark" + +# Configure AWS CLI credentials +aws configure set aws_access_key_id $ACCESS_KEY +aws configure set aws_secret_access_key $SECRET_KEY From d02d641e1e91c7678f83578dd5650eba19c9da11 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 15:32:42 +0545 Subject: [PATCH 07/33] Fix tests not passing on CI --- Makefile | 2 +- tests/integration/integration-tests.sh | 63 +++++++++++++++++--------- tests/integration/setup-aws-cli.sh | 1 + 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 042a215c..4474e3d1 100644 --- a/Makefile +++ b/Makefile @@ -84,7 +84,7 @@ $(K8S_TAG): $(AWS_TAG): $(K8S_TAG) @echo "=== Setting up and configure AWS CLI ===" /bin/bash ./tests/integration/setup-aws-cli.sh - @touch $(AWS_TAG) + touch $(AWS_TAG) microk8s: $(K8S_TAG) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index f2b8d9e4..db30cf93 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -x get_spark_version(){ SPARK_VERSION=$(yq '(.version)' rockcraft.yaml) @@ -185,15 +186,22 @@ create_s3_bucket(){ echo "Created S3 bucket ${BUCKET_NAME}" } +delete_s3_bucket(){ + S3_ENDPOINT=$(get_s3_endpoint) + BUCKET_NAME=$1 + aws --endpoint-url "http://$S3_ENDPOINT" s3 rb "s3://$BUCKET_NAME" --force + echo "Deleted S3 bucket ${BUCKET_NAME}" +} + copy_file_to_s3_bucket(){ BUCKET_NAME=$1 FILE_PATH=$2 BASE_NAME=$(basename "$FILE_PATH") - aws s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" + aws --endpoint-url "http://$S3_ENDPOINT" s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}" } -run_iceberg_example_in_pod(){ +test_iceberg_example_in_pod(){ create_s3_bucket spark copy_file_to_s3_bucket spark ./tests/integration/resources/test-iceberg.py @@ -202,27 +210,38 @@ run_iceberg_example_in_pod(){ NUM_ROWS_TO_INSERT="4" PREVIOUS_DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) - kubectl exec testpod -- env UU="$USERNAME" NN="$NAMESPACE" IM="$(spark_image)" \ - /bin/bash -c 'spark-client.spark-submit' \ - --username $UU --namespace $NN \ - --conf spark.kubernetes.driver.request.cores=100m \ - --conf spark.kubernetes.executor.request.cores=100m \ - --conf spark.kubernetes.container.image=$IM \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \ - --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \ - --conf spark.hadoop.fs.s3a.path.style.access=true \ - --conf spark.hadoop.fs.s3a.access.key=$(get_s3_access_key) \ - --conf spark.hadoop.fs.s3a.secret.key=$(get_s3_secret_key) \ - --conf spark.jars.ivy=/tmp \ - --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ - --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ - --conf spark.sql.catalog.spark_catalog.type=hive \ - --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ - --conf spark.sql.catalog.local.type=hadoop \ - --conf 
spark.sql.catalog.local.warehouse=s3a://spark/warehouse \ - --conf spark.sql.defaultCatalog=local - s3a://spark/test-iceberg.py -n ${NUM_ROWS_TO_INSERT} + kubectl exec testpod -- \ + env \ + UU="$USERNAME" \ + NN="$NAMESPACE" \ + IM="$(spark_image)" \ + NUM_ROWS="$NUM_ROWS_TO_INSERT" \ + ACCESS_KEY="$(get_s3_access_key)" \ + SECRET_KEY="$(get_s3_secret_key)" \ + S3_ENDPOINT="$(get_s3_endpoint)" \ + /bin/bash -c '\ + spark-client.spark-submit \ + --username $UU --namespace $NN \ + --conf spark.kubernetes.driver.request.cores=100m \ + --conf spark.kubernetes.executor.request.cores=100m \ + --conf spark.kubernetes.container.image=$IM \ + --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \ + --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \ + --conf spark.hadoop.fs.s3a.path.style.access=true \ + --conf spark.hadoop.fs.s3a.endpoint=$S3_ENDPOINT \ + --conf spark.hadoop.fs.s3a.access.key=$ACCESS_KEY \ + --conf spark.hadoop.fs.s3a.secret.key=$SECRET_KEY \ + --conf spark.jars.ivy=/tmp \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=s3a://spark/warehouse \ + --conf spark.sql.defaultCatalog=local \ + s3a://spark/test-iceberg.py -n $NUM_ROWS' + delete_s3_bucket spark DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) if [[ "${PREVIOUS_DRIVER_PODS_COUNT}" == "${DRIVER_PODS_COUNT}" ]] diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh index c56cbda4..efc24982 100644 --- a/tests/integration/setup-aws-cli.sh +++ b/tests/integration/setup-aws-cli.sh @@ -11,3 +11,4 @@ S3_BUCKET="spark" # Configure AWS CLI credentials aws configure set aws_access_key_id $ACCESS_KEY aws configure set aws_secret_access_key $SECRET_KEY +echo "AWS CLI credentials set successfully" From 2c18492aedf022968e4a65a7872b3fdc761f4d9a Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 15:39:28 +0545 Subject: [PATCH 08/33] Make script executable --- tests/integration/setup-aws-cli.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/integration/setup-aws-cli.sh diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh old mode 100644 new mode 100755 From 6f0783b717875fe8b6a33db5e0a12f1a4f1497a2 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 15:56:12 +0545 Subject: [PATCH 09/33] Revert a docker command in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4474e3d1..0a8bbced 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ $(AWS_TAG): $(K8S_TAG) microk8s: $(K8S_TAG) $(_MAKE_DIR)/%/$(TAG).tar: $(_MAKE_DIR)/%/$(TAG).tag - docker save $*:$(TAG) -o $(_MAKE_DIR)/$*/$(TAG).tar + docker save $*:$(TAG) > $(_MAKE_DIR)/$*/$(TAG).tar $(BASE_NAME): $(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)/$(TAG).tar @echo "=== Creating $(BASE_NAME) OCI archive ===" From 0b085e27b0cb0c6c20c83771ec15bf728e4ec5b9 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 16:10:47 +0545 Subject: [PATCH 10/33] Remove set -x option in integration tests. 
--- tests/integration/integration-tests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index db30cf93..fc007bb0 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x get_spark_version(){ SPARK_VERSION=$(yq '(.version)' rockcraft.yaml) From fd05ccec4c39e71c391e9a809756b5eeed8cfd6a Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 16:23:50 +0545 Subject: [PATCH 11/33] Use -o option instead of redirect --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0a8bbced..4474e3d1 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ $(AWS_TAG): $(K8S_TAG) microk8s: $(K8S_TAG) $(_MAKE_DIR)/%/$(TAG).tar: $(_MAKE_DIR)/%/$(TAG).tag - docker save $*:$(TAG) > $(_MAKE_DIR)/$*/$(TAG).tar + docker save $*:$(TAG) -o $(_MAKE_DIR)/$*/$(TAG).tar $(BASE_NAME): $(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)/$(TAG).tar @echo "=== Creating $(BASE_NAME) OCI archive ===" From 063c4e519015ff6ddb898d521c34411de06b346d Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 17:04:10 +0545 Subject: [PATCH 12/33] Change artifact permissions. --- .github/workflows/build.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 82b26dc8..67762db2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -44,6 +44,9 @@ jobs: ARTIFACT=$(make help | grep 'Artifact: ') echo "name=${ARTIFACT#'Artifact: '}" >> $GITHUB_OUTPUT + - name: Change artifact permissions + run: sudo chmod a+x ${{ steps.artifact.outputs.name }} + - name: Upload locally built artifact uses: actions/upload-artifact@v3 with: From f22dabb3b8e890a56ff89ee6545364daa4f6b156 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 17:11:58 +0545 Subject: [PATCH 13/33] Give read permission to all. 
--- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 67762db2..c2c5f564 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,7 +45,7 @@ jobs: echo "name=${ARTIFACT#'Artifact: '}" >> $GITHUB_OUTPUT - name: Change artifact permissions - run: sudo chmod a+x ${{ steps.artifact.outputs.name }} + run: sudo chmod r+x ${{ steps.artifact.outputs.name }} - name: Upload locally built artifact uses: actions/upload-artifact@v3 From c032913060e71afe174ee5a3500642a4dca5bf83 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 17:22:28 +0545 Subject: [PATCH 14/33] Fix typo in build.yaml --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c2c5f564..adbd9fbd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,7 +45,7 @@ jobs: echo "name=${ARTIFACT#'Artifact: '}" >> $GITHUB_OUTPUT - name: Change artifact permissions - run: sudo chmod r+x ${{ steps.artifact.outputs.name }} + run: sudo chmod a+r ${{ steps.artifact.outputs.name }} - name: Upload locally built artifact uses: actions/upload-artifact@v3 From 07540ad6fa3570a4e0bd1a4516ecef17783a59eb Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 18:48:12 +0545 Subject: [PATCH 15/33] Add debug session step --- .github/workflows/integration.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 1007cc36..61ff49e5 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -39,6 +39,10 @@ jobs: name: charmed-spark path: charmed-spark + - name: Setup upterm session + uses: lhotari/action-upterm@v1 + + - name: Run tests run: | # Unpack Artifact From 3c2511585dfa84b67b564e45812688a93b71db4d Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 19:11:33 +0545 Subject: [PATCH 16/33] Move the order of jobs in CI --- .github/workflows/integration.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 61ff49e5..e8ff4c00 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -39,10 +39,6 @@ jobs: name: charmed-spark path: charmed-spark - - name: Setup upterm session - uses: lhotari/action-upterm@v1 - - - name: Run tests run: | # Unpack Artifact @@ -58,3 +54,6 @@ jobs: -o $(find .make_cache -name "*.tag") sg microk8s -c "make tests" + + - name: Setup upterm session + uses: lhotari/action-upterm@v1 \ No newline at end of file From 82171695c509590e2a43c85dd3e362a38a9d96fa Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 19:34:38 +0545 Subject: [PATCH 17/33] Revert "Move the order of jobs in CI" This reverts commit 3c2511585dfa84b67b564e45812688a93b71db4d. 
--- .github/workflows/integration.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index e8ff4c00..61ff49e5 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -39,6 +39,10 @@ jobs: name: charmed-spark path: charmed-spark + - name: Setup upterm session + uses: lhotari/action-upterm@v1 + + - name: Run tests run: | # Unpack Artifact @@ -54,6 +58,3 @@ jobs: -o $(find .make_cache -name "*.tag") sg microk8s -c "make tests" - - - name: Setup upterm session - uses: lhotari/action-upterm@v1 \ No newline at end of file From 149cc95aa834ebe44bf3b0039ce67c825b25c4ac Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 20:01:22 +0545 Subject: [PATCH 18/33] Temporarily disable other tests --- tests/integration/integration-tests.sh | 52 +++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index fc007bb0..45d5024b 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -483,54 +483,54 @@ echo -e "##################################" setup_test_pod -echo -e "##################################" -echo -e "RUN EXAMPLE JOB" -echo -e "##################################" +# echo -e "##################################" +# echo -e "RUN EXAMPLE JOB" +# echo -e "##################################" -(setup_user_admin_context && test_example_job_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +# (setup_user_admin_context && test_example_job_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod -echo -e "##################################" -echo -e "RUN SPARK SHELL IN POD" -echo -e "##################################" +# echo -e "##################################" +# echo -e "RUN SPARK SHELL IN POD" +# echo -e "##################################" -(setup_user_admin_context && test_spark_shell_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +# (setup_user_admin_context && test_spark_shell_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod -echo -e "##################################" -echo -e "RUN PYSPARK IN POD" -echo -e "##################################" +# echo -e "##################################" +# echo -e "RUN PYSPARK IN POD" +# echo -e "##################################" -(setup_user_admin_context && test_pyspark_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +# (setup_user_admin_context && test_pyspark_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod -echo -e "##################################" -echo -e "RUN EXAMPLE JOB WITH POD TEMPLATE" -echo -e "##################################" +# echo -e "##################################" +# echo -e "RUN EXAMPLE JOB WITH POD TEMPLATE" +# echo -e "##################################" -(setup_user_admin_context && test_example_job_in_pod_with_templates && cleanup_user_success) || cleanup_user_failure_in_pod +# (setup_user_admin_context && test_example_job_in_pod_with_templates && cleanup_user_success) || cleanup_user_failure_in_pod -echo -e "########################################" -echo -e "RUN EXAMPLE JOB WITH PROMETHEUS METRICS" -echo -e "########################################" +# echo -e "########################################" +# echo -e "RUN EXAMPLE JOB WITH PROMETHEUS METRICS" +# echo -e "########################################" -(setup_user_admin_context && 
test_example_job_in_pod_with_metrics && cleanup_user_success) || cleanup_user_failure_in_pod +# (setup_user_admin_context && test_example_job_in_pod_with_metrics && cleanup_user_success) || cleanup_user_failure_in_pod -echo -e "########################################" -echo -e "RUN EXAMPLE JOB WITH ERRORS" -echo -e "########################################" +# echo -e "########################################" +# echo -e "RUN EXAMPLE JOB WITH ERRORS" +# echo -e "########################################" -(setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod +# (setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod echo -e "##################################" echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES" echo -e "##################################" -(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) #|| cleanup_user_failure_in_pod echo -e "##################################" echo -e "TEARDOWN TEST POD" echo -e "##################################" -teardown_test_pod +# teardown_test_pod echo -e "##################################" echo -e "END OF THE TEST" From 75299b78c9b2501b765055df4b6ea369dd443bf5 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 20:23:55 +0545 Subject: [PATCH 19/33] Sleep for 1000 seconds --- tests/integration/integration-tests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index 45d5024b..1a2efabc 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -535,3 +535,5 @@ echo -e "##################################" echo -e "##################################" echo -e "END OF THE TEST" echo -e "##################################" + +sleep 1000 \ No newline at end of file From 9c9fb09d31c63a53ad1fa05f5a612dd17dd35f42 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 21:03:01 +0545 Subject: [PATCH 20/33] Increase sleep timing, set the default region for the AWS cli --- tests/integration/integration-tests.sh | 2 +- tests/integration/setup-aws-cli.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index 1a2efabc..e41fb879 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -536,4 +536,4 @@ echo -e "##################################" echo -e "END OF THE TEST" echo -e "##################################" -sleep 1000 \ No newline at end of file +sleep 10000 \ No newline at end of file diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh index efc24982..9f0d5463 100755 --- a/tests/integration/setup-aws-cli.sh +++ b/tests/integration/setup-aws-cli.sh @@ -11,4 +11,5 @@ S3_BUCKET="spark" # Configure AWS CLI credentials aws configure set aws_access_key_id $ACCESS_KEY aws configure set aws_secret_access_key $SECRET_KEY +aws configure set default.region us-east-2 echo "AWS CLI credentials set successfully" From ce0a06d402a5b819dac35b5ea01ba796fddeae99 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 21:23:28 +0545 Subject: [PATCH 21/33] Unset debug mode --- tests/integration/integration-tests.sh | 50 +++++++++++++------------- 1 file changed, 24 
insertions(+), 26 deletions(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index e41fb879..b8250e23 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -483,42 +483,42 @@ echo -e "##################################" setup_test_pod -# echo -e "##################################" -# echo -e "RUN EXAMPLE JOB" -# echo -e "##################################" +echo -e "##################################" +echo -e "RUN EXAMPLE JOB" +echo -e "##################################" -# (setup_user_admin_context && test_example_job_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_example_job_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod -# echo -e "##################################" -# echo -e "RUN SPARK SHELL IN POD" -# echo -e "##################################" +echo -e "##################################" +echo -e "RUN SPARK SHELL IN POD" +echo -e "##################################" -# (setup_user_admin_context && test_spark_shell_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_spark_shell_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod -# echo -e "##################################" -# echo -e "RUN PYSPARK IN POD" -# echo -e "##################################" +echo -e "##################################" +echo -e "RUN PYSPARK IN POD" +echo -e "##################################" -# (setup_user_admin_context && test_pyspark_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_pyspark_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod -# echo -e "##################################" -# echo -e "RUN EXAMPLE JOB WITH POD TEMPLATE" -# echo -e "##################################" +echo -e "##################################" +echo -e "RUN EXAMPLE JOB WITH POD TEMPLATE" +echo -e "##################################" -# (setup_user_admin_context && test_example_job_in_pod_with_templates && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_example_job_in_pod_with_templates && cleanup_user_success) || cleanup_user_failure_in_pod -# echo -e "########################################" -# echo -e "RUN EXAMPLE JOB WITH PROMETHEUS METRICS" -# echo -e "########################################" +echo -e "########################################" +echo -e "RUN EXAMPLE JOB WITH PROMETHEUS METRICS" +echo -e "########################################" -# (setup_user_admin_context && test_example_job_in_pod_with_metrics && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_example_job_in_pod_with_metrics && cleanup_user_success) || cleanup_user_failure_in_pod -# echo -e "########################################" -# echo -e "RUN EXAMPLE JOB WITH ERRORS" -# echo -e "########################################" +echo -e "########################################" +echo -e "RUN EXAMPLE JOB WITH ERRORS" +echo -e "########################################" -# (setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod +(setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod echo -e "##################################" echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES" @@ -535,5 +535,3 @@ echo -e "##################################" echo -e 
"##################################" echo -e "END OF THE TEST" echo -e "##################################" - -sleep 10000 \ No newline at end of file From 8ad6334753d7bf4e64b09e5804633f242477a3c5 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 21:46:14 +0545 Subject: [PATCH 22/33] Remove SSH access to GH runner --- .github/workflows/build.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index adbd9fbd..82b26dc8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -44,9 +44,6 @@ jobs: ARTIFACT=$(make help | grep 'Artifact: ') echo "name=${ARTIFACT#'Artifact: '}" >> $GITHUB_OUTPUT - - name: Change artifact permissions - run: sudo chmod a+r ${{ steps.artifact.outputs.name }} - - name: Upload locally built artifact uses: actions/upload-artifact@v3 with: From 87fb99ed96e6f30e962fd7419ade0f450f97821b Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Mon, 29 Jan 2024 21:53:45 +0545 Subject: [PATCH 23/33] Fix CI --- .github/workflows/build.yaml | 3 +++ .github/workflows/integration.yaml | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 82b26dc8..adbd9fbd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -44,6 +44,9 @@ jobs: ARTIFACT=$(make help | grep 'Artifact: ') echo "name=${ARTIFACT#'Artifact: '}" >> $GITHUB_OUTPUT + - name: Change artifact permissions + run: sudo chmod a+r ${{ steps.artifact.outputs.name }} + - name: Upload locally built artifact uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 61ff49e5..1007cc36 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -39,10 +39,6 @@ jobs: name: charmed-spark path: charmed-spark - - name: Setup upterm session - uses: lhotari/action-upterm@v1 - - - name: Run tests run: | # Unpack Artifact From 4ff64d0d6993d454d1de7b5d34b165010bf5f900 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 09:02:25 +0545 Subject: [PATCH 24/33] Revert "Add integration tests for Apache Iceberg integration" This reverts commit cc552b8f6a1caa6afb5c16e3650ef1fbbd21c041. 
--- CONTRIBUTING.md | 1 - Makefile | 12 +-- tests/integration/config-microk8s.sh | 1 - tests/integration/integration-tests.sh | 85 +-------------------- tests/integration/resources/test-iceberg.py | 39 ---------- tests/integration/setup-aws-cli.sh | 13 ---- 6 files changed, 4 insertions(+), 147 deletions(-) delete mode 100644 tests/integration/resources/test-iceberg.py delete mode 100644 tests/integration/setup-aws-cli.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c27e3b68..eba0a602 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,7 +13,6 @@ cd charmed-spark-rock sudo snap install rockcraft --edge sudo snap install docker sudo snap install lxd -sudo snap install yq sudo snap install skopeo --edge --devmode ``` diff --git a/Makefile b/Makefile index 042a215c..14a7fa9c 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,6 @@ _MAKE_DIR := .make_cache $(shell mkdir -p $(_MAKE_DIR)) K8S_TAG := $(_MAKE_DIR)/.k8s_tag -AWS_TAG := $(_MAKE_DIR)/.aws_tag IMAGE_NAME := $(shell yq .name rockcraft.yaml) VERSION := $(shell yq .version rockcraft.yaml) @@ -71,7 +70,7 @@ $(_TMP_OCI_TAG): $(_ROCK_OCI) touch $(_TMP_OCI_TAG) $(CHARMED_OCI_TAG): $(_TMP_OCI_TAG) - docker build -t "$(CHARMED_OCI_FULL_NAME):$(TAG)" --build-arg BASE_IMAGE="$(_TMP_OCI_NAME):$(TAG)" -f Dockerfile . + docker build - -t "$(CHARMED_OCI_FULL_NAME):$(TAG)" --build-arg BASE_IMAGE="$(_TMP_OCI_NAME):$(TAG)" < Dockerfile if [ ! -d "$(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)" ]; then mkdir -p "$(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)"; fi touch $(CHARMED_OCI_TAG) @@ -81,15 +80,10 @@ $(K8S_TAG): sg microk8s ./tests/integration/config-microk8s.sh @touch $(K8S_TAG) -$(AWS_TAG): $(K8S_TAG) - @echo "=== Setting up and configure AWS CLI ===" - /bin/bash ./tests/integration/setup-aws-cli.sh - @touch $(AWS_TAG) - microk8s: $(K8S_TAG) $(_MAKE_DIR)/%/$(TAG).tar: $(_MAKE_DIR)/%/$(TAG).tag - docker save $*:$(TAG) -o $(_MAKE_DIR)/$*/$(TAG).tar + docker save $*:$(TAG) > $(_MAKE_DIR)/$*/$(TAG).tar $(BASE_NAME): $(_MAKE_DIR)/$(CHARMED_OCI_FULL_NAME)/$(TAG).tar @echo "=== Creating $(BASE_NAME) OCI archive ===" @@ -112,7 +106,7 @@ import: $(K8S_TAG) build microk8s ctr images import --base-name $(CHARMED_OCI_FULL_NAME):$(TAG) $(BASE_NAME) endif -tests: $(K8S_TAG) $(AWS_TAG) +tests: @echo "=== Running Integration Tests ===" /bin/bash ./tests/integration/integration-tests.sh diff --git a/tests/integration/config-microk8s.sh b/tests/integration/config-microk8s.sh index 9461fb82..82c6ebf7 100755 --- a/tests/integration/config-microk8s.sh +++ b/tests/integration/config-microk8s.sh @@ -2,4 +2,3 @@ microk8s status --wait-ready microk8s config | tee ~/.kube/config microk8s.enable dns microk8s.enable rbac -microk8s.enable minio \ No newline at end of file diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index f2b8d9e4..a9267613 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -135,7 +135,7 @@ teardown_test_pod() { run_example_job_in_pod() { SPARK_EXAMPLES_JAR_NAME="spark-examples_2.12-$(get_spark_version).jar" - PREVIOUS_JOB=$(kubectl get pods -n ${NAMESPACE}| grep driver | tail -n 1 | cut -d' ' -f1) + PREVIOUS_JOB=$(kubectl get pods | grep driver | tail -n 1 | cut -d' ' -f1) NAMESPACE=$1 USERNAME=$2 @@ -166,82 +166,6 @@ run_example_job_in_pod() { validate_pi_value $pi } -get_s3_access_key(){ - kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d -} - -get_s3_secret_key(){ - kubectl get secret -n minio-operator 
microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d -} - -get_s3_endpoint(){ - kubectl get service minio -n minio-operator -o jsonpath='{.spec.clusterIP}' -} - -create_s3_bucket(){ - S3_ENDPOINT=$(get_s3_endpoint) - BUCKET_NAME=$1 - aws --endpoint-url "http://$S3_ENDPOINT" s3api create-bucket --bucket "$BUCKET_NAME" - echo "Created S3 bucket ${BUCKET_NAME}" -} - -copy_file_to_s3_bucket(){ - BUCKET_NAME=$1 - FILE_PATH=$2 - BASE_NAME=$(basename "$FILE_PATH") - aws s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" - echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}" -} - -run_iceberg_example_in_pod(){ - create_s3_bucket spark - copy_file_to_s3_bucket spark ./tests/integration/resources/test-iceberg.py - - NAMESPACE="tests" - USERNAME="spark" - NUM_ROWS_TO_INSERT="4" - PREVIOUS_DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) - - kubectl exec testpod -- env UU="$USERNAME" NN="$NAMESPACE" IM="$(spark_image)" \ - /bin/bash -c 'spark-client.spark-submit' \ - --username $UU --namespace $NN \ - --conf spark.kubernetes.driver.request.cores=100m \ - --conf spark.kubernetes.executor.request.cores=100m \ - --conf spark.kubernetes.container.image=$IM \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \ - --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \ - --conf spark.hadoop.fs.s3a.path.style.access=true \ - --conf spark.hadoop.fs.s3a.access.key=$(get_s3_access_key) \ - --conf spark.hadoop.fs.s3a.secret.key=$(get_s3_secret_key) \ - --conf spark.jars.ivy=/tmp \ - --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ - --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ - --conf spark.sql.catalog.spark_catalog.type=hive \ - --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ - --conf spark.sql.catalog.local.type=hadoop \ - --conf spark.sql.catalog.local.warehouse=s3a://spark/warehouse \ - --conf spark.sql.defaultCatalog=local - s3a://spark/test-iceberg.py -n ${NUM_ROWS_TO_INSERT} - - DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) - - if [[ "${PREVIOUS_DRIVER_PODS_COUNT}" == "${DRIVER_PODS_COUNT}" ]] - then - echo "ERROR: Sample job has not run!" - exit 1 - fi - - DRIVER_POD_ID=$(kubectl get pods -n ${NAMESPACE} | grep test-iceberg-.*-driver | tail -n 1 | cut -d' ' -f1) - OUTPUT_LOG_LINE=$(kubectl logs ${DRIVER_POD_ID} -n ${NAMESPACE} | grep 'Number of rows inserted:' ) - NUM_ROWS_INSERTED=$(echo $OUTPUT_LOG_LINE | rev | cut -d' ' -f1 | rev) - - if [ "${NUM_ROWS_INSERTED}" != "${NUM_ROWS_TO_INSERT}" ]; then - echo "ERROR: ${NUM_ROWS_TO_INSERT} were supposed to be inserted. Found ${NUM_ROWS_INSERTED} rows. Aborting with exit code 1." 
- exit 1 - fi - -} - run_example_job_in_pod_with_pod_templates() { SPARK_EXAMPLES_JAR_NAME="spark-examples_2.12-$(get_spark_version).jar" @@ -501,13 +425,6 @@ echo -e "RUN EXAMPLE JOB WITH ERRORS" echo -e "########################################" (setup_user_admin_context && test_example_job_in_pod_with_errors && cleanup_user_success) || cleanup_user_failure_in_pod - -echo -e "##################################" -echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES" -echo -e "##################################" - -(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod - echo -e "##################################" echo -e "TEARDOWN TEST POD" echo -e "##################################" diff --git a/tests/integration/resources/test-iceberg.py b/tests/integration/resources/test-iceberg.py deleted file mode 100644 index d8b34aca..00000000 --- a/tests/integration/resources/test-iceberg.py +++ /dev/null @@ -1,39 +0,0 @@ -import argparse -import random - -from pyspark.sql import SparkSession -from pyspark.sql.types import LongType, StructType, StructField - -parser = argparse.ArgumentParser("TestIceberg") -parser.add_argument("--num_rows", "-n", type=int) -args = parser.parse_args() -num_rows = args.num_rows - -spark = SparkSession\ - .builder\ - .appName("IcebergExample")\ - .getOrCreate() - - -schema = StructType([ - StructField("row_id", LongType(), True), - StructField("row_val", LongType(), True) -]) - -# df = spark.createDataFrame([], schema) -# df.writeTo("demo.foo.bar").create() - -# schema = spark.table("demo.nyc.taxis").schema -data = [] -for idx in range(num_rows): - row = (idx + 1, random.randint(1, 100)) - data.append(row) - -df = spark.createDataFrame(data, schema) -# df.writeTo("demo.nyc.taxis").append() -df.writeTo("demo.foo.bar").create() - - -df = spark.table("demo.foo.bar") -count = df.count() -print(f"Number of rows inserted: {count}") \ No newline at end of file diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh deleted file mode 100644 index c56cbda4..00000000 --- a/tests/integration/setup-aws-cli.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# Install AWS CLI -sudo snap install aws-cli --classic - -# Get Access key and secret key from MinIO -ACCESS_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d) -SECRET_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d) -S3_BUCKET="spark" - -# Configure AWS CLI credentials -aws configure set aws_access_key_id $ACCESS_KEY -aws configure set aws_secret_access_key $SECRET_KEY From c621d89ce3f5a5edfdad5a27f2d406d09c776fa8 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 09:02:36 +0545 Subject: [PATCH 25/33] Revert "Ignore build artifacts from VCS" This reverts commit cc7e95fb51b91fad1ce84088f72c893c2f826886. 
--- .gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 0b26130a..723ef36f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1 @@ -.idea -*.rock -*.tar -.make_cache \ No newline at end of file +.idea \ No newline at end of file From efb0ee8214546b0675ee99c9e036626b25011df3 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 09:54:03 +0545 Subject: [PATCH 26/33] Add missing shell variable --- tests/integration/integration-tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index b8250e23..c7f6b54b 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -196,6 +196,7 @@ copy_file_to_s3_bucket(){ BUCKET_NAME=$1 FILE_PATH=$2 BASE_NAME=$(basename "$FILE_PATH") + S3_ENDPOINT=$(get_s3_endpoint) aws --endpoint-url "http://$S3_ENDPOINT" s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}" } From 2818165b04d857ac1a70fcbda2abe28b9f9f4ed6 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 09:55:19 +0545 Subject: [PATCH 27/33] Make default region configurable. --- tests/integration/setup-aws-cli.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/setup-aws-cli.sh b/tests/integration/setup-aws-cli.sh index 9f0d5463..b30293cb 100755 --- a/tests/integration/setup-aws-cli.sh +++ b/tests/integration/setup-aws-cli.sh @@ -6,10 +6,12 @@ sudo snap install aws-cli --classic # Get Access key and secret key from MinIO ACCESS_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d) SECRET_KEY=$(kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d) + S3_BUCKET="spark" +DEFAULT_REGION="us-east-2" # Configure AWS CLI credentials aws configure set aws_access_key_id $ACCESS_KEY aws configure set aws_secret_access_key $SECRET_KEY -aws configure set default.region us-east-2 +aws configure set default.region $DEFAULT_REGION echo "AWS CLI credentials set successfully" From c230cf70694e3de667bc3082f0bcadd8ed185b92 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 12:52:07 +0545 Subject: [PATCH 28/33] Remove commented lines --- tests/integration/resources/test-iceberg.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/integration/resources/test-iceberg.py b/tests/integration/resources/test-iceberg.py index d8b34aca..4675ed87 100644 --- a/tests/integration/resources/test-iceberg.py +++ b/tests/integration/resources/test-iceberg.py @@ -20,17 +20,12 @@ StructField("row_val", LongType(), True) ]) -# df = spark.createDataFrame([], schema) -# df.writeTo("demo.foo.bar").create() - -# schema = spark.table("demo.nyc.taxis").schema data = [] for idx in range(num_rows): row = (idx + 1, random.randint(1, 100)) data.append(row) df = spark.createDataFrame(data, schema) -# df.writeTo("demo.nyc.taxis").append() df.writeTo("demo.foo.bar").create() From fff7f4a704975999ac4363e887d647afabbfcca0 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 12:53:45 +0545 Subject: [PATCH 29/33] Format with black. 
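The fix in PATCH 26 above works because copy_file_to_s3_bucket had been relying on S3_ENDPOINT leaking in from whichever helper happened to run first. A slightly stricter variant, sketched here purely for illustration and not what this series ships, keeps the variables local and fails fast if the endpoint cannot be resolved:

    copy_file_to_s3_bucket(){
        # Copy a local file into the given bucket via the in-cluster MinIO endpoint
        local BUCKET_NAME=$1
        local FILE_PATH=$2
        local BASE_NAME S3_ENDPOINT
        BASE_NAME=$(basename "$FILE_PATH")
        S3_ENDPOINT=$(get_s3_endpoint)
        # Refuse to run against an empty endpoint instead of calling "http://"
        [ -n "$S3_ENDPOINT" ] || { echo "ERROR: could not resolve S3 endpoint" >&2; return 1; }
        aws --endpoint-url "http://$S3_ENDPOINT" s3 cp "$FILE_PATH" "s3://$BUCKET_NAME/$BASE_NAME"
        echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}"
    }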
--- tests/integration/resources/test-iceberg.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/integration/resources/test-iceberg.py b/tests/integration/resources/test-iceberg.py index 4675ed87..b4be2551 100644 --- a/tests/integration/resources/test-iceberg.py +++ b/tests/integration/resources/test-iceberg.py @@ -9,16 +9,12 @@ args = parser.parse_args() num_rows = args.num_rows -spark = SparkSession\ - .builder\ - .appName("IcebergExample")\ - .getOrCreate() +spark = SparkSession.builder.appName("IcebergExample").getOrCreate() -schema = StructType([ - StructField("row_id", LongType(), True), - StructField("row_val", LongType(), True) -]) +schema = StructType( + [StructField("row_id", LongType(), True), StructField("row_val", LongType(), True)] +) data = [] for idx in range(num_rows): @@ -31,4 +27,4 @@ df = spark.table("demo.foo.bar") count = df.count() -print(f"Number of rows inserted: {count}") \ No newline at end of file +print(f"Number of rows inserted: {count}") From 4d7fbd6c3ec0893d24dee74f6059b92815d56d8f Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 15:16:08 +0545 Subject: [PATCH 30/33] Uncomment a line in integration tests. --- tests/integration/integration-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index c7f6b54b..6f21385b 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -525,7 +525,7 @@ echo -e "##################################" echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES" echo -e "##################################" -(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) #|| cleanup_user_failure_in_pod +(setup_user_admin_context && test_iceberg_example_in_pod && cleanup_user_success) || cleanup_user_failure_in_pod echo -e "##################################" echo -e "TEARDOWN TEST POD" From c741c458b1ec0ff0a03f99b118557088c0c04c82 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Tue, 30 Jan 2024 15:22:14 +0545 Subject: [PATCH 31/33] Uncomment teardown_test_pod --- tests/integration/integration-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index 6f21385b..b8b4e03f 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -531,7 +531,7 @@ echo -e "##################################" echo -e "TEARDOWN TEST POD" echo -e "##################################" -# teardown_test_pod +teardown_test_pod echo -e "##################################" echo -e "END OF THE TEST" From 1da1160b7e260020fddd0b13f31456baed0138e4 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Thu, 1 Feb 2024 16:22:57 +0545 Subject: [PATCH 32/33] Add comments --- tests/integration/integration-tests.sh | 37 ++++++++++++++++++++- tests/integration/resources/test-iceberg.py | 9 +++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index b8b4e03f..fecdcbeb 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -167,18 +167,22 @@ run_example_job_in_pod() { } get_s3_access_key(){ + # Prints out S3 Access Key by reading it from K8s secret kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_ACCESS_KEY}' | base64 -d } get_s3_secret_key(){ + # Prints 
out S3 Secret Key by reading it from K8s secret kubectl get secret -n minio-operator microk8s-user-1 -o jsonpath='{.data.CONSOLE_SECRET_KEY}' | base64 -d } get_s3_endpoint(){ + # Prints out the endpoint S3 bucket is exposed on. kubectl get service minio -n minio-operator -o jsonpath='{.spec.clusterIP}' } create_s3_bucket(){ + # Creates a S3 bucket with the given name. S3_ENDPOINT=$(get_s3_endpoint) BUCKET_NAME=$1 aws --endpoint-url "http://$S3_ENDPOINT" s3api create-bucket --bucket "$BUCKET_NAME" @@ -186,6 +190,7 @@ create_s3_bucket(){ } delete_s3_bucket(){ + # Deletes a S3 bucket with the given name. S3_ENDPOINT=$(get_s3_endpoint) BUCKET_NAME=$1 aws --endpoint-url "http://$S3_ENDPOINT" s3 rb "s3://$BUCKET_NAME" --force @@ -193,23 +198,39 @@ delete_s3_bucket(){ } copy_file_to_s3_bucket(){ + # Copies a file from local to S3 bucket. + # The bucket name and the path to file that is to be uploaded is to be provided as arguments BUCKET_NAME=$1 FILE_PATH=$2 + + # If file path is '/foo/bar/file.ext', the basename is 'file.ext' BASE_NAME=$(basename "$FILE_PATH") S3_ENDPOINT=$(get_s3_endpoint) + + # Copy the file to S3 bucket aws --endpoint-url "http://$S3_ENDPOINT" s3 cp $FILE_PATH s3://"$BUCKET_NAME"/"$BASE_NAME" echo "Copied file ${FILE_PATH} to S3 bucket ${BUCKET_NAME}" } test_iceberg_example_in_pod(){ - create_s3_bucket spark + # Test Iceberg integration in Charmed Spark Rock + + # First create S3 bucket named 'spark' + create_s3_bucket spark] + + # Copy 'test-iceberg.py' script to 'spark' bucket copy_file_to_s3_bucket spark ./tests/integration/resources/test-iceberg.py NAMESPACE="tests" USERNAME="spark" + + # Number of rows that are to be inserted during the test. NUM_ROWS_TO_INSERT="4" + + # Number of driver pods that exist in the namespace already. PREVIOUS_DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) + # Submit the job from inside 'testpod' kubectl exec testpod -- \ env \ UU="$USERNAME" \ @@ -241,17 +262,31 @@ test_iceberg_example_in_pod(){ --conf spark.sql.defaultCatalog=local \ s3a://spark/test-iceberg.py -n $NUM_ROWS' + # Delete 'spark' bucket delete_s3_bucket spark + + # Number of driver pods after the job is completed. DRIVER_PODS_COUNT=$(kubectl get pods -n ${NAMESPACE} | grep driver | wc -l) + # If the number of driver pods is same as before, job has not been run at all! if [[ "${PREVIOUS_DRIVER_PODS_COUNT}" == "${DRIVER_PODS_COUNT}" ]] then echo "ERROR: Sample job has not run!" exit 1 fi + # Find the ID of the driver pod that ran the job. 
+ # tail -n 1 => Filter out the last line + # cut -d' ' -f1 => Split by spaces and pick the first part DRIVER_POD_ID=$(kubectl get pods -n ${NAMESPACE} | grep test-iceberg-.*-driver | tail -n 1 | cut -d' ' -f1) + + # Filter out the output log line OUTPUT_LOG_LINE=$(kubectl logs ${DRIVER_POD_ID} -n ${NAMESPACE} | grep 'Number of rows inserted:' ) + + # Fetch out the number of rows inserted + # rev => Reverse the string + # cut -d' ' -f1 => Split by spaces and pick the first part + # rev => Reverse the string back NUM_ROWS_INSERTED=$(echo $OUTPUT_LOG_LINE | rev | cut -d' ' -f1 | rev) if [ "${NUM_ROWS_INSERTED}" != "${NUM_ROWS_TO_INSERT}" ]; then diff --git a/tests/integration/resources/test-iceberg.py b/tests/integration/resources/test-iceberg.py index b4be2551..929914e4 100644 --- a/tests/integration/resources/test-iceberg.py +++ b/tests/integration/resources/test-iceberg.py @@ -9,22 +9,27 @@ args = parser.parse_args() num_rows = args.num_rows +# Create Spark session spark = SparkSession.builder.appName("IcebergExample").getOrCreate() - +# Create schema schema = StructType( [StructField("row_id", LongType(), True), StructField("row_val", LongType(), True)] ) +# Generate 'num_rows' number of random rows to be inserted. data = [] for idx in range(num_rows): row = (idx + 1, random.randint(1, 100)) data.append(row) +# Create a data frame and write it df = spark.createDataFrame(data, schema) df.writeTo("demo.foo.bar").create() - +# Read back the inserted data and count the number of rows df = spark.table("demo.foo.bar") count = df.count() + +# Print the number of rows print(f"Number of rows inserted: {count}") From 29c45bcae0b70119f64a99a8cf686db966aa9b19 Mon Sep 17 00:00:00 2001 From: Bikalpa Dhakal Date: Thu, 1 Feb 2024 17:19:33 +0545 Subject: [PATCH 33/33] Fix a typo --- tests/integration/integration-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/integration-tests.sh b/tests/integration/integration-tests.sh index fecdcbeb..a73961bc 100755 --- a/tests/integration/integration-tests.sh +++ b/tests/integration/integration-tests.sh @@ -216,7 +216,7 @@ test_iceberg_example_in_pod(){ # Test Iceberg integration in Charmed Spark Rock # First create S3 bucket named 'spark' - create_s3_bucket spark] + create_s3_bucket spark # Copy 'test-iceberg.py' script to 'spark' bucket copy_file_to_s3_bucket spark ./tests/integration/resources/test-iceberg.py
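The NUM_ROWS_INSERTED extraction above (rev | cut -d' ' -f1 | rev) takes the last space-separated field of the driver log line. An equivalent formulation, offered only as an alternative sketch and not what the test script uses, is awk's $NF, which states the intent more directly:

    # Last whitespace-separated field of the log line, e.g. "Number of rows inserted: 4" -> "4"
    NUM_ROWS_INSERTED=$(echo "$OUTPUT_LOG_LINE" | awk '{print $NF}')

Both forms assume the count is the final token on the line, which is what the print statement in test-iceberg.py guarantees.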