From 4e989879585b9e771a0ef698eec6ab798937a884 Mon Sep 17 00:00:00 2001
From: yalctay93 <39557685+yalctay93@users.noreply.github.com>
Date: Wed, 5 Aug 2020 22:35:04 +0200
Subject: [PATCH] Include Flink-Job-Cluster Helm Chart (#292)

---
 README.md                                    |   1 +
 docs/flink_job_cluster_guide.md              | 136 ++++++++++++++++++
 helm-chart/flink-job-cluster/.helmignore     |  22 +++
 helm-chart/flink-job-cluster/Chart.yaml      |  11 ++
 helm-chart/flink-job-cluster/README.md       |   3 +
 .../flink-job-cluster/templates/_helpers.tpl |  33 +++++
 .../templates/flinkjobcluster.yaml           |  93 ++++++++++++
 .../templates/podmonitor.yaml                |  19 +++
 helm-chart/flink-job-cluster/values.yaml     | 113 +++++++++++++++
 9 files changed, 431 insertions(+)
 create mode 100644 docs/flink_job_cluster_guide.md
 create mode 100644 helm-chart/flink-job-cluster/.helmignore
 create mode 100644 helm-chart/flink-job-cluster/Chart.yaml
 create mode 100644 helm-chart/flink-job-cluster/README.md
 create mode 100644 helm-chart/flink-job-cluster/templates/_helpers.tpl
 create mode 100644 helm-chart/flink-job-cluster/templates/flinkjobcluster.yaml
 create mode 100644 helm-chart/flink-job-cluster/templates/podmonitor.yaml
 create mode 100644 helm-chart/flink-job-cluster/values.yaml

diff --git a/README.md b/README.md
index e89952c4..eb645b39 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,7 @@ The operator is still under active development, there is no Helm chart available
 * [Run Apache Beam Python jobs](docs/beam_guide.md)
 * [Use GCS connector](images/flink/README.md)
 * [Test with Apache Kafka](docs/kafka_test_guide.md)
+* [Create Flink job clusters with Helm Chart](docs/flink_job_cluster_guide.md)
 
 ### Tech talks

diff --git a/docs/flink_job_cluster_guide.md b/docs/flink_job_cluster_guide.md
new file mode 100644
index 00000000..42bb3e0d
--- /dev/null
+++ b/docs/flink_job_cluster_guide.md
@@ -0,0 +1,136 @@
+# Create Flink job clusters with Helm Chart
+
+## Overview
+
+This Helm Chart is an addition to the [existing](https://github.com/GoogleCloudPlatform/flink-on-k8s-operator/tree/master/config/samples) way of deploying Flink job clusters.
+
+### Why use the Helm Chart?
+
+A typical Helm chart bundles, as templates, all of the manifests which you would otherwise apply manually with kubectl, along with a ```values.yaml``` file for quick management of user preferences, so managing all of these resources becomes a one-step process. Since Flink job clusters are currently deployed with just one YAML file, a Helm chart might seem unnecessary. However, as more components are added in the future, such as a ```PodMonitor``` or ```Services```, it becomes easier to manage those manifests from a central ```values.yaml```. Helm also supports various checks before and after deployment, so it integrates well with CI/CD pipelines.
+
+Some of these benefits are listed below:
+
+- Easy configuration, as you just have to configure or enable features in the ```values.yaml``` without much knowledge of the entire chart
+- You can use Helm operations such as ```helm install --dry-run``` to check for any errors before deployment
+- Automated rollback to a previous functioning release with the ```--atomic``` flag
+- Manual rollbacks to previous revisions are possible with ```helm rollback``` (see the sketch below)
+- Helm includes release versioning, which can be checked with the ```helm list``` command
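+
+As a quick illustration, the rollback workflow could look like the following minimal sketch, assuming the release is named ```flink-job-cluster``` as in the rest of this guide:
+
+```bash
+# list the recorded revisions of the release
+helm history flink-job-cluster --namespace=<namespace>
+# roll back to a previous revision, e.g. revision 1
+helm rollback flink-job-cluster 1 --namespace=<namespace>
+```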
+
+## Prerequisites
+
+- Flink Operator Image Version: ```gcr.io/flink-operator/flink-operator:v1beta1-6```; follow these [instructions](https://github.com/GoogleCloudPlatform/flink-on-k8s-operator/tree/master/helm-chart/flink-operator) to deploy the operator
+- Flink Image Version: ```flink:1.9.3``` or ```flink:latest```
+- [Helm](https://helm.sh/docs/helm/helm_install/) version 2.x or 3.x (the commands in this guide use the Helm 3 syntax)
+
+Optional:
+- [Prometheus-Operator](https://github.com/GoogleCloudPlatform/flink-on-k8s-operator/blob/master/docs/user_guide.md#monitoring-with-prometheus) to use the custom resource ```PodMonitor``` in order to scrape flink-job-cluster metrics
+
+## Installing the Chart
+
+Follow these instructions to install the Flink job cluster chart:
+
+1. Clone the repository to a local machine that has access to your running Kubernetes cluster.
+   ```bash
+   git clone https://github.com/GoogleCloudPlatform/flink-on-k8s-operator.git
+   ```
+2. Navigate to the following folder: ```/flink-on-k8s-operator/helm-chart```
+
+3. Use the following command to dry-run the Flink job cluster chart:
+   ```bash
+   helm install --dry-run --namespace=<namespace> flink-job-cluster ./flink-job-cluster -f ./flink-job-cluster/values.yaml
+   ```
+   The ```--dry-run``` flag renders the templated YAML files without installing anything, so it can be used to debug your chart. You'll be notified if there is any error in the chart configuration.
+
+4. Use the following command to install the Flink job cluster chart:
+   ```bash
+   helm install --namespace=<namespace> flink-job-cluster ./flink-job-cluster -f ./flink-job-cluster/values.yaml
+   ```
+
+   Afterwards, you should see the following output in your console:
+   ```bash
+   NAME: flink-job-cluster
+   LAST DEPLOYED: Tue Aug  4 10:39:10 2020
+   NAMESPACE: <namespace>
+   STATUS: deployed
+   REVISION: 1
+   TEST SUITE: None
+   ```
+
+***Note:*** the ```values.yaml``` in ```/flink-on-k8s-operator/helm-chart/flink-job-cluster/``` is just an example configuration. You can use your own values.yaml if you wish and edit the parts that you want to change. The current values.yaml has the minimum configuration required for the Flink job cluster to start successfully.
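+
+You can also override individual values on the command line instead of editing the file. A small sketch, assuming you want three task managers instead of the default two (```taskManager.replicas``` is one of the keys in this chart's values.yaml):
+
+```bash
+helm install --namespace=<namespace> flink-job-cluster ./flink-job-cluster \
+  -f ./flink-job-cluster/values.yaml --set taskManager.replicas=3
+```
+
+Values passed via ```--set``` take precedence over the values file.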
+
+## Uninstalling the Chart
+
+To uninstall your release:
+
+1. Use the following command to list the Flink job cluster release:
+   ```bash
+   helm list --namespace=<namespace>
+   ```
+2. Find your release name and delete it:
+   ```bash
+   helm delete <release-name> --namespace=<namespace>
+   ```
+
+## Check the Flink job cluster
+
+After using the helm command, the following resources will be deployed:
+
+- Flink job (1x)
+- Flink job manager (1x)
+- Flink task manager (2x)
+
+You can check the status of your deployment by using:
+```bash
+kubectl get deployments --namespace=<namespace>
+```
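+
+Since the chart creates a ```FlinkCluster``` custom resource, you can also inspect that resource directly. A minimal sketch, assuming the operator has registered the CRD under the plural name ```flinkclusters```:
+
+```bash
+# show the custom resource created by this chart
+kubectl get flinkclusters --namespace=<namespace>
+# inspect its detailed status, including job and component states
+kubectl describe flinkcluster flink-job-cluster --namespace=<namespace>
+```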
+
+## Flink Operator Releases
+
+You can check which images of the operator are available at [GoogleCloudPlatform](https://console.cloud.google.com/gcr/images/flink-operator/GLOBAL/flink-operator?gcrImageListsize=30).
+
+## Monitoring
+
+The Flink job cluster comes with a PodMonitor resource, which is the counterpart to a ServiceMonitor.
+The PodMonitor uses pod labels to configure Prometheus to scrape the Flink job cluster metrics. The reason for using a PodMonitor is simple: the Flink job cluster does not deploy any Services.
+
+You can use the following [dashboard](https://grafana.com/grafana/dashboards/10369) in your Grafana to monitor the flink-job-cluster.
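+
+To verify that metrics can be scraped, you can check that the PodMonitor exists and that the metrics port responds. A minimal sketch, assuming ```podMonitor.enabled: true``` in your values.yaml, the default ```prom``` container port 9249 from this chart, and that the Prometheus reporter serves its metrics under ```/metrics```:
+
+```bash
+# the PodMonitor resource created by this chart
+kubectl get podmonitors --namespace=<namespace>
+# fetch raw metrics from one of the cluster pods
+kubectl port-forward <taskmanager-pod> 9249:9249 --namespace=<namespace> &
+curl localhost:9249/metrics
+```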
+
+## Run Jobs with InitContainer
+
+You have the option to download job JARs directly into the Flink job cluster pods, to be executed as jobs.
+There is already an [example](https://github.com/GoogleCloudPlatform/flink-on-k8s-operator/blob/master/config/samples/flinkoperator_v1beta1_remotejobjar.yaml) of how to run the Flink job cluster with a remote job jar.
+
+## Run Jobs without InitContainer
+
+If you do not want to use a remote job jar, you can simply take a Flink image, e.g. ```flink:1.9.3```, and copy your built JAR file into it to create your custom Flink image. This way you can directly start the job without using an InitContainer. Just use your custom Flink image as ```image``` in the values.yaml, and make sure to set the correct path for the job to look for the JAR file.
+
+```yaml
+image:
+  repository: <registry>/<repository>
+  tag: 1.9.3
+
+job:
+  # the job will look for a JAR file at ./examples/streaming/WordCount.jar and execute it
+  # className has to be valid and present in the provided JAR file
+  jarFile: ./examples/streaming/WordCount.jar
+```
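+
+Building such a custom image only takes a few lines. A minimal sketch, where the JAR path and image name are illustrative placeholders rather than files in this repository:
+
+```bash
+# write a two-line Dockerfile that copies the job JAR into the Flink image
+cat > Dockerfile <<'EOF'
+FROM flink:1.9.3
+COPY target/my-flink-job.jar /JARFiles/my-flink-job.jar
+EOF
+# build and push the custom image referenced in the image section of values.yaml
+docker build -t <registry>/<repository>:1.9.3 .
+docker push <registry>/<repository>:1.9.3
+```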
+
+## Check Running Jobs
+
+You can check running jobs by using the following command:
+
+```bash
+kubectl get jobs -n <namespace>
+```
+
+## Updating Job
+
+1. Build your new/updated JAR file, which will be executed by the Flink job cluster
+2. Prepare a new custom Flink image which has your JAR file included, for example at ```/JARFiles/```
+3. Adjust the path for the JAR file in ```values.yaml``` at ```job.jarFile```, from "./examples/streaming/WordCount.jar" to "/JARFiles/<your-jar-file>"
+4. Upload your custom Flink image to your registry
+5. Specify your custom Flink image in the helm-chart ```values.yaml``` under the ```image``` section
+6. Navigate to ```/flink-on-k8s-operator/helm-chart``` and use the helm command to update your running Flink job cluster:
+   ```bash
+   helm upgrade --namespace=<namespace> flink-job-cluster ./flink-job-cluster -f ./flink-job-cluster/values.yaml
+   ```
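+
+After the upgrade, you can confirm that a new revision was rolled out. A small sketch, using the same release name as above:
+
+```bash
+# REVISION should have increased and STATUS should be "deployed"
+helm status flink-job-cluster --namespace=<namespace>
+# watch the job cluster pods restart with the new image
+kubectl get pods --namespace=<namespace> -w
+```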
+
+## Roadmap
+
+We are planning to extend the chart by adding the possibility to use [strimzi.io](https://strimzi.io/) KafkaUsers. This way, we can automatically spin up new KafkaUsers when deploying a Flink job cluster.
diff --git a/helm-chart/flink-job-cluster/.helmignore b/helm-chart/flink-job-cluster/.helmignore
new file mode 100644
index 00000000..06e1663c
--- /dev/null
+++ b/helm-chart/flink-job-cluster/.helmignore
@@ -0,0 +1,22 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+
diff --git a/helm-chart/flink-job-cluster/Chart.yaml b/helm-chart/flink-job-cluster/Chart.yaml
new file mode 100644
index 00000000..722973d5
--- /dev/null
+++ b/helm-chart/flink-job-cluster/Chart.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+appVersion: "1.0"
+description: flinkjobcluster deployment
+name: flink-job-cluster
+version: 0.1.51
+maintainers:
+- name: javier moreno molina
+  email: javier.moreno_molina@daimler.com
+- name: tayfun yalcin
+  email: tayfun.yalcin@daimler.com
+
diff --git a/helm-chart/flink-job-cluster/README.md b/helm-chart/flink-job-cluster/README.md
new file mode 100644
index 00000000..975ef6ac
--- /dev/null
+++ b/helm-chart/flink-job-cluster/README.md
@@ -0,0 +1,3 @@
+# Helm Chart for Flink Job Cluster
+
+Please follow the Flink job cluster Helm Chart [guide](../../docs/flink_job_cluster_guide.md) to deploy the helm chart.
diff --git a/helm-chart/flink-job-cluster/templates/_helpers.tpl b/helm-chart/flink-job-cluster/templates/_helpers.tpl
new file mode 100644
index 00000000..aed4ca9f
--- /dev/null
+++ b/helm-chart/flink-job-cluster/templates/_helpers.tpl
@@ -0,0 +1,33 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "flink-job-cluster.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If the release name contains the chart name, it will be used as the full name.
+*/}}
+{{- define "flink-job-cluster.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "flink-job-cluster.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
diff --git a/helm-chart/flink-job-cluster/templates/flinkjobcluster.yaml b/helm-chart/flink-job-cluster/templates/flinkjobcluster.yaml
new file mode 100644
index 00000000..c5beae74
--- /dev/null
+++ b/helm-chart/flink-job-cluster/templates/flinkjobcluster.yaml
@@ -0,0 +1,93 @@
+apiVersion: flinkoperator.k8s.io/v1beta1
+kind: FlinkCluster
+metadata:
+  name: {{ template "flink-job-cluster.fullname" . }}
+  labels:
+    app: {{ template "flink-job-cluster.name" . }}
+    chart: {{ template "flink-job-cluster.chart" . }}
+    release: {{ .Release.Name }}
+spec:
+  image:
+    name: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+  envVars:
+{{ toYaml .Values.envVars | indent 4 }}
+  jobManager:
+    accessScope: {{ .Values.jobManager.accessScope }}
+    ports:
+      ui: {{ .Values.jobManager.ports.ui }}
+{{- if .Values.jobManager.volumes }}
+    volumes:
+{{ toYaml .Values.jobManager.volumes | indent 6 }}
+{{- end }}
+{{- if .Values.jobManager.volumeMounts }}
+    volumeMounts:
+{{ toYaml .Values.jobManager.volumeMounts | indent 6 }}
+{{- end }}
+{{- if .Values.jobManager.metrics.enabled }}
+    extraPorts:
+{{ toYaml .Values.jobManager.metrics.extraPorts | indent 6 }}
+{{- end }}
+    resources:
+{{ toYaml .Values.jobManager.resources | indent 6 }}
+    podAnnotations:
+{{- with .Values.podAnnotations }}
+      {{- toYaml . | nindent 6 }}
+{{- end }}
+  taskManager:
+    replicas: {{ .Values.taskManager.replicas }}
+{{- if .Values.taskManager.volumes }}
+    volumes:
+{{ toYaml .Values.taskManager.volumes | indent 6 }}
+{{- end }}
+{{- if .Values.taskManager.volumeMounts }}
+    volumeMounts:
+{{ toYaml .Values.taskManager.volumeMounts | indent 6 }}
+{{- end }}
+{{- if .Values.taskManager.metrics.enabled }}
+    extraPorts:
+{{ toYaml .Values.taskManager.metrics.extraPorts | indent 6 }}
+{{- end }}
+    resources:
+{{ toYaml .Values.taskManager.resources | indent 6 }}
+    podAnnotations:
+{{- with .Values.podAnnotations }}
+      {{- toYaml . | nindent 6 }}
+{{- end }}
+  job:
+    jarFile: {{ .Values.job.jarFile }}
+    className: {{ .Values.job.className }}
+    args:
+{{ toYaml .Values.job.args | indent 6 }}
+    parallelism: {{ .Values.job.parallelism }}
+    restartPolicy: {{ .Values.job.restartPolicy }}
+{{- if .Values.job.volumes }}
+    volumes:
+{{ toYaml .Values.job.volumes | indent 6 }}
+{{- end }}
+{{- if .Values.job.volumeMounts }}
+    volumeMounts:
+{{ toYaml .Values.job.volumeMounts | indent 6 }}
+{{- end }}
+  {{- if .Values.job.initContainers.enabled }}
+    initContainers:
+      - name: {{ .Chart.Name }}-python-blob-downloader
+        image: "{{ .Values.job.initContainers.image }}:{{ .Values.job.initContainers.tag }}"
+        command: ["/bin/sh","-c","--"]
+        args: ["/app/exec-python.sh"]
+        env:
+          - name: STORAGE_CONNECTION_STRING
+            valueFrom:
+              secretKeyRef:
+                name: {{ .Values.job.initContainers.Storage.secretName }}
+                key: {{ .Values.job.initContainers.Storage.secretNameKey }}
+          - name: CONTAINER_NAME
+            value: {{ .Values.job.initContainers.Storage.containerName }}
+          - name: BLOB_NAME
+            value: {{ .Values.job.initContainers.Storage.blobName }}
+  {{- end }}
+    podAnnotations:
+{{- with .Values.podAnnotations }}
+      {{- toYaml . | nindent 6 }}
+{{- end }}
+  flinkProperties:
+{{ toYaml .Values.flinkProperties | indent 4 }}
+
diff --git a/helm-chart/flink-job-cluster/templates/podmonitor.yaml b/helm-chart/flink-job-cluster/templates/podmonitor.yaml
new file mode 100644
index 00000000..461c1def
--- /dev/null
+++ b/helm-chart/flink-job-cluster/templates/podmonitor.yaml
@@ -0,0 +1,19 @@
+{{- if .Values.podMonitor.enabled -}}
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: {{ template "flink-job-cluster.fullname" . }}
+{{- if .Values.podMonitor.podMonitorSelectorLabels }}
+  labels:
+{{ toYaml .Values.podMonitor.podMonitorSelectorLabels | indent 4 }}
+{{- end }}
+spec:
+  podTargetLabels:
+{{ toYaml .Values.podMonitor.podTargetLabels | indent 4 }}
+  selector:
+    matchLabels:
+      app: flink
+  podMetricsEndpoints:
+{{ toYaml .Values.podMonitor.podMetricsEndpoints | indent 4 }}
+{{- end -}}
+
diff --git a/helm-chart/flink-job-cluster/values.yaml b/helm-chart/flink-job-cluster/values.yaml
new file mode 100644
index 00000000..be38b3c7
--- /dev/null
+++ b/helm-chart/flink-job-cluster/values.yaml
@@ -0,0 +1,113 @@
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+image:
+  repository: flink
+  tag: 1.9.3
+
+# Prometheus reporter jar to be loaded by Flink
+envVars:
+  - name: HADOOP_CLASSPATH
+    value: /opt/flink/opt/flink-metrics-prometheus-1.9.3.jar
+
+jobManager:
+  accessScope: Cluster
+  ports:
+    ui: 8081
+
+  # enable metrics ports for the jobManager
+  metrics:
+    enabled: true
+    extraPorts:
+      - name: prom
+        containerPort: 9249
+
+  resources:
+    limits:
+      memory: 1024Mi
+      cpu: 200m
+
+taskManager:
+  replicas: 2
+
+  volumeMounts: {}
+
+  # enable metrics ports for the taskManager
+  metrics:
+    enabled: true
+    extraPorts:
+      - name: prom
+        containerPort: 9249
+        protocol: TCP
+
+  resources:
+    limits:
+      memory: 1024Mi
+      cpu: "200m"
+
+job:
+  # the job will look for a JAR file at ./examples/streaming/WordCount.jar and execute it
+  # className has to be valid and present in the provided JAR file
+  jarFile: ./examples/streaming/WordCount.jar
+  className: org.apache.flink.streaming.examples.wordcount.WordCount
+  args: ["--input", "./README.txt"]
+  parallelism: 2
+  restartPolicy: Never
+
+  # Mount an emptyDir so the InitContainer can store its JAR file in the specific path for the job to execute; only needed when the initContainer is enabled.
+  #volumes:
+  #  - name: properties
+  #    emptyDir: {}
+  volumeMounts: {}
+
+  # The InitContainer is used to download a remote job jar to your job pod.
+  # It is only needed if you have no other way to download your job files into the Flink job cluster.
+  initContainers:
+    enabled: false
+#    image: <registry>/<repository>
+#    tag: "1.0"
+#    command: ["/bin/sh","-c","--"]
+#    args: ["/app/exec-python.sh"]
+
+# You can use the following setup to download the remote jar from e.g. a blob-storage. The below fields then have to be adjusted according to your blob-storage.
+
+#    Storage:
+      # Provide the secret name, in which the storage connection-string is stored
+#      secretName: storage-connectstr
+#      secretNameKey: connectstr
+      # Provide the container name, from which a blob should be downloaded by the InitContainer
+#      containerName: snapshot
+      # Provide the blob name, which should be downloaded from the container
+#      blobName:
+
+flinkProperties:
+  taskmanager.numberOfTaskSlots: "1"
+
+  # metrics reporter "PrometheusReporter"
+  # visit https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/metrics.html#prometheus-orgapacheflinkmetricsprometheusprometheusreporter
+  # for more information
+  metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
+
+## Extra annotations to be added to the pods
+podAnnotations:
+  fluentbit.io/parser: foo
+
+## Enable podMonitor for metrics - you need the Prometheus-Operator and its CRDs up and running in order to use the PodMonitor.
+podMonitor:
+  enabled: false
+  podTargetLabels:
+    - cluster
+    - component
+
+  # include the podMonitorSelectorLabel which you have set in your prometheus-operator
+  # set podMonitorSelectorLabels {} if your prometheus-operator is set to collect all podMonitors
+  podMonitorSelectorLabels:
+    prometheus: cluster-metrics
+
+  selector:
+    matchLabels:
+      app: flink
+
+  podMetricsEndpoints:
+    - port: prom