Cilium Cluster Mesh upgrade (ci-clustermesh) #5558
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Cilium Cluster Mesh upgrade (ci-clustermesh)

# Keep the concurrency group in sync with any change made to these triggers.
on:
  # Dispatched run targeting a specific pull request.
  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
        required: false
        default: '{}'
  # Pushes to the v1.15 branch family; changes touching only the
  # Documentation tree do not trigger the workflow.
  push:
    branches:
      - v1.15
      - ft/v1.15/**
      - 'renovate/v1.15-**'
    paths-ignore:
      - 'Documentation/**'
# Granting access to any scope implicitly sets every scope that is not
# listed here to 'none'.
permissions:
  # Needed by actions/checkout to read the repository
  contents: read
  # Needed to query information through the PR API
  pull-requests: read
  # Needed to publish commit statuses
  statuses: write
concurrency:
  # The group name is composed of:
  #   1. the workflow name
  #   2. the event type
  #   3. an identifier that depends on the event type:
  #        - push: the commit SHA
  #        - workflow_dispatch: the PR number
  #
  # Each kind of test run thus gets its own group, and a re-run cancels the
  # previous run belonging to the same group.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'push' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
# Workflow-wide environment shared by all jobs.
env:
  # Left without a value here; forwarded as `ci-version` to the Cilium CLI
  # installation step.
  cilium_cli_ci_version:
  # Names of the two Kind clusters that get meshed together ...
  clusterName1: cluster1
  clusterName2: cluster2
  # ... and the kubeconfig contexts used to address them.
  contextName1: kind-cluster1
  contextName2: kind-cluster2
jobs: | |
# Prints the workflow_dispatch inputs to the job log for troubleshooting.
echo-inputs:
  if: ${{ github.event_name == 'workflow_dispatch' }}
  name: Echo Workflow Dispatch Inputs
  runs-on: ubuntu-24.04
  steps:
    - name: Echo Workflow Dispatch Inputs
      env:
        # The inputs (extra-args in particular) carry arbitrary text, so they
        # are handed over through the environment rather than being expanded
        # into the script source, where a quote character could terminate the
        # string and inject shell commands.
        INPUTS_JSON: ${{ toJSON(inputs) }}
      run: |
        printf '%s\n' "${INPUTS_JSON}"
# Publishes the initial commit status for the SHA under test.
commit-status-start:
  name: Commit Status Start
  runs-on: ubuntu-latest
  steps:
    - name: Set initial commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Dispatch input when present, triggering commit otherwise.
        sha: ${{ inputs.SHA || github.sha }}
# Installs, upgrades and downgrades Cilium across two meshed Kind clusters,
# running connectivity tests after each transition.
upgrade-and-downgrade:
  name: "Upgrade and Downgrade Test"
  runs-on: ${{ vars.GH_RUNNER_EXTRA_POWER_UBUNTU_LATEST || 'ubuntu-latest' }}
  timeout-minutes: 60
  env:
    # Prefix of the generated JUnit report file names.
    job_name: "Installation and Connectivity Test"
  strategy:
    # Let the remaining combinations complete even when one of them fails.
    fail-fast: false
    matrix:
      include:
        - name: '1'
          encryption: 'disabled'
          kube-proxy: 'iptables'
          external-kvstore: false
          cm-auth-mode: 'legacy'

        - name: '2'
          encryption: 'disabled'
          kube-proxy: 'none'
          external-kvstore: false
          cm-auth-mode: 'migration'

        # IPsec currently requires the host endpoint to be regenerated
        # synchronously to guarantee ordering (#25735). As that operation is
        # blocking, the agents cannot wait long for full clustermesh
        # synchronization, or they would never become ready (and no new pod
        # could be scheduled). The resulting timeout is too short to cover
        # the startup of a new clustermesh-apiserver replica, hence
        # cross-cluster connections would drop during upgrades/downgrades.
        # It is still sufficient for agent restarts when all remote clusters
        # are ready, and when connecting to an external kvstore, which is the
        # setup used by this combination.
        - name: '3'
          encryption: 'ipsec'
          kube-proxy: 'iptables'
          external-kvstore: true
          cm-auth-mode: 'legacy'

        - name: '4'
          encryption: 'wireguard'
          kube-proxy: 'iptables'
          external-kvstore: false
          cm-auth-mode: 'cluster'
  steps:
# Trusted checkout: the dispatch context ref, or the triggering commit.
- name: Checkout context ref (trusted)
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    persist-credentials: false
    ref: ${{ inputs.context-ref || github.sha }}

# Repository-local composite action.
- name: Set Environment Variables
  uses: ./.github/actions/set-env-variables
# Produces the settings (outputs `sha` and `cilium_install_defaults`, consumed
# by later steps) for the version under test.
- name: Set up newest settings
  id: newest-vars
  uses: ./.github/actions/helm-default
  with:
    # workflow_dispatch inputs are empty on push events, so fall back to the
    # triggering commit, exactly as done for the commit status SHA.
    image-tag: ${{ inputs.SHA || github.sha }}
    chart-dir: ./untrusted/cilium-newest/install/kubernetes/cilium
# Computes the downgrade version plus the shared installation and
# connectivity test flags, and exposes them as step outputs.
- name: Set up job variables
  id: vars
  run: |
    CILIUM_DOWNGRADE_VERSION=$(contrib/scripts/print-downgrade-version.sh stable)
    echo "downgrade_version=${CILIUM_DOWNGRADE_VERSION}" >> $GITHUB_OUTPUT

    # Notes on the installation flags:
    # * Monitor aggregation is medium, to avoid the performance penalty
    #   caused by the relatively high traffic load of the test environment.
    # * The IPAM mode is set explicitly, so that --reset-values does not
    #   revert it to the default on upgrade/downgrade.
    # * The sync timeout is raised, leaving the clustermesh-apiserver enough
    #   time to restart after the upgrade/downgrade before the agents
    #   regenerate the endpoints.
    # * At most one agent may be unavailable at a time, to slow down the
    #   rollout and surface connection disruptions happening meanwhile.
    CILIUM_INSTALL_DEFAULTS=" \
      --set=debug.enabled=true \
      --set=bpf.monitorAggregation=medium \
      --set=hubble.enabled=true \
      --set=routingMode=tunnel \
      --set=tunnelProtocol=vxlan \
      --set=ipv4.enabled=true \
      --set=ipv6.enabled=true \
      --set=kubeProxyReplacement=${{ matrix.kube-proxy == 'none' }} \
      --set=bpf.masquerade=${{ matrix.kube-proxy == 'none' }} \
      --set=ipam.mode=kubernetes \
      --set=operator.replicas=1 \
      --set=updateStrategy.rollingUpdate.maxUnavailable=1 \
      --set=clustermesh.useAPIServer=${{ !matrix.external-kvstore }} \
      --set=clustermesh.config.enabled=true \
      --set=extraConfig.clustermesh-ip-identities-sync-timeout=10m \
      --set=clustermesh.apiserver.tls.authMode=${{ matrix.cm-auth-mode }} \
    "

    # Only a reduced set of tests is executed to keep the run time down;
    # conformance-clustermesh runs the complete suite.
    CONNECTIVITY_TEST_DEFAULTS=" \
      --hubble=false \
      --flow-validation=disabled \
      --test='no-interrupted-connections' \
      --test='no-unexpected-packet-drops' \
      --test='no-policies/' \
      --test='no-policies-extra/' \
      --test='allow-all-except-world/' \
      --test='client-ingress/' \
      --test='client-egress/' \
      --test='cluster-entity-multi-cluster/' \
      --test='!/pod-to-world' \
      --test='!/pod-to-cidr' \
      --collect-sysdump-on-failure"

    # Encryption flags are appended only for the combinations enabling it.
    CILIUM_INSTALL_ENCRYPTION=""
    if [ "${{ matrix.encryption }}" != "disabled" ]; then
      CILIUM_INSTALL_ENCRYPTION=" \
        --set=encryption.enabled=true \
        --set=encryption.type=${{ matrix.encryption }}"
    fi

    echo "cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} ${CILIUM_INSTALL_ENCRYPTION}" >> $GITHUB_OUTPUT
    echo "connectivity_test_defaults=${CONNECTIVITY_TEST_DEFAULTS}" >> $GITHUB_OUTPUT
# Installs the cilium CLI used by all the following steps.
- name: Install Cilium CLI
  uses: cilium/cilium-cli@c52e8c38e6d6235bd8e6e961199a984275547d6f # v0.16.22
  with:
    ci-version: ${{ env.cilium_cli_ci_version }}
    release-version: ${{ env.CILIUM_CLI_VERSION }}
    repository: ${{ env.CILIUM_CLI_RELEASE_REPO }}
# Renders one Kind configuration per cluster from the shared template; the
# two clusters use distinct dual-stack pod and service CIDRs.
- name: Generate Kind configuration files
  run: |
    # cluster1
    PODCIDR=10.242.0.0/16,fd00:10:242::/48 \
      SVCCIDR=10.243.0.0/16,fd00:10:243::/112 \
      IPFAMILY=dual \
      KUBEPROXYMODE=${{ matrix.kube-proxy }} \
      envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster1.yaml

    # cluster2
    PODCIDR=10.244.0.0/16,fd00:10:244::/48 \
      SVCCIDR=10.245.0.0/16,fd00:10:245::/112 \
      IPFAMILY=dual \
      KUBEPROXYMODE=${{ matrix.kube-proxy }} \
      envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster2.yaml
# Both clusters are created without waiting for readiness: lacking a CNI,
# the control-plane never becomes ready at this stage.
- name: Create Kind cluster 1
  uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
  with:
    cluster_name: ${{ env.clusterName1 }}
    config: ./.github/kind-config-cluster1.yaml
    version: ${{ env.KIND_VERSION }}
    node_image: ${{ env.KIND_K8S_IMAGE }}
    kubectl_version: ${{ env.KIND_K8S_VERSION }}
    wait: 0  # No CNI is present yet, so the control-plane never gets ready

- name: Create Kind cluster 2
  uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
  with:
    cluster_name: ${{ env.clusterName2 }}
    config: ./.github/kind-config-cluster2.yaml
    version: ${{ env.KIND_VERSION }}
    node_image: ${{ env.KIND_K8S_IMAGE }}
    kubectl_version: ${{ env.KIND_K8S_VERSION }}
    wait: 0  # No CNI is present yet, so the control-plane never gets ready
# Force coredns to use IPv4-only upstream DNS servers even in dual-stack
# clusters: the IPv6 ones are unreachable and lead to spurious failures.
# This also works around #23283.
- name: Configure the coredns nameservers
  run: |
    COREDNS_PATCH="
    spec:
      template:
        spec:
          dnsPolicy: None
          dnsConfig:
            nameservers:
            - 8.8.4.4
            - 8.8.8.8
    "

    # Apply the same patch to the coredns deployment of both clusters.
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      kubectl --context $ctx patch deployment -n kube-system coredns --patch="$COREDNS_PATCH"
    done
# The key is generated once and stored in both clusters, so that they share
# the same IPsec secret.
- name: Create the IPSec secret in both clusters
  if: matrix.encryption == 'ipsec'
  run: |
    SECRET="3 rfc4106(gcm(aes)) $(openssl rand -hex 20) 128"
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      kubectl --context $ctx create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"
    done
# External kvstore combinations only: start the kvstore instances through
# the repository-local action ...
- name: Start kvstore clusters
  id: kvstore
  if: matrix.external-kvstore
  uses: ./.github/actions/kvstore
  with:
    clusters: 2

# ... and create, in both clusters, the secret from the path it outputs.
- name: Create the secret containing the kvstore credentials
  if: matrix.external-kvstore
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      kubectl --context $ctx create -n kube-system -f ${{ steps.kvstore.outputs.cilium_etcd_secrets_path }}
    done
# Builds the per-cluster installation flags required to mesh the clusters.
- name: Set clustermesh connection parameters
  id: clustermesh-vars
  run: |
    # The meshing parameters are computed upfront rather than configured
    # through the CLI in a later step, since they would otherwise be lost
    # on upgrade (values are reset).

    # The NodePorts are set explicitly so that they differ between the two
    # clusters, working around #24692
    PORT1=32379
    PORT2=32380

    CILIUM_INSTALL_CLUSTER1=" \
      --set cluster.name=${{ env.clusterName1 }} \
      --set cluster.id=1 \
      --set clustermesh.apiserver.service.nodePort=$PORT1 \
    "

    CILIUM_INSTALL_CLUSTER2=" \
      --set cluster.name=${{ env.clusterName2 }} \
      --set cluster.id=255 \
      --set clustermesh.apiserver.service.nodePort=$PORT2 \
    "

    CILIUM_INSTALL_COMMON=" \
      --set clustermesh.config.clusters[0].name=${{ env.clusterName1 }} \
      --set clustermesh.config.clusters[1].name=${{ env.clusterName2 }} \
    "

    if [ "${{ matrix.external-kvstore }}" == "true" ]; then
      # External kvstore: reuse the flags produced by the kvstore step.
      CILIUM_INSTALL_COMMON="$CILIUM_INSTALL_COMMON \
        ${{ steps.kvstore.outputs.cilium_install_clustermesh }}"
    else
      # Otherwise target each cluster's worker node (sixth column of the
      # wide node listing) on the NodePort chosen above.
      IP1=$(kubectl --context ${{ env.contextName1 }} get nodes \
        ${{ env.clusterName1 }}-worker -o wide --no-headers | awk '{ print $6 }')
      IP2=$(kubectl --context ${{ env.contextName2 }} get nodes \
        ${{ env.clusterName2 }}-worker -o wide --no-headers | awk '{ print $6 }')

      CILIUM_INSTALL_COMMON="$CILIUM_INSTALL_COMMON \
        --set clustermesh.config.clusters[0].ips={$IP1} \
        --set clustermesh.config.clusters[0].port=$PORT1 \
        --set clustermesh.config.clusters[1].ips={$IP2} \
        --set clustermesh.config.clusters[1].port=$PORT2 \
      "
    fi

    echo cilium_install_cluster1="$CILIUM_INSTALL_CLUSTER1 $CILIUM_INSTALL_COMMON" >> $GITHUB_OUTPUT
    echo cilium_install_cluster2="$CILIUM_INSTALL_CLUSTER2 $CILIUM_INSTALL_COMMON" >> $GITHUB_OUTPUT
# Warning: this workflow is privileged, hence none of the following job
# steps may execute untrusted code.
- name: Checkout pull request branch (NOT TRUSTED)
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    path: untrusted/cilium-newest
    ref: ${{ steps.newest-vars.outputs.sha }}
    persist-credentials: false
    # Only the Helm chart is checked out.
    sparse-checkout: |
      install/kubernetes/cilium

- name: Checkout ${{ steps.vars.outputs.downgrade_version }} branch
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    path: untrusted/cilium-downgrade
    ref: ${{ steps.vars.outputs.downgrade_version }}
    persist-credentials: false
    # Only the Helm chart is checked out.
    sparse-checkout: |
      install/kubernetes/cilium
# Pins chart directory and image overrides of the downgrade version to the
# commit that was checked out into untrusted/cilium-downgrade.
- name: Set up downgrade settings
  id: downgrade-vars
  run: |
    SHA="$(cd untrusted/cilium-downgrade && git rev-parse HEAD)"

    CILIUM_IMAGE_SETTINGS=" \
      --chart-directory=./untrusted/cilium-downgrade/install/kubernetes/cilium \
      --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
      --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${SHA} \
      --set=clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${SHA} \
      --set=clustermesh.apiserver.kvstoremesh.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/kvstoremesh-ci:${SHA} \
    "

    echo "sha=${SHA}" >> $GITHUB_OUTPUT
    echo "cilium_image_settings=${CILIUM_IMAGE_SETTINGS}" >> $GITHUB_OUTPUT
# Poll the registry every 45 seconds until each required image exists for
# the given tag; the step timeout bounds the total wait.
- name: Wait for images to be available (newest)
  shell: bash
  timeout-minutes: 10
  run: |
    for image in cilium-ci operator-generic-ci clustermesh-apiserver-ci ; do
      until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.newest-vars.outputs.sha }} &> /dev/null; do sleep 45s; done
    done

- name: Wait for images to be available (downgrade)
  shell: bash
  timeout-minutes: 10
  run: |
    for image in cilium-ci operator-generic-ci clustermesh-apiserver-ci ; do
      until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.downgrade-vars.outputs.sha }} &> /dev/null; do sleep 45s; done
    done
# Installs the downgrade version of Cilium in cluster1. The step id is
# referenced by the post-test information gathering condition.
- name: Install Cilium in cluster1
  id: install-cilium-cluster1
  env:
    KVSTORE_ID: 1
  run: |
    # We enable the clustermesh-apiserver (although with zero replicas)
    # also when actually connecting to an external kvstore. This is a
    # workaround to enable creating the clustermesh configuration secret,
    # that is otherwise skipped in Cilium v1.14 (and earlier).
    #
    # The last flag deliberately carries no trailing line continuation, so
    # that a line appended later is not silently merged into this command.
    cilium --context ${{ env.contextName1 }} install \
      ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }} \
      --set=clustermesh.useAPIServer=true \
      --set=clustermesh.apiserver.replicas=${{ matrix.external-kvstore && '0' || '1' }}
# Without an external kvstore, cluster2 reuses the CA secret of cluster1.
- name: Copy the Cilium CA secret to cluster2, as they must match
  if: ${{ !matrix.external-kvstore }}
  run: |
    kubectl --context ${{ env.contextName1 }} get secret -n kube-system cilium-ca -o yaml |
      kubectl --context ${{ env.contextName2 }} create -f -

# cluster2 is installed directly with the newest settings.
- name: Install Cilium in cluster2
  env:
    KVSTORE_ID: 2
  run: |
    cilium --context ${{ env.contextName2 }} install \
      ${{ steps.newest-vars.outputs.cilium_install_defaults }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster2 }}
# Block until Cilium, and then the cluster mesh, report ready in both
# clusters.
- name: Wait for cluster mesh status to be ready
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx status --wait
    done
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx clustermesh status --wait --wait-duration=5m
    done

# Destination directory of the JUnit reports written by the tests.
- name: Make JUnit report directory
  run: |
    mkdir -p cilium-junits
# Baseline connectivity test, executed before any upgrade takes place.
- name: Run connectivity test - pre-upgrade (${{ join(matrix.*, ', ') }})
  run: |
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      ${{ steps.vars.outputs.connectivity_test_defaults }} \
      --junit-file "cilium-junits/${{ env.job_name }} - pre-upgrade (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests pre-upgrade (${{ join(matrix.*, ', ') }})"

    # Deploy the pods holding long lived connections. Later connectivity
    # tests run with --include-conn-disrupt-test rely on them to detect any
    # interruption of those flows.
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} --hubble=false \
      --include-conn-disrupt-test --conn-disrupt-test-setup \
      --conn-disrupt-dispatch-interval 0ms
# Moves cluster1 to the newest settings, additionally enabling kvstoremesh.
- name: Upgrade Cilium in cluster1 and enable kvstoremesh
  env:
    KVSTORE_ID: 1
  run: |
    cilium --context ${{ env.contextName1 }} upgrade --reset-values \
      ${{ steps.newest-vars.outputs.cilium_install_defaults }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }} \
      --set clustermesh.apiserver.kvstoremesh.enabled=true

- name: Rollout Cilium agents in cluster2
  if: ${{ !matrix.external-kvstore }}
  run: |
    # Restarting the remote agents makes them reconnect to the new
    # clustermesh-apiserver instance right away, rather than waiting for
    # the watchdog mechanism to kick in.
    kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

# Same readiness checks as after installation, with a 10 minutes budget
# for the Cilium status.
- name: Wait for cluster mesh status to be ready
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx status --wait --wait-duration=10m
    done
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx clustermesh status --wait --wait-duration=5m
    done
# Exposes, as the `namespace` step output, the namespace used by the
# connectivity test pods; the troubleshooting steps read it.
- name: Set cilium connectivity test namespace
  id: cilium-cli
  run: |
    # First namespace, in sorted order, carrying the
    # app.kubernetes.io/name=cilium-cli label. The context is given
    # explicitly, like for every other kubectl invocation of this job,
    # instead of depending on whichever context happens to be current.
    NAMESPACE=$(kubectl --context ${{ env.contextName1 }} get namespace -l "app.kubernetes.io/name=cilium-cli" -o name | sort | cut -d / -f 2 | head -1)
    echo namespace="$NAMESPACE" >> "$GITHUB_OUTPUT"
# Dumps state and logs of the long lived connection pods
# (kind=test-conn-disrupt) from both clusters into the job log.
- name: Gather additional troubleshooting information
  run: |
    kubectl --context ${{ env.contextName1 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
    kubectl --context ${{ env.contextName2 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
    kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
    kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
    # Logs of previously terminated containers are collected from both
    # clusters as well; --ignore-errors is passed because pods that never
    # restarted have no previous container to read from.
    kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
    kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
# Connectivity test after the upgrade, also verifying that the long lived
# connections were not disrupted.
- name: Run connectivity test - post-upgrade (${{ join(matrix.*, ', ') }})
  run: |
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      ${{ steps.vars.outputs.connectivity_test_defaults }} \
      --include-conn-disrupt-test \
      --junit-file "cilium-junits/${{ env.job_name }} - post upgrade (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests post-upgrade (${{ join(matrix.*, ', ') }})"

    # Deploy the pods holding long lived connections. Later connectivity
    # tests run with --include-conn-disrupt-test rely on them to detect any
    # interruption of those flows.
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} --hubble=false \
      --include-conn-disrupt-test --conn-disrupt-test-setup \
      --conn-disrupt-dispatch-interval 0ms
# Additional "stress" scenario: the clustermesh-apiservers of both clusters
# are scaled down to zero replicas and all agents are restarted; existing
# connections are expected to survive. The exception is Cilium handling
# NodePort traffic: restarting the clustermesh-apiserver pods of both
# clusters at the same time, after all agents were rolled out, can result in
# a circular dependency (#30156).
- name: Scale the clustermesh-apiserver replicas to 0
  if: ${{ !matrix.external-kvstore }}
  run: |
    kubectl --context ${{ env.contextName1 }} scale -n kube-system deploy/clustermesh-apiserver --replicas 0

    # cluster2 is scaled down only when kube-proxy is in use.
    if [ "${{ matrix.kube-proxy }}" != "none" ]; then
      kubectl --context ${{ env.contextName2 }} scale -n kube-system deploy/clustermesh-apiserver --replicas 0
    fi

- name: Rollout Cilium agents in both clusters
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      kubectl --context $ctx rollout restart -n kube-system ds/cilium
    done

    # The replicas are scaled up again only once every agent restarted
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      kubectl --context $ctx rollout status -n kube-system ds/cilium --timeout=5m
    done

- name: Scale the clustermesh-apiserver replicas back to 1
  if: ${{ !matrix.external-kvstore }}
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      kubectl --context $ctx scale -n kube-system deploy/clustermesh-apiserver --replicas 1
    done

- name: Wait for cluster mesh status to be ready
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx status --wait
    done
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx clustermesh status --wait --wait-duration=5m
    done
# Dumps state and logs of the long lived connection pods
# (kind=test-conn-disrupt) from both clusters into the job log.
- name: Gather additional troubleshooting information
  run: |
    kubectl --context ${{ env.contextName1 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
    kubectl --context ${{ env.contextName2 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
    kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
    kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
    # Logs of previously terminated containers are collected from both
    # clusters as well; --ignore-errors is passed because pods that never
    # restarted have no previous container to read from.
    kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
    kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
# Verifies the outcome of the stress scenario, then re-creates the long
# lived connection pods for the checks that follow.
- name: Run connectivity test - stress-test (${{ join(matrix.*, ', ') }})
  run: |
    # Only check that no long living connection was disrupted
    # (the github_job_step property previously read "stess-test").
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      --hubble=false \
      --flow-validation=disabled \
      --test='no-interrupted-connections' \
      --test='no-unexpected-packet-drops' \
      --include-conn-disrupt-test \
      --junit-file "cilium-junits/${{ env.job_name }} - stress test (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests stress-test (${{ join(matrix.*, ', ') }})"

    # Create pods which establish long lived connections. They will be used by
    # subsequent connectivity tests with --include-conn-disrupt-test to catch any
    # interruption in such flows.
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} --hubble=false \
      --include-conn-disrupt-test --conn-disrupt-test-setup \
      --conn-disrupt-dispatch-interval 0ms
# Brings cluster1 back to the downgrade version, using the same image
# settings as the initial installation (kvstoremesh is not enabled).
- name: Downgrade Cilium in cluster1 and disable kvstoremesh
  env:
    KVSTORE_ID: 1
  run: |
    # We enable the clustermesh-apiserver (although with zero replicas)
    # also when actually connecting to an external kvstore. This is a
    # workaround to enable creating the clustermesh configuration secret,
    # that is otherwise skipped in Cilium v1.14 (and earlier).
    #
    # The last flag deliberately carries no trailing line continuation, so
    # that a line appended later is not silently merged into this command.
    cilium --context ${{ env.contextName1 }} upgrade --reset-values \
      ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
      ${{ steps.vars.outputs.cilium_install_defaults }} \
      ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
      ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }} \
      --set=clustermesh.useAPIServer=true \
      --set=clustermesh.apiserver.replicas=${{ matrix.external-kvstore && '0' || '1' }}
- name: Rollout Cilium agents in cluster2
  if: ${{ !matrix.external-kvstore }}
  run: |
    # Restarting the remote agents makes them reconnect to the new
    # clustermesh-apiserver instance right away, rather than waiting for
    # the watchdog mechanism to kick in.
    kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

# Block until Cilium, and then the cluster mesh, report ready in both
# clusters.
- name: Wait for cluster mesh status to be ready
  run: |
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx status --wait
    done
    for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
      cilium --context $ctx clustermesh status --wait --wait-duration=5m
    done
# Dumps state and logs of the long lived connection pods
# (kind=test-conn-disrupt) from both clusters into the job log.
- name: Gather additional troubleshooting information
  run: |
    kubectl --context ${{ env.contextName1 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
    kubectl --context ${{ env.contextName2 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
    kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
    kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
    # Logs of previously terminated containers are collected from both
    # clusters as well; --ignore-errors is passed because pods that never
    # restarted have no previous container to read from.
    kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
    kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
# Final connectivity test after the downgrade, also verifying that the long
# lived connections were not disrupted.
- name: Run connectivity test - post-downgrade (${{ join(matrix.*, ', ') }})
  run: |
    cilium --context ${{ env.contextName1 }} connectivity test \
      --multi-cluster=${{ env.contextName2 }} \
      ${{ steps.vars.outputs.connectivity_test_defaults }} \
      --include-conn-disrupt-test \
      --junit-file "cilium-junits/${{ env.job_name }} - post downgrade (${{ join(matrix.*, ', ') }}).xml" \
      --junit-property github_job_step="Run tests post-downgrade (${{ join(matrix.*, ', ') }})"
# Runs whenever the job did not succeed, provided that the installation in
# cluster1 was at least attempted: collects status, pod listings, sysdumps
# and, for external kvstore combinations, the kvstore container logs.
- name: Post-test information gathering
  if: ${{ !success() && steps.install-cilium-cluster1.outcome != 'skipped' }}
  shell: bash {0}  # Fail-fast is disabled, so that every command runs regardless of the others
  run: |
    cilium --context ${{ env.contextName1 }} status
    cilium --context ${{ env.contextName1 }} clustermesh status
    cilium --context ${{ env.contextName2 }} status
    cilium --context ${{ env.contextName2 }} clustermesh status

    kubectl config use-context ${{ env.contextName1 }}
    kubectl get pods --all-namespaces -o wide
    cilium sysdump --output-filename cilium-sysdump-context1-final-${{ join(matrix.*, '-') }}

    kubectl config use-context ${{ env.contextName2 }}
    kubectl get pods --all-namespaces -o wide
    cilium sysdump --output-filename cilium-sysdump-context2-final-${{ join(matrix.*, '-') }}

    if [ "${{ matrix.external-kvstore }}" == "true" ]; then
      for i in {1..2}; do
        echo
        echo "# Retrieving logs from kvstore$i docker container"
        docker logs kvstore$i
      done
    fi
# Sysdumps are uploaded only when the job did not succeed ...
- name: Upload artifacts
  if: ${{ !success() }}
  uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
  with:
    path: cilium-sysdump-*.zip
    name: cilium-sysdumps-${{ matrix.name }}

# ... whereas JUnit reports are always uploaded and summarized. Artifact
# names embed the matrix entry name, one artifact per combination.
- name: Upload JUnits [junit]
  if: ${{ always() }}
  uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
  with:
    path: cilium-junits/*.xml
    name: cilium-junits-${{ matrix.name }}

- name: Publish Test Results As GitHub Summary
  if: ${{ always() }}
  uses: aanm/junit2md@332ebf0fddd34e91b03a832cfafaa826306558f9 # v0.0.3
  with:
    junit-directory: "cilium-junits"
# Merges the per-combination artifacts into single ones, whatever the
# outcome of the test job.
merge-upload:
  name: Merge and Upload Artifacts
  needs: upgrade-and-downgrade
  if: ${{ always() }}
  runs-on: ubuntu-latest
  steps:
    # Attempted only when the test job failed; an error here does not fail
    # this job.
    - name: Merge Sysdumps
      if: ${{ needs.upgrade-and-downgrade.result == 'failure' }}
      continue-on-error: true
      uses: actions/upload-artifact/merge@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
      with:
        name: cilium-sysdumps
        pattern: cilium-sysdumps-*
        delete-merged: true
        retention-days: 5

    - name: Merge JUnits
      uses: actions/upload-artifact/merge@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
      with:
        name: cilium-junits
        pattern: cilium-junits-*
        delete-merged: true
        retention-days: 5
# Reports the result of the test job as the final commit status; it runs
# whatever the outcome of that job.
commit-status-final:
  name: Commit Status Final
  needs: upgrade-and-downgrade
  if: ${{ always() }}
  runs-on: ubuntu-latest
  steps:
    - name: Set final commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        status: ${{ needs.upgrade-and-downgrade.result }}
        # Dispatch input when present, triggering commit otherwise.
        sha: ${{ inputs.SHA || github.sha }}