# .github/workflows/tests-clustermesh-upgrade.yaml

name: Cilium Cluster Mesh upgrade (ci-clustermesh)

# Any change in triggers needs to be reflected in the concurrency group.
on:
  # Dispatched runs targeting a pull request: the PR number, the ref providing
  # the workflow context and the SHA under test are all passed as inputs.
  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
        required: false
        default: '{}'

  # Runs on pushes to the v1.16 branch and to the related feature and renovate
  # branches, unless the push only touches files under Documentation/.
  push:
    branches:
      - v1.16
      - ft/v1.16/**
      - 'renovate/v1.16-**'
    paths-ignore:
      - 'Documentation/**'

# By specifying the access of one of the scopes, all of those that are not
# specified are set to 'none'.
permissions:
  # To read actions state with catchpoint/workflow-telemetry-action
  actions: read
  # To be able to access the repository with actions/checkout
  contents: read
  # To allow retrieving information from the PR API
  pull-requests: read
  # To be able to set commit status (done by the commit-status-start job)
  statuses: write

concurrency:
  # Structure:
  # - Workflow name
  # - Event type
  # - A unique identifier depending on event type:
  #   - push: SHA
  #   - workflow_dispatch: PR number
  #
  # This structure ensures a unique concurrency group name is generated for each
  # type of testing, such that re-runs will cancel the previous run.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'push' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  # A new run cancels the one still in progress for the same group, if any.
  cancel-in-progress: true

env:
  # Forwarded as the 'ci-version' input of the cilium/cilium-cli action.
  # NOTE(review): intentionally empty here — presumably this makes the action
  # fall back to the release version; confirm against the action's inputs.
  cilium_cli_ci_version:

  # Names of the two kind clusters that get meshed together.
  clusterName1: cluster1
  clusterName2: cluster2
  # Kubeconfig contexts used to address each cluster with kubectl and the
  # Cilium CLI.
  contextName1: kind-cluster1
  contextName2: kind-cluster2

jobs:
  # Prints the workflow_dispatch inputs, to ease troubleshooting of
  # dispatched runs. Skipped for push events.
  echo-inputs:
    if: ${{ github.event_name == 'workflow_dispatch' }}
    name: Echo Workflow Dispatch Inputs
    runs-on: ubuntu-24.04
    steps:
      # The JSON document is handed over through an environment variable
      # rather than being interpolated into the script, so that quotes or
      # shell metacharacters in an input (e.g. extra-args) cannot alter the
      # command being executed.
      - name: Echo Workflow Dispatch Inputs
        env:
          INPUTS_JSON: ${{ toJSON(inputs) }}
        run: |
          echo "${INPUTS_JSON}"

  # Sets the initial commit status on the commit under test: the SHA input
  # for dispatched runs, the pushed commit otherwise.
  commit-status-start:
    name: Commit Status Start
    runs-on: ubuntu-latest
    steps:
      - name: Set initial commit status
        uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
        with:
          sha: ${{ inputs.SHA || github.sha }}

  # Deploys Cilium on two meshed kind clusters and runs the connectivity
  # tests across the upgrade, stress-test and downgrade of cluster1, once per
  # matrix entry.
  upgrade-and-downgrade:
    name: "Upgrade and Downgrade Test"
    runs-on: ${{ vars.GH_RUNNER_EXTRA_POWER_UBUNTU_LATEST || 'ubuntu-latest' }}
    timeout-minutes: 60
    env:
      # Prefix of the JUnit reports and feature summaries generated by this
      # job. It matches the job name above, so that the generated files are
      # attributed to this test.
      job_name: "Upgrade and Downgrade Test"

    strategy:
      # Let every matrix entry run to completion, even if another one fails.
      fail-fast: false
      matrix:
        include:
          - name: '1'
            encryption: 'disabled'
            kube-proxy: 'iptables'
            external-kvstore: false
            max-connected-clusters: 255
            cm-auth-mode: 'legacy'

          - name: '2'
            encryption: 'disabled'
            kube-proxy: 'none'
            external-kvstore: false
            max-connected-clusters: 511
            cm-auth-mode: 'migration'

          # Currently, ipsec requires to synchronously regenerate the host
          # endpoint to ensure ordering (#25735). Given that this is a blocking
          # operation, we cannot wait for full clustermesh synchronization
          # for an extended period of time, as that would prevent the agents from
          # becoming ready (and new pods scheduled). This means that we will
          # experience cross-cluster connection drops during upgrades/downgrades,
          # given that the timeout is too low to account for the initialization
          # of a new clustermesh-apiserver replica (while it is enough to prevent
          # issues in case of agent restarts, if all remote clusters are ready,
          # as well as when connecting to an external kvstore as in this case).
          #
          # NOTE(review): this entry does not define cm-auth-mode, hence the
          # matrix.cm-auth-mode expression used when composing the Helm
          # settings evaluates to an empty string for it — confirm this is
          # intended.
          - name: '3'
            encryption: 'ipsec'
            kube-proxy: 'iptables'
            external-kvstore: true
            max-connected-clusters: 255

          - name: '4'
            encryption: 'wireguard'
            kube-proxy: 'iptables'
            external-kvstore: false
            max-connected-clusters: 511
            cm-auth-mode: 'cluster'
    steps:
      # Collect telemetry for this job; the action is configured not to
      # comment on the pull request.
      - name: Collect Workflow Telemetry
        uses: catchpoint/workflow-telemetry-action@94c3c3d9567a0205de6da68a76c428ce4e769af1 # v2.0.0
        with:
          comment_on_pr: false

      # Check out the trusted ref providing the local actions and scripts used
      # below: the context-ref input for dispatched runs, the pushed commit
      # otherwise. Credentials are not persisted in the local git config.
      - name: Checkout context ref (trusted)
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.context-ref || github.sha }}
          persist-credentials: false

      # Local composite action. NOTE(review): presumably the source of the
      # env.* values referenced below but not defined in this file (e.g.
      # KIND_VERSION, QUAY_ORGANIZATION_DEV) — confirm.
      - name: Set Environment Variables
        uses: ./.github/actions/set-env-variables

      # Compute the Helm settings for the newest version under test. The image
      # tag falls back to the pushed commit when the workflow is not dispatched
      # with an explicit SHA (i.e. on push events), consistently with the SHA
      # targeted by the commit status job. The outputs of this step (sha,
      # cilium_install_defaults) are consumed by the subsequent steps.
      - name: Set up newest settings
        id: newest-vars
        uses: ./.github/actions/helm-default
        with:
          image-tag: ${{ inputs.SHA || github.sha }}
          chart-dir: ./untrusted/cilium-newest/install/kubernetes/cilium

      # Exposes three step outputs consumed by later steps: downgrade_version,
      # cilium_install_defaults (the common Helm settings plus the encryption
      # ones, when the matrix entry enables encryption) and
      # connectivity_test_defaults.
      - name: Set up job variables
        id: vars
        run: |
          CILIUM_DOWNGRADE_VERSION=$(contrib/scripts/print-downgrade-version.sh stable)
          echo "downgrade_version=${CILIUM_DOWNGRADE_VERSION}" >> $GITHUB_OUTPUT

          # * Monitor aggregation is set to medium to avoid the performance penalty
          #   in the testing environment due to the relatively high traffic load.
          # * We explicitly configure the IPAM mode to prevent it from being
          #   reset to the default value on upgrade/downgrade due to --reset-values.
          # * We explicitly configure the sync timeout to a higher value to
          #   give enough time to the clustermesh-apiserver to restart after
          #   the upgrade/downgrade before that agents regenerate the endpoints.
          # * We configure the maximum number of unavailable agents to 1 to slow
          #   down the rollout process and highlight possible connection disruption
          #   occurring in the meanwhile.
          CILIUM_INSTALL_DEFAULTS=" \
            --set=debug.enabled=true \
            --set=bpf.monitorAggregation=medium \
            --set=hubble.enabled=true \
            --set=routingMode=tunnel \
            --set=tunnelProtocol=vxlan \
            --set=ipv4.enabled=true \
            --set=ipv6.enabled=true \
            --set=kubeProxyReplacement=${{ matrix.kube-proxy == 'none' }} \
            --set=bpf.masquerade=${{ matrix.kube-proxy == 'none' }} \
            --set=ipam.mode=kubernetes \
            --set=operator.replicas=1 \
            --set=updateStrategy.rollingUpdate.maxUnavailable=1 \
            --set=clustermesh.useAPIServer=${{ !matrix.external-kvstore }} \
            --set=clustermesh.maxConnectedClusters=${{ matrix.max-connected-clusters }} \
            --set=clustermesh.config.enabled=true \
            --set=extraConfig.clustermesh-ip-identities-sync-timeout=10m \
            --set=clustermesh.apiserver.readinessProbe.periodSeconds=1 \
            --set=clustermesh.apiserver.kvstoremesh.readinessProbe.periodSeconds=1 \
            --set=clustermesh.apiserver.updateStrategy.rollingUpdate.maxSurge=1 `# Use surge update strategy to enable clients to failover` \
            --set=clustermesh.apiserver.updateStrategy.rollingUpdate.maxUnavailable=0 \
            --set=clustermesh.apiserver.tls.authMode=${{ matrix.cm-auth-mode }} \
          "

          # Run only a limited subset of tests to reduce the amount of time
          # required. The full suite is run in conformance-clustermesh.
          CONNECTIVITY_TEST_DEFAULTS=" \
            --hubble=false \
            --flow-validation=disabled \
            --test='no-interrupted-connections' \
            --test='no-unexpected-packet-drops' \
            --test='no-policies/' \
            --test='no-policies-extra/' \
            --test='allow-all-except-world/' \
            --test='client-ingress/' \
            --test='client-egress/' \
            --test='cluster-entity-multi-cluster/' \
            --test='!/pod-to-world' \
            --test='!/pod-to-cidr' \
            --collect-sysdump-on-failure"

          CILIUM_INSTALL_ENCRYPTION=""
          if [ "${{ matrix.encryption }}" != "disabled" ]; then
            CILIUM_INSTALL_ENCRYPTION=" \
              --set=encryption.enabled=true \
              --set=encryption.type=${{ matrix.encryption }}"
          fi

          echo "cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} ${CILIUM_INSTALL_ENCRYPTION}" >> $GITHUB_OUTPUT
          echo "connectivity_test_defaults=${CONNECTIVITY_TEST_DEFAULTS}" >> $GITHUB_OUTPUT

      # Install the Cilium CLI. Repository and release version are read from
      # the environment, while ci-version is the workflow-level setting
      # (empty in this file).
      - name: Install Cilium CLI
        uses: cilium/cilium-cli@c52e8c38e6d6235bd8e6e961199a984275547d6f # v0.16.22
        with:
          repository: ${{ env.CILIUM_CLI_RELEASE_REPO }}
          release-version: ${{ env.CILIUM_CLI_VERSION }}
          ci-version: ${{ env.cilium_cli_ci_version }}

      # Render one kind configuration file per cluster from the shared
      # template. Both clusters are dual-stack and use the kube-proxy mode of
      # the matrix entry, but are assigned different pod and service CIDRs.
      - name: Generate Kind configuration files
        run: |
          PODCIDR=10.242.0.0/16,fd00:10:242::/48 \
            SVCCIDR=10.243.0.0/16,fd00:10:243::/112 \
            IPFAMILY=dual \
            KUBEPROXYMODE=${{ matrix.kube-proxy }} \
            envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster1.yaml

          PODCIDR=10.244.0.0/16,fd00:10:244::/48 \
            SVCCIDR=10.245.0.0/16,fd00:10:245::/112 \
            IPFAMILY=dual \
            KUBEPROXYMODE=${{ matrix.kube-proxy }} \
            envsubst < ./.github/kind-config.yaml.tmpl > ./.github/kind-config-cluster2.yaml

      # Create the two kind clusters, each from its own generated
      # configuration file. The kind version, node image and kubectl version
      # are read from the environment.
      - name: Create Kind cluster 1
        uses: helm/kind-action@9fdad0686e6f19fcd572f62516f5e0436f562ee7 # v1.10.0
        with:
          cluster_name: ${{ env.clusterName1 }}
          version: ${{ env.KIND_VERSION }}
          node_image: ${{ env.KIND_K8S_IMAGE }}
          kubectl_version: ${{ env.KIND_K8S_VERSION }}
          config: ./.github/kind-config-cluster1.yaml
          wait: 0 # The control-plane never becomes ready, since no CNI is present

      - name: Create Kind cluster 2
        uses: helm/kind-action@9fdad0686e6f19fcd572f62516f5e0436f562ee7 # v1.10.0
        with:
          cluster_name: ${{ env.clusterName2 }}
          version: ${{ env.KIND_VERSION }}
          node_image: ${{ env.KIND_K8S_IMAGE }}
          kubectl_version: ${{ env.KIND_K8S_VERSION }}
          config: ./.github/kind-config-cluster2.yaml
          wait: 0 # The control-plane never becomes ready, since no CNI is present

      # Make sure that coredns uses IPv4-only upstream DNS servers also in case of clusters
      # with IP family dual, since IPv6 ones are not reachable and cause spurious failures.
      # Additionally, this is also required to workaround
      # https://github.com/cilium/cilium/issues/23283#issuecomment-1597282247.
      # The same patch, which overrides the DNS policy and the upstream
      # nameservers of the coredns pods, is applied in both clusters.
      - name: Configure the coredns nameservers
        run: |
          COREDNS_PATCH="
          spec:
            template:
              spec:
                dnsPolicy: None
                dnsConfig:
                  nameservers:
                  - 8.8.4.4
                  - 8.8.8.8
          "

          kubectl --context ${{ env.contextName1 }} patch deployment -n kube-system coredns --patch="$COREDNS_PATCH"
          kubectl --context ${{ env.contextName2 }} patch deployment -n kube-system coredns --patch="$COREDNS_PATCH"

      # Only for the ipsec matrix entry. The key material is generated once
      # and the very same secret is created in both clusters.
      - name: Create the IPSec secret in both clusters
        if: matrix.encryption == 'ipsec'
        run: |
          SECRET="3 rfc4106(gcm(aes)) $(openssl rand -hex 20) 128"
          kubectl --context ${{ env.contextName1 }} create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"
          kubectl --context ${{ env.contextName2 }} create -n kube-system secret generic cilium-ipsec-keys --from-literal=keys="${SECRET}"

      # Only for matrix entries using an external kvstore: start the kvstore
      # clusters (two are requested) through the local kvstore action. Its
      # outputs are referenced by the subsequent steps.
      - name: Start kvstore clusters
        id: kvstore
        if: matrix.external-kvstore
        uses: ./.github/actions/kvstore
        with:
          clusters: 2

      # Create, in both clusters, the resources defined in the file whose path
      # is output by the kvstore action.
      - name: Create the secret containing the kvstore credentials
        if: matrix.external-kvstore
        run: |
          kubectl --context ${{ env.contextName1 }} create -n kube-system -f ${{ steps.kvstore.outputs.cilium_etcd_secrets_path }}
          kubectl --context ${{ env.contextName2 }} create -n kube-system -f ${{ steps.kvstore.outputs.cilium_etcd_secrets_path }}

      # Exposes two step outputs, cilium_install_cluster1 and
      # cilium_install_cluster2: the cluster-specific Helm settings (name, ID
      # and clustermesh-apiserver NodePort) followed by the settings shared by
      # both clusters, which describe how to reach the meshed clusters (either
      # through the external kvstore, or through the worker node IP and
      # NodePort of each clustermesh-apiserver).
      - name: Set clustermesh connection parameters
        id: clustermesh-vars
        run: |
          # Let's retrieve in advance the parameters to mesh the two clusters, so
          # that we don't need to do that through the CLI in a second step, as it
          # would be reset during upgrade (as we are resetting the values).

          # Explicitly configure the NodePorts to make sure that they are different
          # in each cluster, to workaround #24692
          PORT1=32379
          PORT2=32380

          CILIUM_INSTALL_CLUSTER1=" \
            --set cluster.name=${{ env.clusterName1 }} \
            --set cluster.id=1 \
            --set clustermesh.apiserver.service.nodePort=$PORT1 \
          "

          CILIUM_INSTALL_CLUSTER2=" \
            --set cluster.name=${{ env.clusterName2 }} \
            --set cluster.id=${{ matrix.max-connected-clusters }} \
            --set clustermesh.apiserver.service.nodePort=$PORT2 \
          "

          CILIUM_INSTALL_COMMON=" \
            --set clustermesh.config.clusters[0].name=${{ env.clusterName1 }} \
            --set clustermesh.config.clusters[1].name=${{ env.clusterName2 }} \
          "

          if [ "${{ matrix.external-kvstore }}" == "true" ]; then
            CILIUM_INSTALL_COMMON="$CILIUM_INSTALL_COMMON \
              ${{ steps.kvstore.outputs.cilium_install_clustermesh }}"
          else
            IP1=$(kubectl --context ${{ env.contextName1 }} get nodes \
              ${{ env.clusterName1 }}-worker -o wide --no-headers | awk '{ print $6 }')
            IP2=$(kubectl --context ${{ env.contextName2 }} get nodes \
              ${{ env.clusterName2 }}-worker -o wide --no-headers | awk '{ print $6 }')

            CILIUM_INSTALL_COMMON="$CILIUM_INSTALL_COMMON \
              --set clustermesh.config.clusters[0].ips={$IP1} \
              --set clustermesh.config.clusters[0].port=$PORT1 \
              --set clustermesh.config.clusters[1].ips={$IP2} \
              --set clustermesh.config.clusters[1].port=$PORT2 \
            "
          fi

          echo cilium_install_cluster1="$CILIUM_INSTALL_CLUSTER1 $CILIUM_INSTALL_COMMON" >> $GITHUB_OUTPUT
          echo cilium_install_cluster2="$CILIUM_INSTALL_CLUSTER2 $CILIUM_INSTALL_COMMON" >> $GITHUB_OUTPUT

      # Warning: since this is a privileged workflow, subsequent workflow job
      # steps must take care not to execute untrusted code.
      # Sparse checkout limited to the Helm chart directory of the commit
      # under test, placed under untrusted/cilium-newest.
      - name: Checkout pull request branch (NOT TRUSTED)
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.newest-vars.outputs.sha }}
          persist-credentials: false
          path: untrusted/cilium-newest
          sparse-checkout: |
            install/kubernetes/cilium

      # Sparse checkout limited to the Helm chart directory of the version to
      # downgrade to, placed under untrusted/cilium-downgrade.
      - name: Checkout ${{ steps.vars.outputs.downgrade_version }} branch
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.vars.outputs.downgrade_version }}
          persist-credentials: false
          path: untrusted/cilium-downgrade
          sparse-checkout: |
            install/kubernetes/cilium

      # Resolve the commit checked out for the downgrade version, and derive
      # the Helm settings pointing to its chart and to the CI images tagged
      # with that commit. Outputs: sha, cilium_image_settings.
      - name: Set up downgrade settings
        id: downgrade-vars
        run: |
          SHA="$(cd untrusted/cilium-downgrade && git rev-parse HEAD)"
          CILIUM_IMAGE_SETTINGS=" \
            --chart-directory=./untrusted/cilium-downgrade/install/kubernetes/cilium \
            --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
            --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${SHA} \
            --set=clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${SHA} \
            --set=clustermesh.apiserver.kvstoremesh.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/kvstoremesh-ci:${SHA} \
          "
          echo "sha=${SHA}" >> $GITHUB_OUTPUT
          echo "cilium_image_settings=${CILIUM_IMAGE_SETTINGS}" >> $GITHUB_OUTPUT

      # Poll the registry, one image at a time, until the manifest of each
      # image tagged with the newest commit can be inspected. The step timeout
      # bounds the overall wait.
      - name: Wait for images to be available (newest)
        shell: bash
        timeout-minutes: 10
        run: |
          for image in cilium-ci operator-generic-ci clustermesh-apiserver-ci; do
            ref="quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/${image}:${{ steps.newest-vars.outputs.sha }}"
            until docker manifest inspect "${ref}" &> /dev/null; do
              sleep 45s
            done
          done

      # Same as above, for the images tagged with the downgrade commit.
      - name: Wait for images to be available (downgrade)
        shell: bash
        timeout-minutes: 10
        run: |
          for image in cilium-ci operator-generic-ci clustermesh-apiserver-ci; do
            ref="quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/${image}:${{ steps.downgrade-vars.outputs.sha }}"
            until docker manifest inspect "${ref}" &> /dev/null; do
              sleep 45s
            done
          done


      # cluster1 starts from the downgrade version (chart and images), so that
      # it can be upgraded to the newest one later on.
      # NOTE(review): KVSTORE_ID is not referenced by the script itself;
      # presumably it is consumed by the settings output by the kvstore
      # action — confirm.
      - name: Install Cilium in cluster1
        id: install-cilium-cluster1
        env:
          KVSTORE_ID: 1
        run: |
          cilium --context ${{ env.contextName1 }} install \
            ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

      # Skipped when using an external kvstore. The cilium-ca secret read from
      # cluster1 is created as-is in cluster2, before Cilium is installed there.
      - name: Copy the Cilium CA secret to cluster2, as they must match
        if: ${{ !matrix.external-kvstore }}
        run: |
          kubectl --context ${{ env.contextName1 }} get secret -n kube-system cilium-ca -o yaml |
            kubectl --context ${{ env.contextName2 }} create -f -

      # cluster2 directly runs the newest version for the whole test.
      - name: Install Cilium in cluster2
        env:
          KVSTORE_ID: 2
        run: |
          cilium --context ${{ env.contextName2 }} install \
            ${{ steps.newest-vars.outputs.cilium_install_defaults }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster2 }}

      # Wait for Cilium first, and for the cluster mesh then, to be reported
      # as ready in both clusters.
      - name: Wait for cluster mesh status to be ready
        run: |
          cilium --context ${{ env.contextName1 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName2 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      # Target directory of the JUnit reports written by the connectivity tests.
      - name: Make JUnit report directory
        run: |
          mkdir -p cilium-junits

      # Baseline run of the connectivity tests across the two clusters, before
      # any upgrade takes place.
      - name: Run connectivity test - pre-upgrade (${{ join(matrix.*, ', ') }})
        run: |
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            ${{ steps.vars.outputs.connectivity_test_defaults }} \
            --junit-file "cilium-junits/${{ env.job_name }} - pre-upgrade (${{ join(matrix.*, ', ') }}).xml" \
            --junit-property github_job_step="Run tests pre-upgrade (${{ join(matrix.*, ', ') }})"

          # Create pods which establish long lived connections. They will be used by
          # subsequent connectivity tests with --include-conn-disrupt-test to catch any
          # interruption in such flows.
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} --hubble=false \
            --include-conn-disrupt-test --conn-disrupt-test-setup \
            --conn-disrupt-dispatch-interval 0ms

      # Summarize, through the local feature-status action, the features
      # tested on each cluster before the upgrade.
      - name: Features tested on cluster 1
        uses: ./.github/actions/feature-status
        with:
          cilium-cli: "cilium --context ${{ env.contextName1 }}"
          title: "Summary of all features tested on cluster 1"
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 1"

      - name: Features tested on cluster 2
        uses: ./.github/actions/feature-status
        with:
          cilium-cli: "cilium --context ${{ env.contextName2 }}"
          title: "Summary of all features tested on cluster 2"
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 2"


      # Upgrade cluster1 from the downgrade version to the newest one. Values
      # are reset, hence the complete set of settings is passed again.
      - name: Upgrade Cilium in cluster1
        env:
          KVSTORE_ID: 1
        run: |
          cilium --context ${{ env.contextName1 }} upgrade --reset-values \
            ${{ steps.newest-vars.outputs.cilium_install_defaults }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

      # Wait for Cilium and the cluster mesh to settle in both clusters after
      # the upgrade.
      - name: Wait for cluster mesh status to be ready
        run: |
          cilium --context ${{ env.contextName1 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName2 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      # Look up the namespace labelled by the Cilium CLI connectivity tests
      # (the first one in alphabetical order, if multiple match) and expose it
      # as the 'namespace' step output, for the steps targeting the test pods.
      # The lookup is explicitly performed against cluster1, the cluster the
      # client pod is subsequently exec'ed into, rather than against whatever
      # context happens to be the current one.
      - name: Set cilium connectivity test namespace
        id: cilium-cli
        run: |
          NAMESPACE=$(kubectl --context ${{ env.contextName1 }} get namespace -l "app.kubernetes.io/name=cilium-cli" -o name | sort | cut -d / -f 2 | head -1)
          echo namespace="$NAMESPACE" >> $GITHUB_OUTPUT

      # Skipped when using an external kvstore. Writes echo-failover.yaml, the
      # manifest of a global (service.cilium.io/global) dual-stack ClusterIP
      # service in the connectivity test namespace, selecting the
      # echo-other-node pods. The file is applied and deleted by the failover
      # steps below.
      - name: Write the Service manifest for testing failover
        if: ${{ !matrix.external-kvstore }}
        run: |
          cat << EOF > echo-failover.yaml
          apiVersion: v1
          kind: Service
          metadata:
            annotations:
              service.cilium.io/global: "true"
            labels:
              kind: echo
              context: failover
            name: echo-other-node-failover
            namespace: ${{ steps.cilium-cli.outputs.namespace }}
          spec:
            ipFamilies:
            - IPv4
            - IPv6
            ipFamilyPolicy: PreferDualStack
            ports:
            - name: http
              port: 80
              protocol: TCP
              targetPort: 8080
            selector:
              name: echo-other-node
            sessionAffinity: None
            type: ClusterIP
          EOF

      # Skipped when using an external kvstore. Restarts the
      # clustermesh-apiserver of cluster2 and waits for the rollout to
      # complete; then deploys the failover service in both clusters and checks
      # that the client pod in cluster1 can reach it (curl retries up to 10
      # times, within 60 seconds overall).
      - name: Restart clustermesh-apiserver and ensure client can connect to new Service
        if: ${{ !matrix.external-kvstore }}
        run: |
          echo "Restarting clustermesh-apiserver deployments"
          kubectl --context ${{ env.contextName2 }} -n kube-system rollout restart deployment -l k8s-app=clustermesh-apiserver
          kubectl --context ${{ env.contextName2 }} -n kube-system rollout status deployment -l k8s-app=clustermesh-apiserver

          echo "Deploying a global Service to test failover"
          kubectl --context ${{ env.contextName1 }} apply -f echo-failover.yaml
          kubectl --context ${{ env.contextName2 }} apply -f echo-failover.yaml

          echo "Testing client connection to global Service"
          kubectl --context ${{ env.contextName1 }} -n ${{ steps.cilium-cli.outputs.namespace }} exec deploy/client -i -- curl -s -v --connect-timeout 2 --max-time 5 --retry-max-time 60 --retry-all-errors --retry 10 --output /dev/null --fail echo-other-node-failover

          # Clean up the service so that it can be re-deployed in subsequent steps
          kubectl --context ${{ env.contextName1 }} delete -f echo-failover.yaml
          kubectl --context ${{ env.contextName2 }} delete -f echo-failover.yaml

      # Skipped when using an external kvstore. Re-applies the newest settings
      # to cluster1, additionally turning on kvstoremesh.
      - name: Enable kvstoremesh on cluster1
        if: ${{ !matrix.external-kvstore }}
        env:
          KVSTORE_ID: 1
        run: |
          cilium --context ${{ env.contextName1 }} upgrade --reset-values \
            ${{ steps.newest-vars.outputs.cilium_install_defaults }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }} \
            --set clustermesh.apiserver.kvstoremesh.enabled=true

      # Wait for Cilium and the cluster mesh to settle in both clusters once
      # kvstoremesh has been enabled.
      - name: Wait for cluster mesh status to be ready
        if: ${{ !matrix.external-kvstore }}
        run: |
          cilium --context ${{ env.contextName1 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName2 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      # Skipped when using an external kvstore. Repeats the failover check now
      # that kvstoremesh is enabled on cluster1: restart the
      # clustermesh-apiserver of cluster2, deploy the failover service in both
      # clusters and verify that the client pod in cluster1 can reach it.
      - name: Restart clustermesh-apiserver and ensure client can connect to new Service
        if: ${{ !matrix.external-kvstore }}
        run: |
          echo "Restarting clustermesh-apiserver deployments"
          kubectl --context ${{ env.contextName2 }} -n kube-system rollout restart deployment -l k8s-app=clustermesh-apiserver
          kubectl --context ${{ env.contextName2 }} -n kube-system rollout status deployment -l k8s-app=clustermesh-apiserver

          echo "Deploying a global Service to test failover"
          kubectl --context ${{ env.contextName1 }} apply -f echo-failover.yaml
          kubectl --context ${{ env.contextName2 }} apply -f echo-failover.yaml

          echo "Testing client connection to global Service"
          kubectl --context ${{ env.contextName1 }} -n ${{ steps.cilium-cli.outputs.namespace }} exec deploy/client -i -- curl -s -v --connect-timeout 2 --max-time 5 --retry-max-time 60 --retry-all-errors --retry 10 --output /dev/null --fail echo-other-node-failover

          # Clean up the service so that it can be re-deployed in subsequent steps
          kubectl --context ${{ env.contextName1 }} delete -f echo-failover.yaml
          kubectl --context ${{ env.contextName2 }} delete -f echo-failover.yaml

      # List the test-conn-disrupt pods and dump their logs in both clusters;
      # for cluster2, the logs of the previous container instances are dumped
      # as well (errors are ignored, as there may be none).
      - name: Gather additional troubleshooting information
        run: |
          kubectl --context ${{ env.contextName1 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
          kubectl --context ${{ env.contextName2 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
          kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
          kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
          kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps

      # Run the connectivity tests again after the upgrade, this time also
      # checking the long lived connections established beforehand.
      - name: Run connectivity test - post-upgrade (${{ join(matrix.*, ', ') }})
        run: |
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            ${{ steps.vars.outputs.connectivity_test_defaults }} \
            --include-conn-disrupt-test \
            --junit-file "cilium-junits/${{ env.job_name }} - post upgrade (${{ join(matrix.*, ', ') }}).xml" \
            --junit-property github_job_step="Run tests post-upgrade (${{ join(matrix.*, ', ') }})"

          # Create pods which establish long lived connections. They will be used by
          # subsequent connectivity tests with --include-conn-disrupt-test to catch any
          # interruption in such flows.
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} --hubble=false \
            --include-conn-disrupt-test --conn-disrupt-test-setup \
            --conn-disrupt-dispatch-interval 0ms

      # Summarize, through the local feature-status action, the features
      # tested on each cluster after the upgrade.
      - name: Features tested on cluster 1 - post upgrade
        uses: ./.github/actions/feature-status
        with:
          cilium-cli: "cilium --context ${{ env.contextName1 }}"
          title: "Summary of all features tested on cluster 1 - post upgrade"
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 1 - post upgrade"

      - name: Features tested on cluster 2 - post upgrade
        uses: ./.github/actions/feature-status
        with:
          cilium-cli: "cilium --context ${{ env.contextName2 }}"
          title: "Summary of all features tested on cluster 2 - post upgrade"
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 2 - post upgrade"


      # Perform an additional "stress" test, scaling the clustermesh-apiservers in both clusters
      # to zero replicas, and restarting all agents. Existing connections should not be disrupted.
      # One exception to this is represented by Cilium being in charge of handling NodePort
      # traffic, as the simultaneous restart of the clustermesh-apiserver pods in both clusters
      # after rolling out all agents can lead to a circular dependency (#30156).
      # Skipped when using an external kvstore. The cluster1 deployment is
      # always scaled down, while the cluster2 one is scaled down only when
      # kube-proxy is in use (matrix.kube-proxy != 'none').
      - name: Scale the clustermesh-apiserver replicas to 0
        if: ${{ !matrix.external-kvstore }}
        run: |
          kubectl --context ${{ env.contextName1 }} scale -n kube-system deploy/clustermesh-apiserver --replicas 0
          if [ ${{ matrix.kube-proxy }} != "none" ]; then
            kubectl --context ${{ env.contextName2 }} scale -n kube-system deploy/clustermesh-apiserver --replicas 0
          fi

      # Restart the agents of both clusters at the same time, then wait (up to
      # 5 minutes per cluster) for the rollouts to complete.
      - name: Rollout Cilium agents in both clusters
        run: |
          kubectl --context ${{ env.contextName1 }} rollout restart -n kube-system ds/cilium
          kubectl --context ${{ env.contextName2 }} rollout restart -n kube-system ds/cilium

          # Wait until all agents successfully restarted before scaling the replicas again
          kubectl --context ${{ env.contextName1 }} rollout status -n kube-system ds/cilium --timeout=5m
          kubectl --context ${{ env.contextName2 }} rollout status -n kube-system ds/cilium --timeout=5m

      # Skipped when using an external kvstore. Both deployments are set back
      # to one replica, regardless of whether they had been scaled down.
      - name: Scale the clustermesh-apiserver replicas back to 1
        if: ${{ !matrix.external-kvstore }}
        run: |
          kubectl --context ${{ env.contextName1 }} scale -n kube-system deploy/clustermesh-apiserver --replicas 1
          kubectl --context ${{ env.contextName2 }} scale -n kube-system deploy/clustermesh-apiserver --replicas 1

      # Wait for Cilium and the cluster mesh to settle in both clusters after
      # the agents restart.
      - name: Wait for cluster mesh status to be ready
        run: |
          cilium --context ${{ env.contextName1 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName2 }} status --wait --wait-duration=10m
          cilium --context ${{ env.contextName1 }} clustermesh status --wait --wait-duration=5m
          cilium --context ${{ env.contextName2 }} clustermesh status --wait --wait-duration=5m

      # List the test-conn-disrupt pods and dump their logs in both clusters;
      # for cluster2, the logs of the previous container instances are dumped
      # as well (errors are ignored, as there may be none).
      - name: Gather additional troubleshooting information
        run: |
          kubectl --context ${{ env.contextName1 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
          kubectl --context ${{ env.contextName2 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
          kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
          kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
          kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps

      # Verify that the long lived connections survived the stress test (the
      # clustermesh-apiserver scale down and the restart of all agents). The
      # github_job_step JUnit property is spelled "stress-test", in line with
      # the step name and the report file name.
      - name: Run connectivity test - stress-test (${{ join(matrix.*, ', ') }})
        run: |
          # Only check that no long living connection was disrupted
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            --hubble=false \
            --flow-validation=disabled \
            --test='no-interrupted-connections' \
            --test='no-unexpected-packet-drops' \
            --include-conn-disrupt-test \
            --junit-file "cilium-junits/${{ env.job_name }} - stress test (${{ join(matrix.*, ', ') }}).xml" \
            --junit-property github_job_step="Run tests stress-test (${{ join(matrix.*, ', ') }})"

          # Create pods which establish long lived connections. They will be used by
          # subsequent connectivity tests with --include-conn-disrupt-test to catch any
          # interruption in such flows.
          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} --hubble=false \
            --include-conn-disrupt-test --conn-disrupt-test-setup \
            --conn-disrupt-dispatch-interval 0ms

      # Invoke the repository-local feature-status action against cluster 1 for
      # the stress-test phase.
      - name: Features tested on cluster 1 - stress-test
        uses: ./.github/actions/feature-status
        with:
          title: 'Summary of all features tested on cluster 1 - stress-test'
          cilium-cli: 'cilium --context ${{ env.contextName1 }}'
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 1 - stress-test"

      # Invoke the repository-local feature-status action against cluster 2 for
      # the stress-test phase.
      - name: Features tested on cluster 2 - stress-test
        uses: ./.github/actions/feature-status
        with:
          title: 'Summary of all features tested on cluster 2 - stress-test'
          cilium-cli: 'cilium --context ${{ env.contextName2 }}'
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 2 - stress-test"


      # Runs `cilium upgrade --reset-values` against cluster 1, combining the image
      # settings from the downgrade-vars step with the install defaults, the
      # kvstore settings and the cluster1 clustermesh settings from earlier steps.
      # The argument order is kept as-is, since later settings may override
      # earlier ones.
      # NOTE(review): no kvstoremesh setting is passed explicitly here even though
      # the step name says it is disabled; presumably this relies on the defaults
      # of the target version or on one of the interpolated outputs - confirm.
      # NOTE(review): this part of the workflow shows no matching downgrade step
      # for cluster 2 - confirm whether that is intentional.
      - name: Downgrade Cilium in cluster1 and disable kvstoremesh
        env:
          # Not referenced literally in the script below; presumably read when the
          # shell expands the interpolated kvstore settings - TODO confirm.
          KVSTORE_ID: 1
        run: |
          cilium --context ${{ env.contextName1 }} upgrade --reset-values \
            ${{ steps.downgrade-vars.outputs.cilium_image_settings }} \
            ${{ steps.vars.outputs.cilium_install_defaults }} \
            ${{ steps.kvstore.outputs.cilium_install_kvstore }} \
            ${{ steps.clustermesh-vars.outputs.cilium_install_cluster1 }}

      # Block until Cilium reports ready in both clusters (up to 10m each), then
      # until the cluster mesh reports ready in both clusters (up to 5m each).
      - name: Wait for cluster mesh status to be ready
        run: |
          for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
            cilium --context "$ctx" status --wait --wait-duration=10m
          done
          for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
            cilium --context "$ctx" clustermesh status --wait --wait-duration=5m
          done

      # List the test-conn-disrupt pods and print their logs for both clusters,
      # so that a later connectivity failure can be correlated with pod state.
      - name: Gather additional troubleshooting information
        run: |
          kubectl --context ${{ env.contextName1 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
          kubectl --context ${{ env.contextName2 }} get po -n ${{ steps.cilium-cli.outputs.namespace }} -o wide -l kind=test-conn-disrupt
          kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
          kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --timestamps
          # Logs of previously terminated containers, collected from both clusters.
          # --ignore-errors keeps pods without a previous container from failing the step.
          kubectl --context ${{ env.contextName1 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps
          kubectl --context ${{ env.contextName2 }} logs -n ${{ steps.cilium-cli.outputs.namespace }} -l kind=test-conn-disrupt --prefix --previous --ignore-errors --timestamps

      # Multi-cluster connectivity test after the downgrade, including the
      # connection-disruption check; results are written as a JUnit file.
      - name: Run connectivity test - post-downgrade (${{ join(matrix.*, ', ') }})
        run: |
          junit_file="cilium-junits/${{ env.job_name }} - post downgrade (${{ join(matrix.*, ', ') }}).xml"
          job_step="Run tests post-downgrade (${{ join(matrix.*, ', ') }})"

          cilium --context ${{ env.contextName1 }} connectivity test \
            --multi-cluster=${{ env.contextName2 }} \
            ${{ steps.vars.outputs.connectivity_test_defaults }} \
            --include-conn-disrupt-test \
            --junit-file "$junit_file" \
            --junit-property github_job_step="$job_step"

      # Invoke the repository-local feature-status action against cluster 1 for
      # the post-downgrade phase.
      - name: Features tested on cluster 1 - post-downgrade
        uses: ./.github/actions/feature-status
        with:
          title: 'Summary of all features tested on cluster 1 - post-downgrade'
          cilium-cli: 'cilium --context ${{ env.contextName1 }}'
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 1 - post-downgrade"

      # Invoke the repository-local feature-status action against cluster 2 for
      # the post-downgrade phase.
      - name: Features tested on cluster 2 - post-downgrade
        uses: ./.github/actions/feature-status
        with:
          title: 'Summary of all features tested on cluster 2 - post-downgrade'
          cilium-cli: 'cilium --context ${{ env.contextName2 }}'
          json-filename: "${{ env.job_name }} (${{ join(matrix.*, ', ') }}) - cluster 2 - post-downgrade"


      # When the job has not succeeded (and the install-cilium-cluster1 step was
      # not skipped), print the Cilium and cluster mesh status, list all pods and
      # collect a sysdump for each cluster, plus the kvstore container logs for
      # external-kvstore matrix entries.
      - name: Post-test information gathering
        if: ${{ !success() && steps.install-cilium-cluster1.outcome != 'skipped' }}
        shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
        run: |
          for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
            cilium --context "$ctx" status
            cilium --context "$ctx" clustermesh status
          done

          idx=1
          for ctx in ${{ env.contextName1 }} ${{ env.contextName2 }}; do
            kubectl config use-context "$ctx"
            kubectl get pods --all-namespaces -o wide
            cilium sysdump --output-filename cilium-sysdump-context${idx}-final-${{ join(matrix.*, '-') }}
            idx=$((idx + 1))
          done

          # Kept as the last statement so that the exit status of the step is unchanged.
          if [[ "${{ matrix.external-kvstore }}" == "true" ]]; then
            for i in 1 2; do
              echo
              echo "# Retrieving logs from kvstore$i docker container"
              docker logs kvstore$i
            done
          fi

      # Upload the sysdump archives, named per matrix entry, whenever the job has
      # not succeeded.
      - name: Upload artifacts
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        if: ${{ !success() }}
        with:
          path: 'cilium-sysdump-*.zip'
          name: cilium-sysdumps-${{ matrix.name }}

      # Always upload the JUnit reports, named per matrix entry.
      - name: Upload JUnits [junit]
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        if: ${{ always() }}
        with:
          path: 'cilium-junits/*.xml'
          name: cilium-junits-${{ matrix.name }}

      # Always upload the JSON files whose names start with the job name, named
      # per matrix entry.
      - name: Upload features tested
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        if: ${{ always() }}
        with:
          path: ${{ env.job_name }}*.json
          name: features-tested-${{ matrix.name }}

      # Always run junit2md over the cilium-junits directory.
      - name: Publish Test Results As GitHub Summary
        uses: aanm/junit2md@332ebf0fddd34e91b03a832cfafaa826306558f9 # v0.0.3
        if: ${{ always() }}
        with:
          junit-directory: 'cilium-junits'

  # Runs after upgrade-and-downgrade regardless of its result. Each step merges
  # the artifacts matching its pattern into one artifact kept for 5 days and
  # deletes the merged originals.
  merge-upload:
    name: Merge and Upload Artifacts
    needs: upgrade-and-downgrade
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      # Sysdumps are merged only after a failed run; an error here does not fail
      # the job.
      - name: Merge Sysdumps
        if: ${{ needs.upgrade-and-downgrade.result == 'failure' }}
        continue-on-error: true
        uses: actions/upload-artifact/merge@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        with:
          name: cilium-sysdumps
          pattern: 'cilium-sysdumps-*'
          delete-merged: true
          retention-days: 5

      - name: Merge JUnits
        uses: actions/upload-artifact/merge@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        with:
          name: cilium-junits
          pattern: 'cilium-junits-*'
          delete-merged: true
          retention-days: 5

      - name: Merge Features tested
        uses: actions/upload-artifact/merge@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        with:
          name: features-tested
          pattern: 'features-tested-*'
          delete-merged: true
          retention-days: 5

  # Runs after upgrade-and-downgrade regardless of its result and reports that
  # result as a commit status on the SHA input, or on github.sha when the input
  # is not set.
  commit-status-final:
    name: Commit Status Final
    needs: upgrade-and-downgrade
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Set final commit status
        uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
        with:
          status: ${{ needs.upgrade-and-downgrade.result }}
          sha: ${{ inputs.SHA || github.sha }}