Docker provider for node actions (#369)

* Docker provider for node actions * Adjusted dependencies and imports * Update config_kind.yaml Signed-off-by: José Castillo Lema <[email protected]> Signed-off-by: José Castillo Lema <[email protected]>
krkn-chaos · Jan 10, 2023 · 493a8a2 · 493a8a2
1 parent d76ab31
commit 493a8a2
Show file tree

Hide file tree

Showing 7 changed files with 187 additions and 0 deletions.
diff --git a/config/config_kind.yaml b/config/config_kind.yaml
@@ -0,0 +1,40 @@
+kraken:
+    distribution: kubernetes                               # Distribution can be kubernetes or openshift
+    kubeconfig_path: ~/.kube/config                        # Path to kubeconfig
+    exit_on_failure: False                                 # Exit when a post action scenario fails
+    port: 8081
+    publish_kraken_status: True                            # Can be accessed at http://0.0.0.0:8081
+    signal_state: RUN                                      # When set to PAUSE, waits for the RUN signal before running the scenarios; refer to docs/signal.md for more details
+    signal_address: 0.0.0.0                                # Signal listening address
+    litmus_install: True                                   # Installs specified version, set to False if it's already setup
+    litmus_version: v1.13.6                                # Litmus version to install
+    litmus_uninstall: False                                # Set to True to uninstall litmus on failure
+    litmus_uninstall_before_run: True                      # Set to True to uninstall litmus before a new run starts
+    chaos_scenarios:                                       # List of policies/chaos scenarios to load
+        - plugin_scenarios:
+            - scenarios/kind/scheduler.yml
+        - node_scenarios:
+            - scenarios/kind/node_scenarios_example.yml        
+
+cerberus:
+    cerberus_enabled: False                                # Enable it when cerberus is previously installed
+    cerberus_url:                                          # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
+    check_applicaton_routes: False                         # When enabled, looks for application unavailability using the routes specified in the cerberus config and fails the run when any is found
+
+performance_monitoring:
+    deploy_dashboards: False                              # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: False
+    config_path: config/kube_burner.yaml                  # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url:                                       # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_bearer_token:                              # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
+    uuid:                                                 # uuid for the run is generated by default if not set
+    enable_alerts: False                                  # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
+    alert_profile: config/alerts                          # Path to alert profile with the prometheus queries
+
+tunings:
+    wait_duration: 60                                      # Duration to wait between each chaos scenario
+    iterations: 1                                          # Number of times to execute the scenarios
+    daemon_mode: False                                     # When enabled, iterations are set to infinity, which means that kraken will cause chaos forever
diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md
@@ -38,6 +38,14 @@ See the example node scenario or the example below.
 
 **NOTE**: Baremetal machines are fragile. Some node actions can occasionally corrupt the filesystem if it does not shut down properly, and sometimes the kubelet does not start properly.
 
+#### Docker
+
+The Docker provider can be used to run node scenarios against kind clusters.
+
+[kind](https://kind.sigs.k8s.io/) is a tool for running local Kubernetes clusters using Docker container "nodes".
+
+kind was primarily designed for testing Kubernetes itself, but may be used for local development or CI.
+
 #### GCP
 How to set up GCP cli to run node scenarios is defined [here](cloud_setup.md#gcp).
 

diff --git a/kraken/node_actions/docker_node_scenarios.py b/kraken/node_actions/docker_node_scenarios.py
@@ -0,0 +1,109 @@
+import kraken.node_actions.common_node_functions as nodeaction
+from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
+import logging
+import sys
+import docker
+
+
+class Docker:
+    """Small wrapper around the Docker SDK client for node containers.
+
+    Every public method resolves the container through
+    ``client.containers.get(node_name)`` and then applies a single
+    lifecycle call to it.
+    """
+
+    def __init__(self):
+        # Client is created via docker.from_env().
+        self.client = docker.from_env()
+
+    def _lookup(self, node_name):
+        """Return the container object that the client resolves for node_name."""
+        return self.client.containers.get(node_name)
+
+    def get_container_id(self, node_name):
+        """Return the ``id`` attribute of the container resolved for node_name."""
+        return self._lookup(node_name).id
+
+    def start_instances(self, node_name):
+        """Start the node instance (calls ``start()`` on its container)."""
+        self._lookup(node_name).start()
+
+    def stop_instances(self, node_name):
+        """Stop the node instance (calls ``stop()`` on its container)."""
+        self._lookup(node_name).stop()
+
+    def reboot_instances(self, node_name):
+        """Reboot the node instance (calls ``restart()`` on its container)."""
+        self._lookup(node_name).restart()
+
+    def terminate_instances(self, node_name):
+        """Terminate the node instance: stop its container, then remove it."""
+        node_container = self._lookup(node_name)
+        node_container.stop()
+        node_container.remove()
+
+
+class docker_node_scenarios(abstract_node_scenarios):
+    """Node chaos scenarios for nodes that are backed by Docker containers.
+
+    Each scenario repeats its action ``instance_kill_count`` times against the
+    container looked up by the node name. If any step raises an exception, the
+    failure is logged and the process exits with status 1.
+    """
+
+    def __init__(self):
+        self.docker = Docker()
+
+    def _fail_scenario(self, action, scenario, error):
+        """Log a scenario failure and exit the process with status 1.
+
+        action:   verb used in the message (e.g. "start", "stop").
+        scenario: scenario name used in the "injection failed" message.
+        error:    the exception that caused the failure.
+        """
+        logging.error(
+            "Failed to %s node instance. Encountered following exception: %s. Test Failed" % (action, error)
+        )
+        logging.error("%s injection failed!" % (scenario))
+        sys.exit(1)
+
+    # Node scenario to start the node
+    def node_start_scenario(self, instance_kill_count, node, timeout):
+        """Start the node's container, then call nodeaction.wait_for_ready_status.
+
+        instance_kill_count: number of times the action is repeated.
+        node:                node name, also used to look up the container.
+        timeout:             passed through to the nodeaction wait helper.
+        """
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_start_scenario injection")
+                container_id = self.docker.get_container_id(node)
+                logging.info("Starting the node %s with container ID: %s " % (node, container_id))
+                self.docker.start_instances(node)
+                nodeaction.wait_for_ready_status(node, timeout)
+                logging.info("Node with container ID: %s is in running state" % (container_id))
+                logging.info("node_start_scenario has been successfully injected!")
+            except Exception as e:
+                self._fail_scenario("start", "node_start_scenario", e)
+
+    # Node scenario to stop the node
+    def node_stop_scenario(self, instance_kill_count, node, timeout):
+        """Stop the node's container, then call nodeaction.wait_for_unknown_status.
+
+        instance_kill_count: number of times the action is repeated.
+        node:                node name, also used to look up the container.
+        timeout:             passed through to the nodeaction wait helper.
+        """
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_stop_scenario injection")
+                container_id = self.docker.get_container_id(node)
+                logging.info("Stopping the node %s with container ID: %s " % (node, container_id))
+                self.docker.stop_instances(node)
+                logging.info("Node with container ID: %s is in stopped state" % (container_id))
+                nodeaction.wait_for_unknown_status(node, timeout)
+                # Report success like the other scenarios do.
+                logging.info("node_stop_scenario has been successfully injected!")
+            except Exception as e:
+                self._fail_scenario("stop", "node_stop_scenario", e)
+
+    # Node scenario to terminate the node
+    def node_termination_scenario(self, instance_kill_count, node, timeout):
+        """Stop and remove the node's container.
+
+        instance_kill_count: number of times the action is repeated.
+        node:                node name, also used to look up the container.
+        timeout:             accepted for interface compatibility; not used here.
+        """
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_termination_scenario injection")
+                container_id = self.docker.get_container_id(node)
+                logging.info("Terminating the node %s with container ID: %s " % (node, container_id))
+                self.docker.terminate_instances(node)
+                logging.info("Node with container ID: %s has been terminated" % (container_id))
+                logging.info("node_termination_scenario has been successfully injected!")
+            except Exception as e:
+                self._fail_scenario("terminate", "node_termination_scenario", e)
+
+    # Node scenario to reboot the node
+    def node_reboot_scenario(self, instance_kill_count, node, timeout):
+        """Restart the node's container, then wait via the nodeaction helpers.
+
+        After the restart, nodeaction.wait_for_unknown_status is called first
+        and nodeaction.wait_for_ready_status second, each with ``timeout``.
+
+        instance_kill_count: number of times the action is repeated.
+        node:                node name, also used to look up the container.
+        timeout:             passed through to each nodeaction wait helper.
+        """
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_reboot_scenario injection")
+                container_id = self.docker.get_container_id(node)
+                logging.info("Rebooting the node %s with container ID: %s " % (node, container_id))
+                self.docker.reboot_instances(node)
+                nodeaction.wait_for_unknown_status(node, timeout)
+                nodeaction.wait_for_ready_status(node, timeout)
+                logging.info("Node with container ID: %s has been rebooted" % (container_id))
+                logging.info("node_reboot_scenario has been successfully injected!")
+            except Exception as e:
+                self._fail_scenario("reboot", "node_reboot_scenario", e)
diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py
@@ -9,6 +9,7 @@
 from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
 from kraken.node_actions.alibaba_node_scenarios import alibaba_node_scenarios
 from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
+from kraken.node_actions.docker_node_scenarios import docker_node_scenarios
 import kraken.node_actions.common_node_functions as common_node_functions
 import kraken.cerberus.setup as cerberus
 
@@ -36,6 +37,8 @@ def get_node_scenario_object(node_scenario):
         return bm_node_scenarios(
             node_scenario.get("bmc_info"), node_scenario.get("bmc_user", None), node_scenario.get("bmc_password", None)
         )
+    elif node_scenario["cloud_type"] == "docker":
+        return docker_node_scenarios()
     else:
         logging.error(
             "Cloud type " + node_scenario["cloud_type"] + " is not currently supported; "

diff --git a/requirements.txt b/requirements.txt
@@ -18,6 +18,7 @@ openshift-client
 python-ipmi
 podman-compose
 docker-compose
+docker
 jinja2==3.0.3
 itsdangerous==2.0.1
 werkzeug==2.0.3

diff --git a/scenarios/kind/node_scenarios_example.yml b/scenarios/kind/node_scenarios_example.yml
@@ -0,0 +1,16 @@
+node_scenarios:
+  - actions:                                                        # node chaos scenarios to be injected
+    - node_stop_start_scenario
+    node_name: kind-worker                                          # node on which the scenario is injected; multiple names can be given, separated by commas
+    # label_selector: node-role.kubernetes.io/worker                # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
+    instance_count: 1                                               # Number of nodes matching the label selector to select and perform the action on
+    runs: 1                                                         # number of times to inject each scenario under actions (will perform on same node each time)
+    timeout: 120                                                    # duration to wait for completion of node scenario injection
+    cloud_type: docker                                                # cloud type on which Kubernetes/OpenShift runs
+  - actions:
+    - node_reboot_scenario
+    node_name: kind-worker
+    # label_selector: node-role.kubernetes.io/infra
+    instance_count: 1
+    timeout: 120
+    cloud_type: docker
diff --git a/scenarios/kind/scheduler.yml b/scenarios/kind/scheduler.yml
@@ -0,0 +1,10 @@
+# yaml-language-server: $schema=../plugin.schema.json
+# Plugin scenario steps that target pods labelled component=kube-scheduler
+# in namespaces matching the pattern ^kube-system$.
+# First step (id kill-pods): acts on the pods selected below.
+- id: kill-pods
+  config:
+    namespace_pattern: ^kube-system$
+    label_selector: component=kube-scheduler
+# Second step (id wait-for-pods): uses the same namespace pattern and selector.
+# NOTE(review): count is presumably the number of matching pods to wait for —
+# confirm against the plugin schema.
+- id: wait-for-pods
+  config:
+    namespace_pattern: ^kube-system$
+    label_selector: component=kube-scheduler
+    count: 3