rancher · susesgartner · Feb 28, 2024 · markusewalker · Feb 29, 2024 · susesgartner
@@ -0,0 +1,4 @@
+apiVersion: audit.k8s.io/v1
+kind: Policy
+rules:
+- level: Metadata # single catch-all rule: record request metadata (user, timestamp, resource, verb) but not request or response bodies
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Enables kube-apiserver audit logging on a k3s server node: appends the audit-log flags to the k3s systemd unit and restarts k3s. Needs root or passwordless sudo.
+sudo mkdir -p /var/lib/rancher/k3s/server/logs
+sudo test -e /var/lib/rancher/k3s/server/audit.yaml || sudo install -m 0600 /dev/null /var/lib/rancher/k3s/server/audit.yaml # never truncate a policy that was already copied into place
+sudo sed -i '$d' /etc/systemd/system/k3s.service # removes the unit's last line before appending (presumably the blank line that ends the ExecStart continuation; confirm against the unit file)
+printf "       '%s' \\\\\n       '%s'\n" '--kube-apiserver-arg=audit-log-path=/var/lib/rancher/k3s/server/logs/audit.log' '--kube-apiserver-arg=audit-policy-file=/var/lib/rancher/k3s/server/audit.yaml' | sudo tee -a /etc/systemd/system/k3s.service >/dev/null # tee runs under sudo, so the unit file does not have to be made world-writable
+sudo systemctl daemon-reload
+sudo systemctl restart k3s.service
@@ -5,10 +5,12 @@ package provisioning
 // process running on an individual node.
 
 import (
+	"context"
 	"errors"
+	"os/user"
+	"path/filepath"
 	"strconv"
 	"strings"
-
 	"time"
 
 	"github.com/rancher/shepherd/clients/rancher"
@@ -22,53 +24,72 @@ import (
 )
 
 const (
-	cpuUsageVar                                   = 100 // 100 is just a placeholder until we can determine an actual number. Even with cpu usage spiking it should not go past 100% cpu usage and previous issues concerning this were hitting around 130% and above
-	checkCPU        provisioninginput.SSHTestCase = "CheckCPU"
-	checkCPUCommand                               = "ps -C agent -o %cpu --no-header"
-	nodeReboot      provisioninginput.SSHTestCase = "NodeReboot"
-	activeState                                   = "active"
-	runningState                                  = "running"
-	fleetNamespace                                = "fleet-default"
+	cpuUsageTolerance = 100 // per-process CPU usage value (second column of the checkCPUCommand output) at or above which the CheckCPU test logs a warning.
+	rancherDir        = "/var/lib/rancher/"
+
+	checkCPUCommand   = "ps -A -o '%c %C' --no-header"
+	rebootNodeCommand = "sudo reboot"
+
+	checkCPU   provisioninginput.SSHTestCase = "CheckCPU"
+	nodeReboot provisioninginput.SSHTestCase = "NodeReboot"
+	auditLog   provisioninginput.SSHTestCase = "AuditLog"
+
+	activeState                = "active"
+	runningState               = "running"
+	fleetNamespace             = "fleet-default"
+	controlPlaneLabel          = "node-role.kubernetes.io/control-plane"
+	clusterManagementSteveType = "management.cattle.io.cluster"
+	providerLabel              = "provider.cattle.io"
 )
 
 // CallSSHTestByName tests the ssh tests specified in the provisioninginput config clusterSSHTests field.
-// For example CheckCPU checks the cpu usage of the cluster agent. If the usage is too high the func will return a warning.
 func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node, client *rancher.Client, clusterID string, machineName string) error {
 	switch testCase {
+	// CheckCPU checks the CPU usage of every process on the node and logs a warning for each process whose usage is at or above cpuUsageTolerance.
 	case checkCPU:
 		logrus.Infof("Running CheckCPU test on node %s", node.PublicIPAddress)
 		output, err := node.ExecuteCommand(checkCPUCommand)
-		if err != nil {
-			return err
+		if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
+			return errors.New(err.Error() + output)
 		}
-		strOutput := output[:strings.IndexByte(output, '\n')]
-		logrus.Info("CheckCPU test on node " + node.PublicIPAddress + " | Cluster agent cpu usage is: " + strOutput + "%")
 
-		outputInt, err := strconv.ParseFloat(strings.TrimSpace(strOutput), 32)
-		if outputInt > cpuUsageVar {
-			logrus.Warn("Cluster agent cpu usage is too high on node" + node.PublicIPAddress + " | Current cpu usage is: " + strOutput + "%")
-		}
-		if err != nil {
-			return err
+		lines := strings.Split(output, "\n")
+		logrus.Info("Checking all node processes CPU usage")
+		for _, line := range lines {
+			processFields := strings.Fields(line)
+			if len(processFields) > 0 {
+				CPUUsageInt, err := strconv.ParseFloat(strings.TrimSpace(processFields[1]), 32)
+				if err != nil {
+					return errors.New(err.Error() + output)
+				}
+
+				if CPUUsageInt >= cpuUsageTolerance {
+					logrus.Warnf("Process: %s | CPUUsage: %f", processFields[0], CPUUsageInt)
+				}
+			}
 		}
+
+	//This test reboots the node and verifies it comes back up in the correct state.
 	case nodeReboot:
 		logrus.Infof("Running NodeReboot test on node %s", node.PublicIPAddress)
-		command := "sudo reboot"
-		_, err := node.ExecuteCommand(command)
+		output, err := node.ExecuteCommand(rebootNodeCommand)
 		if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
-			return err
+			return errors.New(err.Error() + output)
 		}
-		// Verify machine shuts down within five minutes, shutting down should not take longer than that depending on the ami
-		err = wait.Poll(1*time.Second, defaults.FiveMinuteTimeout, func() (bool, error) {
-			newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName)
-			if err != nil {
-				return false, err
-			}
-			if newNode.State.Name == runningState {
-				return false, nil
-			}
-			return true, nil
-		})
+
+		err = wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, defaults.FiveMinuteTimeout, true,
+			func(ctx context.Context) (done bool, err error) {
+				newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName)
+				if err != nil {
+					return false, err
+				}
+
+				if newNode.State.Name == runningState {
+					return false, nil
+				}
+				return true, nil
+			})
+
 		if err != nil {
 			logrus.Errorf("Node %s was unable to reboot successfully | Cluster %s is still in active state", node.PublicIPAddress, clusterID)
 			return err
@@ -81,6 +102,79 @@ func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node,
 		}
 
 		return err
+
+	// This test checks that the audit log file exists on the node (skipped if it is not a control-plane node).
+	case auditLog:
+		if node.NodeLabels[controlPlaneLabel] != "true" {
+			logrus.Infof("Node %s is not a control-plane node, skipping", node.PublicIPAddress)
+			return nil
+		}
+
+		logrus.Infof("Running audit log test on node %s", node.PublicIPAddress)
+		cluster, err := client.Steve.SteveType(clusterManagementSteveType).ByID(clusterID)
+		if err != nil {
+			return err
+		}
+
+		auditLogPath := rancherDir + cluster.Labels[providerLabel]
+		auditLogFile := ""
+		if cluster.Labels[providerLabel] == "rke2" {
+			auditLogFile = "audit-policy-file"
+			auditLogPath = auditLogPath + "/etc/config-files/" + auditLogFile
+		}
+
+		if cluster.Labels[providerLabel] == "k3s" {
+			logrus.Info("Enabling audit logging")
+			auditLogFile = "audit.log"
+			auditLogPath = auditLogPath + "/server/logs/" + auditLogFile
+
+			user, err := user.Current()
+			if err != nil {
+				return err
+			}
+
+			dirPath := filepath.Join(user.HomeDir, "go/src/github.com/susesgartner/rancher/tests/framework/extensions/provisioning")
+			err = node.SCPFileToNode(dirPath+"/enable_auditing_k3s.sh", "/home/"+node.SSHUser+"/enable_auditing_k3s.sh")
+			if err != nil {
+				return err
+			}
+
+			err = node.SCPFileToNode(dirPath+"/audit.yaml", "/home/"+node.SSHUser+"/audit.yaml")
+			if err != nil {
+				return err
+			}
+
+			_, err = node.ExecuteCommand("sudo bash -c 'mv /home/" + node.SSHUser + "/audit.yaml /var/lib/rancher/k3s/server/audit.yaml'")
+			if err != nil {
+				return err
+			}
+
+			_, err = node.ExecuteCommand("sudo chmod o+x /home/" + node.SSHUser + "/enable_auditing_k3s.sh")
+			if err != nil {
+				return err
+			}
+
+			_, err = node.ExecuteCommand("sudo bash /home/" + node.SSHUser + "/enable_auditing_k3s.sh")
+			if err != nil {
+				return err
+			}
+
+		}
+
+		checkAuditLogCommand := "ls " + auditLogPath
+		logrus.Infof("Checking for audit log file at %s", auditLogPath)
+		output, err := node.ExecuteCommand(checkAuditLogCommand)
+		if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
+			return errors.New(err.Error() + output)
+		}
+
+		strOutput := output[:strings.IndexByte(output, '\n')]
+		if !strings.Contains(strings.TrimSpace(strOutput), auditLogFile) {
+			return errors.New("no audit log file found")
+		}
+
+		logrus.Infof("Successfully found audit log file %s", strOutput)
+
 	default:
 		err := errors.New("Invalid SSH test: " + string(testCase) + " is spelled incorrectly or does not exist.")
 		return err

@@ -481,6 +481,7 @@ func VerifySSHTests(t *testing.T, client *rancher.Client, clusterObject *steveV1
 
 			clusterNode := &nodes.Node{
 				NodeID:          node.ID,
+				NodeLabels:      node.Labels,
 				PublicIPAddress: nodeIP,
 				SSHUser:         sshUser,
 				SSHKey:          sshkey,

@@ -25,11 +25,12 @@ type SSHPath struct {
 
 // Node describes a node hosted by an outside cloud provider: its identity,
 // addresses, and the SSH user and key fields.
 type Node struct {
-	NodeID           string `json:"nodeID" yaml:"nodeID"`
-	PublicIPAddress  string `json:"publicIPAddress" yaml:"publicIPAddress"`
-	PrivateIPAddress string `json:"privateIPAddress" yaml:"privateIPAddress"`
-	SSHUser          string `json:"sshUser" yaml:"sshUser"`
-	SSHKeyName       string `json:"sshKeyName" yaml:"sshKeyName"`
+	NodeID           string            `json:"nodeID" yaml:"nodeID"`
+	NodeLabels       map[string]string `json:"nodeLabels" yaml:"nodeLabels"` // labels of the node; VerifySSHTests fills this from node.Labels
+	PublicIPAddress  string            `json:"publicIPAddress" yaml:"publicIPAddress"`
+	PrivateIPAddress string            `json:"privateIPAddress" yaml:"privateIPAddress"`
+	SSHUser          string            `json:"sshUser" yaml:"sshUser"`
+	SSHKeyName       string            `json:"sshKeyName" yaml:"sshKeyName"`
 	SSHKey           []byte
 }