diff --git a/extensions/provisioning/audit.yaml b/extensions/provisioning/audit.yaml
new file mode 100644
index 00000000..9ec0d3c1
--- /dev/null
+++ b/extensions/provisioning/audit.yaml
@@ -0,0 +1,4 @@
+apiVersion: audit.k8s.io/v1
+kind: Policy
+rules:
+- level: Metadata
\ No newline at end of file
diff --git a/extensions/provisioning/enable_auditing_k3s.sh b/extensions/provisioning/enable_auditing_k3s.sh
new file mode 100644
index 00000000..0c48dfff
--- /dev/null
+++ b/extensions/provisioning/enable_auditing_k3s.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Stage a world-writable log directory and policy file so the test user can manage them.
+sudo mkdir -p -m o+w /var/lib/rancher/k3s/server/logs
+# Only create an empty placeholder policy if one was not already staged on the node.
+[ -f /var/lib/rancher/k3s/server/audit.yaml ] || sudo install -m o+w /dev/null /var/lib/rancher/k3s/server/audit.yaml
+sudo chmod o+w /etc/systemd/system/k3s.service
+# Drop the unit file's last line, then append the audit flags as the final ExecStart arguments.
+sudo sed -i '$d' /etc/systemd/system/k3s.service
+printf "%s\n%s\n" " '--kube-apiserver-arg=audit-log-path=/var/lib/rancher/k3s/server/logs/audit.log' \\" " '--kube-apiserver-arg=audit-policy-file=/var/lib/rancher/k3s/server/audit.yaml'" >> /etc/systemd/system/k3s.service
+sudo systemctl daemon-reload
+sudo systemctl restart k3s.service
\ No newline at end of file
diff --git a/extensions/provisioning/ssh.go b/extensions/provisioning/ssh.go
index 12b09406..e94fb6c2 100644
--- a/extensions/provisioning/ssh.go
+++ b/extensions/provisioning/ssh.go
@@ -5,10 +5,13 @@ package provisioning
 // process running on an individual node.
 
 import (
+	"context"
 	"errors"
+	"os/user"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 
 	"github.com/rancher/shepherd/clients/rancher"
@@ -22,53 +25,75 @@ import (
 )
 
 const (
-	cpuUsageVar     = 100 // 100 is just a placeholder until we can determine an actual number. Even with cpu usage spiking it should not go past 100% cpu usage and previous issues concerning this were hitting around 130% and above
-	checkCPU        provisioninginput.SSHTestCase = "CheckCPU"
-	checkCPUCommand = "ps -C agent -o %cpu --no-header"
-	nodeReboot      provisioninginput.SSHTestCase = "NodeReboot"
-	activeState     = "active"
-	runningState    = "running"
-	fleetNamespace  = "fleet-default"
+	cpuUsageTolerance = 100 // 100% of a single core; no process is expected to sustain usage at this level.
+	rancherDir        = "/var/lib/rancher/"
+
+	checkCPUCommand   = "ps -A -o '%c %C' --no-header"
+	rebootNodeCommand = "sudo reboot"
+
+	checkCPU   provisioninginput.SSHTestCase = "CheckCPU"
+	nodeReboot provisioninginput.SSHTestCase = "NodeReboot"
+	auditLog   provisioninginput.SSHTestCase = "AuditLog"
+
+	activeState                = "active"
+	runningState               = "running"
+	fleetNamespace             = "fleet-default"
+	controlPlaneLabel          = "node-role.kubernetes.io/control-plane"
+	clusterManagementSteveType = "management.cattle.io.cluster"
+	providerLabel              = "provider.cattle.io"
 )
 
-// CallSSHTestByName tests the ssh tests specified in the provisioninginput config clusterSSHTests field.
-// For example CheckCPU checks the cpu usage of the cluster agent. If the usage is too high the func will return a warning.
+// CallSSHTestByName runs the SSH test specified in the provisioninginput config clusterSSHTests field.
 func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node, client *rancher.Client, clusterID string, machineName string) error {
 	switch testCase {
+	// checkCPU checks the CPU usage of every process on the node and logs a warning for any process whose usage is too high.
 	case checkCPU:
 		logrus.Infof("Running CheckCPU test on node %s", node.PublicIPAddress)
 		output, err := node.ExecuteCommand(checkCPUCommand)
-		if err != nil {
-			return err
+		if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
+			return errors.New(err.Error() + output)
 		}
-		strOutput := output[:strings.IndexByte(output, '\n')]
-		logrus.Info("CheckCPU test on node " + node.PublicIPAddress + " | Cluster agent cpu usage is: " + strOutput + "%")
-		outputInt, err := strconv.ParseFloat(strings.TrimSpace(strOutput), 32)
-		if outputInt > cpuUsageVar {
-			logrus.Warn("Cluster agent cpu usage is too high on node" + node.PublicIPAddress + " | Current cpu usage is: " + strOutput + "%")
-		}
-		if err != nil {
-			return err
+
+		lines := strings.Split(output, "\n")
+		logrus.Info("Checking CPU usage of all node processes")
+		for _, line := range lines {
+			processFields := strings.Fields(line)
+			// %C is the final field; the command name (%c) may itself contain spaces.
+			if len(processFields) > 1 {
+				cpuUsage, err := strconv.ParseFloat(processFields[len(processFields)-1], 32)
+				if err != nil {
+					return errors.New(err.Error() + output)
+				}
+
+				if cpuUsage >= cpuUsageTolerance {
+					logrus.Warnf("Process: %s | CPU usage: %.2f%%", strings.Join(processFields[:len(processFields)-1], " "), cpuUsage)
+				}
+			}
 		}
+
+	// nodeReboot reboots the node and verifies that it comes back up in the correct state.
 	case nodeReboot:
 		logrus.Infof("Running NodeReboot test on node %s", node.PublicIPAddress)
-		command := "sudo reboot"
-		_, err := node.ExecuteCommand(command)
+		output, err := node.ExecuteCommand(rebootNodeCommand)
 		if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
-			return err
+			return errors.New(err.Error() + output)
 		}
-		// Verify machine shuts down within five minutes, shutting down should not take longer than that depending on the ami
-		err = wait.Poll(1*time.Second, defaults.FiveMinuteTimeout, func() (bool, error) {
-			newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName)
-			if err != nil {
-				return false, err
-			}
-			if newNode.State.Name == runningState {
-				return false, nil
-			}
-			return true, nil
-		})
+
+		// Verify the machine leaves the running state within five minutes; shutting down should not take longer than that, depending on the AMI.
+		err = wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, defaults.FiveMinuteTimeout, true,
+			func(ctx context.Context) (done bool, err error) {
+				newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName)
+				if err != nil {
+					return false, err
+				}
+
+				if newNode.State.Name == runningState {
+					return false, nil
+				}
+				return true, nil
+			})
+
 		if err != nil {
 			logrus.Errorf("Node %s was unable to reboot successfully | Cluster %s is still in active state", node.PublicIPAddress, clusterID)
 			return err
@@ -81,6 +106,89 @@ func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node,
 		}
 
 		return err
+
+	// auditLog verifies that the audit log file was properly created on the node (skipped if it is not a control-plane node).
+	case auditLog:
+		if node.NodeLabels[controlPlaneLabel] != "true" {
+			logrus.Infof("Node %s is not a control-plane node, skipping", node.PublicIPAddress)
+			return nil
+		}
+
+		logrus.Infof("Running audit log test on node %s", node.PublicIPAddress)
+		cluster, err := client.Steve.SteveType(clusterManagementSteveType).ByID(clusterID)
+		if err != nil {
+			return err
+		}
+
+		auditLogPath := rancherDir + cluster.Labels[providerLabel]
+		auditLogFile := ""
+		if cluster.Labels[providerLabel] == "rke2" {
+			// RKE2 writes its audit policy file alongside its config; its presence is what the test verifies.
+			auditLogFile = "audit-policy-file"
+			auditLogPath = auditLogPath + "/etc/config-files/" + auditLogFile
+		}
+
+		if cluster.Labels[providerLabel] == "k3s" {
+			// K3s does not enable auditing by default, so stage the policy file and run the helper script first.
+			logrus.Info("Enabling audit logging")
+			auditLogFile = "audit.log"
+			auditLogPath = auditLogPath + "/server/logs/" + auditLogFile
+
+			user, err := user.Current()
+			if err != nil {
+				return err
+			}
+
+			dirPath := filepath.Join(user.HomeDir, "go/src/github.com/susesgartner/rancher/tests/framework/extensions/provisioning")
+			err = node.SCPFileToNode(dirPath+"/enable_auditing_k3s.sh", "/home/"+node.SSHUser+"/enable_auditing_k3s.sh")
+			if err != nil {
+				return err
+			}
+
+			err = node.SCPFileToNode(dirPath+"/audit.yaml", "/home/"+node.SSHUser+"/audit.yaml")
+			if err != nil {
+				return err
+			}
+
+			_, err = node.ExecuteCommand("sudo bash -c 'mv /home/" + node.SSHUser + "/audit.yaml /var/lib/rancher/k3s/server/audit.yaml'")
+			if err != nil {
+				return err
+			}
+
+			_, err = node.ExecuteCommand("sudo chmod o+x /home/" + node.SSHUser + "/enable_auditing_k3s.sh")
+			if err != nil {
+				return err
+			}
+
+			_, err = node.ExecuteCommand("sudo bash /home/" + node.SSHUser + "/enable_auditing_k3s.sh")
+			if err != nil {
+				return err
+			}
+		}
+
+		if auditLogFile == "" {
+			return errors.New("audit log test does not support provider: " + cluster.Labels[providerLabel])
+		}
+
+		checkAuditLogCommand := "ls " + auditLogPath
+		logrus.Infof("Checking for audit log file at %s", auditLogPath)
+		output, err := node.ExecuteCommand(checkAuditLogCommand)
+		if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
+			return errors.New(err.Error() + output)
+		}
+
+		// Only inspect the first line of output; guard against output without a trailing newline.
+		strOutput := strings.TrimSpace(output)
+		if newline := strings.IndexByte(strOutput, '\n'); newline != -1 {
+			strOutput = strOutput[:newline]
+		}
+
+		if !strings.Contains(strOutput, auditLogFile) {
+			return errors.New("no audit log file found")
+		}
+
+		logrus.Infof("Successfully found audit log file %s", strOutput)
+
 	default:
 		err := errors.New("Invalid SSH test: " + string(testCase) + " is spelled incorrectly or does not exist.")
 		return err
diff --git a/extensions/provisioning/verify.go b/extensions/provisioning/verify.go
index 066b3e38..ab0ae50f 100644
--- a/extensions/provisioning/verify.go
+++ b/extensions/provisioning/verify.go
@@ -481,6 +481,7 @@ func VerifySSHTests(t *testing.T, client *rancher.Client, clusterObject *steveV1
 	clusterNode := &nodes.Node{
 		NodeID:          node.ID,
+		NodeLabels:      node.Labels,
 		PublicIPAddress: nodeIP,
 		SSHUser:         sshUser,
 		SSHKey:          sshkey,
diff --git a/pkg/nodes/nodes.go b/pkg/nodes/nodes.go
index a49a2a0f..2082340d 100644
--- a/pkg/nodes/nodes.go
+++ b/pkg/nodes/nodes.go
@@ -25,11 +25,12 @@ type SSHPath struct {
 
 // Node is a configuration of node that is from an outside cloud provider
 type Node struct {
-	NodeID           string `json:"nodeID" yaml:"nodeID"`
-	PublicIPAddress  string `json:"publicIPAddress" yaml:"publicIPAddress"`
-	PrivateIPAddress string `json:"privateIPAddress" yaml:"privateIPAddress"`
-	SSHUser          string `json:"sshUser" yaml:"sshUser"`
-	SSHKeyName       string `json:"sshKeyName" yaml:"sshKeyName"`
+	NodeID           string            `json:"nodeID" yaml:"nodeID"`
+	NodeLabels       map[string]string `json:"nodeLabels" yaml:"nodeLabels"`
+	PublicIPAddress  string            `json:"publicIPAddress" yaml:"publicIPAddress"`
+	PrivateIPAddress string            `json:"privateIPAddress" yaml:"privateIPAddress"`
+	SSHUser          string            `json:"sshUser" yaml:"sshUser"`
+	SSHKeyName       string            `json:"sshKeyName" yaml:"sshKeyName"`
 	SSHKey []byte
 }
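
For reference, a minimal sketch of how the new AuditLog case might be invoked directly (hypothetical values throughout; the import paths assume the rancher/shepherd module layout, and in the test suite this wiring is performed by VerifySSHTests in extensions/provisioning/verify.go):

    package main

    import (
    	"github.com/rancher/shepherd/clients/rancher"
    	"github.com/rancher/shepherd/extensions/provisioning"
    	"github.com/rancher/shepherd/extensions/provisioninginput"
    	"github.com/rancher/shepherd/pkg/nodes"
    )

    // runAuditLogTest drives the AuditLog SSH test against a single node.
    func runAuditLogTest(client *rancher.Client, clusterID, machineName string, sshKey []byte) error {
    	// Placeholder node values; VerifySSHTests populates these from the machine object.
    	clusterNode := &nodes.Node{
    		NodeID:          "example-machine",
    		NodeLabels:      map[string]string{"node-role.kubernetes.io/control-plane": "true"},
    		PublicIPAddress: "203.0.113.10",
    		SSHUser:         "ubuntu",
    		SSHKey:          sshKey,
    	}

    	// "AuditLog" matches the auditLog test-case constant added in ssh.go.
    	return provisioning.CallSSHTestByName(provisioninginput.SSHTestCase("AuditLog"),
    		clusterNode, client, clusterID, machineName)
    }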