-
Notifications
You must be signed in to change notification settings - Fork 45
Audit log test #83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Audit log test #83
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| apiVersion: audit.k8s.io/v1 | ||
| kind: Policy | ||
| rules: | ||
| - level: Metadata |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| #!/bin/sh | ||
| sudo mkdir -p -m o+w /var/lib/rancher/k3s/server/logs | ||
| sudo install -m o+w /dev/null /var/lib/rancher/k3s/server/audit.yaml | ||
| sudo chmod o+w /etc/systemd/system/k3s.service | ||
| sed -i '$d' /etc/systemd/system/k3s.service | ||
| sudo echo -e " '--kube-apiserver-arg=audit-log-path=/var/lib/rancher/k3s/server/logs/audit.log' \ \n '--kube-apiserver-arg=audit-policy-file=/var/lib/rancher/k3s/server/audit.yaml'" >> /etc/systemd/system/k3s.service | ||
| sudo systemctl daemon-reload | ||
| sudo systemctl restart k3s.service | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,10 +5,12 @@ package provisioning | |
| // process running on an individual node. | ||
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "os/user" | ||
| "path/filepath" | ||
| "strconv" | ||
| "strings" | ||
|
|
||
| "time" | ||
|
|
||
| "github.com/rancher/shepherd/clients/rancher" | ||
|
|
@@ -22,53 +24,72 @@ import ( | |
| ) | ||
|
|
||
| const ( | ||
| cpuUsageVar = 100 // 100 is just a placeholder until we can determine an actual number. Even with cpu usage spiking it should not go past 100% cpu usage and previous issues concerning this were hitting around 130% and above | ||
| checkCPU provisioninginput.SSHTestCase = "CheckCPU" | ||
| checkCPUCommand = "ps -C agent -o %cpu --no-header" | ||
| nodeReboot provisioninginput.SSHTestCase = "NodeReboot" | ||
| activeState = "active" | ||
| runningState = "running" | ||
| fleetNamespace = "fleet-default" | ||
| cpuUsageTolerance = 100 // this value represents 100 core usage which should not happen at any time. | ||
| rancherDir = "/var/lib/rancher/" | ||
|
|
||
| checkCPUCommand = "ps -A -o '%c %C' --no-header" | ||
| rebootNodeCommand = "sudo reboot" | ||
|
|
||
| checkCPU provisioninginput.SSHTestCase = "CheckCPU" | ||
| nodeReboot provisioninginput.SSHTestCase = "NodeReboot" | ||
| auditLog provisioninginput.SSHTestCase = "AuditLog" | ||
|
|
||
| activeState = "active" | ||
| runningState = "running" | ||
| fleetNamespace = "fleet-default" | ||
| controlPlaneLabel = "node-role.kubernetes.io/control-plane" | ||
| clusterManagementSteveType = "management.cattle.io.cluster" | ||
| providerLabel = "provider.cattle.io" | ||
| ) | ||
|
|
||
| // CallSSHTestByName tests the ssh tests specified in the provisioninginput config clusterSSHTests field. | ||
| // For example CheckCPU checks the cpu usage of the cluster agent. If the usage is too high the func will return a warning. | ||
| func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node, client *rancher.Client, clusterID string, machineName string) error { | ||
| switch testCase { | ||
| //checks the cpu usage of all processes on the node. If the usage is too high the function will return a warning. | ||
| case checkCPU: | ||
| logrus.Infof("Running CheckCPU test on node %s", node.PublicIPAddress) | ||
| output, err := node.ExecuteCommand(checkCPUCommand) | ||
| if err != nil { | ||
| return err | ||
| if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) { | ||
| return errors.New(err.Error() + output) | ||
| } | ||
| strOutput := output[:strings.IndexByte(output, '\n')] | ||
| logrus.Info("CheckCPU test on node " + node.PublicIPAddress + " | Cluster agent cpu usage is: " + strOutput + "%") | ||
|
|
||
| outputInt, err := strconv.ParseFloat(strings.TrimSpace(strOutput), 32) | ||
| if outputInt > cpuUsageVar { | ||
| logrus.Warn("Cluster agent cpu usage is too high on node" + node.PublicIPAddress + " | Current cpu usage is: " + strOutput + "%") | ||
| } | ||
| if err != nil { | ||
| return err | ||
| lines := strings.Split(output, "\n") | ||
| logrus.Info("Checking all node processes CPU usage") | ||
| for _, line := range lines { | ||
| processFields := strings.Fields(line) | ||
| if len(processFields) > 0 { | ||
| CPUUsageInt, err := strconv.ParseFloat(strings.TrimSpace(processFields[1]), 32) | ||
| if err != nil { | ||
| return errors.New(err.Error() + output) | ||
| } | ||
|
|
||
| if CPUUsageInt >= cpuUsageTolerance { | ||
| logrus.Warnf("Process: %s | CPUUsage: %f", processFields[0], CPUUsageInt) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| //This test reboots the node and verifies it comes back up in the correct state. | ||
| case nodeReboot: | ||
| logrus.Infof("Running NodeReboot test on node %s", node.PublicIPAddress) | ||
| command := "sudo reboot" | ||
| _, err := node.ExecuteCommand(command) | ||
| output, err := node.ExecuteCommand(rebootNodeCommand) | ||
| if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) { | ||
| return err | ||
| return errors.New(err.Error() + output) | ||
| } | ||
| // Verify machine shuts down within five minutes, shutting down should not take longer than that depending on the ami | ||
| err = wait.Poll(1*time.Second, defaults.FiveMinuteTimeout, func() (bool, error) { | ||
| newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
| if newNode.State.Name == runningState { | ||
| return false, nil | ||
| } | ||
| return true, nil | ||
| }) | ||
|
|
||
| err = wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, defaults.FiveMinuteTimeout, true, | ||
| func(ctx context.Context) (done bool, err error) { | ||
| newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
|
|
||
| if newNode.State.Name == runningState { | ||
| return false, nil | ||
| } | ||
| return true, nil | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: please put this on a new line for code readability.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll be running our linter against this today once I get it running in VS Code, and it will catch that. |
||
| }) | ||
|
|
||
| if err != nil { | ||
| logrus.Errorf("Node %s was unable to reboot successfully | Cluster %s is still in active state", node.PublicIPAddress, clusterID) | ||
| return err | ||
|
|
@@ -81,6 +102,79 @@ func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node, | |
| } | ||
|
|
||
| return err | ||
|
|
||
| //This test checks if the audit log file is properly created on the node (skipped if its not a control plane node). | ||
| case auditLog: | ||
| if node.NodeLabels[controlPlaneLabel] != "true" { | ||
| logrus.Infof("Node %s is not a control-plane node, skipping", node.PublicIPAddress) | ||
| return nil | ||
| } | ||
|
|
||
| logrus.Infof("Running audit log test on node %s", node.PublicIPAddress) | ||
| cluster, err := client.Steve.SteveType(clusterManagementSteveType).ByID(clusterID) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| auditLogPath := rancherDir + cluster.Labels[providerLabel] | ||
| auditLogFile := "" | ||
| if cluster.Labels[providerLabel] == "rke2" { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you put
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, once my other PR gets merged I plan to rebase and expand those constant files; see the defaults package in this PR. |
||
| auditLogFile = "audit-policy-file" | ||
| auditLogPath = auditLogPath + "/etc/config-files/" + auditLogFile | ||
| } | ||
|
|
||
| if cluster.Labels[providerLabel] == "k3s" { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
| logrus.Info("Enabling audit logging") | ||
| auditLogFile = "audit.log" | ||
| auditLogPath = auditLogPath + "/server/logs/" + auditLogFile | ||
|
|
||
| user, err := user.Current() | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| dirPath := filepath.Join(user.HomeDir, "go/src/github.com/susesgartner/rancher/tests/framework/extensions/provisioning") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is pointing to your forked repo; it needs to use the official repo. |
||
| err = node.SCPFileToNode(dirPath+"/enable_auditing_k3s.sh", "/home/"+node.SSHUser+"/enable_auditing_k3s.sh") | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| err = node.SCPFileToNode(dirPath+"/audit.yaml", "/home/"+node.SSHUser+"/audit.yaml") | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| _, err = node.ExecuteCommand("sudo bash -c 'mv /home/" + node.SSHUser + "/audit.yaml /var/lib/rancher/k3s/server/audit.yaml'") | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| _, err = node.ExecuteCommand("sudo chmod o+x /home/" + node.SSHUser + "/enable_auditing_k3s.sh") | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| _, err = node.ExecuteCommand("sudo bash /home/" + node.SSHUser + "/enable_auditing_k3s.sh") | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| } | ||
|
|
||
| checkAuditLogCommand := "ls " + auditLogPath | ||
| logrus.Infof("Checking for audit log file at %s", auditLogPath) | ||
| output, err := node.ExecuteCommand(checkAuditLogCommand) | ||
| if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) { | ||
| return errors.New(err.Error() + output) | ||
| } | ||
|
|
||
| strOutput := output[:strings.IndexByte(output, '\n')] | ||
| if !strings.Contains(strings.TrimSpace(strOutput), auditLogFile) { | ||
| return errors.New("no audit log file found") | ||
| } | ||
|
|
||
| logrus.Infof("Successfully found audit log file %s", strOutput) | ||
|
|
||
| default: | ||
| err := errors.New("Invalid SSH test: " + string(testCase) + " is spelled incorrectly or does not exist.") | ||
| return err | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a clarifying question: what is the purpose of the spaces at the beginning of this command?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That appears to be a copy-paste error on my part.