Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions extensions/provisioning/audit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Audit policy for the Kubernetes API server. The k3s provisioning script in this
# change passes this file to kube-apiserver through the audit-policy-file argument.
apiVersion: audit.k8s.io/v1
kind: Policy
rules:
# Single rule with no selectors (no users, verbs, resources or namespaces listed),
# recorded at the Metadata audit level.
- level: Metadata
8 changes: 8 additions & 0 deletions extensions/provisioning/enable_auaditing_k3s.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
# Enables kube-apiserver audit logging on a k3s server node by appending the
# audit-log-path and audit-policy-file arguments to the k3s systemd unit.
# Every privileged step goes through sudo, so the script behaves the same
# whether it is invoked as root or as a sudo-capable user.

# Directory that the audit-log-path argument below points into.
sudo mkdir -p -m o+w /var/lib/rancher/k3s/server/logs
# Create the file referenced by the audit-policy-file argument below.
# NOTE(review): installing /dev/null truncates any policy already present at
# this path -- confirm the real policy is copied in after this step.
sudo install -m o+w /dev/null /var/lib/rancher/k3s/server/audit.yaml
# Drop the last line of the unit so the new arguments are appended in its place
# (presumably the blank line after the final ExecStart argument -- TODO confirm).
sudo sed -i '$d' /etc/systemd/system/k3s.service
# Append with tee under sudo: with `sudo echo ... >> file` the redirection is
# performed by the unprivileged calling shell, which would require the unit
# file to be world-writable. printf is used instead of `echo -e`, whose escape
# handling is not portable across /bin/sh implementations.
printf '%s\n' \
    " '--kube-apiserver-arg=audit-log-path=/var/lib/rancher/k3s/server/logs/audit.log' \\ " \
    " '--kube-apiserver-arg=audit-policy-file=/var/lib/rancher/k3s/server/audit.yaml'" |
    sudo tee -a /etc/systemd/system/k3s.service >/dev/null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a clarifying question: what's the need for the spaces at the beginning of this command?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That appears to be a copy-paste error on my part.

sudo systemctl daemon-reload
sudo systemctl restart k3s.service
160 changes: 127 additions & 33 deletions extensions/provisioning/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ package provisioning
// process running on an individual node.

import (
"context"
"errors"
"os/user"
"path/filepath"
"strconv"
"strings"

"time"

"github.com/rancher/shepherd/clients/rancher"
Expand All @@ -22,53 +24,72 @@ import (
)

const (
cpuUsageVar = 100 // 100 is just a placeholder until we can determine an actual number. Even with cpu usage spiking it should not go past 100% cpu usage and previous issues concerning this were hitting around 130% and above
checkCPU provisioninginput.SSHTestCase = "CheckCPU"
checkCPUCommand = "ps -C agent -o %cpu --no-header"
nodeReboot provisioninginput.SSHTestCase = "NodeReboot"
activeState = "active"
runningState = "running"
fleetNamespace = "fleet-default"
cpuUsageTolerance = 100 // this value represents 100 core usage which should not happen at any time.
rancherDir = "/var/lib/rancher/"

checkCPUCommand = "ps -A -o '%c %C' --no-header"
rebootNodeCommand = "sudo reboot"

checkCPU provisioninginput.SSHTestCase = "CheckCPU"
nodeReboot provisioninginput.SSHTestCase = "NodeReboot"
auditLog provisioninginput.SSHTestCase = "AuditLog"

activeState = "active"
runningState = "running"
fleetNamespace = "fleet-default"
controlPlaneLabel = "node-role.kubernetes.io/control-plane"
clusterManagementSteveType = "management.cattle.io.cluster"
providerLabel = "provider.cattle.io"
)

// CallSSHTestByName tests the ssh tests specified in the provisioninginput config clusterSSHTests field.
// For example CheckCPU checks the cpu usage of the cluster agent. If the usage is too high the func will return a warning.
func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node, client *rancher.Client, clusterID string, machineName string) error {
switch testCase {
//checks the cpu usage of all processes on the node. If the usage is too high the function will return a warning.
case checkCPU:
logrus.Infof("Running CheckCPU test on node %s", node.PublicIPAddress)
output, err := node.ExecuteCommand(checkCPUCommand)
if err != nil {
return err
if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
return errors.New(err.Error() + output)
}
strOutput := output[:strings.IndexByte(output, '\n')]
logrus.Info("CheckCPU test on node " + node.PublicIPAddress + " | Cluster agent cpu usage is: " + strOutput + "%")

outputInt, err := strconv.ParseFloat(strings.TrimSpace(strOutput), 32)
if outputInt > cpuUsageVar {
logrus.Warn("Cluster agent cpu usage is too high on node" + node.PublicIPAddress + " | Current cpu usage is: " + strOutput + "%")
}
if err != nil {
return err
lines := strings.Split(output, "\n")
logrus.Info("Checking all node processes CPU usage")
for _, line := range lines {
processFields := strings.Fields(line)
if len(processFields) > 0 {
CPUUsageInt, err := strconv.ParseFloat(strings.TrimSpace(processFields[1]), 32)
if err != nil {
return errors.New(err.Error() + output)
}

if CPUUsageInt >= cpuUsageTolerance {
logrus.Warnf("Process: %s | CPUUsage: %f", processFields[0], CPUUsageInt)
}
}
}

//This test reboots the node and verifies it comes back up in the correct state.
case nodeReboot:
logrus.Infof("Running NodeReboot test on node %s", node.PublicIPAddress)
command := "sudo reboot"
_, err := node.ExecuteCommand(command)
output, err := node.ExecuteCommand(rebootNodeCommand)
if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
return err
return errors.New(err.Error() + output)
}
// Verify machine shuts down within five minutes, shutting down should not take longer than that depending on the ami
err = wait.Poll(1*time.Second, defaults.FiveMinuteTimeout, func() (bool, error) {
newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName)
if err != nil {
return false, err
}
if newNode.State.Name == runningState {
return false, nil
}
return true, nil
})

err = wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, defaults.FiveMinuteTimeout, true,
func(ctx context.Context) (done bool, err error) {
newNode, err := client.Steve.SteveType(machineSteveResourceType).ByID(fleetNamespace + "/" + machineName)
if err != nil {
return false, err
}

if newNode.State.Name == runningState {
return false, nil
}
return true, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: please put this on a new line for code readability.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll be running our linter against this today once I get it running in VS Code, and it will catch that.

})

if err != nil {
logrus.Errorf("Node %s was unable to reboot successfully | Cluster %s is still in active state", node.PublicIPAddress, clusterID)
return err
Expand All @@ -81,6 +102,79 @@ func CallSSHTestByName(testCase provisioninginput.SSHTestCase, node *nodes.Node,
}

return err

//This test checks if the audit log file is properly created on the node (skipped if its not a control plane node).
case auditLog:
if node.NodeLabels[controlPlaneLabel] != "true" {
logrus.Infof("Node %s is not a control-plane node, skipping", node.PublicIPAddress)
return nil
}

logrus.Infof("Running audit log test on node %s", node.PublicIPAddress)
cluster, err := client.Steve.SteveType(clusterManagementSteveType).ByID(clusterID)
if err != nil {
return err
}

auditLogPath := rancherDir + cluster.Labels[providerLabel]
auditLogFile := ""
if cluster.Labels[providerLabel] == "rke2" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you put rke2 in a const block?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, once my other PR gets merged I plan to rebase and expand those constant files (see the defaults package in this PR).

auditLogFile = "audit-policy-file"
auditLogPath = auditLogPath + "/etc/config-files/" + auditLogFile
}

if cluster.Labels[providerLabel] == "k3s" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as above.

logrus.Info("Enabling audit logging")
auditLogFile = "audit.log"
auditLogPath = auditLogPath + "/server/logs/" + auditLogFile

user, err := user.Current()
if err != nil {
return err
}

dirPath := filepath.Join(user.HomeDir, "go/src/github.com/susesgartner/rancher/tests/framework/extensions/provisioning")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pointing to your forked repo; it needs to use the official repo.

err = node.SCPFileToNode(dirPath+"/enable_auditing_k3s.sh", "/home/"+node.SSHUser+"/enable_auditing_k3s.sh")
if err != nil {
return err
}

err = node.SCPFileToNode(dirPath+"/audit.yaml", "/home/"+node.SSHUser+"/audit.yaml")
if err != nil {
return err
}

_, err = node.ExecuteCommand("sudo bash -c 'mv /home/" + node.SSHUser + "/audit.yaml /var/lib/rancher/k3s/server/audit.yaml'")
if err != nil {
return err
}

_, err = node.ExecuteCommand("sudo chmod o+x /home/" + node.SSHUser + "/enable_auditing_k3s.sh")
if err != nil {
return err
}

_, err = node.ExecuteCommand("sudo bash /home/" + node.SSHUser + "/enable_auditing_k3s.sh")
if err != nil {
return err
}

}

checkAuditLogCommand := "ls " + auditLogPath
logrus.Infof("Checking for audit log file at %s", auditLogPath)
output, err := node.ExecuteCommand(checkAuditLogCommand)
if err != nil && !errors.Is(err, &ssh.ExitMissingError{}) {
return errors.New(err.Error() + output)
}

strOutput := output[:strings.IndexByte(output, '\n')]
if !strings.Contains(strings.TrimSpace(strOutput), auditLogFile) {
return errors.New("no audit log file found")
}

logrus.Infof("Successfully found audit log file %s", strOutput)

default:
err := errors.New("Invalid SSH test: " + string(testCase) + " is spelled incorrectly or does not exist.")
return err
Expand Down
1 change: 1 addition & 0 deletions extensions/provisioning/verify.go
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ func VerifySSHTests(t *testing.T, client *rancher.Client, clusterObject *steveV1

clusterNode := &nodes.Node{
NodeID: node.ID,
NodeLabels: node.Labels,
PublicIPAddress: nodeIP,
SSHUser: sshUser,
SSHKey: sshkey,
Expand Down
11 changes: 6 additions & 5 deletions pkg/nodes/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ type SSHPath struct {

// Node is a configuration of node that is from an outside cloud provider
type Node struct {
NodeID string `json:"nodeID" yaml:"nodeID"`
PublicIPAddress string `json:"publicIPAddress" yaml:"publicIPAddress"`
PrivateIPAddress string `json:"privateIPAddress" yaml:"privateIPAddress"`
SSHUser string `json:"sshUser" yaml:"sshUser"`
SSHKeyName string `json:"sshKeyName" yaml:"sshKeyName"`
NodeID string `json:"nodeID" yaml:"nodeID"`
NodeLabels map[string]string `json:"nodeLabels" yaml:"nodeLabels"`
PublicIPAddress string `json:"publicIPAddress" yaml:"publicIPAddress"`
PrivateIPAddress string `json:"privateIPAddress" yaml:"privateIPAddress"`
SSHUser string `json:"sshUser" yaml:"sshUser"`
SSHKeyName string `json:"sshKeyName" yaml:"sshKeyName"`
SSHKey []byte
}

Expand Down