diff --git a/features.go b/features.go
index 72aef5c3c51..f3e8ed6e87f 100644
--- a/features.go
+++ b/features.go
@@ -59,6 +59,10 @@ var featuresCommand = cli.Command{
 					Enabled:  &t,
 					Schemata: &t,
 				},
+				MemoryPolicy: &features.MemoryPolicy{
+					Modes: specconv.KnownMemoryPolicyModes(),
+					Flags: specconv.KnownMemoryPolicyFlags(),
+				},
 				MountExtensions: &features.MountExtensions{
 					IDMap: &features.IDMap{
 						Enabled: &t,
diff --git a/internal/linux/linux.go b/internal/linux/linux.go
index 9b049647903..1cf479f4124 100644
--- a/internal/linux/linux.go
+++ b/internal/linux/linux.go
@@ -2,6 +2,7 @@ package linux
 
 import (
 	"os"
+	"unsafe"
 
 	"golang.org/x/sys/unix"
 )
@@ -72,3 +73,23 @@ func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error {
 	})
 	return os.NewSyscallError("sendmsg", err)
 }
+
+// SetMempolicy wraps the set_mempolicy system call. The mode is passed
+// through unchanged, mask is passed as the node mask pointer (nil becomes a
+// null pointer), and maxnode is the size of a unix.CPUSet in bits. The call
+// is issued via retryOnEINTR.
+func SetMempolicy(mode uint, mask *unix.CPUSet) error {
+	// unsafe.Sizeof does not evaluate its operand, so a nil mask is not
+	// dereferenced here.
+	// NOTE(review): confirm whether the kernel honours all maxnode bits or
+	// only maxnode-1 (libnuma appears to pass size+1).
+	maxnode := unsafe.Sizeof(*mask) * 8
+	err := retryOnEINTR(func() error {
+		// The uintptr conversion must stay inside the call expression.
+		if _, _, errno := unix.Syscall(unix.SYS_SET_MEMPOLICY, uintptr(mode), uintptr(unsafe.Pointer(mask)), maxnode); errno != 0 {
+			return errno
+		}
+		return nil
+	})
+	return os.NewSyscallError("set_mempolicy", err)
+}
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
index a0cdaec6a19..6141c018fbd 100644
--- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go
@@ -214,6 +214,9 @@ type Config struct {
 	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
 	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
 
+	// MemoryPolicy specifies NUMA memory policy for the container.
+	MemoryPolicy *LinuxMemoryPolicy `json:"memory_policy,omitempty"`
+
 	// RootlessEUID is set when the runc was launched with non-zero EUID.
 	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
 	// When RootlessEUID is set, runc creates a new userns for the container.
@@ -305,7 +308,8 @@ type CPUAffinity struct {
 	Initial, Final *unix.CPUSet
 }
 
-func toCPUSet(str string) (*unix.CPUSet, error) {
+// ToCPUSet parses a string in list format into a unix.CPUSet, e.g. "0-3,5,7-9".
+func ToCPUSet(str string) (*unix.CPUSet, error) {
 	if str == "" {
 		return nil, nil
 	}
@@ -356,7 +360,7 @@ func toCPUSet(str string) (*unix.CPUSet, error) {
 		}
 	}
 	if s.Count() == 0 {
-		return nil, fmt.Errorf("no CPUs found in %q", str)
+		return nil, fmt.Errorf("no members found in set %q", str)
 	}
 
 	return s, nil
@@ -367,11 +371,11 @@ func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
 	if sa == nil {
 		return nil, nil
 	}
-	initial, err := toCPUSet(sa.Initial)
+	initial, err := ToCPUSet(sa.Initial)
 	if err != nil {
 		return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
 	}
-	final, err := toCPUSet(sa.Final)
+	final, err := ToCPUSet(sa.Final)
 	if err != nil {
 		return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
 	}
diff --git a/libcontainer/configs/memorypolicy.go b/libcontainer/configs/memorypolicy.go
new file mode 100644
index 00000000000..8c34609006d
--- /dev/null
+++ b/libcontainer/configs/memorypolicy.go
@@ -0,0 +1,31 @@
+package configs
+
+import "golang.org/x/sys/unix"
+
+// Memory policy modes and flags as defined in /usr/include/linux/mempolicy.h
+
+//nolint:revive,staticcheck,nolintlint // ignore ALL_CAPS errors in consts from numaif.h, will match unix.* in the future
+const (
+	MPOL_DEFAULT             = 0
+	MPOL_PREFERRED           = 1
+	MPOL_BIND                = 2
+	MPOL_INTERLEAVE          = 3
+	MPOL_LOCAL               = 4
+	MPOL_PREFERRED_MANY      = 5
+	MPOL_WEIGHTED_INTERLEAVE = 6
+
+	MPOL_F_STATIC_NODES   = 1 << 15
+	MPOL_F_RELATIVE_NODES = 1 << 14
+	MPOL_F_NUMA_BALANCING = 1 << 13
+)
+
+// LinuxMemoryPolicy contains memory policy configuration.
+type LinuxMemoryPolicy struct {
+	// Mode specifies memory policy mode without mode flags. See
+	// set_mempolicy() documentation for details.
+	Mode uint `json:"mode,omitempty"`
+	// Flags contains mode flags (MPOL_F_*); they are OR-ed into Mode when the policy is applied.
+	Flags uint `json:"flags,omitempty"`
+	// Nodes contains NUMA nodes to which the mode applies.
+	Nodes *unix.CPUSet `json:"nodes,omitempty"`
+}
diff --git a/libcontainer/configs/tocpuset_test.go b/libcontainer/configs/tocpuset_test.go
index 8af2ab57726..bd0f1e047dd 100644
--- a/libcontainer/configs/tocpuset_test.go
+++ b/libcontainer/configs/tocpuset_test.go
@@ -58,8 +58,8 @@ func TestToCPUSet(t *testing.T) {
 
 	for _, tc := range testCases {
 		t.Run(tc.in, func(t *testing.T) {
-			out, err := toCPUSet(tc.in)
-			t.Logf("toCPUSet(%q) = %v (error: %v)", tc.in, out, err)
+			out, err := ToCPUSet(tc.in)
+			t.Logf("ToCPUSet(%q) = %v (error: %v)", tc.in, out, err)
 			// Check the error.
 			if tc.isErr {
 				if err == nil {
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 605286c1874..9851e4057d3 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -33,6 +33,7 @@ func Validate(config *configs.Config) error {
 		mountsStrict,
 		scheduler,
 		ioPriority,
+		memoryPolicy,
 	}
 	for _, c := range checks {
 		if err := c(config); err != nil {
@@ -482,3 +483,36 @@ func ioPriority(config *configs.Config) error {
 
 	return nil
 }
+
+// memoryPolicy validates the NUMA memory policy configuration, if any: the
+// mode must be known, the node set must fit the mode, and the flags must not
+// contain a combination that set_mempolicy(2) is documented to reject.
+func memoryPolicy(config *configs.Config) error {
+	mpol := config.MemoryPolicy
+	if mpol == nil {
+		return nil
+	}
+	switch mpol.Mode {
+	case configs.MPOL_DEFAULT, configs.MPOL_LOCAL:
+		if mpol.Nodes != nil && mpol.Nodes.Count() != 0 {
+			return fmt.Errorf("memory policy mode requires 0 nodes but got %d", mpol.Nodes.Count())
+		}
+	case configs.MPOL_BIND, configs.MPOL_INTERLEAVE,
+		configs.MPOL_PREFERRED_MANY, configs.MPOL_WEIGHTED_INTERLEAVE:
+		if mpol.Nodes == nil || mpol.Nodes.Count() == 0 {
+			return fmt.Errorf("memory policy mode requires at least one node but got 0")
+		}
+	case configs.MPOL_PREFERRED:
+		// Zero or more nodes are allowed by the kernel.
+	default:
+		return fmt.Errorf("invalid memory policy mode: %d", mpol.Mode)
+	}
+	// MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are mutually exclusive;
+	// set_mempolicy(2) fails with EINVAL when both are given, so reject the
+	// combination here where the error can name the offending flags.
+	const exclusive = configs.MPOL_F_STATIC_NODES | configs.MPOL_F_RELATIVE_NODES
+	if mpol.Flags&exclusive == exclusive {
+		return fmt.Errorf("memory policy flags MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are mutually exclusive")
+	}
+	return nil
+}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index f9e88ad8d05..e4daf2c8f9a 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -659,6 +659,16 @@ func setupIOPriority(config *initConfig) error {
 	return nil
 }
 
+// setupMemoryPolicy installs the configured NUMA memory policy, if any, by
+// calling linux.SetMempolicy with the mode and mode flags OR-ed together.
+func setupMemoryPolicy(config *configs.Config) error {
+	mpol := config.MemoryPolicy
+	if mpol == nil {
+		return nil
+	}
+	return linux.SetMempolicy(mpol.Mode|mpol.Flags, mpol.Nodes)
+}
+
 func setupPersonality(config *configs.Config) error {
 	return system.SetLinuxPersonality(config.Personality.Domain)
 }
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 2bdaa4de8d5..7f7c0771754 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -88,6 +88,10 @@ func (l *linuxSetnsInit) Init() error {
 		}
 	}
 
+	if err := setupMemoryPolicy(l.config.Config); err != nil {
+		return err
+	}
+
 	// Tell our parent that we're ready to exec. This must be done before the
 	// Seccomp rules have been applied, because we need to be able to read and
 	// write to a socket.
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index b19ba358bc5..923ad67ce49 100644
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@@ -8,6 +8,7 @@ import (
 	"maps"
 	"os"
 	"path/filepath"
+	"slices"
 	"sort"
 	"strings"
 	"sync"
@@ -41,6 +42,8 @@ var (
 		flag int
 	}
 	complexFlags map[string]func(*configs.Mount)
+	mpolModeMap  map[string]uint // spec mode name -> configs.MPOL_* value
+	mpolModeFMap map[string]uint // spec mode flag name -> configs.MPOL_F_* bit
 )
 
 func initMaps() {
@@ -148,6 +151,22 @@ func initMaps() {
 				m.IDMapping.Recursive = true
 			},
 		}
+
+		mpolModeMap = map[string]uint{
+			string(specs.MpolDefault):            configs.MPOL_DEFAULT,
+			string(specs.MpolPreferred):          configs.MPOL_PREFERRED,
+			string(specs.MpolBind):               configs.MPOL_BIND,
+			string(specs.MpolInterleave):         configs.MPOL_INTERLEAVE,
+			string(specs.MpolLocal):              configs.MPOL_LOCAL,
+			string(specs.MpolPreferredMany):      configs.MPOL_PREFERRED_MANY,
+			string(specs.MpolWeightedInterleave): configs.MPOL_WEIGHTED_INTERLEAVE,
+		}
+
+		mpolModeFMap = map[string]uint{
+			string(specs.MpolFStaticNodes):   configs.MPOL_F_STATIC_NODES,
+			string(specs.MpolFRelativeNodes): configs.MPOL_F_RELATIVE_NODES,
+			string(specs.MpolFNumaBalancing): configs.MPOL_F_NUMA_BALANCING,
+		}
 	})
 }
 
@@ -184,6 +203,20 @@ func KnownMountOptions() []string {
 	return res
 }
 
+// KnownMemoryPolicyModes returns the sorted keys of mpolModeMap, i.e. the
+// memory policy mode names known to this package. Used by `runc features`.
+func KnownMemoryPolicyModes() []string {
+	initMaps()
+	return slices.Sorted(maps.Keys(mpolModeMap))
+}
+
+// KnownMemoryPolicyFlags returns the sorted keys of mpolModeFMap, i.e. the
+// memory policy mode flag names known to this package. Used by `runc features`.
+func KnownMemoryPolicyFlags() []string {
+	initMaps()
+	return slices.Sorted(maps.Keys(mpolModeFMap))
+}
+
 // AllowedDevices is the set of devices which are automatically included for
 // all containers.
// @@ -468,6 +501,28 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { MemBwSchema: spec.Linux.IntelRdt.MemBwSchema, } } + if spec.Linux.MemoryPolicy != nil { + var ok bool + var err error + specMp := spec.Linux.MemoryPolicy + confMp := &configs.LinuxMemoryPolicy{} + confMp.Mode, ok = mpolModeMap[string(specMp.Mode)] + if !ok { + return nil, fmt.Errorf("invalid memory policy mode %q", specMp.Mode) + } + confMp.Nodes, err = configs.ToCPUSet(specMp.Nodes) + if err != nil { + return nil, fmt.Errorf("invalid memory policy nodes %q: %w", specMp.Nodes, err) + } + for _, specFlag := range specMp.Flags { + confFlag, ok := mpolModeFMap[string(specFlag)] + if !ok { + return nil, fmt.Errorf("invalid memory policy flag %q", specFlag) + } + confMp.Flags |= confFlag + } + config.MemoryPolicy = confMp + } if spec.Linux.Personality != nil { if len(spec.Linux.Personality.Flags) > 0 { logrus.Warnf("ignoring unsupported personality flags: %+v because personality flag has not supported at this time", spec.Linux.Personality.Flags) diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 901bd394e6a..15a4fcb577f 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -171,6 +171,10 @@ func (l *linuxStandardInit) Init() error { } } + if err := setupMemoryPolicy(l.config.Config); err != nil { + return err + } + // Tell our parent that we're ready to exec. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. 
diff --git a/tests/integration/memorypolicy.bats b/tests/integration/memorypolicy.bats
new file mode 100644
index 00000000000..77ccd427cc9
--- /dev/null
+++ b/tests/integration/memorypolicy.bats
@@ -0,0 +1,133 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+	setup_busybox
+}
+
+function teardown() {
+	teardown_bundle
+}
+
+@test "runc run memory policy interleave without flags" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "MPOL_INTERLEAVE",
+			"nodes": "0"
+		}'
+	runc run test_busybox
+	[ "$status" -eq 0 ]
+	[[ "${lines[0]}" == "interleave:0" ]] # second space-separated field of the first numa_maps line
+}
+
+@test "runc run memory policy bind static" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "MPOL_BIND",
+			"nodes": "0",
+			"flags": ["MPOL_F_STATIC_NODES"]
+		}'
+	runc run test_busybox
+	[ "$status" -eq 0 ]
+	[[ "${lines[0]}" == "bind"*"static"*"0" ]] # glob: "bind", then "static", then "0" at the end
+}
+
+@test "runc run and exec memory policy prefer relative" {
+	update_config '
+		.linux.memoryPolicy = {
+			"mode": "MPOL_PREFERRED",
+			"nodes": "0",
+			"flags": ["MPOL_F_RELATIVE_NODES"]
+		}'
+	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
+	[ "$status" -eq 0 ]
+
+	runc exec test_busybox /bin/sh -c "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2" # checked from an exec'ed process
+	[ "$status" -eq 0 ]
+	[[ "${lines[0]}" == "prefer"*"relative"*"0" ]]
+}
+
+@test "runc run empty memory policy" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+		}'
+	runc run test_busybox
+	[ "$status" -eq 1 ] # an empty object carries no mode, so the run is expected to fail
+	[[ "${lines[0]}" == *"invalid memory policy"* ]]
+}
+
+@test "runc run memory policy with non-existing mode" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "INTERLEAVE",
+			"nodes": "0"
+		}'
+	runc run test_busybox
+	[ "$status" -eq 1 ] # "INTERLEAVE" (no MPOL_ prefix) is expected to be an unknown mode name
+	[[ "${lines[0]}" == *"invalid memory policy"* ]]
+}
+
+@test "runc run memory policy with invalid flag" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "MPOL_PREFERRED",
+			"nodes": "0",
+			"flags": ["MPOL_F_RELATIVE_NODES", "badflag"]
+		}'
+	runc run test_busybox
+	[ "$status" -eq 1 ] # one unknown flag name is enough to fail, even next to a valid one
+	[[ "${lines[0]}" == *"invalid memory policy flag"* ]]
+}
+
+@test "runc run memory policy default with missing nodes" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "MPOL_DEFAULT"
+		}'
+	runc run test_busybox
+	[ "$status" -eq 0 ] # MPOL_DEFAULT without a node list is expected to be accepted
+	[[ "${lines[0]}" == *"default"* ]]
+}
+
+@test "runc run memory policy with missing mode" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"nodes": "0-7"
+		}'
+	runc run test_busybox
+	[ "$status" -eq 1 ] # nodes alone are not enough; a mode is required
+	[[ "${lines[0]}" == *"invalid memory policy mode"* ]]
+}
+
+@test "runc run memory policy calls syscall with invalid arguments" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "MPOL_DEFAULT",
+			"nodes": "0-7",
+		}'
+	runc run test_busybox
+	[ "$status" -eq 1 ]
+	[[ "${lines[*]}" == *"mode requires 0 nodes but got 8"* ]] # searched in all output lines; "0-7" is 8 nodes
+}
+
+@test "runc run memory policy bind way too large a node number" {
+	update_config '
+		.process.args = ["/bin/sh", "-c", "head -n 1 /proc/self/numa_maps | cut -d \" \" -f 2"]
+		| .linux.memoryPolicy = {
+			"mode": "MPOL_BIND",
+			"nodes": "0-9876543210",
+			"flags": []
+		}'
+	runc run test_busybox
+	[ "$status" -eq 1 ] # the node range is expected to be rejected
+	[[ "${lines[0]}" == *"invalid memory policy node"* ]]
+}