diff --git a/docs/systemd.md b/docs/systemd.md index 1381709d111..c68193fcae1 100644 --- a/docs/systemd.md +++ b/docs/systemd.md @@ -11,9 +11,9 @@ When creating a container, runc requests systemd (over dbus) to create a transient unit for the container, and place it into a specified slice. The name of the unit and the containing slice is derived from the container -runtime spec in the following way: +[runtime spec] in the following way: -1. If `Linux.CgroupsPath` is set, it is expected to be in the form +1. If [Linux.CgroupsPath] is set, it is expected to be in the form `[slice]:[prefix]:[name]`. Here `slice` is a systemd slice under which the container is placed. @@ -32,7 +32,7 @@ runtime spec in the following way: is `-.scope`, unless `name` has `.slice` suffix, in which case `prefix` is ignored and the `name` is used as is. -2. If `Linux.CgroupsPath` is not set or empty, it works the same way as if it +2. If [Linux.CgroupsPath] is not set or empty, it works the same way as if it would be set to `:runc:`. See the description above to see what it transforms to. @@ -102,7 +102,44 @@ The following tables summarize which properties are translated. | unified.pids.max | TasksMax | | For documentation on systemd unit resource properties, see -`systemd.resource-control(5)` man page. +[systemd.resource-control(5)] man page. + +### Device access rules + +[Device access rules] from the [runtime spec] are translated to systemd properties +(`DevicePolicy` and `DeviceAllow`). Not all configurations are supported; in +particular, the following can not be translated: + - blacklist-style rulesets; + - wildcard-major rules (meaning "all devices with any major number and the + given minor number"). + +NOTE that systemd v240 or later is highly recommended, since older versions +have limited ways to interpret `DeviceAllow` rules. When using systemd older +than v240, the following limitations exist: + + - it is not possible to add a rule for a device that does not have an + equivalent `/dev/{char,block}/:` file on the host + (for example, this is the case for NVidia devices); + - adding a wildcard-minor rule (meaning "devices with the given major number + any any minor number") results in having a set of individual rules for + existing devices only, meaning that any devices that will appear after the + container start won't be accessible. + +How the device access rules are applied depends on cgroup version: + +#### cgroup v1 + +The rules are applied by systemd to the cgroup device controller +(`device.{allow,deny}` files), then runc overwrites those rules with its own +set, which might be more complete due to older systemd limitations described +above. If some spec rules can not be translated to systemd properties, a minor +warning (which can only be seen when `runc` is run with `--debug` flag is used) +is emitted. + +#### cgroup v2 + +The rules are only applied by systemd. If some spec rules can not be translated +to systemd properties (see above), a warning is printed by runc. ### Auxiliary properties @@ -129,3 +166,8 @@ The values must be in the gvariant text format, as described in To find out which type systemd expects for a particular parameter, please consult systemd sources. + +[runtime spec]: https://github.com/opencontainers/runtime-spec/blob/main/spec.md +[Linux.CgroupsPath]: https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#cgroups-path +[systemd.resource-control(5)]: https://www.freedesktop.org/software/systemd/man/systemd.resource-control.html +[Device access rules]: https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#allowed-device-list diff --git a/libcontainer/cgroups/devices/systemd.go b/libcontainer/cgroups/devices/systemd.go index e1251c5ffb1..84c1b7d53ae 100644 --- a/libcontainer/cgroups/devices/systemd.go +++ b/libcontainer/cgroups/devices/systemd.go @@ -17,7 +17,7 @@ import ( // systemdProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) { +func systemdProperties(r *cgroups.Resources, sdVer int, cgroupVer int) ([]systemdDbus.Property, error) { if r.SkipDevices { return nil, nil } @@ -56,6 +56,19 @@ func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, if err != nil { return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) } + logRuleError := func(rule *devices.Rule, msg string) { + switch cgroupVer { + case 1: + // For cgroup v1, runc reapplies all the device rules later, + // in fsManager.Apply, so it's a temporary issue. + logrus.Debugf("temporary ignoring device rule %+v: %s", rule, msg) + case 2: + // For cgroup v2, if we can't translate the rule to systemd lingo, + // there's nothing we can do. For backward compatibility, we do not + // error out, but merely print a big fat warning. + logrus.Warnf("unable to apply device rule %+v: %s", rule, msg) + } + } var deviceAllowList []deviceAllowEntry for _, rule := range finalRules { if !rule.Allow { @@ -96,11 +109,20 @@ func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, // The only type of rule we can't handle is wildcard-major rules, and // so we'll give a warning in that case (note that the fallback code // will insert any rules systemd couldn't handle). What amazing fun. + // + // The properties generated here are applied by systemd. In case of + // cgroup v1 ("device.{allow,deny}" files), runc when overwrites the + // systemd rules, so the correct rules are in effect (except for the + // case when systemd is restarted and our rules gets overwritten, but + // there is nothing we can do here). + // + // For cgroup v2, we only use systemd rules, so we log warnings when + // some rules can't be translated. if rule.Major == devices.Wildcard { // "_ *:n _" rules aren't supported by systemd. if rule.Minor != devices.Wildcard { - logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + logRuleError(rule, "systemd does not support wildcard-major device rules") continue } @@ -119,14 +141,16 @@ func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, } entry.Path = prefix + strconv.FormatInt(rule.Major, 10) } else { - // For older systemd, "_ n:* _" rules require a device group from /proc/devices. + // For older systemd, {block,char}-NAME syntax is used, + // where NAME is a device group from /proc/devices + // corresponding to the major number. group, err := findDeviceGroup(rule.Type, rule.Major) if err != nil { return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) } if group == "" { // Couldn't find a group. - logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + logRuleError(rule, "systemd older than v240 does not support wildcard-minor rules for devices not listed in /proc/devices") continue } entry.Path = group @@ -141,12 +165,9 @@ func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, } if sdVer < 240 { // Old systemd versions use stat(2) on path to find out device major:minor - // numbers and type. If the path doesn't exist, it will not add the rule, - // emitting a warning instead. - // Since all of this logic is best-effort anyway (we manually set these - // rules separately to systemd) we can safely skip entries that don't - // have a corresponding path. + // numbers and type. If the path doesn't exist, it will not add the rule. if _, err := os.Stat(entry.Path); err != nil { + logRuleError(rule, "systemd older than v240 does not support device rules for non-existing device: "+err.Error()) continue } } diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 74dfcfdafcd..7e2c80cef1a 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -184,14 +184,18 @@ func (m *Manager) Set(r *cgroups.Resources) error { if err := setCPU(m.dirPath, r); err != nil { return err } - // devices (since kernel 4.15, pseudo-controller) - // - // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. - // However, errors from other subsystems are not ignored. - // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if err := setDevices(m.dirPath, r); err != nil { - if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { - return err + // When systemd is used, we do not add a second BPF_CGROUP_DEVICE program, + // relying on systemd to set all the device rules (see systemdProperties + // in libcontainer/cgroups/devices/systemd.go). + if !m.config.Systemd { + // Devices (since kernel 4.15, pseudo-controller). + if err := setDevices(m.dirPath, r); err != nil { + // When rootless is true, errors from the device subsystem are ignored because it is + // really not expected to work. However, errors from other subsystems are not ignored. + // See @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error". + if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { + return err + } } } // cpuset (since kernel 5.0) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index bcd1f99eb0c..74f35146937 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -37,7 +37,7 @@ var ( // [github.com/opencontainers/runc/libcontainer/cgroups/devices] // package is imported, it is set to nil, so cgroup managers can't // configure devices. - GenerateDeviceProps func(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) + GenerateDeviceProps func(r *cgroups.Resources, sdVer, cgroupVer int) ([]systemdDbus.Property, error) ) // NOTE: This function comes from package github.com/coreos/go-systemd/util @@ -350,7 +350,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st // generateDeviceProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { +func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager, cgroupVersion int) ([]systemdDbus.Property, error) { if GenerateDeviceProps == nil { if len(r.Devices) > 0 { return nil, cgroups.ErrDevicesUnsupported @@ -358,5 +358,5 @@ func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager) ([]syst return nil, nil } - return GenerateDeviceProps(r, systemdVersion(cm)) + return GenerateDeviceProps(r, systemdVersion(cm), cgroupVersion) } diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index cee76eb35cf..abb9c123bd9 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -74,7 +74,7 @@ var legacySubsystems = []subsystem{ func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property - deviceProperties, err := generateDeviceProperties(r, cm) + deviceProperties, err := generateDeviceProperties(r, cm, 1) if err != nil { return nil, err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 7b16f528b9c..bc7e026a74e 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -208,12 +208,7 @@ func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConn var properties []systemdDbus.Property - // NOTE: This is of questionable correctness because we insert our own - // devices eBPF program later. Two programs with identical rules - // aren't the end of the world, but it is a bit concerning. However - // it's unclear if systemd removes all eBPF programs attached when - // doing SetUnitProperties... - deviceProperties, err := generateDeviceProperties(r, cm) + deviceProperties, err := generateDeviceProperties(r, cm, 2) if err != nil { return nil, err }