Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libcontainer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ config := &configs.Config{
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
{Type: configs.NEWCGROUP},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Expand Down
21 changes: 11 additions & 10 deletions libcontainer/SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@ Minimum requirements:

### Namespaces

| Flag | Enabled |
| ------------ | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |

Namespaces are created for the container via the `clone` syscall.
| Flag | Enabled |
| --------------- | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |
| CLONE_NEWCGROUP | 1 |

Namespaces are created for the container via the `unshare` syscall.


### Filesystem
Expand Down
8 changes: 4 additions & 4 deletions libcontainer/cgroups/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
)

const (
cgroupNamePrefix = "name="
CgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)

Expand Down Expand Up @@ -139,8 +139,8 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
if strings.HasPrefix(opt, CgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(CgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
Expand Down Expand Up @@ -294,7 +294,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
return p, nil
}

if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}

Expand Down
16 changes: 10 additions & 6 deletions libcontainer/configs/namespaces_syscall.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@ func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}

// This is not yet in the Go stdlib.
const syscall_CLONE_NEWCGROUP = (1 << 25)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been added to the unix library.


var namespaceInfo = map[NamespaceType]int{
NEWNET: syscall.CLONE_NEWNET,
NEWNS: syscall.CLONE_NEWNS,
NEWUSER: syscall.CLONE_NEWUSER,
NEWIPC: syscall.CLONE_NEWIPC,
NEWUTS: syscall.CLONE_NEWUTS,
NEWPID: syscall.CLONE_NEWPID,
NEWNET: syscall.CLONE_NEWNET,
NEWNS: syscall.CLONE_NEWNS,
NEWUSER: syscall.CLONE_NEWUSER,
NEWIPC: syscall.CLONE_NEWIPC,
NEWUTS: syscall.CLONE_NEWUTS,
NEWPID: syscall.CLONE_NEWPID,
NEWCGROUP: syscall_CLONE_NEWCGROUP,
}

// CloneFlags parses the container's Namespaces options to set the correct
Expand Down
16 changes: 10 additions & 6 deletions libcontainer/configs/namespaces_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ import (
)

const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
)

var (
Expand All @@ -37,6 +38,8 @@ func NsName(ns NamespaceType) string {
return "user"
case NEWUTS:
return "uts"
case NEWCGROUP:
return "cgroup"
}
return ""
}
Expand Down Expand Up @@ -70,6 +73,7 @@ func NamespaceTypes() []NamespaceType {
NEWUTS,
NEWIPC,
NEWUSER,
NEWCGROUP,
}
}

Expand Down
12 changes: 12 additions & 0 deletions libcontainer/configs/validate/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.cgroupnamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
Expand Down Expand Up @@ -107,6 +110,15 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
return nil
}

func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWCGROUP) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
}
}
return nil
}

// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
Expand Down
1 change: 1 addition & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -1293,6 +1293,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
configs.NEWCGROUP,
}

// Remove namespaces that we don't need to join.
Expand Down
57 changes: 57 additions & 0 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1694,3 +1694,60 @@ func TestTmpfsCopyUp(t *testing.T) {
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
}
}

func TestCGROUPPrivate(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
t.Skip("cgroupns is unsupported")
}
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I mentioned above, this should be a t.Skip if os.IsNotExist(err).


config := newTemplateConfig(rootfs)
config.Namespaces.Add(configs.NEWCGROUP, "")
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
}
}

func TestCGROUPHost(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
t.Skip("cgroupns is unsupported")
}
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
}
}
46 changes: 34 additions & 12 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ enum sync_t {
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};

/*
* Synchronisation value for cgroup namespace setup.
* The same constant is defined in process_linux.go as "createCgroupns".
*/
#define CREATECGROUPNS 0x80

/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
Expand Down Expand Up @@ -570,6 +576,17 @@ void nsexec(void)
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}

/* Send the init_func pid back to our parent. */
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}
}
break;
case SYNC_CHILD_READY:
Expand Down Expand Up @@ -614,17 +631,6 @@ void nsexec(void)
}
}

/* Send the init_func pid back to our parent. */
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}

exit(0);
}

Expand All @@ -640,6 +646,7 @@ void nsexec(void)
case JUMP_CHILD: {
pid_t child;
enum sync_t s;
uint32_t actual_flags = config.cloneflags;

/* We're in a child and thus need to tell the parent if we die. */
syncfd = sync_child_pipe[0];
Expand Down Expand Up @@ -667,7 +674,9 @@ void nsexec(void)
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
if (actual_flags & CLONE_NEWCGROUP)
actual_flags &= ~CLONE_NEWCGROUP;
if (unshare(actual_flags) < 0)
bail("failed to unshare namespaces");

/*
Expand Down Expand Up @@ -777,6 +786,19 @@ void nsexec(void)
if (setgroups(0, NULL) < 0)
bail("setgroups failed");

/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed");
if (value == CREATECGROUPNS) {
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
}
else
bail("received unknown synchronisation value");
}

s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
Expand Down
51 changes: 34 additions & 17 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ import (
"github.com/opencontainers/runc/libcontainer/utils"
)

// Synchronisation value for cgroup namespace setup.
// The same constant is defined in nsexec.c as "CREATECGROUPNS".
const createCgroupns byte = (1 << 7)

type parentProcess interface {
// pid returns the pid for the running process.
pid() int
Expand Down Expand Up @@ -224,12 +228,17 @@ func (p *initProcess) externalDescriptors() []string {
return p.fds
}

// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
// getChildPid receives the final child's pid over the provided pipe.
func (p *initProcess) getChildPid() (int, error) {
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return -1, err
}
return pid.Pid, nil
}

func (p *initProcess) waitForChildExit(childPid int) error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
Expand All @@ -239,12 +248,7 @@ func (p *initProcess) execSetns() error {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
process, err := os.FindProcess(pid.Pid)
process, err := os.FindProcess(childPid)
if err != nil {
return err
}
Expand All @@ -266,22 +270,35 @@ func (p *initProcess) start() error {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
childPid, err := p.getChildPid()
if err != nil {
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
fds, err := getPipeFds(childPid)
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
if err := p.manager.Apply(childPid); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
// Now it's time to setup cgroup namesapce
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
return newSystemErrorWithCause(err, "sending synchronization value to init process")
}
}

// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return newSystemErrorWithCause(err, "waiting for our first child to exit")
}

defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
Expand Down
Loading