-
Notifications
You must be signed in to change notification settings - Fork 336
/
Copy pathcloned_binary_linux.go
262 lines (237 loc) · 8.57 KB
/
cloned_binary_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
package dmz
import (
"errors"
"fmt"
"io"
"os"
"strconv"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/system"
)
// SealFunc is the signature shared by the sealing helpers in this file. The
// file is passed as a pointer-to-pointer so that an implementation can swap
// the handle for a different one (sealFile does exactly that).
type SealFunc func(**os.File) error

// Compile-time checks that each sealing helper matches SealFunc.
var _ SealFunc = sealMemfd
var _ SealFunc = sealFile
// isExecutable reports whether f passes an X_OK access check.
//
// The descriptor itself is checked first; if that check yields anything other
// than success or EACCES, the same check is retried through the
// /proc/self/fd/<fd> path. When neither check gives a definite answer the file
// is optimistically reported as executable.
func isExecutable(f *os.File) bool {
	fd := int(f.Fd())

	// Attempt 1: check the descriptor directly.
	switch err := unix.Faccessat(fd, "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err {
	case nil:
		return true
	case unix.EACCES:
		return false
	}

	// Attempt 2: check via the procfs path for the descriptor.
	switch err := unix.Access("/proc/self/fd/"+strconv.Itoa(fd), unix.X_OK); err {
	case nil:
		return true
	case unix.EACCES:
		return false
	}

	// No definite answer either way -- assume executable; a wrong guess
	// surfaces later as an exec failure.
	logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
	return true
}
// baseMemfdSeals is the set of memfd seals that sealMemfd applies and that
// IsCloned requires to be present on a cloned binary.
const baseMemfdSeals = unix.F_SEAL_WRITE |
	unix.F_SEAL_GROW |
	unix.F_SEAL_SHRINK |
	unix.F_SEAL_SEAL
// sealMemfd locks down the memfd behind *f: it sets the file mode to 0o511
// and then applies the memfd seals. The handle in *f is not replaced.
//
// A Chmod failure is returned as-is; a failure to apply baseMemfdSeals is
// returned as an os.SyscallError for "fcntl(F_ADD_SEALS)".
func sealMemfd(f **os.File) error {
	memfd := *f
	if err := memfd.Chmod(0o511); err != nil {
		return err
	}
	rawFd := memfd.Fd()

	// Newer sealing flags are applied on a best-effort basis: they are not
	// required, and ignoring their errors keeps this working on older
	// kernels.
	//
	// F_SEAL_FUTURE_WRITE is deliberately skipped. It adds nothing because we
	// already use the stronger F_SEAL_WRITE, and it is buggy on Linux <5.5 --
	// see kernel commit 05d351102dbe and
	// <https://github.com/opencontainers/runc/pull/4640>.
	//
	// This must stay ahead of the base seals below: those include
	// F_SEAL_SEAL, which (per fcntl(2)) blocks adding any further seals.

	// F_SEAL_EXEC -- Linux 6.3
	const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
	_, _ = unix.FcntlInt(rawFd, unix.F_ADD_SEALS, F_SEAL_EXEC)

	// The original set of memfd seals is mandatory.
	if _, err := unix.FcntlInt(rawFd, unix.F_ADD_SEALS, baseMemfdSeals); err != nil {
		return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
	}
	return nil
}
// Memfd returns a new executable memfd that allows sealing (memfds are
// supported since Linux 3.17), together with the SealFunc to call once its
// contents have been written. The memfd name is "runc_cloned:" followed by
// comment.
func Memfd(comment string) (*os.File, SealFunc, error) {
	const flags = unix.MFD_ALLOW_SEALING | unix.MFD_CLOEXEC

	memfd, err := system.ExecutableMemfd("runc_cloned:"+comment, flags)
	return memfd, sealMemfd, err
}
// sealFile "seals" a temporary-file handle by replacing it with an O_PATH
// re-open of the same file, which gets rid of the writable handle we hold.
//
// On success the old handle is closed and *f points at the new O_PATH handle.
// On failure *f is left untouched and the error is wrapped with
// "reopen tmpfile".
func sealFile(f **os.File) error {
	writable := *f

	// Re-open through procfs so that the new handle refers to the same
	// (possibly unlinked) file.
	procPath := "/proc/self/fd/" + strconv.FormatUint(uint64(writable.Fd()), 10)
	sealed, err := os.OpenFile(procPath, unix.O_PATH|unix.O_CLOEXEC, 0)
	if err != nil {
		return fmt.Errorf("reopen tmpfile: %w", err)
	}

	// The writable handle is no longer needed; a close error is not
	// actionable here.
	_ = writable.Close()
	*f = sealed
	return nil
}
// otmpfile opens an anonymous open(O_TMPFILE) file inside dir (O_TMPFILE is
// supported since Linux 3.11) and returns it along with sealFile as its
// SealFunc. The file is opened read-write with mode 0o700.
//
// An error is returned if the open fails, if the descriptor cannot be
// fstat'ed, or if the resulting file has a non-zero link count; in the latter
// two cases the descriptor is closed first.
func otmpfile(dir string) (*os.File, SealFunc, error) {
	const openFlags = unix.O_TMPFILE | unix.O_RDWR | unix.O_EXCL | unix.O_CLOEXEC

	tmp, err := os.OpenFile(dir, openFlags, 0o700)
	if err != nil {
		return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
	}

	// Double-check that what we got back really is an unlinked file.
	var st unix.Stat_t
	if err := unix.Fstat(int(tmp.Fd()), &st); err != nil {
		tmp.Close()
		return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
	}
	if st.Nlink != 0 {
		tmp.Close()
		return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
	}
	return tmp, sealFile, nil
}
// mktemp creates a classic unlinked temporary file in dir and returns it
// along with sealFile as its SealFunc.
//
// The file is created with os.CreateTemp (name prefix "runc."), removed from
// the directory right away, chmod'ed to 0o511, and finally fstat'ed to verify
// that its link count really dropped to zero. If any step after creation
// fails, the descriptor is closed before the error is returned so that it is
// not leaked.
func mktemp(dir string) (*os.File, SealFunc, error) {
	file, err := os.CreateTemp(dir, "runc.")
	if err != nil {
		return nil, nil, err
	}

	// From here on we own an open descriptor: release it on every failure
	// path, and only hand it to the caller on success.
	handedOff := false
	defer func() {
		if !handedOff {
			_ = file.Close()
		}
	}()

	// Unlink the file and verify it was unlinked.
	if err := os.Remove(file.Name()); err != nil {
		return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
	}
	if err := file.Chmod(0o511); err != nil {
		return nil, nil, fmt.Errorf("chmod classic tmpfile: %w", err)
	}
	var stat unix.Stat_t
	if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
		return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
	}
	if stat.Nlink != 0 {
		return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
	}

	handedOff = true
	return file, sealFile, nil
}
// getSealableFile returns an empty file that the cloned binary can be written
// into, plus the SealFunc that must be called on it afterwards.
//
// The strategies are tried in order of preference:
//
//  1. an executable memfd (Memfd), named using comment;
//  2. an O_TMPFILE file (otmpfile) in each candidate directory;
//  3. a classic unlinked temporary file (mktemp) in each candidate directory.
//
// For the two file-based strategies, a file that fails the isExecutable check
// is closed and the next directory is tried. If every strategy fails, the
// returned error wraps the reason the last attempt was rejected.
func getSealableFile(comment, tmpDir string) (*os.File, SealFunc, error) {
	// First, try an executable memfd (supported since Linux 3.17).
	file, sealFn, err := Memfd(comment)
	if err == nil {
		return file, sealFn, nil
	}
	logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)

	// The tmpDir here (c.root) might be mounted noexec, so we need a couple of
	// fallbacks to try. It's possible that none of these are writable and
	// executable, in which case there's nothing we can practically do (other
	// than mounting our own executable tmpfs, which would have its own
	// issues).
	tmpDirs := []string{
		tmpDir,
		os.TempDir(),
		"/tmp",
		".",
		"/bin",
		"/",
	}

	// Try to fallback to O_TMPFILE (supported since Linux 3.11).
	file, sealFn, err = firstExecutableTmpfile(tmpDirs, otmpfile)
	if err == nil {
		return file, sealFn, nil
	}
	logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)

	// Finally, try a classic unlinked temporary file.
	file, sealFn, err = firstExecutableTmpfile(tmpDirs, mktemp)
	if err == nil {
		return file, sealFn, nil
	}
	return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
}

// firstExecutableTmpfile calls create on each directory in dirs, in order, and
// returns the first file that passes the isExecutable check. A non-nil error
// is always returned on failure: it describes why the last directory tried
// was rejected, including the case where a file was created but was not
// executable.
func firstExecutableTmpfile(dirs []string, create func(dir string) (*os.File, SealFunc, error)) (*os.File, SealFunc, error) {
	lastErr := errors.New("no temporary directories to try")
	for _, dir := range dirs {
		file, sealFn, err := create(dir)
		if err != nil {
			lastErr = err
			continue
		}
		if !isExecutable(file) {
			logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
			file.Close()
			// Record a real error so that a noexec directory at the end of
			// the list does not leave the caller with a nil cause.
			lastErr = fmt.Errorf("tmpdir %s is noexec", dir)
			continue
		}
		return file, sealFn, nil
	}
	return nil, nil, lastErr
}
// CloneBinary copies the binary read from src into a freshly created sealable
// file and seals it. The sealed clone exists to stop a container process from
// reaching host binaries through procfs magic-links; see CVE-2019-5736 for
// the background.
//
// size is the number of bytes src is expected to yield; copying any other
// amount is treated as an error. name is used for logging and is passed to
// getSealableFile as the comment, and tmpDir is the preferred directory for
// the file-based fallbacks. On every failure after the file has been created
// it is closed before returning.
func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
	logrus.Debugf("cloning %s binary (%d bytes)", name, size)

	clone, seal, err := getSealableFile(name, tmpDir)
	if err != nil {
		return nil, err
	}

	written, err := system.Copy(clone, src)
	switch {
	case err != nil:
		clone.Close()
		return nil, fmt.Errorf("copy binary: %w", err)
	case written != size:
		clone.Close()
		return nil, fmt.Errorf("copied binary size mismatch: %d != %d", written, size)
	}

	// Sealing may replace the handle, hence the pointer to our variable.
	if err := seal(&clone); err != nil {
		clone.Close()
		return nil, fmt.Errorf("could not seal fd: %w", err)
	}
	return clone, nil
}
// IsCloned reports whether exe can be guaranteed to be a safe executable,
// i.e. whether it carries every seal in baseMemfdSeals. A file whose seals
// cannot be queried is reported as not cloned.
func IsCloned(exe *os.File) bool {
	seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
	if err != nil {
		// /proc/self/exe is probably not a memfd
		logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
		return false
	}
	logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)

	// Every base seal must be present: none may be missing from seals.
	missing := baseMemfdSeals &^ seals
	return missing == 0
}
// CloneSelfExe produces a sealed clone of the running binary, taken from
// /proc/self/exe. The clone is meant to be used for "runc init" so that the
// container process can never resolve the original runc binary; see
// CVE-2019-5736 for the background.
//
// tmpDir is handed to both the overlayfs attempt and the copy fallback.
func CloneSelfExe(tmpDir string) (*os.File, error) {
	const selfExePath = "/proc/self/exe"

	// Preferred approach: a temporary overlayfs that yields a read-only
	// /proc/self/exe which the container cannot "unwrap". Unlike CloneBinary
	// this needs no extra memory and avoids the (fairly noticeable) cost of
	// copying a large binary into a memfd.
	//
	// Basic performance testing showed effectively no overhead for overlayfs
	// (on par with both MS_BIND+MS_RDONLY and with no cloning at all),
	// whereas memfd copying adds around ~60% overhead to container startup.
	if sealed, overlayErr := sealedOverlayfs(selfExePath, tmpDir); overlayErr != nil {
		logrus.WithError(overlayErr).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
	} else {
		logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
		return sealed, nil
	}

	// Fallback: copy the running binary into a sealable file.
	exe, err := os.Open(selfExePath)
	if err != nil {
		return nil, fmt.Errorf("opening current binary: %w", err)
	}
	defer exe.Close()

	info, err := exe.Stat()
	if err != nil {
		return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
	}
	return CloneBinary(exe, info.Size(), selfExePath, tmpDir)
}
// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
// be guaranteed to be safe. This means that it must be a sealed memfd. Other
// types of clones cannot be completely verified as safe.
func IsSelfExeCloned() bool {
selfExe, err := os.Open("/proc/self/exe")
if err != nil {
logrus.Debugf("open /proc/self/exe failed: %v", err)
return false
}
defer selfExe.Close()
return IsCloned(selfExe)
}