
Commit 9782e5e

avagin authored and gvisor-bot committed
kvm: avoid mmio exits when Sentry faults on unmapped memory
Currently, we generate page tables for the entire Sentry address space. Consequently, when the Sentry faults on unmapped memory - that is, a memory region not yet mapped into the VM - an MMIO exit is triggered. Because the faulting instruction is then emulated rather than executed natively, it becomes impossible to deliver a "normal" memory fault.

To solve this, we set up page tables only for the regions that are explicitly mapped into the VM. This is more challenging than it sounds, for several reasons:

- We map memory regions into the VM from a signal handler, where memory allocation is prohibited, so all necessary page table entries must be allocated during platform initialization.
- Our memory regions are not aligned to huge page boundaries, so mapping a memory slot often requires splitting huge pages and allocating new page table entries.
- We run into the nosplit stack limit, which requires introducing a PTEs.Get method so that entries are not accessed via index expressions; the bounds-check panic path of an index expression costs a lot of extra stack.

PiperOrigin-RevId: 827726897
1 parent 5edbb90 commit 9782e5e
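The huge-page-splitting point above is what the walker diffs below implement: when a slot does not start or end on a huge-page boundary, a single 2 MiB entry has to be broken into 512 contiguous 4 KiB entries that inherit the parent's options, using page-table pages that were set aside ahead of time because the split can happen inside a signal handler. As a toy, self-contained Go sketch of just that mechanic (the names entry and splitHuge are made up for illustration and are not gVisor's API):

package main

import "fmt"

const (
	pteSize        = 4 << 10 // 4 KiB leaf page.
	pmdSize        = 2 << 20 // 2 MiB huge page = entriesPerPage * pteSize.
	entriesPerPage = 512     // entries per page-table page.
)

// entry is a toy page-table entry: a physical address plus a "static" flag.
// It models only the bookkeeping, not real hardware bits.
type entry struct {
	addr   uintptr
	static bool
}

// splitHuge breaks one 2 MiB mapping into 512 contiguous 4 KiB entries,
// copying the parent's options. In the real walker the new table must come
// from a pre-allocated pool, since allocation is forbidden in the handler.
func splitHuge(parent entry) *[entriesPerPage]entry {
	ptes := new([entriesPerPage]entry) // stands in for Allocator.NewPTEs()
	for i := 0; i < entriesPerPage; i++ {
		ptes[i] = entry{
			addr:   parent.addr + uintptr(i)*pteSize,
			static: parent.static,
		}
	}
	return ptes
}

func main() {
	ptes := splitHuge(entry{addr: 0x40000000, static: true})
	fmt.Printf("first: %#x last: %#x\n", ptes[0].addr, ptes[entriesPerPage-1].addr)
	// first: 0x40000000 last: 0x401ff000
}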

File tree

13 files changed: +171 −47 lines changed

pkg/ring0/pagetables/BUILD

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ go_library(
         "pagetables_aarch64.go",
         "pagetables_amd64.go",
         "pagetables_arm64.go",
+        "pagetables_unsafe.go",
         "pagetables_x86.go",
         "pcids.go",
         "pcids_aarch64.go",

pkg/ring0/pagetables/pagetables.go

Lines changed: 5 additions & 4 deletions
@@ -110,16 +110,17 @@ func New(a Allocator) *PageTables {
 type mapVisitor struct {
     target   uintptr // Input.
     physical uintptr // Input.
-    opts     MapOpts // Input.
-    prev     bool    // Output.
+    // opts is a pointer just to reduce a stack usage. It should never be changed.
+    opts *MapOpts // Input.
+    prev bool     // Output.
 }

 // visit is used for map.
 //
 //go:nosplit
 func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
     p := v.physical + (start - v.target)
-    if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
+    if pte.Valid() && (pte.Address() != p || pte.Opts() != *v.opts) {
         v.prev = true
     }
     if p&align != 0 {
@@ -169,7 +170,7 @@ func (p *PageTables) Map(addr hostarch.Addr, length uintptr, opts MapOpts, physi
         visitor: mapVisitor{
             target:   uintptr(addr),
             physical: physical,
-            opts:     opts,
+            opts:     &opts,
         },
     }
     w.iterateRange(uintptr(addr), uintptr(addr)+length)
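The new comment explains why opts became *MapOpts: in a //go:nosplit call chain, every by-value MapOpts copy (one in the visitor, one per PTE.Set call) occupies frame bytes, and under the nosplit stack limit those bytes add up. A small, self-contained sketch of the copy sizes involved; the struct below only approximates the shape of MapOpts and is not gVisor code:

package main

import (
	"fmt"
	"unsafe"
)

// mapOpts approximates a MapOpts-like options struct (field names are
// illustrative only).
type mapOpts struct {
	read, write, execute bool
	global, user, static bool
	memoryType           uint32
}

func main() {
	var o mapOpts
	fmt.Println("by value:  ", unsafe.Sizeof(o), "bytes per copy")  // typically 12 on amd64
	fmt.Println("by pointer:", unsafe.Sizeof(&o), "bytes per copy") // 8 on amd64
}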

pkg/ring0/pagetables/pagetables_aarch64.go

Lines changed: 4 additions & 1 deletion
@@ -91,6 +91,9 @@ type MapOpts struct {
     // User indicates the page is a user page.
     User bool

+    // Static indicates the entries should not be cleared/freed.
+    Static bool
+
     // MemoryType is the memory type.
     MemoryType hostarch.MemoryType
 }
@@ -156,7 +159,7 @@ func (p *PTE) IsSect() bool {
 // This does not change the sect page property.
 //
 //go:nosplit
-func (p *PTE) Set(addr uintptr, opts MapOpts) {
+func (p *PTE) Set(addr uintptr, opts *MapOpts) {
     v := (addr &^ optionMask) | nG | readOnly | protDefault
     // Note: p.IsSect is manually inlined to reduce stack size for
     // nosplit-ness.
pkg/ring0/pagetables/pagetables_unsafe.go

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+// Copyright 2025 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+    "unsafe"
+)
+
+// Get returns the entry with the specified index.
+//
+//go:nosplit
+func (p *PTEs) Get(idx uint16) *PTE {
+    return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx)))
+}
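Get exists so that nosplit walker code can take the address of an entry without an index expression: &p[idx] carries a bounds check whose panic path needs extra stack, which is exactly the nosplit limit the commit message mentions. An analogous, self-contained sketch of the same trick, using toy types rather than the gVisor ones:

package main

import (
	"fmt"
	"unsafe"
)

// Toy stand-ins for the PTEs page: a fixed array of 8-byte entries.
type pte uintptr
type ptes [512]pte

// get mirrors the idea of PTEs.Get: compute the element address directly,
// skipping the bounds-checked index expression and its panic path. The
// caller must guarantee idx < 512.
//
//go:nosplit
func (p *ptes) get(idx uint16) *pte {
	return (*pte)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + unsafe.Sizeof(pte(0))*uintptr(idx)))
}

func main() {
	var table ptes
	*table.get(3) = 0xdeadbeef
	fmt.Printf("%#x\n", table[3]) // 0xdeadbeef
}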

pkg/ring0/pagetables/pagetables_x86.go

Lines changed: 6 additions & 3 deletions
@@ -73,6 +73,9 @@ type MapOpts struct {
     // User indicates the page is a user page.
     User bool

+    // Static indicates the entries should not be cleared/freed.
+    Static bool
+
     // MemoryType is the memory type.
     MemoryType hostarch.MemoryType
 }
@@ -91,7 +94,7 @@ func (p *PTE) Clear() {
 //
 //go:nosplit
 func (p *PTE) Valid() bool {
-    return atomic.LoadUintptr((*uintptr)(p))&present != 0
+    return atomic.LoadUintptr((*uintptr)(p)) != 0
 }

 // Opts returns the PTE options.
@@ -138,8 +141,8 @@ func (p *PTE) IsSuper() bool {
 // This does not change the super page property.
 //
 //go:nosplit
-func (p *PTE) Set(addr uintptr, opts MapOpts) {
-    if !opts.AccessType.Any() {
+func (p *PTE) Set(addr uintptr, opts *MapOpts) {
+    if !opts.AccessType.Any() && !opts.Static {
         p.Clear()
         return
     }

pkg/ring0/pagetables/walker_amd64.go

Lines changed: 8 additions & 6 deletions
@@ -43,7 +43,7 @@ func (w *Walker) walkPTEs(entries *PTEs, start, end uintptr) (bool, uint16) {
     var clearEntries uint16
     for start < end {
         pteIndex := uint16((start & pteMask) >> pteShift)
-        entry := &entries[pteIndex]
+        entry := entries.Get(pteIndex)
         if !entry.Valid() && !w.visitor.requiresAlloc() {
             clearEntries++
             start += pteSize
@@ -81,7 +81,7 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) {
         var pteEntries *PTEs
         nextBoundary := addrEnd(start, end, pmdSize)
         pmdIndex := uint16((start & pmdMask) >> pmdShift)
-        pmdEntry := &pmdEntries[pmdIndex]
+        pmdEntry := pmdEntries.Get(pmdIndex)
         if !pmdEntry.Valid() {
             if !w.visitor.requiresAlloc() {
                 // Skip over this entry.
@@ -114,9 +114,10 @@
             // Install the relevant entries.
             pteEntries = w.pageTables.Allocator.NewPTEs()
             for index := uint16(0); index < entriesPerPage; index++ {
+                opts := pmdEntry.Opts()
                 pteEntries[index].Set(
                     pmdEntry.Address()+(pteSize*uintptr(index)),
-                    pmdEntry.Opts())
+                    &opts)
             }
             pmdEntry.setPageTable(w.pageTables, pteEntries)
         } else {
@@ -173,7 +174,7 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) {
         var pmdEntries *PTEs
         nextBoundary := addrEnd(start, end, pudSize)
         pudIndex := uint16((start & pudMask) >> pudShift)
-        pudEntry := &pudEntries[pudIndex]
+        pudEntry := pudEntries.Get(pudIndex)
         if !pudEntry.Valid() {
             if !w.visitor.requiresAlloc() {
                 // Skip over this entry.
@@ -209,9 +210,10 @@
             pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above.
             for index := uint16(0); index < entriesPerPage; index++ {
                 pmdEntries[index].SetSuper()
+                opts := pudEntry.Opts()
                 pmdEntries[index].Set(
                     pudEntry.Address()+(pmdSize*uintptr(index)),
-                    pudEntry.Opts())
+                    &opts)
             }
             pudEntry.setPageTable(w.pageTables, pmdEntries)
         } else {
@@ -261,7 +263,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
         var pudEntries *PTEs
         nextBoundary := addrEnd(start, end, pgdSize)
         pgdIndex := uint16((start & pgdMask) >> pgdShift)
-        pgdEntry := &w.pageTables.root[pgdIndex]
+        pgdEntry := w.pageTables.root.Get(pgdIndex)
         if !w.pageTables.largeAddressesEnabled {
             if !pgdEntry.Valid() {
                 if !w.visitor.requiresAlloc() {
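The repeated "opts := pmdEntry.Opts()" / "opts := pudEntry.Opts()" lines appear because Go call results are not addressable: now that Set takes *MapOpts, a named local must exist before &opts can be formed. A minimal illustration with hypothetical names:

package main

import "fmt"

type options struct{ n int }

func get() options     { return options{n: 42} }
func use(o *options)   { fmt.Println(o.n) }

func main() {
	// use(&get()) would not compile: a call result is not addressable.
	o := get() // copy the result into an addressable local
	use(&o)
}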

pkg/ring0/pagetables/walker_arm64.go

Lines changed: 4 additions & 2 deletions
@@ -87,9 +87,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
             pmdEntries = w.pageTables.Allocator.NewPTEs()
             for index := uint16(0); index < entriesPerPage; index++ {
                 pmdEntries[index].SetSect()
+                opts := pudEntry.Opts()
                 pmdEntries[index].Set(
                     pudEntry.Address()+(pmdSize*uintptr(index)),
-                    pudEntry.Opts())
+                    &opts)
             }
             pudEntry.setPageTable(w.pageTables, pmdEntries)
         } else {
@@ -152,9 +153,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
             // Install the relevant entries.
             pteEntries = w.pageTables.Allocator.NewPTEs()
             for index := uint16(0); index < entriesPerPage; index++ {
+                opts := pmdEntry.Opts()
                 pteEntries[index].Set(
                     pmdEntry.Address()+(pteSize*uintptr(index)),
-                    pmdEntry.Opts())
+                    &opts)
             }
             pmdEntry.setPageTable(w.pageTables, pteEntries)
         } else {

pkg/sentry/platform/kvm/bluepill_fault.go

Lines changed: 14 additions & 7 deletions
@@ -25,11 +25,14 @@ import (
 var (
     // faultBlockSize is the size used for servicing memory faults.
     //
-    // This should be large enough to avoid frequent faults and avoid using
-    // all available KVM slots (~512), but small enough that KVM does not
-    // complain about slot sizes (~4GB). See handleBluepillFault for how
-    // this block is used.
-    faultBlockSize = uintptr(2 << 30)
+    // This should be large enough so that the total number of slots
+    // required to cover the 47-bit virtual address space does not exceed
+    // the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual
+    // address space above 47-bit by default.
+    // It must be small enough to limit the memory overhead associated with
+    // KVM slot allocation. For example, using a 46-bit address space
+    // results in an overhead of ~250 MB.
+    faultBlockSize = uintptr(8 << 30)

     // faultBlockMask is the mask for the fault blocks.
     //
@@ -56,13 +59,17 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, leng
         }

         // Adjust the block to match our size.
-        physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask
-        virtualStart = pr.virtual + (physicalStart - pr.physical)
+        physicalStart = pr.physical / faultBlockSize * faultBlockSize
+        physicalStart = physicalStart + (alignedPhysical-physicalStart)&faultBlockMask
         physicalEnd := physicalStart + faultBlockSize
+        if physicalStart < pr.physical {
+            physicalStart = pr.physical
+        }
         if physicalEnd > end {
             physicalEnd = end
         }
         length = physicalEnd - physicalStart
+        virtualStart = pr.virtual + (physicalStart - pr.physical)
         return virtualStart, physicalStart, length, &physicalRegions[i]
     }
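The new comment's sizing argument works out as follows: covering a 47-bit address space with the old 2 GiB blocks would require 2^47 / 2^31 = 65536 slots, above the quoted limit, while 8 GiB blocks need only 2^47 / 2^33 = 16384. A quick check; the arithmetic is mine, only the 32764 figure comes from the comment:

package main

import "fmt"

func main() {
	const (
		addrSpace = uint64(1) << 47 // 47-bit user virtual address space
		oldBlock  = uint64(2) << 30 // previous faultBlockSize: 2 GiB
		newBlock  = uint64(8) << 30 // new faultBlockSize: 8 GiB
		slotLimit = 32764           // KVM slot limit quoted in the comment
	)
	fmt.Printf("2 GiB blocks: %d slots (limit %d)\n", addrSpace/oldBlock, slotLimit) // 65536 > 32764
	fmt.Printf("8 GiB blocks: %d slots (limit %d)\n", addrSpace/newBlock, slotLimit) // 16384 < 32764
}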

pkg/sentry/platform/kvm/kvm_test.go

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@ import (
 var dummyFPState fpu.State

 type testHarness interface {
+    Logf(format string, args ...any)
     Errorf(format string, args ...any)
     Fatalf(format string, args ...any)
 }
@@ -146,6 +147,7 @@ func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn f
             // done for regular user code, but is fine for test
             // purposes.)
             applyPhysicalRegions(func(pr physicalRegion) bool {
+                t.Logf("Map %x-%x", pr.virtual, pr.virtual+pr.length)
                 pt.Map(hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{
                     AccessType: hostarch.AnyAccess,
                     User:       true,

pkg/sentry/platform/kvm/machine.go

Lines changed: 33 additions & 23 deletions
@@ -332,26 +332,33 @@ func newMachine(vm int, config *Config) (*machine, error) {
         // faultBlockSize has to equal or less than KVM_MEM_MAX_NR_PAGES.
         faultBlockSize = uintptr(1) << 42
         faultBlockMask = ^uintptr(faultBlockSize - 1)
+        for _, r := range physicalRegions {
+            m.mapPhysical(r.physical, r.length)
+        }
     } else {
+        // Apply the physical mappings. Note that these mappings may point to
+        // guest physical addresses that are not actually available. These
+        // physical pages are mapped on demand, see kernel_unsafe.go.
+        applyPhysicalRegions(func(pr physicalRegion) bool {
+            physical := pr.physical
+            for physical < pr.physical+pr.length {
+                virtualStart, physicalStart, length, _ := calculateBluepillFault(physical)
+                // Pre-allocate page tables in the lower half.
+                m.kernel.PageTables.Map(
+                    hostarch.Addr(virtualStart),
+                    length,
+                    pagetables.MapOpts{Static: true},
+                    physicalStart)
+                physical += length
+            }
+
+            return true // Keep iterating.
+        })
         // Install seccomp rules to trap runtime mmap system calls. They will
         // be handled by seccompMmapHandler.
         seccompMmapRules(m)
     }

-    // Apply the physical mappings. Note that these mappings may point to
-    // guest physical addresses that are not actually available. These
-    // physical pages are mapped on demand, see kernel_unsafe.go.
-    applyPhysicalRegions(func(pr physicalRegion) bool {
-        // Map everything in the lower half.
-        m.kernel.PageTables.Map(
-            hostarch.Addr(pr.virtual),
-            pr.length,
-            pagetables.MapOpts{AccessType: hostarch.ReadWrite},
-            pr.physical)
-
-        return true // Keep iterating.
-    })
-
     // Ensure that the currently mapped virtual regions are actually
     // available in the VM. Note that this doesn't guarantee no future
     // faults, however it should guarantee that everything is available to
@@ -368,6 +375,9 @@ func newMachine(vm int, config *Config) (*machine, error) {
             // Cap the length to the end of the area.
            length = vr.virtual + vr.length - virtual
        }
+        // Ensure the physical range is mapped.
+        m.mapPhysical(physical, length)
+
        // Update page tables for executable mappings.
        if vr.accessType.Execute {
            if vr.accessType.Write {
@@ -380,8 +390,6 @@
                physical)
        }

-        // Ensure the physical range is mapped.
-        m.mapPhysical(physical, length)
        virtual += length
    }
 }
@@ -404,11 +412,6 @@
        mapRegion(vr, 0)

    })
-    if mapEntireAddressSpace {
-        for _, r := range physicalRegions {
-            m.mapPhysical(r.physical, r.length)
-        }
-    }
    enableAsyncPreemption()
    // Initialize architecture state.
    if err := m.initArchState(); err != nil {
@@ -458,8 +461,15 @@ func (m *machine) mapPhysical(physical, length uintptr) {
        }

        // Is this already mapped? Check the usedSlots.
-        if !pr.mmio && !m.hasSlot(physicalStart) {
-            m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
+        if !m.hasSlot(physicalStart) {
+            m.kernel.PageTables.Map(
+                hostarch.Addr(virtualStart),
+                length,
+                pagetables.MapOpts{AccessType: hostarch.ReadWrite},
+                physicalStart)
+            if !pr.mmio {
+                m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
+            }
        }

        // Move to the next chunk.
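The net effect of the newMachine changes: instead of mapping every physical region read-write up front, initialization walks each region in faultBlockSize chunks via calculateBluepillFault and installs Static entries, so the page-table pages already exist when mapPhysical later runs from the signal handler and installs the real ReadWrite mapping plus the KVM memory slot. Below is a self-contained sketch of the chunking step only; the chunks helper is my own and not the gVisor function:

package main

import "fmt"

const blockSize = uint64(8) << 30 // mirrors faultBlockSize (8 GiB)

// chunks splits [start, start+length) into pieces that never cross a
// blockSize boundary, mimicking how newMachine walks a physical region
// to pre-allocate one set of page tables per fault block.
func chunks(start, length uint64) [][2]uint64 {
	var out [][2]uint64
	for cur, end := start, start+length; cur < end; {
		next := (cur/blockSize + 1) * blockSize // next block boundary
		if next > end {
			next = end
		}
		out = append(out, [2]uint64{cur, next - cur})
		cur = next
	}
	return out
}

func main() {
	// A region that starts mid-block and spans two block boundaries.
	for _, c := range chunks(7<<30, 20<<30) {
		fmt.Printf("start=%#x length=%#x\n", c[0], c[1])
	}
	// start=0x1c0000000 length=0x40000000   (7 GiB .. 8 GiB)
	// start=0x200000000 length=0x200000000  (8 GiB .. 16 GiB)
	// start=0x400000000 length=0x200000000  (16 GiB .. 24 GiB)
	// start=0x600000000 length=0xc0000000   (24 GiB .. 27 GiB)
}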
