Commit 0488814

avagin authored and gvisor-bot committed
kvm: avoid mmio exits when Sentry faults on unmapped memory
Currently, we generate page tables for the entire Sentry address space. Consequently, when the Sentry faults on unmapped memory - meaning a memory region not yet mapped into the VM - an MMIO exit is triggered. Because the faulting instruction is then emulated (instead of executed natively), it becomes impossible to trigger a "normal" memory fault.

To solve this, we must set up page tables only for the regions that are explicitly mapped into the VM. This is more challenging than it sounds, for several reasons:

- We map memory regions into the VM from a signal handler, where memory allocation is prohibited. This means all necessary page table entries must be allocated during platform initialization.
- Our memory regions are not aligned to huge page boundaries. Therefore, when mapping a memory slot, we often need to split huge pages and allocate new page table entries.
- We run into the nosplit stack limit, requiring us to introduce a PTEs.Get method: indexing slice entries directly emits a bounds check whose panic path requires a lot of extra nosplit stack.

PiperOrigin-RevId: 827726897
1 parent 2e0758b commit 0488814
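For context on the nosplit point above, here is a minimal, self-contained sketch (not code from this commit; the helper names are made up for illustration) contrasting a bounds-checked array access with the pointer-arithmetic approach the new PTEs.Get method uses. The compiler-inserted bounds check carries a hidden panic path, and that path is what eats into the nosplit stack budget.

package main

import (
	"fmt"
	"unsafe"
)

// Simplified stand-ins for the real pagetables types (PTEs is a fixed-size
// array of 8-byte entries, as on amd64).
type PTE uintptr
type PTEs [512]PTE

// getChecked indexes the array directly. The compiler inserts a bounds
// check, and its hidden panic path is what costs extra stack in
// //go:nosplit callers.
func getChecked(p *PTEs, idx uint16) *PTE {
	return &p[idx]
}

// getUnsafe mirrors the PTEs.Get approach added by this commit: plain
// pointer arithmetic, no bounds check, no panic path.
func getUnsafe(p *PTEs, idx uint16) *PTE {
	return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx)))
}

func main() {
	var entries PTEs
	entries[3] = 0x1000
	fmt.Println(*getChecked(&entries, 3) == *getUnsafe(&entries, 3)) // true
}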

File tree

8 files changed: +109 -37 lines


pkg/ring0/pagetables/BUILD

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ go_library(
         "pagetables_aarch64.go",
         "pagetables_amd64.go",
         "pagetables_arm64.go",
+        "pagetables_unsafe.go",
         "pagetables_x86.go",
         "pcids.go",
         "pcids_aarch64.go",

pkg/ring0/pagetables/pagetables_aarch64.go

Lines changed: 3 additions & 0 deletions
@@ -91,6 +91,9 @@ type MapOpts struct {
 	// User indicates the page is a user page.
 	User bool

+	// Static indicates the entries should not be cleared/freed.
+	Static bool
+
 	// MemoryType is the memory type.
 	MemoryType hostarch.MemoryType
 }
pkg/ring0/pagetables/pagetables_unsafe.go

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+// Copyright 2025 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"unsafe"
+)
+
+// Get returns the entry with the specified index.
+//
+//go:nosplit
+func (p *PTEs) Get(idx uint16) *PTE {
+	return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx)))
+}

pkg/ring0/pagetables/pagetables_x86.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ type MapOpts struct {
 	// User indicates the page is a user page.
 	User bool

+	// Static indicates the entries should not be cleared/freed.
+	Static bool
+
 	// MemoryType is the memory type.
 	MemoryType hostarch.MemoryType
 }
@@ -91,7 +94,7 @@ func (p *PTE) Clear() {
 //
 //go:nosplit
 func (p *PTE) Valid() bool {
-	return atomic.LoadUintptr((*uintptr)(p))&present != 0
+	return atomic.LoadUintptr((*uintptr)(p)) != 0
 }

 // Opts returns the PTE options.
@@ -139,7 +142,7 @@ func (p *PTE) IsSuper() bool {
 //
 //go:nosplit
 func (p *PTE) Set(addr uintptr, opts MapOpts) {
-	if !opts.AccessType.Any() {
+	if false && !opts.AccessType.Any() && !opts.Static {
 		p.Clear()
 		return
 	}

pkg/ring0/pagetables/walker_amd64.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func (w *Walker) walkPTEs(entries *PTEs, start, end uintptr) (bool, uint16) {
 	var clearEntries uint16
 	for start < end {
 		pteIndex := uint16((start & pteMask) >> pteShift)
-		entry := &entries[pteIndex]
+		entry := entries.Get(pteIndex)
 		if !entry.Valid() && !w.visitor.requiresAlloc() {
 			clearEntries++
 			start += pteSize
@@ -81,7 +81,7 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) {
 		var pteEntries *PTEs
 		nextBoundary := addrEnd(start, end, pmdSize)
 		pmdIndex := uint16((start & pmdMask) >> pmdShift)
-		pmdEntry := &pmdEntries[pmdIndex]
+		pmdEntry := pmdEntries.Get(pmdIndex)
 		if !pmdEntry.Valid() {
 			if !w.visitor.requiresAlloc() {
 				// Skip over this entry.
@@ -173,7 +173,7 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) {
 		var pmdEntries *PTEs
 		nextBoundary := addrEnd(start, end, pudSize)
 		pudIndex := uint16((start & pudMask) >> pudShift)
-		pudEntry := &pudEntries[pudIndex]
+		pudEntry := pudEntries.Get(pudIndex)
 		if !pudEntry.Valid() {
 			if !w.visitor.requiresAlloc() {
 				// Skip over this entry.
@@ -261,7 +261,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
 		var pudEntries *PTEs
 		nextBoundary := addrEnd(start, end, pgdSize)
 		pgdIndex := uint16((start & pgdMask) >> pgdShift)
-		pgdEntry := &w.pageTables.root[pgdIndex]
+		pgdEntry := w.pageTables.root.Get(pgdIndex)
 		if !w.pageTables.largeAddressesEnabled {
 			if !pgdEntry.Valid() {
 				if !w.visitor.requiresAlloc() {
 					// Skip over this entry.

pkg/sentry/platform/kvm/bluepill_fault.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,14 @@ import (
 var (
 	// faultBlockSize is the size used for servicing memory faults.
 	//
-	// This should be large enough to avoid frequent faults and avoid using
-	// all available KVM slots (~512), but small enough that KVM does not
-	// complain about slot sizes (~4GB). See handleBluepillFault for how
-	// this block is used.
-	faultBlockSize = uintptr(2 << 30)
+	// This should be large enough so that the total number of slots
+	// required to cover the 47-bit virtual address space does not exceed
+	// the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual
+	// address space above 47-bit by default.
+	// It must be small enough to limit the memory overhead associated with
+	// KVM slot allocation. For example, using a 46-bit address space
+	// results in an overhead of ~250 MB.
+	faultBlockSize = uintptr(8 << 30)

 	// faultBlockMask is the mask for the fault blocks.
 	//
@@ -56,13 +59,17 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length
 		}

 		// Adjust the block to match our size.
-		physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask
-		virtualStart = pr.virtual + (physicalStart - pr.physical)
+		physicalStart = pr.physical / faultBlockSize * faultBlockSize
+		physicalStart = physicalStart + (alignedPhysical-physicalStart)&faultBlockMask
 		physicalEnd := physicalStart + faultBlockSize
+		if physicalStart < pr.physical {
+			physicalStart = pr.physical
+		}
 		if physicalEnd > end {
 			physicalEnd = end
 		}
 		length = physicalEnd - physicalStart
+		virtualStart = pr.virtual + (physicalStart - pr.physical)
 		return virtualStart, physicalStart, length, &physicalRegions[i]
 	}

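As a quick back-of-the-envelope check of the slot arithmetic referenced in the comment above (my own calculation, not part of the commit): with 8 GiB fault blocks, covering the 47-bit user address space needs 2^47 / 2^33 = 16384 memory slots, which stays well below the ~32764 KVM slot limit cited there, whereas the old 2 GiB blocks would have needed 65536.

package main

import "fmt"

func main() {
	const vSize47 = uint64(1) << 47  // 47-bit user virtual address space.
	const oldBlock = uint64(2) << 30 // previous faultBlockSize (2 GiB).
	const newBlock = uint64(8) << 30 // faultBlockSize after this commit (8 GiB).

	// Slots needed to cover the whole 47-bit space at each block size.
	fmt.Println(vSize47/oldBlock, vSize47/newBlock) // 65536 16384 (limit ~32764)
}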

pkg/sentry/platform/kvm/machine.go

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -332,26 +332,33 @@ func newMachine(vm int, config *Config) (*machine, error) {
 		// faultBlockSize has to equal or less than KVM_MEM_MAX_NR_PAGES.
 		faultBlockSize = uintptr(1) << 42
 		faultBlockMask = ^uintptr(faultBlockSize - 1)
+		for _, r := range physicalRegions {
+			m.mapPhysical(r.physical, r.length)
+		}
 	} else {
+		// Apply the physical mappings. Note that these mappings may point to
+		// guest physical addresses that are not actually available. These
+		// physical pages are mapped on demand, see kernel_unsafe.go.
+		applyPhysicalRegions(func(pr physicalRegion) bool {
+			physical := pr.physical
+			for physical < pr.physical+pr.length {
+				virtualStart, physicalStart, length, _ := calculateBluepillFault(physical)
+				// Pre-allocate page tables in the lower half.
+				m.kernel.PageTables.Map(
+					hostarch.Addr(virtualStart),
+					length,
+					pagetables.MapOpts{Static: true},
+					physicalStart)
+				physical += length
+			}
+
+			return true // Keep iterating.
+		})
 		// Install seccomp rules to trap runtime mmap system calls. They will
 		// be handled by seccompMmapHandler.
 		seccompMmapRules(m)
 	}

-	// Apply the physical mappings. Note that these mappings may point to
-	// guest physical addresses that are not actually available. These
-	// physical pages are mapped on demand, see kernel_unsafe.go.
-	applyPhysicalRegions(func(pr physicalRegion) bool {
-		// Map everything in the lower half.
-		m.kernel.PageTables.Map(
-			hostarch.Addr(pr.virtual),
-			pr.length,
-			pagetables.MapOpts{AccessType: hostarch.ReadWrite},
-			pr.physical)
-
-		return true // Keep iterating.
-	})
-
 	// Ensure that the currently mapped virtual regions are actually
 	// available in the VM. Note that this doesn't guarantee no future
 	// faults, however it should guarantee that everything is available to
@@ -368,6 +375,9 @@ func newMachine(vm int, config *Config) (*machine, error) {
 				// Cap the length to the end of the area.
 				length = vr.virtual + vr.length - virtual
 			}
+			// Ensure the physical range is mapped.
+			m.mapPhysical(physical, length)
+
 			// Update page tables for executable mappings.
 			if vr.accessType.Execute {
 				if vr.accessType.Write {
@@ -380,8 +390,6 @@ func newMachine(vm int, config *Config) (*machine, error) {
 					physical)
 			}

-			// Ensure the physical range is mapped.
-			m.mapPhysical(physical, length)
 			virtual += length
 		}
 	}
@@ -404,11 +412,6 @@ func newMachine(vm int, config *Config) (*machine, error) {
 		mapRegion(vr, 0)

 	})
-	if mapEntireAddressSpace {
-		for _, r := range physicalRegions {
-			m.mapPhysical(r.physical, r.length)
-		}
-	}
 	enableAsyncPreemption()
 	// Initialize architecture state.
 	if err := m.initArchState(); err != nil {
@@ -458,8 +461,15 @@ func (m *machine) mapPhysical(physical, length uintptr) {
 		}

 		// Is this already mapped? Check the usedSlots.
-		if !pr.mmio && !m.hasSlot(physicalStart) {
-			m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
+		if !m.hasSlot(physicalStart) {
+			m.kernel.PageTables.Map(
+				hostarch.Addr(virtualStart),
+				length,
+				pagetables.MapOpts{AccessType: hostarch.ReadWrite},
+				physicalStart)
+			if !pr.mmio {
+				m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
+			}
 		}

 		// Move to the next chunk.

pkg/sentry/platform/kvm/physical_map.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package kvm

 import (
 	"fmt"
+	"runtime"
 	"sort"

 	"golang.org/x/sys/unix"
@@ -66,6 +67,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
 	pSize := uintptr(1) << ring0.PhysicalAddressBits
 	pSize -= reservedMemory

+	maxUserAddr := uintptr(0)
 	// Add specifically excluded regions; see excludeVirtualRegion.
 	if err := applyVirtualRegions(func(vr virtualRegion) {
 		if excludeVirtualRegion(vr) {
@@ -81,9 +83,29 @@
 			})
 			log.Infof("mmio: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
 		}
+		if vr.filename != "[vsyscall]" {
+			maxUserAddr = vr.region.virtual + vr.region.length
+		}
 	}); err != nil {
 		panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err))
 	}
+	if runtime.GOARCH == "amd64" {
+		vSize47 := uintptr(1) << 47
+		// This is a workaround for the kernel bug when vdso can be
+		// mapped above the 47-bit address space boundary.
+		if vSize47 > maxUserAddr {
+			maxUserAddr = vSize47
+		}
+		r := region{
+			virtual: maxUserAddr,
+			length:  vSize - vSize47,
+		}
+		specialRegions = append(specialRegions, specialVirtualRegion{
+			region: r,
+		})
+		vSize -= r.length
+		log.Infof("excluded: virtual [%x,%x)", r.virtual, r.virtual+r.length)
+	}

 	// Do we need any more work?
 	if vSize < pSize {
@@ -109,7 +131,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
 	current := required // Attempted mmap size.
 	filled := uintptr(0)
 	suggestedAddr := uintptr(0)
-	if ring0.VirtualAddressBits > 48 {
+	if false && ring0.VirtualAddressBits > 48 {
 		// Pass a hint address above 47 bits to indicate to the kernel that
 		// we can handle, and want, mappings above 47 bits:
 		// https://docs.kernel.org/arch/x86/x86_64/5level-paging.html#user-space-and-large-virtual-address-space.