Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 49 additions & 15 deletions kernel/src/filesystem/vfs/iov.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
use alloc::vec::Vec;
use system_error::SystemError;

use crate::syscall::user_access::{UserBufferReader, UserBufferWriter};
use crate::{
mm::VirtAddr,
syscall::user_access::{user_accessible_len, UserBufferReader, UserBufferWriter},
};
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct IoVec {
Expand Down Expand Up @@ -73,24 +76,55 @@ impl IoVecs {
/// This function reads data from each IoVec in sequence and combines them into
/// a single contiguous buffer.
///
/// # Returns
/// **Returns:**
///
/// Returns a [`Vec<u8>`] containing all the data from the IoVecs.
/// Returns a [`Vec<u8>`] containing the data copied from the IoVecs.
///
/// # Examples
/// **Note:**
///
/// ```rust
/// let iovecs = IoVecs::from_user(/* ... */)?;
/// let buffer = iovecs.gather();
/// ```
pub fn gather(&self) -> Vec<u8> {
let mut buf = Vec::new();
for slice in self.0.iter() {
let buf_reader = UserBufferReader::new(slice.iov_base, slice.iov_len, true).unwrap();
let slice = buf_reader.buffer::<u8>(0).unwrap();
buf.extend_from_slice(slice);
/// If a buffer is only partially accessible, data is copied up to **the first
/// inaccessible byte** and the remaining iovecs are ignored. If no data can be
/// read at all, `Err(SystemError::EFAULT)` is returned.
pub fn gather(&self) -> Result<Vec<u8>, SystemError> {
let mut buf = Vec::with_capacity(self.total_len());

for iov in self.0.iter() {
// 检查从 iov_base 开始有多少 bytes 在 vma 内部且实际可以访问
let accessible =
user_accessible_len(VirtAddr::new(iov.iov_base as usize), iov.iov_len, false);

// log::debug!(
// "iov is {:?}. iov_len: {}; accessible len:{}",
// iov,
// iov.iov_len,
// accessible
// );

// 如果一个字节都不能访问
if accessible == 0 {
if buf.is_empty() {
// log::error!(
// "The first iov is empty, returning EFAULT. iov shape: {:?}",
// iov
// );
return Err(SystemError::EFAULT);
}
return Ok(buf);
}

// 复制可访问的部分
unsafe {
let src = core::slice::from_raw_parts(iov.iov_base as *const u8, accessible);
buf.extend_from_slice(src);
}

// 如果没有读取完整个 iov,说明遇到了不可访问的区域
if accessible < iov.iov_len {
return Ok(buf);
}
}
return buf;

Ok(buf)
}

/// Scatters the given data into the IoVecs.
Expand Down
9 changes: 7 additions & 2 deletions kernel/src/filesystem/vfs/syscall/sys_pwritev.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,15 @@ impl Syscall for SysPwriteVHandle {
let iov_count = Self::iov_count(args);
let offset = Self::offset(args);

// 将用户态传入的指向用户态应用的数据结构重新在内核栈上构造
// 将用户态传入的数据结构 `IoVecs` 重新在内核上构造
let iovecs = unsafe { IoVecs::from_user(iov, iov_count, false) }?;
let data = iovecs.gather();
let data = iovecs.gather()?;

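// TODO: support scatter writes without an intermediate kernel copy (requires scatter-write support in the underlying file system)
// - Validate the user-space IoVec against the VMAs and pass it straight to the file system for a scatter write, avoiding the kernel copy
// - Linux implementation path: writev --> vfs_writev --> do_iter_write --> do_loop_readv_writev/do_iter_readv_writev
// - The kernel file subsystem does not implement scatter writes yet, so the user-space IoVec cannot be used for the write directly
// - For now the user-space IoVec is gathered into one contiguous kernel buffer `data` and written in a single operation, which avoids the overhead of issuing multiple writes.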
do_pwritev(fd, &data, offset)
}

Expand Down
10 changes: 8 additions & 2 deletions kernel/src/filesystem/vfs/syscall/sys_writev.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,15 @@ impl Syscall for SysWriteVHandle {
let iov = Self::iov(args);
let count = Self::count(args);

// IoVecs会进行用户态检验
// 将用户态传入的数据结构 `IoVecs` 重新在内核上构造
let iovecs = unsafe { IoVecs::from_user(iov, count, false) }?;
let data = iovecs.gather();
let data = iovecs.gather()?;

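// TODO: support scatter writes without an intermediate kernel copy (requires scatter-write support in the underlying file system)
// - Validate the user-space IoVec against the VMAs and pass it straight to the file system for a scatter write, avoiding the kernel copy
// - Linux implementation path: writev --> vfs_writev --> do_iter_write --> do_loop_readv_writev/do_iter_readv_writev
// - The kernel file subsystem does not implement scatter writes yet, so the user-space IoVec cannot be used for the write directly
// - For now the user-space IoVec is gathered into one contiguous kernel buffer `data` and written in a single operation, which avoids the overhead of issuing multiple writes.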
do_write(fd, &data)
}

Expand Down
118 changes: 117 additions & 1 deletion kernel/src/syscall/user_access.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! This file contains functions for kernel-space access to user-space data

use core::{
cmp::min,
mem::size_of,
num::NonZero,
slice::{from_raw_parts, from_raw_parts_mut},
Expand All @@ -12,7 +13,7 @@ use defer::defer;

use crate::{
arch::MMArch,
mm::{verify_area, MemoryManagementArch, VirtAddr},
mm::{verify_area, MemoryManagementArch, VirtAddr, VmFlags},
process::ProcessManager,
};

Expand Down Expand Up @@ -919,3 +920,118 @@ pub unsafe fn copy_to_user_protected(dest: VirtAddr, src: &[u8]) -> Result<usize
_ => Err(SystemError::EFAULT),
}
}

/// Compute the contiguous accessible length starting at `addr`.
///
/// Returns the number of bytes that can be accessed before hitting an unmapped
/// page or a page that lacks the requested permissions.
pub fn user_accessible_len(addr: VirtAddr, size: usize, check_write: bool) -> usize {
// log::error!(
// "user_accessible_len(addr: {:?}, size:{:?}, check_write:{:?}",
// addr,
// size,
// check_write
// );
if size == 0 || addr.is_null() {
return 0;
}

// 获取当前进程的 vm (可访问的地址空间)
let vm = match ProcessManager::current_pcb().basic().user_vm() {
Some(vm) => vm,
None => return 0,
};

let vma_read_guard = vm.read_irqsave();
let mappings = &vma_read_guard.mappings;

let mut checked = 0usize;
let mut current = addr;

while checked < size {
// 判断当前地址是否落在一个有效 VMA 中
let Some(vma) = mappings.contains(current) else {
break;
};

// 获取地址所在 VMA 的起始地址 和结束地址,访问权限标志,后备的文件和当前VMA第一页映射到文件的哪一页
let (region_start, region_end, vm_flags, vma_size, file, file_page_offset) = {
let guard = vma.lock_irqsave();
let region_start = guard.region().start().data();
let region_end = guard.region().end().data();
let vm_flags = *guard.vm_flags();
let vma_size = region_end.saturating_sub(region_start);
let file = guard.vm_file();
let file_page_offset = guard.file_page_offset();

drop(guard);
(
region_start,
region_end,
vm_flags,
vma_size,
file,
file_page_offset,
)
};

// 根据 vm_flags 判断是否具备访问权限
Copy link

Copilot AI Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Chinese comments should be translated to English for consistency. The comment "根据 vm_flags 判断是否具备访问权限" should be translated to something like "Check if access permission is available based on vm_flags".

Suggested change
// 根据 vm_flags 判断是否具备访问权限
// Check if access permission is available based on vm_flags

Copilot uses AI. Check for mistakes.
let has_permission = if check_write {
vm_flags.contains(VmFlags::VM_WRITE)
} else {
vm_flags.contains(VmFlags::VM_READ)
};
if !has_permission {
break;
}

let file_backed_len = file.and_then(|file| {
let file_offset_pages = file_page_offset.unwrap_or(0);
let file_offset_bytes = file_offset_pages.saturating_mul(MMArch::PAGE_SIZE);
let file_size = match file.metadata() {
Ok(md) if md.size > 0 => {
let capped = core::cmp::min(md.size as u128, usize::MAX as u128);
capped as usize
}
Ok(_) => 0,
Err(_) => return None,
};

let backed = file_size.saturating_sub(file_offset_bytes);
Some(core::cmp::min(backed, vma_size))
});

// 计算当前 VMA 内从 current 地址开始的可用长度
let current_addr = current.data();
let mut available = region_end.saturating_sub(current_addr);

if let Some(backed_len) = file_backed_len {
let offset_in_vma = current_addr.saturating_sub(region_start);
let backed_available = backed_len.saturating_sub(offset_in_vma);
// Clamp to the range actually backed by the file to avoid walking into holes.
available = min(available, backed_available);
}
if available == 0 {
break;
}

// 这里的 `step` 要区分两种情况
// - 第一种情况:`available`(当前 VMA 剩余长度)已经覆盖了 `size - checked`,说明
// 本次检查的剩余数据全部落在这个 VMA 内,`step` 直接等于 `size - checked`。
// - 第二种情况:`available` 比 `size - checked` 小,意味着我们会在这个 VMA 的末尾停下,
// 需要等下一次循环再确认后续地址是否仍有 VMA 覆盖。
// - 例如 (addr = 0x1, size = 10),若某个 VMA 只覆盖 [0x0, 0x5),则第一轮只能推进 4 个字节,
// 后续是否继续完全取决于下一个 VMA 是否与 0x5 处相接且具有相同访问权限。
// 若下一轮 VMA 覆盖 [0x5, 0xf),虽然这块 VMA 可访问空间 available == 10 ,但是我们需要检查的部分就只剩 10 - 4 = 6 bytes。
// 所以 `step` 选择为 size - checked
let step = min(available, size - checked);
checked += step;

let Some(next) = current_addr.checked_add(step) else {
break;
};
current = VirtAddr::new(next);
}

checked
}
Loading
Loading