Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
francescolavra committed Sep 27, 2024
1 parent 243c5a2 commit fbb4260
Show file tree
Hide file tree
Showing 13 changed files with 166 additions and 64 deletions.
43 changes: 25 additions & 18 deletions src/net/netsyscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ static sysreturn netsock_listen(struct sock *sock, int backlog);
static sysreturn netsock_connect(struct sock *sock, struct sockaddr *addr,
socklen_t addrlen);
static sysreturn netsock_accept4(struct sock *sock, struct sockaddr *addr,
socklen_t *addrlen, int flags);
socklen_t *addrlen, int flags, context ctx, boolean in_bh,
io_completion completion);
static sysreturn netsock_getsockname(struct sock *sock, struct sockaddr *addr, socklen_t *addrlen);
static sysreturn netsock_getsockopt(struct sock *sock, int level,
int optname, void *optval, socklen_t *optlen);
Expand Down Expand Up @@ -2120,11 +2121,10 @@ sysreturn listen(int sockfd, int backlog)
}

closure_function(5, 1, sysreturn, accept_bh,
netsock, s, thread, t, struct sockaddr *, addr, socklen_t *, addrlen, int, flags,
netsock, s, struct sockaddr *, addr, socklen_t *, addrlen, int, flags, io_completion, completion,
u64 bqflags)
{
netsock s = bound(s);
thread t = bound(t);
netsock child = INVALID_ADDRESS;
sysreturn rv = 0;

Expand Down Expand Up @@ -2206,15 +2206,15 @@ closure_function(5, 1, sysreturn, accept_bh,
out:
if ((rv < 0) && (child != INVALID_ADDRESS))
apply(child->sock.f.close, 0, io_completion_ignore);
syscall_return(t, rv);
apply(bound(completion), rv);

socket_release(&s->sock);
closure_finish();
return rv;
}

static sysreturn netsock_accept4(struct sock *sock, struct sockaddr *addr,
socklen_t *addrlen, int flags)
socklen_t *addrlen, int flags, context ctx, boolean in_bh,
io_completion completion)
{
netsock s = (netsock) sock;
sysreturn rv;
Expand All @@ -2229,31 +2229,38 @@ static sysreturn netsock_accept4(struct sock *sock, struct sockaddr *addr,
goto out;
}

blockq_action ba = contextual_closure(accept_bh, s, current, addr, addrlen, flags);
return blockq_check(sock->rxbq, ba, false);
blockq_action ba = closure_from_context(ctx, accept_bh, s, addr, addrlen, flags, completion);
return blockq_check(sock->rxbq, ba, in_bh);
out:
socket_release(sock);
return rv;
}

sysreturn accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen,
int flags)
sysreturn socket_accept4(fdesc f, struct sockaddr *addr, socklen_t *addrlen, int flags, context ctx,
boolean in_bh, io_completion completion)
{
net_debug("sock %d, addr %p, addrlen %p, flags %x\n", sockfd, addr, addrlen,
flags);
if (f->type != FDESC_TYPE_SOCKET)
return io_complete(completion, -ENOTSOCK);

/* Use a dummy value for the address length, instead of reading it from addrlen (the value
* pointed to by addrlen might change before this syscall completes). */
if (addr && (!validate_user_memory(addrlen, sizeof(socklen_t), true) ||
!validate_user_memory(addr, PAGESIZE, true)))
return -EFAULT;

struct sock *sock = resolve_socket(current->p, sockfd);
if (!sock->accept4) {
socket_release(sock);
return -EOPNOTSUPP;
}
return sock->accept4(sock, addr, addrlen, flags);
struct sock *sock = struct_from_field(f, struct sock *, f);
if (!sock->accept4)
return io_complete(completion, -EOPNOTSUPP);
return sock->accept4(sock, addr, addrlen, flags, ctx, in_bh, completion);
}

/* accept4 syscall entry point.
 * Logs the arguments, looks up sockfd with resolve_fd() on current->p, fetches the context of the
 * current CPU and hands the request over to socket_accept4(). The call is made with in_bh set to
 * false and with the descriptor's own io_complete closure as the completion. */
sysreturn accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen,
int flags)
{
net_debug("sock %d, addr %p, addrlen %p, flags %x\n", sockfd, addr, addrlen, flags);
fdesc f = resolve_fd(current->p, sockfd);
context ctx = get_current_context(current_cpu());
/* the descriptor type is not checked here; socket_accept4() rejects non-socket descriptors */
return socket_accept4(f, addr, addrlen, flags, ctx, false, (io_completion)&f->io_complete);
}

sysreturn accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
Expand Down
3 changes: 1 addition & 2 deletions src/unix/blockq.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,7 @@ sysreturn blockq_check_timeout(blockq bq, blockq_action a, boolean in_bh,
blockq_lock(bq);
thread_lock(t);
t->bq_action = a;
if (!in_bh)
t->blocked_on = bq;
t->blocked_on = bq;
if (timeout > 0) {
t->bq_timer_pending = true;
t->bq_clkid = clkid;
Expand Down
7 changes: 4 additions & 3 deletions src/unix/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,18 @@ static void build_exec_stack(process p, thread t, Elf64_Ehdr * e, void *start,
exec_debug("build_exec_stack start %p, tid %d, va 0x%lx\n", start, t->tid, va);

/* allocate process stack at top of 32-bit address space */
u64 stack_start = 0x100000000 - PROCESS_STACK_SIZE;
u64 stack_size = p->rlimit_stack;
u64 stack_start = 0x100000000 - stack_size;
if (aslr)
stack_start = (stack_start - PROCESS_STACK_ASLR_RANGE) +
get_aslr_offset(PROCESS_STACK_ASLR_RANGE);

p->stack_map = allocate_vmap(p, irangel(stack_start, PROCESS_STACK_SIZE),
p->stack_map = allocate_vmap(p, irangel(stack_start, stack_size),
ivmap(VMAP_FLAG_STACK | VMAP_FLAG_READABLE | VMAP_FLAG_WRITABLE,
0, 0, 0, 0));
assert(p->stack_map != INVALID_ADDRESS);

u64 *s = (pointer_from_u64(stack_start) + PROCESS_STACK_SIZE);
u64 *s = (pointer_from_u64(stack_start) + stack_size);
void *as = stack_prealloc((void*)s, PROCESS_STACK_PREALLOC_SIZE);

/* 16 bytes of random data for userspace (e.g. SSP guard init) */
Expand Down
34 changes: 25 additions & 9 deletions src/unix/filesystem.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,40 @@ sysreturn sysreturn_from_fs_status_value(status s)
return rv;
}

u16 file_mode_from_type(int type)
u16 stat_mode(process p, int type, tuple meta)
{
u16 mode;
switch (type) {
case FDESC_TYPE_REGULAR:
return S_IFREG | 0644;
mode = S_IFREG;
break;
case FDESC_TYPE_DIRECTORY:
return S_IFDIR | 0777;
mode = S_IFDIR;
break;
case FDESC_TYPE_STDIO:
case FDESC_TYPE_SPECIAL: /* assuming only character devices */
return S_IFCHR;
mode = S_IFCHR;
break;
case FDESC_TYPE_SOCKET:
return S_IFSOCK;
mode = S_IFSOCK;
break;
case FDESC_TYPE_PIPE:
return S_IFIFO;
mode = S_IFIFO;
break;
case FDESC_TYPE_SYMLINK:
return S_IFLNK;
mode = S_IFLNK;
break;
default:
return 0;
}
return 0;
u32 perms = file_meta_perms(p, meta);
if (perms & ACCESS_PERM_READ)
mode |= 0444;
if (perms & ACCESS_PERM_WRITE)
mode |= 0222;
if (perms & ACCESS_PERM_EXEC)
mode |= 0111;
return mode;
}

void file_readahead(file f, u64 offset, u64 len)
Expand Down Expand Up @@ -355,7 +371,7 @@ static sysreturn statx_internal(filesystem fs, int type, tuple n, fsfile f, stru
/* The whole structure pointed to by statxbuf is zeroed and then filled in right after this
 * check, so the full size of *statxbuf must be validated as writable (validating only
 * sizeof(struct rlimit) bytes would leave most of the buffer unchecked). */
if (!validate_user_memory(statxbuf, sizeof(*statxbuf), true) || context_set_err(ctx))
return -EFAULT;
zero(statxbuf, sizeof(*statxbuf));
statxbuf->stx_mode = file_mode_from_type(type);
statxbuf->stx_mode = stat_mode(current->p, type, n);
statxbuf->stx_mask = STATX_TYPE | STATX_MODE;
switch (type) {
case FDESC_TYPE_REGULAR:
Expand Down
4 changes: 3 additions & 1 deletion src/unix/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

sysreturn sysreturn_from_fs_status_value(status s);

u16 file_mode_from_type(int type);
u16 stat_mode(process p, int type, tuple meta);

/* Perform read-ahead following a userspace read request.
* offset and len arguments refer to the byte range being read from userspace,
Expand All @@ -37,6 +37,8 @@ int filesystem_chdir(process p, sstring path);

void filesystem_update_relatime(filesystem fs, tuple md);

sysreturn openat(int dirfd, const char *name, int flags, int mode);

sysreturn symlink(const char *target, const char *linkpath);
sysreturn symlinkat(const char *target, int dirfd, const char *linkpath);

Expand Down
72 changes: 67 additions & 5 deletions src/unix/io_uring.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include <unix_internal.h>
#include <filesystem.h>
#include <socket.h>

#define IORING_SETUP_CQSIZE (1 << 3)

Expand Down Expand Up @@ -36,12 +38,18 @@ struct io_uring_sqe {
u8 flags;
u16 ioprio;
s32 fd;
u64 off;
union {
u64 off;
u64 addr2;
};
u64 addr;
u32 len;
union {
u32 open_flags;
u32 accept_flags;
u32 rw_flags;
u32 fsync_flags;
u32 statx_flags;
u16 poll_events;
u32 sync_range_flags;
u32 msg_flags;
Expand All @@ -50,8 +58,13 @@ struct io_uring_sqe {
u64 user_data;
union{
u16 buf_index;
u64 __pad2[3];
u16 buf_group;
};
u16 personality;
union {
u32 file_index;
};
u64 __pad2[2];
};

struct io_uring_cqe {
Expand Down Expand Up @@ -85,6 +98,10 @@ enum iour_enter_opcode {
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
IORING_OP_FADVISE,
IORING_OP_MADVISE,
IORING_OP_SEND,
IORING_OP_RECV,
IORING_OP_LAST,
};

Expand Down Expand Up @@ -880,8 +897,11 @@ static boolean iour_submit(io_uring iour, struct io_uring_sqe *sqe)
case IORING_OP_READ_FIXED:
case IORING_OP_WRITE_FIXED:
case IORING_OP_POLL_ADD:
case IORING_OP_ACCEPT:
case IORING_OP_READ:
case IORING_OP_WRITE:
case IORING_OP_SEND:
case IORING_OP_RECV:
if (sqe->flags & IOSQE_FIXED_FILE) {
iour_lock(iour);
int fd = sqe->fd;
Expand Down Expand Up @@ -984,6 +1004,32 @@ static boolean iour_submit(io_uring iour, struct io_uring_sqe *sqe)
}
iour_timeout_remove(iour, sqe->addr, sqe->user_data);
break;
case IORING_OP_ACCEPT: {
/* Accept on a socket: the outcome is reported through an iour_rw_complete closure that is
 * passed to socket_accept4() as the completion; only setup failures go through the
 * 'complete' label. */
io_completion completion;
process_context pc = get_process_context();
if (pc != INVALID_ADDRESS) {
completion = closure(iour->h, iour_rw_complete, iour, f, sqe->user_data,
&pc->uc.kc.context);
/* closure allocation failed: release the process context reference
 * (presumably acquired by get_process_context() - TODO confirm) */
if (completion == INVALID_ADDRESS)
context_release_refcount(&pc->uc.kc.context);
} else {
completion = INVALID_ADDRESS;
}
/* either the process context or the completion closure could not be obtained */
if (completion == INVALID_ADDRESS) {
res = -ENOMEM;
goto complete;
}
fetch_and_add(&iour->noncancelable_ops, 1);
/* sqe->addr is used as the sockaddr buffer and sqe->addr2 as the address length pointer;
 * in_bh is true and the return value of socket_accept4() is not used here */
socket_accept4(f, pointer_from_u64(sqe->addr), pointer_from_u64(sqe->addr2),
sqe->accept_flags, &pc->uc.kc.context, true, completion);
break;
}
case IORING_OP_OPENAT:
/* Calls openat() directly, with sqe->fd as the directory fd, sqe->addr as the path name,
 * sqe->open_flags as the flags and sqe->len as the mode. A non-zero file_index is not
 * supported and yields -EOPNOTSUPP. */
if (!sqe->file_index)
res = openat(sqe->fd, pointer_from_u64(sqe->addr), sqe->open_flags, sqe->len);
else
res = -EOPNOTSUPP;
goto complete;
case IORING_OP_CLOSE:
if (sqe->ioprio || sqe->addr || sqe->len || sqe->off || sqe->buf_index
|| sqe->rw_flags) {
Expand Down Expand Up @@ -1027,25 +1073,39 @@ static boolean iour_submit(io_uring iour, struct io_uring_sqe *sqe)
res = iour_register_files_update(iour, (int *)sqe->addr, sqe->len,
sqe->off);
goto complete;
case IORING_OP_STATX:
/* Forwards the request to statx() and jumps to the 'complete' label with its result.
 * NOTE(review): sqe->len and sqe->off are passed as the 4th and 5th statx() arguments
 * (presumably the mask and the output buffer) - confirm against the statx() prototype. */
res = statx(sqe->fd, pointer_from_u64(sqe->addr), sqe->statx_flags, sqe->len,
pointer_from_u64(sqe->off));
goto complete;
case IORING_OP_READ:
case IORING_OP_WRITE:
case IORING_OP_SEND:
case IORING_OP_RECV:
if (sqe->buf_index) {
res = -EINVAL;
goto complete;
} else {
void *buf = pointer_from_u64(sqe->addr);
u32 len = sqe->len;
boolean write = sqe->opcode == IORING_OP_WRITE;
u8 opcode = sqe->opcode;
/* WRITE and SEND both read from the user buffer. The second test must be a comparison (==):
 * an assignment would overwrite opcode with IORING_OP_SEND and make write unconditionally
 * true, so every request would be validated and dispatched as a send. */
boolean write = (opcode == IORING_OP_WRITE) || (opcode == IORING_OP_SEND);

if (!validate_user_memory(buf, len, !write)) {
res = -EFAULT;
goto complete;
}
iour_rw(iour, f, write, buf, len, sqe->off, sqe->user_data);
switch (opcode) {
case IORING_OP_READ:
case IORING_OP_WRITE:
iour_rw(iour, f, write, buf, len, sqe->off, sqe->user_data);
break;
default:
iour_txrx(iour, f, write, buf, len, sqe->user_data);
}
}
break;
default:
iour_complete(iour, sqe->user_data, -EINVAL, false, false);
iour_complete(iour, sqe->user_data, -EINVAL, false, false);rprintf("sqe->opcode %d\n",sqe->opcode);
return false;
}
return true;
Expand Down Expand Up @@ -1337,6 +1397,8 @@ static sysreturn iour_register_probe(struct io_uring_probe *probe,
probe->ops[IORING_OP_TIMEOUT_REMOVE].flags =
probe->ops[IORING_OP_CLOSE].flags =
probe->ops[IORING_OP_FILES_UPDATE].flags =
probe->ops[IORING_OP_OPENAT].flags =
probe->ops[IORING_OP_ACCEPT].flags =
probe->ops[IORING_OP_READ].flags =
probe->ops[IORING_OP_WRITE].flags = IO_URING_OP_SUPPORTED;
context_clear_err(ctx);
Expand Down
2 changes: 1 addition & 1 deletion src/unix/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ vmap vmap_from_vaddr(process p, u64 vaddr)
if ((vm != INVALID_ADDRESS) && (vm->node.r.start > vaddr)) {
/* vm does not cover this address; if it is a stack mapping, check whether it can be
* expanded downwards. */
if (!(vm->flags & VMAP_FLAG_STACK) || (vm->node.r.end > vaddr + PROCESS_STACK_SIZE))
if (!(vm->flags & VMAP_FLAG_STACK) || (vm->node.r.end > vaddr + p->rlimit_stack))
return INVALID_ADDRESS;
vmap prev = (vmap)rangemap_prev_node(rm, &vm->node);
vaddr &= ~PAGEMASK;
Expand Down
Loading

0 comments on commit fbb4260

Please sign in to comment.