-rw-r--r--  Makefile         5
-rw-r--r--  netio.h         14
-rw-r--r--  netio_posix.c   68
-rw-r--r--  netio_uring.c  395
4 files changed, 482 insertions, 0 deletions
diff --git a/Makefile b/Makefile
index 68c1598..57583ae 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,8 @@
+CFLAGS += -fno-split-stack
+CFLAGS += -Wall -Wextra -Werror
+CFLAGS += -g -O0
+LDFLAGS += -static
+
srv9p: srv9p.o coroutine.o net9p.o
net9p_defs.h net9p_defs.c: net9p_defs.gen net9p_defs.txt
diff --git a/netio.h b/netio.h
new file mode 100644
index 0000000..cb2e603
--- /dev/null
+++ b/netio.h
@@ -0,0 +1,14 @@
+#ifndef _NETIO_H_
+#define _NETIO_H_
+
+#include <stddef.h>
+#include <stdbool.h>
+
+void netio_init(void);
+
+int netio_accept(int sock);
+int netio_read(int conn, void *buf, size_t count);
+int netio_write(int conn, void *buf, size_t count);
+int netio_close(int conn, bool rd, bool wr);
+
+#endif /* _NETIO_H_ */
diff --git a/netio_posix.c b/netio_posix.c
new file mode 100644
index 0000000..0aa8277
--- /dev/null
+++ b/netio_posix.c
@@ -0,0 +1,68 @@
+#include <aio.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/socket.h>
+
+#include "coroutine.h"
+#include "netio.h"
+
+/* http://davmac.org/davpage/linux/async-io.html */
+void netio_init(void) {
+ sigaction(TODO)
+}
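+
+/* The sigaction() call above will probably end up looking something like the
+ * sketch below: install a SIGIO handler that uses si_value (which netio_read()
+ * below sets via aio_sigevent) to find which coroutine to wake back up.
+ * cr_unpause() is just a guess at what coroutine.h will call that operation,
+ * so for now this stays a comment:
+ *
+ *	static void handle_sigio(int sig, siginfo_t *info, void *ucontext) {
+ *		(void)sig; (void)ucontext;
+ *		cr_unpause(info->si_value.sival_int);
+ *	}
+ *
+ *	struct sigaction sa = { .sa_flags = SA_SIGINFO, .sa_sigaction = handle_sigio };
+ *	sigemptyset(&sa.sa_mask);
+ *	sigaction(SIGIO, &sa, NULL);
+ */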
+
+int netio_accept(int socket) {
+ /* AFAICT there is no good POSIX way to do this without busy-polling. */
+ for (;;) {
+ int conn = accept(socket, NULL, NULL);
+ if (conn < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ cr_yield();
+ continue;
+ }
+ return -errno;
+ }
+ return conn;
+ }
+}
+
+int netio_read(int conn, void *buf, size_t count) {
+ int r;
+ struct aiocb ctl_block = {
+ .aio_filedes = conn,
+ .aio_buf = buf,
+ .aio_nbytes = count,
+ .aio_sigevent = {
+ .sigev_notify = SIGEV_SIGNAL,
+ .sigev_signo = SIGIO,
+ .sigev_value = {
+ .sival_int = (int)cr_getcid(),
+ },
+ },
+ };
+
+ if (aio_read(&ctl_block) < 0)
+ return -errno;
+
+ while ((r = aio_error(&ctl_block)) == EINPROGRESS)
+ cr_pause_and_yield();
+ if (r != 0)
+ return (r > 0) ? -r : -errno;
+ return aio_return(&ctl_block);
+}
diff --git a/netio_uring.c b/netio_uring.c
new file mode 100644
index 0000000..bf3f9c0
--- /dev/null
+++ b/netio_uring.c
@@ -0,0 +1,395 @@
+#include <linux/io_uring.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "coroutine.h"
+#include "netio.h"
+
+/* Config *********************************************************************/
+
+#define MAX_WORKERS 8
+
+/* Syscalls *******************************************************************/
+
+/* I don't want to pull in liburing, and glibc doesn't provide stubs
+ * for these syscalls.
+ *
+ * For the signatures, search for `SYSCALL_DEFINE` in
+ * `linux.git:io_uring/io_uring.c`.
+ */
+
+static inline int io_uring_setup(uint32_t entries, struct io_uring_params *p) {
+ return syscall(__NR_io_uring_setup, entries, p);
+}
+static inline int io_uring_enter1(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
+ sigset_t *sig) {
+ return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags & ~IORING_ENTER_EXT_ARG, sig);
+}
+static inline int io_uring_enter2(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
+ struct io_uring_getevents_arg *argp, size_t argsz) {
+ return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags | IORING_ENTER_EXT_ARG, argp, argsz);
+}
+static inline int io_uring_register(int fd, int opcode, void *arg, unsigned int nr_args) {
+ return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
+}
+
+/* Userspace component of io_uring ********************************************/
+
+/* I'm not too sure what the semantics around volatile-memory vs
+ * __sync_synchronize() are, but this is the definition that
+ * linux.git/tools/include/io_uring/mini_liburing.h uses. */
+#if defined(__x86_64__)
+# define memory_barrier() asm volatile ("":::"memory")
+#else
+# define memory_barrier() __sync_synchronize() /* GCC built-in */
+#endif
+
+/**
+ * Submission Queue
+ *
+ * A proxy into the kernel's internal structures, which have been mmap()ed into
+ * userspace. Each member is a pointer (which is what makes this a proxy)
+ * because the order of the members in kernel memory is not ABI-stable, and instead
+ * we get offsets into the mmap()ed area from the io_uring_setup() syscall.
+ *
+ * Call memory_barrier() before reading a non-const value from the proxy, and
+ * after writing to it.
+ *
+ * Exactly which kernel structure this is proxying into varies by kernel version
+ * (I point this out so that you can more easily find the documentation for the
+ * internal kernel structures):
+ *
+ * fs/io_uring.c struct io_sq_ring; kernel v5.1 - v5.3
+ * fs/io_uring.c struct io_rings; kernel v5.4 - v5.19
+ * include/linux/io_uring_types.h struct io_rings; kernel v6.0 +
+ *
+ * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a single
+ * monolithic structure in v5.4, I leave them separate here, because
+ * conceptually they are better separate; they were merged purely for
+ * performance reasons.
+ *
+ * I also include a leading comment on each member with its kernel definition,
+ * which will make the size passed to mmap() make sense.
+ */
+struct my_io_sq_ring_proxy {
+ void *ring_mmap; /* in case we want to munmap() it */
+ size_t ring_mmap_size; /* in case we want to munmap() it */
+ /* pointers into mmap(offset=ORING_OFF_SQ_RING) */
+ /* kernel def */ /* who-writes-it ; description */
+ /* u32 r.head; */ uint32_t *sq_head; /* kernel ; apply sq_mask to get a valid index */
+ /* u32 r.tail; */ uint32_t *sq_tail; /* userspace ; apply sq_mask to get a valid index */
+ /* u32 ring_mask; */ uint32_t *sq_mask; /* kern-const ; TODO */
+ /* u32 ring_entries; */ uint32_t *sq_cap; /* kern-const ; number of entries, always a power-of-2 */
+ /* u32 dropped; */ uint32_t *sq_dropped; /* kernel ; number of entries dropped because invalid index */
+ /* atomic_t flags; */ int *sq_flags; /* kernel ; must use memory barrier before checking */
+ /* u32 array[]; */ uint32_t *sq_sqe_idxs; /* userspace ; sq_cap-sized array of indexes into the sq_sqes array */
+
+ /* This is actually separate from `struct io_sq_ring`/`struct io_rings`. */
+ void *entries_mmap; /* in case we want to munmap() */
+ size_t entries_mmap_size; /* in case we want to munmap() */
+ /* pointers into mmap(offset=ORING_OFF_SQES) */
+ /* who-writes-it ; description */
+ struct io_uring_sqe *sq_sqes; /* userspace ; sq_cap-sized array */
+
+ /* The structure of sq_sqe_idxs is as follows. The
+ * application writes a request into [sq_tail], then
+ * increments sq_tail. The kernel increments sq_head when it
+ * has processed a request.
+ *
+ * <- sq_cap=8
+ * [ 7: uninitialized ]
+ * [ 6: uninitialized ]
+ * [ 5: uninitialized ] <- sq_tail=5
+ * [ 4: pending T ]
+ * [ 3: pending | ]
+ * [ 2: pending H ] <- sq_head=2
+ * [ 1: finished ]
+ * [ 0: finished ]
+ *
+ * It may wrap-around like
+ *
+ * <- sq_cap=8
+ * [ 7: pending | ]
+ * [ 6: pending | ]
+ * [ 5: pending | ]
+ * [ 4: pending H ] <- sq_head=4
+ * [ 3: finished ]
+ * [ 2: finished ] <- sq_tail=10 (sq_tail%sq_cap = sq_tail&sq_mask = 2)
+ * [ 1: pending T ]
+ * [ 0: pending | ]
+ *
+ * When empty it looks like
+ * <- sq_cap=8
+ * [ 7: uninitialized ]
+ * [ 6: uninitialized ]
+ * [ 5: uninitialized ]
+ * [ 4: uninitialized ]
+ * [ 3: uninitialized ]
+ * [ 2: uninitialized ]
+ * [ 1: finished O ] <- sq_head=sq_tail=1
+ * [ 0: finished ]
+ */
+};
+
+/**
+ * Completion Queue
+ *
+ * A proxy into the kernel's internal structure, which has been mmap()ed into
+ * userspace. Each member is a pointer (which is what makes this a proxy)
+ * because the order of the members in kernel memory is not ABI-stable, and instead
+ * we get offsets into the mmap()ed area from the io_uring_setup() syscall.
+ *
+ * Call memory_barrier() before reading a non-const value from the proxy, and
+ * after writing to it.
+ *
+ * Exactly which kernel structure this is proxying into varies by kernel version
+ * (I point this out so that you can more easily find the documentation for the
+ * internal kernel structures):
+ *
+ * fs/io_uring.c struct io_cq_ring; kernel v5.1 - v5.3
+ * fs/io_uring.c struct io_rings; kernel v5.4 - v5.19
+ * include/linux/io_uring_types.h struct io_rings; kernel v6.0 +
+ *
+ * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a single
+ * monolithic structure in v5.4, I leave them separate here, because
+ * conceptually they are better separate; they were merged purely for
+ * performance reasons.
+ *
+ * I also include a leading comment on each member with its kernel definition,
+ * which will make the size passed to mmap() make sense.
+ */
+struct my_io_cq_ring_proxy {
+ size_t mmap_size; /* in case we want to munmap() it */
+ void *mmap; /* in case we want to munmap() it */
+ /* pointers into mmap(offset=ORING_OFF_CQ_RING) */
+ /* kernel def */ /* who-writes-it ; description */
+ /* u32 r.head; */ uint32_t *cq_head; /* userspace ; apply cq_mask to get a valid index */
+ /* u32 r.tail; */ uint32_t *cq_tail; /* kernel ; apply cq_mask to get a valid index */
+ /* u32 ring_mask; */ uint32_t *cq_mask; /* kern-const ; TODO */
+ /* u32 ring_entries; */ uint32_t *cq_cap; /* kern-const ; number of entries, always a power-of-2 */
+ /* u32 flags; */ uint32_t *cq_flags; /* userspace ; TODO */
+ /* u32 overflow; */ uint32_t *cq_overflow; /* kernel ; TODO */
+ /* struct io_uring_cqe cqes[]; */ struct io_uring_cqe *cq_cqes; /* mostly-kernel ; cq_cap-sized array; userspace is allowed to modify pending entries */
+};
+
+struct my_uring {
+ int fd;
+
+ struct my_io_sq_ring_proxy kern_sq;
+ struct my_io_cq_ring_proxy kern_cq;
+
+ uint32_t user_sq_head;
+ uint32_t user_sq_tail;
+ uint32_t user_cq_head;
+ uint32_t user_cq_tail;
+};
+
+static void my_uring_deinit(struct my_uring *ring) {
+ if (ring->kern_cq.mmap &&
+ ring->kern_cq.mmap != MAP_FAILED &&
+ ring->kern_cq.mmap != ring->kern_sq.ring_mmap)
+ munmap(ring->kern_cq.mmap, ring->kern_cq.mmap_size);
+ if (ring->kern_sq.entries_mmap &&
+ ring->kern_sq.entries_mmap != MAP_FAILED)
+ munmap(ring->kern_sq.entries_mmap, ring->kern_sq.entries_mmap_size);
+ if (ring->kern_sq.ring_mmap &&
+ ring->kern_sq.ring_mmap != MAP_FAILED)
+ munmap(ring->kern_sq.ring_mmap, ring->kern_sq.ring_mmap_size);
+ if (ring->fd >= 0)
+ close(ring->fd);
+ memset(ring, 0, sizeof(*ring));
+ ring->fd = -1;
+}
+
+static int my_uring_init(struct my_uring *ring, uint32_t num_entries) {
+ assert(ring);
+
+ memset(ring, 0, sizeof(*ring));
+
+ struct io_uring_params params = { 0 };
+ ring->fd = io_uring_setup(num_entries, &params);
+ if (ring->fd < 0) {
+ error(0, errno, "io_uring_setup");
+ my_uring_deinit(ring);
+ return -1;
+ }
+
+ ring->kern_sq.ring_mmap_size = params.sq_off.array + (params.sq_entries * sizeof(uint32_t));
+ ring->kern_sq.ring_mmap = mmap(NULL, ring->kern_sq.ring_mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_POPULATE,
+ ring->fd,
+ IORING_OFF_SQ_RING);
+ if (ring->kern_sq.ring_mmap == MAP_FAILED) {
+ error(0, errno, "mmap(SQ_RING)");
+ my_uring_deinit(ring);
+ return -1;
+ }
+ ring->kern_sq.sq_head = ring->kern_sq.ring_mmap + params.sq_off.head;
+ ring->kern_sq.sq_tail = ring->kern_sq.ring_mmap + params.sq_off.tail;
+ ring->kern_sq.sq_mask = ring->kern_sq.ring_mmap + params.sq_off.ring_mask;
+ ring->kern_sq.sq_cap = ring->kern_sq.ring_mmap + params.sq_off.ring_entries;
+ ring->kern_sq.sq_flags = ring->kern_sq.ring_mmap + params.sq_off.flags;
+ ring->kern_sq.sq_dropped = ring->kern_sq.ring_mmap + params.sq_off.dropped;
+ ring->kern_sq.sq_sqe_idxs = ring->kern_sq.ring_mmap + params.sq_off.array;
+
+ ring->kern_sq.entries_mmap_size = params.sq_entries * sizeof(struct io_uring_sqe);
+ ring->kern_sq.entries_mmap = mmap(NULL, ring->kern_sq.entries_mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_POPULATE,
+ ring->fd,
+ IORING_OFF_SQES);
+ if (ring->kern_sq.entries_mmap == MAP_FAILED) {
+ error(0, errno, "mmap(SQES)");
+ my_uring_deinit(ring);
+ return -1;
+ }
+ ring->kern_sq.sq_sqes = ring->kern_sq.entries_mmap;
+
+ if (params.features & IORING_FEAT_SINGLE_MMAP) {
+ /* Purely optional optimization that is possible since kernel v5.4. */
+ ring->kern_cq.mmap_size = ring->kern_sq.ring_mmap_size;
+ ring->kern_cq.mmap = ring->kern_sq.ring_mmap;
+ } else {
+ ring->kern_cq.mmap_size = params.cq_off.cqes + params.cq_entries * sizeof(struct io_uring_cqe);
+ ring->kern_cq.mmap = mmap(NULL, ring->kern_cq.mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_POPULATE,
+ ring->fd,
+ IORING_OFF_CQ_RING);
+ if (ring->kern_cq.mmap == MAP_FAILED) {
+ error(0, errno, "mmap(CQ_RING)");
+ my_uring_deinit(ring);
+ return -1;
+ }
+ }
+ ring->kern_cq.cq_head = ring->kern_cq.mmap + params.cq_off.head;
+ ring->kern_cq.cq_tail = ring->kern_cq.mmap + params.cq_off.tail;
+ ring->kern_cq.cq_mask = ring->kern_cq.mmap + params.cq_off.ring_mask;
+ ring->kern_cq.cq_cap = ring->kern_cq.mmap + params.cq_off.ring_entries;
+ ring->kern_cq.cq_overflow = ring->kern_cq.mmap + params.cq_off.overflow;
+ ring->kern_cq.cq_cqes = ring->kern_cq.mmap + params.cq_off.cqes;
+ ring->kern_cq.cq_flags = ring->kern_cq.mmap + params.cq_off.flags;
+
+ return 0;
+}
+
+/**
+ * Obtain a Submission Queue Entry that is available to be written into.
+ * Returns NULL if the queue is full.
+ *
+ * Once you have filled the sqe with your request, call my_uring_submit() to
+ * submit it to the kernel, in a batch with any other sqe's you've gotten since
+ * the last my_uring_submit() call.
+ */
+static inline struct io_uring_sqe *my_uring_get_sqe(struct my_uring *ring) {
+ /* Because sq_cap is always a power-of-2, we never have to apply `%
+  * sq_cap`: `& sq_mask` does that for us, and because the counters wrap
+  * mod 2^32 (also a power-of-2), overflow will never hurt us. */
+ if (ring->user_sq_tail + 1 - ring->user_sq_head > *ring->kern_sq.sq_cap)
+ return NULL;
+ return &ring->kern_sq.sq_sqes[ring->user_sq_tail++ & *ring->kern_sq.sq_mask];
+}
+
+/**
+ * Submit a batch of sqes (obtained from my_uring_get_sqe()) to the kernel.
+ *
+ * Returns the number of sqes successfully submitted, or -errno.
+ */
+static inline int my_uring_submit(struct my_uring *ring) {
+ uint32_t read_khead, ktail_to_write, mask;
+ int ret;
+
+ if (ring->user_sq_head == ring->user_sq_tail)
+ /* nothing to do */
+ return 0;
+
+ mask = *ring->kern_sq.sq_mask;
+
+ memory_barrier(); /* to read sq_head */
+ read_khead = *ring->kern_sq.sq_head;
+
+ ktail_to_write = *ring->kern_sq.sq_tail;
+ while (ring->user_sq_head != ring->user_sq_tail && ktail_to_write - read_khead < *ring->kern_sq.sq_cap)
+ ring->kern_sq.sq_sqe_idxs[ktail_to_write++ & mask] = ring->user_sq_head++ & mask;
+ memory_barrier(); /* wrote sq_sqe_idxs; must do this *before* the sq_tail write below */
+
+ *ring->kern_sq.sq_tail = ktail_to_write;
+ memory_barrier(); /* wrote sq_tail */
+
+ ret = io_uring_enter1(ring->fd, ktail_to_write - read_khead, 0, 0, NULL);
+ return ret < 0 ? -errno : ret;
+}
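+
+/**
+ * Reap one Completion Queue Entry, if one is available.
+ *
+ * This is a minimal sketch of how the cq proxy gets consumed; nothing below
+ * uses it yet (the netio_* functions still need to grow completion handling),
+ * and the name my_uring_reap_cqe() is arbitrary.
+ *
+ * Returns 1 if *out was filled in, or 0 if no completion is pending yet.
+ */
+static inline int my_uring_reap_cqe(struct my_uring *ring, struct io_uring_cqe *out) {
+ uint32_t head;
+
+ memory_barrier(); /* to read cq_head and cq_tail */
+ head = *ring->kern_cq.cq_head;
+ if (head == *ring->kern_cq.cq_tail)
+ return 0; /* nothing completed yet */
+ *out = ring->kern_cq.cq_cqes[head & *ring->kern_cq.cq_mask];
+ *ring->kern_cq.cq_head = head + 1;
+ memory_barrier(); /* wrote cq_head, so the kernel may reuse that slot */
+ return 1;
+}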
+
+/* netio implementation on top of that ****************************************/
+
+static struct my_uring netio_uring = { 0 };
+
+void netio_init(void) {
+ if (!netio_uring.fd)
+ if (my_uring_init(&netio_uring, MAX_WORKERS) < 0)
+ exit(1);
+}
+
+/** Like accept4(2). */
+static inline int netio_accept4(int sock, struct sockaddr *addr, socklen_t *addrlen, int flags) {
+ struct io_uring_sqe *sqe = my_uring_get_sqe(&netio_uring);
+ if (!sqe)
+ return -ENOSR;
+
+ sqe->opcode = IORING_OP_ACCEPT;
+ sqe->fd = sock;
+ sqe->addr = (uintptr_t)addr;
+ sqe->off = (uintptr_t)addrlen;
+ sqe->accept_flags = flags;
+ sqe->len = 0; /* number of iovecs */
+
+ /* Submit that accept4() call. */
+ my_uring_submit(&netio_uring);
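+ /* TODO: set sqe->user_data, wait for the matching cqe, and return cqe->res. */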
+}
+
+int netio_accept(int sock) {
+ return netio_accept4(sock, NULL, NULL, 0);
+}
+
+int netio_read(int conn, void *buf, size_t count) {
+}
+
+int netio_write(int conn, void *buf, size_t count) {
+}
+
+int netio_close(int conn, bool rd, bool wr) {
+}
+