-rw-r--r-- | Makefile      |   5 |
-rw-r--r-- | netio.h       |  14 |
-rw-r--r-- | netio_posix.c |  49 |
-rw-r--r-- | netio_uring.c | 365 |
4 files changed, 433 insertions, 0 deletions
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,8 @@
+CFLAGS += -fno-split-stack
+CFLAGS += -Wall -Wextra -Werror
+CFLAGS += -g -O0
+LDFLAGS += -static
+
 srv9p: srv9p.o coroutine.o net9p.o
 
 net9p_defs.h net9p_defs.c: net9p_defs.gen net9p_defs.txt
diff --git a/netio.h b/netio.h
new file mode 100644
--- /dev/null
+++ b/netio.h
@@ -0,0 +1,14 @@
+#ifndef _NETIO_H_
+#define _NETIO_H_
+
+#include <stddef.h> /* size_t */
+#include <stdbool.h>
+
+void netio_init(void);
+
+int netio_accept(int sock);
+int netio_read(int conn, void *buf, size_t count);
+int netio_write(int conn, void *buf, size_t count);
+int netio_close(int conn, bool rd, bool wr);
+
+#endif /* _NETIO_H_ */
diff --git a/netio_posix.c b/netio_posix.c
new file mode 100644
index 0000000..0aa8277
--- /dev/null
+++ b/netio_posix.c
@@ -0,0 +1,49 @@
+#include <aio.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/socket.h>
+
+#include "coroutine.h"
+#include "netio.h"
+
+/* http://davmac.org/davpage/linux/async-io.html */
+void netio_init(void) {
+	sigaction(TODO)
+}
+
+int netio_accept(int socket) {
+	/* AFAICT there is no good POSIX way to do this without busy-polling. */
+	for (;;) {
+		int conn = accept(socket, NULL, NULL);
+		if (conn < 0) {
+			if (errno == EAGAIN || errno == EWOULDBLOCK) {
+				cr_yield();
+				continue;
+			}
+			return -errno;
+		}
+		return conn;
+	}
+}
+
+int netio_read(int conn, void *buf, size_t count) {
+	int r;
+	struct aiocb ctl_block = {
+		.aio_fildes = conn,
+		.aio_buf = buf,
+		.aio_nbytes = count,
+		.aio_sigevent = {
+			.sigev_notify = SIGEV_SIGNAL,
+			.sigev_signo = SIGIO,
+			.sigev_value = {
+				.sival_int = (int)cr_getcid(),
+			},
+		},
+	};
+
+	if (aio_read(&ctl_block) < 0)
+		return -errno;
+
+	while ((r = aio_error(&ctl_block)) == EINPROGRESS)
+		cr_pause_and_yield();
+
+	/* aio_error() returns 0 on success, or the request's errno on failure. */
+	if (r != 0)
+		return -r;
+	return aio_return(&ctl_block);
+}
diff --git a/netio_uring.c b/netio_uring.c
new file mode 100644
index 0000000..bf3f9c0
--- /dev/null
+++ b/netio_uring.c
@@ -0,0 +1,365 @@
+#include <linux/io_uring.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <signal.h> /* sigset_t */
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "coroutine.h"
+#include "netio.h"
+
+/* Config *********************************************************************/
+
+#define MAX_WORKERS 8
+
+/* Syscalls *******************************************************************/
+
+/* I don't want to pull in liburing, and glibc doesn't provide stubs
+ * for these syscalls.
+ *
+ * For the signatures, search for `SYSCALL_DEFINE` in
+ * `linux.git:io_uring/io_uring.c`.
+ */
+
+static inline int io_uring_setup(uint32_t entries, struct io_uring_params *p) {
+	return syscall(__NR_io_uring_setup, entries, p);
+}
+static inline int io_uring_enter1(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
+                                  sigset_t *sig) {
+	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags & ~IORING_ENTER_EXT_ARG, sig);
+}
+static inline int io_uring_enter2(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
+                                  struct io_uring_getevents_arg *argp, size_t argsz) {
+	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags | IORING_ENTER_EXT_ARG, argp, argsz);
+}
+static inline int io_uring_register(int fd, int opcode, void *arg, unsigned int nr_args) {
+	return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
+}
+
+/* Userspace component of io_uring ********************************************/
+
+/* I'm not too sure what the semantics around volatile-memory vs
+ * __sync_synchronize() are, but this is the definition that
+ * linux.git/tools/include/io_uring/mini_liburing.h uses.
+ */
+#if defined(__x86_64__)
+# define memory_barrier() asm volatile ("":::"memory")
+#else
+# define memory_barrier() __sync_synchronize() /* GCC built-in */
+#endif
+
+/**
+ * Submission Queue
+ *
+ * A proxy into the kernel's internal structures, which have been mmap()ed into
+ * userspace.  Each member is a pointer (which is what makes this a proxy)
+ * because the order of the members in kernel memory is not ABI-stable; instead
+ * we get offsets into the mmap()ed area from the io_uring_setup() syscall.
+ *
+ * Call memory_barrier() before reading a non-const value from the proxy, and
+ * after writing to it.
+ *
+ * Exactly which kernel structure this is proxying into varies by kernel version
+ * (I point this out so that you can more easily find the documentation for the
+ * internal kernel structures):
+ *
+ *   fs/io_uring.c                   struct io_sq_ring;  kernel v5.1 - v5.3
+ *   fs/io_uring.c                   struct io_rings;    kernel v5.4 - v5.19
+ *   include/linux/io_uring_types.h  struct io_rings;    kernel v6.0+
+ *
+ * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a single
+ * monolithic structure in v5.4, I leave them separate here because
+ * conceptually they are better separate; they were merged purely for
+ * performance reasons.
+ *
+ * I also include a leading comment on each member with its kernel definition,
+ * which will make the size passed to mmap() make sense.
+ */
+struct my_io_sq_ring_proxy {
+	void   *ring_mmap;      /* in case we want to munmap() it */
+	size_t  ring_mmap_size; /* in case we want to munmap() it */
+	/* pointers into mmap(offset=IORING_OFF_SQ_RING) */
+	/* kernel def        */                          /* who-writes-it ; description */
+	/* u32 r.head;       */ uint32_t *sq_head;       /* kernel     ; apply sq_mask to get a valid index */
+	/* u32 r.tail;       */ uint32_t *sq_tail;       /* userspace  ; apply sq_mask to get a valid index */
+	/* u32 ring_mask;    */ uint32_t *sq_mask;       /* kern-const ; TODO */
+	/* u32 ring_entries; */ uint32_t *sq_cap;        /* kern-const ; number of entries, always a power-of-2 */
+	/* u32 dropped;      */ uint32_t *sq_dropped;    /* kernel     ; number of entries dropped because invalid index */
+	/* atomic_t flags;   */ int      *sq_flags;      /* kernel     ; must use memory barrier before checking */
+	/* u32 array[];      */ uint32_t *sq_sqe_idxs;   /* userspace  ; sq_cap-sized array of indexes into the sq_sqes array */
+
+	/* This is actually separate from `struct io_sq_ring`/`struct io_rings`. */
+	void   *entries_mmap;      /* in case we want to munmap() */
+	size_t  entries_mmap_size; /* in case we want to munmap() */
+	/* pointers into mmap(offset=IORING_OFF_SQES) */
+	                              /* who-writes-it ; description */
+	struct io_uring_sqe *sq_sqes; /* userspace  ; sq_cap-sized array */
+
+	/* The structure of sq_sqe_idxs is as follows.  The
+	 * application writes a request into [sq_tail], then
+	 * increments sq_tail.  The kernel increments sq_head when it
+	 * has processed a request.
+	 *
+	 *              <- sq_cap=8
+	 *   [ 7: uninitialized   ]
+	 *   [ 6: uninitialized   ]
+	 *   [ 5: uninitialized   ] <- sq_tail=5
+	 *   [ 4: pending       T ]
+	 *   [ 3: pending       | ]
+	 *   [ 2: pending       H ] <- sq_head=2
+	 *   [ 1: finished        ]
+	 *   [ 0: finished        ]
+	 *
+	 * It may wrap-around like
+	 *
+	 *              <- sq_cap=8
+	 *   [ 7: pending       | ]
+	 *   [ 6: pending       | ]
+	 *   [ 5: pending       | ]
+	 *   [ 4: pending       H ] <- sq_head=4
+	 *   [ 3: finished        ]
+	 *   [ 2: finished        ] <- sq_tail=10 (sq_tail%sq_cap = sq_tail&sq_mask = 2)
+	 *   [ 1: pending       T ]
+	 *   [ 0: pending       | ]
+	 *
+	 * When empty it looks like
+	 *              <- sq_cap=8
+	 *   [ 7: uninitialized   ]
+	 *   [ 6: uninitialized   ]
+	 *   [ 5: uninitialized   ]
+	 *   [ 4: uninitialized   ]
+	 *   [ 3: uninitialized   ]
+	 *   [ 2: uninitialized   ]
+	 *   [ 1: finished      O ] <- sq_head=sq_tail=1
+	 *   [ 0: finished        ]
+	 */
+};
+
+/**
+ * Completion Queue
+ *
+ * A proxy into the kernel's internal structure, which has been mmap()ed into
+ * userspace.  Each member is a pointer (which is what makes this a proxy)
+ * because the order of the members in kernel memory is not ABI-stable; instead
+ * we get offsets into the mmap()ed area from the io_uring_setup() syscall.
+ *
+ * Call memory_barrier() before reading a non-const value from the proxy, and
+ * after writing to it.
+ *
+ * Exactly which kernel structure this is proxying into varies by kernel version
+ * (I point this out so that you can more easily find the documentation for the
+ * internal kernel structures):
+ *
+ *   fs/io_uring.c                   struct io_cq_ring;  kernel v5.1 - v5.3
+ *   fs/io_uring.c                   struct io_rings;    kernel v5.4 - v5.19
+ *   include/linux/io_uring_types.h  struct io_rings;    kernel v6.0+
+ *
+ * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a single
+ * monolithic structure in v5.4, I leave them separate here because
+ * conceptually they are better separate; they were merged purely for
+ * performance reasons.
+ *
+ * I also include a leading comment on each member with its kernel definition,
+ * which will make the size passed to mmap() make sense.
+ */ +struct my_io_cq_ring_proxy { + size_t mmap_size; /* in case we want to munmap() it */ + void *mmap; /* in case we want to munmap() it */ + /* pointers into mmap(offset=ORING_OFF_CQ_RING) */ + /* kernel def */ /* who-writes-it ; description */ + /* u32 r.head; */ uint32_t *cq_head; /* userspace ; apply cq_mask to get a valid index */ + /* u32 r.tail; */ uint32_t *cq_tail; /* kernel ; apply cq_mask to get a valid index */ + /* u32 ring_mask; */ uint32_t *cq_mask; /* kern-const ; TODO */ + /* u32 ring_entries; */ uint32_t *cq_cap; /* kern-const ; number of entries, always a power-of-2 */ + /* u32 flags; */ uint32_t *cq_flags; /* userspace ; TODO */ + /* u32 overflow; */ uint32_t *cq_overflow; /* kernel ; TODO */ + /* struct io_uring_cqe cqes[]; */ struct io_uring_cqe *cq_cqes; /* mostly-kernel ; cq_cap-sized array; userspace is allowed to modify pending entries */ +} + +struct my_uring { + int fd; + + struct my_io_sq_ring_proxy kern_sq; + struct my_io_cq_ring_proxy kern_cq; + + uint32_t user_sq_head; + uint32_t user_sq_tail; + uint32_t user_cq_head; + uint32_t user_cq_tail; +}; + +static int my_uring_deinit(struct my_uring *ring) { + if (ring->kern_cq.mmap && + ring->kern_cq.mmap != MAP_FAILED && + ring->kern_cq.mmap != ring->kern_sq.ring_mmap) + munmap(ring->ring->kern_cq.mmap, ring->kern_cq.mmap_size); + if (ring->kern_sq.entries_mmap && + ring->kern_sq.entries_mmap != MAP_FAILED) + munmap(ring->ring->kern_sq.entries_mmap, ring->kern_sq.entries_mmap_size); + if (ring->kern_sq.ring_mmap && + ring->kern_sq.ring_mmap != MAP_FAILED) + munmap(ring->ring->kern_sq.ring_mmap, ring->kern_sq.ring_mmap_size); + if (ring->fd >= 0) + close(ring->fd); + memset(ring, 0, sizeof(*ring)); + ring->fd = -1; +} + +static int my_uring_init(struct my_uring *ring, uint32_t num_entries) { + assert(ring); + + memset(ring, 0, sizeof(*ring)); + + static struct io_uring_params params = { 0 }; + ring->fd = io_uring_setup(num_entries, ¶ms); + if (ring->fd < 0) { + error(0, ring->fd, "io_uring_setup"); + my_uring_deinit(ring); + return -1; + } + + ring->kern_sq.ring_mmap_size = params.sq_off.array + (params.sq_entries * sizeof(uint32_t)); + ring->kern_sq.ring_mmap = mmap(NULL, ring->kern_sq.ring_mmap_size, + PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_POPULATE, + ring->fd, + IORING_OFF_SQ_RING); + if (ring->kern_sq.ring_mmap == MAP_FAILED) { + error(0, errno, "mmap(SQ_RING)"); + my_uring_deinit(ring); + return -1; + } + ring->kern_sq.sq_head = ring->kern_sq.mmap + params.sq_off.head; + ring->kern_sq.sq_tail = ring->kern_sq.mmap + params.sq_off.tail; + ring->kern_sq.sq_mask = ring->kern_sq.mmap + params.sq_off.ring_mask; + ring->kern_sq.sq_cap = ring->kern_sq.mmap + params.sq_off.ring_entries; + ring->kern_sq.sq_flags = ring->kern_sq.mmap + params.sq_off.flags; + ring->kern_sq.sq_dropped = ring->kern_sq.mmap + params.sq_off.dropped; + ring->kern_sq.sq_sqe_idxs = ring->kern_sq.mmap + params.sq_off.array; + + ring->kern_sq.entries_mmap_size = params.sq_entries * sizeof(struct io_uring_sqe); + ring->kern_sq.entries_mmap = mmap(NULL, ring->kern_sq.entries_mmap_size, + PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_POPULATE, + ring->fd, + IORING_OFF_SQES); + if (ring->kern_sq == MAP_FAILED) { + error(0, errno, "mmap(SQES)"); + my_uring_deinit(ring); + return -1; + } + ring->kern_sq.sq_sqes = ring->kern_sq.entries_mmap; + + if (params.features & IORING_FEAT_SINGLE_MMAP) { + /* Purely optional optimization that is possible since kernel v5.4. 
+		 */
+		ring->kern_cq.mmap_size = ring->kern_sq.ring_mmap_size;
+		ring->kern_cq.mmap = ring->kern_sq.ring_mmap;
+	} else {
+		ring->kern_cq.mmap_size = params.cq_off.cqes + params.cq_entries * sizeof(struct io_uring_cqe);
+		ring->kern_cq.mmap = mmap(NULL, ring->kern_cq.mmap_size,
+		                          PROT_READ|PROT_WRITE,
+		                          MAP_SHARED|MAP_POPULATE,
+		                          ring->fd,
+		                          IORING_OFF_CQ_RING);
+		if (ring->kern_cq.mmap == MAP_FAILED) {
+			error(0, errno, "mmap(CQ_RING)");
+			my_uring_deinit(ring);
+			return -1;
+		}
+	}
+	ring->kern_cq.cq_head     = ring->kern_cq.mmap + params.cq_off.head;
+	ring->kern_cq.cq_tail     = ring->kern_cq.mmap + params.cq_off.tail;
+	ring->kern_cq.cq_mask     = ring->kern_cq.mmap + params.cq_off.ring_mask;
+	ring->kern_cq.cq_cap      = ring->kern_cq.mmap + params.cq_off.ring_entries;
+	ring->kern_cq.cq_overflow = ring->kern_cq.mmap + params.cq_off.overflow;
+	ring->kern_cq.cq_cqes     = ring->kern_cq.mmap + params.cq_off.cqes;
+	ring->kern_cq.cq_flags    = ring->kern_cq.mmap + params.cq_off.flags;
+
+	return 0;
+}
+
+/**
+ * Obtain a Submission Queue Entry that is available to be written into.
+ * Returns NULL if the queue is full.
+ *
+ * Once you have filled the sqe with your request, call my_uring_submit() to
+ * submit it to the kernel, in a batch with any other sqe's you've gotten since
+ * the last my_uring_submit() call.
+ */
+static inline struct io_uring_sqe *my_uring_get_sqe(struct my_uring *ring) {
+	/* Because sq_cap is always a power-of-2, we never need to apply `% sq_cap`:
+	 * `& sq_mask` does that for us, and because the uint32_t counters wrap at a
+	 * power-of-2 (2^32), overflow never hurts us either. */
+	if (ring->user_sq_tail + 1 - ring->user_sq_head > *ring->kern_sq.sq_cap)
+		return NULL;
+	return &ring->kern_sq.sq_sqes[ring->user_sq_tail++ & *ring->kern_sq.sq_mask];
+}
+
+/**
+ * Submit a batch of sqes (obtained from my_uring_get_sqe()) to the kernel.
+ *
+ * Returns the number of sqes successfully submitted, or -errno.
+ */
+static inline int my_uring_submit(struct my_uring *ring) {
+	uint32_t read_khead, ktail_to_write, mask;
+	int ret;
+
+	if (ring->user_sq_head == ring->user_sq_tail)
+		/* nothing to do */
+		return 0;
+
+	mask = *ring->kern_sq.sq_mask;
+
+	memory_barrier(); /* to read sq_head */
+	read_khead = *ring->kern_sq.sq_head;
+
+	ktail_to_write = *ring->kern_sq.sq_tail;
+	while (ring->user_sq_head != ring->user_sq_tail &&
+	       ktail_to_write - read_khead < *ring->kern_sq.sq_cap)
+		ring->kern_sq.sq_sqe_idxs[ktail_to_write++ & mask] = ring->user_sq_head++ & mask;
+	memory_barrier(); /* wrote sq_sqe_idxs; must do this *before* the sq_tail write below */
+
+	*ring->kern_sq.sq_tail = ktail_to_write;
+	memory_barrier(); /* wrote sq_tail */
+
+	ret = io_uring_enter1(ring->fd, ktail_to_write - read_khead, 0, 0, NULL);
+	return ret < 0 ? -errno : ret;
+}
+
+/* netio implementation on top of that ****************************************/
+
+static struct my_uring netio_uring = { 0 };
+
+void netio_init(void) {
+	if (!netio_uring.fd)
+		if (my_uring_init(&netio_uring, MAX_WORKERS) < 0)
+			exit(1);
+}
+
+/** Like accept4(2). */
+static inline int netio_accept4(int sock, struct sockaddr *addr, socklen_t *addrlen, int flags) {
+	struct io_uring_sqe *sqe = my_uring_get_sqe(&netio_uring);
+	if (!sqe)
+		return -ENOSR;
+
+	memset(sqe, 0, sizeof(*sqe)); /* SQEs get reused; don't leak stale fields */
+	sqe->opcode = IORING_OP_ACCEPT;
+	sqe->fd = sock;
+	sqe->addr = (uint64_t)addr;
+	sqe->off = (uint64_t)addrlen;
+	sqe->accept_flags = flags;
+	sqe->len = 0; /* number of iovecs */
+
+	/* Submit that accept4() call.
+	 */
+	my_uring_submit(&netio_uring);
+}
+
+int netio_accept(int sock) {
+	return netio_accept4(sock, NULL, NULL, 0);
+}
+
+int netio_read(int conn, void *buf, size_t count) {
+}
+
+int netio_write(int conn, void *buf, size_t count) {
+}
+
+int netio_close(int conn, bool rd, bool wr) {
+}
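
Note that nothing in this commit consumes the completion queue yet: netio_accept4() fills and submits an SQE, but the CQE side and the netio_read()/netio_write()/netio_close() bodies are still stubs. For illustration only, here is a minimal sketch of what popping a completion through the my_io_cq_ring_proxy above could look like, following the same head/tail/mask discipline as my_uring_submit(). It reuses the struct and member names from netio_uring.c; the function name my_uring_reap_one() is made up, and how a result gets routed back to a waiting coroutine is an open design question that this commit does not decide.

/* Sketch only: pop one CQE, if any, out of the completion ring. */
static inline bool my_uring_reap_one(struct my_uring *ring, struct io_uring_cqe *out) {
	memory_barrier();                        /* about to read the kernel-written cq_tail */
	uint32_t ktail = *ring->kern_cq.cq_tail;
	uint32_t khead = *ring->kern_cq.cq_head; /* userspace-written */

	if (khead == ktail)
		return false;                    /* ring is empty */

	/* cq_cap is a power-of-2, so `& cq_mask` is `% cq_cap`. */
	*out = ring->kern_cq.cq_cqes[khead & *ring->kern_cq.cq_mask];

	*ring->kern_cq.cq_head = khead + 1;      /* hand the slot back to the kernel */
	memory_barrier();                        /* wrote cq_head */
	return true;
}

A caller would look at out->user_data (filled in alongside the SQE, much like netio_posix.c smuggles cr_getcid() through sigev_value) to decide which coroutine to resume, and at out->res for the operation's result.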
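Finally, since netio.h is the only interface the rest of srv9p is supposed to see, here is a sketch of how a caller might drive it, assuming read()/write()-style return conventions (byte count on success, 0 for end-of-file from netio_read(), -errno on failure), which is what netio_posix.c's use of aio_return() suggests. The listening-socket setup and the handle_conn() name are placeholders, not part of this commit.

#include "netio.h"

/* Hypothetical per-connection loop: echo whatever the peer sends. */
static int handle_conn(int conn) {
	char buf[512];
	for (;;) {
		int n = netio_read(conn, buf, sizeof(buf));
		if (n <= 0) {                    /* 0 = EOF, negative = -errno */
			netio_close(conn, true, true);
			return n;
		}
		if (netio_write(conn, buf, (size_t)n) < n) {
			netio_close(conn, true, true);
			return -1;
		}
	}
}

static void serve(int listening_sock) {
	netio_init();
	for (;;) {
		int conn = netio_accept(listening_sock);
		if (conn < 0)
			continue;   /* -errno; a real server would log and maybe back off */
		handle_conn(conn);  /* srv9p would hand this off to a coroutine instead */
	}
}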