Diffstat (limited to 'netio_uring.c')
-rw-r--r--  netio_uring.c | 365
1 file changed, 0 insertions, 365 deletions
diff --git a/netio_uring.c b/netio_uring.c
deleted file mode 100644
index bf3f9c0..0000000
--- a/netio_uring.c
+++ /dev/null
@@ -1,365 +0,0 @@
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/socket.h>  /* for struct sockaddr, socklen_t */
#include <sys/syscall.h> /* for __NR_io_uring_* */
#include <unistd.h>      /* for syscall(), close() */
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <signal.h>      /* for sigset_t */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>      /* for exit() */
#include <string.h>

#include "coroutine.h"
#include "netio.h"

/* Config *********************************************************************/

#define MAX_WORKERS 8

/* Syscalls *******************************************************************/

/* I don't want to pull in liburing, and glibc doesn't provide stubs
 * for these syscalls.
 *
 * For the signatures, search for `SYSCALL_DEFINE` in
 * `linux.git:io_uring/io_uring.c`.
 */

static inline int io_uring_setup(uint32_t entries, struct io_uring_params *p) {
	return syscall(__NR_io_uring_setup, entries, p);
}
static inline int io_uring_enter1(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
                                  sigset_t *sig) {
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags & ~IORING_ENTER_EXT_ARG, sig);
}
static inline int io_uring_enter2(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
                                  struct io_uring_getevents_arg *argp, size_t argsz) {
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags | IORING_ENTER_EXT_ARG, argp, argsz);
}
static inline int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args) {
	return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}
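/* A quick capability probe, as a minimal sketch (the helper name is my own
 * illustration; nothing below calls it): io_uring_setup() is the cheapest way
 * to ask whether the running kernel supports io_uring at all.  ENOSYS means
 * the syscall doesn't exist (kernel older than v5.1). */
static inline bool my_uring_is_supported(void) {
	struct io_uring_params params = { 0 };
	int fd = io_uring_setup(1, &params);
	if (fd < 0)
		return errno != ENOSYS; /* other errors still mean the syscall exists */
	close(fd);
	return true;
}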
/* Userspace component of io_uring ********************************************/

/* I'm not too sure what the semantics around volatile memory vs
 * __sync_synchronize() are, but this is the definition that
 * linux.git/tools/include/io_uring/mini_liburing.h uses. */
#if defined(__x86_64__)
# define memory_barrier() asm volatile ("":::"memory")
#else
# define memory_barrier() __sync_synchronize() /* GCC built-in */
#endif

/**
 * Submission Queue
 *
 * A proxy into the kernel's internal structures, which have been mmap()ed
 * into userspace.  Each member is a pointer (which is what makes this a
 * proxy) because the layout of the members in kernel memory is not
 * ABI-stable; instead we get offsets into the mmap()ed area from the
 * io_uring_setup() syscall.
 *
 * Call memory_barrier() before reading a non-const value from the proxy, and
 * after writing to it.
 *
 * Exactly which kernel structure this is proxying into varies by kernel
 * version (I point this out so that you can more easily find the
 * documentation for the internal kernel structures):
 *
 *   fs/io_uring.c                  struct io_sq_ring;  kernel v5.1 - v5.3
 *   fs/io_uring.c                  struct io_rings;    kernel v5.4 - v5.19
 *   include/linux/io_uring_types.h struct io_rings;    kernel v6.0 +
 *
 * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a
 * single monolithic structure in v5.4, I leave them separate here, because
 * conceptually they are better separate; they were merged purely for
 * performance reasons.
 *
 * I also include a leading comment on each member giving the kernel
 * definition, which will make the size passed to mmap() make sense.
 */
struct my_io_sq_ring_proxy {
	void   *ring_mmap;      /* in case we want to munmap() it */
	size_t  ring_mmap_size; /* in case we want to munmap() it */
	/* pointers into mmap(offset=IORING_OFF_SQ_RING) */
	/* kernel def */        /* who-writes-it ; description */
	/* u32 r.head;       */ uint32_t *sq_head;     /* kernel     ; apply sq_mask to get a valid index */
	/* u32 r.tail;       */ uint32_t *sq_tail;     /* userspace  ; apply sq_mask to get a valid index */
	/* u32 ring_mask;    */ uint32_t *sq_mask;     /* kern-const ; sq_cap-1, for mapping counters to indexes */
	/* u32 ring_entries; */ uint32_t *sq_cap;      /* kern-const ; number of entries, always a power-of-2 */
	/* u32 dropped;      */ uint32_t *sq_dropped;  /* kernel     ; number of entries dropped because of an invalid index */
	/* atomic_t flags;   */ int      *sq_flags;    /* kernel     ; must use memory barrier before checking */
	/* u32 array[];      */ uint32_t *sq_sqe_idxs; /* userspace  ; sq_cap-sized array of indexes into the sq_sqes array */

	/* This is actually separate from `struct io_sq_ring`/`struct io_rings`. */
	void   *entries_mmap;      /* in case we want to munmap() it */
	size_t  entries_mmap_size; /* in case we want to munmap() it */
	/* pointers into mmap(offset=IORING_OFF_SQES) */
	                        /* who-writes-it ; description */
	struct io_uring_sqe *sq_sqes; /* userspace ; sq_cap-sized array */

	/* The structure of sq_sqe_idxs is as follows.  The application writes
	 * a request into [sq_tail & sq_mask], then increments sq_tail.  The
	 * kernel increments sq_head when it has processed a request.
	 *
	 *                          <- sq_cap=8
	 *   [ 7: uninitialized   ]
	 *   [ 6: uninitialized   ]
	 *   [ 5: uninitialized   ] <- sq_tail=5
	 *   [ 4: pending       T ]
	 *   [ 3: pending       | ]
	 *   [ 2: pending       H ] <- sq_head=2
	 *   [ 1: finished        ]
	 *   [ 0: finished        ]
	 *
	 * It may wrap around like
	 *
	 *                          <- sq_cap=8
	 *   [ 7: pending       | ]
	 *   [ 6: pending       | ]
	 *   [ 5: pending       | ]
	 *   [ 4: pending       H ] <- sq_head=4
	 *   [ 3: finished        ]
	 *   [ 2: finished        ] <- sq_tail=10 (sq_tail%sq_cap = sq_tail&sq_mask = 2)
	 *   [ 1: pending       T ]
	 *   [ 0: pending       | ]
	 *
	 * When empty it looks like
	 *
	 *                          <- sq_cap=8
	 *   [ 7: uninitialized   ]
	 *   [ 6: uninitialized   ]
	 *   [ 5: uninitialized   ]
	 *   [ 4: uninitialized   ]
	 *   [ 3: uninitialized   ]
	 *   [ 2: uninitialized   ]
	 *   [ 1: finished      O ] <- sq_head=sq_tail=1
	 *   [ 0: finished        ]
	 */
};
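/* Illustrative helpers, as a sketch (the names are my own; nothing below
 * uses them): because sq_head and sq_tail are free-running uint32_t
 * counters, plain subtraction gives the number of in-flight entries even
 * across wrap-around, and `& *sq->sq_mask` (rather than `% *sq->sq_cap`)
 * maps a counter to a slot. */
static inline uint32_t my_sq_pending(struct my_io_sq_ring_proxy *sq) {
	memory_barrier(); /* about to read the kernel-written sq_head */
	return *sq->sq_tail - *sq->sq_head;
}
static inline uint32_t my_sq_space_left(struct my_io_sq_ring_proxy *sq) {
	return *sq->sq_cap - my_sq_pending(sq);
}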
/**
 * Completion Queue
 *
 * A proxy into the kernel's internal structure, which has been mmap()ed into
 * userspace.  Each member is a pointer (which is what makes this a proxy)
 * because the layout of the members in kernel memory is not ABI-stable;
 * instead we get offsets into the mmap()ed area from the io_uring_setup()
 * syscall.
 *
 * Call memory_barrier() before reading a non-const value from the proxy, and
 * after writing to it.
 *
 * Exactly which kernel structure this is proxying into varies by kernel
 * version (I point this out so that you can more easily find the
 * documentation for the internal kernel structures):
 *
 *   fs/io_uring.c                  struct io_cq_ring;  kernel v5.1 - v5.3
 *   fs/io_uring.c                  struct io_rings;    kernel v5.4 - v5.19
 *   include/linux/io_uring_types.h struct io_rings;    kernel v6.0 +
 *
 * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a
 * single monolithic structure in v5.4, I leave them separate here, because
 * conceptually they are better separate; they were merged purely for
 * performance reasons.
 *
 * I also include a leading comment on each member giving the kernel
 * definition, which will make the size passed to mmap() make sense.
 */
struct my_io_cq_ring_proxy {
	size_t  mmap_size; /* in case we want to munmap() it */
	void   *mmap;      /* in case we want to munmap() it */
	/* pointers into mmap(offset=IORING_OFF_CQ_RING) */
	/* kernel def */        /* who-writes-it ; description */
	/* u32 r.head;       */ uint32_t *cq_head;     /* userspace  ; apply cq_mask to get a valid index */
	/* u32 r.tail;       */ uint32_t *cq_tail;     /* kernel     ; apply cq_mask to get a valid index */
	/* u32 ring_mask;    */ uint32_t *cq_mask;     /* kern-const ; cq_cap-1, for mapping counters to indexes */
	/* u32 ring_entries; */ uint32_t *cq_cap;      /* kern-const ; number of entries, always a power-of-2 */
	/* u32 flags;        */ uint32_t *cq_flags;    /* userspace  ; TODO */
	/* u32 overflow;     */ uint32_t *cq_overflow; /* kernel     ; number of cqes dropped because the queue was full */
	/* struct io_uring_cqe cqes[]; */ struct io_uring_cqe *cq_cqes; /* mostly-kernel ; cq_cap-sized array; userspace is allowed to modify pending entries */
};

struct my_uring {
	int fd;

	struct my_io_sq_ring_proxy kern_sq;
	struct my_io_cq_ring_proxy kern_cq;

	uint32_t user_sq_head;
	uint32_t user_sq_tail;
	uint32_t user_cq_head;
	uint32_t user_cq_tail;
};

static void my_uring_deinit(struct my_uring *ring) {
	if (ring->kern_cq.mmap &&
	    ring->kern_cq.mmap != MAP_FAILED &&
	    ring->kern_cq.mmap != ring->kern_sq.ring_mmap)
		munmap(ring->kern_cq.mmap, ring->kern_cq.mmap_size);
	if (ring->kern_sq.entries_mmap &&
	    ring->kern_sq.entries_mmap != MAP_FAILED)
		munmap(ring->kern_sq.entries_mmap, ring->kern_sq.entries_mmap_size);
	if (ring->kern_sq.ring_mmap &&
	    ring->kern_sq.ring_mmap != MAP_FAILED)
		munmap(ring->kern_sq.ring_mmap, ring->kern_sq.ring_mmap_size);
	if (ring->fd >= 0)
		close(ring->fd);
	memset(ring, 0, sizeof(*ring));
	ring->fd = -1;
}

static int my_uring_init(struct my_uring *ring, uint32_t num_entries) {
	assert(ring);

	memset(ring, 0, sizeof(*ring));

	struct io_uring_params params = { 0 };
	ring->fd = io_uring_setup(num_entries, &params);
	if (ring->fd < 0) {
		error(0, errno, "io_uring_setup");
		my_uring_deinit(ring);
		return -1;
	}

	ring->kern_sq.ring_mmap_size = params.sq_off.array + (params.sq_entries * sizeof(uint32_t));
	if (params.features & IORING_FEAT_SINGLE_MMAP) {
		/* The SQ and CQ rings share one allocation (possible since
		 * kernel v5.4); size the mapping to cover both. */
		size_t cq_size = params.cq_off.cqes + (params.cq_entries * sizeof(struct io_uring_cqe));
		if (cq_size > ring->kern_sq.ring_mmap_size)
			ring->kern_sq.ring_mmap_size = cq_size;
	}
	ring->kern_sq.ring_mmap = mmap(NULL, ring->kern_sq.ring_mmap_size,
	                               PROT_READ|PROT_WRITE,
	                               MAP_SHARED|MAP_POPULATE,
	                               ring->fd,
	                               IORING_OFF_SQ_RING);
	if (ring->kern_sq.ring_mmap == MAP_FAILED) {
		error(0, errno, "mmap(SQ_RING)");
		my_uring_deinit(ring);
		return -1;
	}
	ring->kern_sq.sq_head     = ring->kern_sq.ring_mmap + params.sq_off.head;
	ring->kern_sq.sq_tail     = ring->kern_sq.ring_mmap + params.sq_off.tail;
	ring->kern_sq.sq_mask     = ring->kern_sq.ring_mmap + params.sq_off.ring_mask;
	ring->kern_sq.sq_cap      = ring->kern_sq.ring_mmap + params.sq_off.ring_entries;
	ring->kern_sq.sq_flags    = ring->kern_sq.ring_mmap + params.sq_off.flags;
	ring->kern_sq.sq_dropped  = ring->kern_sq.ring_mmap + params.sq_off.dropped;
	ring->kern_sq.sq_sqe_idxs = ring->kern_sq.ring_mmap + params.sq_off.array;

	ring->kern_sq.entries_mmap_size = params.sq_entries * sizeof(struct io_uring_sqe);
	ring->kern_sq.entries_mmap = mmap(NULL, ring->kern_sq.entries_mmap_size,
	                                  PROT_READ|PROT_WRITE,
	                                  MAP_SHARED|MAP_POPULATE,
	                                  ring->fd,
	                                  IORING_OFF_SQES);
	if (ring->kern_sq.entries_mmap == MAP_FAILED) {
		error(0, errno, "mmap(SQES)");
		my_uring_deinit(ring);
		return -1;
	}
	ring->kern_sq.sq_sqes = ring->kern_sq.entries_mmap;

	if (params.features & IORING_FEAT_SINGLE_MMAP) {
		/* Purely optional optimization that is possible since kernel
		 * v5.4: reuse the SQ_RING mapping instead of mmap()ing again. */
		ring->kern_cq.mmap_size = ring->kern_sq.ring_mmap_size;
		ring->kern_cq.mmap      = ring->kern_sq.ring_mmap;
	} else {
		ring->kern_cq.mmap_size = params.cq_off.cqes + (params.cq_entries * sizeof(struct io_uring_cqe));
		ring->kern_cq.mmap = mmap(NULL, ring->kern_cq.mmap_size,
		                          PROT_READ|PROT_WRITE,
		                          MAP_SHARED|MAP_POPULATE,
		                          ring->fd,
		                          IORING_OFF_CQ_RING);
		if (ring->kern_cq.mmap == MAP_FAILED) {
			error(0, errno, "mmap(CQ_RING)");
			my_uring_deinit(ring);
			return -1;
		}
	}
	ring->kern_cq.cq_head     = ring->kern_cq.mmap + params.cq_off.head;
	ring->kern_cq.cq_tail     = ring->kern_cq.mmap + params.cq_off.tail;
	ring->kern_cq.cq_mask     = ring->kern_cq.mmap + params.cq_off.ring_mask;
	ring->kern_cq.cq_cap      = ring->kern_cq.mmap + params.cq_off.ring_entries;
	ring->kern_cq.cq_overflow = ring->kern_cq.mmap + params.cq_off.overflow;
	ring->kern_cq.cq_cqes     = ring->kern_cq.mmap + params.cq_off.cqes;
	ring->kern_cq.cq_flags    = ring->kern_cq.mmap + params.cq_off.flags;

	return 0;
}
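/* Illustrative lifecycle of the pair above, as a self-test sketch (the
 * function name and the queue depth of 8 are my own choices; nothing calls
 * this): */
static inline int my_uring_self_test(void) {
	struct my_uring ring;
	if (my_uring_init(&ring, 8) < 0)
		return -1; /* my_uring_init() already printed an error */
	/* ... get sqes, fill them, my_uring_submit(), reap cqes ... */
	my_uring_deinit(&ring);
	return 0;
}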
/**
 * Obtain a Submission Queue Entry that is available to be written into.
 * Returns NULL if the queue is full.
 *
 * Once you have filled the sqe with your request, call my_uring_submit() to
 * submit it to the kernel, in a batch with any other sqes you've gotten since
 * the last my_uring_submit() call.
 */
static inline struct io_uring_sqe *my_uring_get_sqe(struct my_uring *ring) {
	/* Because sq_cap is always a power-of-2, we never have to apply
	 * `% sq_cap`; `& sq_mask` does that for us.  And because uint32_t
	 * arithmetic wraps modulo 2^32, itself a power-of-2, counter
	 * overflow never hurts us. */
	if (ring->user_sq_tail + 1 - ring->user_sq_head > *ring->kern_sq.sq_cap)
		return NULL;
	return &ring->kern_sq.sq_sqes[ring->user_sq_tail++ & *ring->kern_sq.sq_mask];
}

/**
 * Submit a batch of sqes (obtained from my_uring_get_sqe()) to the kernel.
 *
 * Returns the number of sqes successfully submitted, or -errno.
 */
static inline int my_uring_submit(struct my_uring *ring) {
	uint32_t read_khead, ktail_to_write, mask;
	int ret;

	if (ring->user_sq_head == ring->user_sq_tail)
		/* nothing to do */
		return 0;

	mask = *ring->kern_sq.sq_mask;

	memory_barrier(); /* about to read sq_head */
	read_khead = *ring->kern_sq.sq_head;

	ktail_to_write = *ring->kern_sq.sq_tail;
	while (ring->user_sq_head != ring->user_sq_tail &&
	       ktail_to_write - read_khead < *ring->kern_sq.sq_cap)
		ring->kern_sq.sq_sqe_idxs[ktail_to_write++ & mask] = ring->user_sq_head++ & mask;
	memory_barrier(); /* wrote sq_sqe_idxs; must do this *before* the sq_tail write below */

	*ring->kern_sq.sq_tail = ktail_to_write;
	memory_barrier(); /* wrote sq_tail */

	ret = io_uring_enter1(ring->fd, ktail_to_write - read_khead, 0, 0, NULL);
	return ret < 0 ? -errno : ret;
}
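/* The completion-queue counterpart is not implemented in this file; as a
 * minimal sketch (my_uring_get_cqe is a hypothetical name, following the
 * conventions above), popping one CQE would look like this.  The CQE is
 * copied out before cq_head is published, because publishing cq_head lets
 * the kernel reuse the slot. */
static inline int my_uring_get_cqe(struct my_uring *ring, struct io_uring_cqe *out) {
	memory_barrier(); /* about to read the kernel-written cq_tail */
	if (ring->user_cq_head == *ring->kern_cq.cq_tail)
		return -EAGAIN; /* completion queue is empty */
	*out = ring->kern_cq.cq_cqes[ring->user_cq_head++ & *ring->kern_cq.cq_mask];
	*ring->kern_cq.cq_head = ring->user_cq_head;
	memory_barrier(); /* wrote cq_head */
	return 0;
}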
/* netio implementation on top of that ****************************************/

static struct my_uring netio_uring = { 0 };

void netio_init(void) {
	if (!netio_uring.fd)
		/* queue depth: one entry per worker (assumption) */
		if (my_uring_init(&netio_uring, MAX_WORKERS) < 0)
			exit(1);
}

/** Like accept4(2). */
static inline int netio_accept4(int sock, struct sockaddr *addr, socklen_t *addrlen, int flags) {
	struct io_uring_sqe *sqe = my_uring_get_sqe(&netio_uring);
	if (!sqe)
		return -ENOSR;

	memset(sqe, 0, sizeof(*sqe)); /* the slot may hold a stale, already-consumed sqe */
	sqe->opcode = IORING_OP_ACCEPT;
	sqe->fd = sock;
	sqe->addr = (uint64_t)(uintptr_t)addr;
	sqe->off = (uint64_t)(uintptr_t)addrlen;
	sqe->accept_flags = flags;
	sqe->len = 0; /* number of iovecs */

	/* Submit that accept4() call. */
	int ret = my_uring_submit(&netio_uring);
	if (ret < 0)
		return ret;
	/* TODO: suspend this coroutine until the matching cqe arrives, then
	 * return cqe->res (the accepted fd). */
	return -ENOSYS;
}

int netio_accept(int sock) {
	return netio_accept4(sock, NULL, NULL, 0);
}

int netio_read(int conn, void *buf, size_t count) {
	/* TODO: not yet implemented */
	(void)conn; (void)buf; (void)count;
	return -ENOSYS;
}

int netio_write(int conn, void *buf, size_t count) {
	/* TODO: not yet implemented */
	(void)conn; (void)buf; (void)count;
	return -ENOSYS;
}

int netio_close(int conn, bool rd, bool wr) {
	/* TODO: not yet implemented */
	(void)conn; (void)rd; (void)wr;
	return -ENOSYS;
}
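/* A sketch of what the submission half of netio_read() could look like,
 * mirroring netio_accept4() above (netio_read_submit is a hypothetical
 * helper; IORING_OP_READ needs kernel v5.6+): */
static inline int netio_read_submit(int conn, void *buf, size_t count) {
	struct io_uring_sqe *sqe = my_uring_get_sqe(&netio_uring);
	if (!sqe)
		return -ENOSR;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READ;
	sqe->fd = conn;
	sqe->addr = (uint64_t)(uintptr_t)buf;
	sqe->len = (uint32_t)count;
	sqe->off = 0; /* the offset is meaningless for sockets */
	return my_uring_submit(&netio_uring);
}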