#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/socket.h>   /* struct sockaddr, socklen_t */
#include <sys/syscall.h>  /* __NR_io_uring_* */
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <signal.h>       /* sigset_t */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>       /* exit() */
#include <string.h>
#include <unistd.h>       /* syscall(), close() */

#include "coroutine.h"
#include "netio.h"

/* Config *********************************************************************/

#define MAX_WORKERS 8

/* Syscalls *******************************************************************/

/* I don't want to pull in liburing, and glibc doesn't provide stubs
 * for these syscalls.
 *
 * For the signatures, search for `SYSCALL_DEFINE` in
 * `linux.git:io_uring/io_uring.c`.
 */

static inline int io_uring_setup(uint32_t entries, struct io_uring_params *p) {
	return syscall(__NR_io_uring_setup, entries, p);
}
static inline int io_uring_enter1(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
                                  sigset_t *sig) {
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags & ~IORING_ENTER_EXT_ARG, sig);
}
static inline int io_uring_enter2(int fd, uint32_t to_submit, uint32_t min_complete, uint32_t flags,
                                  struct io_uring_getevents_arg *argp, size_t argsz) {
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags | IORING_ENTER_EXT_ARG, argp, argsz);
}
static inline int io_uring_register(int fd, int opcode, void *arg, unsigned int nr_args) {
	return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}

/* Userspace component of io_uring ********************************************/

/* I'm not too sure what the semantics around volatile-memory vs
 * __sync_synchronize() are, but this is the definition that
 * linux.git/tools/include/io_uring/mini_liburing.h uses. */
#if defined(__x86_64__)
#  define memory_barrier()  asm volatile ("":::"memory")
#else
#  define memory_barrier() __sync_synchronize() /* GCC built-in */
#endif

/**
 * Submission Queue
 *
 * A proxy into the kernel's internal structures, which have been mmap()ed into
 * userspace.  Each member is a pointer (which is what makes this a proxy)
 * because the order of the members in kernel memory is not ABI-stable, and instead
 * we get offsets into the mmap()ed area from the io_uring_setup() syscall.
 *
 * Call memory_barrier() before reading a non-const value from the proxy, and
 * after writing to it.
 *
 * Exactly which kernel structure this is proxying into varies by kernel version
 * (I point this out so that you can more easily find the documentation for the
 * internal kernel structures):
 *
 *     fs/io_uring.c                    struct io_sq_ring;  kernel v5.1 - v5.3
 *     fs/io_uring.c                    struct io_rings;    kernel v5.4 - v5.19
 *     include/linux/io_uring_types.h   struct io_rings;    kernel v6.0 +
 *
 * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a single
 * monolithic structure in v5.4, I leave them separate here, because
 * conceptually they are better separate; they were merged purely for
 * performance reasons.
 *
 * I also include a leading comment on each member with its kernel definition,
 * which will make the size passed to mmap() make sense.
 */
struct my_io_sq_ring_proxy {
	void    *ring_mmap;      /* in case we want to munmap() it */
	size_t   ring_mmap_size; /* in case we want to munmap() it */
	/* pointers into mmap(offset=IORING_OFF_SQ_RING) */
	/* kernel def              */                                    /* who-writes-it ; description */
	/* u32       r.head;       */ uint32_t             *sq_head;     /* kernel        ; apply sq_mask to get a valid index */
	/* u32       r.tail;       */ uint32_t             *sq_tail;     /* userspace     ; apply sq_mask to get a valid index */
	/* u32       ring_mask;    */ uint32_t             *sq_mask;     /* kern-const    ; always sq_cap-1; AND a counter with it to get a valid index */
	/* u32       ring_entries; */ uint32_t             *sq_cap;      /* kern-const    ; number of entries, always a power-of-2 */
	/* u32       dropped;      */ uint32_t             *sq_dropped;  /* kernel        ; number of entries dropped because invalid index */
	/* atomic_t  flags;        */ int                  *sq_flags;    /* kernel        ; must use memory barrier before checking */
	/* u32       array[];      */ uint32_t             *sq_sqe_idxs; /* userspace     ; sq_cap-sized array of indexes into the sq_sqes array */

	/* This is actually separate from `struct io_sq_ring`/`struct io_rings`.  */
	void    *entries_mmap;      /* in case we want to munmap() */
	size_t   entries_mmap_size; /* in case we want to munmap() */
	/* pointers into mmap(offset=IORING_OFF_SQES) */
	                               /* who-writes-it ; description */
	struct io_uring_sqe  *sq_sqes; /* userspace     ; sq_cap-sized array */

	/* The structure of sq_sqe_idxs is as follows.  The
	 * application writes a request into [sq_tail+1], then
	 * increments sq_tail.  The kernel increments sq_head when it
	 * has processed a request.
	 *
	 *                          <- sq_cap=8
	 *     [ 7: uninitialized ]
	 *     [ 6: uninitialized ]
	 *     [ 5: uninitialized ] <- sq_tail=5
	 *     [ 4: pending   T   ]
	 *     [ 3: pending   |   ]
	 *     [ 2: pending   H   ] <- sq_head=2
	 *     [ 1: finished      ]
	 *     [ 0: finished      ]
	 *
	 * It may wrap-around like
	 *
	 *                          <- sq_cap=8
	 *     [ 7: pending   |   ]
	 *     [ 6: pending   |   ]
	 *     [ 5: pending   |   ]
	 *     [ 4: pending   H   ] <- sq_head=4
	 *     [ 3: finished      ]
	 *     [ 2: finished      ] <- sq_tail=10 (sq_tail%sq_cap = sq_tail&sq_mask = 2)
	 *     [ 1: pending   T   ]
	 *     [ 0: pending   |   ]
	 *
	 * When empty it looks like
	 *                          <- sq_cap=8
	 *     [ 7: uninitialized ]
	 *     [ 6: uninitialized ]
	 *     [ 5: uninitialized ]
	 *     [ 4: uninitialized ]
	 *     [ 3: uninitialized ]
	 *     [ 2: uninitialized ]
	 *     [ 1: finished  O   ] <- sq_head=sq_tail=1
	 *     [ 0: finished      ]
	 */
};
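
/* To illustrate the head/tail arithmetic described in the layout comment
 * above: the counters only ever increase, a masked counter is the actual
 * array index, and the number of in-flight entries is a plain (wrapping)
 * subtraction.  This helper is a sketch for exposition; nothing else in this
 * file uses it.  */
static inline uint32_t my_sq_ring_pending(const struct my_io_sq_ring_proxy *sq) {
	memory_barrier(); /* about to read the kernel-written sq_head */
	return *sq->sq_tail - *sq->sq_head; /* e.g. 10 - 4 = 6 pending in the wrap-around diagram */
}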

/**
 * Completion Queue
 *
 * A proxy into the kernel's internal structure, which has been mmap()ed into
 * userspace.  Each member is a pointer (which is what makes this a proxy)
 * because the order of the members in kernel memory is not ABI-stable, and instead
 * we get offsets into the mmap()ed area from the io_uring_setup() syscall.
 *
 * Call memory_barrier() before reading a non-const value from the proxy, and
 * after writing to it.
 *
 * Exactly which kernel structure this is proxying into varies by kernel version
 * (I point this out so that you can more easily find the documentation for the
 * internal kernel structures):
 *
 *     fs/io_uring.c                    struct io_cq_ring;  kernel v5.1 - v5.3
 *     fs/io_uring.c                    struct io_rings;    kernel v5.4 - v5.19
 *     include/linux/io_uring_types.h   struct io_rings;    kernel v6.0 +
 *
 * Despite the kernel having merged `io_sq_ring` and `io_cq_ring` into a single
 * monolithic structure in v5.4, I leave them separate here, because
 * conceptually they are better separate; they were merged purely for
 * performance reasons.
 *
 * I also include a leading comment on each member with its kernel definition,
 * which will make the size passed to mmap() make sense.
 */
struct my_io_cq_ring_proxy {
	size_t   mmap_size; /* in case we want to munmap() it */
	void    *mmap;      /* in case we want to munmap() it */
	/* pointers into mmap(offset=IORING_OFF_CQ_RING) */
	/* kernel def                         */                                    /* who-writes-it ; description */
	/* u32                  r.head;       */ uint32_t             *cq_head;     /* userspace     ; apply cq_mask to get a valid index */
	/* u32                  r.tail;       */ uint32_t             *cq_tail;     /* kernel        ; apply cq_mask to get a valid index */
	/* u32                  ring_mask;    */ uint32_t             *cq_mask;     /* kern-const    ; always cq_cap-1; AND a counter with it to get a valid index */
	/* u32                  ring_entries; */ uint32_t             *cq_cap;      /* kern-const    ; number of entries, always a power-of-2 */
	/* u32                  flags;        */ uint32_t             *cq_flags;    /* userspace     ; TODO */
	/* u32                  overflow;     */ uint32_t             *cq_overflow; /* kernel        ; TODO */
	/* struct io_uring_cqe  cqes[];       */ struct io_uring_cqe  *cq_cqes;     /* mostly-kernel ; cq_cap-sized array; userspace is allowed to modify pending entries */
};
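
/* A minimal sketch (not yet used in this file) of how userspace would reap one
 * completion from this proxy, following the who-writes-it rules above: the
 * kernel advances cq_tail as completions arrive, and userspace advances
 * cq_head as it consumes them.  */
static inline int my_cq_ring_reap_one(struct my_io_cq_ring_proxy *cq, struct io_uring_cqe *out) {
	memory_barrier(); /* about to read the kernel-written cq_tail */
	if (*cq->cq_head == *cq->cq_tail)
		return 0; /* the completion queue is empty */
	*out = cq->cq_cqes[*cq->cq_head & *cq->cq_mask];
	(*cq->cq_head)++;
	memory_barrier(); /* wrote cq_head; the kernel may now reuse that slot */
	return 1;
}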

struct my_uring {
	int  fd;

	struct my_io_sq_ring_proxy  kern_sq;
	struct my_io_cq_ring_proxy  kern_cq;

	uint32_t  user_sq_head; /* userspace-only shadow; trails user_sq_tail, advanced by my_uring_submit() as sqes are handed to the kernel */
	uint32_t  user_sq_tail; /* userspace-only shadow; advanced by my_uring_get_sqe() as sqes are handed out */
	uint32_t  user_cq_head; /* userspace-only shadow of the CQ head (not used yet below) */
	uint32_t  user_cq_tail; /* userspace-only shadow of the CQ tail (not used yet below) */
};

static void my_uring_deinit(struct my_uring *ring) {
	if (ring->kern_cq.mmap &&
	    ring->kern_cq.mmap != MAP_FAILED &&
	    ring->kern_cq.mmap != ring->kern_sq.ring_mmap)
		munmap(ring->kern_cq.mmap, ring->kern_cq.mmap_size);
	if (ring->kern_sq.entries_mmap &&
	    ring->kern_sq.entries_mmap != MAP_FAILED)
		munmap(ring->kern_sq.entries_mmap, ring->kern_sq.entries_mmap_size);
	if (ring->kern_sq.ring_mmap &&
	    ring->kern_sq.ring_mmap != MAP_FAILED)
		munmap(ring->kern_sq.ring_mmap, ring->kern_sq.ring_mmap_size);
	if (ring->fd >= 0)
		close(ring->fd);
	memset(ring, 0, sizeof(*ring));
	ring->fd = -1;
}

static int my_uring_init(struct my_uring *ring, uint32_t num_entries) {
	assert(ring);

	memset(ring, 0, sizeof(*ring));

	struct io_uring_params params = { 0 };
	ring->fd = io_uring_setup(num_entries, &params);
	if (ring->fd < 0) {
		error(0, errno, "io_uring_setup");
		my_uring_deinit(ring);
		return -1;
	}
	
	ring->kern_sq.ring_mmap_size = params.sq_off.array + (params.sq_entries * sizeof(uint32_t));
	ring->kern_sq.ring_mmap = mmap(NULL, ring->kern_sq.ring_mmap_size,
	                               PROT_READ|PROT_WRITE,
	                               MAP_SHARED|MAP_POPULATE,
	                               ring->fd,
	                               IORING_OFF_SQ_RING);
	if (ring->kern_sq.ring_mmap == MAP_FAILED) {
		error(0, errno, "mmap(SQ_RING)");
		my_uring_deinit(ring);
		return -1;
	}
	ring->kern_sq.sq_head     = ring->kern_sq.ring_mmap + params.sq_off.head;
	ring->kern_sq.sq_tail     = ring->kern_sq.ring_mmap + params.sq_off.tail;
	ring->kern_sq.sq_mask     = ring->kern_sq.ring_mmap + params.sq_off.ring_mask;
	ring->kern_sq.sq_cap      = ring->kern_sq.ring_mmap + params.sq_off.ring_entries;
	ring->kern_sq.sq_flags    = ring->kern_sq.ring_mmap + params.sq_off.flags;
	ring->kern_sq.sq_dropped  = ring->kern_sq.ring_mmap + params.sq_off.dropped;
	ring->kern_sq.sq_sqe_idxs = ring->kern_sq.ring_mmap + params.sq_off.array;

	ring->kern_sq.entries_mmap_size = params.sq_entries * sizeof(struct io_uring_sqe);
	ring->kern_sq.entries_mmap = mmap(NULL, ring->kern_sq.entries_mmap_size,
	                                  PROT_READ|PROT_WRITE,
	                                  MAP_SHARED|MAP_POPULATE,
	                                  ring->fd,
	                                  IORING_OFF_SQES);
	if (ring->kern_sq.entries_mmap == MAP_FAILED) {
		error(0, errno, "mmap(SQES)");
		my_uring_deinit(ring);
		return -1;
	}
	ring->kern_sq.sq_sqes = ring->kern_sq.entries_mmap;

	if (params.features & IORING_FEAT_SINGLE_MMAP) {
		/* Purely optional optimization that is possible since kernel v5.4.  */
		ring->kern_cq.mmap_size = ring->kern_sq.ring_mmap_size;
		ring->kern_cq.mmap      = ring->kern_sq.ring_mmap;
	} else {
		ring->kern_cq.mmap_size = params.cq_off.cqes + (params.cq_entries * sizeof(struct io_uring_cqe));
		ring->kern_cq.mmap = mmap(NULL, ring->kern_cq.mmap_size,
		                          PROT_READ|PROT_WRITE,
		                          MAP_SHARED|MAP_POPULATE,
		                          ring->fd,
		                          IORING_OFF_CQ_RING);
		if (ring->kern_cq.mmap == MAP_FAILED) {
			error(0, errno, "mmap(CQ_RING)");
			my_uring_deinit(ring);
			return -1;
		}
	}
	ring->kern_cq.cq_head     = ring->kern_cq.mmap + params.cq_off.head;
	ring->kern_cq.cq_tail     = ring->kern_cq.mmap + params.cq_off.tail;
	ring->kern_cq.cq_mask     = ring->kern_cq.mmap + params.cq_off.ring_mask;
	ring->kern_cq.cq_cap      = ring->kern_cq.mmap + params.cq_off.ring_entries;
	ring->kern_cq.cq_overflow = ring->kern_cq.mmap + params.cq_off.overflow;
	ring->kern_cq.cq_cqes     = ring->kern_cq.mmap + params.cq_off.cqes;
	ring->kern_cq.cq_flags    = ring->kern_cq.mmap + params.cq_off.flags;

	return 0;
}

/**
 * Obtain a Submission Queue Entry that is available to be written into.
 * Returns NULL if the queue is full.
 *
 * Once you have filled the sqe with your request, call my_uring_submit() to
 * submit it to the kernel, in a batch with any other sqe's you've gotten since
 * the last my_uring_submit() call.
 */
static inline struct io_uring_sqe *my_uring_get_sqe(struct my_uring *ring) {
	/* Because sq_cap is always a power-of-2, we don't ever have to apply `%
	 * sq_cap` because `& sq_mask` does that for us, and uint32_t wrapping at
	 * a power-of-2 (2^32) as well means overflow will never hurt us.  */
	if (ring->user_sq_tail + 1 - ring->user_sq_head > *ring->kern_sq.sq_cap)
		return NULL;
	return &ring->kern_sq.sq_sqes[ring->user_sq_tail++ & *ring->kern_sq.sq_mask];
}

/**
 * Submit a batch of sqes (obtained from my_uring_get_sqe()) to the kernel.
 *
 * Returns the number of sqes successfully submitted, or -errno.
 */
static inline int my_uring_submit(struct my_uring *ring) {
	uint32_t read_khead, ktail_to_write, mask;
	int ret;

	if (ring->user_sq_head == ring->user_sq_tail)
		/* nothing to do */
		return 0;

	mask = *ring->kern_sq.sq_mask;

	memory_barrier(); /* to read sq_head */
	read_khead = *ring->kern_sq.sq_head;

	ktail_to_write = *ring->kern_sq.sq_tail;
	while (ring->user_sq_head != ring->user_sq_tail &&
	       ktail_to_write - read_khead < *ring->kern_sq.sq_cap)
		ring->kern_sq.sq_sqe_idxs[ktail_to_write++ & mask] = ring->user_sq_head++ & mask;
	memory_barrier(); /* wrote sq_sqe_idxs; must do this *before* the sq_tail write below */

	*ring->kern_sq.sq_tail = ktail_to_write;
	memory_barrier(); /* wrote sq_tail */

	ret = io_uring_enter1(ring->fd, ktail_to_write - read_khead, 0, 0, NULL);
	return ret < 0 ? -errno : ret;
}
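
/* A sketch of the intended calling pattern for my_uring_get_sqe() and
 * my_uring_submit(), using IORING_OP_NOP so it does not depend on anything
 * else in this file.  The function name is illustrative only.  */
static inline int my_uring_submit_nop(struct my_uring *ring) {
	struct io_uring_sqe *sqe = my_uring_get_sqe(ring);
	if (!sqe)
		return -ENOSR; /* submission queue is full */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_NOP;
	/* Batches this sqe (and any others obtained since the last submit) into
	 * the kernel's ring and calls io_uring_enter().  */
	return my_uring_submit(ring);
}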

/* netio implementation on top of that ****************************************/

static struct my_uring netio_uring = { 0 };

void netio_init(void) {
	if (!netio_uring.fd)
		if (my_uring_init(&netio_uring, MAX_WORKERS) < 0) /* MAX_WORKERS assumed as the intended queue depth */
			exit(1);
}

/** Like accept4(2).  */
static inline int netio_accept4(int sock, struct sockaddr *addr, socklen_t *addrlen, int flags) {
	struct io_uring_sqe *sqe = my_uring_get_sqe(&netio_uring);
	if (!sqe)
		return -ENOSR;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode       = IORING_OP_ACCEPT;
	sqe->fd           = sock;
	sqe->addr         = (uint64_t)(uintptr_t)addr;
	sqe->off          = (uint64_t)(uintptr_t)addrlen;
	sqe->accept_flags = flags;
	sqe->len          = 0; /* number of iovecs */

	/* Submit that accept4() call.  */
	int ret = my_uring_submit(&netio_uring);
	if (ret < 0)
		return ret;

	/* TODO: wait (via coroutine.h) for the matching cqe and return its res
	 * field; completion handling is not implemented yet.  */
	return -ENOSYS;
}

int netio_accept(int sock) {
	return netio_accept4(sock, NULL, NULL, 0);
}

int netio_read(int conn, void *buf, size_t count) {
	/* TODO: not yet implemented.  */
	return -ENOSYS;
}

int netio_write(int conn, void *buf, size_t count) {
	/* TODO: not yet implemented.  */
	return -ENOSYS;
}

int netio_close(int conn, bool rd, bool wr) {
	/* TODO: not yet implemented.  */
	return -ENOSYS;
}