/* coroutine.c - Simple embeddable coroutine implementation * * Copyright (C) 2024 Luke T. Shumaker * SPDX-Licence-Identifier: AGPL-3.0-or-later */ #include #include /* for setjmp(), longjmp(), jmp_buf */ #include /* for uint8_t */ #include /* for printf(), fprintf(), stderr */ #include /* for aligned_alloc(), free() */ #if __x86_64__ # include /* for pause() */ #elif __arm__ # include "hardware/sync.h" /* for __wfi(); */ # define pause() __wfi() #endif #include /* Configuration **************************************************************/ #define USE_CONFIG_COROUTINE #include "config.h" #ifndef CONFIG_COROUTINE_DEFAULT_STACK_SIZE # error config.h must define CONFIG_COROUTINE_DEFAULT_STACK_SIZE #endif #ifndef CONFIG_COROUTINE_NUM # error config.h must define CONFIG_COROUTINE_NUM #endif #ifndef CONFIG_COROUTINE_MEASURE_STACK # error config.h must define CONFIG_COROUTINE_MEASURE_STACK #endif #ifndef CONFIG_COROUTINE_PROTECT_STACK # error config.h must define CONFIG_COROUTINE_PROTECT_STACK #endif #ifndef CONFIG_COROUTINE_DEBUG # error config.h must define CONFIG_COROUTINE_DEBUG #endif /* Implementation *************************************************************/ /* * Portability notes: * * - It uses GCC `__attribute__`s, though these can likely be easily * swapped for other compilers. * * - It has a small bit of CPU-specific code (assembly, and definition * of a STACK_GROWS_DOWNWARD={0,1} macro) in * coroutine.c:call_with_stack(). Other than this, it should be * portable to other CPUs. It currently contains implementations * for __x86_64__ and __arm__, and should be fairly easy to add * implementations for other CPUs. * * - It uses setjmp()/longjmp() in "unsafe" ways. POSIX-2017 * longjmp(3) says * * > If the most recent invocation of setjmp() with the * > corresponding jmp_buf ... or if the function containing the * > invocation of setjmp() has terminated execution in the interim, * > or if the invocation of setjmp() was within the scope of an * > identifier with variably modified type and execution has left * > that scope in the interim, the behavior is undefined. * * We use longjmp() both of these scenarios, but make it OK by using * call_with_stack() to manage the stack ourselves, assuming the * sole reason that longjmp() behavior is undefined in such cases is * because the stack that its saved stack-pointer points to is no * longer around. It seems absurd that an implementation would * choose to do something else, but I'm calling it out here because * you never know. * * Note that setjmp()/longjmp() are defined in 3 places: in the libc * (glibc/newlib), as GCC intrinsics, and the lower-level GCC * __builtin_{setjmp,newlib} which the libc and intrinsic versions * likely use. Our assumptions seem to be valid for * x86_64-pc-linux-gnu/gcc-14.2.1/glibc-2.40 and * arm-none-eabi/gcc-14.1.0/newlib-4.4.0. * * Why not use , the now-deprecated (was in POSIX.1-2001, * is gone in POSIX-2008) predecesor to ? It would let us * do this without any assembly or unsafe assumptions. Simply: * because newlib does not provide it. */ /* * Design decisions and notes: * * - Coroutines are launched with a stack stack that is filled with * known values (zero or a pre-determined pattern, depending on the * config above). Because all stack variables should be initialized * before they are read, this "shouldn't" make a difference, but: * (1) Initializing it to a known value allows us to measure how * much of the stack was written to, which is helpful to tune stack * sizes. (2) Leaving it uninitialized just gives me the willies. * * - Because embedded programs should be adverse to using the heap, * CONFIG_COROUTINE_NUM is fixed, instead of having coroutine_add() * dynamically grow the coroutine_table as-needed. * * - On the flip-side, coroutine stacks are allocated on the heap * instead of having them be statically-allocated along with * coroutine_table. (1) This reduced the blast-area of damage for a * stack-overflow; and indeed if the end of the stack alignes with a * page-boundary then memory-protection can even detect the overflow * for us. (2) Having different-looking addresses for stack-area vs * static-area is handy for making things jump out at you when * debugging. (3) This can likely also improve things with being * page-aligned. * * - Coroutines must use cr_exit() instead of returning because if * they return then they will return to call_with_stack() in * coroutine_add() (not to after the longjmp() call in * coroutine_main()), and besides being * wrong-for-our-desired-flow-control, that's a stack location that * no longer exists. */ /* types **********************************************************************/ enum coroutine_state { CR_NONE = 0, /* this slot in the table is empty */ CR_INITIALIZING, /* running, before cr_begin() */ CR_RUNNING, /* running, after cr_begin() */ CR_RUNNABLE, /* not running, but runnable */ CR_PAUSED, /* not running, and not runnable */ }; struct coroutine { volatile enum coroutine_state state; volatile bool sig_unpause; jmp_buf env; size_t stack_size; void *stack; }; /* constants ******************************************************************/ const char *coroutine_state_strs[] = { [CR_NONE] = "CR_NONE", [CR_INITIALIZING] = "CR_INITIALIZING", [CR_RUNNING] = "CR_RUNNING", [CR_RUNNABLE] = "CR_RUNNABLE", [CR_PAUSED] = "CR_PAUSED", }; #define STACK_ALIGNMENT ({ __attribute__((aligned)) void fn(void) {}; __alignof__(fn); }) #if CONFIG_COROUTINE_MEASURE_STACK || CONFIG_COROUTINE_PROTECT_STACK /* We just need a pattern that is unlikely to occur naturaly; this is * just a few bytes that I read from /dev/random. */ static const uint8_t stack_pattern[] = {0x1e, 0x15, 0x16, 0x0a, 0xcc, 0x52, 0x7e, 0xb7}; #endif /* global variables ***********************************************************/ static jmp_buf coroutine_add_env; static jmp_buf coroutine_main_env; /* * Invariants (and non-invariants): * * - exactly 0 or 1 coroutines have state CR_INITIALIZING * - exactly 0 or 1 coroutines have state CR_RUNNING * - if coroutine_running is not zero, then * coroutine_table[coroutine_running-1] is the currently-running * coroutine * - the coroutine_running coroutine either has state CR_RUNNING or * CR_INITIALIZNG * - a coroutine having state CR_RUNNING does *not* imply that * coroutine_running points at that coroutine; if that coroutine is * in the middle of coroutine_add(), it coroutine_running points at * the CR_INITIALIZING child coroutine, while leaving the parent * coroutine as CR_RUNNING. */ static struct coroutine coroutine_table[CONFIG_COROUTINE_NUM] = {0}; static cid_t coroutine_running = 0; /* utility functions **********************************************************/ #define errorf(...) fprintf(stderr, "error: " __VA_ARGS__) #define infof(...) printf("info: " __VA_ARGS__) #if CONFIG_COROUTINE_DEBUG # define debugf(...) printf("dbg: " __VA_ARGS__) #else # define debugf(...) #endif #ifdef __GLIBC__ # define assertf(expr, ...) \ ((void) sizeof ((expr) ? 1 : 0), __extension__ ({ \ if (expr) \ ; /* empty */ \ else { \ errorf("assertion: " __VA_ARGS__); \ __assert_fail (#expr, __FILE__, __LINE__, __ASSERT_FUNCTION); \ } \ })) #else # define assertf(expr, ...) assert(expr) #endif /** Return `n` rounded up to the nearest multiple of `d` */ #define round_up(n, d) ( ( ((n)+(d)-1) / (d) ) * (d) ) static inline const char* coroutine_state_str(enum coroutine_state state) { assert(state < (sizeof(coroutine_state_strs)/sizeof(coroutine_state_strs[0]))); return coroutine_state_strs[state]; } static inline void assert_cid(cid_t cid) { assert(cid > 0); assert(cid <= CONFIG_COROUTINE_NUM); #if CONFIG_COROUTINE_PROTECT_STACK assert(coroutine_table[cid-1].stack_size); assert(coroutine_table[cid-1].stack); for (size_t i = 0; i < sizeof(stack_pattern); i++) { size_t j = coroutine_table[cid-1].stack_size - (i+1); assert(((uint8_t*)coroutine_table[cid-1].stack)[i] == stack_pattern[i]); assert(((uint8_t*)coroutine_table[cid-1].stack)[j] == stack_pattern[j%sizeof(stack_pattern)]); } #endif } #define assert_cid_state(cid, opstate) do { \ assert_cid(cid); \ assert(coroutine_table[(cid)-1].state opstate); \ } while (0) /* call_with_stack() **********************************************************/ static void call_with_stack(void *stack, cr_fn_t fn, void *args) { static void *saved_sp = NULL; /* As part of sbc-harness, this only really needs to support * ARM-32, but being able to run it on x86-64 is useful for * debugging. */ #if __x86_64__ #define STACK_GROWS_DOWNWARD 1 asm volatile ("movq %%rsp , %0\n\t" /* saved_sp = sp */ "movq %1 , %%rsp\n\t" /* sp = stack */ "movq %3 , %%rdi\n\t" /* arg0 = args */ "call *%2\n\t" /* fn() */ "movq %0 , %%rsp" /* sp = saved_sp */ : : /* %0 */"m"(saved_sp), /* %1 */"r"(stack), /* %2 */"r"(fn), /* %3 */"r"(args) : "rdi" ); #elif __arm__ #define STACK_GROWS_DOWNWARD 1 /* str/ldr can only work with a "lo" register, which sp is * not, so we use r0 as an intermediate because we're going to * clobber it with args anyway. */ asm volatile ("mov r0, sp\n\t" /* [saved_sp = sp */ "str r0, %0\n\t" /* ] */ "mov sp, %1\n\t" /* [sp = stack] */ "mov r0, %3\n\t" /* [arg0 = args] */ "blx %2\n\t" /* [fn()] */ "ldr r0, %0\n\t" /* [sp = staved_sp */ "mov sp, r0" /* ] */ : : /* %0 */"m"(saved_sp), /* %1 */"r"(stack), /* %2 */"r"(fn), /* %3 */"r"(args) : "r0" ); #else # error unsupported architecture #endif } /* coroutine_add() ************************************************************/ cid_t coroutine_add_with_stack_size(size_t stack_size, cr_fn_t fn, void *args) { static cid_t last_created = 0; cid_t parent = coroutine_running; if (parent) assert_cid_state(parent, == CR_RUNNING); assert(stack_size); assert(fn); debugf("coroutine_add_with_stack_size(%zu, %p, %p)...\n", stack_size, fn, args); cid_t child; { size_t idx_base = last_created; for (size_t idx_shift = 0; idx_shift < CONFIG_COROUTINE_NUM; idx_shift++) { child = ((idx_base + idx_shift) % CONFIG_COROUTINE_NUM) + 1; if (coroutine_table[child-1].state == CR_NONE) goto found; } return 0; found: } debugf("...child=%zu\n", child); last_created = child; coroutine_table[child-1].stack_size = stack_size; coroutine_table[child-1].stack = aligned_alloc(STACK_ALIGNMENT, stack_size); #if CONFIG_COROUTINE_MEASURE_STACK || CONFIG_COROUTINE_PROTECT_STACK for (size_t i = 0; i < stack_size; i++) ((uint8_t*)coroutine_table[child-1].stack)[i] = stack_pattern[i%sizeof(stack_pattern)]; #endif coroutine_running = child; coroutine_table[child-1].state = CR_INITIALIZING; if (!setjmp(coroutine_add_env)) { /* point=a */ void *stack_base = coroutine_table[child-1].stack + (STACK_GROWS_DOWNWARD ? stack_size : 0); #if CONFIG_COROUTINE_PROTECT_STACK # if STACK_GROWS_DOWNWARD stack_base -= round_up(sizeof(stack_pattern), STACK_ALIGNMENT); # else stack_base += round_up(sizeof(stack_pattern), STACK_ALIGNMENT); # endif #endif debugf("...stack =%p\n", coroutine_table[child-1].stack); debugf("...stack_base=%p\n", stack_base); /* run until cr_begin() */ call_with_stack(stack_base, fn, args); assert(false); /* should cr_begin() instead of returning */ } assert_cid_state(child, == CR_RUNNABLE); if (parent) assert_cid_state(parent, == CR_RUNNING); coroutine_running = parent; return child; } cid_t coroutine_add(cr_fn_t fn, void *args) { return coroutine_add_with_stack_size(CONFIG_COROUTINE_DEFAULT_STACK_SIZE, fn, args); } /* coroutine_main() ***********************************************************/ #if CONFIG_COROUTINE_MEASURE_STACK struct stack_stats { size_t cap; size_t max; //size_t cur; }; static void measure_stack(cid_t cid, struct stack_stats *ret) { ret->cap = coroutine_table[cid-1].stack_size - (CONFIG_COROUTINE_PROTECT_STACK ? 2*round_up(sizeof(stack_pattern), STACK_ALIGNMENT) : 0); ret->max = ret->cap; for (;;) { size_t i = STACK_GROWS_DOWNWARD ? (CONFIG_COROUTINE_PROTECT_STACK ? round_up(sizeof(stack_pattern), STACK_ALIGNMENT) : 0) + ret->cap - ret->max : ret->max - 1 - (CONFIG_COROUTINE_PROTECT_STACK ? round_up(sizeof(stack_pattern), STACK_ALIGNMENT) : 0); if (ret->max == 0 || ((uint8_t*)coroutine_table[cid-1].stack)[i] != stack_pattern[i%sizeof(stack_pattern)]) break; ret->max--; } } #endif static inline cid_t next_coroutine() { for (cid_t next = (coroutine_running%CONFIG_COROUTINE_NUM)+1; next != coroutine_running; next = (next%CONFIG_COROUTINE_NUM)+1) { if (coroutine_table[next-1].state == CR_RUNNABLE) return next; } return 0; } void coroutine_main(void) { debugf("coroutine_main()\n"); coroutine_running = 0; for (;;) { cid_t next = next_coroutine(); if (!next) { errorf("no coroutines\n"); return; } if (!setjmp(coroutine_main_env)) { /* point=b */ coroutine_running = next; coroutine_table[coroutine_running-1].state = CR_RUNNING; longjmp(coroutine_table[coroutine_running-1].env, 1); /* jump to point=c */ } assert_cid_state(coroutine_running, == CR_NONE); #if CONFIG_COROUTINE_MEASURE_STACK struct stack_stats sizes; measure_stack(coroutine_running, &sizes); infof("cid=%zu: exited having used %zu B stack space\n", coroutine_running, sizes.max); #endif free(coroutine_table[coroutine_running-1].stack); coroutine_table[coroutine_running-1] = (struct coroutine){0}; } } /* cr_*() *********************************************************************/ void cr_begin(void) { assert_cid_state(coroutine_running, == CR_INITIALIZING); coroutine_table[coroutine_running-1].state = CR_RUNNABLE; if (!setjmp(coroutine_table[coroutine_running-1].env)) /* point=c1 */ longjmp(coroutine_add_env, 1); /* jump to point=a */ } static inline void _cr_transition(enum coroutine_state state) { debugf("cid=%zu: transition %s->%s\n", coroutine_running, coroutine_state_str(coroutine_table[coroutine_running-1].state), coroutine_state_str(state)); coroutine_table[coroutine_running-1].state = state; cid_t next; for (;;) { next = next_coroutine(); if (next) break; if (coroutine_table[coroutine_running-1].state == CR_RUNNABLE) { debugf("cid=%zu: no other runnable coroutines, not yielding\n", coroutine_running); coroutine_table[coroutine_running-1].state = CR_RUNNING; return; } else { debugf("cid=%zu: no runnable coroutines, sleeping\n", coroutine_running); pause(); } } if (!setjmp(coroutine_table[coroutine_running-1].env)) { /* point=c2 */ coroutine_running = next; coroutine_table[coroutine_running-1].state = CR_RUNNING; longjmp(coroutine_table[coroutine_running-1].env, 1); /* jump to point=b */ } } void cr_yield(void) { assert_cid_state(coroutine_running, == CR_RUNNING); _cr_transition(CR_RUNNABLE); } void cr_pause_and_yield(void) { assert_cid_state(coroutine_running, == CR_RUNNING); if (coroutine_table[coroutine_running-1].sig_unpause) _cr_transition(CR_RUNNABLE); else _cr_transition(CR_PAUSED); coroutine_table[coroutine_running-1].sig_unpause = false; } void cr_exit(void) { assert_cid_state(coroutine_running, == CR_RUNNING); debugf("cid=%zu: exit\n", coroutine_running); coroutine_table[coroutine_running-1].state = CR_NONE; longjmp(coroutine_main_env, 1); /* jump to point=b */ } void cr_unpause(cid_t cid) { assert_cid_state(cid, == CR_PAUSED); debugf("cr_unpause(%zu)\n", cid); coroutine_table[cid-1].sig_unpause = false; coroutine_table[cid-1].state = CR_RUNNABLE; } void cr_unpause_from_sighandler(cid_t cid) { assert_cid(cid); debugf("cr_unpause_from_sighandler(%zu)\n", cid); switch (coroutine_table[cid-1].state) { case CR_RUNNING: debugf("... raced, deferring unpause\n"); coroutine_table[cid-1].sig_unpause = true; break; case CR_PAUSED: debugf("... actual unpause\n"); coroutine_table[cid-1].sig_unpause = false; coroutine_table[cid-1].state = CR_RUNNABLE; break; default: assertf(false, "cid=%zu state=%s\n", cid, coroutine_state_str(coroutine_table[cid-1].state)); } } cid_t cr_getcid(void) { return coroutine_running; }