diff options
Diffstat (limited to 'libmisc')
-rw-r--r-- | libmisc/assert.c | 4 | ||||
-rw-r--r-- | libmisc/fmt.c | 34 | ||||
-rw-r--r-- | libmisc/include/libmisc/hash.h | 4 | ||||
-rw-r--r-- | libmisc/include/libmisc/log.h | 4 | ||||
-rw-r--r-- | libmisc/include/libmisc/utf8.h | 54 | ||||
-rw-r--r-- | libmisc/intercept.c | 2 | ||||
-rw-r--r-- | libmisc/log.c | 2 | ||||
-rw-r--r-- | libmisc/map.c | 2 | ||||
-rw-r--r-- | libmisc/tests/test_rand.c | 2 |
9 files changed, 77 insertions, 31 deletions
diff --git a/libmisc/assert.c b/libmisc/assert.c index cb3a270..410ec21 100644 --- a/libmisc/assert.c +++ b/libmisc/assert.c @@ -11,8 +11,8 @@ #ifndef NDEBUG void __assert_msg_fail(const char *expr, - const char *file, unsigned int line, const char *func, - const char *msg) { + const char *file, unsigned int line, const char *func, + const char *msg) { static bool in_fail = false; if (!in_fail) { in_fail = true; diff --git a/libmisc/fmt.c b/libmisc/fmt.c index 33788b6..6cf1d8d 100644 --- a/libmisc/fmt.c +++ b/libmisc/fmt.c @@ -6,6 +6,8 @@ #include <string.h> /* for strnlen() */ +#include <libmisc/utf8.h> + #include <libmisc/fmt.h> static const char *const hexdig = "0123456789ABCDEF"; @@ -67,19 +69,18 @@ void fmt_print_qmem(lo_interface fmt_dest w, const void *_str, size_t size) { fmt_print_byte(w, '"'); for (size_t pos = 0; pos < size;) { uint32_t ch; - uint8_t chlen; - if ((str[pos] & 0b10000000) == 0b00000000) { ch = str[pos] & 0b01111111; chlen = 1; } - else if ((str[pos] & 0b11100000) == 0b11000000) { ch = str[pos] & 0b00011111; chlen = 2; } - else if ((str[pos] & 0b11110000) == 0b11100000) { ch = str[pos] & 0b00001111; chlen = 3; } - else if ((str[pos] & 0b11111000) == 0b11110000) { ch = str[pos] & 0b00000111; chlen = 4; } - else goto invalid_utf8; - if ((ch == 0 && chlen != 1) || pos + chlen > size) goto invalid_utf8; - for (uint8_t i = 1; i < chlen; i++) { - if ((str[pos+i] & 0b11000000) != 0b10000000) goto invalid_utf8; - ch = (ch << 6) | (str[pos+i] & 0b00111111); + uint8_t chlen; + utf8_decode_codepoint(&str[pos], size-pos, &ch, &chlen); + if (!chlen) { + /* invalid UTF-8 */ + /* \xAB */ + fmt_print_byte(w, '\\'); + fmt_print_byte(w, 'x'); + fmt_print_byte(w, hexdig[(str[pos] >> 4) & 0xF]); + fmt_print_byte(w, hexdig[(str[pos] >> 0) & 0xF]); + pos++; + continue; } - if (ch > 0x10FFFF) goto invalid_utf8; - if (ch == '\0' || ch == '\b' || ch == '\f' || @@ -132,15 +133,6 @@ void fmt_print_qmem(lo_interface fmt_dest w, const void *_str, size_t size) { fmt_print_byte(w, hexdig[(ch >> 0) & 0xF]); } pos += chlen; - continue; - - invalid_utf8: - /* \xAB */ - fmt_print_byte(w, '\\'); - fmt_print_byte(w, 'x'); - fmt_print_byte(w, hexdig[(str[pos] >> 4) & 0xF]); - fmt_print_byte(w, hexdig[(str[pos] >> 0) & 0xF]); - pos++; } fmt_print_byte(w, '"'); } diff --git a/libmisc/include/libmisc/hash.h b/libmisc/include/libmisc/hash.h index 91e6b10..58a895f 100644 --- a/libmisc/include/libmisc/hash.h +++ b/libmisc/include/libmisc/hash.h @@ -1,14 +1,14 @@ /* libmisc/hash.h - General-purpose hash utilities * - * Copyright (C) 2024 Luke T. Shumaker <lukeshu@lukeshu.com> + * Copyright (C) 2024-2025 Luke T. Shumaker <lukeshu@lukeshu.com> * SPDX-License-Identifier: AGPL-3.0-or-later */ #ifndef _LIBMISC_HASH_H_ #define _LIBMISC_HASH_H_ -#include <stdint.h> /* for uint{n}_t */ #include <stddef.h> /* for size_t */ +#include <stdint.h> /* for uint{n}_t */ /* djb2 hash */ typedef uint32_t hash_t; diff --git a/libmisc/include/libmisc/log.h b/libmisc/include/libmisc/log.h index e6dfb52..c40b642 100644 --- a/libmisc/include/libmisc/log.h +++ b/libmisc/include/libmisc/log.h @@ -9,9 +9,9 @@ #include <stdint.h> /* for uint8_t */ -#include <libmisc/macro.h> -#include <libmisc/fmt.h> #include <libmisc/_intercept.h> +#include <libmisc/fmt.h> +#include <libmisc/macro.h> #ifdef NDEBUG #define _LOG_NDEBUG 1 diff --git a/libmisc/include/libmisc/utf8.h b/libmisc/include/libmisc/utf8.h new file mode 100644 index 0000000..b5e1b0b --- /dev/null +++ b/libmisc/include/libmisc/utf8.h @@ -0,0 +1,54 @@ +/* libmisc/utf8.h - UTF-8 routines + * + * Copyright (C) 2024-2025 Luke T. Shumaker <lukeshu@lukeshu.com> + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#ifndef _LIBMISC_UTF8_H_ +#define _LIBMISC_UTF8_H_ + +#include <stddef.h> /* for size_t */ +#include <stdint.h> /* for uint{n}_t */ + +/** + * Decode the codepoint starting at `str` and consuming at most `len` + * bytes. Invalid UTF-8 is indicated with chlen=0. For valid UTF-8, + * chlen is always in the range [1, 4]. + */ +static inline void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) { + uint32_t ch; + uint8_t chlen; + if ((str[0] & 0b10000000) == 0b00000000) { ch = str[0] & 0b01111111; chlen = 1; } + else if ((str[0] & 0b11100000) == 0b11000000) { ch = str[0] & 0b00011111; chlen = 2; } + else if ((str[0] & 0b11110000) == 0b11100000) { ch = str[0] & 0b00001111; chlen = 3; } + else if ((str[0] & 0b11111000) == 0b11110000) { ch = str[0] & 0b00000111; chlen = 4; } + else goto invalid; + if ((ch == 0 && chlen != 1) || chlen > len) goto invalid; + for (uint8_t i = 1; i < chlen; i++) { + if ((str[i] & 0b11000000) != 0b10000000) goto invalid; + ch = (ch << 6) | (str[i] & 0b00111111); + } + if (ch > 0x10FFFF) goto invalid; + *ret_ch = ch; + *ret_chlen = chlen; + return; + invalid: + *ret_chlen = 0; +} + +static inline bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) { + for (size_t pos = 0; pos < len;) { + uint32_t ch; + uint8_t chlen; + utf8_decode_codepoint(&str[pos], len-pos, &ch, &chlen); + if (chlen == 0 || (forbid_nul && ch == 0)) + return false; + pos += chlen; + } + return true; +} + +#define utf8_is_valid(str, len) _utf8_is_valid(str, len, false) +#define utf8_is_valid_without_nul(str, len) _utf8_is_valid(str, len, true) + +#endif /* _LIBMISC_UTF8_H_ */ diff --git a/libmisc/intercept.c b/libmisc/intercept.c index 30870bf..d0e3602 100644 --- a/libmisc/intercept.c +++ b/libmisc/intercept.c @@ -11,7 +11,7 @@ [[gnu::weak]] void __lm_putchar(unsigned char c) { - (void) putchar(c); + (void)putchar(c); } [[gnu::weak]] diff --git a/libmisc/log.c b/libmisc/log.c index da4c92e..7e917c6 100644 --- a/libmisc/log.c +++ b/libmisc/log.c @@ -8,8 +8,8 @@ #include <libmisc/assert.h> /* for static_assert() */ -#include <libmisc/log.h> #include <libmisc/_intercept.h> +#include <libmisc/log.h> struct log_stdout {}; LO_IMPLEMENTATION_H(fmt_dest, struct log_stdout, log_stdout); diff --git a/libmisc/map.c b/libmisc/map.c index cc34c16..d1b2a57 100644 --- a/libmisc/map.c +++ b/libmisc/map.c @@ -7,9 +7,9 @@ #include <stdlib.h> #include <string.h> -#include <libmisc/hash.h> #include <libmisc/alloc.h> #include <libmisc/assert.h> +#include <libmisc/hash.h> #include <libmisc/map.h> #define FLAG_ITER (UINT8_C(1)<<0) diff --git a/libmisc/tests/test_rand.c b/libmisc/tests/test_rand.c index ecb1c49..1cfbd65 100644 --- a/libmisc/tests/test_rand.c +++ b/libmisc/tests/test_rand.c @@ -6,8 +6,8 @@ #include <setjmp.h> -#include <libmisc/rand.h> #include <libmisc/_intercept.h> +#include <libmisc/rand.h> #include "test.h" |