diff options
author | Luke T. Shumaker <lukeshu@lukeshu.com> | 2025-05-17 03:20:11 -0600 |
---|---|---|
committer | Luke T. Shumaker <lukeshu@lukeshu.com> | 2025-05-17 12:57:08 -0600 |
commit | ab9103440ade87509a1a3bd1eaad0b5396a89d1e (patch) | |
tree | 423673909d0af66d4ef5e260ce58b4b554bf2024 | |
parent | d505a998aafe5af8b02a2b2c2acf7e708812c3fc (diff) |
Pull UTF-8 decoding into libmisc/utf8.c
-rw-r--r-- | lib9p/core_gen/c.py | 2 | ||||
-rw-r--r-- | lib9p/core_gen/c_fmt_print.py | 2 | ||||
-rw-r--r-- | lib9p/core_gen/c_validate.py | 2 | ||||
-rw-r--r-- | lib9p/core_generated.c | 14 | ||||
-rw-r--r-- | lib9p/core_utf8.h | 36 | ||||
-rw-r--r-- | libmisc/fmt.c | 34 | ||||
-rw-r--r-- | libmisc/include/libmisc/utf8.h | 54 |
7 files changed, 77 insertions, 67 deletions
diff --git a/lib9p/core_gen/c.py b/lib9p/core_gen/c.py index 60ceb70..5776035 100644 --- a/lib9p/core_gen/c.py +++ b/lib9p/core_gen/c.py @@ -29,11 +29,11 @@ def gen_c(versions: set[str], typs: list[idl.UserType]) -> str: #include <libmisc/assert.h> #include <libmisc/endian.h> +#include <libmisc/utf8.h> #include <lib9p/core.h> #include "core_tables.h" -#include "core_utf8.h" """ # utilities ################################################################ ret += """ diff --git a/lib9p/core_gen/c_fmt_print.py b/lib9p/core_gen/c_fmt_print.py index eaacddb..7a0a9d3 100644 --- a/lib9p/core_gen/c_fmt_print.py +++ b/lib9p/core_gen/c_fmt_print.py @@ -112,7 +112,7 @@ def gen_c_fmt_print(versions: set[str], typs: list[idl.UserType]) -> str: cnt_str = f"self->{member.cnt.membname}" cnt_typ = c9util.typename(member.cnt.typ) if member.typ.static_size == 1: # SPECIAL (data) - ret += f"\tif (is_valid_utf8_without_nul((uint8_t *)self->{member.membname}, (size_t){cnt_str})) {{\n" + ret += f"\tif (utf8_is_valid_without_nul((uint8_t *)self->{member.membname}, (size_t){cnt_str})) {{\n" ret += f'\t\tfmt_print_str(w, " {member.membname}=");\n' ret += f"\t\tfmt_print_qmem(w, self->{member.membname}, {cnt_str} < 50 ? {cnt_str} : 50);\n" ret += f"\t\tif ({cnt_str} > 50)\n" diff --git a/lib9p/core_gen/c_validate.py b/lib9p/core_gen/c_validate.py index 9c55d8d..e7a4017 100644 --- a/lib9p/core_gen/c_validate.py +++ b/lib9p/core_gen/c_validate.py @@ -66,7 +66,7 @@ def gen_c_validate(versions: set[str], typs: list[idl.UserType]) -> str: "\t{\n" "\t\tsize_t len = n;\n" "\t\tVALIDATE_NET_BYTES(len);\n" - "\t\tif (!is_valid_utf8_without_nul(&net_bytes[net_offset-len], len))\n" + "\t\tif (!utf8_is_valid_without_nul(&net_bytes[net_offset-len], len))\n" f'\t\t\treturn lib9p_error(ctx, {c9util.IDENT("ERRNO_L_EBADMSG")}, "message contains invalid UTF-8");\n' "\t}\n" ) diff --git a/lib9p/core_generated.c b/lib9p/core_generated.c index 81ace7d..e19f6e6 100644 --- a/lib9p/core_generated.c +++ b/lib9p/core_generated.c @@ -6,11 +6,11 @@ #include <libmisc/assert.h> #include <libmisc/endian.h> +#include <libmisc/utf8.h> #include <lib9p/core.h> #include "core_tables.h" -#include "core_utf8.h" /* utilities ******************************************************************/ #if CONFIG_9P_ENABLE_9P2000 @@ -234,7 +234,7 @@ static const lib9p_lock_flags_t lock_flags_masks[LIB9P_VER_NUM] = { { \ size_t len = n; \ VALIDATE_NET_BYTES(len); \ - if (!is_valid_utf8_without_nul(&net_bytes[net_offset-len], len)) \ + if (!utf8_is_valid_without_nul(&net_bytes[net_offset-len], len)) \ return lib9p_error(ctx, LIB9P_ERRNO_L_EBADMSG, "message contains invalid UTF-8"); \ } #define RESERVE_HOST_BYTES(n) \ @@ -6507,7 +6507,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val fmt_print_tag(w, ctx, &self->tag); fmt_print_str(w, " count="); fmt_print_base10(w, self->count); - if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) { + if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) { fmt_print_str(w, " data="); fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50); if (self->count > 50) @@ -6528,7 +6528,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val fmt_print_base10(w, self->offset); fmt_print_str(w, " count="); fmt_print_base10(w, self->count); - if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) { + if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) { fmt_print_str(w, " data="); fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50); if (self->count > 50) @@ -6763,7 +6763,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val fmt_print_tag(w, ctx, &self->tag); fmt_print_str(w, " count="); fmt_print_base10(w, self->count); - if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) { + if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) { fmt_print_str(w, " data="); fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50); if (self->count > 50) @@ -6846,7 +6846,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val fmt_print_tag(w, ctx, &self->tag); fmt_print_str(w, " count="); fmt_print_base10(w, self->count); - if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) { + if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) { fmt_print_str(w, " data="); fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50); if (self->count > 50) @@ -7445,7 +7445,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val fmt_print_str(w, " ]"); fmt_print_str(w, " count="); fmt_print_base10(w, self->count); - if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) { + if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) { fmt_print_str(w, " data="); fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50); if (self->count > 50) diff --git a/lib9p/core_utf8.h b/lib9p/core_utf8.h deleted file mode 100644 index 2c451e0..0000000 --- a/lib9p/core_utf8.h +++ /dev/null @@ -1,36 +0,0 @@ -/* lib9p/core_utf8.h - Internal UTF-8 validation - * - * Copyright (C) 2024-2025 Luke T. Shumaker <lukeshu@lukeshu.com> - * SPDX-License-Identifier: AGPL-3.0-or-later - */ - -#ifndef _LIB9P_CORE_UTF8_H_ -#define _LIB9P_CORE_UTF8_H_ - -#include <stddef.h> /* for size_t */ -#include <stdint.h> /* for uint{n}_t */ - -static inline bool _is_valid_utf8(uint8_t *str, size_t len, bool forbid_nul) { - uint32_t ch; - uint8_t chlen; - for (size_t pos = 0; pos < len;) { - if ((str[pos] & 0b10000000) == 0b00000000) { ch = str[pos] & 0b01111111; chlen = 1; } - else if ((str[pos] & 0b11100000) == 0b11000000) { ch = str[pos] & 0b00011111; chlen = 2; } - else if ((str[pos] & 0b11110000) == 0b11100000) { ch = str[pos] & 0b00001111; chlen = 3; } - else if ((str[pos] & 0b11111000) == 0b11110000) { ch = str[pos] & 0b00000111; chlen = 4; } - else return false; - if ((ch == 0 && (chlen != 1 || forbid_nul)) || pos + chlen > len) return false; - for (uint8_t i = 1; i < chlen; i++) { - if ((str[pos+i] & 0b11000000) != 0b10000000) return false; - ch = (ch << 6) | (str[pos+i] & 0b00111111); - } - if (ch > 0x10FFFF) return false; - pos += chlen; - } - return true; -} - -#define is_valid_utf8(str, len) _is_valid_utf8(str, len, false) -#define is_valid_utf8_without_nul(str, len) _is_valid_utf8(str, len, true) - -#endif /* _LIB9P_CORE_UTF8_H_ */ diff --git a/libmisc/fmt.c b/libmisc/fmt.c index 33788b6..6cf1d8d 100644 --- a/libmisc/fmt.c +++ b/libmisc/fmt.c @@ -6,6 +6,8 @@ #include <string.h> /* for strnlen() */ +#include <libmisc/utf8.h> + #include <libmisc/fmt.h> static const char *const hexdig = "0123456789ABCDEF"; @@ -67,19 +69,18 @@ void fmt_print_qmem(lo_interface fmt_dest w, const void *_str, size_t size) { fmt_print_byte(w, '"'); for (size_t pos = 0; pos < size;) { uint32_t ch; - uint8_t chlen; - if ((str[pos] & 0b10000000) == 0b00000000) { ch = str[pos] & 0b01111111; chlen = 1; } - else if ((str[pos] & 0b11100000) == 0b11000000) { ch = str[pos] & 0b00011111; chlen = 2; } - else if ((str[pos] & 0b11110000) == 0b11100000) { ch = str[pos] & 0b00001111; chlen = 3; } - else if ((str[pos] & 0b11111000) == 0b11110000) { ch = str[pos] & 0b00000111; chlen = 4; } - else goto invalid_utf8; - if ((ch == 0 && chlen != 1) || pos + chlen > size) goto invalid_utf8; - for (uint8_t i = 1; i < chlen; i++) { - if ((str[pos+i] & 0b11000000) != 0b10000000) goto invalid_utf8; - ch = (ch << 6) | (str[pos+i] & 0b00111111); + uint8_t chlen; + utf8_decode_codepoint(&str[pos], size-pos, &ch, &chlen); + if (!chlen) { + /* invalid UTF-8 */ + /* \xAB */ + fmt_print_byte(w, '\\'); + fmt_print_byte(w, 'x'); + fmt_print_byte(w, hexdig[(str[pos] >> 4) & 0xF]); + fmt_print_byte(w, hexdig[(str[pos] >> 0) & 0xF]); + pos++; + continue; } - if (ch > 0x10FFFF) goto invalid_utf8; - if (ch == '\0' || ch == '\b' || ch == '\f' || @@ -132,15 +133,6 @@ void fmt_print_qmem(lo_interface fmt_dest w, const void *_str, size_t size) { fmt_print_byte(w, hexdig[(ch >> 0) & 0xF]); } pos += chlen; - continue; - - invalid_utf8: - /* \xAB */ - fmt_print_byte(w, '\\'); - fmt_print_byte(w, 'x'); - fmt_print_byte(w, hexdig[(str[pos] >> 4) & 0xF]); - fmt_print_byte(w, hexdig[(str[pos] >> 0) & 0xF]); - pos++; } fmt_print_byte(w, '"'); } diff --git a/libmisc/include/libmisc/utf8.h b/libmisc/include/libmisc/utf8.h new file mode 100644 index 0000000..b5e1b0b --- /dev/null +++ b/libmisc/include/libmisc/utf8.h @@ -0,0 +1,54 @@ +/* libmisc/utf8.h - UTF-8 routines + * + * Copyright (C) 2024-2025 Luke T. Shumaker <lukeshu@lukeshu.com> + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#ifndef _LIBMISC_UTF8_H_ +#define _LIBMISC_UTF8_H_ + +#include <stddef.h> /* for size_t */ +#include <stdint.h> /* for uint{n}_t */ + +/** + * Decode the codepoint starting at `str` and consuming at most `len` + * bytes. Invalid UTF-8 is indicated with chlen=0. For valid UTF-8, + * chlen is always in the range [1, 4]. + */ +static inline void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) { + uint32_t ch; + uint8_t chlen; + if ((str[0] & 0b10000000) == 0b00000000) { ch = str[0] & 0b01111111; chlen = 1; } + else if ((str[0] & 0b11100000) == 0b11000000) { ch = str[0] & 0b00011111; chlen = 2; } + else if ((str[0] & 0b11110000) == 0b11100000) { ch = str[0] & 0b00001111; chlen = 3; } + else if ((str[0] & 0b11111000) == 0b11110000) { ch = str[0] & 0b00000111; chlen = 4; } + else goto invalid; + if ((ch == 0 && chlen != 1) || chlen > len) goto invalid; + for (uint8_t i = 1; i < chlen; i++) { + if ((str[i] & 0b11000000) != 0b10000000) goto invalid; + ch = (ch << 6) | (str[i] & 0b00111111); + } + if (ch > 0x10FFFF) goto invalid; + *ret_ch = ch; + *ret_chlen = chlen; + return; + invalid: + *ret_chlen = 0; +} + +static inline bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) { + for (size_t pos = 0; pos < len;) { + uint32_t ch; + uint8_t chlen; + utf8_decode_codepoint(&str[pos], len-pos, &ch, &chlen); + if (chlen == 0 || (forbid_nul && ch == 0)) + return false; + pos += chlen; + } + return true; +} + +#define utf8_is_valid(str, len) _utf8_is_valid(str, len, false) +#define utf8_is_valid_without_nul(str, len) _utf8_is_valid(str, len, true) + +#endif /* _LIBMISC_UTF8_H_ */ |