/* libmisc/utf8.c - UTF-8 routines
 *
 * Copyright (C) 2024-2025  Luke T. Shumaker
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* NOTE(review): the header name on the original `#include` line was
 * lost; presumably it is this file's own header, <libmisc/utf8.h> --
 * confirm.  The include is guarded so that this file still builds on
 * its own when that header is not on the include path.  */
#if defined(__has_include)
#  if __has_include(<libmisc/utf8.h>)
#    include <libmisc/utf8.h>
#  endif
#endif

/**
 * Decode the single UTF-8 encoded code point at the start of `str`.
 *
 * A sequence is rejected if it is truncated (needs more than `len`
 * bytes), starts with a continuation byte or an impossible lead byte,
 * contains a malformed continuation byte, is an overlong encoding,
 * encodes a UTF-16 surrogate (U+D800..U+DFFF), or encodes a value
 * above U+10FFFF.
 *
 * @param str        bytes to decode; only read when `len` > 0
 * @param len        number of bytes available at `str`
 * @param ret_ch     on success, receives the decoded code point; left
 *                   untouched on failure
 * @param ret_chlen  receives the number of bytes consumed (1..4), or 0
 *                   if `str` does not start with a valid sequence
 */
void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) {
	uint32_t ch;
	uint32_t min_ch; /* smallest value that legitimately needs `chlen` bytes */
	uint8_t  chlen;

	/* Nothing to look at; reading str[0] would be out of bounds.  */
	if (len == 0) {
		goto invalid;
	}

	/* Classify the lead byte: it fixes the sequence length and
	 * carries the most-significant payload bits.  */
	if ((str[0] & 0x80) == 0x00) {        /* 0xxxxxxx */
		ch     = str[0] & 0x7F;
		chlen  = 1;
		min_ch = 0x0;
	} else if ((str[0] & 0xE0) == 0xC0) { /* 110xxxxx */
		ch     = str[0] & 0x1F;
		chlen  = 2;
		min_ch = 0x80;
	} else if ((str[0] & 0xF0) == 0xE0) { /* 1110xxxx */
		ch     = str[0] & 0x0F;
		chlen  = 3;
		min_ch = 0x800;
	} else if ((str[0] & 0xF8) == 0xF0) { /* 11110xxx */
		ch     = str[0] & 0x07;
		chlen  = 4;
		min_ch = 0x10000;
	} else {
		/* Stray continuation byte (10xxxxxx) or 11111xxx.  */
		goto invalid;
	}

	/* Truncated sequence.  */
	if (chlen > len) {
		goto invalid;
	}

	/* Each continuation byte must be 10xxxxxx and adds 6 payload bits.  */
	for (uint8_t i = 1; i < chlen; i++) {
		if ((str[i] & 0xC0) != 0x80) {
			goto invalid;
		}
		ch = (ch << 6) | (str[i] & 0x3F);
	}

	/* Overlong encodings can only be detected once the whole value
	 * is known: a lead byte with all-zero payload bits (0xE0, 0xF0)
	 * is perfectly valid for e.g. U+0800 or U+1F600, while a lead
	 * byte with non-zero payload bits (0xC1) can still be overlong.  */
	if (ch < min_ch) {
		goto invalid;
	}
	/* UTF-16 surrogates are not Unicode scalar values.  */
	if (ch >= 0xD800 && ch <= 0xDFFF) {
		goto invalid;
	}
	/* Beyond the end of the Unicode code space.  */
	if (ch > 0x10FFFF) {
		goto invalid;
	}

	*ret_ch    = ch;
	*ret_chlen = chlen;
	return;

 invalid:
	*ret_chlen = 0;
}

/**
 * Report whether the `len` bytes at `str` consist entirely of valid
 * UTF-8 sequences, as judged by utf8_decode_codepoint().
 *
 * @param str         bytes to validate
 * @param len         number of bytes at `str`; 0 is trivially valid
 * @param forbid_nul  if true, a U+0000 code point also makes the
 *                    input invalid
 * @return true if every byte belongs to a valid sequence (and, when
 *         `forbid_nul` is set, no code point is U+0000)
 */
bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) {
	for (size_t pos = 0; pos < len;) {
		uint32_t ch    = 0;
		uint8_t  chlen = 0;

		utf8_decode_codepoint(&str[pos], len - pos, &ch, &chlen);
		/* chlen == 0 signals a decode failure; `ch` is only
		 * meaningful when chlen != 0.  */
		if (chlen == 0 || (forbid_nul && ch == 0)) {
			return false;
		}
		pos += chlen;
	}
	return true;
}