/* libmisc/utf8.c - UTF-8 routines
 *
 * Copyright (C) 2024-2025  Luke T. Shumaker
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

/* NOTE(review): the header name of the original `#include` directive
 * was lost from this copy of the file; presumably it was this module's
 * own header (<libmisc/utf8.h>) -- TODO confirm and restore it.  The
 * standard headers below provide every type this file uses.  */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/**
 * Decode the single UTF-8 encoded codepoint at the start of `str`.
 *
 * @param str       bytes to decode; not read at all if `len` is 0
 * @param len       number of bytes available at `str`
 * @param ret_ch    on success, receives the decoded codepoint; left
 *                  untouched on failure
 * @param ret_chlen on success, receives the number of bytes consumed
 *                  (1-4); on failure, receives 0
 *
 * Decoding fails if the input is empty, the lead byte is not a valid
 * lead byte, the sequence is truncated by `len`, a continuation byte
 * is malformed, the encoding is overlong (not the shortest form for
 * the value), or the value is a UTF-16 surrogate (U+D800..U+DFFF) or
 * is above U+10FFFF.
 */
void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) {
	uint32_t ch;
	uint32_t min_ch; /* smallest codepoint that needs `chlen` bytes */
	uint8_t chlen;

	/* Nothing to look at; don't touch str[0].  */
	if (len == 0) {
		goto invalid;
	}

	/* The lead byte determines the sequence length and carries the
	 * most-significant payload bits.  */
	if ((str[0] & 0b10000000) == 0b00000000) {
		ch = str[0] & 0b01111111;
		chlen = 1;
		min_ch = 0x0;
	} else if ((str[0] & 0b11100000) == 0b11000000) {
		ch = str[0] & 0b00011111;
		chlen = 2;
		min_ch = 0x80;
	} else if ((str[0] & 0b11110000) == 0b11100000) {
		ch = str[0] & 0b00001111;
		chlen = 3;
		min_ch = 0x800;
	} else if ((str[0] & 0b11111000) == 0b11110000) {
		ch = str[0] & 0b00000111;
		chlen = 4;
		min_ch = 0x10000;
	} else {
		/* A stray continuation byte (10xxxxxx) or a byte that
		 * never appears in UTF-8 (11111xxx).  */
		goto invalid;
	}

	/* Truncated sequence.  */
	if (chlen > len) {
		goto invalid;
	}

	/* Each continuation byte is 10xxxxxx and contributes 6 bits.  */
	for (uint8_t i = 1; i < chlen; i++) {
		if ((str[i] & 0b11000000) != 0b10000000) {
			goto invalid;
		}
		ch = (ch << 6) | (str[i] & 0b00111111);
	}

	/* Overlong encoding.  This must be checked against the fully
	 * decoded value: checking only the lead byte's payload bits
	 * would wrongly reject valid sequences led by 0xE0 or 0xF0
	 * (U+0800..U+0FFF, U+10000..U+3FFFF) and wrongly accept the
	 * overlong sequences led by 0xC1.  */
	if (ch < min_ch) {
		goto invalid;
	}
	/* UTF-16 surrogates are not valid in UTF-8.  */
	if (ch >= 0xD800 && ch <= 0xDFFF) {
		goto invalid;
	}
	/* Beyond the end of the Unicode codespace.  */
	if (ch > 0x10FFFF) {
		goto invalid;
	}

	*ret_ch = ch;
	*ret_chlen = chlen;
	return;

 invalid:
	*ret_chlen = 0;
}

/**
 * Check whether the `len` bytes at `str` are entirely made up of
 * sequences that utf8_decode_codepoint() accepts.
 *
 * @param str        bytes to validate
 * @param len        number of bytes at `str`
 * @param forbid_nul if true, also reject input containing U+0000
 * @return true if every codepoint decodes (and, if `forbid_nul`, none
 *         of them is U+0000); an empty input is valid
 */
bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) {
	for (size_t pos = 0; pos < len;) {
		uint32_t ch;
		uint8_t chlen;
		utf8_decode_codepoint(&str[pos], len-pos, &ch, &chlen);
		/* `ch` is only meaningful when chlen != 0; the
		 * short-circuit keeps it from being read otherwise.  */
		if (chlen == 0 || (forbid_nul && ch == 0)) {
			return false;
		}
		pos += chlen;
	}
	return true;
}