/* libmisc/utf8.c - UTF-8 routines
 *
 * Copyright (C) 2024-2025 Luke T. Shumaker
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

/* NOTE(review): the operand of the original #include directive is
 * missing from this copy of the file; it was presumably this
 * module's own public header.  Restore it here if so -- TODO confirm.
 * The standard headers below cover every type and macro that the
 * code in this file uses directly.  */
#include <stdbool.h> /* for bool */
#include <stddef.h>  /* for size_t */
#include <stdint.h>  /* for uint{n}_t, UINT32_C() */

/**
 * Decode the single UTF-8 encoded codepoint at the start of `str`.
 *
 * @param str       bytes to decode; at most `len` bytes are read
 * @param len       number of bytes available at `str`
 * @param ret_ch    on success, receives the decoded codepoint; left
 *                  untouched on failure
 * @param ret_chlen on success, receives the number of bytes consumed
 *                  (1-4); set to 0 on failure
 *
 * Decoding fails if the buffer is empty, the lead byte is not a valid
 * UTF-8 lead byte, the sequence is truncated by `len`, a continuation
 * byte is malformed, the encoding is overlong, the value is above
 * U+10FFFF, or the value is a UTF-16 surrogate (U+D800..U+DFFF).
 */
void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) {
	uint32_t ch;
	uint8_t  chlen;
	uint32_t chmin; /* smallest value that genuinely needs `chlen` bytes */

	/* With no bytes available there is no lead byte to inspect;
	 * str[0] must not be read.  */
	if (len == 0) {
		goto invalid;
	}

	/* Classify the lead byte: it determines the sequence length
	 * and contributes the high-order payload bits.  */
	if ((str[0] & 0b10000000) == 0b00000000) {
		/* 0xxxxxxx: bits=7+(0*6)= 7 */
		ch    = str[0] & 0b01111111;
		chlen = 1;
		chmin = 0;
	} else if ((str[0] & 0b11100000) == 0b11000000) {
		/* 110xxxxx: bits=5+(1*6)=11 */
		ch    = str[0] & 0b00011111;
		chlen = 2;
		chmin = UINT32_C(1) << 7;
	} else if ((str[0] & 0b11110000) == 0b11100000) {
		/* 1110xxxx: bits=4+(2*6)=16 */
		ch    = str[0] & 0b00001111;
		chlen = 3;
		chmin = UINT32_C(1) << 11;
	} else if ((str[0] & 0b11111000) == 0b11110000) {
		/* 11110xxx: bits=3+(3*6)=21 */
		ch    = str[0] & 0b00000111;
		chlen = 4;
		chmin = UINT32_C(1) << 16;
	} else {
		/* Stray continuation byte, or a 5+ byte lead. */
		goto invalid;
	}

	/* The sequence must fit entirely inside the buffer. */
	if (chlen > len) {
		goto invalid;
	}

	/* Each continuation byte must look like 10xxxxxx and
	 * contributes 6 more payload bits.  */
	for (uint8_t i = 1; i < chlen; i++) {
		if ((str[i] & 0b11000000) != 0b10000000) {
			goto invalid;
		}
		ch = (ch << 6) | (str[i] & 0b00111111);
	}

	/* Reject out-of-range values and overlong encodings. */
	if (ch > 0x10FFFF || ch < chmin) {
		goto invalid;
	}
	/* Surrogates are reserved for UTF-16 and are not valid in
	 * UTF-8 (RFC 3629 section 3).  */
	if (ch >= 0xD800 && ch <= 0xDFFF) {
		goto invalid;
	}

	*ret_ch    = ch;
	*ret_chlen = chlen;
	return;

 invalid:
	*ret_chlen = 0;
}

/**
 * Check whether the `len` bytes at `str` are entirely made up of
 * codepoints that utf8_decode_codepoint() accepts.
 *
 * @param str        bytes to validate
 * @param len        number of bytes at `str`
 * @param forbid_nul if true, an encoded U+0000 makes the input invalid
 * @return true if every byte belongs to an accepted codepoint (an
 *         empty buffer is valid), false otherwise
 */
bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) {
	for (size_t pos = 0; pos < len;) {
		uint32_t ch;
		uint8_t  chlen;

		utf8_decode_codepoint(&str[pos], len - pos, &ch, &chlen);
		/* `ch` is only meaningful when chlen != 0; the
		 * short-circuit keeps it from being read otherwise.  */
		if (chlen == 0 || (forbid_nul && ch == 0)) {
			return false;
		}
		pos += chlen;
	}
	return true;
}