summaryrefslogtreecommitdiff
path: root/libmisc/utf8.c
blob: 28357f00747abc8cd211a5b6e267805f71f226c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/* libmisc/utf8.c - UTF-8 routines
 *
 * Copyright (C) 2024-2025  Luke T. Shumaker <lukeshu@lukeshu.com>
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

#include <libmisc/utf8.h>

/* Decode the single UTF-8 encoded codepoint at the start of a buffer.
 *
 * str       : bytes to decode; at most the first `len` bytes are read
 *             (nothing is read when `len` is 0)
 * len       : number of bytes available at `str`
 * ret_ch    : on success, receives the decoded codepoint; it is NOT
 *             written on failure
 * ret_chlen : on success, receives the number of bytes the codepoint
 *             occupies (1-4); on failure, receives 0
 *
 * The decode fails if the buffer is empty, the lead byte is not a
 * valid UTF-8 lead byte, the sequence is truncated by `len`, a
 * continuation byte is malformed, the encoding is overlong, the
 * value is above U+10FFFF, or the value is a UTF-16 surrogate
 * (U+D800..U+DFFF; forbidden in UTF-8 by RFC 3629).
 */
void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) {
	uint32_t ch;
	uint8_t  chlen;
	uint32_t chmin; /* smallest value that legitimately needs `chlen` bytes */
	/* An empty buffer has no lead byte; don't touch str[0]. */
	if (len == 0)
		goto invalid;
	if      ((str[0] & 0b10000000) == 0b00000000) { ch = str[0] & 0b01111111; chlen = 1; chmin = 0;               } /* bits=7+(0*6)= 7 */
	else if ((str[0] & 0b11100000) == 0b11000000) { ch = str[0] & 0b00011111; chlen = 2; chmin = UINT32_C(1)<< 7; } /* bits=5+(1*6)=11 */
	else if ((str[0] & 0b11110000) == 0b11100000) { ch = str[0] & 0b00001111; chlen = 3; chmin = UINT32_C(1)<<11; } /* bits=4+(2*6)=16 */
	else if ((str[0] & 0b11111000) == 0b11110000) { ch = str[0] & 0b00000111; chlen = 4; chmin = UINT32_C(1)<<16; } /* bits=3+(3*6)=21 */
	else goto invalid;
	/* Truncated sequence: the lead byte promises more bytes than we have. */
	if (chlen > len)
		goto invalid;
	for (uint8_t i = 1; i < chlen; i++) {
		/* Every continuation byte must look like 10xxxxxx. */
		if ((str[i] & 0b11000000) != 0b10000000)
			goto invalid;
		ch = (ch << 6) | (str[i] & 0b00111111);
	}
	/* Reject out-of-range values and overlong encodings. */
	if (ch > 0x10FFFF || ch < chmin)
		goto invalid;
	/* Reject surrogates, which are not valid Unicode scalar values. */
	if (ch >= 0xD800 && ch <= 0xDFFF)
		goto invalid;
	*ret_ch    = ch;
	*ret_chlen = chlen;
	return;
 invalid:
	*ret_chlen = 0;
}

/* Report whether the first `len` bytes of `str` consist entirely of
 * codepoints that utf8_decode_codepoint() accepts.
 *
 * If `forbid_nul` is true, a decoded codepoint of 0 (U+0000) also makes
 * the buffer invalid.  An empty buffer (len == 0) is reported as valid.
 */
bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) {
	size_t off = 0;
	while (off < len) {
		uint32_t codepoint;
		uint8_t  nbytes;
		utf8_decode_codepoint(str + off, len - off, &codepoint, &nbytes);
		/* A length of 0 is the decoder's failure signal; `codepoint`
		 * is only inspected once we know decoding succeeded. */
		if (nbytes == 0)
			return false;
		if (forbid_nul && codepoint == 0)
			return false;
		off += nbytes;
	}
	return true;
}