summaryrefslogtreecommitdiff
path: root/libmisc/utf8.c
diff options
context:
space:
mode:
authorLuke T. Shumaker <lukeshu@lukeshu.com>2025-05-27 19:47:25 -0400
committerLuke T. Shumaker <lukeshu@lukeshu.com>2025-05-27 19:47:25 -0400
commitbf3667b8b76eefd95e33e32b4f5abbf2de0e2065 (patch)
tree241b909f0d68f3c0f355ad924375c2fa5b9a19ae /libmisc/utf8.c
parent42fb27570262b52e2ca889030c621b5f4af76fe1 (diff)
parent968f7710458f44d5e62d2624461f4e8459c04168 (diff)
Merge branch 'lukeshu/cover'HEADmain
Diffstat (limited to 'libmisc/utf8.c')
-rw-r--r--libmisc/utf8.c40
1 files changed, 40 insertions, 0 deletions
diff --git a/libmisc/utf8.c b/libmisc/utf8.c
new file mode 100644
index 0000000..5f91021
--- /dev/null
+++ b/libmisc/utf8.c
@@ -0,0 +1,40 @@
+/* libmisc/utf8.c - UTF-8 routines
+ *
+ * Copyright (C) 2024-2025 Luke T. Shumaker <lukeshu@lukeshu.com>
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include <libmisc/utf8.h>
+
+void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) {
+ uint32_t ch;
+ uint8_t chlen;
+ if ((str[0] & 0b10000000) == 0b00000000) { ch = str[0] & 0b01111111; chlen = 1; }
+ else if ((str[0] & 0b11100000) == 0b11000000) { ch = str[0] & 0b00011111; chlen = 2; }
+ else if ((str[0] & 0b11110000) == 0b11100000) { ch = str[0] & 0b00001111; chlen = 3; }
+ else if ((str[0] & 0b11111000) == 0b11110000) { ch = str[0] & 0b00000111; chlen = 4; }
+ else goto invalid;
+ if ((ch == 0 && chlen != 1) || chlen > len) goto invalid;
+ for (uint8_t i = 1; i < chlen; i++) {
+ if ((str[i] & 0b11000000) != 0b10000000) goto invalid;
+ ch = (ch << 6) | (str[i] & 0b00111111);
+ }
+ if (ch > 0x10FFFF) goto invalid;
+ *ret_ch = ch;
+ *ret_chlen = chlen;
+ return;
+ invalid:
+ *ret_chlen = 0;
+}
+
+bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) {
+ for (size_t pos = 0; pos < len;) {
+ uint32_t ch;
+ uint8_t chlen;
+ utf8_decode_codepoint(&str[pos], len-pos, &ch, &chlen);
+ if (chlen == 0 || (forbid_nul && ch == 0))
+ return false;
+ pos += chlen;
+ }
+ return true;
+}