Pull UTF-8 decoding into libmisc/utf8.h

author: Luke T. Shumaker <lukeshu@lukeshu.com> 2025-05-17 03:20:11 -0600
committer: Luke T. Shumaker <lukeshu@lukeshu.com> 2025-05-17 12:57:08 -0600
commit: ab9103440ade87509a1a3bd1eaad0b5396a89d1e (patch)
tree: 423673909d0af66d4ef5e260ce58b4b554bf2024
parent: d505a998aafe5af8b02a2b2c2acf7e708812c3fc (diff)
7 files changed, 77 insertions, 67 deletions
diff --git a/lib9p/core_gen/c.py b/lib9p/core_gen/c.py
index 60ceb70..5776035 100644
--- a/lib9p/core_gen/c.py
+++ b/lib9p/core_gen/c.py
@@ -29,11 +29,11 @@ def gen_c(versions: set[str], typs: list[idl.UserType]) -> str:
 
 #include <libmisc/assert.h>
 #include <libmisc/endian.h>
+#include <libmisc/utf8.h>
 
 #include <lib9p/core.h>
 
 #include "core_tables.h"
-#include "core_utf8.h"
 """
     # utilities ################################################################
     ret += """
diff --git a/lib9p/core_gen/c_fmt_print.py b/lib9p/core_gen/c_fmt_print.py
index eaacddb..7a0a9d3 100644
--- a/lib9p/core_gen/c_fmt_print.py
+++ b/lib9p/core_gen/c_fmt_print.py
@@ -112,7 +112,7 @@ def gen_c_fmt_print(versions: set[str], typs: list[idl.UserType]) -> str:
                             cnt_str = f"self->{member.cnt.membname}"
                             cnt_typ = c9util.typename(member.cnt.typ)
                         if member.typ.static_size == 1:  # SPECIAL (data)
-                            ret += f"\tif (is_valid_utf8_without_nul((uint8_t *)self->{member.membname}, (size_t){cnt_str})) {{\n"
+                            ret += f"\tif (utf8_is_valid_without_nul((uint8_t *)self->{member.membname}, (size_t){cnt_str})) {{\n"
                             ret += f'\t\tfmt_print_str(w, " {member.membname}=");\n'
                             ret += f"\t\tfmt_print_qmem(w, self->{member.membname}, {cnt_str} < 50 ? {cnt_str} : 50);\n"
                             ret += f"\t\tif ({cnt_str} > 50)\n"
diff --git a/lib9p/core_gen/c_validate.py b/lib9p/core_gen/c_validate.py
index 9c55d8d..e7a4017 100644
--- a/lib9p/core_gen/c_validate.py
+++ b/lib9p/core_gen/c_validate.py
@@ -66,7 +66,7 @@ def gen_c_validate(versions: set[str], typs: list[idl.UserType]) -> str:
         "\t{\n"
         "\t\tsize_t len = n;\n"
         "\t\tVALIDATE_NET_BYTES(len);\n"
-        "\t\tif (!is_valid_utf8_without_nul(&net_bytes[net_offset-len], len))\n"
+        "\t\tif (!utf8_is_valid_without_nul(&net_bytes[net_offset-len], len))\n"
         f'\t\t\treturn lib9p_error(ctx, {c9util.IDENT("ERRNO_L_EBADMSG")}, "message contains invalid UTF-8");\n'
         "\t}\n"
     )
diff --git a/lib9p/core_generated.c b/lib9p/core_generated.c
index 81ace7d..e19f6e6 100644
--- a/lib9p/core_generated.c
+++ b/lib9p/core_generated.c
@@ -6,11 +6,11 @@
 
 #include <libmisc/assert.h>
 #include <libmisc/endian.h>
+#include <libmisc/utf8.h>
 
 #include <lib9p/core.h>
 
 #include "core_tables.h"
-#include "core_utf8.h"
 
 /* utilities ******************************************************************/
 #if CONFIG_9P_ENABLE_9P2000
@@ -234,7 +234,7 @@ static const lib9p_lock_flags_t lock_flags_masks[LIB9P_VER_NUM] = {
 	{                                                                                                 \
 		size_t len = n;                                                                           \
 		VALIDATE_NET_BYTES(len);                                                                  \
-		if (!is_valid_utf8_without_nul(&net_bytes[net_offset-len], len))                          \
+		if (!utf8_is_valid_without_nul(&net_bytes[net_offset-len], len))                          \
 			return lib9p_error(ctx, LIB9P_ERRNO_L_EBADMSG, "message contains invalid UTF-8"); \
 	}
 #define RESERVE_HOST_BYTES(n)                                           \
@@ -6507,7 +6507,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val
 	fmt_print_tag(w, ctx, &self->tag);
 	fmt_print_str(w, " count=");
 	fmt_print_base10(w, self->count);
-	if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) {
+	if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) {
 		fmt_print_str(w, " data=");
 		fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50);
 		if (self->count > 50)
@@ -6528,7 +6528,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val
 	fmt_print_base10(w, self->offset);
 	fmt_print_str(w, " count=");
 	fmt_print_base10(w, self->count);
-	if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) {
+	if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) {
 		fmt_print_str(w, " data=");
 		fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50);
 		if (self->count > 50)
@@ -6763,7 +6763,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val
 	fmt_print_tag(w, ctx, &self->tag);
 	fmt_print_str(w, " count=");
 	fmt_print_base10(w, self->count);
-	if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) {
+	if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) {
 		fmt_print_str(w, " data=");
 		fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50);
 		if (self->count > 50)
@@ -6846,7 +6846,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val
 	fmt_print_tag(w, ctx, &self->tag);
 	fmt_print_str(w, " count=");
 	fmt_print_base10(w, self->count);
-	if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) {
+	if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) {
 		fmt_print_str(w, " data=");
 		fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50);
 		if (self->count > 50)
@@ -7445,7 +7445,7 @@ static bool marshal_Rswrite(struct lib9p_ctx *ctx, struct lib9p_msg_Rswrite *val
 	fmt_print_str(w, " ]");
 	fmt_print_str(w, " count=");
 	fmt_print_base10(w, self->count);
-	if (is_valid_utf8_without_nul((uint8_t *)self->data, (size_t)self->count)) {
+	if (utf8_is_valid_without_nul((uint8_t *)self->data, (size_t)self->count)) {
 		fmt_print_str(w, " data=");
 		fmt_print_qmem(w, self->data, self->count < 50 ? self->count : 50);
 		if (self->count > 50)
diff --git a/lib9p/core_utf8.h b/lib9p/core_utf8.h
deleted file mode 100644
index 2c451e0..0000000
--- a/lib9p/core_utf8.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* lib9p/core_utf8.h - Internal UTF-8 validation
- *
- * Copyright (C) 2024-2025  Luke T. Shumaker <lukeshu@lukeshu.com>
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#ifndef _LIB9P_CORE_UTF8_H_
-#define _LIB9P_CORE_UTF8_H_
-
-#include <stddef.h> /* for size_t */
-#include <stdint.h> /* for uint{n}_t */
-
-static inline bool _is_valid_utf8(uint8_t *str, size_t len, bool forbid_nul) {
-	uint32_t ch;
-	uint8_t chlen;
-	for (size_t pos = 0; pos < len;) {
-		if      ((str[pos] & 0b10000000) == 0b00000000) { ch = str[pos] & 0b01111111; chlen = 1; }
-		else if ((str[pos] & 0b11100000) == 0b11000000) { ch = str[pos] & 0b00011111; chlen = 2; }
-		else if ((str[pos] & 0b11110000) == 0b11100000) { ch = str[pos] & 0b00001111; chlen = 3; }
-		else if ((str[pos] & 0b11111000) == 0b11110000) { ch = str[pos] & 0b00000111; chlen = 4; }
-		else return false;
-		if ((ch == 0 && (chlen != 1 || forbid_nul)) || pos + chlen > len) return false;
-		for (uint8_t i = 1; i < chlen; i++) {
-			if ((str[pos+i] & 0b11000000) != 0b10000000) return false;
-			ch = (ch << 6) | (str[pos+i] & 0b00111111);
-		}
-		if (ch > 0x10FFFF) return false;
-		pos += chlen;
-	}
-	return true;
-}
-
-#define is_valid_utf8(str, len)                 _is_valid_utf8(str, len, false)
-#define is_valid_utf8_without_nul(str, len)     _is_valid_utf8(str, len, true)
-
-#endif /* _LIB9P_CORE_UTF8_H_ */
diff --git a/libmisc/fmt.c b/libmisc/fmt.c
index 33788b6..6cf1d8d 100644
--- a/libmisc/fmt.c
+++ b/libmisc/fmt.c
@@ -6,6 +6,8 @@
 
 #include <string.h> /* for strnlen() */
 
+#include <libmisc/utf8.h>
+
 #include <libmisc/fmt.h>
 
 static const char *const hexdig = "0123456789ABCDEF";
@@ -67,19 +69,18 @@ void fmt_print_qmem(lo_interface fmt_dest w, const void *_str, size_t size) {
 	fmt_print_byte(w, '"');
 	for (size_t pos = 0; pos < size;) {
 		uint32_t ch;
-		uint8_t chlen;
-		if      ((str[pos] & 0b10000000) == 0b00000000) { ch = str[pos] & 0b01111111; chlen = 1; }
-		else if ((str[pos] & 0b11100000) == 0b11000000) { ch = str[pos] & 0b00011111; chlen = 2; }
-		else if ((str[pos] & 0b11110000) == 0b11100000) { ch = str[pos] & 0b00001111; chlen = 3; }
-		else if ((str[pos] & 0b11111000) == 0b11110000) { ch = str[pos] & 0b00000111; chlen = 4; }
-		else goto invalid_utf8;
-		if ((ch == 0 && chlen != 1) || pos + chlen > size) goto invalid_utf8;
-		for (uint8_t i = 1; i < chlen; i++) {
-			if ((str[pos+i] & 0b11000000) != 0b10000000) goto invalid_utf8;
-			ch = (ch << 6) | (str[pos+i] & 0b00111111);
+		uint8_t  chlen;
+		utf8_decode_codepoint(&str[pos], size-pos, &ch, &chlen);
+		if (!chlen) {
+			/* invalid UTF-8 */
+			/* \xAB */
+			fmt_print_byte(w, '\\');
+			fmt_print_byte(w, 'x');
+			fmt_print_byte(w, hexdig[(str[pos] >> 4) & 0xF]);
+			fmt_print_byte(w, hexdig[(str[pos] >> 0) & 0xF]);
+			pos++;
+			continue;
 		}
-		if (ch > 0x10FFFF) goto invalid_utf8;
-
 		if (ch == '\0' ||
 		    ch == '\b' ||
 		    ch == '\f' ||
@@ -132,15 +133,6 @@ void fmt_print_qmem(lo_interface fmt_dest w, const void *_str, size_t size) {
 			fmt_print_byte(w, hexdig[(ch >>  0) & 0xF]);
 		}
 		pos += chlen;
-		continue;
-
-	invalid_utf8:
-		/* \xAB */
-		fmt_print_byte(w, '\\');
-		fmt_print_byte(w, 'x');
-		fmt_print_byte(w, hexdig[(str[pos] >> 4) & 0xF]);
-		fmt_print_byte(w, hexdig[(str[pos] >> 0) & 0xF]);
-		pos++;
 	}
 	fmt_print_byte(w, '"');
 }
diff --git a/libmisc/include/libmisc/utf8.h b/libmisc/include/libmisc/utf8.h
new file mode 100644
index 0000000..b5e1b0b
--- /dev/null
+++ b/libmisc/include/libmisc/utf8.h
@@ -0,0 +1,78 @@
+/* libmisc/utf8.h - UTF-8 routines
+ *
+ * Copyright (C) 2024-2025  Luke T. Shumaker <lukeshu@lukeshu.com>
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#ifndef _LIBMISC_UTF8_H_
+#define _LIBMISC_UTF8_H_
+
+#include <stddef.h> /* for size_t */
+#include <stdint.h> /* for uint{n}_t */
+
+/**
+ * Decode the single codepoint that starts at `str`, reading at most
+ * `len` bytes.
+ *
+ * On success, the codepoint is stored in `*ret_ch` and the number of
+ * bytes it occupies (always in the range [1, 4]) is stored in
+ * `*ret_chlen`.
+ *
+ * On failure, `*ret_chlen` is set to 0 and `*ret_ch` is left
+ * untouched.  Input is rejected if it is empty (len=0), starts with
+ * a continuation byte or other invalid lead byte, is truncated, has
+ * a malformed continuation byte, is an overlong encoding, encodes a
+ * UTF-16 surrogate (U+D800-U+DFFF), or is above U+10FFFF.
+ */
+static inline void utf8_decode_codepoint(const uint8_t *str, size_t len, uint32_t *ret_ch, uint8_t *ret_chlen) {
+	uint32_t ch;
+	uint32_t min_ch; /* smallest codepoint that requires `chlen` bytes */
+	uint8_t  chlen;
+	if (len == 0) goto invalid; /* don't read str[0] from an empty buffer */
+	if      ((str[0] & 0b10000000) == 0b00000000) { ch = str[0] & 0b01111111; chlen = 1; min_ch = 0x00000; }
+	else if ((str[0] & 0b11100000) == 0b11000000) { ch = str[0] & 0b00011111; chlen = 2; min_ch = 0x00080; }
+	else if ((str[0] & 0b11110000) == 0b11100000) { ch = str[0] & 0b00001111; chlen = 3; min_ch = 0x00800; }
+	else if ((str[0] & 0b11111000) == 0b11110000) { ch = str[0] & 0b00000111; chlen = 4; min_ch = 0x10000; }
+	else goto invalid;
+	if (chlen > len) goto invalid;
+	for (uint8_t i = 1; i < chlen; i++) {
+		if ((str[i] & 0b11000000) != 0b10000000) goto invalid;
+		ch = (ch << 6) | (str[i] & 0b00111111);
+	}
+	/* Overlong encodings can only be detected once the whole
+	 * sequence is decoded; testing just the lead byte's payload
+	 * bits would wrongly reject valid sequences that start with
+	 * 0xE0 or 0xF0 (e.g. U+0800 or U+1F600).  */
+	if (ch < min_ch) goto invalid;
+	if (0xD800 <= ch && ch <= 0xDFFF) goto invalid;
+	if (ch > 0x10FFFF) goto invalid;
+	*ret_ch    = ch;
+	*ret_chlen = chlen;
+	return;
+ invalid:
+	*ret_chlen = 0;
+}
+
+/**
+ * Return whether the `len` bytes at `str` are entirely valid UTF-8
+ * as judged by utf8_decode_codepoint().  If `forbid_nul` is set,
+ * then a U+0000 codepoint also makes the string invalid.  An empty
+ * string (len=0) is valid.
+ */
+static inline bool _utf8_is_valid(const uint8_t *str, size_t len, bool forbid_nul) {
+	for (size_t pos = 0; pos < len;) {
+		uint32_t ch;
+		uint8_t  chlen;
+		utf8_decode_codepoint(&str[pos], len-pos, &ch, &chlen);
+		/* `ch` is only set when chlen != 0, so test chlen first */
+		if (chlen == 0 || (forbid_nul && ch == 0))
+			return false;
+		pos += chlen;
+	}
+	return true;
+}
+
+#define utf8_is_valid(str, len)             _utf8_is_valid(str, len, false)
+#define utf8_is_valid_without_nul(str, len) _utf8_is_valid(str, len, true)
+
+#endif /* _LIBMISC_UTF8_H_ */
author	Luke T. Shumaker <lukeshu@lukeshu.com>	2025-05-17 03:20:11 -0600
committer	Luke T. Shumaker <lukeshu@lukeshu.com>	2025-05-17 12:57:08 -0600
commit	ab9103440ade87509a1a3bd1eaad0b5396a89d1e (patch)
tree	423673909d0af66d4ef5e260ce58b4b554bf2024
parent	d505a998aafe5af8b02a2b2c2acf7e708812c3fc (diff)