diff options
author | Luke T. Shumaker <lukeshu@lukeshu.com> | 2025-04-02 20:44:59 -0600 |
---|---|---|
committer | Luke T. Shumaker <lukeshu@lukeshu.com> | 2025-04-02 20:44:59 -0600 |
commit | ff88c4cc9bfdc91c3af390ab6a7588f5a8ade40a (patch) | |
tree | ae18e6d4576fa594be94e8278877fbdedfa1d4ba /libfmt/quote.c | |
parent | 13b8cafb7e28784f037ecd24876c225ddcf48d76 (diff) | |
parent | 8cc87f8c1f25c9d3fec00561237891650a91b47a (diff) |
Diffstat (limited to 'libfmt/quote.c')
-rw-r--r-- | libfmt/quote.c | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/libfmt/quote.c b/libfmt/quote.c new file mode 100644 index 0000000..c91e0b0 --- /dev/null +++ b/libfmt/quote.c @@ -0,0 +1,159 @@ +/* libfmt/quote.c - C-string quoting for pico-fmt + * + * Copyright (C) 2025 Luke T. Shumaker <lukeshu@lukeshu.com> + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include <string.h> /* for strnlen() */ +#include <stdint.h> /* for uint{n}_t() */ + +#include <libfmt/fmt.h> + +enum quote { + QUOTE_NONE, /* c */ + QUOTE_SIMPLE, /* \c */ + QUOTE_U4, /* \uABCD */ + QUOTE_U8, /* \UABCDABCD */ +}; + +static inline enum quote needs_quote(uint32_t ch) { + if (ch == '\a' || + ch == '\b' || + ch == '\f' || + ch == '\n' || + ch == '\r' || + ch == '\t' || + ch == '\v' || + ch == '\\' || + ch == '\'' || + ch == '"' || + ch == '?') + return QUOTE_SIMPLE; + else if (' ' <= ch && ch <= '~') + return QUOTE_NONE; + else if (ch < 0x10000) + return QUOTE_U4; + else + return QUOTE_U8; +} + +/** + * Quote a string to ASCII-only C syntax. Valid UTF-8 is quoted as + * short C-escape characters, \uABCD or \UABCDABCD; invalid UTF-8 is + * quoted as \xAB. + */ +static void libfmt_conv_quote(struct fmt_state *state) { + uint32_t ch; + uint8_t chlen; + + const char *in = va_arg(*state->args, char*); + size_t in_len = strnlen(in, (state->flags & FMT_FLAG_PRECISION) ? state->precision : (size_t)-1); + + size_t out_len = 2; + for (size_t pos = 0; pos < in_len;) { + if ((in[pos] & 0b10000000) == 0b00000000) { ch = in[pos] & 0b01111111; chlen = 1; } + else if ((in[pos] & 0b11100000) == 0b11000000) { ch = in[pos] & 0b00011111; chlen = 2; } + else if ((in[pos] & 0b11110000) == 0b11100000) { ch = in[pos] & 0b00001111; chlen = 3; } + else if ((in[pos] & 0b11111000) == 0b11110000) { ch = in[pos] & 0b00000111; chlen = 4; } + else goto measure_invalid_utf8; + if ((ch == 0 && chlen != 1) || pos + chlen > in_len) goto measure_invalid_utf8; + for (uint8_t i = 1; i < chlen; i++) { + if ((in[pos+i] & 0b11000000) != 0b10000000) goto measure_invalid_utf8; + ch = (ch << 6) | (in[pos+i] & 0b00111111); + } + if (ch > 0x10FFFF) goto measure_invalid_utf8; + pos += chlen; + + switch (needs_quote(ch)) { + case QUOTE_NONE : out_len += 1; break; + case QUOTE_SIMPLE : out_len += 2; break; + case QUOTE_U4 : out_len += 6; break; + case QUOTE_U8 : out_len += 10; break; + } + continue; + measure_invalid_utf8: + pos++; + out_len += 4; /* \xAB */ + } + + if (!(state->flags & FMT_FLAG_LEFT)) { + for (size_t i = 0; i + out_len < state->width; i++) { + fmt_state_putchar(state, ' '); + } + } + + fmt_state_putchar(state, '"'); + for (size_t pos = 0; pos < in_len;) { + if ((in[pos] & 0b10000000) == 0b00000000) { ch = in[pos] & 0b01111111; chlen = 1; } + else if ((in[pos] & 0b11100000) == 0b11000000) { ch = in[pos] & 0b00011111; chlen = 2; } + else if ((in[pos] & 0b11110000) == 0b11100000) { ch = in[pos] & 0b00001111; chlen = 3; } + else if ((in[pos] & 0b11111000) == 0b11110000) { ch = in[pos] & 0b00000111; chlen = 4; } + else goto output_invalid_utf8; + if ((ch == 0 && chlen != 1) || pos + chlen > in_len) goto output_invalid_utf8; + for (uint8_t i = 1; i < chlen; i++) { + if ((in[pos+i] & 0b11000000) != 0b10000000) goto output_invalid_utf8; + ch = (ch << 6) | (in[pos+i] & 0b00111111); + } + if (ch > 0x10FFFF) goto output_invalid_utf8; + pos += chlen; + + switch (needs_quote(ch)) { + case QUOTE_NONE: + fmt_state_putchar(state, ch); + break; + case QUOTE_SIMPLE: + fmt_state_putchar(state, '\\'); + switch (ch) { + case '\a': fmt_state_putchar(state, 'a'); break; + case '\b': fmt_state_putchar(state, 'b'); break; + case '\f': fmt_state_putchar(state, 'f'); break; + case '\n': fmt_state_putchar(state, 'n'); break; + case '\r': fmt_state_putchar(state, 'r'); break; + case '\t': fmt_state_putchar(state, 't'); break; + case '\v': fmt_state_putchar(state, 'v'); break; + case '\\': fmt_state_putchar(state, '\\'); break; + case '\'': fmt_state_putchar(state, '\''); break; + case '"': fmt_state_putchar(state, '"'); break; + case '?': fmt_state_putchar(state, '?'); break; + } + break; + case QUOTE_U4: + fmt_state_putchar(state, '\\'); + fmt_state_putchar(state, 'u'); + fmt_state_putchar(state, (ch >> 12) & 0xF); + fmt_state_putchar(state, (ch >> 8) & 0xF); + fmt_state_putchar(state, (ch >> 4) & 0xF); + fmt_state_putchar(state, (ch >> 0) & 0xF); + break; + case QUOTE_U8: + fmt_state_putchar(state, '\\'); + fmt_state_putchar(state, 'U'); + fmt_state_putchar(state, (ch >> 28) & 0xF); + fmt_state_putchar(state, (ch >> 24) & 0xF); + fmt_state_putchar(state, (ch >> 20) & 0xF); + fmt_state_putchar(state, (ch >> 16) & 0xF); + fmt_state_putchar(state, (ch >> 12) & 0xF); + fmt_state_putchar(state, (ch >> 8) & 0xF); + fmt_state_putchar(state, (ch >> 4) & 0xF); + fmt_state_putchar(state, (ch >> 0) & 0xF); + break; + } + continue; + output_invalid_utf8: + fmt_state_putchar(state, '\\'); + fmt_state_putchar(state, 'x'); + fmt_state_putchar(state, (in[pos] >> 4) & 0xF); + fmt_state_putchar(state, (in[pos] >> 0) & 0xF); + pos++; + } + fmt_state_putchar(state, '"'); + + for (size_t i = 0; i + out_len < state->width; i++) { + fmt_state_putchar(state, ' '); + } +} + +[[gnu::constructor]] +static void libfmt_install_quote(void) { + fmt_install('q', libfmt_conv_quote); +} |