diff options
author | Luke Shumaker <lukeshu@lukeshu.com> | 2023-02-20 12:47:10 -0700 |
---|---|---|
committer | Luke Shumaker <lukeshu@lukeshu.com> | 2023-02-20 12:47:10 -0700 |
commit | f5ca3478c68e47ae20fd12748c1552fdf81f75f9 (patch) | |
tree | b3d3f889ed25084fe33ed9e01554d6ca51104bb5 /encode_escape.go | |
parent | d240d0b06c7b5711f583d961eddfc37d07d4546e (diff) | |
parent | 49ee8be679add0bd3cf08a2669331b3be7a835f8 (diff) |
Merge branch 'lukeshu/fixes'
Diffstat (limited to 'encode_escape.go')
-rw-r--r-- | encode_escape.go | 76 |
1 file changed, 65 insertions, 11 deletions
diff --git a/encode_escape.go b/encode_escape.go index 97da6e9..664c762 100644 --- a/encode_escape.go +++ b/encode_escape.go @@ -6,12 +6,29 @@ package lowmemjson import ( "fmt" - "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/jsonstring" ) -// BackslashEscapeMode identifies one of the three ways that a +// InvalidUTF8Mode identifies one of the 3 ways that an Encoder or +// ReEncoder can behave when encountering invalid UTF-8 in a string +// value: +// +// - Replace the byte with the Unicode replacement character U+FFFD. +// +// - Allow the byte through to the string-encoder, with an +// escape-mode of BackslashEscapeRawByte. +// +// - Emit a syntax error. +type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode + +const ( + InvalidUTF8Replace = jsonstring.InvalidUTF8Replace + InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve + InvalidUTF8Error = jsonstring.InvalidUTF8Error +) + +// BackslashEscapeMode identifies one of the four ways that a // character may be represented in a JSON string: // // - literally (no backslash escaping) @@ -19,13 +36,41 @@ import ( // - as a short "well-known" `\X` backslash sequence (where `X` is a // single-character) // -// - as a long Unicode `\uXXXX` backslash sequence +// - as a long Unicode `\uXXXX` backslash sequence (with 16 +// permutations of capitalization) +// +// - as a raw byte; this allows you to emit invalid JSON; JSON must +// be valid UTF-8, but this allows you to emit arbitrary binary +// data. If the character does not satisfy `utf8.RuneSelf <= char +// <= 0xFF`, then the encoder will panic. 
type BackslashEscapeMode = jsonstring.BackslashEscapeMode const ( BackslashEscapeNone = jsonstring.BackslashEscapeNone BackslashEscapeShort = jsonstring.BackslashEscapeShort - BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode + BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte + + BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX + BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx + BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX + BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx + BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX + BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx + BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX + BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx + BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX + BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx + BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX + BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx + BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX + BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx + BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX + BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx + + BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin + BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax + + BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat ) func hexToInt(c byte) rune { @@ -49,13 +94,24 @@ func hexToRune(a, b, c, d byte) rune { hexToInt(d)<<0 } +func hexToMode(a, b, c, d byte) BackslashEscapeMode { + // The 0b0010_0000 bit is the ASCII "lowercase bit". 
+ return BackslashEscapeUnicodeMin + BackslashEscapeMode(0| + ((a&0b0010_0000)>>2)| + ((b&0b0010_0000)>>3)| + ((c&0b0010_0000)>>4)| + ((d&0b0010_0000)>>5)) +} + // A BackslashEscaper controls how a ReEncoder emits a character in a // JSON string. The `rune` argument is the character being // considered, and the `BackslashEscapeMode` argument is how it was // originally encoded in the input. // // The ReEncoder will panic if a BackslashEscaper returns an unknown -// BackslashEscapeMode. +// BackslashEscapeMode. However, a BackslashEscaper should be +// permissive of BackslashEscapeModes it doesn't recognize; it is safe +// to just return them unmodified. type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode // EscapePreserve is a BackslashEscaper that preserves the original @@ -96,14 +152,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode // behavior of encoding/json. // // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX` -// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement -// character. +// sequences for `\b` and `\f` // // A ReEncoder uses EscapeDefault if a BackslashEscaper is not // specified. func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { - case '\b', '\f', utf8.RuneError: + case '\b', '\f': return BackslashEscapeUnicode default: return EscapeHTMLSafe(c, wasEscaped) @@ -115,11 +170,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { // SetEscapeHTML(false) called on it. // // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX` -// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement -// character. +// sequences for `\b` and `\f`. func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { - case '\b', '\f', utf8.RuneError: + case '\b', '\f': return BackslashEscapeUnicode default: return EscapeJSSafe(c, wasEscaped) |