summaryrefslogtreecommitdiff
path: root/encode_escape.go
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2023-02-20 12:47:10 -0700
committerLuke Shumaker <lukeshu@lukeshu.com>2023-02-20 12:47:10 -0700
commitf5ca3478c68e47ae20fd12748c1552fdf81f75f9 (patch)
treeb3d3f889ed25084fe33ed9e01554d6ca51104bb5 /encode_escape.go
parentd240d0b06c7b5711f583d961eddfc37d07d4546e (diff)
parent49ee8be679add0bd3cf08a2669331b3be7a835f8 (diff)
Merge branch 'lukeshu/fixes'
Diffstat (limited to 'encode_escape.go')
-rw-r--r--encode_escape.go76
1 files changed, 65 insertions, 11 deletions
diff --git a/encode_escape.go b/encode_escape.go
index 97da6e9..664c762 100644
--- a/encode_escape.go
+++ b/encode_escape.go
@@ -6,12 +6,29 @@ package lowmemjson
import (
"fmt"
- "unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/jsonstring"
)
-// BackslashEscapeMode identifies one of the three ways that a
+// InvalidUTF8Mode identifies one of the three ways that an Encoder or
+// ReEncoder can behave when encountering invalid UTF-8 in a string
+// value:
+//
+// - Replace the byte with the Unicode replacement character U+FFFD.
+//
+// - Allow the byte through to the string-encoder, with an
+// escape-mode of BackslashEscapeRawByte.
+//
+// - Emit a syntax error.
+type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode
+
+const (
+ InvalidUTF8Replace = jsonstring.InvalidUTF8Replace
+ InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve
+ InvalidUTF8Error = jsonstring.InvalidUTF8Error
+)
+
+// BackslashEscapeMode identifies one of the four ways that a
// character may be represented in a JSON string:
//
// - literally (no backslash escaping)
@@ -19,13 +36,41 @@ import (
// - as a short "well-known" `\X` backslash sequence (where `X` is a
// single-character)
//
-// - as a long Unicode `\uXXXX` backslash sequence
+// - as a long Unicode `\uXXXX` backslash sequence (with 16
+// possible capitalizations of the four hex digits)
+//
+// - as a raw byte; JSON must be valid UTF-8, so this mode lets you
+// emit invalid JSON containing arbitrary binary data. If the
+// character does not satisfy `utf8.RuneSelf <= char <= 0xFF`,
+// then the encoder will panic.
type BackslashEscapeMode = jsonstring.BackslashEscapeMode
const (
BackslashEscapeNone = jsonstring.BackslashEscapeNone
BackslashEscapeShort = jsonstring.BackslashEscapeShort
- BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode
+ BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte
+
+ BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx
+ BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX
+ BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx
+ BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX
+ BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx
+ BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX
+ BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx
+ BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX
+ BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx
+ BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX
+ BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx
+ BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX
+ BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx
+ BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX
+ BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin
+ BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax
+
+ BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat
)
func hexToInt(c byte) rune {
@@ -49,13 +94,24 @@ func hexToRune(a, b, c, d byte) rune {
hexToInt(d)<<0
}
+func hexToMode(a, b, c, d byte) BackslashEscapeMode {
+ // The 0b0010_0000 bit is the ASCII "lowercase bit".
+ return BackslashEscapeUnicodeMin + BackslashEscapeMode(0|
+ ((a&0b0010_0000)>>2)|
+ ((b&0b0010_0000)>>3)|
+ ((c&0b0010_0000)>>4)|
+ ((d&0b0010_0000)>>5))
+}
+
// A BackslashEscaper controls how a ReEncoder emits a character in a
// JSON string. The `rune` argument is the character being
// considered, and the `BackslashEscapeMode` argument is how it was
// originally encoded in the input.
//
// The ReEncoder will panic if a BackslashEscaper returns an unknown
-// BackslashEscapeMode.
+// BackslashEscapeMode. However, a BackslashEscaper should be
+// permissive of BackslashEscapeModes it doesn't recognize; it is safe
+// to just return them unmodified.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
// EscapePreserve is a BackslashEscaper that preserves the original
@@ -96,14 +152,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode
// behavior of encoding/json.
//
// It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
//
// A ReEncoder uses EscapeDefault if a BackslashEscaper is not
// specified.
func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
- case '\b', '\f', utf8.RuneError:
+ case '\b', '\f':
return BackslashEscapeUnicode
default:
return EscapeHTMLSafe(c, wasEscaped)
@@ -115,11 +170,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
// SetEscapeHTML(false) called on it.
//
// It is like EscapeJSSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
- case '\b', '\f', utf8.RuneError:
+ case '\b', '\f':
return BackslashEscapeUnicode
default:
return EscapeJSSafe(c, wasEscaped)