Merge branch 'lukeshu/fixes'

author: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-20 12:47:10 -0700
committer: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-20 12:47:10 -0700
commit: f5ca3478c68e47ae20fd12748c1552fdf81f75f9 (patch)
tree: b3d3f889ed25084fe33ed9e01554d6ca51104bb5 /encode_escape.go
parent: d240d0b06c7b5711f583d961eddfc37d07d4546e (diff)
parent: 49ee8be679add0bd3cf08a2669331b3be7a835f8 (diff)
1 files changed, 65 insertions, 11 deletions
diff --git a/encode_escape.go b/encode_escape.go
index 97da6e9..664c762 100644
--- a/encode_escape.go
+++ b/encode_escape.go
@@ -6,12 +6,29 @@ package lowmemjson
 
 import (
 	"fmt"
-	"unicode/utf8"
 
 	"git.lukeshu.com/go/lowmemjson/internal/jsonstring"
 )
 
-// BackslashEscapeMode identifies one of the three ways that a
+// InvalidUTF8Mode identifies one of the three ways that an Encoder or
+// ReEncoder can behave when encountering invalid UTF-8 in a string
+// value:
+//
+//   - Replace the byte with the Unicode replacement character U+FFFD.
+//
+//   - Allow the byte through to the string-encoder, with an
+//     escape-mode of BackslashEscapeRawByte.
+//
+//   - Emit a syntax error.
+type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode
+
+const (
+	InvalidUTF8Replace  = jsonstring.InvalidUTF8Replace
+	InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve
+	InvalidUTF8Error    = jsonstring.InvalidUTF8Error
+)
+
+// BackslashEscapeMode identifies one of the four ways that a
 // character may be represented in a JSON string:
 //
 //   - literally (no backslash escaping)
@@ -19,13 +36,41 @@ import (
 //   - as a short "well-known" `\X` backslash sequence (where `X` is a
 //     single-character)
 //
-//   - as a long Unicode `\uXXXX` backslash sequence
+//   - as a long Unicode `\uXXXX` backslash sequence (with 16
+//     permutations of capitalization)
+//
+//   - as a raw byte; this lets you emit arbitrary binary data, which
+//     makes the output invalid JSON (JSON must be valid UTF-8).  If
+//     the character does not satisfy `utf8.RuneSelf <= char <= 0xFF`,
+//     then the encoder will panic.
 type BackslashEscapeMode = jsonstring.BackslashEscapeMode
 
 const (
 	BackslashEscapeNone    = jsonstring.BackslashEscapeNone
 	BackslashEscapeShort   = jsonstring.BackslashEscapeShort
-	BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode
+	BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte
+
+	BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX
+	BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx
+	BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX
+	BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx
+	BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX
+	BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx
+	BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX
+	BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx
+	BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX
+	BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx
+	BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX
+	BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx
+	BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX
+	BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx
+	BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX
+	BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx
+
+	BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin
+	BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax
+
+	BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat
 )
 
 func hexToInt(c byte) rune {
@@ -49,13 +94,24 @@ func hexToRune(a, b, c, d byte) rune {
 		hexToInt(d)<<0
 }
 
+func hexToMode(a, b, c, d byte) BackslashEscapeMode {
+	// The 0b0010_0000 bit is the ASCII "lowercase bit" (also set for '0'-'9').
+	return BackslashEscapeUnicodeMin + BackslashEscapeMode(0|
+		((a&0b0010_0000)>>2)|
+		((b&0b0010_0000)>>3)|
+		((c&0b0010_0000)>>4)|
+		((d&0b0010_0000)>>5))
+}
+
 // A BackslashEscaper controls how a ReEncoder emits a character in a
 // JSON string.  The `rune` argument is the character being
 // considered, and the `BackslashEscapeMode` argument is how it was
 // originally encoded in the input.
 //
 // The ReEncoder will panic if a BackslashEscaper returns an unknown
-// BackslashEscapeMode.
+// BackslashEscapeMode.  However, a BackslashEscaper should be
+// permissive of BackslashEscapeModes it doesn't recognize; it is safe
+// to just return them unmodified.
 type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
 
 // EscapePreserve is a BackslashEscaper that preserves the original
@@ -96,14 +152,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode
 // behavior of encoding/json.
 //
 // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
 //
 // A ReEncoder uses EscapeDefault if a BackslashEscaper is not
 // specified.
 func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
 	switch c {
-	case '\b', '\f', utf8.RuneError:
+	case '\b', '\f':
 		return BackslashEscapeUnicode
 	default:
 		return EscapeHTMLSafe(c, wasEscaped)
@@ -115,11 +170,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
 // SetEscapeHTML(false) called on it.
 //
 // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
 func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
 	switch c {
-	case '\b', '\f', utf8.RuneError:
+	case '\b', '\f':
 		return BackslashEscapeUnicode
 	default:
 		return EscapeJSSafe(c, wasEscaped)
author	Luke Shumaker <lukeshu@lukeshu.com>	2023-02-20 12:47:10 -0700
committer	Luke Shumaker <lukeshu@lukeshu.com>	2023-02-20 12:47:10 -0700
commit	f5ca3478c68e47ae20fd12748c1552fdf81f75f9 (patch)
tree	b3d3f889ed25084fe33ed9e01554d6ca51104bb5 /encode_escape.go
parent	d240d0b06c7b5711f583d961eddfc37d07d4546e (diff)
parent	49ee8be679add0bd3cf08a2669331b3be7a835f8 (diff)