summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2023-02-16 19:06:46 -0700
committer Luke Shumaker <lukeshu@lukeshu.com> 2023-02-18 22:45:54 -0700
commit 00187950437a10952b82353405e5ba4b4515fb29 (patch)
tree 826c4ff76310bf6f58e79f37f2107c329810aaa8
parent a87d6cbbb51a19071c5c742ef3c91bbb90a727c6 (diff)
reencode: Don't normalize the capitalization of \uXXXX hex escapes
-rw-r--r-- ReleaseNotes.md 5
-rw-r--r-- compat/json/compat.go 2
-rw-r--r-- compat/json/compat_test.go 54
-rw-r--r-- encode_escape.go 39
-rw-r--r-- internal/jsonstring/encode_string.go 60
-rw-r--r-- reencode.go 3
6 files changed, 121 insertions, 42 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 73df694..a8496e0 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -39,6 +39,11 @@
BackslashEscapeMode to allow emitted strings to contain
invalid UTF-8.
+ + Feature: ReEncoder: No longer unconditionally normalizes
+ the hex digits of `\uXXXX` escapes to lower-case; now this is
+ controlled by the `BackslashEscaper` (and the default is now to
+ leave the capitalization alone).
+
+ Change: EscapeDefault, EscapeDefaultNonHTMLSafe: No longer
force long Unicode `\uXXXX` sequences for the U+FFFD Unicode
replacement character.
diff --git a/compat/json/compat.go b/compat/json/compat.go
index d33f278..3a9bd6c 100644
--- a/compat/json/compat.go
+++ b/compat/json/compat.go
@@ -157,7 +157,7 @@ func HTMLEscape(dst *bytes.Buffer, src []byte) {
case lowmemjson.BackslashEscapeNone:
dst.WriteRune(c)
case lowmemjson.BackslashEscapeUnicode:
- _ = jsonstring.WriteStringUnicodeEscape(dst, c)
+ _ = jsonstring.WriteStringUnicodeEscape(dst, c, mode)
default:
panic(fmt.Errorf("lowmemjson.EscapeHTMLSafe returned an unexpected escape mode=%d", mode))
}
diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go
index c83ca7e..29a8b37 100644
--- a/compat/json/compat_test.go
+++ b/compat/json/compat_test.go
@@ -18,7 +18,10 @@ func TestCompatHTMLEscape(t *testing.T) {
Out string
}
testcases := map[string]testcase{
- "invalid": {In: `x`, Out: `x`},
+ "invalid": {In: `x`, Out: `x`},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
}
for tcName, tc := range testcases {
tc := tc
@@ -39,11 +42,14 @@ func TestCompatValid(t *testing.T) {
Exp bool
}
testcases := map[string]testcase{
- "empty": {In: ``, Exp: false},
- "num": {In: `1`, Exp: true},
- "trunc": {In: `{`, Exp: false},
- "object": {In: `{}`, Exp: true},
- "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517
+ "empty": {In: ``, Exp: false},
+ "num": {In: `1`, Exp: true},
+ "trunc": {In: `{`, Exp: false},
+ "object": {In: `{}`, Exp: true},
+ "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517
+ "hex-lower": {In: `"\uabcd"`, Exp: true},
+ "hex-upper": {In: `"\uABCD"`, Exp: true},
+ "hex-mixed": {In: `"\uAbCd"`, Exp: true},
}
for tcName, tc := range testcases {
tc := tc
@@ -64,10 +70,13 @@ func TestCompatCompact(t *testing.T) {
Err string
}
testcases := map[string]testcase{
- "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
- "object": {In: `{}`, Out: `{}`},
- "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
- "float": {In: `1.200e003`, Out: `1.200e003`},
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+ "float": {In: `1.200e003`, Out: `1.200e003`},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
}
for tcName, tc := range testcases {
tc := tc
@@ -94,17 +103,20 @@ func TestCompatIndent(t *testing.T) {
Err string
}
testcases := map[string]testcase{
- "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
- "object": {In: `{}`, Out: `{}`},
- "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
- "float": {In: `1.200e003`, Out: `1.200e003`},
- "tailws0": {In: `0`, Out: `0`},
- "tailws1": {In: `0 `, Out: `0 `},
- "tailws2": {In: `0 `, Out: `0 `},
- "tailws3": {In: "0\n", Out: "0\n"},
- "headws1": {In: ` 0`, Out: `0`},
- "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"},
- "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+ "float": {In: `1.200e003`, Out: `1.200e003`},
+ "tailws0": {In: `0`, Out: `0`},
+ "tailws1": {In: `0 `, Out: `0 `},
+ "tailws2": {In: `0 `, Out: `0 `},
+ "tailws3": {In: "0\n", Out: "0\n"},
+ "headws1": {In: ` 0`, Out: `0`},
+ "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"},
+ "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
}
for tcName, tc := range testcases {
tc := tc
diff --git a/encode_escape.go b/encode_escape.go
index c9e2bc9..664c762 100644
--- a/encode_escape.go
+++ b/encode_escape.go
@@ -36,7 +36,8 @@ const (
// - as a short "well-known" `\X` backslash sequence (where `X` is a
// single-character)
//
-// - as a long Unicode `\uXXXX` backslash sequence
+// - as a long Unicode `\uXXXX` backslash sequence (in any of the 16
+// possible capitalizations of its four hex digits)
//
// - as a raw byte; this allows you to emit invalid JSON; JSON must
// be valid UTF-8, but this allows you to emit arbitrary binary
@@ -47,8 +48,29 @@ type BackslashEscapeMode = jsonstring.BackslashEscapeMode
const (
BackslashEscapeNone = jsonstring.BackslashEscapeNone
BackslashEscapeShort = jsonstring.BackslashEscapeShort
- BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode
BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte
+
+ BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx
+ BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX
+ BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx
+ BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX
+ BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx
+ BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX
+ BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx
+ BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX
+ BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx
+ BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX
+ BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx
+ BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX
+ BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx
+ BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX
+ BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin
+ BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax
+
+ BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat
)
func hexToInt(c byte) rune {
@@ -72,13 +94,24 @@ func hexToRune(a, b, c, d byte) rune {
hexToInt(d)<<0
}
+func hexToMode(a, b, c, d byte) BackslashEscapeMode {
+ // The 0b0010_0000 bit is the ASCII "lowercase bit".
+ return BackslashEscapeUnicodeMin + BackslashEscapeMode(0|
+ ((a&0b0010_0000)>>2)|
+ ((b&0b0010_0000)>>3)|
+ ((c&0b0010_0000)>>4)|
+ ((d&0b0010_0000)>>5))
+}
+
// A BackslashEscaper controls how a ReEncoder emits a character in a
// JSON string. The `rune` argument is the character being
// considered, and the `BackslashEscapeMode` argument is how it was
// originally encoded in the input.
//
// The ReEncoder will panic if a BackslashEscaper returns an unknown
-// BackslashEscapeMode.
+// BackslashEscapeMode. However, a BackslashEscaper should be
+// permissive of BackslashEscapeModes it doesn't recognize; it is safe
+// to just return them unmodified.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
// EscapePreserve is a BackslashEscaper that preserves the original
diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go
index 2488cb2..1416b3e 100644
--- a/internal/jsonstring/encode_string.go
+++ b/internal/jsonstring/encode_string.go
@@ -31,22 +31,49 @@ type BackslashEscapeMode uint8
const (
BackslashEscapeNone BackslashEscapeMode = iota
BackslashEscapeShort
- BackslashEscapeUnicode
BackslashEscapeRawByte
+
+ // It is significant to the implementation that if X=binary-0
+ // and x=binary-1, then these "BackslashEscapeUnicode"
+ // constants are counting in-order from 0 to 15.
+
+ BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeXXXx
+ BackslashEscapeUnicodeXXxX
+ BackslashEscapeUnicodeXXxx
+ BackslashEscapeUnicodeXxXX
+ BackslashEscapeUnicodeXxXx
+ BackslashEscapeUnicodeXxxX
+ BackslashEscapeUnicodeXxxx
+ BackslashEscapeUnicodexXXX
+ BackslashEscapeUnicodexXXx
+ BackslashEscapeUnicodexXxX
+ BackslashEscapeUnicodexXxx
+ BackslashEscapeUnicodexxXX
+ BackslashEscapeUnicodexxXx
+ BackslashEscapeUnicodexxxX
+ BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicodeMin = BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeMax = BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicode = BackslashEscapeUnicodexxxx // back-compat
)
// BackslashEscaper is described in the main lowmemjson package docs.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
-func WriteStringUnicodeEscape(w io.Writer, c rune) error {
- const alphabet = "0123456789abcdef"
+func WriteStringUnicodeEscape(w io.Writer, c rune, mode BackslashEscapeMode) error {
+ const alphabet = "0123456789ABCDEF"
+ _mode := byte(mode - BackslashEscapeUnicodeMin)
buf := [6]byte{
'\\',
'u',
- alphabet[(c>>12)&0xf],
- alphabet[(c>>8)&0xf],
- alphabet[(c>>4)&0xf],
- alphabet[(c>>0)&0xf],
+ // The 0b0010_0000 bit is the ASCII "lowercase bit".
+ alphabet[(c>>12)&0xf] | ((_mode << 2) & 0b0010_0000),
+ alphabet[(c>>8)&0xf] | ((_mode << 3) & 0b0010_0000),
+ alphabet[(c>>4)&0xf] | ((_mode << 4) & 0b0010_0000),
+ alphabet[(c>>0)&0xf] | ((_mode << 5) & 0b0010_0000),
}
_, err := noescape.Write(w, buf[:])
return err
@@ -84,7 +111,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
case '\b', '\f', '\n', '\r', '\t': // short-escape if possible
return writeStringShortEscape(w, c)
default:
- return WriteStringUnicodeEscape(w, c)
+ return WriteStringUnicodeEscape(w, c, BackslashEscapeUnicode)
}
case c == '"' || c == '\\': // override, gotta escape these
return writeStringShortEscape(w, c)
@@ -100,14 +127,6 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
_, err := w.WriteRune(c)
return err
}
- case BackslashEscapeUnicode:
- switch {
- case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
- _, err := w.WriteRune(c)
- return err
- default: // obey
- return WriteStringUnicodeEscape(w, c)
- }
case BackslashEscapeRawByte:
switch {
case c < utf8.RuneSelf:
@@ -118,6 +137,15 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
return w.WriteByte(byte(c))
}
default:
+ if BackslashEscapeUnicodeMin <= escape && escape <= BackslashEscapeUnicodeMax {
+ switch {
+ case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
+ _, err := w.WriteRune(c)
+ return err
+ default: // obey
+ return WriteStringUnicodeEscape(w, c, escape)
+ }
+ }
panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape))
}
}
diff --git a/reencode.go b/reencode.go
index 1943b9c..7439bf0 100644
--- a/reencode.go
+++ b/reencode.go
@@ -441,8 +441,9 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int
enc.uhex[2] = byte(c)
return nil
case jsonparse.RuneTypeStringEscUD:
+ mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
- return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeUnicode, stackSize)
+ return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize)
case jsonparse.RuneTypeError:
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
default: