summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2022-08-14 10:55:02 -0600
committer Luke Shumaker <lukeshu@datawire.io> 2022-08-14 17:06:28 -0600
commit c8a79cbfde0e42ac3d677bb986e4dbfc9e5cfa85 (patch)
tree 10294f06b9d1143644dbdfb7a2e895acd8504efd
parent 076ca46bad3e18ea7f4c3b3320ab410c3ebea747 (diff)
reencode: Rethink the string backslash encoder
-rw-r--r-- encode.go 2
-rw-r--r-- misc.go 81
-rw-r--r-- reencode.go 28
3 files changed, 68 insertions, 43 deletions
diff --git a/encode.go b/encode.go
index 377b9b9..8479785 100644
--- a/encode.go
+++ b/encode.go
@@ -302,7 +302,7 @@ func encodeString[T interface{ []byte | string }](w io.Writer, str T) {
encodeWriteByte(w, '"')
for i := 0; i < len(str); {
c, size := decodeRune(str[i:])
- if _, err := writeStringChar(w, c, false, nil); err != nil {
+ if _, err := writeStringChar(w, c, BackslashEscapeNone, nil); err != nil {
panic(encodeError{err})
}
i += size
diff --git a/misc.go b/misc.go
index a567cc7..4d8b136 100644
--- a/misc.go
+++ b/misc.go
@@ -68,34 +68,42 @@ func writeRune(w io.Writer, c rune) (int, error) {
// JSON string encoding ////////////////////////////////////////////////////////
-func UnicodeEscapeJSSafe(c rune, _ bool) bool {
+type BackslashEscapeMode uint8
+
+const (
+ BackslashEscapeNone = BackslashEscapeMode(iota)
+ BackslashEscapeShort
+ BackslashEscapeUnicode
+)
+
+func EscapeJSSafe(c rune, _ BackslashEscapeMode) BackslashEscapeMode {
// JSON is notionally a JS subset, but that's not actually
// true.
//
// http://timelessrepo.com/json-isnt-a-javascript-subset
switch c {
case '\u2028', '\u2029':
- return true
+ return BackslashEscapeUnicode
default:
- return false
+ return BackslashEscapeNone
}
}
-func UnicodeEscapeHTMLSafe(c rune, wasEscaped bool) bool {
+func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
case '&', '<', '>':
- return true
+ return BackslashEscapeUnicode
default:
- return UnicodeEscapeJSSafe(c, wasEscaped)
+ return EscapeJSSafe(c, wasEscaped)
}
}
-func UnicodeEscapeDefault(c rune, wasEscaped bool) bool {
+func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
case '\b', '\f', utf8.RuneError:
- return true
+ return BackslashEscapeUnicode
default:
- return UnicodeEscapeHTMLSafe(c, wasEscaped)
+ return EscapeHTMLSafe(c, wasEscaped)
}
}
@@ -114,31 +122,48 @@ func writeStringShortEscape(w io.Writer, c byte) (int, error) {
buf := [2]byte{'\\', c}
return w.Write(buf[:])
}
-func writeStringChar(w io.Writer, c rune, wasEscaped bool, escaper func(rune, bool) bool) (int, error) {
+func writeStringChar(w io.Writer, c rune, wasEscaped BackslashEscapeMode, escaper func(rune, BackslashEscapeMode) BackslashEscapeMode) (int, error) {
if escaper == nil {
- escaper = UnicodeEscapeDefault
+ escaper = EscapeDefault
}
- switch {
- case c <= 0xFFFF && escaper(c, wasEscaped):
- return writeStringUnicodeEscape(w, c)
- case c == '"' || c == '\\':
- return writeStringShortEscape(w, byte(c))
- case c < 0x0020:
+ switch escaper(c, wasEscaped) {
+ case BackslashEscapeNone:
+ switch {
+ case c < 0x0020:
+ switch c {
+ case '\b':
+ return writeStringShortEscape(w, 'b')
+ case '\f':
+ return writeStringShortEscape(w, 'f')
+ case '\n':
+ return writeStringShortEscape(w, 'n')
+ case '\r':
+ return writeStringShortEscape(w, 'r')
+ case '\t':
+ return writeStringShortEscape(w, 't')
+ default:
+ return writeStringUnicodeEscape(w, c)
+ }
+ case c == '"' || c == '\\':
+ return writeStringShortEscape(w, byte(c))
+ default:
+ return writeRune(w, c)
+ }
+ case BackslashEscapeShort:
switch c {
- case '\b':
- return writeStringShortEscape(w, 'b')
- case '\f':
- return writeStringShortEscape(w, 'f')
- case '\n':
- return writeStringShortEscape(w, 'n')
- case '\r':
- return writeStringShortEscape(w, 'r')
- case '\t':
- return writeStringShortEscape(w, 't')
+ case '"', '\\', '/', '\b', '\f', '\n', '\r', '\t':
+ return writeStringShortEscape(w, byte(c))
+ default:
+ return writeRune(w, c)
+ }
+ case BackslashEscapeUnicode:
+ switch {
+ case c > 0xFFFF:
+ return writeRune(w, c)
default:
return writeStringUnicodeEscape(w, c)
}
default:
- return writeRune(w, c)
+ panic("escaper returned an invalid escape mode")
}
}
diff --git a/reencode.go b/reencode.go
index 50c8ba3..66f25da 100644
--- a/reencode.go
+++ b/reencode.go
@@ -24,12 +24,12 @@ type ReEncoder struct {
// encoding/json only.
prefix string
// Returns whether a given character in a string should be
- // "\uXXXX" escaped. The bool argument is whether it was
+ // backslash-escaped. The BackslashEscapeMode argument is how it was
// escaped in the input. This does not affect characters
- // that must or must-not be \u-escaped to be valid JSON.
+ // that must or must-not be escaped to be valid JSON.
//
- // If not set, then EscapeUnicodeDefault is used.
- UnicodeEscape func(rune, bool) bool
+ // If not set, then EscapeDefault is used.
+ BackslashEscape func(rune, BackslashEscapeMode) BackslashEscapeMode
bailAfterCurrent bool
@@ -339,7 +339,7 @@ func (enc *ReEncoder) stateInString(c rune) error {
enc.popState()
return enc.emitByte(byte(c))
case 0x0020 <= c && c <= 0x10FFFF:
- return enc.emit(writeStringChar(enc.Out, c, false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, c, BackslashEscapeNone, enc.BackslashEscape))
default:
return &SyntaxError{fmt.Sprintf("string: unexpected character: %c", c), enc.inputPos}
}
@@ -348,28 +348,28 @@ func (enc *ReEncoder) stateInBackslash(c rune) error {
switch c {
case '"':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '"', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '"', BackslashEscapeShort, enc.BackslashEscape))
case '\\':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '\\', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '\\', BackslashEscapeShort, enc.BackslashEscape))
case '/':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '/', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '/', BackslashEscapeShort, enc.BackslashEscape))
case 'b':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '\b', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '\b', BackslashEscapeShort, enc.BackslashEscape))
case 'f':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '\f', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '\f', BackslashEscapeShort, enc.BackslashEscape))
case 'n':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '\n', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '\n', BackslashEscapeShort, enc.BackslashEscape))
case 'r':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '\r', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '\r', BackslashEscapeShort, enc.BackslashEscape))
case 't':
enc.replaceState(enc.stateInString, false)
- return enc.emit(writeStringChar(enc.Out, '\t', false, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, '\t', BackslashEscapeShort, enc.BackslashEscape))
case 'u':
enc.replaceState(enc.stateInUnicode, false)
return nil
@@ -396,7 +396,7 @@ func (enc *ReEncoder) stateInUnicode(c rune) error {
rune(enc.stateBuf[2])<<4 |
rune(enc.stateBuf[3])<<0
enc.stateBuf = enc.stateBuf[:0]
- return enc.emit(writeStringChar(enc.Out, c, true, enc.UnicodeEscape))
+ return enc.emit(writeStringChar(enc.Out, c, BackslashEscapeUnicode, enc.BackslashEscape))
}
return nil
}