summaryrefslogtreecommitdiff
path: root/internal/jsonstring
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2023-02-20 12:47:10 -0700
committer Luke Shumaker <lukeshu@lukeshu.com> 2023-02-20 12:47:10 -0700
commit f5ca3478c68e47ae20fd12748c1552fdf81f75f9 (patch)
tree b3d3f889ed25084fe33ed9e01554d6ca51104bb5 /internal/jsonstring
parent d240d0b06c7b5711f583d961eddfc37d07d4546e (diff)
parent 49ee8be679add0bd3cf08a2669331b3be7a835f8 (diff)
Merge branch 'lukeshu/fixes'
Diffstat (limited to 'internal/jsonstring')
-rw-r--r-- internal/jsonstring/encode_string.go 121
1 files changed, 101 insertions, 20 deletions
diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go
index fec2cc0..1416b3e 100644
--- a/internal/jsonstring/encode_string.go
+++ b/internal/jsonstring/encode_string.go
@@ -5,14 +5,25 @@
package jsonstring
import (
+ "encoding/json"
"fmt"
"io"
+ "reflect"
"unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/fastio"
"git.lukeshu.com/go/lowmemjson/internal/fastio/noescape"
)
+// InvalidUTF8Mode is described in the main lowmemjson package docs.
+type InvalidUTF8Mode uint8
+
+const (
+ InvalidUTF8Replace InvalidUTF8Mode = iota
+ InvalidUTF8Preserve
+ InvalidUTF8Error
+)
+
// BackslashEscapeMode is described in the main lowmemjson package
// docs.
type BackslashEscapeMode uint8
@@ -20,21 +31,49 @@ type BackslashEscapeMode uint8
const (
BackslashEscapeNone BackslashEscapeMode = iota
BackslashEscapeShort
- BackslashEscapeUnicode
+ BackslashEscapeRawByte
+
+ // It is significant to the implementation that if X=binary-0
+ // and x=binary-1, then these "BackslashEscapeUnicode"
+ // constants are counting in-order from 0 to 15.
+
+ BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeXXXx
+ BackslashEscapeUnicodeXXxX
+ BackslashEscapeUnicodeXXxx
+ BackslashEscapeUnicodeXxXX
+ BackslashEscapeUnicodeXxXx
+ BackslashEscapeUnicodeXxxX
+ BackslashEscapeUnicodeXxxx
+ BackslashEscapeUnicodexXXX
+ BackslashEscapeUnicodexXXx
+ BackslashEscapeUnicodexXxX
+ BackslashEscapeUnicodexXxx
+ BackslashEscapeUnicodexxXX
+ BackslashEscapeUnicodexxXx
+ BackslashEscapeUnicodexxxX
+ BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicodeMin = BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeMax = BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicode = BackslashEscapeUnicodexxxx // back-compat
)
// BackslashEscaper is described in the main lowmemjson package docs.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
-func writeStringUnicodeEscape(w io.Writer, c rune) error {
- const alphabet = "0123456789abcdef"
+func WriteStringUnicodeEscape(w io.Writer, c rune, mode BackslashEscapeMode) error {
+ const alphabet = "0123456789ABCDEF"
+ _mode := byte(mode - BackslashEscapeUnicodeMin)
buf := [6]byte{
'\\',
'u',
- alphabet[(c>>12)&0xf],
- alphabet[(c>>8)&0xf],
- alphabet[(c>>4)&0xf],
- alphabet[(c>>0)&0xf],
+ // The 0b0010_0000 bit is the ASCII "lowercase bit".
+ alphabet[(c>>12)&0xf] | ((_mode << 2) & 0b0010_0000),
+ alphabet[(c>>8)&0xf] | ((_mode << 3) & 0b0010_0000),
+ alphabet[(c>>4)&0xf] | ((_mode << 4) & 0b0010_0000),
+ alphabet[(c>>0)&0xf] | ((_mode << 5) & 0b0010_0000),
}
_, err := noescape.Write(w, buf[:])
return err
@@ -72,7 +111,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
case '\b', '\f', '\n', '\r', '\t': // short-escape if possible
return writeStringShortEscape(w, c)
default:
- return writeStringUnicodeEscape(w, c)
+ return WriteStringUnicodeEscape(w, c, BackslashEscapeUnicode)
}
case c == '"' || c == '\\': // override, gotta escape these
return writeStringShortEscape(w, c)
@@ -88,27 +127,54 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
_, err := w.WriteRune(c)
return err
}
- case BackslashEscapeUnicode:
+ case BackslashEscapeRawByte:
switch {
- case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
- _, err := w.WriteRune(c)
- return err
- default: // obey
- return writeStringUnicodeEscape(w, c)
+ case c < utf8.RuneSelf:
+ panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c))
+ case c > 0xFF:
+ panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c))
+ default:
+ return w.WriteByte(byte(c))
}
default:
- panic("escaper returned an invalid escape mode")
+ if BackslashEscapeUnicodeMin <= escape && escape <= BackslashEscapeUnicodeMax {
+ switch {
+ case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
+ _, err := w.WriteRune(c)
+ return err
+ default: // obey
+ return WriteStringUnicodeEscape(w, c, escape)
+ }
+ }
+ panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape))
}
}
-func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error {
+func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error {
if err := w.WriteByte('"'); err != nil {
return err
}
- for _, c := range str {
- if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil {
+ for i := 0; i < len(str); {
+ escaped := BackslashEscapeNone
+ c, size := utf8.DecodeRuneInString(str[i:])
+ if c == utf8.RuneError && size == 1 {
+ switch utf {
+ case InvalidUTF8Replace:
+ escaped = BackslashEscapeUnicode
+ case InvalidUTF8Preserve:
+ escaped = BackslashEscapeRawByte
+ c = rune(str[i])
+ case InvalidUTF8Error:
+ return &json.UnsupportedValueError{
+ Value: val,
+ Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
+ }
+ }
+ }
+ if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
return err
}
+ i += size
}
if err := w.WriteByte('"'); err != nil {
return err
@@ -116,13 +182,28 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st
return nil
}
-func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error {
+func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error {
if err := w.WriteByte('"'); err != nil {
return err
}
for i := 0; i < len(str); {
+ escaped := BackslashEscapeNone
c, size := utf8.DecodeRune(str[i:])
- if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil {
+ if c == utf8.RuneError && size == 1 {
+ switch utf {
+ case InvalidUTF8Replace:
+ escaped = BackslashEscapeUnicode
+ case InvalidUTF8Preserve:
+ escaped = BackslashEscapeRawByte
+ c = rune(str[i])
+ case InvalidUTF8Error:
+ return &json.UnsupportedValueError{
+ Value: val,
+ Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
+ }
+ }
+ }
+ if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
return err
}
i += size