summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2023-02-14 22:36:25 -0700
committer Luke Shumaker <lukeshu@lukeshu.com> 2023-02-18 22:45:54 -0700
commit dfc67cecbd95344d296c31b537fa3ae8aec8c292 (patch)
tree 1e2e820cbd288d1ebef7b0e9dea14a07e2f33fc5
parent 38989a9c4f69abfe04c3eb4ec3382be88802141c (diff)
encode, reencode: Fix handling of invalid UTF-8
-rw-r--r-- ReleaseNotes.md 24
-rw-r--r-- compat/json/compat.go 5
-rw-r--r-- compat/json/compat_test.go 45
-rw-r--r-- compat/json/testcompat_test.go 5
-rw-r--r-- encode.go 34
-rw-r--r-- encode_escape.go 37
-rw-r--r-- errors.go 3
-rw-r--r-- internal/jsonstring/encode_string.go 65
-rw-r--r-- reencode.go 107
9 files changed, 247 insertions, 78 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index d9a671a..b1647da 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -19,10 +19,34 @@
- Unicode:
+ + Feature: Encoder, ReEncoder: Add an `InvalidUTF8`
+ ReEncoderConfig option and `BackslashEscapeRawByte`
+ BackslashEscapeMode to allow emitted strings to contain
+ invalid UTF-8.
+
+ + Change: EscapeDefault, EscapeDefaultNonHTMLSafe: No longer
+ force long Unicode `\uXXXX` sequences for the U+FFFD Unicode
+ replacement character.
+
+ + Change: Encoder: Unless overridden by the BackslashEscaper,
+ now by default uses `\uXXXX` sequences when emitting the
+ U+FFFD Unicode replacement character in place of invalid
+ UTF-8.
+
+ Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8
that when a codepoint straddles a write boundary it is
interpreted as a sequence of U+FFFD runes.
+ + Bugfix: compat/json.Valid: Do not consider JSON containing
+ invalid UTF-8 to be valid (this is different than
+ `encoding/json` at the time of this writing; but I consider
+ that to be a bug in `encoding/json`; [go#58517][]).
+
+ + Bugfix: compat/json.Compact, compat/json.Indent: Don't munge
+ invalid UTF-8 in strings; as `encoding/json` doesn't.
+
+ [go#58517]: https://github.com/golang/go/issues/58517
+
# v0.3.6 (2023-02-16)
Theme: Architectural improvements
diff --git a/compat/json/compat.go b/compat/json/compat.go
index 1cdbf0b..d326514 100644
--- a/compat/json/compat.go
+++ b/compat/json/compat.go
@@ -160,6 +160,7 @@ func Compact(dst *bytes.Buffer, src []byte) error {
start := dst.Len()
err := reencode(dst, src, lowmemjson.ReEncoderConfig{
Compact: true,
+ InvalidUTF8: lowmemjson.InvalidUTF8Preserve,
BackslashEscape: lowmemjson.EscapePreserve,
})
if err != nil {
@@ -173,6 +174,7 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error {
err := reencode(dst, src, lowmemjson.ReEncoderConfig{
Indent: indent,
Prefix: prefix,
+ InvalidUTF8: lowmemjson.InvalidUTF8Preserve,
BackslashEscape: lowmemjson.EscapePreserve,
})
if err != nil {
@@ -183,7 +185,8 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error {
func Valid(data []byte) bool {
formatter := lowmemjson.NewReEncoder(io.Discard, lowmemjson.ReEncoderConfig{
- Compact: true,
+ Compact: true,
+ InvalidUTF8: lowmemjson.InvalidUTF8Error,
})
if _, err := formatter.Write(data); err != nil {
return false
diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go
index d513c27..d989a4d 100644
--- a/compat/json/compat_test.go
+++ b/compat/json/compat_test.go
@@ -18,10 +18,11 @@ func TestCompatValid(t *testing.T) {
Exp bool
}
testcases := map[string]testcase{
- "empty": {In: ``, Exp: false},
- "num": {In: `1`, Exp: true},
- "trunc": {In: `{`, Exp: false},
- "object": {In: `{}`, Exp: true},
+ "empty": {In: ``, Exp: false},
+ "num": {In: `1`, Exp: true},
+ "trunc": {In: `{`, Exp: false},
+ "object": {In: `{}`, Exp: true},
+ "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517
}
for tcName, tc := range testcases {
tc := tc
@@ -42,8 +43,9 @@ func TestCompatCompact(t *testing.T) {
Err string
}
testcases := map[string]testcase{
- "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
- "object": {In: `{}`, Out: `{}`},
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
}
for tcName, tc := range testcases {
tc := tc
@@ -70,8 +72,9 @@ func TestCompatIndent(t *testing.T) {
Err string
}
testcases := map[string]testcase{
- "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
- "object": {In: `{}`, Out: `{}`},
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
}
for tcName, tc := range testcases {
tc := tc
@@ -89,3 +92,29 @@ func TestCompatIndent(t *testing.T) {
})
}
}
+
+func TestCompatMarshal(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In any
+ Out string
+ Err string
+ }
+ testcases := map[string]testcase{
+ "non-utf8": {In: "\x85\xcd", Out: "\"\\ufffd\\ufffd\""},
+ "urc": {In: "\ufffd", Out: "\"\ufffd\""},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ out, err := Marshal(tc.In)
+ assert.Equal(t, tc.Out, string(out))
+ if tc.Err == "" {
+ assert.NoError(t, err)
+ } else {
+ assert.EqualError(t, err, tc.Err)
+ }
+ })
+ }
+}
diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go
index 42cbf5c..e89b4b4 100644
--- a/compat/json/testcompat_test.go
+++ b/compat/json/testcompat_test.go
@@ -8,6 +8,7 @@ import (
"bytes"
"encoding/json"
"io"
+ "reflect"
_ "unsafe"
"git.lukeshu.com/go/lowmemjson"
@@ -59,13 +60,13 @@ type encodeState struct {
}
func (es *encodeState) string(str string, _ bool) {
- if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil {
+ if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil {
panic(err)
}
}
func (es *encodeState) stringBytes(str []byte, _ bool) {
- if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil {
+ if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil {
panic(err)
}
}
diff --git a/encode.go b/encode.go
index 00d3dad..684cc75 100644
--- a/encode.go
+++ b/encode.go
@@ -87,7 +87,7 @@ func (enc *Encoder) Encode(obj any) (err error) {
if escaper == nil {
escaper = EscapeDefault
}
- if err := encode(enc.w, reflect.ValueOf(obj), escaper, false, 0, map[any]struct{}{}); err != nil {
+ if err := encode(enc.w, reflect.ValueOf(obj), escaper, enc.w.utf, false, 0, map[any]struct{}{}); err != nil {
if rwe, ok := err.(*ReEncodeWriteError); ok {
err = &EncodeWriteError{
Err: rwe.Err,
@@ -108,7 +108,7 @@ func discardInt(_ int, err error) error {
const startDetectingCyclesAfter = 1000
-func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error {
+func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error {
if !val.IsValid() {
return discardInt(w.WriteString("null"))
}
@@ -197,7 +197,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
Err: err,
}
}
- if err := jsonstring.EncodeStringFromBytes(w, escaper, text); err != nil {
+ if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, text); err != nil {
return err
}
default:
@@ -309,14 +309,14 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
} else {
if quote {
var buf bytes.Buffer
- if err := jsonstring.EncodeStringFromString(&buf, escaper, val.String()); err != nil {
+ if err := jsonstring.EncodeStringFromString(&buf, escaper, utf, val, val.String()); err != nil {
return err
}
- if err := jsonstring.EncodeStringFromBytes(w, escaper, buf.Bytes()); err != nil {
+ if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, buf.Bytes()); err != nil {
return err
}
} else {
- if err := jsonstring.EncodeStringFromString(w, escaper, val.String()); err != nil {
+ if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, val.String()); err != nil {
return err
}
}
@@ -327,7 +327,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
return err
}
} else {
- if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -350,13 +350,13 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
}
}
empty = false
- if err := jsonstring.EncodeStringFromString(w, escaper, field.Name); err != nil {
+ if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, field.Name); err != nil {
return err
}
if err := w.WriteByte(':'); err != nil {
return err
}
- if err := encode(w, fVal, escaper, field.Quote, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, fVal, escaper, utf, field.Quote, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -394,7 +394,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
for i := 0; iter.Next(); i++ {
// TODO: Avoid buffering the map key
var k strings.Builder
- if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper}), iter.Key(), escaper, false, cycleDepth, cycleSeen); err != nil {
+ if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper, InvalidUTF8: utf}), iter.Key(), escaper, utf, false, cycleDepth, cycleSeen); err != nil {
return err
}
kStr := k.String()
@@ -403,7 +403,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
}
if !strings.HasPrefix(kStr, `"`) {
k.Reset()
- if err := jsonstring.EncodeStringFromString(&k, escaper, kStr); err != nil {
+ if err := jsonstring.EncodeStringFromString(&k, escaper, utf, iter.Key(), kStr); err != nil {
return err
}
kStr = k.String()
@@ -427,7 +427,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
if err := w.WriteByte(':'); err != nil {
return err
}
- if err := encode(w, kv.V, escaper, false, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, kv.V, escaper, utf, false, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -491,12 +491,12 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
cycleSeen[ptr] = struct{}{}
defer delete(cycleSeen, ptr)
}
- if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil {
+ if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil {
return err
}
}
case reflect.Array:
- if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil {
+ if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil {
return err
}
case reflect.Pointer:
@@ -516,7 +516,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
cycleSeen[ptr] = struct{}{}
defer delete(cycleSeen, ptr)
}
- if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -529,7 +529,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
return nil
}
-func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycleDepth uint, cycleSeen map[any]struct{}) error {
+func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, cycleDepth uint, cycleSeen map[any]struct{}) error {
if err := w.WriteByte('['); err != nil {
return err
}
@@ -540,7 +540,7 @@ func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycl
return err
}
}
- if err := encode(w, val.Index(i), escaper, false, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, val.Index(i), escaper, utf, false, cycleDepth, cycleSeen); err != nil {
return err
}
}
diff --git a/encode_escape.go b/encode_escape.go
index 97da6e9..c9e2bc9 100644
--- a/encode_escape.go
+++ b/encode_escape.go
@@ -6,12 +6,29 @@ package lowmemjson
import (
"fmt"
- "unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/jsonstring"
)
-// BackslashEscapeMode identifies one of the three ways that a
+// InvalidUTF8Mode identifies one of the 3 ways that an Encoder or
+// ReEncoder can behave when encountering invalid UTF-8 in a string
+// value:
+//
+// - Replace the byte with the Unicode replacement character U+FFFD.
+//
+// - Allow the byte through to the string-encoder, with an
+// escape-mode of BackslashEscapeRawByte.
+//
+// - Emit a syntax error.
+type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode
+
+const (
+ InvalidUTF8Replace = jsonstring.InvalidUTF8Replace
+ InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve
+ InvalidUTF8Error = jsonstring.InvalidUTF8Error
+)
+
+// BackslashEscapeMode identifies one of the four ways that a
// character may be represented in a JSON string:
//
// - literally (no backslash escaping)
@@ -20,12 +37,18 @@ import (
// single-character)
//
// - as a long Unicode `\uXXXX` backslash sequence
+//
+// - as a raw byte; this allows you to emit invalid JSON; JSON must
+// be valid UTF-8, but this allows you to emit arbitrary binary
+// data. If the character does not satisfy `utf8.RuneSelf <= char
+// <= 0xFF`, then the encoder will panic.
type BackslashEscapeMode = jsonstring.BackslashEscapeMode
const (
BackslashEscapeNone = jsonstring.BackslashEscapeNone
BackslashEscapeShort = jsonstring.BackslashEscapeShort
BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode
+ BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte
)
func hexToInt(c byte) rune {
@@ -96,14 +119,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode
// behavior of encoding/json.
//
// It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
//
// A ReEncoder uses EscapeDefault if a BackslashEscaper is not
// specified.
func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
- case '\b', '\f', utf8.RuneError:
+ case '\b', '\f':
return BackslashEscapeUnicode
default:
return EscapeHTMLSafe(c, wasEscaped)
@@ -115,11 +137,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
// SetEscapeHTML(false) called on it.
//
// It is like EscapeJSSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
- case '\b', '\f', utf8.RuneError:
+ case '\b', '\f':
return BackslashEscapeUnicode
default:
return EscapeJSSafe(c, wasEscaped)
diff --git a/errors.go b/errors.go
index 516018c..da9de4d 100644
--- a/errors.go
+++ b/errors.go
@@ -142,7 +142,8 @@ func (e *EncodeWriteError) Unwrap() error { return e.Err }
type EncodeTypeError = json.UnsupportedTypeError
// An EncodeValueError is returned by Encode when attempting to encode
-// an unsupported value (such as a datastructure with a cycle).
+// an unsupported value (such as a datastructure with a cycle, or (if
+// InvalidUTF8=InvalidUTF8Error) a string with invalid UTF-8).
//
// type UnsupportedValueError struct {
// Value reflect.Value
diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go
index fec2cc0..76bbb38 100644
--- a/internal/jsonstring/encode_string.go
+++ b/internal/jsonstring/encode_string.go
@@ -5,14 +5,25 @@
package jsonstring
import (
+ "encoding/json"
"fmt"
"io"
+ "reflect"
"unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/fastio"
"git.lukeshu.com/go/lowmemjson/internal/fastio/noescape"
)
+// InvalidUTF8Mode is described in the main lowmemjson package docs.
+type InvalidUTF8Mode uint8
+
+const (
+ InvalidUTF8Replace InvalidUTF8Mode = iota
+ InvalidUTF8Preserve
+ InvalidUTF8Error
+)
+
// BackslashEscapeMode is describe in the main lowmemjson package
// docs.
type BackslashEscapeMode uint8
@@ -21,6 +32,7 @@ const (
BackslashEscapeNone BackslashEscapeMode = iota
BackslashEscapeShort
BackslashEscapeUnicode
+ BackslashEscapeRawByte
)
// BackslashEscaper is describe in the main lowmemjson package docs.
@@ -96,19 +108,45 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
default: // obey
return writeStringUnicodeEscape(w, c)
}
+ case BackslashEscapeRawByte:
+ switch {
+ case c < utf8.RuneSelf:
+ panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c))
+ case c > 0xFF:
+ panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c))
+ default:
+ return w.WriteByte(byte(c))
+ }
default:
- panic("escaper returned an invalid escape mode")
+ panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape))
}
}
-func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error {
+func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error {
if err := w.WriteByte('"'); err != nil {
return err
}
- for _, c := range str {
- if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil {
+ for i := 0; i < len(str); {
+ escaped := BackslashEscapeNone
+ c, size := utf8.DecodeRuneInString(str[i:])
+ if c == utf8.RuneError && size == 1 {
+ switch utf {
+ case InvalidUTF8Replace:
+ escaped = BackslashEscapeUnicode
+ case InvalidUTF8Preserve:
+ escaped = BackslashEscapeRawByte
+ c = rune(str[i])
+ case InvalidUTF8Error:
+ return &json.UnsupportedValueError{
+ Value: val,
+ Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
+ }
+ }
+ }
+ if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
return err
}
+ i += size
}
if err := w.WriteByte('"'); err != nil {
return err
@@ -116,13 +154,28 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st
return nil
}
-func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error {
+func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error {
if err := w.WriteByte('"'); err != nil {
return err
}
for i := 0; i < len(str); {
+ escaped := BackslashEscapeNone
c, size := utf8.DecodeRune(str[i:])
- if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil {
+ if c == utf8.RuneError && size == 1 {
+ switch utf {
+ case InvalidUTF8Replace:
+ escaped = BackslashEscapeUnicode
+ case InvalidUTF8Preserve:
+ escaped = BackslashEscapeRawByte
+ c = rune(str[i])
+ case InvalidUTF8Error:
+ return &json.UnsupportedValueError{
+ Value: val,
+ Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
+ }
+ }
+ }
+ if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
return err
}
i += size
diff --git a/reencode.go b/reencode.go
index fd848f8..1a9999b 100644
--- a/reencode.go
+++ b/reencode.go
@@ -54,6 +54,13 @@ type ReEncoderConfig struct {
// this is different than the usual behavior.
ForceTrailingNewlines bool
+ // A JSON document is specified to be a sequence of Unicode
+ // codepoints; InvalidUTF8 controls how the *ReEncoder behaves
+ // when it encounters invalid UTF-8 bytes in a JSON string
+ // (i.e. the string is not representable as a sequence of
+ // Unicode codepoints, and thus the document is invalid JSON).
+ InvalidUTF8 InvalidUTF8Mode
+
// Returns whether a given character in a string should be
// backslash-escaped. The bool argument is whether it was
// \u-escaped in the input. This does not affect characters
@@ -119,6 +126,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
return &ReEncoder{
out: module,
esc: escaper,
+ utf: cfg.InvalidUTF8,
allowMultipleValues: cfg.AllowMultipleValues,
}
}
@@ -134,6 +142,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
type ReEncoder struct {
out reEncoderModule
esc BackslashEscaper
+ utf InvalidUTF8Mode
allowMultipleValues bool
// state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer
@@ -169,43 +178,54 @@ var (
_ io.Closer = (*ReEncoder)(nil)
)
-func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) {
+func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) {
+ var tmp []byte
if pos < enc.bufLen {
- var tmp [utf8.UTFMax]byte
- n := copy(tmp[:], enc.buf[pos:enc.bufLen])
- n += copy(tmp[n:], str)
- c, size := utf8.DecodeRune(tmp[:n])
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRune(tmp[:n])
- }
- return c, size, true
+ var buf [utf8.UTFMax]byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
} else {
- tmp := str[pos-enc.bufLen:]
- c, size := utf8.DecodeRune(tmp)
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRune(tmp)
- }
- return c, size, true
+ tmp = str[pos-enc.bufLen:]
+ }
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
}
}
-func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) {
+func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) {
if pos < enc.bufLen {
- var tmp [utf8.UTFMax]byte
- n := copy(tmp[:], enc.buf[pos:enc.bufLen])
- n += copy(tmp[n:], str)
- c, size := utf8.DecodeRune(tmp[:n])
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRune(tmp[:n])
+ var buf [utf8.UTFMax]byte
+ var tmp []byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
}
- return c, size, true
} else {
tmp := str[pos-enc.bufLen:]
c, size := utf8.DecodeRuneInString(tmp)
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRuneInString(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
}
- return c, size, true
}
}
@@ -223,7 +243,7 @@ func (enc *ReEncoder) Write(str []byte) (int, error) {
}
var n int
for {
- c, size, full := enc.getRuneFromBytes(str, n)
+ c, size, full, isRune := enc.getRuneFromBytes(str, n)
if !full {
if n < enc.bufLen {
l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
@@ -234,7 +254,13 @@ func (enc *ReEncoder) Write(str []byte) (int, error) {
}
return len(str), nil
}
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
@@ -250,7 +276,7 @@ func (enc *ReEncoder) WriteString(str string) (int, error) {
}
var n int
for {
- c, size, full := enc.getRuneFromString(str, n)
+ c, size, full, isRune := enc.getRuneFromString(str, n)
if !full {
if n < enc.bufLen {
l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
@@ -261,7 +287,13 @@ func (enc *ReEncoder) WriteString(str string) (int, error) {
}
return len(str), nil
}
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
@@ -298,7 +330,7 @@ func (enc *ReEncoder) Close() error {
return enc.err
}
if len(enc.barriers) == 0 {
- if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -312,7 +344,8 @@ func (enc *ReEncoder) Close() error {
return nil
}
-func (enc *ReEncoder) handleRune(c rune, size int) {
+// isRune=false indicates that 'c' is a raw byte from invalid UTF-8.
+func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) {
t, err := enc.par.HandleRune(c)
if err != nil {
enc.err = &ReEncodeSyntaxError{
@@ -321,7 +354,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) {
}
return
}
- if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -370,7 +403,7 @@ func (enc *ReEncoder) stackSize() int {
return sz
}
-func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error {
+func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error {
switch t {
case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU:
return nil
@@ -410,6 +443,10 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int
if t > jsonparse.RuneTypeEOF {
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
}
- return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize)
+ esc := BackslashEscapeNone
+ if !isRune {
+ esc = BackslashEscapeRawByte
+ }
+ return enc.out.HandleRune(c, t, esc, stackSize)
}
}