summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-23 21:30:12 -0700
committer: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-25 01:18:59 -0700
commit: 051f966039028d257f27fc3a42c10cbff9f7c738 (patch)
tree: 6ae73810c4a2959a23294c6e46d13cc7fb7034be
parent: d35495540df2b6d3ba16c84ce21627d9dbae000c (diff)
decode: Include the invalid UTF-8 byte in error messages
-rw-r--r-- ReleaseNotes.md 4
-rw-r--r-- compat/json/compat.go 33
-rw-r--r-- compat/json/compat_test.go 46
-rw-r--r-- compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b 2
-rw-r--r-- decode_scan.go 15
5 files changed, 75 insertions, 25 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 48982e4..af2adcc 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -14,6 +14,10 @@
then the first type error encountered is returned. This is
consistent with the behavior of `encoding/json`.
+ - Bugfix: Decoder: If there is a syntax error in a byte that is
+ invalid UTF-8, include that byte value in the error message
+ rather than including the U+FFFD Unicode replacement character.
+
# v0.3.7 (2023-02-20)
Theme: Fixes from fuzzing (part 1?)
diff --git a/compat/json/compat.go b/compat/json/compat.go
index c2d47c0..6f13fbb 100644
--- a/compat/json/compat.go
+++ b/compat/json/compat.go
@@ -329,7 +329,10 @@ func Unmarshal(data []byte, ptr any) error {
}
type teeRuneScanner struct {
- src io.RuneScanner
+ src interface {
+ io.RuneScanner
+ io.ByteScanner
+ }
dst *bytes.Buffer
lastSize int
}
@@ -337,11 +340,14 @@ type teeRuneScanner struct {
func (tee *teeRuneScanner) ReadRune() (r rune, size int, err error) {
r, size, err = tee.src.ReadRune()
if err == nil {
- if _, err := tee.dst.WriteRune(r); err != nil {
- return 0, 0, err
+ if r == utf8.RuneError && size == 1 {
+ _ = tee.src.UnreadRune()
+ b, _ := tee.src.ReadByte()
+ _ = tee.dst.WriteByte(b)
+ } else {
+ _, _ = tee.dst.WriteRune(r)
}
}
-
tee.lastSize = size
return
}
@@ -356,6 +362,25 @@ func (tee *teeRuneScanner) UnreadRune() error {
return nil
}
+func (tee *teeRuneScanner) ReadByte() (b byte, err error) {
+ b, err = tee.src.ReadByte()
+ if err == nil {
+ _ = tee.dst.WriteByte(b)
+ tee.lastSize = 1
+ }
+ return
+}
+
+func (tee *teeRuneScanner) UnreadByte() error {
+ if tee.lastSize != 1 {
+ return lowmemjson.ErrInvalidUnreadRune
+ }
+ _ = tee.src.UnreadByte()
+ tee.dst.Truncate(tee.dst.Len() - tee.lastSize)
+ tee.lastSize = 0
+ return nil
+}
+
type Decoder struct {
validatorBuf *bufio.Reader
validator *lowmemjson.Decoder
diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go
index 098ac85..6aab103 100644
--- a/compat/json/compat_test.go
+++ b/compat/json/compat_test.go
@@ -72,13 +72,14 @@ func TestCompatCompact(t *testing.T) {
Err string
}
testcases := map[string]testcase{
- "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
- "object": {In: `{}`, Out: `{}`},
- "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
- "float": {In: `1.200e003`, Out: `1.200e003`},
- "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
- "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
- "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+ "float": {In: `1.200e003`, Out: `1.200e003`},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`},
}
for tcName, tc := range testcases {
tc := tc
@@ -105,20 +106,21 @@ func TestCompatIndent(t *testing.T) {
Err string
}
testcases := map[string]testcase{
- "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
- "object": {In: `{}`, Out: `{}`},
- "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
- "float": {In: `1.200e003`, Out: `1.200e003`},
- "tailws0": {In: `0`, Out: `0`},
- "tailws1": {In: `0 `, Out: `0 `},
- "tailws2": {In: `0 `, Out: `0 `},
- "tailws3": {In: "0\n", Out: "0\n"},
- "headws1": {In: ` 0`, Out: `0`},
- "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"},
- "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
- "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
- "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
- "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+ "float": {In: `1.200e003`, Out: `1.200e003`},
+ "tailws0": {In: `0`, Out: `0`},
+ "tailws1": {In: `0 `, Out: `0 `},
+ "tailws2": {In: `0 `, Out: `0 `},
+ "tailws3": {In: "0\n", Out: "0\n"},
+ "headws1": {In: ` 0`, Out: `0`},
+ "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"},
+ "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`},
}
for tcName, tc := range testcases {
tc := tc
@@ -181,6 +183,7 @@ func TestCompatUnmarshal(t *testing.T) {
"two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`},
"two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`},
"two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`},
+ "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`},
// 2e308 is slightly more than math.MaxFloat64 (~1.79e308)
"obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
"ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
@@ -223,6 +226,7 @@ func TestCompatDecode(t *testing.T) {
"two-objs": {In: `{} {}`, ExpOut: map[string]any{}},
"two-numbers1": {In: `00`, ExpOut: float64(0)},
"two-numbers2": {In: `1 2`, ExpOut: float64(1)},
+ "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`},
// 2e308 is slightly more than math.MaxFloat64 (~1.79e308)
"obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
"ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
diff --git a/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b
new file mode 100644
index 0000000..bb8752b
--- /dev/null
+++ b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b
@@ -0,0 +1,2 @@
+go test fuzz v1
+[]byte("\x85")
diff --git a/decode_scan.go b/decode_scan.go
index 63694c4..940de49 100644
--- a/decode_scan.go
+++ b/decode_scan.go
@@ -6,6 +6,7 @@ package lowmemjson
import (
"io"
+ "unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/jsonparse"
)
@@ -55,6 +56,17 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
sc.offset += int64(sc.rSize)
switch err {
case nil:
+ invalidUTF8 := false
+ if sc.rRune == utf8.RuneError && sc.rSize == 1 {
+ if bs, ok := sc.inner.(io.ByteScanner); ok {
+ _ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos
+ b, _ := bs.ReadByte()
+ _ = bs.UnreadByte()
+ _, _, _ = sc.inner.ReadRune()
+ sc.rRune = rune(b)
+ invalidUTF8 = true
+ }
+ }
sc.rType, err = sc.parser.HandleRune(sc.rRune)
if err != nil {
sc.rErr = &DecodeSyntaxError{
@@ -62,6 +74,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
Err: err,
}
} else {
+ if invalidUTF8 {
+ sc.rRune = utf8.RuneError
+ }
sc.rErr = nil
}
switch sc.rType {