decode: Include the invalid UTF-8 byte in error messages

author: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-23 21:30:12 -0700
committer: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-25 01:18:59 -0700
commit: 051f966039028d257f27fc3a42c10cbff9f7c738 (patch)
tree: 6ae73810c4a2959a23294c6e46d13cc7fb7034be
parent: d35495540df2b6d3ba16c84ce21627d9dbae000c (diff)
5 files changed, 75 insertions, 25 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 48982e4..af2adcc 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -14,6 +14,10 @@
      then the first type error encountered is returned.  This is
      consistent with the behavior of `encoding/json`.
 
+   - Bugfix: Decoder: If there is a syntax error in a byte that is
+     invalid UTF-8, include that byte value in the error message
+     rather than including the U+FFFD Unicode replacement character.
+
 # v0.3.7 (2023-02-20)
 
   Theme: Fixes from fuzzing (part 1?)
diff --git a/compat/json/compat.go b/compat/json/compat.go
index c2d47c0..6f13fbb 100644
--- a/compat/json/compat.go
+++ b/compat/json/compat.go
@@ -329,7 +329,10 @@ func Unmarshal(data []byte, ptr any) error {
 }
 
 type teeRuneScanner struct {
-	src      io.RuneScanner
+	src interface {
+		io.RuneScanner
+		io.ByteScanner
+	}
 	dst      *bytes.Buffer
 	lastSize int
 }
@@ -337,11 +340,14 @@ type teeRuneScanner struct {
 func (tee *teeRuneScanner) ReadRune() (r rune, size int, err error) {
 	r, size, err = tee.src.ReadRune()
 	if err == nil {
-		if _, err := tee.dst.WriteRune(r); err != nil {
-			return 0, 0, err
+		if r == utf8.RuneError && size == 1 {
+			_ = tee.src.UnreadRune()
+			b, _ := tee.src.ReadByte()
+			_ = tee.dst.WriteByte(b)
+		} else {
+			_, _ = tee.dst.WriteRune(r)
 		}
 	}
-
 	tee.lastSize = size
 	return
 }
@@ -356,6 +362,25 @@ func (tee *teeRuneScanner) UnreadRune() error {
 	return nil
 }
 
+func (tee *teeRuneScanner) ReadByte() (b byte, err error) {
+	b, err = tee.src.ReadByte()
+	if err == nil {
+		_ = tee.dst.WriteByte(b)
+		tee.lastSize = 1
+	}
+	return
+}
+
+func (tee *teeRuneScanner) UnreadByte() error {
+	if tee.lastSize != 1 {
+		return lowmemjson.ErrInvalidUnreadRune
+	}
+	_ = tee.src.UnreadByte()
+	tee.dst.Truncate(tee.dst.Len() - tee.lastSize)
+	tee.lastSize = 0
+	return nil
+}
+
 type Decoder struct {
 	validatorBuf *bufio.Reader
 	validator    *lowmemjson.Decoder
diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go
index 098ac85..6aab103 100644
--- a/compat/json/compat_test.go
+++ b/compat/json/compat_test.go
@@ -72,13 +72,14 @@ func TestCompatCompact(t *testing.T) {
 		Err string
 	}
 	testcases := map[string]testcase{
-		"trunc":     {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
-		"object":    {In: `{}`, Out: `{}`},
-		"non-utf8":  {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
-		"float":     {In: `1.200e003`, Out: `1.200e003`},
-		"hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
-		"hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
-		"hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+		"trunc":        {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+		"object":       {In: `{}`, Out: `{}`},
+		"non-utf8":     {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+		"float":        {In: `1.200e003`, Out: `1.200e003`},
+		"hex-lower":    {In: `"\uabcd"`, Out: `"\uabcd"`},
+		"hex-upper":    {In: `"\uABCD"`, Out: `"\uABCD"`},
+		"hex-mixed":    {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+		"invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`},
 	}
 	for tcName, tc := range testcases {
 		tc := tc
@@ -105,20 +106,21 @@ func TestCompatIndent(t *testing.T) {
 		Err string
 	}
 	testcases := map[string]testcase{
-		"trunc":     {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
-		"object":    {In: `{}`, Out: `{}`},
-		"non-utf8":  {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
-		"float":     {In: `1.200e003`, Out: `1.200e003`},
-		"tailws0":   {In: `0`, Out: `0`},
-		"tailws1":   {In: `0 `, Out: `0 `},
-		"tailws2":   {In: `0  `, Out: `0  `},
-		"tailws3":   {In: "0\n", Out: "0\n"},
-		"headws1":   {In: ` 0`, Out: `0`},
-		"objws1":    {In: `{"a"  :  1}`, Out: "{\n>.\"a\": 1\n>}"},
-		"objws2":    {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
-		"hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
-		"hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
-		"hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+		"trunc":        {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+		"object":       {In: `{}`, Out: `{}`},
+		"non-utf8":     {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+		"float":        {In: `1.200e003`, Out: `1.200e003`},
+		"tailws0":      {In: `0`, Out: `0`},
+		"tailws1":      {In: `0 `, Out: `0 `},
+		"tailws2":      {In: `0  `, Out: `0  `},
+		"tailws3":      {In: "0\n", Out: "0\n"},
+		"headws1":      {In: ` 0`, Out: `0`},
+		"objws1":       {In: `{"a"  :  1}`, Out: "{\n>.\"a\": 1\n>}"},
+		"objws2":       {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
+		"hex-lower":    {In: `"\uabcd"`, Out: `"\uabcd"`},
+		"hex-upper":    {In: `"\uABCD"`, Out: `"\uABCD"`},
+		"hex-mixed":    {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+		"invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`},
 	}
 	for tcName, tc := range testcases {
 		tc := tc
@@ -181,6 +183,7 @@ func TestCompatUnmarshal(t *testing.T) {
 		"two-objs":             {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`},
 		"two-numbers1":         {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`},
 		"two-numbers2":         {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`},
+		"invalid-utf8":         {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`},
 		// 2e308 is slightly more than math.MaxFloat64 (~1.79e308)
 		"obj-overflow":      {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
 		"ary-overflow":      {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
@@ -223,6 +226,7 @@ func TestCompatDecode(t *testing.T) {
 		"two-objs":             {In: `{} {}`, ExpOut: map[string]any{}},
 		"two-numbers1":         {In: `00`, ExpOut: float64(0)},
 		"two-numbers2":         {In: `1 2`, ExpOut: float64(1)},
+		"invalid-utf8":         {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`},
 		// 2e308 is slightly more than math.MaxFloat64 (~1.79e308)
 		"obj-overflow":      {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
 		"ary-overflow":      {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
diff --git a/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b
new file mode 100644
index 0000000..bb8752b
--- /dev/null
+++ b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b
@@ -0,0 +1,2 @@
+go test fuzz v1
+[]byte("\x85")
diff --git a/decode_scan.go b/decode_scan.go
index 63694c4..940de49 100644
--- a/decode_scan.go
+++ b/decode_scan.go
@@ -6,6 +6,7 @@ package lowmemjson
 
 import (
 	"io"
+	"unicode/utf8"
 
 	"git.lukeshu.com/go/lowmemjson/internal/jsonparse"
 )
@@ -55,6 +56,17 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
 		sc.offset += int64(sc.rSize)
 		switch err {
 		case nil:
+			invalidUTF8 := false
+			if sc.rRune == utf8.RuneError && sc.rSize == 1 {
+				if bs, ok := sc.inner.(io.ByteScanner); ok {
+					_ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos
+					b, _ := bs.ReadByte()
+					_ = bs.UnreadByte()
+					_, _, _ = sc.inner.ReadRune()
+					sc.rRune = rune(b)
+					invalidUTF8 = true
+				}
+			}
 			sc.rType, err = sc.parser.HandleRune(sc.rRune)
 			if err != nil {
 				sc.rErr = &DecodeSyntaxError{
@@ -62,6 +74,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
 					Err:    err,
 				}
 			} else {
+				if invalidUTF8 {
+					sc.rRune = utf8.RuneError
+				}
 				sc.rErr = nil
 			}
 			switch sc.rType {
author	Luke Shumaker <lukeshu@lukeshu.com>	2023-02-23 21:30:12 -0700
committer	Luke Shumaker <lukeshu@lukeshu.com>	2023-02-25 01:18:59 -0700
commit	051f966039028d257f27fc3a42c10cbff9f7c738 (patch)
tree	6ae73810c4a2959a23294c6e46d13cc7fb7034be
parent	d35495540df2b6d3ba16c84ce21627d9dbae000c (diff)