summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2023-02-25 16:17:01 -0700
committerLuke Shumaker <lukeshu@lukeshu.com>2023-02-25 17:51:36 -0700
commitf68498a6fdb421483d9aebb45527452f6255bb68 (patch)
treeeb2bd34a7c51a8df7239b0174a939ee64b136467
parent7301cd6c4e97b272bd708d4c87d26609510e6ca7 (diff)
jsonparse: Don't show raw bytes as Unicode
-rw-r--r--ReleaseNotes.md4
-rw-r--r--compat/json/compat_test.go8
-rw-r--r--compat/json/equiv_test.go29
-rw-r--r--compat/json/testcompat_test.go2
-rw-r--r--decode_scan.go23
-rw-r--r--internal/jsonparse/parse.go73
-rw-r--r--internal/jsonparse/parse_test.go2
-rw-r--r--reencode.go2
8 files changed, 90 insertions, 53 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 1981678..e047f7d 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -21,6 +21,10 @@
invalid UTF-8, include that byte value in the error message
rather than including the U+FFFD Unicode replacement character.
+ - Bugfix: Syntax errors on raw-bytes (for invalid UTF-8) no longer
+ show the raw byte as a `\u00XX` Unicode codepoint, but now as a
+ `\xXX` byte.
+
# v0.3.7 (2023-02-20)
Theme: Fixes from fuzzing (part 1?)
diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go
index cf9e359..2380c53 100644
--- a/compat/json/compat_test.go
+++ b/compat/json/compat_test.go
@@ -79,7 +79,7 @@ func TestCompatCompact(t *testing.T) {
"hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
"hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
"hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
- "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`},
+ "invalid-utf8": {In: "\x85", Err: `invalid character '\x85' looking for beginning of value`},
}
for tcName, tc := range testcases {
tc := tc
@@ -120,7 +120,7 @@ func TestCompatIndent(t *testing.T) {
"hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
"hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
"hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
- "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`},
+ "invalid-utf8": {In: "\x85", Err: `invalid character '\x85' looking for beginning of value`},
}
for tcName, tc := range testcases {
tc := tc
@@ -183,7 +183,7 @@ func TestCompatUnmarshal(t *testing.T) {
"two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`},
"two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`},
"two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`},
- "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`},
+ "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\x85' looking for beginning of value`},
// 2e308 is slightly more than math.MaxFloat64 (~1.79e308)
"obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
"ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
@@ -244,7 +244,7 @@ func TestCompatDecode(t *testing.T) {
"two-objs": {In: `{} {}`, ExpOut: map[string]any{}},
"two-numbers1": {In: `00`, ExpOut: float64(0)},
"two-numbers2": {In: `1 2`, ExpOut: float64(1)},
- "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`},
+ "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\x85' looking for beginning of value`},
// 2e308 is slightly more than math.MaxFloat64 (~1.79e308)
"obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
"ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`},
diff --git a/compat/json/equiv_test.go b/compat/json/equiv_test.go
index 246e4b3..cb02f43 100644
--- a/compat/json/equiv_test.go
+++ b/compat/json/equiv_test.go
@@ -44,8 +44,27 @@ func assertEquivErr(t *testing.T, stdErr, lowErr error) {
lowByte := lowMsg[len(prefix)]
if lowByte == '\\' {
switch lowMsg[len(prefix)+1] {
+ case 'a':
+ lowByte = '\a'
+ case 'b':
+ lowByte = '\b'
+ case 'f':
+ lowByte = '\f'
+ case 'n':
+ lowByte = '\n'
+ case 'r':
+ lowByte = '\r'
+ case 't':
+ lowByte = '\t'
+ case 'v':
+ lowByte = '\v'
+ case '\\', '\'':
+ lowByte = lowMsg[len(prefix)+1]
+ case 'x':
+ lowByte64, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:2], 16, 8)
+ lowByte = byte(lowByte64)
case 'u':
- lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 32)
+ lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 16)
var buf [4]byte
utf8.EncodeRune(buf[:], rune(lowRune))
lowByte = buf[0]
@@ -63,6 +82,14 @@ func assertEquivErr(t *testing.T, stdErr, lowErr error) {
stdErr = errors.New(stdMsg)
}
}
+
+ // I'd file a ticket for this, but @dsnet (one of the encoding/json maintainers) says that he's
+ // working on a parser-rewrite that would fix a bunch of this type of issue.
+ // https://github.com/golang/go/issues/58680#issuecomment-1444224084
+ if strings.HasPrefix(stdMsg, `invalid character '\u00`) && strings.HasPrefix(lowMsg, `invalid character '\x`) {
+ stdMsg = `invalid character '\x` + strings.TrimPrefix(stdMsg, `invalid character '\u00`)
+ stdErr = errors.New(stdMsg)
+ }
}
// Text-equal.
assert.Equal(t, stdErr.Error(), lowErr.Error())
diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go
index 73153d9..affcd7c 100644
--- a/compat/json/testcompat_test.go
+++ b/compat/json/testcompat_test.go
@@ -32,7 +32,7 @@ func checkValid(in []byte, scan *lowmemjson.ReEncoderConfig) error {
func isValidNumber(s string) bool {
var parser jsonparse.Parser
for _, r := range s {
- if t, _ := parser.HandleRune(r); !t.IsNumber() {
+ if t, _ := parser.HandleRune(r, true); !t.IsNumber() {
return false
}
}
diff --git a/decode_scan.go b/decode_scan.go
index 940de49..7ef3e71 100644
--- a/decode_scan.go
+++ b/decode_scan.go
@@ -23,10 +23,11 @@ type runeTypeScanner struct {
rTypeOK bool
repeat bool
- rRune rune
- rSize int
- rType jsonparse.RuneType
- rErr error
+ rRune rune
+ rSize int
+ rIsRune bool
+ rType jsonparse.RuneType
+ rErr error
}
// The returned error is a *ReadError, a *SyntaxError, or nil.
@@ -56,7 +57,7 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
sc.offset += int64(sc.rSize)
switch err {
case nil:
- invalidUTF8 := false
+ sc.rIsRune = true
if sc.rRune == utf8.RuneError && sc.rSize == 1 {
if bs, ok := sc.inner.(io.ByteScanner); ok {
_ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos
@@ -64,19 +65,16 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
_ = bs.UnreadByte()
_, _, _ = sc.inner.ReadRune()
sc.rRune = rune(b)
- invalidUTF8 = true
+ sc.rIsRune = false
}
}
- sc.rType, err = sc.parser.HandleRune(sc.rRune)
+ sc.rType, err = sc.parser.HandleRune(sc.rRune, sc.rIsRune)
if err != nil {
sc.rErr = &DecodeSyntaxError{
Offset: sc.offset - int64(sc.rSize),
Err: err,
}
} else {
- if invalidUTF8 {
- sc.rRune = utf8.RuneError
- }
sc.rErr = nil
}
switch sc.rType {
@@ -107,6 +105,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
}
}
sc.repeat = false
+ if sc.rSize > 0 && !sc.rIsRune {
+ return utf8.RuneError, sc.rSize, sc.rType, sc.rErr
+ }
return sc.rRune, sc.rSize, sc.rType, sc.rErr
}
@@ -139,7 +140,7 @@ func (sc *runeTypeScanner) PopReadBarrier() {
case sc.repeat:
// re-figure the rType and rErr
var err error
- sc.rType, err = sc.parser.HandleRune(sc.rRune)
+ sc.rType, err = sc.parser.HandleRune(sc.rRune, sc.rIsRune)
if err != nil {
sc.rErr = &DecodeSyntaxError{
Offset: sc.offset - int64(sc.rSize),
diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go
index 214e3ba..5547df4 100644
--- a/internal/jsonparse/parse.go
+++ b/internal/jsonparse/parse.go
@@ -15,12 +15,17 @@ import (
var ErrParserExceededMaxDepth = errors.New("exceeded max depth")
type InvalidCharacterError struct {
- Char rune
- Where string
+ Char rune
+ IsRune bool
+ Where string
}
func (e *InvalidCharacterError) Error() string {
- return fmt.Sprintf("invalid character %q %s", e.Char, e.Where)
+ if e.IsRune {
+ return fmt.Sprintf("invalid character %q %s", e.Char, e.Where)
+ } else {
+ return fmt.Sprintf("invalid character '\\x%02x' %s", e.Char, e.Where)
+ }
}
func isHex(c rune) bool {
@@ -520,7 +525,7 @@ func (par *Parser) HandleEOF() (RuneType, error) {
case 1:
switch {
case par.stack[0].IsNumber():
- if _, err := par.HandleRune('\n'); err == nil {
+ if _, err := par.HandleRune('\n', true); err == nil {
return RuneTypeEOF, nil
}
case par.stack[0] == runeTypeAny:
@@ -562,7 +567,7 @@ func (par *Parser) IsAtBarrier() bool {
// RuneTypeEOF indicates that the rune cannot be appended to the JSON
// document; a new JSON document must be started in order to process
// that rune.
-func (par *Parser) HandleRune(c rune) (RuneType, error) {
+func (par *Parser) HandleRune(c rune, isRune bool) (RuneType, error) {
if par.closed {
return RuneTypeError, iofs.ErrClosed
}
@@ -580,7 +585,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
if len(par.barriers) > 0 {
return RuneTypeEOF, nil
} else {
- return RuneTypeError, &InvalidCharacterError{c, "after top-level value"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "after top-level value"}
}
}
switch par.stack[len(par.stack)-1] {
@@ -614,7 +619,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
case 'n':
return par.replaceState(RuneTypeNullN), nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of value"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "looking for beginning of value"}
}
// object //////////////////////////////////////////////////////////////////////////////////
case RuneTypeObjectBeg: // waiting for key to start or '}'
@@ -628,7 +633,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
par.popState()
return RuneTypeObjectEnd, nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of object key string"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "looking for beginning of object key string"}
}
case RuneTypeStringEnd: // waiting for ':'
switch c {
@@ -639,7 +644,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
par.pushState(runeTypeAny)
return RuneTypeObjectColon, nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "after object key"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "after object key"}
}
case RuneTypeObjectComma: // waiting for ',' or '}'
switch c {
@@ -652,7 +657,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
par.popState()
return RuneTypeObjectEnd, nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "after object key:value pair"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "after object key:value pair"}
}
// array ///////////////////////////////////////////////////////////////////////////////////
case RuneTypeArrayBeg: // waiting for item to start or ']'
@@ -665,7 +670,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
default:
par.replaceState(RuneTypeArrayComma)
par.pushState(runeTypeAny)
- return par.HandleRune(c)
+ return par.HandleRune(c, isRune)
}
case RuneTypeArrayComma: // waiting for ',' or ']'
switch c {
@@ -678,7 +683,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
par.popState()
return RuneTypeArrayEnd, nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "after array element"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "after array element"}
}
// string //////////////////////////////////////////////////////////////////////////////////
case RuneTypeStringBeg: // waiting for char or '"'
@@ -691,7 +696,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
case 0x0020 <= c && c <= 0x10FFFF:
return RuneTypeStringChar, nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "in string literal"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "in string literal"}
}
case RuneTypeStringEsc: // waiting for escape char
switch c {
@@ -701,7 +706,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
case 'u':
return par.replaceState(RuneTypeStringEscU), nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "in string escape code"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "in string escape code"}
}
case RuneTypeStringEscU:
if !isHex(c) {
@@ -771,7 +776,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
case '1', '2', '3', '4', '5', '6', '7', '8', '9':
return par.replaceState(RuneTypeNumberIntDig), nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "in numeric literal"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "in numeric literal"}
}
case RuneTypeNumberIntZero: // C
switch c {
@@ -781,7 +786,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
return par.replaceState(RuneTypeNumberExpE), nil
default:
par.popState()
- return par.HandleRune(c)
+ return par.HandleRune(c, isRune)
}
case RuneTypeNumberIntDig: // D
switch c {
@@ -793,14 +798,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
return par.replaceState(RuneTypeNumberExpE), nil
default:
par.popState()
- return par.HandleRune(c)
+ return par.HandleRune(c, isRune)
}
case RuneTypeNumberFracDot: // E
switch c {
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return par.replaceState(RuneTypeNumberFracDig), nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "after decimal point in numeric literal"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "after decimal point in numeric literal"}
}
case RuneTypeNumberFracDig: // F
switch c {
@@ -810,7 +815,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
return par.replaceState(RuneTypeNumberExpE), nil
default:
par.popState()
- return par.HandleRune(c)
+ return par.HandleRune(c, isRune)
}
case RuneTypeNumberExpE: // G
switch c {
@@ -819,14 +824,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return par.replaceState(RuneTypeNumberExpDig), nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "in exponent of numeric literal"}
}
case RuneTypeNumberExpSign: // H
switch c {
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return par.replaceState(RuneTypeNumberExpDig), nil
default:
- return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, "in exponent of numeric literal"}
}
case RuneTypeNumberExpDig: // I
switch c {
@@ -834,40 +839,40 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) {
return par.replaceState(RuneTypeNumberExpDig), nil
default:
par.popState()
- return par.HandleRune(c)
+ return par.HandleRune(c, isRune)
}
// literals ////////////////////////////////////////////////////////////////////////////////
// true
case RuneTypeTrueT:
- return par.expectRune(c, 'r', RuneTypeTrueR, "true", false)
+ return par.expectRune(c, isRune, 'r', RuneTypeTrueR, "true", false)
case RuneTypeTrueR:
- return par.expectRune(c, 'u', RuneTypeTrueU, "true", false)
+ return par.expectRune(c, isRune, 'u', RuneTypeTrueU, "true", false)
case RuneTypeTrueU:
- return par.expectRune(c, 'e', RuneTypeTrueE, "true", true)
+ return par.expectRune(c, isRune, 'e', RuneTypeTrueE, "true", true)
// false
case RuneTypeFalseF:
- return par.expectRune(c, 'a', RuneTypeFalseA, "false", false)
+ return par.expectRune(c, isRune, 'a', RuneTypeFalseA, "false", false)
case RuneTypeFalseA:
- return par.expectRune(c, 'l', RuneTypeFalseL, "false", false)
+ return par.expectRune(c, isRune, 'l', RuneTypeFalseL, "false", false)
case RuneTypeFalseL:
- return par.expectRune(c, 's', RuneTypeFalseS, "false", false)
+ return par.expectRune(c, isRune, 's', RuneTypeFalseS, "false", false)
case RuneTypeFalseS:
- return par.expectRune(c, 'e', RuneTypeFalseE, "false", true)
+ return par.expectRune(c, isRune, 'e', RuneTypeFalseE, "false", true)
// null
case RuneTypeNullN:
- return par.expectRune(c, 'u', RuneTypeNullU, "null", false)
+ return par.expectRune(c, isRune, 'u', RuneTypeNullU, "null", false)
case RuneTypeNullU:
- return par.expectRune(c, 'l', RuneTypeNullL1, "null", false)
+ return par.expectRune(c, isRune, 'l', RuneTypeNullL1, "null", false)
case RuneTypeNullL1:
- return par.expectRune(c, 'l', RuneTypeNullL2, "null", true)
+ return par.expectRune(c, isRune, 'l', RuneTypeNullL2, "null", true)
default:
panic(fmt.Errorf(`should not happen: invalid stack: "%s"`, par.stackString()))
}
}
-func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) {
+func (par *Parser) expectRune(c rune, isRune bool, exp rune, typ RuneType, context string, pop bool) (RuneType, error) {
if c != exp {
- return RuneTypeError, &InvalidCharacterError{c, fmt.Sprintf("in literal %s (expecting %q)", context, exp)}
+ return RuneTypeError, &InvalidCharacterError{c, isRune, fmt.Sprintf("in literal %s (expecting %q)", context, exp)}
}
if pop {
par.popState()
diff --git a/internal/jsonparse/parse_test.go b/internal/jsonparse/parse_test.go
index e531daf..acb43e8 100644
--- a/internal/jsonparse/parse_test.go
+++ b/internal/jsonparse/parse_test.go
@@ -69,7 +69,7 @@ func TestParserHandleRune(t *testing.T) {
}
for i, r := range tc.Input {
assert.Equal(t, tc.ExpStack[i], par.stackString())
- _, err := par.HandleRune(r)
+ _, err := par.HandleRune(r, true)
assert.NoError(t, err)
assert.Equal(t, tc.ExpStack[i+1], par.stackString())
}
diff --git a/reencode.go b/reencode.go
index 7439bf0..8b08aad 100644
--- a/reencode.go
+++ b/reencode.go
@@ -352,7 +352,7 @@ func (enc *ReEncoder) Close() error {
// isRune=false indicates that 'c' is a raw byte from invalid UTF-8.
func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) {
- t, err := enc.par.HandleRune(c)
+ t, err := enc.par.HandleRune(c, isRune)
if err != nil {
enc.err = &ReEncodeSyntaxError{
Err: err,