diff options
author | Luke Shumaker <lukeshu@lukeshu.com> | 2023-02-25 16:17:01 -0700 |
---|---|---|
committer | Luke Shumaker <lukeshu@lukeshu.com> | 2023-02-25 17:51:36 -0700 |
commit | f68498a6fdb421483d9aebb45527452f6255bb68 (patch) | |
tree | eb2bd34a7c51a8df7239b0174a939ee64b136467 | |
parent | 7301cd6c4e97b272bd708d4c87d26609510e6ca7 (diff) |
jsonparse: Don't show raw bytes as Unicode
-rw-r--r-- | ReleaseNotes.md | 4 | ||||
-rw-r--r-- | compat/json/compat_test.go | 8 | ||||
-rw-r--r-- | compat/json/equiv_test.go | 29 | ||||
-rw-r--r-- | compat/json/testcompat_test.go | 2 | ||||
-rw-r--r-- | decode_scan.go | 23 | ||||
-rw-r--r-- | internal/jsonparse/parse.go | 73 | ||||
-rw-r--r-- | internal/jsonparse/parse_test.go | 2 | ||||
-rw-r--r-- | reencode.go | 2 |
8 files changed, 90 insertions, 53 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 1981678..e047f7d 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -21,6 +21,10 @@ invalid UTF-8, include that byte value in the error message rather than including the U+FFFD Unicode replacement character. + - Bugfix: Syntax errors on raw-bytes (for invalid UTF-8) no longer + show the raw byte as a `\u00XX` Unicode codepoint, but now as a + `\xXX` byte. + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index cf9e359..2380c53 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -79,7 +79,7 @@ func TestCompatCompact(t *testing.T) { "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, - "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\x85' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -120,7 +120,7 @@ func TestCompatIndent(t *testing.T) { "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, - "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\x85' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -183,7 +183,7 @@ func TestCompatUnmarshal(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`}, "two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`}, "two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`}, - "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\x85' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, @@ -244,7 +244,7 @@ func TestCompatDecode(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: map[string]any{}}, "two-numbers1": {In: `00`, ExpOut: float64(0)}, "two-numbers2": {In: `1 2`, ExpOut: float64(1)}, - "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\x85' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, diff --git a/compat/json/equiv_test.go b/compat/json/equiv_test.go index 246e4b3..cb02f43 100644 --- a/compat/json/equiv_test.go +++ b/compat/json/equiv_test.go @@ -44,8 +44,27 @@ func assertEquivErr(t *testing.T, stdErr, lowErr error) { lowByte := lowMsg[len(prefix)] if lowByte == '\\' { switch lowMsg[len(prefix)+1] { + case 'a': + lowByte = '\a' + case 'b': + lowByte = '\b' + case 'f': + lowByte = '\f' + case 'n': + lowByte = '\n' + case 'r': + lowByte = '\r' + case 't': + lowByte = '\t' + case 'v': + lowByte = '\v' + case '\\', '\'': + lowByte = lowMsg[len(prefix)+1] + case 'x': + lowByte64, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:2], 16, 8) + lowByte = byte(lowByte64) case 'u': - lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 32) + lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 16) var buf [4]byte utf8.EncodeRune(buf[:], rune(lowRune)) lowByte = buf[0] @@ -63,6 +82,14 @@ func assertEquivErr(t *testing.T, stdErr, lowErr error) { stdErr = errors.New(stdMsg) } } + + // I'd file a ticket for this, but @dsnet (one of the encoding/json maintainers) says that he's + // working on a parser-rewrite that would fix a bunch of this type of issue. + // https://github.com/golang/go/issues/58680#issuecomment-1444224084 + if strings.HasPrefix(stdMsg, `invalid character '\u00`) && strings.HasPrefix(lowMsg, `invalid character '\x`) { + stdMsg = `invalid character '\x` + strings.TrimPrefix(stdMsg, `invalid character '\u00`) + stdErr = errors.New(stdMsg) + } } // Text-equal. assert.Equal(t, stdErr.Error(), lowErr.Error()) diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go index 73153d9..affcd7c 100644 --- a/compat/json/testcompat_test.go +++ b/compat/json/testcompat_test.go @@ -32,7 +32,7 @@ func checkValid(in []byte, scan *lowmemjson.ReEncoderConfig) error { func isValidNumber(s string) bool { var parser jsonparse.Parser for _, r := range s { - if t, _ := parser.HandleRune(r); !t.IsNumber() { + if t, _ := parser.HandleRune(r, true); !t.IsNumber() { return false } } diff --git a/decode_scan.go b/decode_scan.go index 940de49..7ef3e71 100644 --- a/decode_scan.go +++ b/decode_scan.go @@ -23,10 +23,11 @@ type runeTypeScanner struct { rTypeOK bool repeat bool - rRune rune - rSize int - rType jsonparse.RuneType - rErr error + rRune rune + rSize int + rIsRune bool + rType jsonparse.RuneType + rErr error } // The returned error is a *ReadError, a *SyntaxError, or nil. @@ -56,7 +57,7 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) sc.offset += int64(sc.rSize) switch err { case nil: - invalidUTF8 := false + sc.rIsRune = true if sc.rRune == utf8.RuneError && sc.rSize == 1 { if bs, ok := sc.inner.(io.ByteScanner); ok { _ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos @@ -64,19 +65,16 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) _ = bs.UnreadByte() _, _, _ = sc.inner.ReadRune() sc.rRune = rune(b) - invalidUTF8 = true + sc.rIsRune = false } } - sc.rType, err = sc.parser.HandleRune(sc.rRune) + sc.rType, err = sc.parser.HandleRune(sc.rRune, sc.rIsRune) if err != nil { sc.rErr = &DecodeSyntaxError{ Offset: sc.offset - int64(sc.rSize), Err: err, } } else { - if invalidUTF8 { - sc.rRune = utf8.RuneError - } sc.rErr = nil } switch sc.rType { @@ -107,6 +105,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) } } sc.repeat = false + if sc.rSize > 0 && !sc.rIsRune { + return utf8.RuneError, sc.rSize, sc.rType, sc.rErr + } return sc.rRune, sc.rSize, sc.rType, sc.rErr } @@ -139,7 +140,7 @@ func (sc *runeTypeScanner) PopReadBarrier() { case sc.repeat: // re-figure the rType and rErr var err error - sc.rType, err = sc.parser.HandleRune(sc.rRune) + sc.rType, err = sc.parser.HandleRune(sc.rRune, sc.rIsRune) if err != nil { sc.rErr = &DecodeSyntaxError{ Offset: sc.offset - int64(sc.rSize), diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index 214e3ba..5547df4 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -15,12 +15,17 @@ import ( var ErrParserExceededMaxDepth = errors.New("exceeded max depth") type InvalidCharacterError struct { - Char rune - Where string + Char rune + IsRune bool + Where string } func (e *InvalidCharacterError) Error() string { - return fmt.Sprintf("invalid character %q %s", e.Char, e.Where) + if e.IsRune { + return fmt.Sprintf("invalid character %q %s", e.Char, e.Where) + } else { + return fmt.Sprintf("invalid character '\\x%02x' %s", e.Char, e.Where) + } } func isHex(c rune) bool { @@ -520,7 +525,7 @@ func (par *Parser) HandleEOF() (RuneType, error) { case 1: switch { case par.stack[0].IsNumber(): - if _, err := par.HandleRune('\n'); err == nil { + if _, err := par.HandleRune('\n', true); err == nil { return RuneTypeEOF, nil } case par.stack[0] == runeTypeAny: @@ -562,7 +567,7 @@ func (par *Parser) IsAtBarrier() bool { // RuneTypeEOF indicates that the rune cannot be appended to the JSON // document; a new JSON document must be started in order to process // that rune. -func (par *Parser) HandleRune(c rune) (RuneType, error) { +func (par *Parser) HandleRune(c rune, isRune bool) (RuneType, error) { if par.closed { return RuneTypeError, iofs.ErrClosed } @@ -580,7 +585,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { if len(par.barriers) > 0 { return RuneTypeEOF, nil } else { - return RuneTypeError, &InvalidCharacterError{c, "after top-level value"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after top-level value"} } } switch par.stack[len(par.stack)-1] { @@ -614,7 +619,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'n': return par.replaceState(RuneTypeNullN), nil default: - return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of value"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "looking for beginning of value"} } // object ////////////////////////////////////////////////////////////////////////////////// case RuneTypeObjectBeg: // waiting for key to start or '}' @@ -628,7 +633,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of object key string"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "looking for beginning of object key string"} } case RuneTypeStringEnd: // waiting for ':' switch c { @@ -639,7 +644,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.pushState(runeTypeAny) return RuneTypeObjectColon, nil default: - return RuneTypeError, &InvalidCharacterError{c, "after object key"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after object key"} } case RuneTypeObjectComma: // waiting for ',' or '}' switch c { @@ -652,7 +657,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, &InvalidCharacterError{c, "after object key:value pair"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after object key:value pair"} } // array /////////////////////////////////////////////////////////////////////////////////// case RuneTypeArrayBeg: // waiting for item to start or ']' @@ -665,7 +670,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { default: par.replaceState(RuneTypeArrayComma) par.pushState(runeTypeAny) - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeArrayComma: // waiting for ',' or ']' switch c { @@ -678,7 +683,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeArrayEnd, nil default: - return RuneTypeError, &InvalidCharacterError{c, "after array element"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after array element"} } // string ////////////////////////////////////////////////////////////////////////////////// case RuneTypeStringBeg: // waiting for char or '"' @@ -691,7 +696,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 0x0020 <= c && c <= 0x10FFFF: return RuneTypeStringChar, nil default: - return RuneTypeError, &InvalidCharacterError{c, "in string literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in string literal"} } case RuneTypeStringEsc: // waiting for escape char switch c { @@ -701,7 +706,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'u': return par.replaceState(RuneTypeStringEscU), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in string escape code"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in string escape code"} } case RuneTypeStringEscU: if !isHex(c) { @@ -771,7 +776,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberIntDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in numeric literal"} } case RuneTypeNumberIntZero: // C switch c { @@ -781,7 +786,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpE), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeNumberIntDig: // D switch c { @@ -793,14 +798,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpE), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeNumberFracDot: // E switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberFracDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "after decimal point in numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after decimal point in numeric literal"} } case RuneTypeNumberFracDig: // F switch c { @@ -810,7 +815,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpE), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeNumberExpE: // G switch c { @@ -819,14 +824,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in exponent of numeric literal"} } case RuneTypeNumberExpSign: // H switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in exponent of numeric literal"} } case RuneTypeNumberExpDig: // I switch c { @@ -834,40 +839,40 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpDig), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } // literals //////////////////////////////////////////////////////////////////////////////// // true case RuneTypeTrueT: - return par.expectRune(c, 'r', RuneTypeTrueR, "true", false) + return par.expectRune(c, isRune, 'r', RuneTypeTrueR, "true", false) case RuneTypeTrueR: - return par.expectRune(c, 'u', RuneTypeTrueU, "true", false) + return par.expectRune(c, isRune, 'u', RuneTypeTrueU, "true", false) case RuneTypeTrueU: - return par.expectRune(c, 'e', RuneTypeTrueE, "true", true) + return par.expectRune(c, isRune, 'e', RuneTypeTrueE, "true", true) // false case RuneTypeFalseF: - return par.expectRune(c, 'a', RuneTypeFalseA, "false", false) + return par.expectRune(c, isRune, 'a', RuneTypeFalseA, "false", false) case RuneTypeFalseA: - return par.expectRune(c, 'l', RuneTypeFalseL, "false", false) + return par.expectRune(c, isRune, 'l', RuneTypeFalseL, "false", false) case RuneTypeFalseL: - return par.expectRune(c, 's', RuneTypeFalseS, "false", false) + return par.expectRune(c, isRune, 's', RuneTypeFalseS, "false", false) case RuneTypeFalseS: - return par.expectRune(c, 'e', RuneTypeFalseE, "false", true) + return par.expectRune(c, isRune, 'e', RuneTypeFalseE, "false", true) // null case RuneTypeNullN: - return par.expectRune(c, 'u', RuneTypeNullU, "null", false) + return par.expectRune(c, isRune, 'u', RuneTypeNullU, "null", false) case RuneTypeNullU: - return par.expectRune(c, 'l', RuneTypeNullL1, "null", false) + return par.expectRune(c, isRune, 'l', RuneTypeNullL1, "null", false) case RuneTypeNullL1: - return par.expectRune(c, 'l', RuneTypeNullL2, "null", true) + return par.expectRune(c, isRune, 'l', RuneTypeNullL2, "null", true) default: panic(fmt.Errorf(`should not happen: invalid stack: "%s"`, par.stackString())) } } -func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { +func (par *Parser) expectRune(c rune, isRune bool, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { if c != exp { - return RuneTypeError, &InvalidCharacterError{c, fmt.Sprintf("in literal %s (expecting %q)", context, exp)} + return RuneTypeError, &InvalidCharacterError{c, isRune, fmt.Sprintf("in literal %s (expecting %q)", context, exp)} } if pop { par.popState() diff --git a/internal/jsonparse/parse_test.go b/internal/jsonparse/parse_test.go index e531daf..acb43e8 100644 --- a/internal/jsonparse/parse_test.go +++ b/internal/jsonparse/parse_test.go @@ -69,7 +69,7 @@ func TestParserHandleRune(t *testing.T) { } for i, r := range tc.Input { assert.Equal(t, tc.ExpStack[i], par.stackString()) - _, err := par.HandleRune(r) + _, err := par.HandleRune(r, true) assert.NoError(t, err) assert.Equal(t, tc.ExpStack[i+1], par.stackString()) } diff --git a/reencode.go b/reencode.go index 7439bf0..8b08aad 100644 --- a/reencode.go +++ b/reencode.go @@ -352,7 +352,7 @@ func (enc *ReEncoder) Close() error { // isRune=false indicates that 'c' is a raw byte from invalid UTF-8. func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { - t, err := enc.par.HandleRune(c) + t, err := enc.par.HandleRune(c, isRune) if err != nil { enc.err = &ReEncodeSyntaxError{ Err: err, |