diff options
Diffstat (limited to 'reencode.go')
-rw-r--r-- | reencode.go | 163 |
1 files changed, 122 insertions, 41 deletions
diff --git a/reencode.go b/reencode.go index 0745c43..7439bf0 100644 --- a/reencode.go +++ b/reencode.go @@ -54,6 +54,17 @@ type ReEncoderConfig struct { // this is different than the usual behavior. ForceTrailingNewlines bool + // CompactFloats causes the *ReEncoder to trim unnecessary '0' + // digits from floating-point number values. + CompactFloats bool + + // A JSON document is specified to be a sequence of Unicode + // codepoints; InvalidUTF8 controls how the *ReEncoder behaves + // when it encounters invalid UTF-8 bytes in a JSON string + // (i.e. the string is not representable as a sequence of + // Unicode codepoints, and thus the document is invalid JSON). + InvalidUTF8 InvalidUTF8Mode + // Returns whether a given character in a string should be // backslash-escaped. The bool argument is whether it was // \u-escaped in the input. This does not affect characters @@ -102,8 +113,10 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { } // Numbers - module = &reEncodeCompactNum{ - out: module, + if cfg.CompactFloats { + module = &reEncodeCompactNum{ + out: module, + } } // Strings @@ -119,6 +132,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { return &ReEncoder{ out: module, esc: escaper, + utf: cfg.InvalidUTF8, allowMultipleValues: cfg.AllowMultipleValues, } } @@ -134,6 +148,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { type ReEncoder struct { out reEncoderModule esc BackslashEscaper + utf InvalidUTF8Mode allowMultipleValues bool // state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer @@ -169,6 +184,57 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) { + var tmp []byte + if pos < enc.bufLen { + var buf [utf8.UTFMax]byte + n := copy(buf[:], enc.buf[pos:enc.bufLen]) + n += copy(buf[n:], str) + tmp = buf[:n] + } else { + tmp = str[pos-enc.bufLen:] + } + c, size = utf8.DecodeRune(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true + } +} + +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) { + if pos < enc.bufLen { + var buf [utf8.UTFMax]byte + var tmp []byte + n := copy(buf[:], enc.buf[pos:enc.bufLen]) + n += copy(buf[n:], str) + tmp = buf[:n] + c, size = utf8.DecodeRune(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true + } + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRuneInString(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true + } + } +} + // Write implements io.Writer; it does what you'd expect. // // It is worth noting that Write returns the number of bytes consumed @@ -177,59 +243,68 @@ var ( // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. -func (enc *ReEncoder) Write(p []byte) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) Write(str []byte) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full, isRune := enc.getRuneFromBytes(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRune(p[n:]) { - c, size := utf8.DecodeRune(p[n:]) - enc.handleRune(c, size) + if enc.utf == InvalidUTF8Error && !isRune { + return n, &ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("invalid UTF-8: %#02x", c), + } + } + enc.handleRune(c, size, isRune) if enc.err != nil { return n, enc.err } n += size } - enc.bufLen = copy(enc.buf[:], p[n:]) - return len(p), nil } // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. -func (enc *ReEncoder) WriteString(p string) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) WriteString(str string) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full, isRune := enc.getRuneFromString(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRuneInString(p[n:]) { - c, size := utf8.DecodeRuneInString(p[n:]) - enc.handleRune(c, size) + if enc.utf == InvalidUTF8Error && !isRune { + return n, &ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("invalid UTF-8: %#02x", c), + } + } + enc.handleRune(c, size, isRune) if enc.err != nil { return n, enc.err } n += size } - return len(p), nil } // WriteByte implements io.ByteWriter; it does what you'd expect. @@ -261,7 +336,7 @@ func (enc *ReEncoder) Close() error { return enc.err } if len(enc.barriers) == 0 { - if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil { + if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, @@ -275,7 +350,8 @@ func (enc *ReEncoder) Close() error { return nil } -func (enc *ReEncoder) handleRune(c rune, size int) { +// isRune=false indicates that 'c' is a raw byte from invalid UTF-8. +func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { t, err := enc.par.HandleRune(c) if err != nil { enc.err = &ReEncodeSyntaxError{ @@ -284,7 +360,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) { } return } - if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil { + if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, @@ -327,13 +403,13 @@ func (enc *ReEncoder) popWriteBarrier() { func (enc *ReEncoder) stackSize() int { sz := enc.par.StackSize() - for _, barrier := range enc.barriers { - sz += barrier.stackSize + if len(enc.barriers) > 0 { + sz += enc.barriers[len(enc.barriers)-1].stackSize } return sz } -func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error { +func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error { switch t { case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU: return nil @@ -365,14 +441,19 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int enc.uhex[2] = byte(c) return nil case jsonparse.RuneTypeStringEscUD: + mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) - return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeUnicode, stackSize) + return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize) case jsonparse.RuneTypeError: panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) default: if t > jsonparse.RuneTypeEOF { panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) } - return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize) + esc := BackslashEscapeNone + if !isRune { + esc = BackslashEscapeRawByte + } + return enc.out.HandleRune(c, t, esc, stackSize) } } |