summary | refs | log | tree | commit | diff
path: root/reencode.go
diff options
context:
space:
mode:
Diffstat (limited to 'reencode.go')
-rw-r--r-- reencode.go 163
1 files changed, 122 insertions, 41 deletions
diff --git a/reencode.go b/reencode.go
index 0745c43..7439bf0 100644
--- a/reencode.go
+++ b/reencode.go
@@ -54,6 +54,17 @@ type ReEncoderConfig struct {
// this is different than the usual behavior.
ForceTrailingNewlines bool
+ // CompactFloats causes the *ReEncoder to trim unnecessary '0'
+ // digits from floating-point number values.
+ CompactFloats bool
+
+ // A JSON document is specified to be a sequence of Unicode
+ // codepoints; InvalidUTF8 controls how the *ReEncoder behaves
+ // when it encounters invalid UTF-8 bytes in a JSON string
+ // (i.e. the string is not representable as a sequence of
+ // Unicode codepoints, and thus the document is invalid JSON).
+ InvalidUTF8 InvalidUTF8Mode
+
// Returns whether a given character in a string should be
// backslash-escaped. The bool argument is whether it was
// \u-escaped in the input. This does not affect characters
@@ -102,8 +113,10 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
}
// Numbers
- module = &reEncodeCompactNum{
- out: module,
+ if cfg.CompactFloats {
+ module = &reEncodeCompactNum{
+ out: module,
+ }
}
// Strings
@@ -119,6 +132,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
return &ReEncoder{
out: module,
esc: escaper,
+ utf: cfg.InvalidUTF8,
allowMultipleValues: cfg.AllowMultipleValues,
}
}
@@ -134,6 +148,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
type ReEncoder struct {
out reEncoderModule
esc BackslashEscaper
+ utf InvalidUTF8Mode
allowMultipleValues bool
// state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer
@@ -169,6 +184,57 @@ var (
_ io.Closer = (*ReEncoder)(nil)
)
+func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) {
+ var tmp []byte
+ if pos < enc.bufLen {
+ var buf [utf8.UTFMax]byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
+ } else {
+ tmp = str[pos-enc.bufLen:]
+ }
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
+ }
+}
+
+func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) {
+ if pos < enc.bufLen {
+ var buf [utf8.UTFMax]byte
+ var tmp []byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
+ }
+ } else {
+ tmp := str[pos-enc.bufLen:]
+ c, size := utf8.DecodeRuneInString(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
+ }
+ }
+}
+
// Write implements io.Writer; it does what you'd expect.
//
// It is worth noting that Write returns the number of bytes consumed
@@ -177,59 +243,68 @@ var (
// but *ReEncoder does because it transforms the data written to it,
// and the number of bytes written may be wildly different than the
// number of bytes handled.
-func (enc *ReEncoder) Write(p []byte) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) Write(str []byte) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full, isRune := enc.getRuneFromBytes(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRune(p[n:]) {
- c, size := utf8.DecodeRune(p[n:])
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
n += size
}
- enc.bufLen = copy(enc.buf[:], p[n:])
- return len(p), nil
}
// WriteString implements io.StringWriter; it does what you'd expect,
// but see the notes on the Write method.
-func (enc *ReEncoder) WriteString(p string) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) WriteString(str string) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full, isRune := enc.getRuneFromString(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRuneInString(p[n:]) {
- c, size := utf8.DecodeRuneInString(p[n:])
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
n += size
}
- return len(p), nil
}
// WriteByte implements io.ByteWriter; it does what you'd expect.
@@ -261,7 +336,7 @@ func (enc *ReEncoder) Close() error {
return enc.err
}
if len(enc.barriers) == 0 {
- if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -275,7 +350,8 @@ func (enc *ReEncoder) Close() error {
return nil
}
-func (enc *ReEncoder) handleRune(c rune, size int) {
+// isRune=false indicates that 'c' is a raw byte from invalid UTF-8.
+func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) {
t, err := enc.par.HandleRune(c)
if err != nil {
enc.err = &ReEncodeSyntaxError{
@@ -284,7 +360,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) {
}
return
}
- if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -327,13 +403,13 @@ func (enc *ReEncoder) popWriteBarrier() {
func (enc *ReEncoder) stackSize() int {
sz := enc.par.StackSize()
- for _, barrier := range enc.barriers {
- sz += barrier.stackSize
+ if len(enc.barriers) > 0 {
+ sz += enc.barriers[len(enc.barriers)-1].stackSize
}
return sz
}
-func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error {
+func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error {
switch t {
case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU:
return nil
@@ -365,14 +441,19 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int
enc.uhex[2] = byte(c)
return nil
case jsonparse.RuneTypeStringEscUD:
+ mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
- return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeUnicode, stackSize)
+ return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize)
case jsonparse.RuneTypeError:
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
default:
if t > jsonparse.RuneTypeEOF {
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
}
- return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize)
+ esc := BackslashEscapeNone
+ if !isRune {
+ esc = BackslashEscapeRawByte
+ }
+ return enc.out.HandleRune(c, t, esc, stackSize)
}
}