summary | refs | log | tree | commit | diff
path: root/reencode.go
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-14 22:36:25 -0700
committer: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-18 22:45:54 -0700
commit: dfc67cecbd95344d296c31b537fa3ae8aec8c292 (patch)
tree: 1e2e820cbd288d1ebef7b0e9dea14a07e2f33fc5 /reencode.go
parent: 38989a9c4f69abfe04c3eb4ec3382be88802141c (diff)
encode, reencode: Fix handling of invalid UTF-8
Diffstat (limited to 'reencode.go')
-rw-r--r-- reencode.go 107
1 files changed, 72 insertions, 35 deletions
diff --git a/reencode.go b/reencode.go
index fd848f8..1a9999b 100644
--- a/reencode.go
+++ b/reencode.go
@@ -54,6 +54,13 @@ type ReEncoderConfig struct {
// this is different than the usual behavior.
ForceTrailingNewlines bool
+ // A JSON document is specified to be a sequence of Unicode
+ // codepoints; InvalidUTF8 controls how the *ReEncoder behaves
+ // when it encounters invalid UTF-8 bytes in a JSON string
+ // (i.e. the string is not representable as a sequence of
+ // Unicode codepoints, and thus the document is invalid JSON).
+ InvalidUTF8 InvalidUTF8Mode
+
// Returns whether a given character in a string should be
// backslash-escaped. The bool argument is whether it was
// \u-escaped in the input. This does not affect characters
@@ -119,6 +126,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
return &ReEncoder{
out: module,
esc: escaper,
+ utf: cfg.InvalidUTF8,
allowMultipleValues: cfg.AllowMultipleValues,
}
}
@@ -134,6 +142,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
type ReEncoder struct {
out reEncoderModule
esc BackslashEscaper
+ utf InvalidUTF8Mode
allowMultipleValues bool
// state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer
@@ -169,43 +178,54 @@ var (
_ io.Closer = (*ReEncoder)(nil)
)
-func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) {
+func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) {
+ var tmp []byte
if pos < enc.bufLen {
- var tmp [utf8.UTFMax]byte
- n := copy(tmp[:], enc.buf[pos:enc.bufLen])
- n += copy(tmp[n:], str)
- c, size := utf8.DecodeRune(tmp[:n])
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRune(tmp[:n])
- }
- return c, size, true
+ var buf [utf8.UTFMax]byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
} else {
- tmp := str[pos-enc.bufLen:]
- c, size := utf8.DecodeRune(tmp)
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRune(tmp)
- }
- return c, size, true
+ tmp = str[pos-enc.bufLen:]
+ }
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
}
}
-func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) {
+func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) {
if pos < enc.bufLen {
- var tmp [utf8.UTFMax]byte
- n := copy(tmp[:], enc.buf[pos:enc.bufLen])
- n += copy(tmp[n:], str)
- c, size := utf8.DecodeRune(tmp[:n])
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRune(tmp[:n])
+ var buf [utf8.UTFMax]byte
+ var tmp []byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
}
- return c, size, true
} else {
tmp := str[pos-enc.bufLen:]
c, size := utf8.DecodeRuneInString(tmp)
- if c == utf8.RuneError && size <= 1 {
- return c, size, utf8.FullRuneInString(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
}
- return c, size, true
}
}
@@ -223,7 +243,7 @@ func (enc *ReEncoder) Write(str []byte) (int, error) {
}
var n int
for {
- c, size, full := enc.getRuneFromBytes(str, n)
+ c, size, full, isRune := enc.getRuneFromBytes(str, n)
if !full {
if n < enc.bufLen {
l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
@@ -234,7 +254,13 @@ func (enc *ReEncoder) Write(str []byte) (int, error) {
}
return len(str), nil
}
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
@@ -250,7 +276,7 @@ func (enc *ReEncoder) WriteString(str string) (int, error) {
}
var n int
for {
- c, size, full := enc.getRuneFromString(str, n)
+ c, size, full, isRune := enc.getRuneFromString(str, n)
if !full {
if n < enc.bufLen {
l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
@@ -261,7 +287,13 @@ func (enc *ReEncoder) WriteString(str string) (int, error) {
}
return len(str), nil
}
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
@@ -298,7 +330,7 @@ func (enc *ReEncoder) Close() error {
return enc.err
}
if len(enc.barriers) == 0 {
- if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -312,7 +344,8 @@ func (enc *ReEncoder) Close() error {
return nil
}
-func (enc *ReEncoder) handleRune(c rune, size int) {
+// isRune=false indicates that 'c' is a raw byte from invalid UTF-8.
+func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) {
t, err := enc.par.HandleRune(c)
if err != nil {
enc.err = &ReEncodeSyntaxError{
@@ -321,7 +354,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) {
}
return
}
- if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -370,7 +403,7 @@ func (enc *ReEncoder) stackSize() int {
return sz
}
-func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error {
+func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error {
switch t {
case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU:
return nil
@@ -410,6 +443,10 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int
if t > jsonparse.RuneTypeEOF {
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
}
- return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize)
+ esc := BackslashEscapeNone
+ if !isRune {
+ esc = BackslashEscapeRawByte
+ }
+ return enc.out.HandleRune(c, t, esc, stackSize)
}
}