author    Luke Shumaker <lukeshu@lukeshu.com>    2023-02-20 12:47:10 -0700
committer Luke Shumaker <lukeshu@lukeshu.com>    2023-02-20 12:47:10 -0700
commit    f5ca3478c68e47ae20fd12748c1552fdf81f75f9 (patch)
tree      b3d3f889ed25084fe33ed9e01554d6ca51104bb5
parent    d240d0b06c7b5711f583d961eddfc37d07d4546e (diff)
parent    49ee8be679add0bd3cf08a2669331b3be7a835f8 (diff)
Merge branch 'lukeshu/fixes'
-rw-r--r--  ReleaseNotes.md                       |  83
-rw-r--r--  compat/json/compat.go                 | 150
-rw-r--r--  compat/json/compat_test.go            | 241
-rw-r--r--  compat/json/testcompat_test.go        |  14
-rw-r--r--  decode.go                             |   2
-rw-r--r--  decode_scan.go                        |   6
-rw-r--r--  encode.go                             |  34
-rw-r--r--  encode_escape.go                      |  76
-rw-r--r--  errors.go                             |   3
-rw-r--r--  internal/jsonparse/parse.go           |  15
-rw-r--r--  internal/jsonstring/encode_string.go  | 121
-rw-r--r--  reencode.go                           | 163
-rw-r--r--  reencode_compactnum.go                |  12
-rw-r--r--  reencode_test.go                      |  97
14 files changed, 894 insertions, 123 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 081644e..5e8dab7 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,3 +1,86 @@
+# v0.3.7 (TBD)
+
+ Theme: TBD
+
+ User-facing changes:
+
+ - General Changes:
+
+    + Encoder, ReEncoder: Now correctly trim the unnecessary
+      trailing '0's from the fraction-part when compacting numbers.
+
+    + ReEncoder: No longer compacts floating-point numbers by
+      default; a new `CompactFloats` ReEncoderConfig option controls
+      this (see the sketch at the end of this section).
+
+ + Decoder: Decoding `json.Unmarshaler` or `lowmemjson.Decodable`
+ as a top-level value no longer needs to read past the closing
+ `"`/`]`/`}`; this can be significant when reading streaming
+ input, as that next read may block.
+
+ - Compatibility bugfixes:
+
+ + compat/json.Valid: No longer considers truncated JSON
+ documents to be valid.
+
+ + compat/json.Compact, compat/json.Indent: Don't write to the
+ destination buffer if there is a syntax error.
+
+    + compat/json.Compact, compat/json.Indent: No longer compact
+      floating-point numbers, as `encoding/json` doesn't.
+
+    + compat/json.HTMLEscape: Just look for problematic runes;
+      don't actually parse the input as JSON. This is consistent
+      with the function's lack of an `error` return value, and with
+      the behavior of `encoding/json`.
+
+ + compat/json.Indent: Preserve trailing whitespace, same as
+ `encoding/json`.
+
+ + compat/json.Decoder: No longer transforms "unexpected EOF"
+ errors to "unexpected end of JSON input". This makes it
+ different than `compat/json.Unmarshal`, but the same as
+ `encoding/json`.
+
+ + compat/json.Decoder, compat/json.Unmarshal: No longer mutate
+ the target value at all if there is a syntax error in the
+ input.
+
+ - Unicode:
+
+ + Feature: Encoder, ReEncoder: Add an `InvalidUTF8`
+ ReEncoderConfig option and `BackslashEscapeRawByte`
+ BackslashEscapeMode to allow emitted strings to contain
+ invalid UTF-8.
+
+    + Feature: ReEncoder: No longer unconditionally normalizes
+      `\uXXXX` hex digits to lower-case; this is now controlled by
+      the `BackslashEscaper` (and the default is to leave the
+      capitalization alone).
+
+ + Change: EscapeDefault, EscapeDefaultNonHTMLSafe: No longer
+ force long Unicode `\uXXXX` sequences for the U+FFFD Unicode
+ replacement character.
+
+ + Change: Encoder: Unless overridden by the BackslashEscaper,
+ now by default uses `\uXXXX` sequences when emitting the
+ U+FFFD Unicode replacement character in place of invalid
+ UTF-8.
+
+    + Bugfix: Encoder, ReEncoder: Fix an issue where a UTF-8
+      codepoint that straddles a write boundary was interpreted as a
+      sequence of U+FFFD runes.
+
+    + Bugfix: compat/json.Valid: Do not consider JSON containing
+      invalid UTF-8 to be valid (this is different from
+      `encoding/json` at the time of this writing, but I consider
+      that to be a bug in `encoding/json`; [go#58517][]).
+
+    + Bugfix: compat/json.Compact, compat/json.Indent: Don't munge
+      invalid UTF-8 in strings, as `encoding/json` doesn't.
+
+ [go#58517]: https://github.com/golang/go/issues/58517
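As a rough sketch of the new opt-in number compaction mentioned above (a hedged example based on the ReEncoderConfig fields added in reencode.go and the new reencode_test.go cases later in this diff):

package main

import (
	"fmt"
	"strings"

	"git.lukeshu.com/go/lowmemjson"
)

func main() {
	var out strings.Builder
	enc := lowmemjson.NewReEncoder(&out, lowmemjson.ReEncoderConfig{
		Compact:       true,
		CompactFloats: true, // without this, "1.200e003" is passed through as-is
	})
	_, _ = enc.WriteString(`[1.200e003]`)
	_ = enc.Close()
	fmt.Println(out.String()) // prints: [1.2e3]
}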
+
# v0.3.6 (2023-02-16)
Theme: Architectural improvements
diff --git a/compat/json/compat.go b/compat/json/compat.go
index c96470d..695c1a8 100644
--- a/compat/json/compat.go
+++ b/compat/json/compat.go
@@ -11,10 +11,13 @@ import (
"bytes"
"encoding/json"
"errors"
+ "fmt"
"io"
"strconv"
+ "unicode/utf8"
"git.lukeshu.com/go/lowmemjson"
+ "git.lukeshu.com/go/lowmemjson/internal/jsonstring"
)
//nolint:stylecheck // ST1021 False positive; these aren't comments on individual types.
@@ -144,7 +147,23 @@ func convertReEncodeError(err error) error {
}
func HTMLEscape(dst *bytes.Buffer, src []byte) {
- _, _ = lowmemjson.NewReEncoder(dst, lowmemjson.ReEncoderConfig{}).Write(src)
+ for n := 0; n < len(src); {
+ c, size := utf8.DecodeRune(src[n:])
+ if c == utf8.RuneError && size == 1 {
+ dst.WriteByte(src[n])
+ } else {
+ mode := lowmemjson.EscapeHTMLSafe(c, lowmemjson.BackslashEscapeNone)
+ switch mode {
+ case lowmemjson.BackslashEscapeNone:
+ dst.WriteRune(c)
+ case lowmemjson.BackslashEscapeUnicode:
+ _ = jsonstring.WriteStringUnicodeEscape(dst, c, mode)
+ default:
+ panic(fmt.Errorf("lowmemjson.EscapeHTMLSafe returned an unexpected escape mode=%d", mode))
+ }
+ }
+ n += size
+ }
}
func reencode(dst io.Writer, src []byte, cfg lowmemjson.ReEncoderConfig) error {
@@ -157,38 +176,75 @@ func reencode(dst io.Writer, src []byte, cfg lowmemjson.ReEncoderConfig) error {
}
func Compact(dst *bytes.Buffer, src []byte) error {
- return reencode(dst, src, lowmemjson.ReEncoderConfig{
+ start := dst.Len()
+ err := reencode(dst, src, lowmemjson.ReEncoderConfig{
Compact: true,
+ InvalidUTF8: lowmemjson.InvalidUTF8Preserve,
BackslashEscape: lowmemjson.EscapePreserve,
})
+ if err != nil {
+ dst.Truncate(start)
+ }
+ return err
+}
+
+func isSpace(c byte) bool {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return true
+ default:
+ return false
+ }
}
func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error {
- return reencode(dst, src, lowmemjson.ReEncoderConfig{
+ start := dst.Len()
+ err := reencode(dst, src, lowmemjson.ReEncoderConfig{
Indent: indent,
Prefix: prefix,
+ InvalidUTF8: lowmemjson.InvalidUTF8Preserve,
BackslashEscape: lowmemjson.EscapePreserve,
})
+ if err != nil {
+ dst.Truncate(start)
+ return err
+ }
+
+ // Preserve trailing whitespace.
+ lastNonWS := len(src) - 1
+ for ; lastNonWS >= 0 && isSpace(src[lastNonWS]); lastNonWS-- {
+ }
+ if _, err := dst.Write(src[lastNonWS+1:]); err != nil {
+ return err
+ }
+
+ return nil
}
func Valid(data []byte) bool {
formatter := lowmemjson.NewReEncoder(io.Discard, lowmemjson.ReEncoderConfig{
- Compact: true,
+ Compact: true,
+ InvalidUTF8: lowmemjson.InvalidUTF8Error,
})
- _, err := formatter.Write(data)
- return err == nil
+ if _, err := formatter.Write(data); err != nil {
+ return false
+ }
+ if err := formatter.Close(); err != nil {
+ return false
+ }
+ return true
}
// Decode wrappers ///////////////////////////////////////////////////
-func convertDecodeError(err error) error {
+func convertDecodeError(err error, isUnmarshal bool) error {
if derr, ok := err.(*lowmemjson.DecodeError); ok {
switch terr := derr.Err.(type) {
case *lowmemjson.DecodeSyntaxError:
switch {
case errors.Is(terr.Err, io.EOF):
err = io.EOF
- case errors.Is(terr.Err, io.ErrUnexpectedEOF):
+ case errors.Is(terr.Err, io.ErrUnexpectedEOF) && isUnmarshal:
err = &SyntaxError{
msg: "unexpected end of JSON input",
Offset: terr.Offset,
@@ -228,13 +284,66 @@ func convertDecodeError(err error) error {
return err
}
+type decodeValidator struct{}
+
+func (*decodeValidator) DecodeJSON(r io.RuneScanner) error {
+ for {
+ if _, _, err := r.ReadRune(); err != nil {
+
+ if err == io.EOF {
+ return nil
+ }
+ return err
+ }
+ }
+}
+
+var _ lowmemjson.Decodable = (*decodeValidator)(nil)
+
func Unmarshal(data []byte, ptr any) error {
- return convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(ptr))
+ if err := convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(&decodeValidator{}), true); err != nil {
+ return err
+ }
+ if err := convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(ptr), true); err != nil {
+ return err
+ }
+ return nil
+}
+
+type teeRuneScanner struct {
+ src io.RuneScanner
+ dst *bytes.Buffer
+ lastSize int
+}
+
+func (tee *teeRuneScanner) ReadRune() (r rune, size int, err error) {
+ r, size, err = tee.src.ReadRune()
+ if err == nil {
+ if _, err := tee.dst.WriteRune(r); err != nil {
+ return 0, 0, err
+ }
+ }
+
+ tee.lastSize = size
+ return
+}
+
+func (tee *teeRuneScanner) UnreadRune() error {
+ if tee.lastSize == 0 {
+ return lowmemjson.ErrInvalidUnreadRune
+ }
+ _ = tee.src.UnreadRune()
+ tee.dst.Truncate(tee.dst.Len() - tee.lastSize)
+ tee.lastSize = 0
+ return nil
}
type Decoder struct {
+ validatorBuf *bufio.Reader
+ validator *lowmemjson.Decoder
+
+ decoderBuf bytes.Buffer
*lowmemjson.Decoder
- buf *bufio.Reader
}
func NewDecoder(r io.Reader) *Decoder {
@@ -242,18 +351,29 @@ func NewDecoder(r io.Reader) *Decoder {
if !ok {
br = bufio.NewReader(r)
}
- return &Decoder{
- Decoder: lowmemjson.NewDecoder(br),
- buf: br,
+ ret := &Decoder{
+ validatorBuf: br,
}
+ ret.validator = lowmemjson.NewDecoder(&teeRuneScanner{
+ src: ret.validatorBuf,
+ dst: &ret.decoderBuf,
+ })
+ ret.Decoder = lowmemjson.NewDecoder(&ret.decoderBuf)
+ return ret
}
func (dec *Decoder) Decode(ptr any) error {
- return convertDecodeError(dec.Decoder.Decode(ptr))
+ if err := convertDecodeError(dec.validator.Decode(&decodeValidator{}), false); err != nil {
+ return err
+ }
+ if err := convertDecodeError(dec.Decoder.Decode(ptr), false); err != nil {
+ return err
+ }
+ return nil
}
func (dec *Decoder) Buffered() io.Reader {
- dat, _ := dec.buf.Peek(dec.buf.Buffered())
+ dat, _ := dec.validatorBuf.Peek(dec.validatorBuf.Buffered())
return bytes.NewReader(dat)
}
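To illustrate the effect of the validator pass above: a syntax error now leaves the caller's value untouched. A minimal example, mirroring the "existing-obj-partial" case in the new compat_test.go below:

package main

import (
	"fmt"

	"git.lukeshu.com/go/lowmemjson/compat/json"
)

func main() {
	dst := map[string]string{"foo": "bar"}
	// The input is truncated, so Unmarshal returns a syntax error...
	err := json.Unmarshal([]byte(`{"baz":"quz"`), &dst)
	fmt.Println(err) // unexpected end of JSON input
	// ...and, because the input is validated before being fed to the
	// real Decoder, dst is left unmodified.
	fmt.Println(dst) // map[foo:bar]
}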
diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go
new file mode 100644
index 0000000..df9d387
--- /dev/null
+++ b/compat/json/compat_test.go
@@ -0,0 +1,241 @@
+// Copyright (C) 2023 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package json
+
+import (
+ "bytes"
+ "reflect"
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestCompatHTMLEscape(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In string
+ Out string
+ }
+ testcases := map[string]testcase{
+ "invalid": {In: `x`, Out: `x`},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ t.Logf("in=%q", tc.In)
+ var dst bytes.Buffer
+ HTMLEscape(&dst, []byte(tc.In))
+ assert.Equal(t, tc.Out, dst.String())
+ })
+ }
+}
+
+func TestCompatValid(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In string
+ Exp bool
+ }
+ testcases := map[string]testcase{
+ "empty": {In: ``, Exp: false},
+ "num": {In: `1`, Exp: true},
+ "trunc": {In: `{`, Exp: false},
+ "object": {In: `{}`, Exp: true},
+ "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517
+ "hex-lower": {In: `"\uabcd"`, Exp: true},
+ "hex-upper": {In: `"\uABCD"`, Exp: true},
+ "hex-mixed": {In: `"\uAbCd"`, Exp: true},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ t.Logf("in=%q", tc.In)
+ act := Valid([]byte(tc.In))
+ assert.Equal(t, tc.Exp, act)
+ })
+ }
+}
+
+func TestCompatCompact(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In string
+ Out string
+ Err string
+ }
+ testcases := map[string]testcase{
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+ "float": {In: `1.200e003`, Out: `1.200e003`},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ t.Logf("in=%q", tc.In)
+ var out bytes.Buffer
+ err := Compact(&out, []byte(tc.In))
+ assert.Equal(t, tc.Out, out.String())
+ if tc.Err == "" {
+ assert.NoError(t, err)
+ } else {
+ assert.EqualError(t, err, tc.Err)
+ }
+ })
+ }
+}
+
+func TestCompatIndent(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In string
+ Out string
+ Err string
+ }
+ testcases := map[string]testcase{
+ "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`},
+ "object": {In: `{}`, Out: `{}`},
+ "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""},
+ "float": {In: `1.200e003`, Out: `1.200e003`},
+ "tailws0": {In: `0`, Out: `0`},
+ "tailws1": {In: `0 `, Out: `0 `},
+ "tailws2": {In: `0 `, Out: `0 `},
+ "tailws3": {In: "0\n", Out: "0\n"},
+ "headws1": {In: ` 0`, Out: `0`},
+ "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"},
+ "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"},
+ "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`},
+ "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`},
+ "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ t.Logf("in=%q", tc.In)
+ var out bytes.Buffer
+ err := Indent(&out, []byte(tc.In), ">", ".")
+ assert.Equal(t, tc.Out, out.String())
+ if tc.Err == "" {
+ assert.NoError(t, err)
+ } else {
+ assert.EqualError(t, err, tc.Err)
+ }
+ })
+ }
+}
+
+func TestCompatMarshal(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In any
+ Out string
+ Err string
+ }
+ testcases := map[string]testcase{
+ "non-utf8": {In: "\x85\xcd", Out: "\"\\ufffd\\ufffd\""},
+ "urc": {In: "\ufffd", Out: "\"\ufffd\""},
+ "float": {In: 1.2e3, Out: `1200`},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ out, err := Marshal(tc.In)
+ assert.Equal(t, tc.Out, string(out))
+ if tc.Err == "" {
+ assert.NoError(t, err)
+ } else {
+ assert.EqualError(t, err, tc.Err)
+ }
+ })
+ }
+}
+
+func TestCompatUnmarshal(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In string
+ InPtr any
+ ExpOut any
+ ExpErr string
+ }
+ testcases := map[string]testcase{
+ "empty-obj": {In: `{}`, ExpOut: map[string]any{}},
+ "partial-obj": {In: `{"foo":"bar",`, ExpOut: nil, ExpErr: `unexpected end of JSON input`},
+ "existing-obj": {In: `{"baz":"quz"}`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar", "baz": "quz"}},
+ "existing-obj-partial": {In: `{"baz":"quz"`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar"}, ExpErr: "unexpected end of JSON input"},
+ "empty-ary": {In: `[]`, ExpOut: []any{}},
+ "two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`},
+ "two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`},
+ "two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ ptr := tc.InPtr
+ if ptr == nil {
+ var out any
+ ptr = &out
+ }
+ err := Unmarshal([]byte(tc.In), ptr)
+ assert.Equal(t, tc.ExpOut, reflect.ValueOf(ptr).Elem().Interface())
+ if tc.ExpErr == "" {
+ assert.NoError(t, err)
+ } else {
+ assert.EqualError(t, err, tc.ExpErr)
+ }
+ })
+ }
+}
+
+func TestCompatDecode(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ In string
+ InPtr any
+ ExpOut any
+ ExpErr string
+ }
+ testcases := map[string]testcase{
+ "empty-obj": {In: `{}`, ExpOut: map[string]any{}},
+ "partial-obj": {In: `{"foo":"bar",`, ExpOut: nil, ExpErr: `unexpected EOF`},
+ "existing-obj": {In: `{"baz":"quz"}`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar", "baz": "quz"}},
+ "existing-obj-partial": {In: `{"baz":"quz"`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar"}, ExpErr: "unexpected EOF"},
+ "empty-ary": {In: `[]`, ExpOut: []any{}},
+ "two-objs": {In: `{} {}`, ExpOut: map[string]any{}},
+ "two-numbers1": {In: `00`, ExpOut: float64(0)},
+ "two-numbers2": {In: `1 2`, ExpOut: float64(1)},
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ ptr := tc.InPtr
+ if ptr == nil {
+ var out any
+ ptr = &out
+ }
+ err := NewDecoder(strings.NewReader(tc.In)).Decode(ptr)
+ assert.Equal(t, tc.ExpOut, reflect.ValueOf(ptr).Elem().Interface())
+ if tc.ExpErr == "" {
+ assert.NoError(t, err)
+ } else {
+ assert.EqualError(t, err, tc.ExpErr)
+ }
+ })
+ }
+}
diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go
index 42cbf5c..73153d9 100644
--- a/compat/json/testcompat_test.go
+++ b/compat/json/testcompat_test.go
@@ -8,6 +8,7 @@ import (
"bytes"
"encoding/json"
"io"
+ "reflect"
_ "unsafe"
"git.lukeshu.com/go/lowmemjson"
@@ -45,27 +46,18 @@ const (
startDetectingCyclesAfter = 1000
)
-func isSpace(c byte) bool {
- switch c {
- case 0x0020, 0x000A, 0x000D, 0x0009:
- return true
- default:
- return false
- }
-}
-
type encodeState struct {
bytes.Buffer
}
func (es *encodeState) string(str string, _ bool) {
- if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil {
+ if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil {
panic(err)
}
}
func (es *encodeState) stringBytes(str []byte, _ bool) {
- if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil {
+ if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil {
panic(err)
}
}
diff --git a/decode.go b/decode.go
index 491971a..a136668 100644
--- a/decode.go
+++ b/decode.go
@@ -53,6 +53,8 @@ import (
// or another is encountered; if it does not, then the parent Decode
// call will return a *DecodeTypeError.
//
+// DecodeJSON should return nil (not io.EOF) on success.
+//
// Implementor's note: "withLimitingScanner" is the thing to search
// for in decode.go if you want to read up on that io.RuneScanner.
type Decodable interface {
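For illustration, a minimal Decodable that follows the doc note added above (return nil, not io.EOF, on success); the drainDecodable name is hypothetical, but the body mirrors the decodeValidator added to compat/json earlier in this diff:

package example

import (
	"io"

	"git.lukeshu.com/go/lowmemjson"
)

// drainDecodable consumes its JSON value without building anything
// from it.
type drainDecodable struct{}

func (*drainDecodable) DecodeJSON(r io.RuneScanner) error {
	for {
		if _, _, err := r.ReadRune(); err != nil {
			if err == io.EOF {
				return nil // success: do not propagate io.EOF
			}
			return err
		}
	}
}

var _ lowmemjson.Decodable = (*drainDecodable)(nil)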
diff --git a/decode_scan.go b/decode_scan.go
index fcf47ff..63694c4 100644
--- a/decode_scan.go
+++ b/decode_scan.go
@@ -41,6 +41,12 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error)
case sc.repeat:
sc.offset += int64(sc.rSize)
_, _, _ = sc.inner.ReadRune()
+ case sc.parser.IsAtBarrier():
+ sc.rTypeOK = true
+ sc.rType = jsonparse.RuneTypeEOF
+ sc.rRune = 0
+ sc.rSize = 0
+ sc.rErr = nil
default:
sc.rTypeOK = true
again:
diff --git a/encode.go b/encode.go
index 00d3dad..684cc75 100644
--- a/encode.go
+++ b/encode.go
@@ -87,7 +87,7 @@ func (enc *Encoder) Encode(obj any) (err error) {
if escaper == nil {
escaper = EscapeDefault
}
- if err := encode(enc.w, reflect.ValueOf(obj), escaper, false, 0, map[any]struct{}{}); err != nil {
+ if err := encode(enc.w, reflect.ValueOf(obj), escaper, enc.w.utf, false, 0, map[any]struct{}{}); err != nil {
if rwe, ok := err.(*ReEncodeWriteError); ok {
err = &EncodeWriteError{
Err: rwe.Err,
@@ -108,7 +108,7 @@ func discardInt(_ int, err error) error {
const startDetectingCyclesAfter = 1000
-func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error {
+func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error {
if !val.IsValid() {
return discardInt(w.WriteString("null"))
}
@@ -197,7 +197,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
Err: err,
}
}
- if err := jsonstring.EncodeStringFromBytes(w, escaper, text); err != nil {
+ if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, text); err != nil {
return err
}
default:
@@ -309,14 +309,14 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
} else {
if quote {
var buf bytes.Buffer
- if err := jsonstring.EncodeStringFromString(&buf, escaper, val.String()); err != nil {
+ if err := jsonstring.EncodeStringFromString(&buf, escaper, utf, val, val.String()); err != nil {
return err
}
- if err := jsonstring.EncodeStringFromBytes(w, escaper, buf.Bytes()); err != nil {
+ if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, buf.Bytes()); err != nil {
return err
}
} else {
- if err := jsonstring.EncodeStringFromString(w, escaper, val.String()); err != nil {
+ if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, val.String()); err != nil {
return err
}
}
@@ -327,7 +327,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
return err
}
} else {
- if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -350,13 +350,13 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
}
}
empty = false
- if err := jsonstring.EncodeStringFromString(w, escaper, field.Name); err != nil {
+ if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, field.Name); err != nil {
return err
}
if err := w.WriteByte(':'); err != nil {
return err
}
- if err := encode(w, fVal, escaper, field.Quote, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, fVal, escaper, utf, field.Quote, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -394,7 +394,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
for i := 0; iter.Next(); i++ {
// TODO: Avoid buffering the map key
var k strings.Builder
- if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper}), iter.Key(), escaper, false, cycleDepth, cycleSeen); err != nil {
+ if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper, InvalidUTF8: utf}), iter.Key(), escaper, utf, false, cycleDepth, cycleSeen); err != nil {
return err
}
kStr := k.String()
@@ -403,7 +403,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
}
if !strings.HasPrefix(kStr, `"`) {
k.Reset()
- if err := jsonstring.EncodeStringFromString(&k, escaper, kStr); err != nil {
+ if err := jsonstring.EncodeStringFromString(&k, escaper, utf, iter.Key(), kStr); err != nil {
return err
}
kStr = k.String()
@@ -427,7 +427,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
if err := w.WriteByte(':'); err != nil {
return err
}
- if err := encode(w, kv.V, escaper, false, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, kv.V, escaper, utf, false, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -491,12 +491,12 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
cycleSeen[ptr] = struct{}{}
defer delete(cycleSeen, ptr)
}
- if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil {
+ if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil {
return err
}
}
case reflect.Array:
- if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil {
+ if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil {
return err
}
case reflect.Pointer:
@@ -516,7 +516,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
cycleSeen[ptr] = struct{}{}
defer delete(cycleSeen, ptr)
}
- if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil {
return err
}
}
@@ -529,7 +529,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo
return nil
}
-func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycleDepth uint, cycleSeen map[any]struct{}) error {
+func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, cycleDepth uint, cycleSeen map[any]struct{}) error {
if err := w.WriteByte('['); err != nil {
return err
}
@@ -540,7 +540,7 @@ func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycl
return err
}
}
- if err := encode(w, val.Index(i), escaper, false, cycleDepth, cycleSeen); err != nil {
+ if err := encode(w, val.Index(i), escaper, utf, false, cycleDepth, cycleSeen); err != nil {
return err
}
}
diff --git a/encode_escape.go b/encode_escape.go
index 97da6e9..664c762 100644
--- a/encode_escape.go
+++ b/encode_escape.go
@@ -6,12 +6,29 @@ package lowmemjson
import (
"fmt"
- "unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/jsonstring"
)
-// BackslashEscapeMode identifies one of the three ways that a
+// InvalidUTF8Mode identifies one of the three ways that an Encoder or
+// ReEncoder can behave when encountering invalid UTF-8 in a string
+// value:
+//
+// - Replace the byte with the Unicode replacement character U+FFFD.
+//
+// - Allow the byte through to the string-encoder, with an
+// escape-mode of BackslashEscapeRawByte.
+//
+// - Emit a syntax error.
+type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode
+
+const (
+ InvalidUTF8Replace = jsonstring.InvalidUTF8Replace
+ InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve
+ InvalidUTF8Error = jsonstring.InvalidUTF8Error
+)
+
+// BackslashEscapeMode identifies one of the four ways that a
// character may be represented in a JSON string:
//
// - literally (no backslash escaping)
@@ -19,13 +36,41 @@ import (
// - as a short "well-known" `\X` backslash sequence (where `X` is a
// single-character)
//
-// - as a long Unicode `\uXXXX` backslash sequence
+// - as a long Unicode `\uXXXX` backslash sequence (with 16
+// permutations of capitalization)
+//
+//   - as a raw byte; because JSON must be valid UTF-8, this allows
+//     you to emit invalid JSON containing arbitrary binary data. If
+//     the character does not satisfy `utf8.RuneSelf <= char <= 0xFF`,
+//     then the encoder will panic.
type BackslashEscapeMode = jsonstring.BackslashEscapeMode
const (
BackslashEscapeNone = jsonstring.BackslashEscapeNone
BackslashEscapeShort = jsonstring.BackslashEscapeShort
- BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode
+ BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte
+
+ BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx
+ BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX
+ BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx
+ BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX
+ BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx
+ BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX
+ BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx
+ BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX
+ BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx
+ BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX
+ BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx
+ BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX
+ BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx
+ BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX
+ BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin
+ BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax
+
+ BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat
)
func hexToInt(c byte) rune {
@@ -49,13 +94,24 @@ func hexToRune(a, b, c, d byte) rune {
hexToInt(d)<<0
}
+func hexToMode(a, b, c, d byte) BackslashEscapeMode {
+ // The 0b0010_0000 bit is the ASCII "lowercase bit".
+ return BackslashEscapeUnicodeMin + BackslashEscapeMode(0|
+ ((a&0b0010_0000)>>2)|
+ ((b&0b0010_0000)>>3)|
+ ((c&0b0010_0000)>>4)|
+ ((d&0b0010_0000)>>5))
+}
+
// A BackslashEscaper controls how a ReEncoder emits a character in a
// JSON string. The `rune` argument is the character being
// considered, and the `BackslashEscapeMode` argument is how it was
// originally encoded in the input.
//
// The ReEncoder will panic if a BackslashEscaper returns an unknown
-// BackslashEscapeMode.
+// BackslashEscapeMode. However, a BackslashEscaper should be
+// permissive of BackslashEscapeModes it doesn't recognize; it is safe
+// to just return them unmodified.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
// EscapePreserve is a BackslashEscaper that preserves the original
@@ -96,14 +152,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode
// behavior of encoding/json.
//
// It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
//
// A ReEncoder uses EscapeDefault if a BackslashEscaper is not
// specified.
func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
- case '\b', '\f', utf8.RuneError:
+ case '\b', '\f':
return BackslashEscapeUnicode
default:
return EscapeHTMLSafe(c, wasEscaped)
@@ -115,11 +170,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
// SetEscapeHTML(false) called on it.
//
// It is like EscapeJSSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
switch c {
- case '\b', '\f', utf8.RuneError:
+ case '\b', '\f':
return BackslashEscapeUnicode
default:
return EscapeJSSafe(c, wasEscaped)
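Since `\uXXXX` capitalization is now preserved by default, a caller that wants the old always-lowercase normalization can supply its own BackslashEscaper; a sketch (the lowercaseHexEscaper name is illustrative):

package example

import "git.lukeshu.com/go/lowmemjson"

// lowercaseHexEscaper forces any \uXXXX escape that appeared in the
// input to be re-emitted as all-lowercase \uxxxx, and otherwise defers
// to EscapeDefault.
func lowercaseHexEscaper(c rune, wasEscaped lowmemjson.BackslashEscapeMode) lowmemjson.BackslashEscapeMode {
	if lowmemjson.BackslashEscapeUnicodeMin <= wasEscaped && wasEscaped <= lowmemjson.BackslashEscapeUnicodeMax {
		return lowmemjson.BackslashEscapeUnicodexxxx
	}
	return lowmemjson.EscapeDefault(c, wasEscaped)
}

var _ lowmemjson.BackslashEscaper = lowercaseHexEscaper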
diff --git a/errors.go b/errors.go
index 516018c..da9de4d 100644
--- a/errors.go
+++ b/errors.go
@@ -142,7 +142,8 @@ func (e *EncodeWriteError) Unwrap() error { return e.Err }
type EncodeTypeError = json.UnsupportedTypeError
// An EncodeValueError is returned by Encode when attempting to encode
-// an unsupported value (such as a datastructure with a cycle).
+// an unsupported value (such as a datastructure with a cycle, or (if
+// InvalidUTF8=InvalidUTF8Error) a string with invalid UTF-8).
//
// type UnsupportedValueError struct {
// Value reflect.Value
diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go
index 1c35533..6432d75 100644
--- a/internal/jsonparse/parse.go
+++ b/internal/jsonparse/parse.go
@@ -525,6 +525,21 @@ func (par *Parser) HandleEOF() (RuneType, error) {
}
}
+// IsAtBarrier returns whether a read-barrier has been reached and the
+// next HandleRune call would definitely return RuneTypeEOF.
+func (par *Parser) IsAtBarrier() bool {
+ return par.initialized &&
+ // HandleRune wouldn't return early with an error.
+ !par.closed &&
+ par.err == nil &&
+ // The current (sub-)parser has reached its end, and
+ len(par.stack) == 0 &&
+ // there is a barrier, and
+ len(par.barriers) > 0 &&
+ // that barrier would definitely return RuneTypeEOF.
+ !par.barriers[len(par.barriers)-1].allowWS
+}
+
// HandleRune feeds a Unicode rune to the Parser.
//
// An error is returned if and only if the RuneType is RuneTypeError.
diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go
index fec2cc0..1416b3e 100644
--- a/internal/jsonstring/encode_string.go
+++ b/internal/jsonstring/encode_string.go
@@ -5,14 +5,25 @@
package jsonstring
import (
+ "encoding/json"
"fmt"
"io"
+ "reflect"
"unicode/utf8"
"git.lukeshu.com/go/lowmemjson/internal/fastio"
"git.lukeshu.com/go/lowmemjson/internal/fastio/noescape"
)
+// InvalidUTF8Mode is described in the main lowmemjson package docs.
+type InvalidUTF8Mode uint8
+
+const (
+ InvalidUTF8Replace InvalidUTF8Mode = iota
+ InvalidUTF8Preserve
+ InvalidUTF8Error
+)
+
// BackslashEscapeMode is describe in the main lowmemjson package
// docs.
type BackslashEscapeMode uint8
@@ -20,21 +31,49 @@ type BackslashEscapeMode uint8
const (
BackslashEscapeNone BackslashEscapeMode = iota
BackslashEscapeShort
- BackslashEscapeUnicode
+ BackslashEscapeRawByte
+
+ // It is significant to the implementation that if X=binary-0
+ // and x=binary-1, then these "BackslashEscapeUnicode"
+ // constants are counting in-order from 0 to 15.
+
+ BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeXXXx
+ BackslashEscapeUnicodeXXxX
+ BackslashEscapeUnicodeXXxx
+ BackslashEscapeUnicodeXxXX
+ BackslashEscapeUnicodeXxXx
+ BackslashEscapeUnicodeXxxX
+ BackslashEscapeUnicodeXxxx
+ BackslashEscapeUnicodexXXX
+ BackslashEscapeUnicodexXXx
+ BackslashEscapeUnicodexXxX
+ BackslashEscapeUnicodexXxx
+ BackslashEscapeUnicodexxXX
+ BackslashEscapeUnicodexxXx
+ BackslashEscapeUnicodexxxX
+ BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicodeMin = BackslashEscapeUnicodeXXXX
+ BackslashEscapeUnicodeMax = BackslashEscapeUnicodexxxx
+
+ BackslashEscapeUnicode = BackslashEscapeUnicodexxxx // back-compat
)
// BackslashEscaper is describe in the main lowmemjson package docs.
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode
-func writeStringUnicodeEscape(w io.Writer, c rune) error {
- const alphabet = "0123456789abcdef"
+func WriteStringUnicodeEscape(w io.Writer, c rune, mode BackslashEscapeMode) error {
+ const alphabet = "0123456789ABCDEF"
+ _mode := byte(mode - BackslashEscapeUnicodeMin)
buf := [6]byte{
'\\',
'u',
- alphabet[(c>>12)&0xf],
- alphabet[(c>>8)&0xf],
- alphabet[(c>>4)&0xf],
- alphabet[(c>>0)&0xf],
+ // The 0b0010_0000 bit is the ASCII "lowercase bit".
+ alphabet[(c>>12)&0xf] | ((_mode << 2) & 0b0010_0000),
+ alphabet[(c>>8)&0xf] | ((_mode << 3) & 0b0010_0000),
+ alphabet[(c>>4)&0xf] | ((_mode << 4) & 0b0010_0000),
+ alphabet[(c>>0)&0xf] | ((_mode << 5) & 0b0010_0000),
}
_, err := noescape.Write(w, buf[:])
return err
@@ -72,7 +111,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
case '\b', '\f', '\n', '\r', '\t': // short-escape if possible
return writeStringShortEscape(w, c)
default:
- return writeStringUnicodeEscape(w, c)
+ return WriteStringUnicodeEscape(w, c, BackslashEscapeUnicode)
}
case c == '"' || c == '\\': // override, gotta escape these
return writeStringShortEscape(w, c)
@@ -88,27 +127,54 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err
_, err := w.WriteRune(c)
return err
}
- case BackslashEscapeUnicode:
+ case BackslashEscapeRawByte:
switch {
- case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
- _, err := w.WriteRune(c)
- return err
- default: // obey
- return writeStringUnicodeEscape(w, c)
+ case c < utf8.RuneSelf:
+ panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c))
+ case c > 0xFF:
+ panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c))
+ default:
+ return w.WriteByte(byte(c))
}
default:
- panic("escaper returned an invalid escape mode")
+ if BackslashEscapeUnicodeMin <= escape && escape <= BackslashEscapeUnicodeMax {
+ switch {
+ case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?)
+ _, err := w.WriteRune(c)
+ return err
+ default: // obey
+ return WriteStringUnicodeEscape(w, c, escape)
+ }
+ }
+ panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape))
}
}
-func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error {
+func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error {
if err := w.WriteByte('"'); err != nil {
return err
}
- for _, c := range str {
- if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil {
+ for i := 0; i < len(str); {
+ escaped := BackslashEscapeNone
+ c, size := utf8.DecodeRuneInString(str[i:])
+ if c == utf8.RuneError && size == 1 {
+ switch utf {
+ case InvalidUTF8Replace:
+ escaped = BackslashEscapeUnicode
+ case InvalidUTF8Preserve:
+ escaped = BackslashEscapeRawByte
+ c = rune(str[i])
+ case InvalidUTF8Error:
+ return &json.UnsupportedValueError{
+ Value: val,
+ Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
+ }
+ }
+ }
+ if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
return err
}
+ i += size
}
if err := w.WriteByte('"'); err != nil {
return err
@@ -116,13 +182,28 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st
return nil
}
-func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error {
+func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error {
if err := w.WriteByte('"'); err != nil {
return err
}
for i := 0; i < len(str); {
+ escaped := BackslashEscapeNone
c, size := utf8.DecodeRune(str[i:])
- if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil {
+ if c == utf8.RuneError && size == 1 {
+ switch utf {
+ case InvalidUTF8Replace:
+ escaped = BackslashEscapeUnicode
+ case InvalidUTF8Preserve:
+ escaped = BackslashEscapeRawByte
+ c = rune(str[i])
+ case InvalidUTF8Error:
+ return &json.UnsupportedValueError{
+ Value: val,
+ Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]),
+ }
+ }
+ }
+ if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil {
return err
}
i += size
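A standalone sketch of the "ASCII lowercase bit" trick that WriteStringUnicodeEscape above (and hexToMode in encode_escape.go) use to round-trip the capitalization of \uXXXX escapes:

package main

import "fmt"

func main() {
	const alphabet = "0123456789ABCDEF"
	c := rune(0xABCD)
	mode := byte(0b0101) // one bit per hex digit: 0=upper ("X"), 1=lower ("x"); here "XxXx"
	buf := []byte{
		'\\', 'u',
		// ORing in 0b0010_0000 lowercases 'A'-'F' and leaves '0'-'9' alone.
		alphabet[(c>>12)&0xf] | ((mode << 2) & 0b0010_0000),
		alphabet[(c>>8)&0xf] | ((mode << 3) & 0b0010_0000),
		alphabet[(c>>4)&0xf] | ((mode << 4) & 0b0010_0000),
		alphabet[(c>>0)&0xf] | ((mode << 5) & 0b0010_0000),
	}
	fmt.Println(string(buf)) // prints: \uAbCd
}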
diff --git a/reencode.go b/reencode.go
index 0745c43..7439bf0 100644
--- a/reencode.go
+++ b/reencode.go
@@ -54,6 +54,17 @@ type ReEncoderConfig struct {
// this is different than the usual behavior.
ForceTrailingNewlines bool
+ // CompactFloats causes the *ReEncoder to trim unnecessary '0'
+ // digits from floating-point number values.
+ CompactFloats bool
+
+ // A JSON document is specified to be a sequence of Unicode
+ // codepoints; InvalidUTF8 controls how the *ReEncoder behaves
+ // when it encounters invalid UTF-8 bytes in a JSON string
+ // (i.e. the string is not representable as a sequence of
+ // Unicode codepoints, and thus the document is invalid JSON).
+ InvalidUTF8 InvalidUTF8Mode
+
// Returns whether a given character in a string should be
// backslash-escaped. The bool argument is whether it was
// \u-escaped in the input. This does not affect characters
@@ -102,8 +113,10 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
}
// Numbers
- module = &reEncodeCompactNum{
- out: module,
+ if cfg.CompactFloats {
+ module = &reEncodeCompactNum{
+ out: module,
+ }
}
// Strings
@@ -119,6 +132,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
return &ReEncoder{
out: module,
esc: escaper,
+ utf: cfg.InvalidUTF8,
allowMultipleValues: cfg.AllowMultipleValues,
}
}
@@ -134,6 +148,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder {
type ReEncoder struct {
out reEncoderModule
esc BackslashEscaper
+ utf InvalidUTF8Mode
allowMultipleValues bool
// state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer
@@ -169,6 +184,57 @@ var (
_ io.Closer = (*ReEncoder)(nil)
)
+func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) {
+ var tmp []byte
+ if pos < enc.bufLen {
+ var buf [utf8.UTFMax]byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
+ } else {
+ tmp = str[pos-enc.bufLen:]
+ }
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
+ }
+}
+
+func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) {
+ if pos < enc.bufLen {
+ var buf [utf8.UTFMax]byte
+ var tmp []byte
+ n := copy(buf[:], enc.buf[pos:enc.bufLen])
+ n += copy(buf[n:], str)
+ tmp = buf[:n]
+ c, size = utf8.DecodeRune(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
+ }
+ } else {
+ tmp := str[pos-enc.bufLen:]
+ c, size := utf8.DecodeRuneInString(tmp)
+ switch {
+ case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp):
+ return c, size, false, true
+ case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace:
+ return rune(tmp[0]), 1, true, false
+ default:
+ return c, size, true, true
+ }
+ }
+}
+
// Write implements io.Writer; it does what you'd expect.
//
// It is worth noting that Write returns the number of bytes consumed
@@ -177,59 +243,68 @@ var (
// but *ReEncoder does because it transforms the data written to it,
// and the number of bytes written may be wildly different than the
// number of bytes handled.
-func (enc *ReEncoder) Write(p []byte) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) Write(str []byte) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full, isRune := enc.getRuneFromBytes(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRune(p[n:]) {
- c, size := utf8.DecodeRune(p[n:])
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
n += size
}
- enc.bufLen = copy(enc.buf[:], p[n:])
- return len(p), nil
}
// WriteString implements io.StringWriter; it does what you'd expect,
// but see the notes on the Write method.
-func (enc *ReEncoder) WriteString(p string) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) WriteString(str string) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full, isRune := enc.getRuneFromString(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRuneInString(p[n:]) {
- c, size := utf8.DecodeRuneInString(p[n:])
- enc.handleRune(c, size)
+ if enc.utf == InvalidUTF8Error && !isRune {
+ return n, &ReEncodeSyntaxError{
+ Offset: enc.inputPos,
+ Err: fmt.Errorf("invalid UTF-8: %#02x", c),
+ }
+ }
+ enc.handleRune(c, size, isRune)
if enc.err != nil {
return n, enc.err
}
n += size
}
- return len(p), nil
}
// WriteByte implements io.ByteWriter; it does what you'd expect.
@@ -261,7 +336,7 @@ func (enc *ReEncoder) Close() error {
return enc.err
}
if len(enc.barriers) == 0 {
- if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -275,7 +350,8 @@ func (enc *ReEncoder) Close() error {
return nil
}
-func (enc *ReEncoder) handleRune(c rune, size int) {
+// isRune=false indicates that 'c' is a raw byte from invalid UTF-8.
+func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) {
t, err := enc.par.HandleRune(c)
if err != nil {
enc.err = &ReEncodeSyntaxError{
@@ -284,7 +360,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) {
}
return
}
- if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil {
+ if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil {
enc.err = &ReEncodeWriteError{
Err: err,
Offset: enc.inputPos,
@@ -327,13 +403,13 @@ func (enc *ReEncoder) popWriteBarrier() {
func (enc *ReEncoder) stackSize() int {
sz := enc.par.StackSize()
- for _, barrier := range enc.barriers {
- sz += barrier.stackSize
+ if len(enc.barriers) > 0 {
+ sz += enc.barriers[len(enc.barriers)-1].stackSize
}
return sz
}
-func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error {
+func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error {
switch t {
case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU:
return nil
@@ -365,14 +441,19 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int
enc.uhex[2] = byte(c)
return nil
case jsonparse.RuneTypeStringEscUD:
+ mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c))
- return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeUnicode, stackSize)
+ return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize)
case jsonparse.RuneTypeError:
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
default:
if t > jsonparse.RuneTypeEOF {
panic(fmt.Errorf("should not happen: handleRune called with %#v", t))
}
- return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize)
+ esc := BackslashEscapeNone
+ if !isRune {
+ esc = BackslashEscapeRawByte
+ }
+ return enc.out.HandleRune(c, t, esc, stackSize)
}
}
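To make the three InvalidUTF8 modes concrete, a rough sketch against the ReEncoder API as changed above (the exact output of the first two modes also depends on the configured BackslashEscaper):

package main

import (
	"fmt"
	"strings"

	"git.lukeshu.com/go/lowmemjson"
)

func main() {
	in := "\"\x85\"" // a JSON string containing one invalid-UTF-8 byte
	for _, mode := range []lowmemjson.InvalidUTF8Mode{
		lowmemjson.InvalidUTF8Replace,  // substitute U+FFFD
		lowmemjson.InvalidUTF8Preserve, // hand the raw byte to the string-encoder
		lowmemjson.InvalidUTF8Error,    // fail with a syntax error
	} {
		var out strings.Builder
		enc := lowmemjson.NewReEncoder(&out, lowmemjson.ReEncoderConfig{
			Compact:     true,
			InvalidUTF8: mode,
		})
		_, err := enc.WriteString(in)
		if err == nil {
			err = enc.Close()
		}
		fmt.Printf("mode=%d out=%q err=%v\n", mode, out.String(), err)
	}
}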
diff --git a/reencode_compactnum.go b/reencode_compactnum.go
index 5da2c54..bdf1f4e 100644
--- a/reencode_compactnum.go
+++ b/reencode_compactnum.go
@@ -33,14 +33,12 @@ func (enc *reEncodeCompactNum) HandleRune(c rune, t jsonparse.RuneType, escape B
if c == '0' && !enc.fracFirst {
enc.fracZeros++
return nil
- }
- fallthrough
- default:
- for enc.fracZeros > 0 {
- if err := enc.out.HandleRune('0', jsonparse.RuneTypeNumberFracDig, escape, stackSize); err != nil {
- return err
+ } else {
+ for ; enc.fracZeros > 0; enc.fracZeros-- {
+ if err := enc.out.HandleRune('0', jsonparse.RuneTypeNumberFracDig, escape, stackSize); err != nil {
+ return err
+ }
}
- enc.fracZeros--
}
enc.fracFirst = false
}
diff --git a/reencode_test.go b/reencode_test.go
index 83660ef..715e976 100644
--- a/reencode_test.go
+++ b/reencode_test.go
@@ -9,6 +9,8 @@ import (
"testing"
"github.com/stretchr/testify/assert"
+
+ "git.lukeshu.com/go/lowmemjson/internal/fastio"
)
func TestReEncode(t *testing.T) {
@@ -131,6 +133,26 @@ func TestReEncode(t *testing.T) {
—»9
—]`,
},
+ "numbers": {
+ enc: ReEncoderConfig{
+ Compact: true,
+ CompactFloats: true,
+ },
+ in: []any{
+ Number("1.200e003"),
+ },
+ exp: `[1.2e3]`,
+ },
+ "numbers-zero": {
+ enc: ReEncoderConfig{
+ Compact: true,
+ CompactFloats: true,
+ },
+ in: []any{
+ Number("1.000e000"),
+ },
+ exp: `[1.0e0]`,
+ },
}
for tcName, tc := range testcases {
tc := tc
@@ -143,3 +165,78 @@ func TestReEncode(t *testing.T) {
})
}
}
+
+func TestReEncodeWriteSize(t *testing.T) {
+ t.Parallel()
+
+	multibyteRune := `😂`
+ assert.Len(t, multibyteRune, 4)
+
+ input := `"` + multibyteRune + `"`
+
+ t.Run("bytes-bigwrite", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ n, err := enc.Write([]byte(input))
+ assert.NoError(t, err)
+ assert.Equal(t, len(input), n)
+
+ assert.Equal(t, input, out.String())
+ })
+ t.Run("string-bigwrite", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ n, err := enc.WriteString(input)
+ assert.NoError(t, err)
+ assert.Equal(t, len(input), n)
+
+ assert.Equal(t, input, out.String())
+ })
+
+ t.Run("bytes-smallwrites", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ var buf [1]byte
+ for i := 0; i < len(input); i++ {
+ buf[0] = input[i]
+ n, err := enc.Write(buf[:])
+ assert.NoError(t, err)
+ assert.Equal(t, 1, n)
+ }
+
+ assert.Equal(t, input, out.String())
+ })
+ t.Run("string-smallwrites", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ for i := 0; i < len(input); i++ {
+ n, err := enc.WriteString(input[i : i+1])
+ assert.NoError(t, err)
+ assert.Equal(t, 1, n)
+ }
+
+ assert.Equal(t, input, out.String())
+ })
+}
+
+func TestReEncoderStackSize(t *testing.T) {
+ t.Parallel()
+
+ enc := NewReEncoder(fastio.Discard, ReEncoderConfig{})
+ assert.Equal(t, 0, enc.stackSize())
+
+ for i := 0; i < 5; i++ {
+ assert.NoError(t, enc.WriteByte('['))
+ assert.Equal(t, i+1, enc.stackSize())
+ enc.pushWriteBarrier()
+ assert.Equal(t, i+2, enc.stackSize())
+ }
+}