// Copyright (C) 2022-2023 Luke Shumaker // // SPDX-License-Identifier: GPL-2.0-or-later package lowmemjson import ( "fmt" "io" "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/fastio" "git.lukeshu.com/go/lowmemjson/internal/jsonparse" ) // A ReEncoderConfig controls how a ReEncoder should behave. type ReEncoderConfig struct { // A JSON document is specified to be a single JSON element; // but it is often desirable to handle streams of multiple // JSON elements. AllowMultipleValues bool // Whether to minify the JSON. // // Trims all whitespace, except that it emits a newline // between two *number* top-level values (or puts a newline // after all top-level values if ForceTrailingNewlines). // // Trims superflous 0s from numbers. Compact bool // CompactIfUnder causes the *ReEncoder to behave as if // Compact=true for individual elements if doing so would // cause that element to be under this number of bytes. // // Has no affect if Compact is true or Indent is empty. // // This has O(2^min(CompactIfUnder, depth)) time overhead, so // set with caution. CompactIfUnder int // String to use to indent; ignored if Compact is true. // // Newlines are emitted *between* top-level values; a newline is // not emitted after the *last* top-level value (unless // ForceTrailingNewlines is on). Indent string // String to put before indents. Prefix string // Whether to emit a newline after each top-level value. See // the comments on Compact and Indent for discussion of how // this is different than the usual behavior. ForceTrailingNewlines bool // CompactFloats causes the *ReEncoder to trim unnecessary '0' // digits from floating-point number values. CompactFloats bool // A JSON document is specified to be a sequence of Unicode // codepoints; InvalidUTF8 controls how the *ReEncoder behaves // when it encounters invalid UTF-8 bytes in a JSON string // (i.e. the string is not representable as a sequence of // Unicode codepoints, and thus the document is invalid JSON). InvalidUTF8 InvalidUTF8Mode // Returns whether a given character in a string should be // backslash-escaped. The bool argument is whether it was // \u-escaped in the input. This does not affect characters // that must or must-not be escaped to be valid JSON. // // If not set, then EscapeDefault is used. BackslashEscape BackslashEscaper } // NewReEncoder returns a new ReEncoder instance. // // A ReEncoder tends to make many small writes; if Out.Write // calls are syscalls, then you may want to wrap Out in a // bufio.Writer. func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { var module reEncoderModule // Basic module = &reEncodeWrite{ out: fastio.NewAllWriter(out), } // Whitespace if cfg.ForceTrailingNewlines { module = &reEncodeForceNL{ out: module, } } switch { case cfg.Compact: module = &reEncodeCompactWS{ out: module, } case cfg.Indent != "": if cfg.CompactIfUnder > 0 { module = &reEncodeCompactWSIfUnder{ out: module, CompactWSIfUnder: cfg.CompactIfUnder, } } module = &reEncodeIndent{ out: module, Indent: cfg.Indent, Prefix: cfg.Prefix, } } // Numbers if cfg.CompactFloats { module = &reEncodeCompactNum{ out: module, } } // Strings escaper := cfg.BackslashEscape if escaper == nil { escaper = EscapeDefault } module = &reEncodeString{ out: module, BackslashEscape: escaper, } return &ReEncoder{ out: module, esc: escaper, utf: cfg.InvalidUTF8, allowMultipleValues: cfg.AllowMultipleValues, } } // A ReEncoder takes a stream of JSON elements (by way of implementing // io.Writer, io.StringWriter, io.ByteWriter, and WriteRune), and // re-encodes the JSON, writing it to the .Out member. // // This is useful for prettifying, minifying, sanitizing, and/or // validating JSON. // // The memory use of a ReEncoder is O(CompactIfUnder+depth). type ReEncoder struct { out reEncoderModule esc BackslashEscaper utf InvalidUTF8Mode allowMultipleValues bool // state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer buf [utf8.UTFMax]byte bufLen int // state: contract between the public API and .handleRune err error par jsonparse.Parser inputPos int64 // state: .pushWriteBarrier and .popWriteBarrier barriers []barrier // state: .handleRuneType uhex [3]byte // "\uABCD"-encoded characters in strings } type barrier struct { inputPos int64 stackSize int } type reEncoderModule interface { HandleRune(c rune, t jsonparse.RuneType, escape BackslashEscapeMode, stackSize int) error PopWriteBarrier() } // public API ////////////////////////////////////////////////////////////////// var ( _ fastio.AllWriter = (*ReEncoder)(nil) _ io.Closer = (*ReEncoder)(nil) ) func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) { var tmp []byte if pos < enc.bufLen { var buf [utf8.UTFMax]byte n := copy(buf[:], enc.buf[pos:enc.bufLen]) n += copy(buf[n:], str) tmp = buf[:n] } else { tmp = str[pos-enc.bufLen:] } c, size = utf8.DecodeRune(tmp) switch { case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): return c, size, false, true case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: return rune(tmp[0]), 1, true, false default: return c, size, true, true } } func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) { if pos < enc.bufLen { var buf [utf8.UTFMax]byte var tmp []byte n := copy(buf[:], enc.buf[pos:enc.bufLen]) n += copy(buf[n:], str) tmp = buf[:n] c, size = utf8.DecodeRune(tmp) switch { case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): return c, size, false, true case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: return rune(tmp[0]), 1, true, false default: return c, size, true, true } } else { tmp := str[pos-enc.bufLen:] c, size := utf8.DecodeRuneInString(tmp) switch { case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp): return c, size, false, true case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: return rune(tmp[0]), 1, true, false default: return c, size, true, true } } } // Write implements io.Writer; it does what you'd expect. // // It is worth noting that Write returns the number of bytes consumed // from p, not number of bytes written to the output stream. This // distinction that most io.Writer implementations don't need to make, // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. // //nolint:dupl // Yes, this is mostly a duplicate of .WriteString(). func (enc *ReEncoder) Write(str []byte) (int, error) { if len(str) == 0 { return 0, nil } origBufLen := enc.bufLen var n int for { c, size, full, isRune := enc.getRuneFromBytes(str, n) if !full { if n < enc.bufLen { l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) l += copy(enc.buf[l:], str) enc.bufLen = l } else { enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) } return len(str), nil } if enc.utf == InvalidUTF8Error && !isRune { return n - origBufLen, &ReEncodeSyntaxError{ Offset: enc.inputPos, Err: fmt.Errorf("invalid UTF-8: %#02x", c), } } enc.handleRune(c, size, isRune) if enc.err != nil { return n - origBufLen, enc.err } n += size } } // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. // //nolint:dupl // Yes, this is mostly a duplicate of .Write(). func (enc *ReEncoder) WriteString(str string) (int, error) { if len(str) == 0 { return 0, nil } origBufLen := enc.bufLen var n int for { c, size, full, isRune := enc.getRuneFromString(str, n) if !full { if n < enc.bufLen { l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) l += copy(enc.buf[l:], str) enc.bufLen = l } else { enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) } return len(str), nil } if enc.utf == InvalidUTF8Error && !isRune { return n - origBufLen, &ReEncodeSyntaxError{ Offset: enc.inputPos, Err: fmt.Errorf("invalid UTF-8: %#02x", c), } } enc.handleRune(c, size, isRune) if enc.err != nil { return n - origBufLen, enc.err } n += size } } // WriteByte implements io.ByteWriter; it does what you'd expect. func (enc *ReEncoder) WriteByte(b byte) error { return fastio.WriteByte(enc, b) } // WriteRune does what you'd expect. func (enc *ReEncoder) WriteRune(c rune) (n int, err error) { return fastio.WriteRune(enc, c) } // Close implements io.Closer; it does what you'd expect, mostly. // // The *ReEncoder may continue to be written to with new JSON values // if enc.AllowMultipleValues is set. func (enc *ReEncoder) Close() error { if enc.bufLen > 0 { if enc.utf == InvalidUTF8Error { return &ReEncodeSyntaxError{ Offset: enc.inputPos, Err: fmt.Errorf("truncated UTF-8: %q", enc.buf[:enc.bufLen]), } } for i := 0; i < enc.bufLen; i++ { if enc.utf == InvalidUTF8Replace { enc.handleRune(utf8.RuneError, 1, true) } else { enc.handleRune(rune(enc.buf[i]), 1, false) } if enc.err != nil { return enc.err } } } if _, err := enc.par.HandleEOF(); err != nil { enc.err = &ReEncodeSyntaxError{ Err: err, Offset: enc.inputPos, } return enc.err } if len(enc.barriers) == 0 { if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, } return enc.err } if enc.allowMultipleValues { enc.par.Reset() } } return nil } // isRune=false indicates that 'c' is a raw byte from invalid UTF-8. func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { t, err := enc.par.HandleRune(c, isRune) if err != nil { enc.err = &ReEncodeSyntaxError{ Err: err, Offset: enc.inputPos, } return } if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, } return } if t == jsonparse.RuneTypeEOF { if len(enc.barriers) == 0 { panic(fmt.Errorf("should not happen: EOF for rune %q without write barriers", c)) } enc.err = &ReEncodeSyntaxError{ Err: fmt.Errorf("invalid character %q after top-level value", c), Offset: enc.inputPos, } return } enc.inputPos += int64(size) } // semi-public API ///////////////////////////////////////////////////////////// func (enc *ReEncoder) pushWriteBarrier() { enc.barriers = append(enc.barriers, barrier{ inputPos: enc.inputPos, stackSize: enc.stackSize(), }) enc.par.PushWriteBarrier() enc.inputPos = 0 } func (enc *ReEncoder) popWriteBarrier() { enc.par.PopBarrier() enc.inputPos += enc.barriers[len(enc.barriers)-1].inputPos enc.barriers = enc.barriers[:len(enc.barriers)-1] enc.out.PopWriteBarrier() } // internal //////////////////////////////////////////////////////////////////// func (enc *ReEncoder) stackSize() int { sz := enc.par.StackSize() if len(enc.barriers) > 0 { sz += enc.barriers[len(enc.barriers)-1].stackSize } return sz } func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error { switch t { case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU: return nil case jsonparse.RuneTypeStringEsc1: switch c { case '"', '\\', '/': // self case 'b': c = '\b' case 'f': c = '\f' case 'n': c = '\n' case 'r': c = '\r' case 't': c = '\t' default: panic(fmt.Errorf("should not happen: rune %q is not a RuneTypeStringEsc1", c)) } return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeShort, stackSize) case jsonparse.RuneTypeStringEscUA: enc.uhex[0] = byte(c) return nil case jsonparse.RuneTypeStringEscUB: enc.uhex[1] = byte(c) return nil case jsonparse.RuneTypeStringEscUC: enc.uhex[2] = byte(c) return nil case jsonparse.RuneTypeStringEscUD: mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize) case jsonparse.RuneTypeError: panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) default: if t > jsonparse.RuneTypeEOF { panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) } esc := BackslashEscapeNone if !isRune { esc = BackslashEscapeRawByte } return enc.out.HandleRune(c, t, esc, stackSize) } }