summaryrefslogtreecommitdiff
path: root/reencode.go
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2022-08-13 15:11:17 -0600
committerLuke Shumaker <lukeshu@lukeshu.com>2022-08-13 15:18:11 -0600
commit234e0836f1040f7724251b4120a2351bcbf64131 (patch)
tree201b30fcc99eed470ae345a9bbe594f7ee7a1178 /reencode.go
parentf2769bd863521cf316ec9237a498bfa4ecaa115f (diff)
set up as a separate repo
Diffstat (limited to 'reencode.go')
-rw-r--r--reencode.go598
1 files changed, 598 insertions, 0 deletions
diff --git a/reencode.go b/reencode.go
new file mode 100644
index 0000000..50c8ba3
--- /dev/null
+++ b/reencode.go
@@ -0,0 +1,598 @@
+// Copyright (C) 2022 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package lowmemjson
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "unicode/utf8"
+)
+
+// reencodeState is one state of the ReEncoder state machine: a
+// handler that is given the next input rune and returns any syntax
+// or write error that results from processing it.
+type reencodeState func(rune) error
+
+// ReEncoder re-formats a stream of JSON text.  Input is supplied
+// through Write or WriteRune, checked rune-by-rune by a stack of
+// state functions, and re-emitted to Out with whitespace controlled
+// by Compact and Indent.  Flush reports whether the input ended in
+// the middle of a value.
+type ReEncoder struct {
+ // Destination that the re-encoded JSON is written to.
+ Out io.Writer
+
+ // Whether to minify the JSON.
+ Compact bool
+ // String to use to indent; ignored if Compact is true.
+ Indent string
+ // String to put before indents, for testing-compat with
+ // encoding/json only.
+ prefix string
+ // Returns whether a given character in a string should be
+ // "\uXXXX" escaped. The bool argument is whether it was
+ // \u-escaped in the input. This does not affect characters
+ // that must or must-not be \u-escaped to be valid JSON.
+ //
+ // If not set, then EscapeUnicodeDefault is used.
+ UnicodeEscape func(rune, bool) bool
+
+ // When set, state() returns errBailedAfterCurrent instead of
+ // starting a new value once the state stack is empty.
+ bailAfterCurrent bool
+
+ // state: .Write's utf8-decoding buffer
+ // (holds the leading bytes of a rune split across Write calls)
+ buf [utf8.UTFMax]byte
+ bufLen int
+
+ // state: .WriteRune
+ // err is sticky: once set, WriteRune keeps returning it.
+ err error
+ // inputPos is the input offset reported in SyntaxErrors.
+ inputPos int64
+ // written counts the bytes emitted by the current WriteRune call.
+ written int
+ // stack is the state stack; the last element handles the next rune.
+ stack []reencodeState
+ // stack0IsNumber records whether the bottom stack entry is a number
+ // state; Flush uses it to terminate a top-level number.
+ stack0IsNumber bool
+ // curIndent is the current nesting depth used by nlIndent.
+ curIndent int
+
+ // state: reencodeState-specific
+ // (scratch space whose meaning depends on the current state)
+ stateBuf []byte
+}
+
+// public API //////////////////////////////////////////////////////////////////
+
+// Write feeds p, which holds UTF-8 encoded JSON text, through the
+// re-encoder one rune at a time via WriteRune.  A rune whose bytes
+// are split across calls is held in enc.buf until the rest of it
+// arrives.
+//
+// On success it returns len(p), even when the tail of p has only
+// been buffered.  If WriteRune fails while completing a buffered
+// rune it returns 0; if it fails later it returns the number of
+// bytes of p consumed before the failing rune.
+func (enc *ReEncoder) Write(p []byte) (int, error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
+	var n int
+	// First finish whatever partial rune an earlier call left behind.
+	// n stays 0 for as long as this loop keeps running.
+	for enc.bufLen > 0 {
+		held := enc.bufLen
+		avail := held + copy(enc.buf[held:], p)
+		if !utf8.FullRune(enc.buf[:avail]) {
+			// Still incomplete.  Any UTFMax bytes form a "full" rune,
+			// so getting here means avail < UTFMax and therefore all
+			// of p fit into the buffer.
+			enc.bufLen = avail
+			return len(p), nil
+		}
+		// Decode only the bytes that are really there; the rest of
+		// enc.buf may hold stale bytes from an earlier rune.
+		c, size := utf8.DecodeRune(enc.buf[:avail])
+		enc.bufLen = 0 // WriteRune refuses to run while a partial rune is pending
+		if _, err := enc.WriteRune(c); err != nil {
+			return 0, err
+		}
+		if size >= held {
+			n = size - held
+			break
+		}
+		// Invalid sequence: DecodeRune consumed fewer bytes than were
+		// buffered.  Keep the unconsumed buffered bytes pending and
+		// go around again, rather than letting n go negative.
+		enc.bufLen = copy(enc.buf[:], enc.buf[size:held])
+	}
+	for utf8.FullRune(p[n:]) {
+		c, size := utf8.DecodeRune(p[n:])
+		if _, err := enc.WriteRune(c); err != nil {
+			return n, err
+		}
+		n += size
+	}
+	// Whatever is left is the start of a rune; stash it for next time.
+	enc.bufLen = copy(enc.buf[:], p[n:])
+	return len(p), nil
+}
+
+// Flush signals the end of input.  It returns a *SyntaxError if a
+// partial UTF-8 sequence is still buffered or if the input stopped
+// in the middle of a value, and nil if no value is in progress.
+//
+// A top-level number has no closing delimiter, so Flush terminates
+// it by feeding the state machine a newline.
+func (enc *ReEncoder) Flush() error {
+	if enc.bufLen > 0 {
+		return &SyntaxError{fmt.Sprintf("EOF: unflushed unicode garbage: %q", enc.buf[:enc.bufLen]), enc.inputPos}
+	}
+	switch {
+	case len(enc.stack) == 0:
+		return nil
+	case len(enc.stack) == 1 && enc.stack0IsNumber:
+		// Force Compact while the synthetic newline is processed so
+		// that it is swallowed rather than echoed to Out, then put
+		// the caller's setting back instead of clobbering it.
+		saved := enc.Compact
+		enc.Compact = true
+		err := enc.state('\n')
+		enc.Compact = saved
+		return err
+	default:
+		return &SyntaxError{"EOF: in the middle of a value", enc.inputPos}
+	}
+}
+
+// WriteRune feeds a single rune of JSON text to the state machine.
+// It returns the number of bytes emitted to Out for this rune and
+// any error.  Errors are sticky: once one has occurred, every later
+// call returns it again without doing any work.
+//
+// It must not be called while Write is holding the first bytes of a
+// split rune; doing so puts the encoder in the error state.
+func (enc *ReEncoder) WriteRune(c rune) (n int, err error) {
+	if enc.err != nil {
+		return 0, enc.err
+	}
+	if enc.bufLen != 0 {
+		enc.err = errors.New("lowmemjson.ReEncoder: cannot .WriteRune() when there is a partial rune that has been .Write()n")
+		return 0, enc.err
+	}
+	enc.written = 0
+	enc.err = enc.state(c)
+	// utf8.RuneLen reports -1 for values that cannot be encoded; do
+	// not let that move the input position backwards.
+	if size := utf8.RuneLen(c); size > 0 {
+		enc.inputPos += int64(size)
+	}
+	return enc.written, enc.err
+}
+
+// io helpers //////////////////////////////////////////////////////////////////
+
+// emitByte writes one byte to enc.Out and, if that succeeded, counts
+// it in enc.written.
+func (enc *ReEncoder) emitByte(c byte) error {
+	if err := writeByte(enc.Out, c); err != nil {
+		return err
+	}
+	enc.written++
+	return nil
+}
+
+// emit wraps a call that returns (bytes written, error): it adds the
+// byte count to enc.written and passes the error through unchanged.
+func (enc *ReEncoder) emit(n int, err error) error {
+	enc.written = enc.written + n
+	return err
+}
+
+// nlIndent starts a new output line: a newline, then enc.prefix (if
+// any), then enc.Indent repeated enc.curIndent times.  It emits
+// nothing when Compact is set or Indent is empty.
+func (enc *ReEncoder) nlIndent() error {
+	if enc.Compact || enc.Indent == "" {
+		return nil
+	}
+	err := enc.emitByte('\n')
+	if err == nil && enc.prefix != "" {
+		err = enc.emit(io.WriteString(enc.Out, enc.prefix))
+	}
+	// Stop at the first failed write.
+	for depth := enc.curIndent; err == nil && depth > 0; depth-- {
+		err = enc.emit(io.WriteString(enc.Out, enc.Indent))
+	}
+	return err
+}
+
+// state helpers ///////////////////////////////////////////////////////////////
+
+// pushState pushes state onto the stack.  If it becomes the bottom
+// entry, isNumber is recorded in enc.stack0IsNumber.
+func (enc *ReEncoder) pushState(state reencodeState, isNumber bool) {
+	wasEmpty := len(enc.stack) == 0
+	enc.stack = append(enc.stack, state)
+	if wasEmpty {
+		enc.stack0IsNumber = isNumber
+	}
+}
+// replaceState overwrites the top of the stack with state.  If the
+// top is also the bottom entry, isNumber is recorded in
+// enc.stack0IsNumber.  The stack must not be empty.
+func (enc *ReEncoder) replaceState(state reencodeState, isNumber bool) {
+	top := len(enc.stack) - 1
+	if top == 0 {
+		enc.stack0IsNumber = isNumber
+	}
+	enc.stack[top] = state
+}
+// popState removes the top of the stack, clearing
+// enc.stack0IsNumber when that was the only entry.  The stack must
+// not be empty.
+func (enc *ReEncoder) popState() {
+	top := len(enc.stack) - 1
+	if top == 0 {
+		enc.stack0IsNumber = false
+	}
+	enc.stack = enc.stack[:top]
+}
+
+// errBailedAfterCurrent is returned by state() when the state stack
+// is empty and enc.bailAfterCurrent is set, instead of starting to
+// process another value.
+var errBailedAfterCurrent = errors.New("bailed after current")
+
+// state dispatches c to the handler on top of the state stack.  An
+// empty stack means a new value is about to start: stateAny is
+// pushed first, unless enc.bailAfterCurrent is set, in which case
+// errBailedAfterCurrent is returned.
+func (enc *ReEncoder) state(c rune) error {
+	depth := len(enc.stack)
+	if depth == 0 {
+		if enc.bailAfterCurrent {
+			return errBailedAfterCurrent
+		}
+		enc.pushState(enc.stateAny, false)
+		depth = len(enc.stack)
+	}
+	handler := enc.stack[depth-1]
+	return handler(c)
+}
+
+// any /////////////////////////////////////////////////////////////////////////////////////////////
+
+func (enc *ReEncoder) stateAny(c rune) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case '{':
+ enc.replaceState(enc.stateInEmptyObject, false)
+ enc.curIndent++
+ case '[':
+ enc.replaceState(enc.stateInEmptyArray, false)
+ enc.curIndent++
+ case '"':
+ enc.replaceState(enc.stateInString, false)
+ case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberA, true)
+ return enc.state(c)
+ case 't':
+ enc.replaceState(enc.stateInTrue, false)
+ enc.stateBuf = append(enc.stateBuf[:0], 't')
+ case 'f':
+ enc.replaceState(enc.stateInFalse, false)
+ enc.stateBuf = append(enc.stateBuf[:0], 'f')
+ case 'n':
+ enc.replaceState(enc.stateInNull, false)
+ enc.stateBuf = append(enc.stateBuf[:0], 'n')
+ default:
+ return &SyntaxError{fmt.Sprintf("any: unexpected character: %c", c), enc.inputPos}
+ }
+ return enc.emitByte(byte(c))
+}
+
+// object //////////////////////////////////////////////////////////////////////////////////////////
+
+// stateInEmptyObject is the state right after "{".
+func (enc *ReEncoder) stateInEmptyObject(c rune) error { return enc._stateInObject(c, false) }
+
+// stateInNonEmptyObject is the state right after a "," that follows
+// an object member.
+func (enc *ReEncoder) stateInNonEmptyObject(c rune) error { return enc._stateInObject(c, true) }
+
+// _stateInObject handles the position in an object where a member
+// key may start.  nonempty reports whether at least one member has
+// already been seen; within this file that is only the case right
+// after a ",".
+//
+// A '"' starts a key on a fresh indented line.  A "}" closes the
+// object only when it is still empty: after a "," another member is
+// required, so "}" there is a trailing comma and is rejected.
+func (enc *ReEncoder) _stateInObject(c rune, nonempty bool) error {
+	switch c {
+	case 0x0020, 0x000A, 0x000D, 0x0009:
+		if enc.Compact || enc.Indent != "" {
+			return nil
+		}
+	case '"':
+		if err := enc.nlIndent(); err != nil {
+			return err
+		}
+		enc.replaceState(enc.stateInKV, false)
+		enc.pushState(enc.stateInString, false)
+	case '}':
+		if nonempty {
+			// {"a":1,} is not valid JSON.
+			return &SyntaxError{fmt.Sprintf("object: unexpected character: %c", c), enc.inputPos}
+		}
+		enc.popState()
+		enc.curIndent--
+	default:
+		return &SyntaxError{fmt.Sprintf("object: unexpected character: %c", c), enc.inputPos}
+	}
+	return enc.emitByte(byte(c))
+}
+// stateInKV is the state between an object key and its value: it
+// waits for the ":".  When pretty-printing (not Compact, Indent
+// non-empty) a space is emitted after the colon.
+func (enc *ReEncoder) stateInKV(c rune) error {
+	if c == ':' {
+		enc.replaceState(enc.stateAfterV, false)
+		enc.pushState(enc.stateAny, false)
+		if err := enc.emitByte(':'); err != nil {
+			return err
+		}
+		if enc.Compact || enc.Indent == "" {
+			return nil
+		}
+		return enc.emitByte(' ')
+	}
+	switch c {
+	case ' ', '\n', '\r', '\t':
+		if enc.Compact || enc.Indent != "" {
+			return nil
+		}
+		return enc.emitByte(byte(c))
+	}
+	return &SyntaxError{fmt.Sprintf("object member: unexpected character: %c", c), enc.inputPos}
+}
+// stateAfterV is the state after an object member's value: a ","
+// leads to another member, a "}" closes the object on a new
+// indented line.
+func (enc *ReEncoder) stateAfterV(c rune) error {
+	switch c {
+	case ' ', '\n', '\r', '\t':
+		if enc.Compact || enc.Indent != "" {
+			return nil
+		}
+		return enc.emitByte(byte(c))
+	case ',':
+		enc.replaceState(enc.stateInNonEmptyObject, false)
+		return enc.emitByte(',')
+	case '}':
+		enc.popState()
+		enc.curIndent--
+		if err := enc.nlIndent(); err != nil {
+			return err
+		}
+		return enc.emitByte('}')
+	}
+	return &SyntaxError{fmt.Sprintf("object member: unexpected character: %c", c), enc.inputPos}
+}
+
+// array ///////////////////////////////////////////////////////////////////////////////////////////
+
+// stateInEmptyArray is the state right after "[".
+func (enc *ReEncoder) stateInEmptyArray(c rune) error { return enc._stateInArray(c, false) }
+
+// stateInNonEmptyArray is the state right after a "," that follows
+// an array item.
+func (enc *ReEncoder) stateInNonEmptyArray(c rune) error { return enc._stateInArray(c, true) }
+
+// _stateInArray handles the position in an array where an item may
+// start.  nonempty reports whether at least one item has already
+// been seen; within this file that is only the case right after a
+// ",".
+//
+// Any rune other than whitespace or "]" is treated as the start of
+// an item: it is placed on a fresh indented line and re-dispatched
+// to stateAny.  A "]" closes the array only when it is still empty:
+// after a "," another item is required, so "]" there is a trailing
+// comma and is rejected.
+func (enc *ReEncoder) _stateInArray(c rune, nonempty bool) error {
+	switch c {
+	case 0x0020, 0x000A, 0x000D, 0x0009:
+		if enc.Compact || enc.Indent != "" {
+			return nil
+		}
+	case ']':
+		if nonempty {
+			// [1,] is not valid JSON.
+			return &SyntaxError{fmt.Sprintf("array: unexpected character: %c", c), enc.inputPos}
+		}
+		enc.popState()
+		enc.curIndent--
+	default:
+		if err := enc.nlIndent(); err != nil {
+			return err
+		}
+		enc.replaceState(enc.stateAfterItem, false)
+		enc.pushState(enc.stateAny, false)
+		return enc.state(c)
+	}
+	return enc.emitByte(byte(c))
+}
+// stateAfterItem is the state after an array item: a "," leads to
+// another item, a "]" closes the array on a new indented line.
+func (enc *ReEncoder) stateAfterItem(c rune) error {
+	switch c {
+	case ' ', '\n', '\r', '\t':
+		if enc.Compact || enc.Indent != "" {
+			return nil
+		}
+		return enc.emitByte(byte(c))
+	case ',':
+		enc.replaceState(enc.stateInNonEmptyArray, false)
+		return enc.emitByte(',')
+	case ']':
+		enc.popState()
+		enc.curIndent--
+		if err := enc.nlIndent(); err != nil {
+			return err
+		}
+		return enc.emitByte(']')
+	}
+	return &SyntaxError{fmt.Sprintf("array: unexpected character: %c", c), enc.inputPos}
+}
+
+// string //////////////////////////////////////////////////////////////////////////////////////////
+
+// stateInString handles a rune inside a string literal.  A backslash
+// starts an escape sequence, a '"' ends the string, and any other
+// rune from U+0020 through U+10FFFF is handed to writeStringChar.
+func (enc *ReEncoder) stateInString(c rune) error {
+	if c == '\\' {
+		enc.replaceState(enc.stateInBackslash, false)
+		return nil
+	}
+	if c == '"' {
+		enc.popState()
+		return enc.emitByte('"')
+	}
+	if c < 0x0020 || c > 0x10FFFF {
+		return &SyntaxError{fmt.Sprintf("string: unexpected character: %c", c), enc.inputPos}
+	}
+	return enc.emit(writeStringChar(enc.Out, c, false, enc.UnicodeEscape))
+}
+func (enc *ReEncoder) stateInBackslash(c rune) error {
+ switch c {
+ case '"':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '"', false, enc.UnicodeEscape))
+ case '\\':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '\\', false, enc.UnicodeEscape))
+ case '/':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '/', false, enc.UnicodeEscape))
+ case 'b':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '\b', false, enc.UnicodeEscape))
+ case 'f':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '\f', false, enc.UnicodeEscape))
+ case 'n':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '\n', false, enc.UnicodeEscape))
+ case 'r':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '\r', false, enc.UnicodeEscape))
+ case 't':
+ enc.replaceState(enc.stateInString, false)
+ return enc.emit(writeStringChar(enc.Out, '\t', false, enc.UnicodeEscape))
+ case 'u':
+ enc.replaceState(enc.stateInUnicode, false)
+ return nil
+ default:
+ return &SyntaxError{fmt.Sprintf("string backslash sequence: unexpected character: %c", c), enc.inputPos}
+ }
+}
+func (enc *ReEncoder) stateInUnicode(c rune) error {
+ switch {
+ case '0' <= c && c <= '9':
+ enc.stateBuf = append(enc.stateBuf, byte(c)-'0')
+ case 'a' <= c && c <= 'f':
+ enc.stateBuf = append(enc.stateBuf, byte(c)-'a'+10)
+ case 'A' <= c && c <= 'F':
+ enc.stateBuf = append(enc.stateBuf, byte(c)-'A'+10)
+ default:
+ return &SyntaxError{fmt.Sprintf("string unicode sequence: unexpected character: %c", c), enc.inputPos}
+ }
+ if len(enc.stateBuf) == 4 {
+ enc.replaceState(enc.stateInString, false)
+ c := 0 |
+ rune(enc.stateBuf[0])<<12 |
+ rune(enc.stateBuf[1])<<8 |
+ rune(enc.stateBuf[2])<<4 |
+ rune(enc.stateBuf[3])<<0
+ enc.stateBuf = enc.stateBuf[:0]
+ return enc.emit(writeStringChar(enc.Out, c, true, enc.UnicodeEscape))
+ }
+ return nil
+}
+
+// number //////////////////////////////////////////////////////////////////////////////////////////
+
+// Here's a flattened drawing of the syntax diagram from www.json.org :
+//
+//    [------------ integer ----------][-- fraction ---][-------- exponent -------]
+//   >─╮─────╭─╮─"0"───────╭─────────╭──╮─────────────╭──╮───────────────────────╭─>
+//     │     │ │           │         │  │             │  │                       │
+//     ╰─"-"─╯ ╰─digit 1-9─╯─╭digit╮─╯  ╰─"."─╭digit╮─╯  ╰─"e"─╭─╮─────╭─╭digit╮─╯
+//                           ╰──<──╯          ╰──<──╯    │     │ │     │ ╰──<──╯
+//                                                       ╰─"E"─╯ ╰─"-"─╯
+//                                                               │     │
+//                                                               ╰─"+"─╯
+//
+// Now here it is slightly redrawn, and with each distinct state our
+// decoder can be in marked with a single-capital-letter:
+//
+//      [-------------- integer ------------][--------- fraction --------][--------- exponent ---------]
+//   >─A─╮───────╭──╮─"0"─────────C─╭─────────╮──────────────────╭─────────╮──────────────────────────╭─>
+//       │       │  │               │         │                  │         │                          │
+//       ╰─"-"─B─╯  ╰─digit 1-9─╭─D─╯─digit╮  ╰─"."─E─digit──╭─F─╯─digit╮  ╰─"e"─╭─G─╮─────╭─╭digit─H─╯
+//                              ╰────<─────╯                 ╰────<─────╯  │     │   │     │ ╰────<───╯
+//                                                                         ╰─"E"─╯   ╰─"-"─╯
+//                                                                                   │     │
+//                                                                                   ╰─"+"─╯
+//
+// Which state we're at is the 'X' in 'stateNumberX'.
+//
+// Besides just traversing that, there are a few compressions we want to make:
+//
+// - trim trailing 0s from the fraction (but don't remove the
+// fraction if it's all 0s); do this by making the F state a little
+// special. This requires a little more state, because when we
+// encounter the 0 we don't yet know if it's trailing. So, store
+// the number of maybe-trailing zeros in enc.stateBuf[0]; if that
+// reaches 255, then bleed over to enc.stateBuf[1] and so on.
+//
+// - trim leading 0s from the exponent (but don't remove the exponent
+// if it's all 0s); do this by making the H state a little special.
+// Record whether we've seen a non-zero digit in enc.stateBuf[0]
+// (0=false, 1=true).
+
+// integer-part ////////////////////////////////////////////////////////////////
+func (enc *ReEncoder) stateNumberA(c rune) error { // start
+ switch c {
+ case '-':
+ enc.replaceState(enc.stateNumberB, true)
+ case '0':
+ enc.replaceState(enc.stateNumberC, true)
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberD, true)
+ default:
+ return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos}
+ }
+ return enc.emitByte(byte(c))
+}
+func (enc *ReEncoder) stateNumberB(c rune) error { // got a leading "-"
+ switch c {
+ case '0':
+ enc.replaceState(enc.stateNumberC, true)
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberD, true)
+ default:
+ return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos}
+ }
+ return enc.emitByte(byte(c))
+}
+// stateNumberC follows an integer part of "0": only a fraction or
+// an exponent may come next.  Any other rune ends the number and is
+// re-dispatched to the enclosing state.  The exponent marker is
+// always emitted as a lowercase "e".
+func (enc *ReEncoder) stateNumberC(c rune) error { // ready for the fraction or exponent part to start
+	if c == '.' {
+		enc.replaceState(enc.stateNumberE, true)
+		return enc.emitByte('.')
+	}
+	if c == 'e' || c == 'E' {
+		enc.replaceState(enc.stateNumberG, true)
+		// stateBuf[0] tracks "seen a non-zero exponent digit".
+		enc.stateBuf = append(enc.stateBuf[:0], 0)
+		return enc.emitByte('e')
+	}
+	enc.popState()
+	return enc.state(c)
+}
+// stateNumberD is inside a multi-digit integer part: further digits
+// are echoed, "." and "e"/"E" start the fraction and exponent, and
+// any other rune ends the number and is re-dispatched to the
+// enclosing state.
+func (enc *ReEncoder) stateNumberD(c rune) error { // in the integer part
+	switch {
+	case '0' <= c && c <= '9':
+		return enc.emitByte(byte(c))
+	case c == '.':
+		enc.replaceState(enc.stateNumberE, true)
+		return enc.emitByte('.')
+	case c == 'e' || c == 'E':
+		enc.replaceState(enc.stateNumberG, true)
+		// stateBuf[0] tracks "seen a non-zero exponent digit".
+		enc.stateBuf = append(enc.stateBuf[:0], 0)
+		return enc.emitByte('e')
+	}
+	enc.popState()
+	return enc.state(c)
+}
+
+// fraction-part ///////////////////////////////////////////////////////////////
+// stateNumberE follows the ".": at least one fraction digit is
+// required, and that first digit is always echoed (even a "0").
+func (enc *ReEncoder) stateNumberE(c rune) error { // got a ".", ready to read a number for the fraction part
+	if c < '0' || c > '9' {
+		return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos}
+	}
+	enc.replaceState(enc.stateNumberF, true)
+	return enc.emitByte(byte(c))
+}
+// stateNumberF is inside the fraction, after its first digit.
+// Zeros are not echoed right away because they may turn out to be
+// trailing: they are counted in enc.stateBuf (each byte holds up to
+// 255, spilling into a new byte when full).  A later non-zero digit
+// first emits the counted zeros; an exponent or the end of the
+// number discards them.
+func (enc *ReEncoder) stateNumberF(c rune) error { // in the fraction part
+	switch {
+	case c == '0':
+		last := len(enc.stateBuf) - 1
+		if last < 0 || enc.stateBuf[last] == 255 {
+			enc.stateBuf = append(enc.stateBuf, 1)
+			return nil
+		}
+		enc.stateBuf[last]++
+		return nil
+	case '1' <= c && c <= '9':
+		// The held-back zeros were not trailing after all.
+		for len(enc.stateBuf) > 0 {
+			if err := enc.emitByte('0'); err != nil {
+				return err
+			}
+			last := len(enc.stateBuf) - 1
+			enc.stateBuf[last]--
+			if enc.stateBuf[last] == 0 {
+				enc.stateBuf = enc.stateBuf[:last]
+			}
+		}
+		return enc.emitByte(byte(c))
+	case c == 'e' || c == 'E':
+		enc.replaceState(enc.stateNumberG, true)
+		// Drops any counted zeros; stateBuf[0] now tracks "seen a
+		// non-zero exponent digit".
+		enc.stateBuf = append(enc.stateBuf[:0], 0)
+		return enc.emitByte('e')
+	}
+	enc.stateBuf = enc.stateBuf[:0]
+	enc.popState()
+	return enc.state(c)
+}
+
+// exponent-part ///////////////////////////////////////////////////////////////
+// stateNumberG handles the rune right after the exponent marker: an
+// optional sign, or the first exponent digit.
+func (enc *ReEncoder) stateNumberG(c rune) error { // got a leading "e"
+	switch c {
+	case '-', '+':
+		// A sign must be followed by at least one digit.  Going
+		// straight to H would let the number end here, turning the
+		// invalid input "1e+" into "1e+0".
+		enc.replaceState(enc.stateNumberGSigned, true)
+		return enc.emitByte(byte(c))
+	default:
+		return enc.stateNumberGSigned(c)
+	}
+}
+
+// stateNumberGSigned requires the first digit of the exponent; it
+// is the state after "e" plus a sign, and also the digit handling
+// of stateNumberG itself.  A leading "0" is swallowed (H re-adds a
+// single "0" if no non-zero digit follows); a digit 1-9 is echoed
+// and recorded in enc.stateBuf[0].
+func (enc *ReEncoder) stateNumberGSigned(c rune) error { // got a leading "e" and a sign
+	switch c {
+	case '0':
+		enc.replaceState(enc.stateNumberH, true)
+		return nil
+	case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+		enc.replaceState(enc.stateNumberH, true)
+		enc.stateBuf[0] = 1
+		return enc.emitByte(byte(c))
+	default:
+		enc.stateBuf = enc.stateBuf[:0]
+		return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos}
+	}
+}
+// stateNumberH is inside the exponent's digits.  enc.stateBuf[0] is
+// 0 until a non-zero digit has been seen; zeros before that point
+// are leading zeros and are dropped.  If the exponent ends without
+// any non-zero digit, a single "0" is emitted so the exponent is
+// not left empty.  The terminating rune is re-dispatched to the
+// enclosing state.
+func (enc *ReEncoder) stateNumberH(c rune) error { // in the exponent's number part
+	onlyZerosSoFar := enc.stateBuf[0] == 0
+	switch {
+	case c == '0':
+		if onlyZerosSoFar {
+			return nil
+		}
+		return enc.emitByte('0')
+	case '1' <= c && c <= '9':
+		enc.stateBuf[0] = 1
+		return enc.emitByte(byte(c))
+	}
+	if onlyZerosSoFar {
+		if err := enc.emitByte('0'); err != nil {
+			return err
+		}
+	}
+	enc.stateBuf = enc.stateBuf[:0]
+	enc.popState()
+	return enc.state(c)
+}
+
+// literals ////////////////////////////////////////////////////////////////////////////////////////
+
+// stateInTrue, stateInFalse and stateInNull match the remaining
+// runes of the corresponding keyword.
+func (enc *ReEncoder) stateInTrue(c rune) error  { return enc._stateInLiteral(c, "true") }
+func (enc *ReEncoder) stateInFalse(c rune) error { return enc._stateInLiteral(c, "false") }
+func (enc *ReEncoder) stateInNull(c rune) error  { return enc._stateInLiteral(c, "null") }
+
+// _stateInLiteral checks c against the next expected rune of the
+// keyword full.  The number of runes matched so far is
+// len(enc.stateBuf).  A matching rune is echoed; once the whole
+// keyword has been matched, the buffer is cleared and the state is
+// popped.
+func (enc *ReEncoder) _stateInLiteral(c rune, full string) error {
+	matched := len(enc.stateBuf)
+	if rune(full[matched]) != c {
+		return &SyntaxError{fmt.Sprintf("%s: unexpected character: %c", full, c), enc.inputPos}
+	}
+	if matched+1 == len(full) {
+		enc.stateBuf = enc.stateBuf[:0]
+		enc.popState()
+	} else {
+		enc.stateBuf = append(enc.stateBuf, byte(c))
+	}
+	return enc.emitByte(byte(c))
+}