// Copyright (C) 2022 Luke Shumaker // // SPDX-License-Identifier: GPL-2.0-or-later package lowmemjson import ( "errors" "fmt" "io" "unicode/utf8" ) type reencodeState func(rune) error type ReEncoder struct { Out io.Writer // Whether to minify the JSON. Compact bool // String to use to indent; ignored if Compact is true. Indent string // String to put before indents, for testing-compat with // encoding/json only. prefix string // Returns whether a given character in a string should be // "\uXXXX" escaped. The bool argument is whether it was // \u-escaped in the input. This does not affect characters // that must or must-not be \u-escaped to be valid JSON. // // If not set, then EscapeUnicodeDefault is used. UnicodeEscape func(rune, bool) bool bailAfterCurrent bool // state: .Write's utf8-decoding buffer buf [utf8.UTFMax]byte bufLen int // state: .WriteRune err error inputPos int64 written int stack []reencodeState stack0IsNumber bool curIndent int // state: reencodeState-specific stateBuf []byte } // public API ////////////////////////////////////////////////////////////////// func (enc *ReEncoder) Write(p []byte) (int, error) { if len(p) == 0 { return 0, nil } var n int if enc.bufLen > 0 { copy(enc.buf[enc.bufLen:], p) c, size := utf8.DecodeRune(enc.buf[:]) n += size - enc.bufLen enc.bufLen = 0 if _, err := enc.WriteRune(c); err != nil { return 0, err } } for utf8.FullRune(p[n:]) { c, size := utf8.DecodeRune(p[n:]) if _, err := enc.WriteRune(c); err != nil { return n, err } n += size } enc.bufLen = copy(enc.buf[:], p[n:]) return len(p), nil } func (enc *ReEncoder) Flush() error { if enc.bufLen > 0 { return &SyntaxError{fmt.Sprintf("EOF: unflushed unicode garbage: %q", enc.buf[:enc.bufLen]), enc.inputPos} } switch len(enc.stack) { case 0: return nil case 1: if enc.stack0IsNumber { enc.Compact = true return enc.state('\n') } fallthrough default: return &SyntaxError{fmt.Sprintf("EOF: in the middle of a value"), enc.inputPos} } } func (enc *ReEncoder) WriteRune(c rune) (n int, err error) { if enc.err != nil { return 0, enc.err } if enc.bufLen != 0 { enc.err = errors.New("lowmemjson.ReEncoder: cannot .WriteRune() when there is a partial rune that has been .Write()n") return 0, enc.err } enc.written = 0 enc.err = enc.state(c) enc.inputPos += int64(utf8.RuneLen(c)) return enc.written, enc.err } // io helpers ////////////////////////////////////////////////////////////////// func (enc *ReEncoder) emitByte(c byte) error { err := writeByte(enc.Out, c) if err == nil { enc.written++ } return err } func (enc *ReEncoder) emit(n int, err error) error { enc.written += n return err } func (enc *ReEncoder) nlIndent() error { if enc.Compact || enc.Indent == "" { return nil } if err := enc.emitByte('\n'); err != nil { return err } if enc.prefix != "" { if err := enc.emit(io.WriteString(enc.Out, enc.prefix)); err != nil { return err } } for i := 0; i < enc.curIndent; i++ { if err := enc.emit(io.WriteString(enc.Out, enc.Indent)); err != nil { return err } } return nil } // state helpers /////////////////////////////////////////////////////////////// func (enc *ReEncoder) pushState(state reencodeState, isNumber bool) { if len(enc.stack) == 0 { enc.stack0IsNumber = isNumber } enc.stack = append(enc.stack, state) } func (enc *ReEncoder) replaceState(state reencodeState, isNumber bool) { if len(enc.stack) == 1 { enc.stack0IsNumber = isNumber } enc.stack[len(enc.stack)-1] = state } func (enc *ReEncoder) popState() { if len(enc.stack) == 1 { enc.stack0IsNumber = false } enc.stack = enc.stack[:len(enc.stack)-1] } var errBailedAfterCurrent = errors.New("bailed after current") func (enc *ReEncoder) state(c rune) error { if len(enc.stack) == 0 { if enc.bailAfterCurrent { return errBailedAfterCurrent } enc.pushState(enc.stateAny, false) } return enc.stack[len(enc.stack)-1](c) } // any ///////////////////////////////////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateAny(c rune) error { switch c { case 0x0020, 0x000A, 0x000D, 0x0009: if enc.Compact || enc.Indent != "" { return nil } case '{': enc.replaceState(enc.stateInEmptyObject, false) enc.curIndent++ case '[': enc.replaceState(enc.stateInEmptyArray, false) enc.curIndent++ case '"': enc.replaceState(enc.stateInString, false) case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': enc.replaceState(enc.stateNumberA, true) return enc.state(c) case 't': enc.replaceState(enc.stateInTrue, false) enc.stateBuf = append(enc.stateBuf[:0], 't') case 'f': enc.replaceState(enc.stateInFalse, false) enc.stateBuf = append(enc.stateBuf[:0], 'f') case 'n': enc.replaceState(enc.stateInNull, false) enc.stateBuf = append(enc.stateBuf[:0], 'n') default: return &SyntaxError{fmt.Sprintf("any: unexpected character: %c", c), enc.inputPos} } return enc.emitByte(byte(c)) } // object ////////////////////////////////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateInEmptyObject(c rune) error { return enc._stateInObject(c, false) } func (enc *ReEncoder) stateInNonEmptyObject(c rune) error { return enc._stateInObject(c, true) } func (enc *ReEncoder) _stateInObject(c rune, nonempty bool) error { switch c { case 0x0020, 0x000A, 0x000D, 0x0009: if enc.Compact || enc.Indent != "" { return nil } case '"': if err := enc.nlIndent(); err != nil { return err } enc.replaceState(enc.stateInKV, false) enc.pushState(enc.stateInString, false) case '}': enc.popState() enc.curIndent-- if nonempty { if err := enc.nlIndent(); err != nil { return err } } default: return &SyntaxError{fmt.Sprintf("object: unexpected character: %c", c), enc.inputPos} } return enc.emitByte(byte(c)) } func (enc *ReEncoder) stateInKV(c rune) error { switch c { case 0x0020, 0x000A, 0x000D, 0x0009: if enc.Compact || enc.Indent != "" { return nil } return enc.emitByte(byte(c)) case ':': enc.replaceState(enc.stateAfterV, false) enc.pushState(enc.stateAny, false) if err := enc.emitByte(byte(c)); err != nil { return err } if !enc.Compact && enc.Indent != "" { return enc.emitByte(' ') } return nil default: return &SyntaxError{fmt.Sprintf("object member: unexpected character: %c", c), enc.inputPos} } } func (enc *ReEncoder) stateAfterV(c rune) error { switch c { case 0x0020, 0x000A, 0x000D, 0x0009: if enc.Compact || enc.Indent != "" { return nil } case ',': enc.replaceState(enc.stateInNonEmptyObject, false) case '}': enc.popState() enc.curIndent-- if err := enc.nlIndent(); err != nil { return err } default: return &SyntaxError{fmt.Sprintf("object member: unexpected character: %c", c), enc.inputPos} } return enc.emitByte(byte(c)) } // array /////////////////////////////////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateInEmptyArray(c rune) error { return enc._stateInArray(c, false) } func (enc *ReEncoder) stateInNonEmptyArray(c rune) error { return enc._stateInArray(c, true) } func (enc *ReEncoder) _stateInArray(c rune, nonempty bool) error { switch c { case 0x0020, 0x000A, 0x000D, 0x0009: if enc.Compact || enc.Indent != "" { return nil } case ']': enc.popState() enc.curIndent-- if nonempty { if err := enc.nlIndent(); err != nil { return err } } default: if err := enc.nlIndent(); err != nil { return err } enc.replaceState(enc.stateAfterItem, false) enc.pushState(enc.stateAny, false) return enc.state(c) } return enc.emitByte(byte(c)) } func (enc *ReEncoder) stateAfterItem(c rune) error { switch c { case 0x0020, 0x000A, 0x000D, 0x0009: if enc.Compact || enc.Indent != "" { return nil } case ',': enc.replaceState(enc.stateInNonEmptyArray, false) case ']': enc.popState() enc.curIndent-- if err := enc.nlIndent(); err != nil { return err } default: return &SyntaxError{fmt.Sprintf("array: unexpected character: %c", c), enc.inputPos} } return enc.emitByte(byte(c)) } // string ////////////////////////////////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateInString(c rune) error { switch { case c == '\\': enc.replaceState(enc.stateInBackslash, false) return nil case c == '"': enc.popState() return enc.emitByte(byte(c)) case 0x0020 <= c && c <= 0x10FFFF: return enc.emit(writeStringChar(enc.Out, c, false, enc.UnicodeEscape)) default: return &SyntaxError{fmt.Sprintf("string: unexpected character: %c", c), enc.inputPos} } } func (enc *ReEncoder) stateInBackslash(c rune) error { switch c { case '"': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '"', false, enc.UnicodeEscape)) case '\\': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '\\', false, enc.UnicodeEscape)) case '/': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '/', false, enc.UnicodeEscape)) case 'b': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '\b', false, enc.UnicodeEscape)) case 'f': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '\f', false, enc.UnicodeEscape)) case 'n': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '\n', false, enc.UnicodeEscape)) case 'r': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '\r', false, enc.UnicodeEscape)) case 't': enc.replaceState(enc.stateInString, false) return enc.emit(writeStringChar(enc.Out, '\t', false, enc.UnicodeEscape)) case 'u': enc.replaceState(enc.stateInUnicode, false) return nil default: return &SyntaxError{fmt.Sprintf("string backslash sequence: unexpected character: %c", c), enc.inputPos} } } func (enc *ReEncoder) stateInUnicode(c rune) error { switch { case '0' <= c && c <= '9': enc.stateBuf = append(enc.stateBuf, byte(c)-'0') case 'a' <= c && c <= 'f': enc.stateBuf = append(enc.stateBuf, byte(c)-'a'+10) case 'A' <= c && c <= 'F': enc.stateBuf = append(enc.stateBuf, byte(c)-'A'+10) default: return &SyntaxError{fmt.Sprintf("string unicode sequence: unexpected character: %c", c), enc.inputPos} } if len(enc.stateBuf) == 4 { enc.replaceState(enc.stateInString, false) c := 0 | rune(enc.stateBuf[0])<<12 | rune(enc.stateBuf[1])<<8 | rune(enc.stateBuf[2])<<4 | rune(enc.stateBuf[3])<<0 enc.stateBuf = enc.stateBuf[:0] return enc.emit(writeStringChar(enc.Out, c, true, enc.UnicodeEscape)) } return nil } // number ////////////////////////////////////////////////////////////////////////////////////////// // Here's a flattened drawing of the syntax diagram from www.json.org : // // [------------ integer ----------][-- fraction ---][-------- exponent -------] // >─╮─────╭─╮─"0"───────╭─────────╭──╮─────────────╭──╮───────────────────────╭─> // │ │ │ │ │ │ │ │ │ // ╰─"-"─╯ ╰─digit 1-9─╯─╭digit╮─╯ ╰─"."─╭digit╮─╯ ╰─"e"─╭─╮─────╭─╭digit╮─╯ // ╰──<──╯ ╰──<──╯ │ │ │ │ ╰──<──╯ // ╰─"E"─╯ ╰─"-"─╯ // │ │ // ╰─"+"─╯ // // Now here it is slightly redrawn, and with each distinct state our // decoder can be in marked with a single-capital-letter: // // [-------------- integer ------------][--------- fraction --------][--------- exponent ---------] // >─A─╮───────╭──╮─"0"─────────C─╭─────────╮──────────────────╭─────────╮──────────────────────────╭─> // │ │ │ │ │ │ │ │ // ╰─"-"─B─╯ ╰─digit 1-9─╭─D─╯─digit╮ ╰─"."─E─digit──╭─F─╯─digit╮ ╰─"e"─╭─G─╮─────╭─╭digit─H─╯ // ╰────<─────╯ ╰────<─────╯ │ │ │ │ ╰────<───╯ // ╰─"E"─╯ ╰─"-"─╯ // │ │ // ╰─"+"─╯ // // Which state we're at is the 'X' in 'stateNumberX'. // // Besides just traversing that, there are a few compressions we want to make: // // - trim trailing 0s from fraction the (but don't remove the // fraction if it's all 0s); do this by making the F state a little // special. This requires a little more state, because when we // encounter the 0 we don't yet know if it's trailing. So, store // the number of maybe-trailing zeros in enc.stateBuf[0]; if that // reaches 255, then bleed over to enc.stateBuf[1] and so on. // // - trim leading 0s from the exponent (but don't remove the exponent // if it's all 0s); do this by making the H state a little special. // Record whether we've seen a non-zero digit in enc.stateBuf[0] // (0=false, 1=true). // integer-part //////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateNumberA(c rune) error { // start switch c { case '-': enc.replaceState(enc.stateNumberB, true) case '0': enc.replaceState(enc.stateNumberC, true) case '1', '2', '3', '4', '5', '6', '7', '8', '9': enc.replaceState(enc.stateNumberD, true) default: return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} } return enc.emitByte(byte(c)) } func (enc *ReEncoder) stateNumberB(c rune) error { // got a leading "-" switch c { case '0': enc.replaceState(enc.stateNumberC, true) case '1', '2', '3', '4', '5', '6', '7', '8', '9': enc.replaceState(enc.stateNumberD, true) default: return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} } return enc.emitByte(byte(c)) } func (enc *ReEncoder) stateNumberC(c rune) error { // ready for the fraction or exponent part to start switch c { case '.': enc.replaceState(enc.stateNumberE, true) return enc.emitByte('.') case 'e', 'E': enc.replaceState(enc.stateNumberG, true) enc.stateBuf = append(enc.stateBuf[:0], 0) return enc.emitByte('e') default: enc.popState() return enc.state(c) } } func (enc *ReEncoder) stateNumberD(c rune) error { // in the integer part switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return enc.emitByte(byte(c)) case '.': enc.replaceState(enc.stateNumberE, true) return enc.emitByte('.') case 'e', 'E': enc.replaceState(enc.stateNumberG, true) enc.stateBuf = append(enc.stateBuf[:0], 0) return enc.emitByte('e') default: enc.popState() return enc.state(c) } } // fraction-part /////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateNumberE(c rune) error { // got a ".", ready to read a number for the fraction part switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': enc.replaceState(enc.stateNumberF, true) return enc.emitByte(byte(c)) default: return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} } } func (enc *ReEncoder) stateNumberF(c rune) error { // in the fraction part switch c { case '0': if len(enc.stateBuf) > 0 && enc.stateBuf[len(enc.stateBuf)-1] < 255 { enc.stateBuf[len(enc.stateBuf)-1]++ } else { enc.stateBuf = append(enc.stateBuf, 1) } return nil case '1', '2', '3', '4', '5', '6', '7', '8', '9': for len(enc.stateBuf) > 0 { if err := enc.emitByte('0'); err != nil { return err } if enc.stateBuf[len(enc.stateBuf)-1] == 1 { enc.stateBuf = enc.stateBuf[:len(enc.stateBuf)-1] } else { enc.stateBuf[len(enc.stateBuf)-1]-- } } return enc.emitByte(byte(c)) case 'e', 'E': enc.replaceState(enc.stateNumberG, true) enc.stateBuf = append(enc.stateBuf[:0], 0) return enc.emitByte('e') default: enc.stateBuf = enc.stateBuf[:0] enc.popState() return enc.state(c) } } // exponent-part /////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateNumberG(c rune) error { // got a leading "e" switch c { case '-', '+': enc.replaceState(enc.stateNumberH, true) return enc.emitByte(byte(c)) case '0': enc.replaceState(enc.stateNumberH, true) return nil case '1', '2', '3', '4', '5', '6', '7', '8', '9': enc.replaceState(enc.stateNumberH, true) enc.stateBuf[0] = 1 return enc.emitByte(byte(c)) default: enc.stateBuf = enc.stateBuf[:0] return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} } } func (enc *ReEncoder) stateNumberH(c rune) error { // in the exponent's number part switch c { case '0': if enc.stateBuf[0] == 0 { return nil } return enc.emitByte('0') case '1', '2', '3', '4', '5', '6', '7', '8', '9': enc.stateBuf[0] = 1 return enc.emitByte(byte(c)) default: if enc.stateBuf[0] == 0 { if err := enc.emitByte('0'); err != nil { return err } } enc.stateBuf = enc.stateBuf[:0] enc.popState() return enc.state(c) } } // literals //////////////////////////////////////////////////////////////////////////////////////// func (enc *ReEncoder) stateInTrue(c rune) error { return enc._stateInLiteral(c, "true") } func (enc *ReEncoder) stateInFalse(c rune) error { return enc._stateInLiteral(c, "false") } func (enc *ReEncoder) stateInNull(c rune) error { return enc._stateInLiteral(c, "null") } func (enc *ReEncoder) _stateInLiteral(c rune, full string) error { if c != rune(full[len(enc.stateBuf)]) { return &SyntaxError{fmt.Sprintf("%s: unexpected character: %c", full, c), enc.inputPos} } enc.stateBuf = append(enc.stateBuf, byte(c)) if len(enc.stateBuf) == len(full) { enc.stateBuf = enc.stateBuf[:0] enc.popState() } return enc.emitByte(byte(c)) }