summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2022-07-30 22:15:37 -0600
committerLuke Shumaker <lukeshu@lukeshu.com>2022-07-31 00:30:31 -0600
commit43b4a255bc074d26e7fb3b60e6b7802033195b51 (patch)
tree1823bb2a43893796991d56afdf573aeb67f8b013
parent6792ed1b431e79c18058414dbb7586977ec2ca62 (diff)
implement low-mem json re-encoding
-rw-r--r--lib/lowmemjson/reencode.go600
1 files changed, 600 insertions, 0 deletions
diff --git a/lib/lowmemjson/reencode.go b/lib/lowmemjson/reencode.go
new file mode 100644
index 0000000..c295d2c
--- /dev/null
+++ b/lib/lowmemjson/reencode.go
@@ -0,0 +1,600 @@
+// Copyright (C) 2022 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package lowmemjson
+
+import (
+ "errors"
+ "fmt"
+ "unicode/utf8"
+)
+
+type reencodeState func(rune) error
+
+type ReEncoder struct {
+ Out interface {
+ WriteRune(rune) error
+ WriteByte(byte) error
+ WriteString(string) (int, error)
+ }
+
+ // Whether to minify the JSON.
+ Compact bool
+ // String to use to indent; ignored if Compact is true.
+ Indent string
+ // Returns whether a given character in a string should be
+ // "\uXXXX" escaped. The bool argument is whether it was
+ // \u-escaped in the input.
+ UnicodeEscape func(rune, bool) bool
+
+ // state: .Write's utf8-decoding buffer
+ buf [4]byte
+ bufLen int
+
+ // state: .WriteRune
+ err error
+ stack []reencodeState
+ stack0IsNumber bool
+ curIndent int
+
+ // state: reencodeState-specific
+ stateBuf []byte
+}
+
+func (enc *ReEncoder) Write(p []byte) (int, error) {
+ if len(p) == 0 {
+ return 0, nil
+ }
+ var n int
+ if enc.bufLen > 0 {
+ copy(enc.buf[enc.bufLen:], p)
+ c, size := utf8.DecodeRune(enc.buf[:])
+ n += size - enc.bufLen
+ enc.bufLen = 0
+ if err := enc.WriteRune(c); err != nil {
+ return 0, err
+ }
+ }
+ for utf8.FullRune(p[n:]) {
+ c, size := utf8.DecodeRune(p[n:])
+ if err := enc.WriteRune(c); err != nil {
+ return n, err
+ }
+ n += size
+ }
+ enc.bufLen = copy(enc.buf[:], p)
+ return n, nil
+}
+
+func (enc *ReEncoder) Flush() error {
+ if enc.bufLen > 0 {
+ return fmt.Errorf("unflushed unicode garbage: %q", enc.buf[:enc.bufLen])
+ }
+ switch len(enc.stack) {
+ case 0:
+ return nil
+ case 1:
+ if enc.stack0IsNumber {
+ enc.Compact = true
+ return enc.state('\n')
+ }
+ fallthrough
+ default:
+ return fmt.Errorf("in the middle of a value")
+ }
+}
+
+func (enc *ReEncoder) WriteRune(c rune) (err error) {
+ if enc.err != nil {
+ return enc.err
+ }
+ defer func() {
+ if err != nil {
+ enc.err = err
+ }
+ }()
+ if enc.bufLen != 0 {
+ return errors.New("lowmemjson.ReEncoder: cannot .WriteRune() when there is a partial rune that has been .Write()n")
+ }
+ return enc.state(c)
+}
+
+func (enc *ReEncoder) pushState(state reencodeState, isNumber bool) {
+ if len(enc.stack) == 0 {
+ enc.stack0IsNumber = isNumber
+ }
+ enc.stack = append(enc.stack, state)
+}
+func (enc *ReEncoder) replaceState(state reencodeState, isNumber bool) {
+ if len(enc.stack) == 1 {
+ enc.stack0IsNumber = isNumber
+ }
+ enc.stack[len(enc.stack)-1] = state
+}
+func (enc *ReEncoder) popState() {
+ if len(enc.stack) == 1 {
+ enc.stack0IsNumber = false
+ }
+ enc.stack = enc.stack[:len(enc.stack)-1]
+}
+
+func (enc *ReEncoder) state(c rune) error {
+ if len(enc.stack) == 0 {
+ enc.pushState(enc.stateAny, false)
+ }
+ return enc.stack[len(enc.stack)-1](c)
+}
+
+func (enc *ReEncoder) nlIndent() error {
+ if enc.Compact || enc.Indent == "" {
+ return nil
+ }
+ if err := enc.Out.WriteByte('\n'); err != nil {
+ return err
+ }
+ for i := 0; i < enc.curIndent; i++ {
+ if _, err := enc.Out.WriteString(enc.Indent); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// any /////////////////////////////////////////////////////////////////////////////////////////////
+
+func (enc *ReEncoder) stateAny(c rune) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case '{':
+ enc.replaceState(enc.stateInEmptyObject, false)
+ enc.curIndent++
+ case '[':
+ enc.replaceState(enc.stateInEmptyArray, false)
+ enc.curIndent++
+ case '"':
+ enc.replaceState(enc.stateInString, false)
+ case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberA, true)
+ return enc.state(c)
+ case 't':
+ enc.replaceState(enc.stateInTrue, false)
+ enc.stateBuf = append(enc.stateBuf[:0], 't')
+ case 'f':
+ enc.replaceState(enc.stateInFalse, false)
+ enc.stateBuf = append(enc.stateBuf[:0], 'f')
+ case 'n':
+ enc.replaceState(enc.stateInNull, false)
+ enc.stateBuf = append(enc.stateBuf[:0], 'n')
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteRune(c)
+}
+
+// object //////////////////////////////////////////////////////////////////////////////////////////
+
+func (enc *ReEncoder) stateInEmptyObject(c rune) error { return enc._stateInObject(c, false) }
+func (enc *ReEncoder) stateInNonEmptyObject(c rune) error { return enc._stateInObject(c, true) }
+func (enc *ReEncoder) _stateInObject(c rune, nonempty bool) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case '"':
+ if err := enc.nlIndent(); err != nil {
+ return err
+ }
+ enc.replaceState(enc.stateInKV, false)
+ enc.pushState(enc.stateInString, false)
+ case '}':
+ enc.popState()
+ enc.curIndent--
+ if nonempty {
+ if err := enc.nlIndent(); err != nil {
+ return err
+ }
+ }
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteRune(c)
+}
+func (enc *ReEncoder) stateInKV(c rune) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case ':':
+ enc.replaceState(enc.stateAfterV, false)
+ enc.pushState(enc.stateAny, false)
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteRune(c)
+}
+func (enc *ReEncoder) stateAfterV(c rune) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case ',':
+ enc.replaceState(enc.stateInNonEmptyObject, false)
+ case '}':
+ enc.popState()
+ enc.curIndent--
+ if err := enc.nlIndent(); err != nil {
+ return err
+ }
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteRune(c)
+}
+
+// array ///////////////////////////////////////////////////////////////////////////////////////////
+
+func (enc *ReEncoder) stateInEmptyArray(c rune) error { return enc._stateInArray(c, false) }
+func (enc *ReEncoder) stateInNonEmptyArray(c rune) error { return enc._stateInArray(c, true) }
+func (enc *ReEncoder) _stateInArray(c rune, nonempty bool) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case ']':
+ enc.popState()
+ enc.curIndent--
+ if nonempty {
+ if err := enc.nlIndent(); err != nil {
+ return err
+ }
+ }
+ default:
+ if err := enc.nlIndent(); err != nil {
+ return err
+ }
+ enc.replaceState(enc.stateAfterItem, false)
+ enc.pushState(enc.stateAny, false)
+ return enc.state(c)
+ }
+ return enc.Out.WriteRune(c)
+}
+func (enc *ReEncoder) stateAfterItem(c rune) error {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ if enc.Compact || enc.Indent != "" {
+ return nil
+ }
+ case ',':
+ enc.replaceState(enc.stateInNonEmptyArray, false)
+ case ']':
+ enc.popState()
+ enc.curIndent--
+ if err := enc.nlIndent(); err != nil {
+ return err
+ }
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteRune(c)
+}
+
+// string //////////////////////////////////////////////////////////////////////////////////////////
+
+func (enc *ReEncoder) emitStringUnicodeEscape(c rune) error {
+ if err := enc.Out.WriteByte('\\'); err != nil {
+ return err
+ }
+ if err := enc.Out.WriteByte('u'); err != nil {
+ return err
+ }
+ if err := enc.Out.WriteByte(hex[(c>>24)&0xff]); err != nil {
+ return err
+ }
+ if err := enc.Out.WriteByte(hex[(c>>16)&0xff]); err != nil {
+ return err
+ }
+ if err := enc.Out.WriteByte(hex[(c>>8)&0xff]); err != nil {
+ return err
+ }
+ if err := enc.Out.WriteByte(hex[(c>>0)&0xff]); err != nil {
+ return err
+ }
+ return nil
+}
+func (enc *ReEncoder) emitStringShortEscape(c byte) error {
+ if err := enc.Out.WriteByte('\\'); err != nil {
+ return err
+ }
+ if err := enc.Out.WriteByte(c); err != nil {
+ return err
+ }
+ return nil
+}
+func (enc *ReEncoder) emitStringChar(c rune) error {
+ switch {
+ case enc.UnicodeEscape(c, false):
+ return enc.emitStringUnicodeEscape(c)
+ case c == '"' || c == '\\':
+ return enc.emitStringShortEscape(byte(c))
+ case c < 0x0020 || c > 0x10FFFF:
+ switch c {
+ case '\b':
+ return enc.emitStringShortEscape('b')
+ case '\f':
+ return enc.emitStringShortEscape('f')
+ case '\n':
+ return enc.emitStringShortEscape('n')
+ case '\r':
+ return enc.emitStringShortEscape('r')
+ case '\t':
+ return enc.emitStringShortEscape('t')
+ default:
+ return enc.emitStringUnicodeEscape(c)
+ }
+ default:
+ return enc.Out.WriteRune(c)
+ }
+}
+
+func (enc *ReEncoder) stateInString(c rune) error {
+ switch {
+ case c == '\\':
+ enc.replaceState(enc.stateInBackslash, false)
+ return nil
+ case c == '"':
+ enc.popState()
+ return enc.Out.WriteRune(c)
+ case 0x0020 <= c && c <= 0x10FFFF:
+ return enc.emitStringChar(c)
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+}
+func (enc *ReEncoder) stateInBackslash(c rune) error {
+ switch c {
+ case '"':
+ return enc.emitStringChar('"')
+ case '\\':
+ return enc.emitStringChar('\\')
+ case 'b':
+ return enc.emitStringChar('\b')
+ case 'f':
+ return enc.emitStringChar('\f')
+ case 'n':
+ return enc.emitStringChar('\n')
+ case 'r':
+ return enc.emitStringChar('\r')
+ case 't':
+ return enc.emitStringChar('\t')
+ case 'u':
+ enc.replaceState(enc.stateInUnicode, false)
+ return nil
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+}
+func (enc *ReEncoder) stateInUnicode(c rune) error {
+ switch {
+ case '0' <= c && c <= '9':
+ enc.stateBuf = append(enc.stateBuf, byte(c)-'0')
+ case 'a' <= c && c <= 'f':
+ enc.stateBuf = append(enc.stateBuf, byte(c)-'a')
+ case 'A' <= c && c <= 'F':
+ enc.stateBuf = append(enc.stateBuf, byte(c)-'A')
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ if len(enc.stateBuf) == 4 {
+ enc.replaceState(enc.stateInString, false)
+ c := 0 |
+ rune(enc.stateBuf[0])<<24 |
+ rune(enc.stateBuf[1])<<16 |
+ rune(enc.stateBuf[2])<<8 |
+ rune(enc.stateBuf[3])<<0
+ enc.stateBuf = enc.stateBuf[:0]
+ return enc.emitStringChar(c)
+ }
+ return nil
+}
+
+// number //////////////////////////////////////////////////////////////////////////////////////////
+
+// Here's a flattened drawing of the syntax diagram from www.json.org :
+//
+// [------------ integer ----------][-- fraction ---][-------- exponent -------]
+// >─╮─────╭─╮─"0"───────╭─────────╭──╮─────────────╭──╮───────────────────────╭─>
+// │ │ │ │ │ │ │ │ │
+// ╰─"-"─╯ ╰─digit 1-9─╯─╭digit╮─╯ ╰─"."─╭digit╮─╯ ╰─"e"─╭─╮─────╭─╭digit╮─╯
+// ╰──<──╯ ╰──<──╯ │ │ │ │ ╰──<──╯
+// ╰─"E"─╯ ╰─"-"─╯
+// │ │
+// ╰─"+"─╯
+//
+// Now here it is slightly redrawn, and with each distinct state our
+// decoder can be in marked with a single-capital-letter:
+//
+// [-------------- integer ------------][--------- fraction --------][--------- exponent ---------]
+// >─A─╮───────╭──╮─"0"─────────C─╭─────────╮──────────────────╭─────────╮──────────────────────────╭─>
+// │ │ │ │ │ │ │ │
+// ╰─"-"─B─╯ ╰─digit 1-9─╭─D─╯─digit╮ ╰─"."─E─digit──╭─F─╯─digit╮ ╰─"e"─╭─G─╮─────╭─╭digit─H─╯
+// ╰────<─────╯ ╰────<─────╯ │ │ │ │ ╰────<───╯
+// ╰─"E"─╯ ╰─"-"─╯
+// │ │
+// ╰─"+"─╯
+//
+// Which state we're at is stored the 'X' in 'stateNumberX'.
+//
+// Besides just traversing that, there are a few compressions we want to make:
+//
+// - trim trailing 0s from fraction the (but don't remove the
+// fraction if it's all 0s); do this by making the F state a little
+// special. This requires a little more state, because when we
+// encounter the 0 we don't yet know if it's trailing. So, store
+// the number of maybe-trailing zeros in enc.stateBuf[0]; if that
+// reaches 255, then bleed over to enc.stateBuf[1] and so on.
+//
+// - trim leading 0s from the exponent (but don't remove the exponent
+// if it's all 0s); do this by making the H state a little special.
+// Record whether we've seen a non-zero digit in enc.stateBuf[0]
+// (0=false, 1=true).
+
+// integer-part ////////////////////////////////////////////////////////////////
+func (enc *ReEncoder) stateNumberA(c rune) error { // start
+ switch c {
+ case '-':
+ enc.replaceState(enc.stateNumberB, true)
+ case '0':
+ enc.replaceState(enc.stateNumberC, true)
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberD, true)
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteByte(byte(c))
+}
+func (enc *ReEncoder) stateNumberB(c rune) error { // got a leading "-"
+ switch c {
+ case '0':
+ enc.replaceState(enc.stateNumberC, true)
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberD, true)
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ return enc.Out.WriteByte(byte(c))
+}
+func (enc *ReEncoder) stateNumberC(c rune) error { // ready for the fraction or exponent part to start
+ switch c {
+ case '.':
+ enc.replaceState(enc.stateNumberE, true)
+ return enc.Out.WriteByte('.')
+ case 'e', 'E':
+ enc.replaceState(enc.stateNumberG, true)
+ enc.stateBuf = append(enc.stateBuf[:0], 0)
+ return enc.Out.WriteByte('e')
+ default:
+ enc.popState()
+ return enc.state(c)
+ }
+}
+func (enc *ReEncoder) stateNumberD(c rune) error { // in the integer part
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return enc.Out.WriteByte(byte(c))
+ case '.':
+ enc.replaceState(enc.stateNumberE, true)
+ return enc.Out.WriteByte('.')
+ case 'e', 'E':
+ enc.replaceState(enc.stateNumberG, true)
+ enc.stateBuf = append(enc.stateBuf[:0], 0)
+ return enc.Out.WriteByte('e')
+ default:
+ enc.popState()
+ return enc.state(c)
+ }
+}
+
+// fraction-part ///////////////////////////////////////////////////////////////
+func (enc *ReEncoder) stateNumberE(c rune) error { // got a ".", ready to read a number for the fraction part
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberF, true)
+ return enc.Out.WriteByte(byte(c))
+ default:
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+}
+func (enc *ReEncoder) stateNumberF(c rune) error { // in the fraction part
+ switch c {
+ case '0':
+ if len(enc.stateBuf) > 0 && enc.stateBuf[len(enc.stateBuf)-1] < 255 {
+ enc.stateBuf[len(enc.stateBuf)-1]++
+ } else {
+ enc.stateBuf = append(enc.stateBuf, 1)
+ }
+ return nil
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ for len(enc.stateBuf) > 0 {
+ if err := enc.Out.WriteByte('0'); err != nil {
+ return err
+ }
+ if enc.stateBuf[len(enc.stateBuf)-1] == 1 {
+ enc.stateBuf = enc.stateBuf[:len(enc.stateBuf)-1]
+ } else {
+ enc.stateBuf[len(enc.stateBuf)-1]--
+ }
+ }
+ return enc.Out.WriteByte(byte(c))
+ case 'e', 'E':
+ enc.stateBuf = append(enc.stateBuf[:0], 0)
+ return enc.Out.WriteByte('e')
+ default:
+ enc.stateBuf = enc.stateBuf[:0]
+ enc.popState()
+ return enc.state(c)
+ }
+}
+
+// exponent-part ///////////////////////////////////////////////////////////////
+func (enc *ReEncoder) stateNumberG(c rune) error { // got a leading "e"
+ switch c {
+ case '-', '+':
+ enc.replaceState(enc.stateNumberH, true)
+ return enc.Out.WriteByte(byte(c))
+ case '0':
+ enc.replaceState(enc.stateNumberH, true)
+ return nil
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.replaceState(enc.stateNumberH, true)
+ enc.stateBuf[1] = 1
+ return enc.Out.WriteByte(byte(c))
+ default:
+ enc.stateBuf = enc.stateBuf[:0]
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+}
+func (enc *ReEncoder) stateNumberH(c rune) error { // in the exponent's number part
+ switch c {
+ case '0':
+ if enc.stateBuf[1] == 0 {
+ return nil
+ }
+ return enc.Out.WriteByte('0')
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ enc.stateBuf[1] = 1
+ return enc.Out.WriteByte(byte(c))
+ default:
+ if enc.stateBuf[1] == 0 {
+ if err := enc.Out.WriteByte('0'); err != nil {
+ return err
+ }
+ }
+ enc.stateBuf = enc.stateBuf[:0]
+ enc.popState()
+ return enc.state(c)
+ }
+}
+
+// literals ////////////////////////////////////////////////////////////////////////////////////////
+
+func (enc *ReEncoder) stateInTrue(c rune) error { return enc._stateInLiteral(c, "true") }
+func (enc *ReEncoder) stateInFalse(c rune) error { return enc._stateInLiteral(c, "false") }
+func (enc *ReEncoder) stateInNull(c rune) error { return enc._stateInLiteral(c, "null") }
+func (enc *ReEncoder) _stateInLiteral(c rune, full string) error {
+ if c != rune(full[len(enc.stateBuf)+1]) {
+ return fmt.Errorf("unexpected character: %c", c)
+ }
+ enc.stateBuf = append(enc.stateBuf, byte(c))
+ if len(enc.stateBuf) == len(full) {
+ enc.popState()
+ }
+ return enc.Out.WriteRune(c)
+}