From 43b4a255bc074d26e7fb3b60e6b7802033195b51 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 30 Jul 2022 22:15:37 -0600 Subject: implement low-mem json re-encoding --- lib/lowmemjson/reencode.go | 600 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 600 insertions(+) create mode 100644 lib/lowmemjson/reencode.go diff --git a/lib/lowmemjson/reencode.go b/lib/lowmemjson/reencode.go new file mode 100644 index 0000000..c295d2c --- /dev/null +++ b/lib/lowmemjson/reencode.go @@ -0,0 +1,600 @@ +// Copyright (C) 2022 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package lowmemjson + +import ( + "errors" + "fmt" + "unicode/utf8" +) + +type reencodeState func(rune) error + +type ReEncoder struct { + Out interface { + WriteRune(rune) error + WriteByte(byte) error + WriteString(string) (int, error) + } + + // Whether to minify the JSON. + Compact bool + // String to use to indent; ignored if Compact is true. + Indent string + // Returns whether a given character in a string should be + // "\uXXXX" escaped. The bool argument is whether it was + // \u-escaped in the input. + UnicodeEscape func(rune, bool) bool + + // state: .Write's utf8-decoding buffer + buf [4]byte + bufLen int + + // state: .WriteRune + err error + stack []reencodeState + stack0IsNumber bool + curIndent int + + // state: reencodeState-specific + stateBuf []byte +} + +func (enc *ReEncoder) Write(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + var n int + if enc.bufLen > 0 { + copy(enc.buf[enc.bufLen:], p) + c, size := utf8.DecodeRune(enc.buf[:]) + n += size - enc.bufLen + enc.bufLen = 0 + if err := enc.WriteRune(c); err != nil { + return 0, err + } + } + for utf8.FullRune(p[n:]) { + c, size := utf8.DecodeRune(p[n:]) + if err := enc.WriteRune(c); err != nil { + return n, err + } + n += size + } + enc.bufLen = copy(enc.buf[:], p) + return n, nil +} + +func (enc *ReEncoder) Flush() error { + if enc.bufLen > 0 { + return fmt.Errorf("unflushed unicode garbage: %q", enc.buf[:enc.bufLen]) + } + switch len(enc.stack) { + case 0: + return nil + case 1: + if enc.stack0IsNumber { + enc.Compact = true + return enc.state('\n') + } + fallthrough + default: + return fmt.Errorf("in the middle of a value") + } +} + +func (enc *ReEncoder) WriteRune(c rune) (err error) { + if enc.err != nil { + return enc.err + } + defer func() { + if err != nil { + enc.err = err + } + }() + if enc.bufLen != 0 { + return errors.New("lowmemjson.ReEncoder: cannot .WriteRune() when there is a partial rune that has been .Write()n") + } + return enc.state(c) +} + +func (enc *ReEncoder) pushState(state reencodeState, isNumber bool) { + if len(enc.stack) == 0 { + enc.stack0IsNumber = isNumber + } + enc.stack = append(enc.stack, state) +} +func (enc *ReEncoder) replaceState(state reencodeState, isNumber bool) { + if len(enc.stack) == 1 { + enc.stack0IsNumber = isNumber + } + enc.stack[len(enc.stack)-1] = state +} +func (enc *ReEncoder) popState() { + if len(enc.stack) == 1 { + enc.stack0IsNumber = false + } + enc.stack = enc.stack[:len(enc.stack)-1] +} + +func (enc *ReEncoder) state(c rune) error { + if len(enc.stack) == 0 { + enc.pushState(enc.stateAny, false) + } + return enc.stack[len(enc.stack)-1](c) +} + +func (enc *ReEncoder) nlIndent() error { + if enc.Compact || enc.Indent == "" { + return nil + } + if err := enc.Out.WriteByte('\n'); err != nil { + return err + } + for i := 0; i < enc.curIndent; i++ { + if _, err := enc.Out.WriteString(enc.Indent); err != nil { + return err + } + } + return nil +} + +// any ///////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateAny(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case '{': + enc.replaceState(enc.stateInEmptyObject, false) + enc.curIndent++ + case '[': + enc.replaceState(enc.stateInEmptyArray, false) + enc.curIndent++ + case '"': + enc.replaceState(enc.stateInString, false) + case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberA, true) + return enc.state(c) + case 't': + enc.replaceState(enc.stateInTrue, false) + enc.stateBuf = append(enc.stateBuf[:0], 't') + case 'f': + enc.replaceState(enc.stateInFalse, false) + enc.stateBuf = append(enc.stateBuf[:0], 'f') + case 'n': + enc.replaceState(enc.stateInNull, false) + enc.stateBuf = append(enc.stateBuf[:0], 'n') + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteRune(c) +} + +// object ////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInEmptyObject(c rune) error { return enc._stateInObject(c, false) } +func (enc *ReEncoder) stateInNonEmptyObject(c rune) error { return enc._stateInObject(c, true) } +func (enc *ReEncoder) _stateInObject(c rune, nonempty bool) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case '"': + if err := enc.nlIndent(); err != nil { + return err + } + enc.replaceState(enc.stateInKV, false) + enc.pushState(enc.stateInString, false) + case '}': + enc.popState() + enc.curIndent-- + if nonempty { + if err := enc.nlIndent(); err != nil { + return err + } + } + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteRune(c) +} +func (enc *ReEncoder) stateInKV(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ':': + enc.replaceState(enc.stateAfterV, false) + enc.pushState(enc.stateAny, false) + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteRune(c) +} +func (enc *ReEncoder) stateAfterV(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ',': + enc.replaceState(enc.stateInNonEmptyObject, false) + case '}': + enc.popState() + enc.curIndent-- + if err := enc.nlIndent(); err != nil { + return err + } + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteRune(c) +} + +// array /////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInEmptyArray(c rune) error { return enc._stateInArray(c, false) } +func (enc *ReEncoder) stateInNonEmptyArray(c rune) error { return enc._stateInArray(c, true) } +func (enc *ReEncoder) _stateInArray(c rune, nonempty bool) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ']': + enc.popState() + enc.curIndent-- + if nonempty { + if err := enc.nlIndent(); err != nil { + return err + } + } + default: + if err := enc.nlIndent(); err != nil { + return err + } + enc.replaceState(enc.stateAfterItem, false) + enc.pushState(enc.stateAny, false) + return enc.state(c) + } + return enc.Out.WriteRune(c) +} +func (enc *ReEncoder) stateAfterItem(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ',': + enc.replaceState(enc.stateInNonEmptyArray, false) + case ']': + enc.popState() + enc.curIndent-- + if err := enc.nlIndent(); err != nil { + return err + } + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteRune(c) +} + +// string ////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) emitStringUnicodeEscape(c rune) error { + if err := enc.Out.WriteByte('\\'); err != nil { + return err + } + if err := enc.Out.WriteByte('u'); err != nil { + return err + } + if err := enc.Out.WriteByte(hex[(c>>24)&0xff]); err != nil { + return err + } + if err := enc.Out.WriteByte(hex[(c>>16)&0xff]); err != nil { + return err + } + if err := enc.Out.WriteByte(hex[(c>>8)&0xff]); err != nil { + return err + } + if err := enc.Out.WriteByte(hex[(c>>0)&0xff]); err != nil { + return err + } + return nil +} +func (enc *ReEncoder) emitStringShortEscape(c byte) error { + if err := enc.Out.WriteByte('\\'); err != nil { + return err + } + if err := enc.Out.WriteByte(c); err != nil { + return err + } + return nil +} +func (enc *ReEncoder) emitStringChar(c rune) error { + switch { + case enc.UnicodeEscape(c, false): + return enc.emitStringUnicodeEscape(c) + case c == '"' || c == '\\': + return enc.emitStringShortEscape(byte(c)) + case c < 0x0020 || c > 0x10FFFF: + switch c { + case '\b': + return enc.emitStringShortEscape('b') + case '\f': + return enc.emitStringShortEscape('f') + case '\n': + return enc.emitStringShortEscape('n') + case '\r': + return enc.emitStringShortEscape('r') + case '\t': + return enc.emitStringShortEscape('t') + default: + return enc.emitStringUnicodeEscape(c) + } + default: + return enc.Out.WriteRune(c) + } +} + +func (enc *ReEncoder) stateInString(c rune) error { + switch { + case c == '\\': + enc.replaceState(enc.stateInBackslash, false) + return nil + case c == '"': + enc.popState() + return enc.Out.WriteRune(c) + case 0x0020 <= c && c <= 0x10FFFF: + return enc.emitStringChar(c) + default: + return fmt.Errorf("unexpected character: %c", c) + } +} +func (enc *ReEncoder) stateInBackslash(c rune) error { + switch c { + case '"': + return enc.emitStringChar('"') + case '\\': + return enc.emitStringChar('\\') + case 'b': + return enc.emitStringChar('\b') + case 'f': + return enc.emitStringChar('\f') + case 'n': + return enc.emitStringChar('\n') + case 'r': + return enc.emitStringChar('\r') + case 't': + return enc.emitStringChar('\t') + case 'u': + enc.replaceState(enc.stateInUnicode, false) + return nil + default: + return fmt.Errorf("unexpected character: %c", c) + } +} +func (enc *ReEncoder) stateInUnicode(c rune) error { + switch { + case '0' <= c && c <= '9': + enc.stateBuf = append(enc.stateBuf, byte(c)-'0') + case 'a' <= c && c <= 'f': + enc.stateBuf = append(enc.stateBuf, byte(c)-'a') + case 'A' <= c && c <= 'F': + enc.stateBuf = append(enc.stateBuf, byte(c)-'A') + default: + return fmt.Errorf("unexpected character: %c", c) + } + if len(enc.stateBuf) == 4 { + enc.replaceState(enc.stateInString, false) + c := 0 | + rune(enc.stateBuf[0])<<24 | + rune(enc.stateBuf[1])<<16 | + rune(enc.stateBuf[2])<<8 | + rune(enc.stateBuf[3])<<0 + enc.stateBuf = enc.stateBuf[:0] + return enc.emitStringChar(c) + } + return nil +} + +// number ////////////////////////////////////////////////////////////////////////////////////////// + +// Here's a flattened drawing of the syntax diagram from www.json.org : +// +// [------------ integer ----------][-- fraction ---][-------- exponent -------] +// >─╮─────╭─╮─"0"───────╭─────────╭──╮─────────────╭──╮───────────────────────╭─> +// │ │ │ │ │ │ │ │ │ +// ╰─"-"─╯ ╰─digit 1-9─╯─╭digit╮─╯ ╰─"."─╭digit╮─╯ ╰─"e"─╭─╮─────╭─╭digit╮─╯ +// ╰──<──╯ ╰──<──╯ │ │ │ │ ╰──<──╯ +// ╰─"E"─╯ ╰─"-"─╯ +// │ │ +// ╰─"+"─╯ +// +// Now here it is slightly redrawn, and with each distinct state our +// decoder can be in marked with a single-capital-letter: +// +// [-------------- integer ------------][--------- fraction --------][--------- exponent ---------] +// >─A─╮───────╭──╮─"0"─────────C─╭─────────╮──────────────────╭─────────╮──────────────────────────╭─> +// │ │ │ │ │ │ │ │ +// ╰─"-"─B─╯ ╰─digit 1-9─╭─D─╯─digit╮ ╰─"."─E─digit──╭─F─╯─digit╮ ╰─"e"─╭─G─╮─────╭─╭digit─H─╯ +// ╰────<─────╯ ╰────<─────╯ │ │ │ │ ╰────<───╯ +// ╰─"E"─╯ ╰─"-"─╯ +// │ │ +// ╰─"+"─╯ +// +// Which state we're at is stored the 'X' in 'stateNumberX'. +// +// Besides just traversing that, there are a few compressions we want to make: +// +// - trim trailing 0s from fraction the (but don't remove the +// fraction if it's all 0s); do this by making the F state a little +// special. This requires a little more state, because when we +// encounter the 0 we don't yet know if it's trailing. So, store +// the number of maybe-trailing zeros in enc.stateBuf[0]; if that +// reaches 255, then bleed over to enc.stateBuf[1] and so on. +// +// - trim leading 0s from the exponent (but don't remove the exponent +// if it's all 0s); do this by making the H state a little special. +// Record whether we've seen a non-zero digit in enc.stateBuf[0] +// (0=false, 1=true). + +// integer-part //////////////////////////////////////////////////////////////// +func (enc *ReEncoder) stateNumberA(c rune) error { // start + switch c { + case '-': + enc.replaceState(enc.stateNumberB, true) + case '0': + enc.replaceState(enc.stateNumberC, true) + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberD, true) + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteByte(byte(c)) +} +func (enc *ReEncoder) stateNumberB(c rune) error { // got a leading "-" + switch c { + case '0': + enc.replaceState(enc.stateNumberC, true) + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberD, true) + default: + return fmt.Errorf("unexpected character: %c", c) + } + return enc.Out.WriteByte(byte(c)) +} +func (enc *ReEncoder) stateNumberC(c rune) error { // ready for the fraction or exponent part to start + switch c { + case '.': + enc.replaceState(enc.stateNumberE, true) + return enc.Out.WriteByte('.') + case 'e', 'E': + enc.replaceState(enc.stateNumberG, true) + enc.stateBuf = append(enc.stateBuf[:0], 0) + return enc.Out.WriteByte('e') + default: + enc.popState() + return enc.state(c) + } +} +func (enc *ReEncoder) stateNumberD(c rune) error { // in the integer part + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return enc.Out.WriteByte(byte(c)) + case '.': + enc.replaceState(enc.stateNumberE, true) + return enc.Out.WriteByte('.') + case 'e', 'E': + enc.replaceState(enc.stateNumberG, true) + enc.stateBuf = append(enc.stateBuf[:0], 0) + return enc.Out.WriteByte('e') + default: + enc.popState() + return enc.state(c) + } +} + +// fraction-part /////////////////////////////////////////////////////////////// +func (enc *ReEncoder) stateNumberE(c rune) error { // got a ".", ready to read a number for the fraction part + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberF, true) + return enc.Out.WriteByte(byte(c)) + default: + return fmt.Errorf("unexpected character: %c", c) + } +} +func (enc *ReEncoder) stateNumberF(c rune) error { // in the fraction part + switch c { + case '0': + if len(enc.stateBuf) > 0 && enc.stateBuf[len(enc.stateBuf)-1] < 255 { + enc.stateBuf[len(enc.stateBuf)-1]++ + } else { + enc.stateBuf = append(enc.stateBuf, 1) + } + return nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + for len(enc.stateBuf) > 0 { + if err := enc.Out.WriteByte('0'); err != nil { + return err + } + if enc.stateBuf[len(enc.stateBuf)-1] == 1 { + enc.stateBuf = enc.stateBuf[:len(enc.stateBuf)-1] + } else { + enc.stateBuf[len(enc.stateBuf)-1]-- + } + } + return enc.Out.WriteByte(byte(c)) + case 'e', 'E': + enc.stateBuf = append(enc.stateBuf[:0], 0) + return enc.Out.WriteByte('e') + default: + enc.stateBuf = enc.stateBuf[:0] + enc.popState() + return enc.state(c) + } +} + +// exponent-part /////////////////////////////////////////////////////////////// +func (enc *ReEncoder) stateNumberG(c rune) error { // got a leading "e" + switch c { + case '-', '+': + enc.replaceState(enc.stateNumberH, true) + return enc.Out.WriteByte(byte(c)) + case '0': + enc.replaceState(enc.stateNumberH, true) + return nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberH, true) + enc.stateBuf[1] = 1 + return enc.Out.WriteByte(byte(c)) + default: + enc.stateBuf = enc.stateBuf[:0] + return fmt.Errorf("unexpected character: %c", c) + } +} +func (enc *ReEncoder) stateNumberH(c rune) error { // in the exponent's number part + switch c { + case '0': + if enc.stateBuf[1] == 0 { + return nil + } + return enc.Out.WriteByte('0') + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.stateBuf[1] = 1 + return enc.Out.WriteByte(byte(c)) + default: + if enc.stateBuf[1] == 0 { + if err := enc.Out.WriteByte('0'); err != nil { + return err + } + } + enc.stateBuf = enc.stateBuf[:0] + enc.popState() + return enc.state(c) + } +} + +// literals //////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInTrue(c rune) error { return enc._stateInLiteral(c, "true") } +func (enc *ReEncoder) stateInFalse(c rune) error { return enc._stateInLiteral(c, "false") } +func (enc *ReEncoder) stateInNull(c rune) error { return enc._stateInLiteral(c, "null") } +func (enc *ReEncoder) _stateInLiteral(c rune, full string) error { + if c != rune(full[len(enc.stateBuf)+1]) { + return fmt.Errorf("unexpected character: %c", c) + } + enc.stateBuf = append(enc.stateBuf, byte(c)) + if len(enc.stateBuf) == len(full) { + enc.popState() + } + return enc.Out.WriteRune(c) +} -- cgit v1.1-4-g5e80