diff options
Diffstat (limited to 'internal/parse.go')
-rw-r--r-- | internal/parse.go | 690 |
1 files changed, 690 insertions, 0 deletions
diff --git a/internal/parse.go b/internal/parse.go new file mode 100644 index 0000000..12d7600 --- /dev/null +++ b/internal/parse.go @@ -0,0 +1,690 @@ +// Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com> +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package internal + +import ( + "errors" + "fmt" + "io" + iofs "io/fs" + "strings" +) + +var ErrParserExceededMaxDepth = errors.New("exceeded max depth") + +type RuneType uint8 + +const ( + RuneTypeError = RuneType(iota) + + RuneTypeSpace // whitespace + + RuneTypeObjectBeg // '{' + RuneTypeObjectColon // ':' + RuneTypeObjectComma // ',' + RuneTypeObjectEnd // '}' + + RuneTypeArrayBeg // '[' + RuneTypeArrayComma // ',' + RuneTypeArrayEnd // ']' + + RuneTypeStringBeg // opening '"' + RuneTypeStringChar // normal character + RuneTypeStringEsc // backslash + RuneTypeStringEsc1 // single-char after a backslash + RuneTypeStringEscU // \uABCD : u + RuneTypeStringEscUA // \uABCD : A + RuneTypeStringEscUB // \uABCD : B + RuneTypeStringEscUC // \uABCD : C + RuneTypeStringEscUD // \uABCD : D + RuneTypeStringEnd // closing '"' + + RuneTypeNumberIntNeg + RuneTypeNumberIntZero + RuneTypeNumberIntDig + RuneTypeNumberFracDot + RuneTypeNumberFracDig + RuneTypeNumberExpE + RuneTypeNumberExpSign + RuneTypeNumberExpDig + + RuneTypeTrueT + RuneTypeTrueR + RuneTypeTrueU + RuneTypeTrueE + + RuneTypeFalseF + RuneTypeFalseA + RuneTypeFalseL + RuneTypeFalseS + RuneTypeFalseE + + RuneTypeNullN + RuneTypeNullU + RuneTypeNullL1 + RuneTypeNullL2 + + RuneTypeEOF +) + +func (t RuneType) GoString() string { + str, ok := map[RuneType]string{ + RuneTypeError: "RuneTypeError", + + RuneTypeSpace: "RuneTypeSpace", + + RuneTypeObjectBeg: "RuneTypeObjectBeg", + RuneTypeObjectColon: "RuneTypeObjectColon", + RuneTypeObjectComma: "RuneTypeObjectComma", + RuneTypeObjectEnd: "RuneTypeObjectEnd", + + RuneTypeArrayBeg: "RuneTypeArrayBeg", + RuneTypeArrayComma: "RuneTypeArrayComma", + RuneTypeArrayEnd: "RuneTypeArrayEnd", + + RuneTypeStringBeg: "RuneTypeStringBeg", + RuneTypeStringChar: "RuneTypeStringChar", + RuneTypeStringEsc: "RuneTypeStringEsc", + RuneTypeStringEsc1: "RuneTypeStringEsc1", + RuneTypeStringEscU: "RuneTypeStringEscU", + RuneTypeStringEscUA: "RuneTypeStringEscUA", + RuneTypeStringEscUB: "RuneTypeStringEscUB", + RuneTypeStringEscUC: "RuneTypeStringEscUC", + RuneTypeStringEscUD: "RuneTypeStringEscUD", + RuneTypeStringEnd: "RuneTypeStringEnd", + + RuneTypeNumberIntNeg: "RuneTypeNumberIntNeg", + RuneTypeNumberIntZero: "RuneTypeNumberIntZero", + RuneTypeNumberIntDig: "RuneTypeNumberIntDig", + RuneTypeNumberFracDot: "RuneTypeNumberFracDot", + RuneTypeNumberFracDig: "RuneTypeNumberFracDig", + RuneTypeNumberExpE: "RuneTypeNumberExpE", + RuneTypeNumberExpSign: "RuneTypeNumberExpSign", + RuneTypeNumberExpDig: "RuneTypeNumberExpDig", + + RuneTypeTrueT: "RuneTypeTrueT", + RuneTypeTrueR: "RuneTypeTrueR", + RuneTypeTrueU: "RuneTypeTrueU", + RuneTypeTrueE: "RuneTypeTrueE", + + RuneTypeFalseF: "RuneTypeFalseF", + RuneTypeFalseA: "RuneTypeFalseA", + RuneTypeFalseL: "RuneTypeFalseL", + RuneTypeFalseS: "RuneTypeFalseS", + RuneTypeFalseE: "RuneTypeFalseE", + + RuneTypeNullN: "RuneTypeNullN", + RuneTypeNullU: "RuneTypeNullU", + RuneTypeNullL1: "RuneTypeNullL1", + RuneTypeNullL2: "RuneTypeNullL2", + + RuneTypeEOF: "RuneTypeEOF", + }[t] + if ok { + return str + } + return fmt.Sprintf("RuneType(%d)", t) +} + +func (t RuneType) String() string { + str, ok := map[RuneType]string{ + RuneTypeError: "x", + + RuneTypeSpace: " ", + + RuneTypeObjectBeg: "{", + RuneTypeObjectColon: ":", + RuneTypeObjectComma: "o", + RuneTypeObjectEnd: "}", + + RuneTypeArrayBeg: "[", + RuneTypeArrayComma: "a", + RuneTypeArrayEnd: "]", + + RuneTypeStringBeg: "โ", + RuneTypeStringChar: "c", + RuneTypeStringEsc: "\\", + RuneTypeStringEsc1: "b", + RuneTypeStringEscU: "u", + RuneTypeStringEscUA: "A", + RuneTypeStringEscUB: "B", + RuneTypeStringEscUC: "C", + RuneTypeStringEscUD: "D", + RuneTypeStringEnd: "โ", + + RuneTypeNumberIntNeg: "-", + RuneTypeNumberIntZero: "0", + RuneTypeNumberIntDig: "1", + RuneTypeNumberFracDot: ".", + RuneTypeNumberFracDig: "2", + RuneTypeNumberExpE: "e", + RuneTypeNumberExpSign: "+", + RuneTypeNumberExpDig: "3", + + RuneTypeTrueT: "๐ฅ", // double-struck + RuneTypeTrueR: "๐ฃ", + RuneTypeTrueU: "๐ฆ", + RuneTypeTrueE: "๐", + + RuneTypeFalseF: "๐ฃ", // fraktur + RuneTypeFalseA: "๐", + RuneTypeFalseL: "๐ฉ", + RuneTypeFalseS: "๐ฐ", + RuneTypeFalseE: "๐ข", + + RuneTypeNullN: "โ", // circled + RuneTypeNullU: "โค", + RuneTypeNullL1: "โ", + RuneTypeNullL2: "โ", // +uppercase + + RuneTypeEOF: "$", + }[t] + if ok { + return str + } + return fmt.Sprintf("<%d>", t) +} + +func (t RuneType) JSONType() string { + return map[RuneType]string{ + RuneTypeObjectBeg: "object", + RuneTypeArrayBeg: "array", + RuneTypeStringBeg: "string", + RuneTypeNumberIntNeg: "number", + RuneTypeNumberIntZero: "number", + RuneTypeNumberIntDig: "number", + RuneTypeTrueT: "true", + RuneTypeFalseF: "false", + RuneTypeNullN: "null", + RuneTypeEOF: "eof", + }[t] +} + +func (t RuneType) IsNumber() bool { + return RuneTypeNumberIntNeg <= t && t <= RuneTypeNumberExpDig +} + +type Parser struct { + // Setting MaxError to a value greater than 0 causes + // HandleRune to return ErrParserExceededMaxDepth if + // objects/arrays become nested more deeply than this. + MaxDepth int + + initialized bool + + err error + closed bool + + // We reuse RuneTypes to store the stack. The base idea is + // that, stack items are "the most recently read + // stack-relevant RuneType". + // + // We treat RuneTypeError as a wildcard. + // + // The "normal"stack-relevant RuneTypes are: + // + // โ\uABC for strings + // -01.2e+3 for numbers + // ๐ฅ๐ฃ๐ฆ for "true" + // ๐ฃ๐๐ฉ๐ฐ for "false" + // โโคโ for "null" + // + // Objects and arrays break the "most recently read RuneType" + // rule; they need some special assignments: + // + // { object: waiting for key to start or '}' + // โ object: reading key / waiting for colon + // : object: waiting for value to start + // o object: reading value / waiting for ',' or '}' + // + // [ array: waiting for item to start or ']' + // a array: reading item / waiting for ',' or ']' + // ] array: waiting for item to start + // + // Within each element type, the stack item is replaced, not pushed. + // + // For example, given the input string + // + // {"x":"y","a":"b"} + // + // The stack would be + // + // stack processed + // x + // { { + // โโ {" + // โโ {"x + // โ {"x" + // : {"x": + // oโ {"x":" + // oโ {"x":"y + // o {"x":"y" + // { {"x":"y", + // โโ {"x":"y"," + // โโ {"x":"y","a + // โ {"x":"y","a" + // : {"x":"y","a": + // oโ {"x":"y","a":" + // oโ {"x":"y","a":"b + // o {"x":"y","a":"b" + // {"x":"y","a":"b"} + stack []RuneType +} + +func (par *Parser) pushState(state RuneType) RuneType { + par.stack = append(par.stack, state) + return state +} +func (par *Parser) replaceState(state RuneType) RuneType { + par.stack[len(par.stack)-1] = state + return state +} +func (par *Parser) popState() { + par.stack = par.stack[:len(par.stack)-1] +} + +func (par *Parser) stackString() string { + var buf strings.Builder + for _, s := range par.stack { + buf.WriteString(s.String()) + } + return buf.String() +} + +func (par *Parser) StackIsEmpty() bool { + return len(par.stack) == 0 || (len(par.stack) == 1 && par.stack[0] == RuneTypeError) +} + +// Reset all Parser state. +func (par *Parser) Reset() { + *par = Parser{ + MaxDepth: par.MaxDepth, + } +} + +// HandleEOF feeds EOF to the Parser. The returned RuneType is either +// RuneTypeEOF or RuneTypeError. +// +// An error is returned if and only if the RuneType is RuneTypeError. +// Returns io/fs.ErrClosed if .HandleEOF() has previously been called +// (and .Reset() has not been called since). +// +// Once RuneTypeError or RuneTypeEOF has been returned, it will keep +// being returned from both .HandleRune(c) and .HandleEOF() until +// .Reset() is called. +// +// RuneTypeEOF indicates that a complete JSON document has been read. +func (par *Parser) HandleEOF() (RuneType, error) { + if par.closed { + return RuneTypeError, iofs.ErrClosed + } + defer func() { + par.closed = true + }() + if par.err != nil { + return RuneTypeError, par.err + } + if !par.initialized { + par.initialized = true + par.pushState(RuneTypeError) + } + switch len(par.stack) { + case 0: + return RuneTypeEOF, nil + case 1: + switch { + case par.stack[0].IsNumber(): + if _, err := par.HandleRune('\n'); err == nil { + return RuneTypeEOF, nil + } + case par.stack[0] == RuneTypeError: + par.err = io.EOF + return RuneTypeError, par.err + } + fallthrough + default: + par.err = io.ErrUnexpectedEOF + return RuneTypeError, par.err + } +} + +// HandleRune feeds a Unicode rune to the Parser. +// +// An error is returned if and only if the RuneType is RuneTypeError. +// Returns io/fs.ErrClosed if .HandleEOF() has previously been called +// (and .Reset() has not been called since). +// +// Once RuneTypeError or RuneTypeEOF has been returned, it will keep +// being returned from both .HandleRune(c) and .HandleEOF() until +// .Reset() is called. +// +// RuneTypeEOF indicates that the rune cannot be appended to the JSON +// document; a new JSON document must be started in order to process +// that rune. +func (par *Parser) HandleRune(c rune) (RuneType, error) { + if par.closed { + return RuneTypeError, iofs.ErrClosed + } + if par.err != nil { + return RuneTypeError, par.err + } + if !par.initialized { + par.initialized = true + par.pushState(RuneTypeError) + } + if len(par.stack) == 0 { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + default: + return RuneTypeEOF, nil + } + } + switch par.stack[len(par.stack)-1] { + // any ///////////////////////////////////////////////////////////////////////////////////// + case RuneTypeError: + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case '{': + if par.MaxDepth > 0 && len(par.stack) > par.MaxDepth { + return RuneTypeError, ErrParserExceededMaxDepth + } + return par.replaceState(RuneTypeObjectBeg), nil + case '[': + if par.MaxDepth > 0 && len(par.stack) > par.MaxDepth { + return RuneTypeError, ErrParserExceededMaxDepth + } + return par.replaceState(RuneTypeArrayBeg), nil + case '"': + return par.replaceState(RuneTypeStringBeg), nil + case '-': + return par.replaceState(RuneTypeNumberIntNeg), nil + case '0': + return par.replaceState(RuneTypeNumberIntZero), nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberIntDig), nil + case 't': + return par.replaceState(RuneTypeTrueT), nil + case 'f': + return par.replaceState(RuneTypeFalseF), nil + case 'n': + return par.replaceState(RuneTypeNullN), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q looking for beginning of value", c) + } + // object ////////////////////////////////////////////////////////////////////////////////// + case RuneTypeObjectBeg: // waiting for key to start or '}' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case '"': + par.replaceState(RuneTypeStringEnd) + return par.pushState(RuneTypeStringBeg), nil + case '}': + par.popState() + return RuneTypeObjectEnd, nil + default: + return RuneTypeError, fmt.Errorf("object: unexpected character: %q", c) + } + case RuneTypeStringEnd: // waiting for ':' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ':': + par.replaceState(RuneTypeObjectComma) + par.pushState(RuneTypeError) + return RuneTypeObjectColon, nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q after object key", c) + } + case RuneTypeObjectComma: // waiting for ',' or '}' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ',': + par.replaceState(RuneTypeObjectBeg) + return RuneTypeObjectComma, nil + case '}': + par.popState() + return RuneTypeObjectEnd, nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q after object key:value pair", c) + } + // array /////////////////////////////////////////////////////////////////////////////////// + case RuneTypeArrayBeg: // waiting for item to start or ']' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ']': + par.popState() + return RuneTypeArrayEnd, nil + default: + par.replaceState(RuneTypeArrayComma) + par.pushState(RuneTypeError) + return par.HandleRune(c) + } + case RuneTypeArrayEnd: // waiting for item + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + default: + par.replaceState(RuneTypeArrayComma) + par.pushState(RuneTypeError) + return par.HandleRune(c) + } + case RuneTypeArrayComma: // waiting for ',' or ']' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ',': + par.replaceState(RuneTypeArrayEnd) + return RuneTypeArrayComma, nil + case ']': + par.popState() + return RuneTypeArrayEnd, nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q after array element", c) + } + // string ////////////////////////////////////////////////////////////////////////////////// + case RuneTypeStringBeg: // waiting for char or '"' + switch { + case c == '\\': + return par.replaceState(RuneTypeStringEsc), nil + case c == '"': + par.popState() + return RuneTypeStringEnd, nil + case 0x0020 <= c && c <= 0x10FFFF: + return RuneTypeStringChar, nil + default: + return RuneTypeError, fmt.Errorf("string: unexpected character: %q", c) + } + case RuneTypeStringEsc: // waiting for escape char + switch c { + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + par.replaceState(RuneTypeStringBeg) + return RuneTypeStringEsc1, nil + case 'u': + return par.replaceState(RuneTypeStringEscU), nil + default: + return RuneTypeError, fmt.Errorf("string backslash sequence: unexpected character: %q", c) + } + case RuneTypeStringEscU: + if _, ok := HexToInt(c); ok { + return par.replaceState(RuneTypeStringEscUA), nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + case RuneTypeStringEscUA: + if _, ok := HexToInt(c); ok { + return par.replaceState(RuneTypeStringEscUB), nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + case RuneTypeStringEscUB: + if _, ok := HexToInt(c); ok { + return par.replaceState(RuneTypeStringEscUC), nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + case RuneTypeStringEscUC: + if _, ok := HexToInt(c); ok { + par.replaceState(RuneTypeStringBeg) + return RuneTypeStringEscUD, nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + // number ////////////////////////////////////////////////////////////////////////////////// + // + // Here's a flattened drawing of the syntax diagram from www.json.org : + // + // [------------ integer ----------][-- fraction ---][-------- exponent -------] + // >โโฎโโโโโโญโโฎโ"0"โโโโโโโโญโโโโโโโโโโญโโโฎโโโโโโโโโโโโโโญโโโฎโโโโโโโโโโโโโโโโโโโโโโโโญโ> + // โ โ โ โ โ โ โ โ โ + // โฐโ"-"โโฏ โฐโdigit 1-9โโฏโโญdigitโฎโโฏ โฐโ"."โโญdigitโฎโโฏ โฐโ"e"โโญโโฎโโโโโโญโโญdigitโฎโโฏ + // โฐโโ<โโโฏ โฐโโ<โโโฏ โ โ โ โ โฐโโ<โโโฏ + // โฐโ"E"โโฏ โฐโ"-"โโฏ + // โ โ + // โฐโ"+"โโฏ + // + // Now here it is slightly redrawn, and with each distinct state our + // parser can be in marked with a single-capital-letter: + // + // [-------------- integer ------------][--------- fraction --------][--------- exponent ---------] + // >โAโโฎโโโโโโโโญโโโฎโ"0"โโโโโโโโโCโโญโโโโโโโโโโฎโโโโโโโโโโโโโโโโโโโญโโโโโโโโโโฎโโโโโโโโโโโโโโโโโโโโโโโโโโโญโ> + // โ โ โ โ โ โ โ โ + // โฐโ"-"โBโโฏ โฐโdigit 1-9โโญโDโโฏโdigitโฎ โฐโ"."โEโdigitโโโญโFโโฏโdigitโฎ โฐโ"e"โโญโGโโฎโโโโโโญโโญdigitโIโโฏ + // โฐโโโโ<โโโโโโฏ โฐโโโโ<โโโโโโฏ โ โ โ H โฐโโโโ<โโโโฏ + // โฐโ"E"โโฏ โฐโ"-"โโฏ + // โ โ + // โฐโ"+"โโฏ + // + // You may notice that each of these states may be uniquely identified + // by the last-read RuneType: + // + // A = (nothing yet) + // B = IntNeg + // C = IntZero + // D = IntDig + // E = FracDot + // F = FracDig + // G = ExpE + // H = ExpSign + // I = ExpDig + // + // The 'A' state is part of the RuneTypeError "any" case + // above, and the remainder follow: + case RuneTypeNumberIntNeg: // B + switch c { + case '0': + return par.replaceState(RuneTypeNumberIntZero), nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberIntDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberIntZero: // C + switch c { + case '.': + return par.replaceState(RuneTypeNumberFracDot), nil + case 'e', 'E': + return par.replaceState(RuneTypeNumberExpE), nil + default: + par.popState() + return par.HandleRune(c) + } + case RuneTypeNumberIntDig: // D + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberIntDig), nil + case '.': + return par.replaceState(RuneTypeNumberFracDot), nil + case 'e', 'E': + return par.replaceState(RuneTypeNumberExpE), nil + default: + par.popState() + return par.HandleRune(c) + } + case RuneTypeNumberFracDot: // E + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberFracDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberFracDig: // F + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberFracDig), nil + case 'e', 'E': + return par.replaceState(RuneTypeNumberExpE), nil + default: + par.popState() + return par.HandleRune(c) + } + case RuneTypeNumberExpE: // G + switch c { + case '-', '+': + return par.replaceState(RuneTypeNumberExpSign), nil + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberExpDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberExpSign: // H + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberExpDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberExpDig: // I + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberExpDig), nil + default: + par.popState() + return par.HandleRune(c) + } + // literals //////////////////////////////////////////////////////////////////////////////// + // true + case RuneTypeTrueT: + return par.expectRune(c, 'r', RuneTypeTrueR, "true", false) + case RuneTypeTrueR: + return par.expectRune(c, 'u', RuneTypeTrueU, "true", false) + case RuneTypeTrueU: + return par.expectRune(c, 'e', RuneTypeTrueE, "true", true) + // false + case RuneTypeFalseF: + return par.expectRune(c, 'a', RuneTypeFalseA, "false", false) + case RuneTypeFalseA: + return par.expectRune(c, 'l', RuneTypeFalseL, "false", false) + case RuneTypeFalseL: + return par.expectRune(c, 's', RuneTypeFalseS, "false", false) + case RuneTypeFalseS: + return par.expectRune(c, 'e', RuneTypeFalseE, "false", true) + // null + case RuneTypeNullN: + return par.expectRune(c, 'u', RuneTypeNullU, "null", false) + case RuneTypeNullU: + return par.expectRune(c, 'l', RuneTypeNullL1, "null", false) + case RuneTypeNullL1: + return par.expectRune(c, 'l', RuneTypeNullL2, "null", true) + default: + panic(fmt.Errorf(`invalid stack: "%s"`, par.stackString())) + } +} + +func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { + if c != exp { + return RuneTypeError, fmt.Errorf("invalid character %q in literal %s (expecting %q)", c, context, exp) + } + if pop { + par.popState() + return typ, nil + } else { + return par.replaceState(typ), nil + } +} |