From 2b9473f5e8816eeea76b2fdada184532be00d3a2 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 7 Feb 2023 12:18:29 -0700 Subject: internal: Split in to sub-packages --- internal/jsonparse/hex.go | 20 + internal/jsonparse/parse.go | 845 +++++++++++++++++++++++++++++++++++++++ internal/jsonparse/parse_test.go | 78 ++++ 3 files changed, 943 insertions(+) create mode 100644 internal/jsonparse/hex.go create mode 100644 internal/jsonparse/parse.go create mode 100644 internal/jsonparse/parse_test.go (limited to 'internal/jsonparse') diff --git a/internal/jsonparse/hex.go b/internal/jsonparse/hex.go new file mode 100644 index 0000000..3ed5f01 --- /dev/null +++ b/internal/jsonparse/hex.go @@ -0,0 +1,20 @@ +// Copyright (C) 2022-2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package jsonparse + +const Hex = "0123456789abcdef" + +func HexToInt(c rune) (byte, bool) { + switch { + case '0' <= c && c <= '9': + return byte(c) - '0', true + case 'a' <= c && c <= 'f': + return byte(c) - 'a' + 10, true + case 'A' <= c && c <= 'F': + return byte(c) - 'A' + 10, true + default: + return 0, false + } +} diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go new file mode 100644 index 0000000..7d97be0 --- /dev/null +++ b/internal/jsonparse/parse.go @@ -0,0 +1,845 @@ +// Copyright (C) 2022-2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package jsonparse + +import ( + "errors" + "fmt" + "io" + iofs "io/fs" + "strings" +) + +var ErrParserExceededMaxDepth = errors.New("exceeded max depth") + +// RuneType is the classification of a rune when parsing JSON input. +// A Parser, rather than grouping runes into tokens and classifying +// tokens, classifies runes directly. 
type RuneType uint8

// The RuneType values are consecutive, starting at 0.  The
// RuneTypeNumberXXX values must stay contiguous (IsNumber tests for
// them as a range), and every value needs an entry in
// runeTypeGoNames.
const (
	RuneTypeError RuneType = iota

	RuneTypeSpace // whitespace

	RuneTypeObjectBeg   // '{'
	RuneTypeObjectColon // ':'
	RuneTypeObjectComma // ','
	RuneTypeObjectEnd   // '}'

	RuneTypeArrayBeg   // '['
	RuneTypeArrayComma // ','
	RuneTypeArrayEnd   // ']'

	RuneTypeStringBeg   // opening '"'
	RuneTypeStringChar  // normal character
	RuneTypeStringEsc   // backslash
	RuneTypeStringEsc1  // single-char after a backslash
	RuneTypeStringEscU  // \uABCD : u
	RuneTypeStringEscUA // \uABCD : A
	RuneTypeStringEscUB // \uABCD : B
	RuneTypeStringEscUC // \uABCD : C
	RuneTypeStringEscUD // \uABCD : D
	RuneTypeStringEnd   // closing '"'

	RuneTypeNumberIntNeg
	RuneTypeNumberIntZero // leading zero only; non-leading zeros are IntDig, not IntZero
	RuneTypeNumberIntDig
	RuneTypeNumberFracDot
	RuneTypeNumberFracDig
	RuneTypeNumberExpE
	RuneTypeNumberExpSign
	RuneTypeNumberExpDig

	RuneTypeTrueT
	RuneTypeTrueR
	RuneTypeTrueU
	RuneTypeTrueE

	RuneTypeFalseF
	RuneTypeFalseA
	RuneTypeFalseL
	RuneTypeFalseS
	RuneTypeFalseE

	RuneTypeNullN
	RuneTypeNullU
	RuneTypeNullL1
	RuneTypeNullL2

	RuneTypeEOF

	// Not a real rune type, but used as a stack state.
	runeTypeAny
)

// runeTypeGoNames holds, at index t, the Go identifier of RuneType t.
// It is built once at package level so that GoString does not have to
// allocate and populate a lookup table on every call.
//
//nolint:dupl // False positive due to similarly shaped AST.
var runeTypeGoNames = [...]string{
	RuneTypeError: "RuneTypeError",

	RuneTypeSpace: "RuneTypeSpace",

	RuneTypeObjectBeg:   "RuneTypeObjectBeg",
	RuneTypeObjectColon: "RuneTypeObjectColon",
	RuneTypeObjectComma: "RuneTypeObjectComma",
	RuneTypeObjectEnd:   "RuneTypeObjectEnd",

	RuneTypeArrayBeg:   "RuneTypeArrayBeg",
	RuneTypeArrayComma: "RuneTypeArrayComma",
	RuneTypeArrayEnd:   "RuneTypeArrayEnd",

	RuneTypeStringBeg:   "RuneTypeStringBeg",
	RuneTypeStringChar:  "RuneTypeStringChar",
	RuneTypeStringEsc:   "RuneTypeStringEsc",
	RuneTypeStringEsc1:  "RuneTypeStringEsc1",
	RuneTypeStringEscU:  "RuneTypeStringEscU",
	RuneTypeStringEscUA: "RuneTypeStringEscUA",
	RuneTypeStringEscUB: "RuneTypeStringEscUB",
	RuneTypeStringEscUC: "RuneTypeStringEscUC",
	RuneTypeStringEscUD: "RuneTypeStringEscUD",
	RuneTypeStringEnd:   "RuneTypeStringEnd",

	RuneTypeNumberIntNeg:  "RuneTypeNumberIntNeg",
	RuneTypeNumberIntZero: "RuneTypeNumberIntZero",
	RuneTypeNumberIntDig:  "RuneTypeNumberIntDig",
	RuneTypeNumberFracDot: "RuneTypeNumberFracDot",
	RuneTypeNumberFracDig: "RuneTypeNumberFracDig",
	RuneTypeNumberExpE:    "RuneTypeNumberExpE",
	RuneTypeNumberExpSign: "RuneTypeNumberExpSign",
	RuneTypeNumberExpDig:  "RuneTypeNumberExpDig",

	RuneTypeTrueT: "RuneTypeTrueT",
	RuneTypeTrueR: "RuneTypeTrueR",
	RuneTypeTrueU: "RuneTypeTrueU",
	RuneTypeTrueE: "RuneTypeTrueE",

	RuneTypeFalseF: "RuneTypeFalseF",
	RuneTypeFalseA: "RuneTypeFalseA",
	RuneTypeFalseL: "RuneTypeFalseL",
	RuneTypeFalseS: "RuneTypeFalseS",
	RuneTypeFalseE: "RuneTypeFalseE",

	RuneTypeNullN:  "RuneTypeNullN",
	RuneTypeNullU:  "RuneTypeNullU",
	RuneTypeNullL1: "RuneTypeNullL1",
	RuneTypeNullL2: "RuneTypeNullL2",

	RuneTypeEOF: "RuneTypeEOF",

	runeTypeAny: "runeTypeAny",
}

// GoString implements fmt.GoStringer.
//
// It returns the Go identifier of the constant (for example
// "RuneTypeObjectBeg"), or "RuneType(N)" for a value N that does not
// correspond to any declared constant.
func (t RuneType) GoString() string {
	if int(t) < len(runeTypeGoNames) {
		// The empty-string check guards against a constant being
		// added without a matching table entry.
		if name := runeTypeGoNames[t]; name != "" {
			return name
		}
	}
	return fmt.Sprintf("RuneType(%d)", t)
}

// String implements fmt.Stringer.
//
//nolint:dupl // False positive due to similarly shaped AST.
+func (t RuneType) String() string { + str, ok := map[RuneType]string{ + RuneTypeError: "x", + + RuneTypeSpace: " ", + + RuneTypeObjectBeg: "{", + RuneTypeObjectColon: ":", + RuneTypeObjectComma: "o", + RuneTypeObjectEnd: "}", + + RuneTypeArrayBeg: "[", + RuneTypeArrayComma: "a", + RuneTypeArrayEnd: "]", + + RuneTypeStringBeg: "\"", + RuneTypeStringChar: "c", + RuneTypeStringEsc: "\\", + RuneTypeStringEsc1: "b", + RuneTypeStringEscU: "u", + RuneTypeStringEscUA: "A", + RuneTypeStringEscUB: "B", + RuneTypeStringEscUC: "C", + RuneTypeStringEscUD: "D", + RuneTypeStringEnd: "ยป", + + RuneTypeNumberIntNeg: "-", + RuneTypeNumberIntZero: "0", + RuneTypeNumberIntDig: "1", + RuneTypeNumberFracDot: ".", + RuneTypeNumberFracDig: "2", + RuneTypeNumberExpE: "e", + RuneTypeNumberExpSign: "+", + RuneTypeNumberExpDig: "3", + + RuneTypeTrueT: "๐•ฅ", // double-struck + RuneTypeTrueR: "๐•ฃ", + RuneTypeTrueU: "๐•ฆ", + RuneTypeTrueE: "๐•–", + + RuneTypeFalseF: "๐”ฃ", // fraktur + RuneTypeFalseA: "๐”ž", + RuneTypeFalseL: "๐”ฉ", + RuneTypeFalseS: "๐”ฐ", + RuneTypeFalseE: "๐”ข", + + RuneTypeNullN: "โ“", // circled + RuneTypeNullU: "โ“ค", + RuneTypeNullL1: "โ“›", + RuneTypeNullL2: "โ“", // +uppercase + + RuneTypeEOF: "$", + + runeTypeAny: "?", + }[t] + if ok { + return str + } + return fmt.Sprintf("<%d>", t) +} + +func (t RuneType) JSONType() string { + return map[RuneType]string{ + RuneTypeObjectBeg: "object", + RuneTypeArrayBeg: "array", + RuneTypeStringBeg: "string", + RuneTypeNumberIntNeg: "number", + RuneTypeNumberIntZero: "number", + RuneTypeNumberIntDig: "number", + RuneTypeTrueT: "true", + RuneTypeFalseF: "false", + RuneTypeNullN: "null", + RuneTypeEOF: "eof", + }[t] +} + +// IsNumber returns whether the RuneType is one of the +// RuneTypeNumberXXX values. +func (t RuneType) IsNumber() bool { + return RuneTypeNumberIntNeg <= t && t <= RuneTypeNumberExpDig +} + +// Parser is the low-level JSON parser that powers both *Decoder and +// *ReEncoder. 
+type Parser struct { + // Setting MaxError to a value greater than 0 causes + // HandleRune to return ErrParserExceededMaxDepth if + // objects/arrays become nested more deeply than this. + MaxDepth int + + initialized bool + + err error + closed bool + + // We reuse RuneTypes to store the stack. The base idea is: + // stack items are "the most recently read stack-relevant + // RuneType". + // + // The stack starts out with the special pseudo-RuneType + // `runeTypeAny` that means we're willing to accept any + // element type; an empty stack means that we have reached the + // end of the top-level element and should accept no more + // input except for whitespace. + // + // The "normal" stack-relevant RuneTypes are: + // + // "\uABC for strings + // -01.2e+3 for numbers + // ๐•ฅ๐•ฃ๐•ฆ for "true" + // ๐”ฃ๐”ž๐”ฉ๐”ฐ for "false" + // โ“โ“คโ“› for "null" + // + // Objects and arrays break the "most recently read RuneType" + // rule; they need some special assignments: + // + // { object: waiting for key to start or '}' + // ยป object: reading key / waiting for colon + // o object: reading value / waiting for ',' or '}' + // + // [ array: waiting for item to start or ']' + // a array: reading item / waiting for ',' or ']' + // + // Within each element type, the stack item is replaced, not pushed. + // + // (Keep each of these examples in-sync with parse_test.go.) + // + // For example, given the input string + // + // {"x":"y","a":"b"} + // + // The stack would be + // + // stack processed + // ? + // { { + // ยป" {" + // ยป" {"x + // ยป {"x" + // o? {"x": + // o" {"x":" + // o" {"x":"y + // o {"x":"y" + // { {"x":"y", + // ยป" {"x":"y"," + // ยป" {"x":"y","a + // ยป {"x":"y","a" + // o? {"x":"y","a": + // o" {"x":"y","a":" + // o" {"x":"y","a":"b + // o {"x":"y","a":"b" + // {"x":"y","a":"b"} + // + // Or, given the input string + // + // ["x","y"] + // + // The stack would be + // + // stack processed + // ? + // [ [ + // a" [" + // a" ["x + // a ["x" + // a? 
["x", + // a" ["x"," + // a" ["x","y + // a ["x","y" + // ["x","y"] + stack []RuneType + + barriers []barrier +} + +type barrier struct { + closed bool + stack []RuneType +} + +func (par *Parser) init() { + if !par.initialized { + par.initialized = true + par.pushState(runeTypeAny) + } +} + +func (par *Parser) pushState(state RuneType) RuneType { + par.stack = append(par.stack, state) + return state +} + +func (par *Parser) replaceState(state RuneType) RuneType { + par.stack[len(par.stack)-1] = state + return state +} + +func (par *Parser) popState() { + par.stack = par.stack[:len(par.stack)-1] +} + +func (par *Parser) stackString() string { + par.init() + var buf strings.Builder + for _, s := range par.stack { + buf.WriteString(s.String()) + } + return buf.String() +} + +func (par *Parser) depth() int { + n := len(par.stack) + for _, barrier := range par.barriers { + n += len(barrier.stack) + } + return n +} + +func (par *Parser) StackIsEmpty() bool { + if len(par.barriers) > 0 { + return false + } + if len(par.stack) == 0 { + return true + } + return len(par.stack) == 1 && par.stack[0] == runeTypeAny +} + +func (par *Parser) StackSize() int { + return len(par.stack) +} + +// Reset all Parser state. +func (par *Parser) Reset() { + *par = Parser{ + MaxDepth: par.MaxDepth, + } +} + +// PushReadBarrier causes the parser to expect EOF once the end of the +// element that is started by the current top-of-stack is reached, +// until this is un-done with PopBarrier. It essentially turns the +// parser in to a sub-parser. +// +// PushReadBarrier may only be called at the beginning of an element, +// whether that be +// +// - runeTypeAny +// - RuneTypeObjectBeg +// - RuneTypeArrayBeg +// - RuneTypeStringBeg +// - RuneTypeNumberIntNeg, RuneTypeNumberIntZero, RuneTypeNumberIntDig +// - RuneTypeTrueT +// - RuneTypeFalseF +// - RuneTypeNullN +func (par *Parser) PushReadBarrier() { + // Sanity checking. 
+ par.init() + if len(par.stack) == 0 { + panic(errors.New("illegal PushReadBarrier call: empty stack")) + } + curState := par.stack[len(par.stack)-1] + switch curState { + case runeTypeAny, + RuneTypeObjectBeg, + RuneTypeArrayBeg, + RuneTypeStringBeg, + RuneTypeNumberIntNeg, RuneTypeNumberIntZero, RuneTypeNumberIntDig, + RuneTypeTrueT, + RuneTypeFalseF, + RuneTypeNullN: + // OK + default: + panic(fmt.Errorf("illegal PushReadBarrier call: %q", curState)) + } + // Actually push. + par.barriers = append(par.barriers, barrier{ + closed: par.closed, + stack: par.stack[:len(par.stack)-1], + }) + par.stack = []RuneType{curState} +} + +// PushWriteBarrier causes the parser to expect EOF once the end of +// the about-to-start element is reached, until this is un-done with +// PopBarrier. It essentially turns the parser in to a sub-parser. +// +// PushWriteBarrier may only be called at the places where an element +// of any type may start: +// +// - runeTypeAny for top-level and object-value elements +// - RuneTypeArrayBeg for array-item elements +// +// PushWriteBarrier signals intent to write an element; if it is +// called in a place where an element is optional (at the beginning of +// an array), it becomes a syntax error to not write the element. +func (par *Parser) PushWriteBarrier() { + par.init() + if len(par.stack) == 0 { + panic(errors.New("illegal PushWriteBarrier call: empty stack")) + } + switch par.stack[len(par.stack)-1] { + case runeTypeAny: + par.popState() + par.barriers = append(par.barriers, barrier{ + closed: par.closed, + stack: par.stack, + }) + par.stack = []RuneType{runeTypeAny} + case RuneTypeArrayBeg: + par.replaceState(RuneTypeArrayComma) + par.barriers = append(par.barriers, barrier{ + closed: par.closed, + stack: par.stack, + }) + par.stack = []RuneType{runeTypeAny} + default: + panic(fmt.Errorf("illegal PushWriteBarrier call: %q", par.stack[len(par.stack)-1])) + } +} + +// PopBarrier reverses a call to PushReadBarrier or PushWriteBarrier. 
+func (par *Parser) PopBarrier() { + if len(par.barriers) == 0 { + panic(errors.New("illegal PopBarrier call: empty barrier stack")) + } + barrier := par.barriers[len(par.barriers)-1] + par.barriers = par.barriers[:len(par.barriers)-1] + par.closed = barrier.closed + par.stack = append(barrier.stack, par.stack...) +} + +// HandleEOF feeds EOF to the Parser. The returned RuneType is either +// RuneTypeEOF or RuneTypeError. +// +// An error is returned if and only if the RuneType is RuneTypeError. +// Returns io/fs.ErrClosed if .HandleEOF() has previously been called +// (and .Reset() has not been called since). +// +// Once RuneTypeError or RuneTypeEOF has been returned, it will keep +// being returned from both .HandleRune(c) and .HandleEOF() until +// .Reset() is called. +// +// RuneTypeEOF indicates that a complete JSON document has been read. +func (par *Parser) HandleEOF() (RuneType, error) { + if par.closed { + return RuneTypeError, iofs.ErrClosed + } + defer func() { + par.closed = true + }() + if par.err != nil { + return RuneTypeError, par.err + } + par.init() + switch len(par.stack) { + case 0: + return RuneTypeEOF, nil + case 1: + switch { + case par.stack[0].IsNumber(): + if _, err := par.HandleRune('\n'); err == nil { + return RuneTypeEOF, nil + } + case par.stack[0] == runeTypeAny: + par.err = io.EOF + return RuneTypeError, par.err + } + fallthrough + default: + par.err = io.ErrUnexpectedEOF + return RuneTypeError, par.err + } +} + +// HandleRune feeds a Unicode rune to the Parser. +// +// An error is returned if and only if the RuneType is RuneTypeError. +// Returns io/fs.ErrClosed if .HandleEOF() has previously been called +// (and .Reset() has not been called since). +// +// Once RuneTypeError or RuneTypeEOF has been returned, it will keep +// being returned from both .HandleRune(c) and .HandleEOF() until +// .Reset() is called. 
+// +// RuneTypeEOF indicates that the rune cannot be appended to the JSON +// document; a new JSON document must be started in order to process +// that rune. +func (par *Parser) HandleRune(c rune) (RuneType, error) { + if par.closed { + return RuneTypeError, iofs.ErrClosed + } + if par.err != nil { + return RuneTypeError, par.err + } + par.init() + if len(par.stack) == 0 { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + default: + return RuneTypeEOF, nil + } + } + switch par.stack[len(par.stack)-1] { + // any ///////////////////////////////////////////////////////////////////////////////////// + case runeTypeAny: + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case '{': + if par.MaxDepth > 0 && par.depth() > par.MaxDepth { + return RuneTypeError, ErrParserExceededMaxDepth + } + return par.replaceState(RuneTypeObjectBeg), nil + case '[': + if par.MaxDepth > 0 && par.depth() > par.MaxDepth { + return RuneTypeError, ErrParserExceededMaxDepth + } + return par.replaceState(RuneTypeArrayBeg), nil + case '"': + return par.replaceState(RuneTypeStringBeg), nil + case '-': + return par.replaceState(RuneTypeNumberIntNeg), nil + case '0': + return par.replaceState(RuneTypeNumberIntZero), nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberIntDig), nil + case 't': + return par.replaceState(RuneTypeTrueT), nil + case 'f': + return par.replaceState(RuneTypeFalseF), nil + case 'n': + return par.replaceState(RuneTypeNullN), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q looking for beginning of value", c) + } + // object ////////////////////////////////////////////////////////////////////////////////// + case RuneTypeObjectBeg: // waiting for key to start or '}' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case '"': + par.replaceState(RuneTypeStringEnd) + return par.pushState(RuneTypeStringBeg), nil + case 
'}': + par.popState() + return RuneTypeObjectEnd, nil + default: + return RuneTypeError, fmt.Errorf("object: unexpected character: %q", c) + } + case RuneTypeStringEnd: // waiting for ':' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ':': + par.replaceState(RuneTypeObjectComma) + par.pushState(runeTypeAny) + return RuneTypeObjectColon, nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q after object key", c) + } + case RuneTypeObjectComma: // waiting for ',' or '}' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ',': + par.replaceState(RuneTypeObjectBeg) + return RuneTypeObjectComma, nil + case '}': + par.popState() + return RuneTypeObjectEnd, nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q after object key:value pair", c) + } + // array /////////////////////////////////////////////////////////////////////////////////// + case RuneTypeArrayBeg: // waiting for item to start or ']' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ']': + par.popState() + return RuneTypeArrayEnd, nil + default: + par.replaceState(RuneTypeArrayComma) + par.pushState(runeTypeAny) + return par.HandleRune(c) + } + case RuneTypeArrayComma: // waiting for ',' or ']' + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return RuneTypeSpace, nil + case ',': + par.pushState(runeTypeAny) + return RuneTypeArrayComma, nil + case ']': + par.popState() + return RuneTypeArrayEnd, nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q after array element", c) + } + // string ////////////////////////////////////////////////////////////////////////////////// + case RuneTypeStringBeg: // waiting for char or '"' + switch { + case c == '\\': + return par.replaceState(RuneTypeStringEsc), nil + case c == '"': + par.popState() + return RuneTypeStringEnd, nil + case 0x0020 <= c && c <= 0x10FFFF: + return RuneTypeStringChar, nil + 
default: + return RuneTypeError, fmt.Errorf("string: unexpected character: %q", c) + } + case RuneTypeStringEsc: // waiting for escape char + switch c { + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + par.replaceState(RuneTypeStringBeg) + return RuneTypeStringEsc1, nil + case 'u': + return par.replaceState(RuneTypeStringEscU), nil + default: + return RuneTypeError, fmt.Errorf("string backslash sequence: unexpected character: %q", c) + } + case RuneTypeStringEscU: + if _, ok := HexToInt(c); ok { + return par.replaceState(RuneTypeStringEscUA), nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + case RuneTypeStringEscUA: + if _, ok := HexToInt(c); ok { + return par.replaceState(RuneTypeStringEscUB), nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + case RuneTypeStringEscUB: + if _, ok := HexToInt(c); ok { + return par.replaceState(RuneTypeStringEscUC), nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + case RuneTypeStringEscUC: + if _, ok := HexToInt(c); ok { + par.replaceState(RuneTypeStringBeg) + return RuneTypeStringEscUD, nil + } else { + return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + } + // number ////////////////////////////////////////////////////////////////////////////////// + // + // Here's a flattened drawing of the syntax diagram from www.json.org : + // + // [------------ integer ----------][-- fraction ---][-------- exponent -------] + // >โ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ•ฎโ”€"0"โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€> + // โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + // โ•ฐโ”€"-"โ”€โ•ฏ โ•ฐโ”€digit 1-9โ”€โ•ฏโ”€โ•ญdigitโ•ฎโ”€โ•ฏ โ•ฐโ”€"."โ”€โ•ญdigitโ•ฎโ”€โ•ฏ 
โ•ฐโ”€"e"โ”€โ•ญโ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ•ญdigitโ•ฎโ”€โ•ฏ + // โ•ฐโ”€โ”€<โ”€โ”€โ•ฏ โ•ฐโ”€โ”€<โ”€โ”€โ•ฏ โ”‚ โ”‚ โ”‚ โ”‚ โ•ฐโ”€โ”€<โ”€โ”€โ•ฏ + // โ•ฐโ”€"E"โ”€โ•ฏ โ•ฐโ”€"-"โ”€โ•ฏ + // โ”‚ โ”‚ + // โ•ฐโ”€"+"โ”€โ•ฏ + // + // Now here it is slightly redrawn, and with each distinct state our + // parser can be in marked with a single-capital-letter: + // + // [-------------- integer ------------][--------- fraction --------][--------- exponent ---------] + // >โ”€Aโ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ”€โ•ฎโ”€"0"โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€Cโ”€โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ญโ”€> + // โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + // โ•ฐโ”€"-"โ”€Bโ”€โ•ฏ โ•ฐโ”€digit 1-9โ”€โ•ญโ”€Dโ”€โ•ฏโ”€digitโ•ฎ โ•ฐโ”€"."โ”€Eโ”€digitโ”€โ”€โ•ญโ”€Fโ”€โ•ฏโ”€digitโ•ฎ โ•ฐโ”€"e"โ”€โ•ญโ”€Gโ”€โ•ฎโ”€โ”€โ”€โ”€โ”€โ•ญโ”€โ•ญdigitโ”€Iโ”€โ•ฏ + // โ•ฐโ”€โ”€โ”€โ”€<โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€<โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ โ”‚ โ”‚ H โ•ฐโ”€โ”€โ”€โ”€<โ”€โ”€โ”€โ•ฏ + // โ•ฐโ”€"E"โ”€โ•ฏ โ•ฐโ”€"-"โ”€โ•ฏ + // โ”‚ โ”‚ + // โ•ฐโ”€"+"โ”€โ•ฏ + // + // You may notice that each of these states may be uniquely identified + // by the last-read RuneType: + // + // A = (nothing yet) + // B = IntNeg + // C = IntZero + // D = IntDig + // E = FracDot + // F = FracDig + // G = ExpE + // H = ExpSign + // I = ExpDig + // + // The 'A' state is part of the runeTypeAny case above, and + // the remainder follow: + case RuneTypeNumberIntNeg: // B + switch c { + case '0': + return par.replaceState(RuneTypeNumberIntZero), nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberIntDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberIntZero: // C + switch c { + case '.': + return par.replaceState(RuneTypeNumberFracDot), nil + case 'e', 'E': + return 
par.replaceState(RuneTypeNumberExpE), nil + default: + par.popState() + return par.HandleRune(c) + } + case RuneTypeNumberIntDig: // D + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberIntDig), nil + case '.': + return par.replaceState(RuneTypeNumberFracDot), nil + case 'e', 'E': + return par.replaceState(RuneTypeNumberExpE), nil + default: + par.popState() + return par.HandleRune(c) + } + case RuneTypeNumberFracDot: // E + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberFracDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberFracDig: // F + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberFracDig), nil + case 'e', 'E': + return par.replaceState(RuneTypeNumberExpE), nil + default: + par.popState() + return par.HandleRune(c) + } + case RuneTypeNumberExpE: // G + switch c { + case '-', '+': + return par.replaceState(RuneTypeNumberExpSign), nil + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberExpDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberExpSign: // H + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberExpDig), nil + default: + return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + } + case RuneTypeNumberExpDig: // I + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return par.replaceState(RuneTypeNumberExpDig), nil + default: + par.popState() + return par.HandleRune(c) + } + // literals //////////////////////////////////////////////////////////////////////////////// + // true + case RuneTypeTrueT: + return par.expectRune(c, 'r', RuneTypeTrueR, "true", false) + case RuneTypeTrueR: + 
return par.expectRune(c, 'u', RuneTypeTrueU, "true", false) + case RuneTypeTrueU: + return par.expectRune(c, 'e', RuneTypeTrueE, "true", true) + // false + case RuneTypeFalseF: + return par.expectRune(c, 'a', RuneTypeFalseA, "false", false) + case RuneTypeFalseA: + return par.expectRune(c, 'l', RuneTypeFalseL, "false", false) + case RuneTypeFalseL: + return par.expectRune(c, 's', RuneTypeFalseS, "false", false) + case RuneTypeFalseS: + return par.expectRune(c, 'e', RuneTypeFalseE, "false", true) + // null + case RuneTypeNullN: + return par.expectRune(c, 'u', RuneTypeNullU, "null", false) + case RuneTypeNullU: + return par.expectRune(c, 'l', RuneTypeNullL1, "null", false) + case RuneTypeNullL1: + return par.expectRune(c, 'l', RuneTypeNullL2, "null", true) + default: + panic(fmt.Errorf(`invalid stack: "%s"`, par.stackString())) + } +} + +func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { + if c != exp { + return RuneTypeError, fmt.Errorf("invalid character %q in literal %s (expecting %q)", c, context, exp) + } + if pop { + par.popState() + return typ, nil + } else { + return par.replaceState(typ), nil + } +} diff --git a/internal/jsonparse/parse_test.go b/internal/jsonparse/parse_test.go new file mode 100644 index 0000000..e531daf --- /dev/null +++ b/internal/jsonparse/parse_test.go @@ -0,0 +1,78 @@ +// Copyright (C) 2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package jsonparse + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParserHandleRune(t *testing.T) { + t.Parallel() + type testcase struct { + Input string + ExpStack []string + } + testcases := map[string]testcase{ + // Keep these test-cases in-sync with the examples in parse.go. 
+ "object": { + Input: `{"x":"y","a":"b"}`, + ExpStack: []string{ + // st,// processed + `?`, + `{`, // { + `ยป"`, // {" + `ยป"`, // {"x + `ยป`, // {"x" + `o?`, // {"x": + `o"`, // {"x":" + `o"`, // {"x":"y + `o`, // {"x":"y" + `{`, // {"x":"y", + `ยป"`, // {"x":"y"," + `ยป"`, // {"x":"y","a + `ยป`, // {"x":"y","a" + `o?`, // {"x":"y","a": + `o"`, // {"x":"y","a":" + `o"`, // {"x":"y","a":"b + `o`, // {"x":"y","a":"b" + ``, // {"x":"y","a":"b"} + }, + }, + "array": { + Input: `["x","y"]`, + ExpStack: []string{ + // st,// processed + `?`, + `[`, // [ + `a"`, // [" + `a"`, // ["x + `a`, // ["x" + `a?`, // ["x", + `a"`, // ["x"," + `a"`, // ["x","y + `a`, // ["x","y" + ``, // ["x","y"] + }, + }, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + var par Parser + if !assert.Equal(t, len(tc.Input)+1, len(tc.ExpStack)) { + return + } + for i, r := range tc.Input { + assert.Equal(t, tc.ExpStack[i], par.stackString()) + _, err := par.HandleRune(r) + assert.NoError(t, err) + assert.Equal(t, tc.ExpStack[i+1], par.stackString()) + } + }) + } +} -- cgit v1.2.3-2-g168b