summary | refs | log | tree | commit | diff
path: root/internal/jsonparse
diff options
context:
space:
mode:
Diffstat (limited to 'internal/jsonparse')
-rw-r--r-- internal/jsonparse/hex.go 20
-rw-r--r-- internal/jsonparse/parse.go 845
-rw-r--r-- internal/jsonparse/parse_test.go 78
3 files changed, 943 insertions, 0 deletions
diff --git a/internal/jsonparse/hex.go b/internal/jsonparse/hex.go
new file mode 100644
index 0000000..3ed5f01
--- /dev/null
+++ b/internal/jsonparse/hex.go
@@ -0,0 +1,20 @@
+// Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package jsonparse
+
+// Hex lists the lowercase hexadecimal digits in value order, so that
+// Hex[n] is the digit for the value n (0 <= n < 16).
+const Hex = "0123456789abcdef"
+
+// HexToInt returns the numeric value (0-15) of the hexadecimal digit
+// c; both lowercase and uppercase letters are accepted.  The boolean
+// result reports whether c is a hexadecimal digit at all; when it is
+// not, the returned value is 0.
+func HexToInt(c rune) (byte, bool) {
+	if c >= '0' && c <= '9' {
+		return byte(c - '0'), true
+	}
+	if c >= 'a' && c <= 'f' {
+		return byte(c-'a') + 10, true
+	}
+	if c >= 'A' && c <= 'F' {
+		return byte(c-'A') + 10, true
+	}
+	return 0, false
+}
diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go
new file mode 100644
index 0000000..7d97be0
--- /dev/null
+++ b/internal/jsonparse/parse.go
@@ -0,0 +1,845 @@
+// Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package jsonparse
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ iofs "io/fs"
+ "strings"
+)
+
+// ErrParserExceededMaxDepth is the error that Parser.HandleRune
+// returns when an object or array would be nested more deeply than
+// Parser.MaxDepth allows.
+var ErrParserExceededMaxDepth = errors.New("exceeded max depth")
+
+// RuneType is the classification of a rune when parsing JSON input.
+// A Parser, rather than grouping runes into tokens and classifying
+// tokens, classifies runes directly.
+//
+// The numeric values come from iota, so the declaration order below
+// is significant: IsNumber relies on the RuneTypeNumberXXX values
+// being contiguous, from RuneTypeNumberIntNeg to RuneTypeNumberExpDig.
+type RuneType uint8
+
+const (
+ RuneTypeError RuneType = iota // returned together with a non-nil error
+
+ RuneTypeSpace // whitespace
+
+ RuneTypeObjectBeg // '{'
+ RuneTypeObjectColon // ':'
+ RuneTypeObjectComma // ','
+ RuneTypeObjectEnd // '}'
+
+ RuneTypeArrayBeg // '['
+ RuneTypeArrayComma // ','
+ RuneTypeArrayEnd // ']'
+
+ RuneTypeStringBeg // opening '"'
+ RuneTypeStringChar // normal character
+ RuneTypeStringEsc // backslash
+ RuneTypeStringEsc1 // single-char after a backslash
+ RuneTypeStringEscU // \uABCD : u
+ RuneTypeStringEscUA // \uABCD : A
+ RuneTypeStringEscUB // \uABCD : B
+ RuneTypeStringEscUC // \uABCD : C
+ RuneTypeStringEscUD // \uABCD : D
+ RuneTypeStringEnd // closing '"'
+
+ RuneTypeNumberIntNeg // leading '-'
+ RuneTypeNumberIntZero // leading zero only; non-leading zeros are IntDig, not IntZero
+ RuneTypeNumberIntDig // digit of the integer part
+ RuneTypeNumberFracDot // '.'
+ RuneTypeNumberFracDig // digit of the fraction part
+ RuneTypeNumberExpE // 'e' or 'E'
+ RuneTypeNumberExpSign // '-' or '+' after the 'e'
+ RuneTypeNumberExpDig // digit of the exponent
+
+ RuneTypeTrueT // "true" : t
+ RuneTypeTrueR // "true" : r
+ RuneTypeTrueU // "true" : u
+ RuneTypeTrueE // "true" : e
+
+ RuneTypeFalseF // "false" : f
+ RuneTypeFalseA // "false" : a
+ RuneTypeFalseL // "false" : l
+ RuneTypeFalseS // "false" : s
+ RuneTypeFalseE // "false" : e
+
+ RuneTypeNullN // "null" : n
+ RuneTypeNullU // "null" : u
+ RuneTypeNullL1 // "null" : first l
+ RuneTypeNullL2 // "null" : second l
+
+ RuneTypeEOF // end of the JSON document; see HandleRune and HandleEOF
+
+ // Not a real rune type, but used as a stack state.
+ runeTypeAny
+)
+
+// GoString implements fmt.GoStringer.
+//
+// The result is the Go identifier of the matching constant (for
+// example "RuneTypeObjectBeg"); a value without a named constant is
+// rendered as "RuneType(N)".
+//
+//nolint:dupl // False positive due to similarly shaped AST.
+func (t RuneType) GoString() string {
+	switch t {
+	case RuneTypeError:
+		return "RuneTypeError"
+
+	case RuneTypeSpace:
+		return "RuneTypeSpace"
+
+	case RuneTypeObjectBeg:
+		return "RuneTypeObjectBeg"
+	case RuneTypeObjectColon:
+		return "RuneTypeObjectColon"
+	case RuneTypeObjectComma:
+		return "RuneTypeObjectComma"
+	case RuneTypeObjectEnd:
+		return "RuneTypeObjectEnd"
+
+	case RuneTypeArrayBeg:
+		return "RuneTypeArrayBeg"
+	case RuneTypeArrayComma:
+		return "RuneTypeArrayComma"
+	case RuneTypeArrayEnd:
+		return "RuneTypeArrayEnd"
+
+	case RuneTypeStringBeg:
+		return "RuneTypeStringBeg"
+	case RuneTypeStringChar:
+		return "RuneTypeStringChar"
+	case RuneTypeStringEsc:
+		return "RuneTypeStringEsc"
+	case RuneTypeStringEsc1:
+		return "RuneTypeStringEsc1"
+	case RuneTypeStringEscU:
+		return "RuneTypeStringEscU"
+	case RuneTypeStringEscUA:
+		return "RuneTypeStringEscUA"
+	case RuneTypeStringEscUB:
+		return "RuneTypeStringEscUB"
+	case RuneTypeStringEscUC:
+		return "RuneTypeStringEscUC"
+	case RuneTypeStringEscUD:
+		return "RuneTypeStringEscUD"
+	case RuneTypeStringEnd:
+		return "RuneTypeStringEnd"
+
+	case RuneTypeNumberIntNeg:
+		return "RuneTypeNumberIntNeg"
+	case RuneTypeNumberIntZero:
+		return "RuneTypeNumberIntZero"
+	case RuneTypeNumberIntDig:
+		return "RuneTypeNumberIntDig"
+	case RuneTypeNumberFracDot:
+		return "RuneTypeNumberFracDot"
+	case RuneTypeNumberFracDig:
+		return "RuneTypeNumberFracDig"
+	case RuneTypeNumberExpE:
+		return "RuneTypeNumberExpE"
+	case RuneTypeNumberExpSign:
+		return "RuneTypeNumberExpSign"
+	case RuneTypeNumberExpDig:
+		return "RuneTypeNumberExpDig"
+
+	case RuneTypeTrueT:
+		return "RuneTypeTrueT"
+	case RuneTypeTrueR:
+		return "RuneTypeTrueR"
+	case RuneTypeTrueU:
+		return "RuneTypeTrueU"
+	case RuneTypeTrueE:
+		return "RuneTypeTrueE"
+
+	case RuneTypeFalseF:
+		return "RuneTypeFalseF"
+	case RuneTypeFalseA:
+		return "RuneTypeFalseA"
+	case RuneTypeFalseL:
+		return "RuneTypeFalseL"
+	case RuneTypeFalseS:
+		return "RuneTypeFalseS"
+	case RuneTypeFalseE:
+		return "RuneTypeFalseE"
+
+	case RuneTypeNullN:
+		return "RuneTypeNullN"
+	case RuneTypeNullU:
+		return "RuneTypeNullU"
+	case RuneTypeNullL1:
+		return "RuneTypeNullL1"
+	case RuneTypeNullL2:
+		return "RuneTypeNullL2"
+
+	case RuneTypeEOF:
+		return "RuneTypeEOF"
+
+	case runeTypeAny:
+		return "runeTypeAny"
+
+	default:
+		return fmt.Sprintf("RuneType(%d)", t)
+	}
+}
+
+// String implements fmt.Stringer.
+//
+// Each RuneType is rendered as a short glyph; the concatenation of
+// these glyphs is the notation used for parser stacks (see
+// stackString).  A value without a named constant is rendered as
+// "<N>".
+//
+// The literal keywords use distinct alphabets so that, for example,
+// the 'e' of "true", the 'e' of "false" and the 'e' of an exponent
+// can be told apart: double-struck for "true", fraktur for "false",
+// circled for "null" (with an uppercase circled L for the second
+// 'l').  A switch is used rather than a map literal so that no map
+// is built on every call.
+//
+//nolint:dupl // False positive due to similarly shaped AST.
+func (t RuneType) String() string {
+	switch t {
+	case RuneTypeError:
+		return "x"
+
+	case RuneTypeSpace:
+		return " "
+
+	case RuneTypeObjectBeg:
+		return "{"
+	case RuneTypeObjectColon:
+		return ":"
+	case RuneTypeObjectComma:
+		return "o"
+	case RuneTypeObjectEnd:
+		return "}"
+
+	case RuneTypeArrayBeg:
+		return "["
+	case RuneTypeArrayComma:
+		return "a"
+	case RuneTypeArrayEnd:
+		return "]"
+
+	case RuneTypeStringBeg:
+		return "\""
+	case RuneTypeStringChar:
+		return "c"
+	case RuneTypeStringEsc:
+		return "\\"
+	case RuneTypeStringEsc1:
+		return "b"
+	case RuneTypeStringEscU:
+		return "u"
+	case RuneTypeStringEscUA:
+		return "A"
+	case RuneTypeStringEscUB:
+		return "B"
+	case RuneTypeStringEscUC:
+		return "C"
+	case RuneTypeStringEscUD:
+		return "D"
+	case RuneTypeStringEnd:
+		// Kept byte-for-byte: the expected stacks in
+		// TestParserHandleRune (parse_test.go) and the examples in
+		// the Parser.stack comment spell this state exactly this
+		// way.
+		return "ยป"
+
+	case RuneTypeNumberIntNeg:
+		return "-"
+	case RuneTypeNumberIntZero:
+		return "0"
+	case RuneTypeNumberIntDig:
+		return "1"
+	case RuneTypeNumberFracDot:
+		return "."
+	case RuneTypeNumberFracDig:
+		return "2"
+	case RuneTypeNumberExpE:
+		return "e"
+	case RuneTypeNumberExpSign:
+		return "+"
+	case RuneTypeNumberExpDig:
+		return "3"
+
+	case RuneTypeTrueT:
+		return "𝕥" // double-struck
+	case RuneTypeTrueR:
+		return "𝕣"
+	case RuneTypeTrueU:
+		return "𝕦"
+	case RuneTypeTrueE:
+		return "𝕖"
+
+	case RuneTypeFalseF:
+		return "𝔣" // fraktur
+	case RuneTypeFalseA:
+		return "𝔞"
+	case RuneTypeFalseL:
+		return "𝔩"
+	case RuneTypeFalseS:
+		return "𝔰"
+	case RuneTypeFalseE:
+		return "𝔢"
+
+	case RuneTypeNullN:
+		return "ⓝ" // circled
+	case RuneTypeNullU:
+		return "ⓤ"
+	case RuneTypeNullL1:
+		return "ⓛ"
+	case RuneTypeNullL2:
+		return "Ⓛ" // +uppercase
+
+	case RuneTypeEOF:
+		return "$"
+
+	case runeTypeAny:
+		return "?"
+
+	default:
+		return fmt.Sprintf("<%d>", t)
+	}
+}
+
+// JSONType returns the name of the kind of JSON value that starts
+// with a rune of this type: "object", "array", "string", "number",
+// "true", "false" or "null".  RuneTypeEOF yields "eof".  Every other
+// RuneType yields the empty string.
+func (t RuneType) JSONType() string {
+	switch t {
+	case RuneTypeObjectBeg:
+		return "object"
+	case RuneTypeArrayBeg:
+		return "array"
+	case RuneTypeStringBeg:
+		return "string"
+	case RuneTypeNumberIntNeg, RuneTypeNumberIntZero, RuneTypeNumberIntDig:
+		return "number"
+	case RuneTypeTrueT:
+		return "true"
+	case RuneTypeFalseF:
+		return "false"
+	case RuneTypeNullN:
+		return "null"
+	case RuneTypeEOF:
+		return "eof"
+	default:
+		return ""
+	}
+}
+
+// IsNumber reports whether t is one of the RuneTypeNumberXXX
+// values.  It relies on those constants being declared
+// contiguously, from RuneTypeNumberIntNeg to RuneTypeNumberExpDig.
+func (t RuneType) IsNumber() bool {
+	if t < RuneTypeNumberIntNeg {
+		return false
+	}
+	return t <= RuneTypeNumberExpDig
+}
+
+// Parser is the low-level JSON parser that powers both *Decoder and
+// *ReEncoder.
+//
+// The zero Parser is ready to use; the stack is seeded lazily the
+// first time the Parser is used (see init).
+type Parser struct {
+ // Setting MaxDepth to a value greater than 0 causes
+ // HandleRune to return ErrParserExceededMaxDepth if
+ // objects/arrays become nested more deeply than this.
+ MaxDepth int
+
+ // initialized records whether the stack has been seeded with
+ // runeTypeAny yet.
+ initialized bool
+
+ // err is the sticky error stored by HandleEOF; while it is
+ // non-nil, HandleRune and HandleEOF return it without doing
+ // anything else.
+ err error
+ // closed is set by HandleEOF; while it is set, HandleRune and
+ // HandleEOF return io/fs.ErrClosed.  PushReadBarrier and
+ // PushWriteBarrier save it, and PopBarrier restores it.
+ closed bool
+
+ // We reuse RuneTypes to store the stack. The base idea is:
+ // stack items are "the most recently read stack-relevant
+ // RuneType".
+ //
+ // The stack starts out with the special pseudo-RuneType
+ // `runeTypeAny` that means we're willing to accept any
+ // element type; an empty stack means that we have reached the
+ // end of the top-level element and should accept no more
+ // input except for whitespace.
+ //
+ // The "normal" stack-relevant RuneTypes are:
+ //
+ //     "\uABC    for strings
+ //     -01.2e+3  for numbers
+ //     𝕥𝕣𝕦       for "true"
+ //     𝔣𝔞𝔩𝔰      for "false"
+ //     ⓝⓤⓛ       for "null"
+ //
+ // Objects and arrays break the "most recently read RuneType"
+ // rule; they need some special assignments:
+ //
+ //     {   object: waiting for key to start or '}'
+ //     ยป  object: reading key / waiting for colon
+ //     o   object: reading value / waiting for ',' or '}'
+ //
+ //     [   array: waiting for item to start or ']'
+ //     a   array: reading item / waiting for ',' or ']'
+ //
+ // Within each element type, the stack item is replaced, not pushed.
+ //
+ // (Keep each of these examples in-sync with parse_test.go.)
+ //
+ // For example, given the input string
+ //
+ //     {"x":"y","a":"b"}
+ //
+ // The stack would be
+ //
+ //     stack   processed
+ //     ?
+ //     {       {
+ //     ยป"     {"
+ //     ยป"     {"x
+ //     ยป      {"x"
+ //     o?      {"x":
+ //     o"      {"x":"
+ //     o"      {"x":"y
+ //     o       {"x":"y"
+ //     {       {"x":"y",
+ //     ยป"     {"x":"y","
+ //     ยป"     {"x":"y","a
+ //     ยป      {"x":"y","a"
+ //     o?      {"x":"y","a":
+ //     o"      {"x":"y","a":"
+ //     o"      {"x":"y","a":"b
+ //     o       {"x":"y","a":"b"
+ //             {"x":"y","a":"b"}
+ //
+ // Or, given the input string
+ //
+ //     ["x","y"]
+ //
+ // The stack would be
+ //
+ //     stack   processed
+ //     ?
+ //     [       [
+ //     a"      ["
+ //     a"      ["x
+ //     a       ["x"
+ //     a?      ["x",
+ //     a"      ["x","
+ //     a"      ["x","y
+ //     a       ["x","y"
+ //             ["x","y"]
+ stack []RuneType
+
+ // barriers holds the outer parser state set aside by
+ // PushReadBarrier and PushWriteBarrier, innermost last;
+ // PopBarrier restores the most recent entry.
+ barriers []barrier
+}
+
+// barrier is the outer parser state that PushReadBarrier and
+// PushWriteBarrier set aside, and that PopBarrier restores.
+type barrier struct {
+ closed bool // value of Parser.closed at the time the barrier was pushed
+ stack []RuneType // the stack items hidden from the sub-parser; PopBarrier puts them back underneath the current stack
+}
+
+// init seeds the stack with runeTypeAny the first time it is called
+// on a Parser (or the first time after Reset); later calls do
+// nothing.
+func (par *Parser) init() {
+	if par.initialized {
+		return
+	}
+	par.pushState(runeTypeAny)
+	par.initialized = true
+}
+
+// pushState pushes state on top of the stack and returns it, so that
+// callers can write "return par.pushState(x), nil".
+func (par *Parser) pushState(state RuneType) RuneType {
+	grown := append(par.stack, state)
+	par.stack = grown
+	return state
+}
+
+// replaceState overwrites the top-of-stack item with state and
+// returns state.  The stack must not be empty.
+func (par *Parser) replaceState(state RuneType) RuneType {
+	top := len(par.stack) - 1
+	par.stack[top] = state
+	return state
+}
+
+// popState removes the top-of-stack item.  The stack must not be
+// empty.
+func (par *Parser) popState() {
+	top := len(par.stack) - 1
+	par.stack = par.stack[:top]
+}
+
+// stackString renders the current stack, bottom first, as the
+// concatenation of each item's String() glyph.  Items set aside by
+// barriers are not included.
+func (par *Parser) stackString() string {
+	par.init()
+	var sb strings.Builder
+	for i := range par.stack {
+		sb.WriteString(par.stack[i].String())
+	}
+	return sb.String()
+}
+
+// depth returns the total number of stack items: those on the
+// current stack plus those set aside by every barrier.
+func (par *Parser) depth() int {
+	total := len(par.stack)
+	for i := range par.barriers {
+		total += len(par.barriers[i].stack)
+	}
+	return total
+}
+
+// StackIsEmpty reports whether the parser is outside of any element:
+// no barrier is in place, and the stack is either empty or holds
+// nothing but the initial runeTypeAny placeholder.
+func (par *Parser) StackIsEmpty() bool {
+	if len(par.barriers) != 0 {
+		return false
+	}
+	switch len(par.stack) {
+	case 0:
+		return true
+	case 1:
+		return par.stack[0] == runeTypeAny
+	default:
+		return false
+	}
+}
+
+// StackSize returns the number of items on the current stack.  Items
+// set aside by barriers are not counted.
+func (par *Parser) StackSize() int {
+	size := len(par.stack)
+	return size
+}
+
+// Reset discards all Parser state (stack, barriers, stored error and
+// closed flag); only the MaxDepth setting is kept.
+func (par *Parser) Reset() {
+	maxDepth := par.MaxDepth
+	*par = Parser{}
+	par.MaxDepth = maxDepth
+}
+
+// PushReadBarrier causes the parser to expect EOF once the end of the
+// element that is started by the current top-of-stack is reached,
+// until this is un-done with PopBarrier.  It essentially turns the
+// parser into a sub-parser.
+//
+// PushReadBarrier may only be called at the beginning of an element,
+// whether that be
+//
+//   - runeTypeAny
+//   - RuneTypeObjectBeg
+//   - RuneTypeArrayBeg
+//   - RuneTypeStringBeg
+//   - RuneTypeNumberIntNeg, RuneTypeNumberIntZero, RuneTypeNumberIntDig
+//   - RuneTypeTrueT
+//   - RuneTypeFalseF
+//   - RuneTypeNullN
+//
+// It panics if called with an empty stack or in any other state.
+func (par *Parser) PushReadBarrier() {
+	par.init()
+
+	// Sanity checking.
+	top := len(par.stack) - 1
+	if top < 0 {
+		panic(errors.New("illegal PushReadBarrier call: empty stack"))
+	}
+	cur := par.stack[top]
+	atElementStart := false
+	for _, ok := range [...]RuneType{
+		runeTypeAny,
+		RuneTypeObjectBeg,
+		RuneTypeArrayBeg,
+		RuneTypeStringBeg,
+		RuneTypeNumberIntNeg, RuneTypeNumberIntZero, RuneTypeNumberIntDig,
+		RuneTypeTrueT,
+		RuneTypeFalseF,
+		RuneTypeNullN,
+	} {
+		if cur == ok {
+			atElementStart = true
+			break
+		}
+	}
+	if !atElementStart {
+		panic(fmt.Errorf("illegal PushReadBarrier call: %q", cur))
+	}
+
+	// Actually push: everything below the current element is set
+	// aside, and the sub-parser starts with just that element.
+	saved := barrier{
+		closed: par.closed,
+		stack:  par.stack[:top],
+	}
+	par.barriers = append(par.barriers, saved)
+	par.stack = []RuneType{cur}
+}
+
+// PushWriteBarrier causes the parser to expect EOF once the end of
+// the about-to-start element is reached, until this is un-done with
+// PopBarrier.  It essentially turns the parser into a sub-parser.
+//
+// PushWriteBarrier may only be called at the places where an element
+// of any type may start:
+//
+//   - runeTypeAny for top-level and object-value elements
+//   - RuneTypeArrayBeg for array-item elements
+//
+// PushWriteBarrier signals intent to write an element; if it is
+// called in a place where an element is optional (at the beginning of
+// an array), it becomes a syntax error to not write the element.
+//
+// It panics if called with an empty stack or in any other state.
+func (par *Parser) PushWriteBarrier() {
+	par.init()
+	if len(par.stack) == 0 {
+		panic(errors.New("illegal PushWriteBarrier call: empty stack"))
+	}
+	switch top := par.stack[len(par.stack)-1]; top {
+	case runeTypeAny:
+		// The placeholder moves over to the sub-parser's stack.
+		par.popState()
+	case RuneTypeArrayBeg:
+		// Commit to there being an item: once the barrier is
+		// popped the array expects ',' or ']'.
+		par.replaceState(RuneTypeArrayComma)
+	default:
+		panic(fmt.Errorf("illegal PushWriteBarrier call: %q", top))
+	}
+	saved := barrier{
+		closed: par.closed,
+		stack:  par.stack,
+	}
+	par.barriers = append(par.barriers, saved)
+	par.stack = []RuneType{runeTypeAny}
+}
+
+// PopBarrier reverses a call to PushReadBarrier or PushWriteBarrier:
+// the saved closed flag is restored, and the saved stack items are
+// put back underneath whatever is on the current stack.
+//
+// It panics if no barrier is in place.
+func (par *Parser) PopBarrier() {
+	last := len(par.barriers) - 1
+	if last < 0 {
+		panic(errors.New("illegal PopBarrier call: empty barrier stack"))
+	}
+	saved := par.barriers[last]
+	par.barriers = par.barriers[:last]
+	par.closed = saved.closed
+	par.stack = append(saved.stack, par.stack...)
+}
+
+// HandleEOF feeds EOF to the Parser.  The returned RuneType is either
+// RuneTypeEOF or RuneTypeError.
+//
+// An error is returned if and only if the RuneType is RuneTypeError.
+// Returns io/fs.ErrClosed if .HandleEOF() has previously been called
+// (and .Reset() has not been called since).
+//
+// Once RuneTypeError or RuneTypeEOF has been returned, it will keep
+// being returned from both .HandleRune(c) and .HandleEOF() until
+// .Reset() is called.
+//
+// RuneTypeEOF indicates that a complete JSON document has been read.
+// If no part of a document has been read yet the error is io.EOF; if
+// the document is incomplete the error is io.ErrUnexpectedEOF.
+func (par *Parser) HandleEOF() (RuneType, error) {
+	if par.closed {
+		return RuneTypeError, iofs.ErrClosed
+	}
+	// Whatever the outcome below, the parser is closed afterwards.
+	defer func() { par.closed = true }()
+	if par.err != nil {
+		return RuneTypeError, par.err
+	}
+	par.init()
+	if len(par.stack) == 0 {
+		return RuneTypeEOF, nil
+	}
+	if len(par.stack) == 1 {
+		if top := par.stack[0]; top.IsNumber() {
+			// A top-level number has no closing delimiter; feed
+			// whitespace to find out whether it is complete.
+			if _, err := par.HandleRune('\n'); err == nil {
+				return RuneTypeEOF, nil
+			}
+		} else if top == runeTypeAny {
+			// Nothing but (possibly) whitespace was read.
+			par.err = io.EOF
+			return RuneTypeError, par.err
+		}
+	}
+	par.err = io.ErrUnexpectedEOF
+	return RuneTypeError, par.err
+}
+
+// HandleRune feeds a Unicode rune to the Parser.
+//
+// An error is returned if and only if the RuneType is RuneTypeError.
+// Returns io/fs.ErrClosed if .HandleEOF() has previously been called
+// (and .Reset() has not been called since).
+//
+// Once RuneTypeError or RuneTypeEOF has been returned, it will keep
+// being returned from both .HandleRune(c) and .HandleEOF() until
+// .Reset() is called.
+//
+// RuneTypeEOF indicates that the rune cannot be appended to the JSON
+// document; a new JSON document must be started in order to process
+// that rune.
+//
+// NOTE(review): no code path in this function assigns par.err, so an
+// error returned from here is not stored in the Parser; only errors
+// stored by HandleEOF are replayed by the par.err check below.
+// Confirm that callers stop feeding runes after the first error.
+//
+// It panics if the top of the stack holds a RuneType that is not a
+// valid stack state.
+func (par *Parser) HandleRune(c rune) (RuneType, error) {
+ if par.closed {
+ return RuneTypeError, iofs.ErrClosed
+ }
+ if par.err != nil {
+ return RuneTypeError, par.err
+ }
+ par.init()
+ // An empty stack means that the top-level element is complete:
+ // only whitespace may follow it.
+ if len(par.stack) == 0 {
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009: // space, LF, CR, tab
+ return RuneTypeSpace, nil
+ default:
+ return RuneTypeEOF, nil
+ }
+ }
+ // Dispatch on the top-of-stack state.
+ switch par.stack[len(par.stack)-1] {
+ // any /////////////////////////////////////////////////////////////////////////////////////
+ case runeTypeAny:
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return RuneTypeSpace, nil
+ case '{':
+ if par.MaxDepth > 0 && par.depth() > par.MaxDepth {
+ return RuneTypeError, ErrParserExceededMaxDepth
+ }
+ return par.replaceState(RuneTypeObjectBeg), nil
+ case '[':
+ if par.MaxDepth > 0 && par.depth() > par.MaxDepth {
+ return RuneTypeError, ErrParserExceededMaxDepth
+ }
+ return par.replaceState(RuneTypeArrayBeg), nil
+ case '"':
+ return par.replaceState(RuneTypeStringBeg), nil
+ case '-':
+ return par.replaceState(RuneTypeNumberIntNeg), nil
+ case '0':
+ return par.replaceState(RuneTypeNumberIntZero), nil
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberIntDig), nil
+ case 't':
+ return par.replaceState(RuneTypeTrueT), nil
+ case 'f':
+ return par.replaceState(RuneTypeFalseF), nil
+ case 'n':
+ return par.replaceState(RuneTypeNullN), nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q looking for beginning of value", c)
+ }
+ // object //////////////////////////////////////////////////////////////////////////////////
+ case RuneTypeObjectBeg: // waiting for key to start or '}'
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return RuneTypeSpace, nil
+ case '"':
+ // The object's own state becomes "reading key"; the key
+ // string is pushed on top of it.
+ par.replaceState(RuneTypeStringEnd)
+ return par.pushState(RuneTypeStringBeg), nil
+ case '}':
+ par.popState()
+ return RuneTypeObjectEnd, nil
+ default:
+ return RuneTypeError, fmt.Errorf("object: unexpected character: %q", c)
+ }
+ case RuneTypeStringEnd: // waiting for ':'
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return RuneTypeSpace, nil
+ case ':':
+ // The object's own state becomes "reading value"; the
+ // value may be of any type.
+ par.replaceState(RuneTypeObjectComma)
+ par.pushState(runeTypeAny)
+ return RuneTypeObjectColon, nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q after object key", c)
+ }
+ case RuneTypeObjectComma: // waiting for ',' or '}'
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return RuneTypeSpace, nil
+ case ',':
+ par.replaceState(RuneTypeObjectBeg)
+ return RuneTypeObjectComma, nil
+ case '}':
+ par.popState()
+ return RuneTypeObjectEnd, nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q after object key:value pair", c)
+ }
+ // array ///////////////////////////////////////////////////////////////////////////////////
+ case RuneTypeArrayBeg: // waiting for item to start or ']'
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return RuneTypeSpace, nil
+ case ']':
+ par.popState()
+ return RuneTypeArrayEnd, nil
+ default:
+ // Anything else must be the start of the first item:
+ // switch to "reading item" and re-dispatch the rune
+ // against a fresh runeTypeAny.
+ par.replaceState(RuneTypeArrayComma)
+ par.pushState(runeTypeAny)
+ return par.HandleRune(c)
+ }
+ case RuneTypeArrayComma: // waiting for ',' or ']'
+ switch c {
+ case 0x0020, 0x000A, 0x000D, 0x0009:
+ return RuneTypeSpace, nil
+ case ',':
+ par.pushState(runeTypeAny)
+ return RuneTypeArrayComma, nil
+ case ']':
+ par.popState()
+ return RuneTypeArrayEnd, nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q after array element", c)
+ }
+ // string //////////////////////////////////////////////////////////////////////////////////
+ case RuneTypeStringBeg: // waiting for char or '"'
+ switch {
+ case c == '\\':
+ return par.replaceState(RuneTypeStringEsc), nil
+ case c == '"':
+ par.popState()
+ return RuneTypeStringEnd, nil
+ case 0x0020 <= c && c <= 0x10FFFF:
+ return RuneTypeStringChar, nil
+ default:
+ return RuneTypeError, fmt.Errorf("string: unexpected character: %q", c)
+ }
+ case RuneTypeStringEsc: // waiting for escape char
+ switch c {
+ case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
+ par.replaceState(RuneTypeStringBeg)
+ return RuneTypeStringEsc1, nil
+ case 'u':
+ return par.replaceState(RuneTypeStringEscU), nil
+ default:
+ return RuneTypeError, fmt.Errorf("string backslash sequence: unexpected character: %q", c)
+ }
+ // The next four states consume the four hex digits of a \uABCD
+ // escape; after the last one the state returns to StringBeg.
+ case RuneTypeStringEscU:
+ if _, ok := HexToInt(c); ok {
+ return par.replaceState(RuneTypeStringEscUA), nil
+ } else {
+ return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c)
+ }
+ case RuneTypeStringEscUA:
+ if _, ok := HexToInt(c); ok {
+ return par.replaceState(RuneTypeStringEscUB), nil
+ } else {
+ return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c)
+ }
+ case RuneTypeStringEscUB:
+ if _, ok := HexToInt(c); ok {
+ return par.replaceState(RuneTypeStringEscUC), nil
+ } else {
+ return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c)
+ }
+ case RuneTypeStringEscUC:
+ if _, ok := HexToInt(c); ok {
+ par.replaceState(RuneTypeStringBeg)
+ return RuneTypeStringEscUD, nil
+ } else {
+ return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c)
+ }
+ // number //////////////////////////////////////////////////////////////////////////////////
+ //
+ // Here's a flattened drawing of the syntax diagram from www.json.org :
+ //
+ //     [------------ integer ----------][-- fraction ---][-------- exponent -------]
+ //   >─╮─────╭─╮─"0"───────╭─────────╭──╮─────────────╭──╮───────────────────────╭─>
+ //     │     │ │           │         │  │             │  │                       │
+ //     ╰─"-"─╯ ╰─digit 1-9─╯─╭digit╮─╯  ╰─"."─╭digit╮─╯  ╰─"e"─╭─╮─────╭─╭digit╮─╯
+ //                           ╰──<──╯          ╰──<──╯    │     │ │     │ ╰──<──╯
+ //                                                       ╰─"E"─╯ ╰─"-"─╯
+ //                                                               │     │
+ //                                                               ╰─"+"─╯
+ //
+ // Now here it is slightly redrawn, and with each distinct state our
+ // parser can be in marked with a single-capital-letter:
+ //
+ //       [-------------- integer ------------][--------- fraction --------][--------- exponent ---------]
+ //   >─A─╮───────╭──╮─"0"─────────C─╭─────────╮──────────────────╭─────────╮──────────────────────────╭─>
+ //       │       │  │               │         │                  │         │                          │
+ //       ╰─"-"─B─╯  ╰─digit 1-9─╭─D─╯─digit╮  ╰─"."─E─digit──╭─F─╯─digit╮  ╰─"e"─╭─G─╮─────╭─╭digit─I─╯
+ //                              ╰────<─────╯                 ╰────<─────╯  │     │   │     H ╰────<───╯
+ //                                                                         ╰─"E"─╯   ╰─"-"─╯
+ //                                                                                   │     │
+ //                                                                                   ╰─"+"─╯
+ //
+ // You may notice that each of these states may be uniquely identified
+ // by the last-read RuneType:
+ //
+ //     A = (nothing yet)
+ //     B = IntNeg
+ //     C = IntZero
+ //     D = IntDig
+ //     E = FracDot
+ //     F = FracDig
+ //     G = ExpE
+ //     H = ExpSign
+ //     I = ExpDig
+ //
+ // The 'A' state is part of the runeTypeAny case above, and
+ // the remainder follow:
+ //
+ // In the states where the number may legally end (C, D, F, I), a
+ // rune that cannot continue the number pops the number off the
+ // stack and is then re-dispatched against the enclosing state.
+ case RuneTypeNumberIntNeg: // B
+ switch c {
+ case '0':
+ return par.replaceState(RuneTypeNumberIntZero), nil
+ case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberIntDig), nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c)
+ }
+ case RuneTypeNumberIntZero: // C
+ switch c {
+ case '.':
+ return par.replaceState(RuneTypeNumberFracDot), nil
+ case 'e', 'E':
+ return par.replaceState(RuneTypeNumberExpE), nil
+ default:
+ par.popState()
+ return par.HandleRune(c)
+ }
+ case RuneTypeNumberIntDig: // D
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberIntDig), nil
+ case '.':
+ return par.replaceState(RuneTypeNumberFracDot), nil
+ case 'e', 'E':
+ return par.replaceState(RuneTypeNumberExpE), nil
+ default:
+ par.popState()
+ return par.HandleRune(c)
+ }
+ case RuneTypeNumberFracDot: // E
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberFracDig), nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c)
+ }
+ case RuneTypeNumberFracDig: // F
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberFracDig), nil
+ case 'e', 'E':
+ return par.replaceState(RuneTypeNumberExpE), nil
+ default:
+ par.popState()
+ return par.HandleRune(c)
+ }
+ case RuneTypeNumberExpE: // G
+ switch c {
+ case '-', '+':
+ return par.replaceState(RuneTypeNumberExpSign), nil
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberExpDig), nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c)
+ }
+ case RuneTypeNumberExpSign: // H
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberExpDig), nil
+ default:
+ return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c)
+ }
+ case RuneTypeNumberExpDig: // I
+ switch c {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ return par.replaceState(RuneTypeNumberExpDig), nil
+ default:
+ par.popState()
+ return par.HandleRune(c)
+ }
+ // literals ////////////////////////////////////////////////////////////////////////////////
+ //
+ // Each state expects exactly one rune; the final rune of each
+ // literal pops the literal off the stack (last argument true).
+ // true
+ case RuneTypeTrueT:
+ return par.expectRune(c, 'r', RuneTypeTrueR, "true", false)
+ case RuneTypeTrueR:
+ return par.expectRune(c, 'u', RuneTypeTrueU, "true", false)
+ case RuneTypeTrueU:
+ return par.expectRune(c, 'e', RuneTypeTrueE, "true", true)
+ // false
+ case RuneTypeFalseF:
+ return par.expectRune(c, 'a', RuneTypeFalseA, "false", false)
+ case RuneTypeFalseA:
+ return par.expectRune(c, 'l', RuneTypeFalseL, "false", false)
+ case RuneTypeFalseL:
+ return par.expectRune(c, 's', RuneTypeFalseS, "false", false)
+ case RuneTypeFalseS:
+ return par.expectRune(c, 'e', RuneTypeFalseE, "false", true)
+ // null
+ case RuneTypeNullN:
+ return par.expectRune(c, 'u', RuneTypeNullU, "null", false)
+ case RuneTypeNullU:
+ return par.expectRune(c, 'l', RuneTypeNullL1, "null", false)
+ case RuneTypeNullL1:
+ return par.expectRune(c, 'l', RuneTypeNullL2, "null", true)
+ default:
+ panic(fmt.Errorf(`invalid stack: "%s"`, par.stackString()))
+ }
+}
+
+// expectRune handles one rune of a keyword literal ("true", "false"
+// or "null").
+//
+// If c is not the expected rune exp, it returns RuneTypeError and an
+// error that names the literal (context).  Otherwise it returns typ,
+// and either pops the literal off the stack (pop is true, i.e. this
+// was the literal's last rune) or records typ as the new
+// top-of-stack state.
+func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) {
+	switch {
+	case c != exp:
+		return RuneTypeError, fmt.Errorf("invalid character %q in literal %s (expecting %q)", c, context, exp)
+	case pop:
+		par.popState()
+		return typ, nil
+	default:
+		return par.replaceState(typ), nil
+	}
+}
diff --git a/internal/jsonparse/parse_test.go b/internal/jsonparse/parse_test.go
new file mode 100644
index 0000000..e531daf
--- /dev/null
+++ b/internal/jsonparse/parse_test.go
@@ -0,0 +1,78 @@
+// Copyright (C) 2023 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package jsonparse
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+// TestParserHandleRune feeds each test input to a zero-value Parser
+// one rune at a time, and checks the rendered stack
+// (Parser.stackString) before and after every rune against the
+// expected sequence.
+func TestParserHandleRune(t *testing.T) {
+ t.Parallel()
+ type testcase struct {
+ Input string
+ // ExpStack[i] is the expected stack after i runes of Input
+ // have been processed, so it has one more entry than Input
+ // has runes.
+ ExpStack []string
+ }
+ testcases := map[string]testcase{
+ // Keep these test-cases in-sync with the examples in parse.go.
+ "object": {
+ Input: `{"x":"y","a":"b"}`,
+ ExpStack: []string{
+ // st,// processed
+ `?`,
+ `{`, // {
+ `ยป"`, // {"
+ `ยป"`, // {"x
+ `ยป`, // {"x"
+ `o?`, // {"x":
+ `o"`, // {"x":"
+ `o"`, // {"x":"y
+ `o`, // {"x":"y"
+ `{`, // {"x":"y",
+ `ยป"`, // {"x":"y","
+ `ยป"`, // {"x":"y","a
+ `ยป`, // {"x":"y","a"
+ `o?`, // {"x":"y","a":
+ `o"`, // {"x":"y","a":"
+ `o"`, // {"x":"y","a":"b
+ `o`, // {"x":"y","a":"b"
+ ``, // {"x":"y","a":"b"}
+ },
+ },
+ "array": {
+ Input: `["x","y"]`,
+ ExpStack: []string{
+ // st,// processed
+ `?`,
+ `[`, // [
+ `a"`, // ["
+ `a"`, // ["x
+ `a`, // ["x"
+ `a?`, // ["x",
+ `a"`, // ["x","
+ `a"`, // ["x","y
+ `a`, // ["x","y"
+ ``, // ["x","y"]
+ },
+ },
+ }
+ for tcName, tc := range testcases {
+ tc := tc
+ t.Run(tcName, func(t *testing.T) {
+ t.Parallel()
+ var par Parser
+ // len(tc.Input) counts bytes and the loop index i below is a
+ // byte offset, so this check and the ExpStack indexing are
+ // only right for ASCII inputs (which both inputs above are).
+ if !assert.Equal(t, len(tc.Input)+1, len(tc.ExpStack)) {
+ return
+ }
+ for i, r := range tc.Input {
+ assert.Equal(t, tc.ExpStack[i], par.stackString())
+ _, err := par.HandleRune(r)
+ assert.NoError(t, err)
+ assert.Equal(t, tc.ExpStack[i+1], par.stackString())
+ }
+ })
+ }
+}