From a6cd78ec94f76feba180fa75e942bb5cdeae115f Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 21:05:24 -0700 Subject: Move string-encoding to an internal/jsonstring package --- internal/jsonstring/encode_string.go | 128 +++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 internal/jsonstring/encode_string.go (limited to 'internal') diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go new file mode 100644 index 0000000..f29dc3f --- /dev/null +++ b/internal/jsonstring/encode_string.go @@ -0,0 +1,128 @@ +// Copyright (C) 2022-2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package jsonstring + +import ( + "fmt" + "io" + "unicode/utf8" + + "git.lukeshu.com/go/lowmemjson/internal/fastio" + "git.lukeshu.com/go/lowmemjson/internal/jsonparse" +) + +// BackslashEscapeMode is describe in the main lowmemjson package +// docs. +type BackslashEscapeMode uint8 + +const ( + BackslashEscapeNone BackslashEscapeMode = iota + BackslashEscapeShort + BackslashEscapeUnicode +) + +// BackslashEscaper is describe in the main lowmemjson package docs. +type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode + +func writeStringUnicodeEscape(w io.Writer, c rune) (int, error) { + buf := [6]byte{ + '\\', + 'u', + jsonparse.Hex[(c>>12)&0xf], + jsonparse.Hex[(c>>8)&0xf], + jsonparse.Hex[(c>>4)&0xf], + jsonparse.Hex[(c>>0)&0xf], + } + return w.Write(buf[:]) +} + +func writeStringShortEscape(w io.Writer, c rune) (int, error) { + var b byte + switch c { + case '"', '\\', '/': + b = byte(c) + case '\b': + b = 'b' + case '\f': + b = 'f' + case '\n': + b = 'n' + case '\r': + b = 'r' + case '\t': + b = 't' + default: + panic(fmt.Errorf("should not happen: writeStringShortEscape called with invalid rune: %q", c)) + } + buf := [2]byte{'\\', b} + return w.Write(buf[:]) +} + +func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) (int, error) { + switch escape { + case BackslashEscapeNone: + switch { + case c < 0x0020: // override, gotta escape these + switch c { + case '\b', '\f', '\n', '\r', '\t': // short-escape if possible + return writeStringShortEscape(w, c) + default: + return writeStringUnicodeEscape(w, c) + } + case c == '"' || c == '\\': // override, gotta escape these + return writeStringShortEscape(w, c) + default: // obey + return w.WriteRune(c) + } + case BackslashEscapeShort: + switch c { + case '"', '\\', '/', '\b', '\f', '\n', '\r', '\t': // obey + return writeStringShortEscape(w, c) + default: // override, can't short-escape these + return w.WriteRune(c) + } + case BackslashEscapeUnicode: + switch { + case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?) + return w.WriteRune(c) + default: // obey + return writeStringUnicodeEscape(w, c) + } + default: + panic("escaper returned an invalid escape mode") + } +} + +func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error { + if err := w.WriteByte('"'); err != nil { + return err + } + for _, c := range str { + if _, err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + return err + } + } + if err := w.WriteByte('"'); err != nil { + return err + } + return nil +} + +func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error { + if err := w.WriteByte('"'); err != nil { + return err + } + for i := 0; i < len(str); { + c, size := utf8.DecodeRune(str[i:]) + if _, err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + return err + } + i += size + } + if err := w.WriteByte('"'); err != nil { + return err + } + return nil +} -- cgit v1.2.3-2-g168b From 2b7fff828e29b63ae08a871b4b1e74784fab29e5 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 19:06:46 -0700 Subject: Clean up the hex handling --- internal/jsonparse/hex.go | 20 -------------------- internal/jsonparse/parse.go | 28 +++++++++++++++------------- internal/jsonstring/encode_string.go | 10 +++++----- 3 files changed, 20 insertions(+), 38 deletions(-) delete mode 100644 internal/jsonparse/hex.go (limited to 'internal') diff --git a/internal/jsonparse/hex.go b/internal/jsonparse/hex.go deleted file mode 100644 index 3ed5f01..0000000 --- a/internal/jsonparse/hex.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (C) 2022-2023 Luke Shumaker -// -// SPDX-License-Identifier: GPL-2.0-or-later - -package jsonparse - -const Hex = "0123456789abcdef" - -func HexToInt(c rune) (byte, bool) { - switch { - case '0' <= c && c <= '9': - return byte(c) - '0', true - case 'a' <= c && c <= 'f': - return byte(c) - 'a' + 10, true - case 'A' <= c && c <= 'F': - return byte(c) - 'A' + 10, true - default: - return 0, false - } -} diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index 73584d9..2f5c1ab 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -14,6 +14,12 @@ import ( var ErrParserExceededMaxDepth = errors.New("exceeded max depth") +func isHex(c rune) bool { + return ('0' <= c && c <= '9') || + ('a' <= c && c <= 'f') || + ('A' <= c && c <= 'F') +} + // RuneType is the classification of a rune when parsing JSON input. // A Parser, rather than grouping runes into tokens and classifying // tokens, classifies runes directly. @@ -667,30 +673,26 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return RuneTypeError, fmt.Errorf("string backslash sequence: unexpected character: %q", c) } case RuneTypeStringEscU: - if _, ok := HexToInt(c); ok { - return par.replaceState(RuneTypeStringEscUA), nil - } else { + if !isHex(c) { return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) } + return par.replaceState(RuneTypeStringEscUA), nil case RuneTypeStringEscUA: - if _, ok := HexToInt(c); ok { - return par.replaceState(RuneTypeStringEscUB), nil - } else { + if !isHex(c) { return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) } + return par.replaceState(RuneTypeStringEscUB), nil case RuneTypeStringEscUB: - if _, ok := HexToInt(c); ok { - return par.replaceState(RuneTypeStringEscUC), nil - } else { + if !isHex(c) { return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) } + return par.replaceState(RuneTypeStringEscUC), nil case RuneTypeStringEscUC: - if _, ok := HexToInt(c); ok { - par.replaceState(RuneTypeStringBeg) - return RuneTypeStringEscUD, nil - } else { + if !isHex(c) { return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) } + par.replaceState(RuneTypeStringBeg) + return RuneTypeStringEscUD, nil // number ////////////////////////////////////////////////////////////////////////////////// // // Here's a flattened drawing of the syntax diagram from www.json.org : diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index f29dc3f..a7670c6 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -10,7 +10,6 @@ import ( "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/fastio" - "git.lukeshu.com/go/lowmemjson/internal/jsonparse" ) // BackslashEscapeMode is describe in the main lowmemjson package @@ -27,13 +26,14 @@ const ( type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode func writeStringUnicodeEscape(w io.Writer, c rune) (int, error) { + const alphabet = "0123456789abcdef" buf := [6]byte{ '\\', 'u', - jsonparse.Hex[(c>>12)&0xf], - jsonparse.Hex[(c>>8)&0xf], - jsonparse.Hex[(c>>4)&0xf], - jsonparse.Hex[(c>>0)&0xf], + alphabet[(c>>12)&0xf], + alphabet[(c>>8)&0xf], + alphabet[(c>>4)&0xf], + alphabet[(c>>0)&0xf], } return w.Write(buf[:]) } -- cgit v1.2.3-2-g168b From f823342d5b9c2ca376d038471889176ab74acf1b Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 15:10:00 -0700 Subject: reencode: Don't bother tracking the number of bytes written --- internal/jsonstring/encode_string.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'internal') diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index a7670c6..1b0c68a 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -25,7 +25,7 @@ const ( // BackslashEscaper is describe in the main lowmemjson package docs. type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode -func writeStringUnicodeEscape(w io.Writer, c rune) (int, error) { +func writeStringUnicodeEscape(w io.Writer, c rune) error { const alphabet = "0123456789abcdef" buf := [6]byte{ '\\', @@ -35,10 +35,11 @@ func writeStringUnicodeEscape(w io.Writer, c rune) (int, error) { alphabet[(c>>4)&0xf], alphabet[(c>>0)&0xf], } - return w.Write(buf[:]) + _, err := w.Write(buf[:]) + return err } -func writeStringShortEscape(w io.Writer, c rune) (int, error) { +func writeStringShortEscape(w io.Writer, c rune) error { var b byte switch c { case '"', '\\', '/': @@ -57,10 +58,11 @@ func writeStringShortEscape(w io.Writer, c rune) (int, error) { panic(fmt.Errorf("should not happen: writeStringShortEscape called with invalid rune: %q", c)) } buf := [2]byte{'\\', b} - return w.Write(buf[:]) + _, err := w.Write(buf[:]) + return err } -func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) (int, error) { +func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) error { switch escape { case BackslashEscapeNone: switch { @@ -74,19 +76,22 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) (in case c == '"' || c == '\\': // override, gotta escape these return writeStringShortEscape(w, c) default: // obey - return w.WriteRune(c) + _, err := w.WriteRune(c) + return err } case BackslashEscapeShort: switch c { case '"', '\\', '/', '\b', '\f', '\n', '\r', '\t': // obey return writeStringShortEscape(w, c) default: // override, can't short-escape these - return w.WriteRune(c) + _, err := w.WriteRune(c) + return err } case BackslashEscapeUnicode: switch { case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?) - return w.WriteRune(c) + _, err := w.WriteRune(c) + return err default: // obey return writeStringUnicodeEscape(w, c) } @@ -100,7 +105,7 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st return err } for _, c := range str { - if _, err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { return err } } @@ -116,7 +121,7 @@ func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []b } for i := 0; i < len(str); { c, size := utf8.DecodeRune(str[i:]) - if _, err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { return err } i += size -- cgit v1.2.3-2-g168b