From dfc67cecbd95344d296c31b537fa3ae8aec8c292 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 22:36:25 -0700 Subject: encode, reencode: Fix handling of invalid UTF-8 --- internal/jsonstring/encode_string.go | 65 ++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 6 deletions(-) (limited to 'internal') diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index fec2cc0..76bbb38 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -5,14 +5,25 @@ package jsonstring import ( + "encoding/json" "fmt" "io" + "reflect" "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/fastio" "git.lukeshu.com/go/lowmemjson/internal/fastio/noescape" ) +// InvalidUTF8Mode is describe in the main lowmemjson package docs. +type InvalidUTF8Mode uint8 + +const ( + InvalidUTF8Replace InvalidUTF8Mode = iota + InvalidUTF8Preserve + InvalidUTF8Error +) + // BackslashEscapeMode is describe in the main lowmemjson package // docs. type BackslashEscapeMode uint8 @@ -21,6 +32,7 @@ const ( BackslashEscapeNone BackslashEscapeMode = iota BackslashEscapeShort BackslashEscapeUnicode + BackslashEscapeRawByte ) // BackslashEscaper is describe in the main lowmemjson package docs. @@ -96,19 +108,45 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err default: // obey return writeStringUnicodeEscape(w, c) } + case BackslashEscapeRawByte: + switch { + case c < utf8.RuneSelf: + panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c)) + case c > 0xFF: + panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c)) + default: + return w.WriteByte(byte(c)) + } default: - panic("escaper returned an invalid escape mode") + panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape)) } } -func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error { +func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error { if err := w.WriteByte('"'); err != nil { return err } - for _, c := range str { - if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + for i := 0; i < len(str); { + escaped := BackslashEscapeNone + c, size := utf8.DecodeRuneInString(str[i:]) + if c == utf8.RuneError && size == 1 { + switch utf { + case InvalidUTF8Replace: + escaped = BackslashEscapeUnicode + case InvalidUTF8Preserve: + escaped = BackslashEscapeRawByte + c = rune(str[i]) + case InvalidUTF8Error: + return &json.UnsupportedValueError{ + Value: val, + Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]), + } + } + } + if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil { return err } + i += size } if err := w.WriteByte('"'); err != nil { return err @@ -116,13 +154,28 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st return nil } -func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error { +func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error { if err := w.WriteByte('"'); err != nil { return err } for i := 0; i < len(str); { + escaped := BackslashEscapeNone c, size := utf8.DecodeRune(str[i:]) - if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + if c == utf8.RuneError && size == 1 { + switch utf { + case InvalidUTF8Replace: + escaped = BackslashEscapeUnicode + case InvalidUTF8Preserve: + escaped = BackslashEscapeRawByte + c = rune(str[i]) + case InvalidUTF8Error: + return &json.UnsupportedValueError{ + Value: val, + Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]), + } + } + } + if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil { return err } i += size -- cgit v1.2.3-2-g168b From 2eb60b8be25a4b0fe3f1c5d5ca302e7e68190bad Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 17:20:41 -0700 Subject: compat/json: Don't do actual JSON parsing in HTMLEscape --- internal/jsonstring/encode_string.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'internal') diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index 76bbb38..2488cb2 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -38,7 +38,7 @@ const ( // BackslashEscaper is describe in the main lowmemjson package docs. type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode -func writeStringUnicodeEscape(w io.Writer, c rune) error { +func WriteStringUnicodeEscape(w io.Writer, c rune) error { const alphabet = "0123456789abcdef" buf := [6]byte{ '\\', @@ -84,7 +84,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err case '\b', '\f', '\n', '\r', '\t': // short-escape if possible return writeStringShortEscape(w, c) default: - return writeStringUnicodeEscape(w, c) + return WriteStringUnicodeEscape(w, c) } case c == '"' || c == '\\': // override, gotta escape these return writeStringShortEscape(w, c) @@ -106,7 +106,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err _, err := w.WriteRune(c) return err default: // obey - return writeStringUnicodeEscape(w, c) + return WriteStringUnicodeEscape(w, c) } case BackslashEscapeRawByte: switch { -- cgit v1.2.3-2-g168b From 00187950437a10952b82353405e5ba4b4515fb29 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 19:06:46 -0700 Subject: reencode: Don't normalize the capitalization of \uXXXX hex escapes --- internal/jsonstring/encode_string.go | 60 ++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 16 deletions(-) (limited to 'internal') diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index 2488cb2..1416b3e 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -31,22 +31,49 @@ type BackslashEscapeMode uint8 const ( BackslashEscapeNone BackslashEscapeMode = iota BackslashEscapeShort - BackslashEscapeUnicode BackslashEscapeRawByte + + // It is significant to the implementation that if X=binary-0 + // and x=binary-1, then these "BackslashEscapeUnicode" + // constants are counting in-order from 0 to 15. + + BackslashEscapeUnicodeXXXX + BackslashEscapeUnicodeXXXx + BackslashEscapeUnicodeXXxX + BackslashEscapeUnicodeXXxx + BackslashEscapeUnicodeXxXX + BackslashEscapeUnicodeXxXx + BackslashEscapeUnicodeXxxX + BackslashEscapeUnicodeXxxx + BackslashEscapeUnicodexXXX + BackslashEscapeUnicodexXXx + BackslashEscapeUnicodexXxX + BackslashEscapeUnicodexXxx + BackslashEscapeUnicodexxXX + BackslashEscapeUnicodexxXx + BackslashEscapeUnicodexxxX + BackslashEscapeUnicodexxxx + + BackslashEscapeUnicodeMin = BackslashEscapeUnicodeXXXX + BackslashEscapeUnicodeMax = BackslashEscapeUnicodexxxx + + BackslashEscapeUnicode = BackslashEscapeUnicodexxxx // back-compat ) // BackslashEscaper is describe in the main lowmemjson package docs. type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode -func WriteStringUnicodeEscape(w io.Writer, c rune) error { - const alphabet = "0123456789abcdef" +func WriteStringUnicodeEscape(w io.Writer, c rune, mode BackslashEscapeMode) error { + const alphabet = "0123456789ABCDEF" + _mode := byte(mode - BackslashEscapeUnicodeMin) buf := [6]byte{ '\\', 'u', - alphabet[(c>>12)&0xf], - alphabet[(c>>8)&0xf], - alphabet[(c>>4)&0xf], - alphabet[(c>>0)&0xf], + // The 0b0010_0000 bit is the ASCII "lowercase bit". + alphabet[(c>>12)&0xf] | ((_mode << 2) & 0b0010_0000), + alphabet[(c>>8)&0xf] | ((_mode << 3) & 0b0010_0000), + alphabet[(c>>4)&0xf] | ((_mode << 4) & 0b0010_0000), + alphabet[(c>>0)&0xf] | ((_mode << 5) & 0b0010_0000), } _, err := noescape.Write(w, buf[:]) return err @@ -84,7 +111,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err case '\b', '\f', '\n', '\r', '\t': // short-escape if possible return writeStringShortEscape(w, c) default: - return WriteStringUnicodeEscape(w, c) + return WriteStringUnicodeEscape(w, c, BackslashEscapeUnicode) } case c == '"' || c == '\\': // override, gotta escape these return writeStringShortEscape(w, c) @@ -100,14 +127,6 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err _, err := w.WriteRune(c) return err } - case BackslashEscapeUnicode: - switch { - case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?) - _, err := w.WriteRune(c) - return err - default: // obey - return WriteStringUnicodeEscape(w, c) - } case BackslashEscapeRawByte: switch { case c < utf8.RuneSelf: @@ -118,6 +137,15 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err return w.WriteByte(byte(c)) } default: + if BackslashEscapeUnicodeMin <= escape && escape <= BackslashEscapeUnicodeMax { + switch { + case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?) + _, err := w.WriteRune(c) + return err + default: // obey + return WriteStringUnicodeEscape(w, c, escape) + } + } panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape)) } } -- cgit v1.2.3-2-g168b From 49ee8be679add0bd3cf08a2669331b3be7a835f8 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 17 Feb 2023 19:21:37 -0700 Subject: compat/json: Correctly handle syntax-error-in-decode --- internal/jsonparse/parse.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'internal') diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index 1c35533..6432d75 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -525,6 +525,21 @@ func (par *Parser) HandleEOF() (RuneType, error) { } } +// IsAtBarrier returns whether a read-barrier has been reached and the +// next HandleRune call would definitely return RuneTypeEOF. +func (par *Parser) IsAtBarrier() bool { + return par.initialized && + // HandleRune wouldn't return early with an error. + !par.closed && + par.err == nil && + // The current (sub-)parser has reached its end, and + len(par.stack) == 0 && + // there is a barrier, and + len(par.barriers) > 0 && + // that barrier would definitely return RuneTypeEOF. + !par.barriers[len(par.barriers)-1].allowWS +} + // HandleRune feeds a Unicode rune to the Parser. // // An error is returned if and only if the RuneType is RuneTypeError. -- cgit v1.2.3-2-g168b