From 7a938da20e8d243bc254cd821b7cf61b379be4a6 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 15:10:00 -0700 Subject: reencode: Rethink the UTF-8 buffer --- reencode.go | 95 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 29 deletions(-) (limited to 'reencode.go') diff --git a/reencode.go b/reencode.go index 0745c43..a33cc8f 100644 --- a/reencode.go +++ b/reencode.go @@ -169,6 +169,46 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRune(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp) + } + return c, size, true + } +} + +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRuneInString(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRuneInString(tmp) + } + return c, size, true + } +} + // Write implements io.Writer; it does what you'd expect. // // It is worth noting that Write returns the number of bytes consumed @@ -177,59 +217,56 @@ var ( // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. -func (enc *ReEncoder) Write(p []byte) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) Write(str []byte) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromBytes(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRune(p[n:]) { - c, size := utf8.DecodeRune(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - enc.bufLen = copy(enc.buf[:], p[n:]) - return len(p), nil } // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. -func (enc *ReEncoder) WriteString(p string) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) WriteString(str string) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromString(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRuneInString(p[n:]) { - c, size := utf8.DecodeRuneInString(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - return len(p), nil } // WriteByte implements io.ByteWriter; it does what you'd expect. -- cgit v1.2.3-2-g168b