diff options
Diffstat (limited to 'lib/diskio')
8 files changed, 19 insertions, 330 deletions
diff --git a/lib/diskio/kmp.go b/lib/diskio/kmp.go deleted file mode 100644 index 6949aa4..0000000 --- a/lib/diskio/kmp.go +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com> -// -// SPDX-License-Identifier: GPL-2.0-or-later - -package diskio - -import ( - "errors" - "io" -) - -var ErrWildcard = errors.New("wildcard") - -func kmpEq2[K ~int64, V comparable](aS Sequence[K, V], aI K, bS Sequence[K, V], bI K) bool { - aV, aErr := aS.Get(aI) - bV, bErr := bS.Get(bI) - if aErr != nil { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if aErr == ErrWildcard || errors.Is(aErr, ErrWildcard) { - aV = bV - aErr = nil - } else { - panic(aErr) - } - } - if bErr != nil { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { - bV = aV - bErr = nil - } else { - panic(bErr) - } - } - if aErr != nil || bErr != nil { - return false - } - return aV == bV -} - -func kmpEq1[K ~int64, V comparable](aV V, bS Sequence[K, V], bI K) bool { - bV, bErr := bS.Get(bI) - if bErr != nil { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { - return true - } - panic(bErr) - } - return aV == bV -} - -// buildKMPTable takes the string 'substr', and returns a table such -// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and -// 'substr[:val] == substr[matchLen-val:matchLen]'. -func buildKMPTable[K ~int64, V comparable](substr Sequence[K, V]) ([]K, error) { - var substrLen K - for { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if _, err := substr.Get(substrLen); err != nil && !(err == ErrWildcard || errors.Is(err, ErrWildcard)) { - if errors.Is(err, io.EOF) { - break - } - return nil, err - } - substrLen++ - } - - table := make([]K, substrLen) - for j := K(0); j < substrLen; j++ { - if j == 0 { - // First entry must always be 0 (in order to - // satisfy 'val < matchLen'). - continue - } - val := table[j-1] - // not a match; go back - for val > 0 && !kmpEq2(substr, j, substr, val) { - val = table[val-1] - } - // is a match; go forward - if kmpEq2(substr, val, substr, j) { - val++ - } - table[j] = val - } - return table, nil -} - -// IndexAll returns the starting-position of all possibly-overlapping -// occurrences of 'substr' in the 'str' sequence. -// -// Will hop around in 'substr', but will only get the natural sequence -// [0...) in order from 'str'. When hopping around in 'substr' it -// assumes that once it has gotten a given index without error, it can -// continue to do so without error; errors appearing later will cause -// panics. -// -// Will panic if the length of 'substr' is 0. -// -// The 'substr' may include wildcard characters by returning -// ErrWildcard for a position. -// -// Uses the Knuth-Morris-Pratt algorithm. -func IndexAll[K ~int64, V comparable](str, substr Sequence[K, V]) ([]K, error) { - table, err := buildKMPTable(substr) - if err != nil { - return nil, err - } - substrLen := K(len(table)) - if substrLen == 0 { - panic(errors.New("diskio.IndexAll: empty substring")) - } - - var matches []K - var curMatchBeg K - var curMatchLen K - - for pos := K(0); ; pos++ { - chr, err := str.Get(pos) - if err != nil { - if errors.Is(err, io.EOF) { - err = nil - } - return matches, err - } - - // Consider 'chr' - for curMatchLen > 0 && !kmpEq1(chr, substr, curMatchLen) { // shorten the match - overlap := table[curMatchLen-1] - curMatchBeg += curMatchLen - overlap - curMatchLen = overlap - } - if kmpEq1(chr, substr, curMatchLen) { // lengthen the match - if curMatchLen == 0 { - curMatchBeg = pos - } - curMatchLen++ - if curMatchLen == substrLen { - matches = append(matches, curMatchBeg) - overlap := table[curMatchLen-1] - curMatchBeg += curMatchLen - overlap - curMatchLen = overlap - } - } - } -} diff --git a/lib/diskio/kmp_test.go b/lib/diskio/kmp_test.go deleted file mode 100644 index 4d4b3be..0000000 --- a/lib/diskio/kmp_test.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com> -// -// SPDX-License-Identifier: GPL-2.0-or-later - -package diskio - -import ( - "bytes" - "io" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestBuildKMPTable(t *testing.T) { - t.Parallel() - substr := SliceSequence[int64, byte]([]byte("ababaa")) - table, err := buildKMPTable[int64, byte](substr) - require.NoError(t, err) - require.Equal(t, - []int64{0, 0, 1, 2, 3, 1}, - table) - for j, val := range table { - matchLen := j + 1 - assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen], - "for table[%d]=%d", j, val) - } -} - -func FuzzBuildKMPTable(f *testing.F) { - f.Add([]byte("ababaa")) - f.Fuzz(func(t *testing.T, substr []byte) { - table, err := buildKMPTable[int64, byte](SliceSequence[int64, byte](substr)) - require.NoError(t, err) - require.Equal(t, len(substr), len(table), "length") - for j, val := range table { - matchLen := j + 1 - assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen], - "for table[%d]=%d", j, val) - } - }) -} - -func NaiveIndexAll(str, substr []byte) []int64 { - var matches []int64 - for i := range str { - if bytes.HasPrefix(str[i:], substr) { - matches = append(matches, int64(i)) - } - } - return matches -} - -func FuzzIndexAll(f *testing.F) { - f.Fuzz(func(t *testing.T, str, substr []byte) { - if len(substr) == 0 { - t.Skip() - } - t.Logf("str =%q", str) - t.Logf("substr=%q", substr) - exp := NaiveIndexAll(str, substr) - act, err := IndexAll[int64, byte]( - &ByteReaderSequence[int64]{R: bytes.NewReader(str)}, - SliceSequence[int64, byte](substr)) - assert.NoError(t, err) - assert.Equal(t, exp, act) - }) -} - -type RESeq string - -func (re RESeq) Get(i int64) (byte, error) { - if i < 0 || i >= int64(len(re)) { - return 0, io.EOF - } - chr := re[int(i)] - if chr == '.' { - return 0, ErrWildcard - } - return chr, nil -} - -func TestKMPWildcard(t *testing.T) { - t.Parallel() - type testcase struct { - InStr string - InSubstr string - ExpMatches []int64 - } - testcases := map[string]testcase{ - "trivial-bar": { - InStr: "foo_bar", - InSubstr: "foo.ba.", - ExpMatches: []int64{0}, - }, - "trival-baz": { - InStr: "foo-baz", - InSubstr: "foo.ba.", - ExpMatches: []int64{0}, - }, - "suffix": { - InStr: "foobarbaz", - InSubstr: "...baz", - ExpMatches: []int64{3}, - }, - "overlap": { - InStr: "foobarbar", - InSubstr: "...bar", - ExpMatches: []int64{0, 3}, - }, - } - for tcName, tc := range testcases { - tc := tc - t.Run(tcName, func(t *testing.T) { - t.Parallel() - matches, err := IndexAll[int64, byte]( - StringSequence[int64](tc.InStr), - RESeq(tc.InSubstr)) - assert.NoError(t, err) - assert.Equal(t, tc.ExpMatches, matches) - }) - } -} diff --git a/lib/diskio/seq.go b/lib/diskio/seq.go index 3c5f4ae..f8e6ea8 100644 --- a/lib/diskio/seq.go +++ b/lib/diskio/seq.go @@ -1,68 +1,43 @@ -// Copyright (C) 2022 Luke Shumaker <lukeshu@lukeshu.com> +// Copyright (C) 2022-2023 Luke Shumaker <lukeshu@lukeshu.com> // // SPDX-License-Identifier: GPL-2.0-or-later package diskio -import ( - "fmt" - "io" -) - // interface ///////////////////////////////////////////////////////// -type Sequence[K ~int64, V any] interface { +type Sequence[K ~int64 | ~int, V any] interface { + SeqLen() K // Get the value at 'pos' in the sequence. Positions start at - // 0 and increment naturally. Return an error that is io.EOF - // if 'pos' is past the end of the sequence'. - Get(pos K) (V, error) + // 0 and increment naturally. It is invalid to call + // SeqGet(pos) with a pos that is >= SeqLen(). + SeqGet(pos K) V } // implementation: slice ///////////////////////////////////////////// -type SliceSequence[K ~int64, V any] []V +type SliceSequence[K ~int64 | ~int, V any] []V + +var _ Sequence[assertAddr, byte] = SliceSequence[assertAddr, byte](nil) -var _ Sequence[assertAddr, byte] = SliceSequence[assertAddr, byte]([]byte(nil)) +func (s SliceSequence[K, V]) SeqLen() K { + return K(len(s)) +} -func (s SliceSequence[K, V]) Get(i K) (V, error) { - if i >= K(len(s)) { - var v V - return v, io.EOF - } - return s[int(i)], nil +func (s SliceSequence[K, V]) SeqGet(i K) V { + return s[int(i)] } // implementation: string //////////////////////////////////////////// -type StringSequence[K ~int64] string +type StringSequence[K ~int64 | ~int] string var _ Sequence[assertAddr, byte] = StringSequence[assertAddr]("") -func (s StringSequence[K]) Get(i K) (byte, error) { - if i >= K(len(s)) { - return 0, io.EOF - } - return s[int(i)], nil +func (s StringSequence[K]) SeqLen() K { + return K(len(s)) } -// implementation: io.ByteReader ///////////////////////////////////// - -type ByteReaderSequence[K ~int64] struct { - R io.ByteReader - pos K -} - -var _ Sequence[assertAddr, byte] = &ByteReaderSequence[assertAddr]{R: nil} - -func (s *ByteReaderSequence[K]) Get(i K) (byte, error) { - if i != s.pos { - return 0, fmt.Errorf("%T.Get(%v): can only call .Get(%v)", - s, i, s.pos) - } - chr, err := s.R.ReadByte() - if err != nil { - return chr, err - } - s.pos++ - return chr, nil +func (s StringSequence[K]) SeqGet(i K) byte { + return s[int(i)] } diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d deleted file mode 100644 index 9d14adf..0000000 --- a/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("0") -[]byte("1") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 deleted file mode 100644 index 269f061..0000000 --- a/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("0") -[]byte("0") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 deleted file mode 100644 index b8f1562..0000000 --- a/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("0") -[]byte("") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 deleted file mode 100644 index be67506..0000000 --- a/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("\xde\xdb!") -[]byte("\xde\xdb") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d deleted file mode 100644 index c3bfa37..0000000 --- a/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("\x10\x10\x15") -[]byte("\x10\x15") |