From 77f3c0d7cd21274d00984b72dfce05394d11bdd0 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 3 Feb 2023 19:13:22 -0700 Subject: Move KMP IndexAll from diskio to rebuildmappings --- lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go | 149 +++++++++++++++++++++ .../btrfsinspect/rebuildmappings/kmp_test.go | 126 +++++++++++++++++ .../btrfsinspect/rebuildmappings/matchsums.go | 3 +- .../btrfsinspect/rebuildmappings/sumrunwithgaps.go | 5 +- ...6820e3f98383aebadd2af3a22aa248546a228b08451c30d | 3 + ...14e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 | 3 + ...293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 | 3 + ...3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 | 3 + ...7d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d | 3 + 9 files changed, 293 insertions(+), 5 deletions(-) create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp_test.go create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d (limited to 'lib/btrfsprogs/btrfsinspect') diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go new file mode 100644 index 0000000..eeaab0c --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go @@ -0,0 +1,149 @@ +// Copyright (C) 2022-2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package rebuildmappings + +import ( + "errors" + "io" + + "git.lukeshu.com/btrfs-progs-ng/lib/diskio" +) + +var ErrWildcard = errors.New("wildcard") + +func kmpEq2[K ~int64, V comparable](aS diskio.Sequence[K, V], aI K, bS diskio.Sequence[K, V], bI K) bool { + aV, aErr := aS.Get(aI) + bV, bErr := bS.Get(bI) + if aErr != nil { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if aErr == ErrWildcard || errors.Is(aErr, ErrWildcard) { + aV = bV + aErr = nil + } else { + panic(aErr) + } + } + if bErr != nil { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { + bV = aV + bErr = nil + } else { + panic(bErr) + } + } + if aErr != nil || bErr != nil { + return false + } + return aV == bV +} + +func kmpEq1[K ~int64, V comparable](aV V, bS diskio.Sequence[K, V], bI K) bool { + bV, bErr := bS.Get(bI) + if bErr != nil { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { + return true + } + panic(bErr) + } + return aV == bV +} + +// buildKMPTable takes the string 'substr', and returns a table such +// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and +// 'substr[:val] == substr[matchLen-val:matchLen]'. +func buildKMPTable[K ~int64, V comparable](substr diskio.Sequence[K, V]) ([]K, error) { + var substrLen K + for { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if _, err := substr.Get(substrLen); err != nil && !(err == ErrWildcard || errors.Is(err, ErrWildcard)) { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + substrLen++ + } + + table := make([]K, substrLen) + for j := K(0); j < substrLen; j++ { + if j == 0 { + // First entry must always be 0 (in order to + // satisfy 'val < matchLen'). + continue + } + val := table[j-1] + // not a match; go back + for val > 0 && !kmpEq2(substr, j, substr, val) { + val = table[val-1] + } + // is a match; go forward + if kmpEq2(substr, val, substr, j) { + val++ + } + table[j] = val + } + return table, nil +} + +// IndexAll returns the starting-position of all possibly-overlapping +// occurrences of 'substr' in the 'str' sequence. +// +// Will hop around in 'substr', but will only get the natural sequence +// [0...) in order from 'str'. When hopping around in 'substr' it +// assumes that once it has gotten a given index without error, it can +// continue to do so without error; errors appearing later will cause +// panics. +// +// Will panic if the length of 'substr' is 0. +// +// The 'substr' may include wildcard characters by returning +// ErrWildcard for a position. +// +// Uses the Knuth-Morris-Pratt algorithm. +func IndexAll[K ~int64, V comparable](str, substr diskio.Sequence[K, V]) ([]K, error) { + table, err := buildKMPTable(substr) + if err != nil { + return nil, err + } + substrLen := K(len(table)) + if substrLen == 0 { + panic(errors.New("rebuildmappings.IndexAll: empty substring")) + } + + var matches []K + var curMatchBeg K + var curMatchLen K + + for pos := K(0); ; pos++ { + chr, err := str.Get(pos) + if err != nil { + if errors.Is(err, io.EOF) { + err = nil + } + return matches, err + } + + // Consider 'chr' + for curMatchLen > 0 && !kmpEq1(chr, substr, curMatchLen) { // shorten the match + overlap := table[curMatchLen-1] + curMatchBeg += curMatchLen - overlap + curMatchLen = overlap + } + if kmpEq1(chr, substr, curMatchLen) { // lengthen the match + if curMatchLen == 0 { + curMatchBeg = pos + } + curMatchLen++ + if curMatchLen == substrLen { + matches = append(matches, curMatchBeg) + overlap := table[curMatchLen-1] + curMatchBeg += curMatchLen - overlap + curMatchLen = overlap + } + } + } +} diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp_test.go b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp_test.go new file mode 100644 index 0000000..910452a --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp_test.go @@ -0,0 +1,126 @@ +// Copyright (C) 2022-2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package rebuildmappings + +import ( + "bytes" + "io" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "git.lukeshu.com/btrfs-progs-ng/lib/diskio" +) + +func TestBuildKMPTable(t *testing.T) { + t.Parallel() + substr := diskio.SliceSequence[int64, byte]([]byte("ababaa")) + table, err := buildKMPTable[int64, byte](substr) + require.NoError(t, err) + require.Equal(t, + []int64{0, 0, 1, 2, 3, 1}, + table) + for j, val := range table { + matchLen := j + 1 + assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen], + "for table[%d]=%d", j, val) + } +} + +func FuzzBuildKMPTable(f *testing.F) { + f.Add([]byte("ababaa")) + f.Fuzz(func(t *testing.T, substr []byte) { + table, err := buildKMPTable[int64, byte](diskio.SliceSequence[int64, byte](substr)) + require.NoError(t, err) + require.Equal(t, len(substr), len(table), "length") + for j, val := range table { + matchLen := j + 1 + assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen], + "for table[%d]=%d", j, val) + } + }) +} + +func NaiveIndexAll(str, substr []byte) []int64 { + var matches []int64 + for i := range str { + if bytes.HasPrefix(str[i:], substr) { + matches = append(matches, int64(i)) + } + } + return matches +} + +func FuzzIndexAll(f *testing.F) { + f.Fuzz(func(t *testing.T, str, substr []byte) { + if len(substr) == 0 { + t.Skip() + } + t.Logf("str =%q", str) + t.Logf("substr=%q", substr) + exp := NaiveIndexAll(str, substr) + act, err := IndexAll[int64, byte]( + &diskio.ByteReaderSequence[int64]{R: bytes.NewReader(str)}, + diskio.SliceSequence[int64, byte](substr)) + assert.NoError(t, err) + assert.Equal(t, exp, act) + }) +} + +type RESeq string + +func (re RESeq) Get(i int64) (byte, error) { + if i < 0 || i >= int64(len(re)) { + return 0, io.EOF + } + chr := re[int(i)] + if chr == '.' { + return 0, ErrWildcard + } + return chr, nil +} + +func TestKMPWildcard(t *testing.T) { + t.Parallel() + type testcase struct { + InStr string + InSubstr string + ExpMatches []int64 + } + testcases := map[string]testcase{ + "trivial-bar": { + InStr: "foo_bar", + InSubstr: "foo.ba.", + ExpMatches: []int64{0}, + }, + "trival-baz": { + InStr: "foo-baz", + InSubstr: "foo.ba.", + ExpMatches: []int64{0}, + }, + "suffix": { + InStr: "foobarbaz", + InSubstr: "...baz", + ExpMatches: []int64{3}, + }, + "overlap": { + InStr: "foobarbar", + InSubstr: "...bar", + ExpMatches: []int64{0, 3}, + }, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + matches, err := IndexAll[int64, byte]( + diskio.StringSequence[int64](tc.InStr), + RESeq(tc.InSubstr)) + assert.NoError(t, err) + assert.Equal(t, tc.ExpMatches, matches) + }) + } +} diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/matchsums.go b/lib/btrfsprogs/btrfsinspect/rebuildmappings/matchsums.go index c38314a..eda37bd 100644 --- a/lib/btrfsprogs/btrfsinspect/rebuildmappings/matchsums.go +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/matchsums.go @@ -14,7 +14,6 @@ import ( "git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfssum" "git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfsvol" "git.lukeshu.com/btrfs-progs-ng/lib/containers" - "git.lukeshu.com/btrfs-progs-ng/lib/diskio" "git.lukeshu.com/btrfs-progs-ng/lib/maps" ) @@ -37,7 +36,7 @@ func matchBlockGroupSums(ctx context.Context, var matches []btrfsvol.QualifiedPhysicalAddr if err := WalkUnmappedPhysicalRegions(ctx, physicalSums, regions, func(devID btrfsvol.DeviceID, region btrfssum.SumRun[btrfsvol.PhysicalAddr]) error { - rawMatches, err := diskio.IndexAll[int64, btrfssum.ShortSum](region, bgRun) + rawMatches, err := IndexAll[int64, btrfssum.ShortSum](region, bgRun) if err != nil { return err } diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/sumrunwithgaps.go b/lib/btrfsprogs/btrfsinspect/rebuildmappings/sumrunwithgaps.go index 8c1f3ed..d1064d8 100644 --- a/lib/btrfsprogs/btrfsinspect/rebuildmappings/sumrunwithgaps.go +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/sumrunwithgaps.go @@ -14,7 +14,6 @@ import ( "git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfssum" "git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfsvol" - "git.lukeshu.com/btrfs-progs-ng/lib/diskio" ) type SumRunWithGaps[Addr btrfsvol.IntAddr[Addr]] struct { @@ -63,7 +62,7 @@ func (sg SumRunWithGaps[Addr]) SumForAddr(addr Addr) (btrfssum.ShortSum, error) } for _, run := range sg.Runs { if run.Addr > addr { - return "", diskio.ErrWildcard + return "", ErrWildcard } if run.Addr.Add(run.Size()) <= addr { continue @@ -71,7 +70,7 @@ func (sg SumRunWithGaps[Addr]) SumForAddr(addr Addr) (btrfssum.ShortSum, error) off := int((addr-run.Addr)/btrfssum.BlockSize) * run.ChecksumSize return run.Sums[off : off+run.ChecksumSize], nil } - return "", diskio.ErrWildcard + return "", ErrWildcard } func (sg SumRunWithGaps[Addr]) Walk(ctx context.Context, fn func(Addr, btrfssum.ShortSum) error) error { diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d new file mode 100644 index 0000000..9d14adf --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("1") diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 new file mode 100644 index 0000000..269f061 --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("0") diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 new file mode 100644 index 0000000..b8f1562 --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("") diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 new file mode 100644 index 0000000..be67506 --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\xde\xdb!") +[]byte("\xde\xdb") diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d new file mode 100644 index 0000000..c3bfa37 --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\x10\x10\x15") +[]byte("\x10\x15") -- cgit v1.2.3-2-g168b