From 77f3c0d7cd21274d00984b72dfce05394d11bdd0 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 3 Feb 2023 19:13:22 -0700 Subject: Move KMP IndexAll from diskio to rebuildmappings --- lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go | 149 +++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go (limited to 'lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go') diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go new file mode 100644 index 0000000..eeaab0c --- /dev/null +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go @@ -0,0 +1,149 @@ +// Copyright (C) 2022-2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package rebuildmappings + +import ( + "errors" + "io" + + "git.lukeshu.com/btrfs-progs-ng/lib/diskio" +) + +var ErrWildcard = errors.New("wildcard") + +func kmpEq2[K ~int64, V comparable](aS diskio.Sequence[K, V], aI K, bS diskio.Sequence[K, V], bI K) bool { + aV, aErr := aS.Get(aI) + bV, bErr := bS.Get(bI) + if aErr != nil { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if aErr == ErrWildcard || errors.Is(aErr, ErrWildcard) { + aV = bV + aErr = nil + } else { + panic(aErr) + } + } + if bErr != nil { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { + bV = aV + bErr = nil + } else { + panic(bErr) + } + } + if aErr != nil || bErr != nil { + return false + } + return aV == bV +} + +func kmpEq1[K ~int64, V comparable](aV V, bS diskio.Sequence[K, V], bI K) bool { + bV, bErr := bS.Get(bI) + if bErr != nil { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { + return true + } + panic(bErr) + } + return aV == bV +} + +// buildKMPTable takes the string 'substr', and returns a table such +// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and +// 'substr[:val] == substr[matchLen-val:matchLen]'. +func buildKMPTable[K ~int64, V comparable](substr diskio.Sequence[K, V]) ([]K, error) { + var substrLen K + for { + //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. + if _, err := substr.Get(substrLen); err != nil && !(err == ErrWildcard || errors.Is(err, ErrWildcard)) { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + substrLen++ + } + + table := make([]K, substrLen) + for j := K(0); j < substrLen; j++ { + if j == 0 { + // First entry must always be 0 (in order to + // satisfy 'val < matchLen'). + continue + } + val := table[j-1] + // not a match; go back + for val > 0 && !kmpEq2(substr, j, substr, val) { + val = table[val-1] + } + // is a match; go forward + if kmpEq2(substr, val, substr, j) { + val++ + } + table[j] = val + } + return table, nil +} + +// IndexAll returns the starting-position of all possibly-overlapping +// occurrences of 'substr' in the 'str' sequence. +// +// Will hop around in 'substr', but will only get the natural sequence +// [0...) in order from 'str'. When hopping around in 'substr' it +// assumes that once it has gotten a given index without error, it can +// continue to do so without error; errors appearing later will cause +// panics. +// +// Will panic if the length of 'substr' is 0. +// +// The 'substr' may include wildcard characters by returning +// ErrWildcard for a position. +// +// Uses the Knuth-Morris-Pratt algorithm. +func IndexAll[K ~int64, V comparable](str, substr diskio.Sequence[K, V]) ([]K, error) { + table, err := buildKMPTable(substr) + if err != nil { + return nil, err + } + substrLen := K(len(table)) + if substrLen == 0 { + panic(errors.New("rebuildmappings.IndexAll: empty substring")) + } + + var matches []K + var curMatchBeg K + var curMatchLen K + + for pos := K(0); ; pos++ { + chr, err := str.Get(pos) + if err != nil { + if errors.Is(err, io.EOF) { + err = nil + } + return matches, err + } + + // Consider 'chr' + for curMatchLen > 0 && !kmpEq1(chr, substr, curMatchLen) { // shorten the match + overlap := table[curMatchLen-1] + curMatchBeg += curMatchLen - overlap + curMatchLen = overlap + } + if kmpEq1(chr, substr, curMatchLen) { // lengthen the match + if curMatchLen == 0 { + curMatchBeg = pos + } + curMatchLen++ + if curMatchLen == substrLen { + matches = append(matches, curMatchBeg) + overlap := table[curMatchLen-1] + curMatchBeg += curMatchLen - overlap + curMatchLen = overlap + } + } + } +} -- cgit v1.2.3-2-g168b From ef60daef395b20b67042c011f5b2a1131049e275 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 3 Feb 2023 19:50:35 -0700 Subject: rebuildmappings: Optimize the KMP search --- lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go | 106 +++++++-------------- 1 file changed, 35 insertions(+), 71 deletions(-) (limited to 'lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go') diff --git a/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go index eeaab0c..20772ba 100644 --- a/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go +++ b/lib/btrfsprogs/btrfsinspect/rebuildmappings/kmp.go @@ -6,48 +6,24 @@ package rebuildmappings import ( "errors" - "io" "git.lukeshu.com/btrfs-progs-ng/lib/diskio" ) -var ErrWildcard = errors.New("wildcard") - -func kmpEq2[K ~int64, V comparable](aS diskio.Sequence[K, V], aI K, bS diskio.Sequence[K, V], bI K) bool { - aV, aErr := aS.Get(aI) - bV, bErr := bS.Get(bI) - if aErr != nil { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if aErr == ErrWildcard || errors.Is(aErr, ErrWildcard) { - aV = bV - aErr = nil - } else { - panic(aErr) - } - } - if bErr != nil { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { - bV = aV - bErr = nil - } else { - panic(bErr) - } - } - if aErr != nil || bErr != nil { - return false - } - return aV == bV +type kmpPattern[K ~int64 | ~int, V comparable] interface { + PatLen() K + // Get the value at 'pos' in the sequence. Positions start at + // 0 and increment naturally. It is invalid to call Get(pos) + // with a pos that is >= Len(). If there is a gap/wildcard at + // pos, ok is false. + PatGet(pos K) (v V, ok bool) } -func kmpEq1[K ~int64, V comparable](aV V, bS diskio.Sequence[K, V], bI K) bool { - bV, bErr := bS.Get(bI) - if bErr != nil { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if bErr == ErrWildcard || errors.Is(bErr, ErrWildcard) { - return true - } - panic(bErr) +func kmpSelfEq[K ~int64 | ~int, V comparable](s kmpPattern[K, V], aI K, bI K) bool { + aV, aOK := s.PatGet(aI) + bV, bOK := s.PatGet(bI) + if !aOK || !bOK { + return true } return aV == bV } @@ -55,18 +31,8 @@ func kmpEq1[K ~int64, V comparable](aV V, bS diskio.Sequence[K, V], bI K) bool { // buildKMPTable takes the string 'substr', and returns a table such // that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and // 'substr[:val] == substr[matchLen-val:matchLen]'. -func buildKMPTable[K ~int64, V comparable](substr diskio.Sequence[K, V]) ([]K, error) { - var substrLen K - for { - //nolint:errorlint // The == is just a fast-path; we still fall back to errors.Is. - if _, err := substr.Get(substrLen); err != nil && !(err == ErrWildcard || errors.Is(err, ErrWildcard)) { - if errors.Is(err, io.EOF) { - break - } - return nil, err - } - substrLen++ - } +func buildKMPTable[K ~int64 | ~int, V comparable](substr kmpPattern[K, V]) []K { + substrLen := substr.PatLen() table := make([]K, substrLen) for j := K(0); j < substrLen; j++ { @@ -77,26 +43,31 @@ func buildKMPTable[K ~int64, V comparable](substr diskio.Sequence[K, V]) ([]K, e } val := table[j-1] // not a match; go back - for val > 0 && !kmpEq2(substr, j, substr, val) { + for val > 0 && !kmpSelfEq(substr, j, val) { val = table[val-1] } // is a match; go forward - if kmpEq2(substr, val, substr, j) { + if kmpSelfEq(substr, val, j) { val++ } table[j] = val } - return table, nil + return table } -// IndexAll returns the starting-position of all possibly-overlapping +func kmpEq[K ~int64 | ~int, V comparable](aV V, bS kmpPattern[K, V], bI K) bool { + bV, ok := bS.PatGet(bI) + if !ok { + return true + } + return aV == bV +} + +// indexAll returns the starting-position of all possibly-overlapping // occurrences of 'substr' in the 'str' sequence. // // Will hop around in 'substr', but will only get the natural sequence -// [0...) in order from 'str'. When hopping around in 'substr' it -// assumes that once it has gotten a given index without error, it can -// continue to do so without error; errors appearing later will cause -// panics. +// [0...) in order from 'str'. // // Will panic if the length of 'substr' is 0. // @@ -104,11 +75,8 @@ func buildKMPTable[K ~int64, V comparable](substr diskio.Sequence[K, V]) ([]K, e // ErrWildcard for a position. // // Uses the Knuth-Morris-Pratt algorithm. -func IndexAll[K ~int64, V comparable](str, substr diskio.Sequence[K, V]) ([]K, error) { - table, err := buildKMPTable(substr) - if err != nil { - return nil, err - } +func indexAll[K ~int64 | ~int, V comparable](str diskio.Sequence[K, V], substr kmpPattern[K, V]) []K { + table := buildKMPTable(substr) substrLen := K(len(table)) if substrLen == 0 { panic(errors.New("rebuildmappings.IndexAll: empty substring")) @@ -118,22 +86,17 @@ func IndexAll[K ~int64, V comparable](str, substr diskio.Sequence[K, V]) ([]K, e var curMatchBeg K var curMatchLen K - for pos := K(0); ; pos++ { - chr, err := str.Get(pos) - if err != nil { - if errors.Is(err, io.EOF) { - err = nil - } - return matches, err - } + strLen := str.SeqLen() + for pos := K(0); pos < strLen; pos++ { + chr := str.SeqGet(pos) // Consider 'chr' - for curMatchLen > 0 && !kmpEq1(chr, substr, curMatchLen) { // shorten the match + for curMatchLen > 0 && !kmpEq(chr, substr, curMatchLen) { // shorten the match overlap := table[curMatchLen-1] curMatchBeg += curMatchLen - overlap curMatchLen = overlap } - if kmpEq1(chr, substr, curMatchLen) { // lengthen the match + if kmpEq(chr, substr, curMatchLen) { // lengthen the match if curMatchLen == 0 { curMatchBeg = pos } @@ -146,4 +109,5 @@ func IndexAll[K ~int64, V comparable](str, substr diskio.Sequence[K, V]) ([]K, e } } } + return matches } -- cgit v1.2.3-2-g168b