From a0fc940be0ea888c7b380cd95723da65bbd32bcb Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 13 Jul 2022 19:01:22 -0600 Subject: Implement KMP string search --- lib/kmp/kmp.go | 83 ++++++++++++++++++++++ lib/kmp/kmp_test.go | 62 ++++++++++++++++ ...6820e3f98383aebadd2af3a22aa248546a228b08451c30d | 3 + ...14e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 | 3 + ...293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 | 3 + ...3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 | 3 + ...7d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d | 3 + 7 files changed, 160 insertions(+) create mode 100644 lib/kmp/kmp.go create mode 100644 lib/kmp/kmp_test.go create mode 100644 lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d create mode 100644 lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 create mode 100644 lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 create mode 100644 lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 create mode 100644 lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d (limited to 'lib') diff --git a/lib/kmp/kmp.go b/lib/kmp/kmp.go new file mode 100644 index 0000000..66b5516 --- /dev/null +++ b/lib/kmp/kmp.go @@ -0,0 +1,83 @@ +// Copyright (C) 2022 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package kmp + +import ( + "errors" + "io" +) + +// buildTable takes the string 'substr', and returns a table such +// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and +// 'substr[:val] == substr[matchLen-val:matchLen]'. +func buildTable(substr []byte) []int { + table := make([]int, len(substr)) + for j := range table { + if j == 0 { + // First entry must always be 0 (in order to + // satisfy 'val < matchLen'). + continue + } + val := table[j-1] + // not a match; go back + for val > 0 && substr[j] != substr[val] { + val = table[val-1] + } + // is a match; go forward + if substr[val] == substr[j] { + val++ + } + table[j] = val + } + return table +} + +// FindAll returns the starting-position of all possibly-overlapping +// occurances of 'substr' in the 'r' stream. +// +// Will panic if len(substr)==0. +func FindAll(r io.ByteReader, substr []byte) ([]int64, error) { + if len(substr) == 0 { + panic(errors.New("kmp.FindAll: empty substring")) + } + table := buildTable(substr) + + var matches []int64 + var curMatchBeg int64 + var curMatchLen int + + pos := int64(-1) // if 'r' were a slice; define 'pos' such that 'chr=r[pos]' + for { + // I/O + var chr byte + chr, err := r.ReadByte() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil + } + return matches, err + } + pos++ + + // Consider 'chr' + for curMatchLen > 0 && chr != substr[curMatchLen] { // shorten the match + overlap := table[curMatchLen-1] + curMatchBeg += int64(curMatchLen - overlap) + curMatchLen = overlap + } + if chr == substr[curMatchLen] { // lengthen the match + if curMatchLen == 0 { + curMatchBeg = pos + } + curMatchLen++ + if curMatchLen == len(substr) { + matches = append(matches, curMatchBeg) + overlap := table[curMatchLen-1] + curMatchBeg += int64(curMatchLen - overlap) + curMatchLen = overlap + } + } + } +} diff --git a/lib/kmp/kmp_test.go b/lib/kmp/kmp_test.go new file mode 100644 index 0000000..6a11b50 --- /dev/null +++ b/lib/kmp/kmp_test.go @@ -0,0 +1,62 @@ +// Copyright (C) 2022 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package kmp + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestBuildTable(t *testing.T) { + substr := []byte("ababaa") + table := buildTable(substr) + assert.Equal(t, + []int{0, 0, 1, 2, 3, 1}, + table) + for j, val := range table { + matchLen := j + 1 + assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen], + "for table[%d]=%d", j, val) + } +} + +func FuzzBuildTable(f *testing.F) { + f.Add([]byte("ababaa")) + f.Fuzz(func(t *testing.T, substr []byte) { + table := buildTable(substr) + assert.Equal(t, len(substr), len(table), "length") + for j, val := range table { + matchLen := j + 1 + assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen], + "for table[%d]=%d", j, val) + } + }) +} + +func NaiveFindAll(str, substr []byte) []int64 { + var matches []int64 + for i := range str { + if bytes.HasPrefix(str[i:], substr) { + matches = append(matches, int64(i)) + } + } + return matches +} + +func FuzzFindAll(f *testing.F) { + f.Fuzz(func(t *testing.T, str, substr []byte) { + if len(substr) == 0 { + t.Skip() + } + t.Logf("str =%q", str) + t.Logf("substr=%q", substr) + exp := NaiveFindAll(str, substr) + act, err := FindAll(bytes.NewReader(str), substr) + assert.NoError(t, err) + assert.Equal(t, exp, act) + }) +} diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d new file mode 100644 index 0000000..9d14adf --- /dev/null +++ b/lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("1") diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 new file mode 100644 index 0000000..269f061 --- /dev/null +++ b/lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("0") diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 new file mode 100644 index 0000000..b8f1562 --- /dev/null +++ b/lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("") diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 new file mode 100644 index 0000000..be67506 --- /dev/null +++ b/lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\xde\xdb!") +[]byte("\xde\xdb") diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d new file mode 100644 index 0000000..c3bfa37 --- /dev/null +++ b/lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\x10\x10\x15") +[]byte("\x10\x15") -- cgit v1.2.3-2-g168b