summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2022-07-13 19:01:22 -0600
committerLuke Shumaker <lukeshu@lukeshu.com>2022-07-13 19:01:22 -0600
commita0fc940be0ea888c7b380cd95723da65bbd32bcb (patch)
treec332cd95de9c882d23728e9fe040e63c97ff007f /lib
parentae84737c1bc0a5e137e1a25d7344f2ab8e420489 (diff)
Implement KMP string search
Diffstat (limited to 'lib')
-rw-r--r--lib/kmp/kmp.go83
-rw-r--r--lib/kmp/kmp_test.go62
-rw-r--r--lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d3
-rw-r--r--lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e6003
-rw-r--r--lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd603
-rw-r--r--lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e419537383
-rw-r--r--lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d3
7 files changed, 160 insertions, 0 deletions
diff --git a/lib/kmp/kmp.go b/lib/kmp/kmp.go
new file mode 100644
index 0000000..66b5516
--- /dev/null
+++ b/lib/kmp/kmp.go
@@ -0,0 +1,83 @@
+// Copyright (C) 2022 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package kmp
+
+import (
+ "errors"
+ "io"
+)
+
+// buildTable takes the string 'substr', and returns a table such
+// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and
+// 'substr[:val] == substr[matchLen-val:matchLen]'.
+func buildTable(substr []byte) []int {
+ table := make([]int, len(substr))
+ for j := range table {
+ if j == 0 {
+ // First entry must always be 0 (in order to
+ // satisfy 'val < matchLen').
+ continue
+ }
+ val := table[j-1]
+ // not a match; go back
+ for val > 0 && substr[j] != substr[val] {
+ val = table[val-1]
+ }
+ // is a match; go forward
+ if substr[val] == substr[j] {
+ val++
+ }
+ table[j] = val
+ }
+ return table
+}
+
+// FindAll returns the starting-position of all possibly-overlapping
+// occurances of 'substr' in the 'r' stream.
+//
+// Will panic if len(substr)==0.
+func FindAll(r io.ByteReader, substr []byte) ([]int64, error) {
+ if len(substr) == 0 {
+ panic(errors.New("kmp.FindAll: empty substring"))
+ }
+ table := buildTable(substr)
+
+ var matches []int64
+ var curMatchBeg int64
+ var curMatchLen int
+
+ pos := int64(-1) // if 'r' were a slice; define 'pos' such that 'chr=r[pos]'
+ for {
+ // I/O
+ var chr byte
+ chr, err := r.ReadByte()
+ if err != nil {
+ if errors.Is(err, io.EOF) {
+ err = nil
+ }
+ return matches, err
+ }
+ pos++
+
+ // Consider 'chr'
+ for curMatchLen > 0 && chr != substr[curMatchLen] { // shorten the match
+ overlap := table[curMatchLen-1]
+ curMatchBeg += int64(curMatchLen - overlap)
+ curMatchLen = overlap
+ }
+ if chr == substr[curMatchLen] { // lengthen the match
+ if curMatchLen == 0 {
+ curMatchBeg = pos
+ }
+ curMatchLen++
+ if curMatchLen == len(substr) {
+ matches = append(matches, curMatchBeg)
+ overlap := table[curMatchLen-1]
+ curMatchBeg += int64(curMatchLen - overlap)
+ curMatchLen = overlap
+ }
+ }
+ }
+}
diff --git a/lib/kmp/kmp_test.go b/lib/kmp/kmp_test.go
new file mode 100644
index 0000000..6a11b50
--- /dev/null
+++ b/lib/kmp/kmp_test.go
@@ -0,0 +1,62 @@
+// Copyright (C) 2022 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package kmp
+
+import (
+ "bytes"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestBuildTable(t *testing.T) {
+ substr := []byte("ababaa")
+ table := buildTable(substr)
+ assert.Equal(t,
+ []int{0, 0, 1, 2, 3, 1},
+ table)
+ for j, val := range table {
+ matchLen := j + 1
+ assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen],
+ "for table[%d]=%d", j, val)
+ }
+}
+
+func FuzzBuildTable(f *testing.F) {
+ f.Add([]byte("ababaa"))
+ f.Fuzz(func(t *testing.T, substr []byte) {
+ table := buildTable(substr)
+ assert.Equal(t, len(substr), len(table), "length")
+ for j, val := range table {
+ matchLen := j + 1
+ assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen],
+ "for table[%d]=%d", j, val)
+ }
+ })
+}
+
+func NaiveFindAll(str, substr []byte) []int64 {
+ var matches []int64
+ for i := range str {
+ if bytes.HasPrefix(str[i:], substr) {
+ matches = append(matches, int64(i))
+ }
+ }
+ return matches
+}
+
+func FuzzFindAll(f *testing.F) {
+ f.Fuzz(func(t *testing.T, str, substr []byte) {
+ if len(substr) == 0 {
+ t.Skip()
+ }
+ t.Logf("str =%q", str)
+ t.Logf("substr=%q", substr)
+ exp := NaiveFindAll(str, substr)
+ act, err := FindAll(bytes.NewReader(str), substr)
+ assert.NoError(t, err)
+ assert.Equal(t, exp, act)
+ })
+}
diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d
new file mode 100644
index 0000000..9d14adf
--- /dev/null
+++ b/lib/kmp/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d
@@ -0,0 +1,3 @@
+go test fuzz v1
+[]byte("0")
+[]byte("1")
diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600
new file mode 100644
index 0000000..269f061
--- /dev/null
+++ b/lib/kmp/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600
@@ -0,0 +1,3 @@
+go test fuzz v1
+[]byte("0")
+[]byte("0")
diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60
new file mode 100644
index 0000000..b8f1562
--- /dev/null
+++ b/lib/kmp/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60
@@ -0,0 +1,3 @@
+go test fuzz v1
+[]byte("0")
+[]byte("")
diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738
new file mode 100644
index 0000000..be67506
--- /dev/null
+++ b/lib/kmp/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738
@@ -0,0 +1,3 @@
+go test fuzz v1
+[]byte("\xde\xdb!")
+[]byte("\xde\xdb")
diff --git a/lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d
new file mode 100644
index 0000000..c3bfa37
--- /dev/null
+++ b/lib/kmp/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d
@@ -0,0 +1,3 @@
+go test fuzz v1
+[]byte("\x10\x10\x15")
+[]byte("\x10\x15")