From a0fc940be0ea888c7b380cd95723da65bbd32bcb Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 13 Jul 2022 19:01:22 -0600 Subject: Implement KMP string search --- lib/kmp/kmp.go | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 lib/kmp/kmp.go (limited to 'lib/kmp/kmp.go') diff --git a/lib/kmp/kmp.go b/lib/kmp/kmp.go new file mode 100644 index 0000000..66b5516 --- /dev/null +++ b/lib/kmp/kmp.go @@ -0,0 +1,83 @@ +// Copyright (C) 2022 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package kmp + +import ( + "errors" + "io" +) + +// buildTable takes the string 'substr', and returns a table such +// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and +// 'substr[:val] == substr[matchLen-val:matchLen]'. +func buildTable(substr []byte) []int { + table := make([]int, len(substr)) + for j := range table { + if j == 0 { + // First entry must always be 0 (in order to + // satisfy 'val < matchLen'). + continue + } + val := table[j-1] + // not a match; go back + for val > 0 && substr[j] != substr[val] { + val = table[val-1] + } + // is a match; go forward + if substr[val] == substr[j] { + val++ + } + table[j] = val + } + return table +} + +// FindAll returns the starting-position of all possibly-overlapping +// occurances of 'substr' in the 'r' stream. +// +// Will panic if len(substr)==0. +func FindAll(r io.ByteReader, substr []byte) ([]int64, error) { + if len(substr) == 0 { + panic(errors.New("kmp.FindAll: empty substring")) + } + table := buildTable(substr) + + var matches []int64 + var curMatchBeg int64 + var curMatchLen int + + pos := int64(-1) // if 'r' were a slice; define 'pos' such that 'chr=r[pos]' + for { + // I/O + var chr byte + chr, err := r.ReadByte() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil + } + return matches, err + } + pos++ + + // Consider 'chr' + for curMatchLen > 0 && chr != substr[curMatchLen] { // shorten the match + overlap := table[curMatchLen-1] + curMatchBeg += int64(curMatchLen - overlap) + curMatchLen = overlap + } + if chr == substr[curMatchLen] { // lengthen the match + if curMatchLen == 0 { + curMatchBeg = pos + } + curMatchLen++ + if curMatchLen == len(substr) { + matches = append(matches, curMatchBeg) + overlap := table[curMatchLen-1] + curMatchBeg += int64(curMatchLen - overlap) + curMatchLen = overlap + } + } + } +} -- cgit v1.2.3-2-g168b