Olox Olox

Theme

Documentation
Back to Home

String Pattern Matching - KMP, Z, Rabin-Karp

9 min read

String Pattern Matching Algorithms

📚 Summary

Efficient string pattern matching is crucial for text processing. KMP, Z-Algorithm, and Rabin-Karp are the fundamental algorithms with O(n+m) complexity.


1️⃣ KMP (Knuth-Morris-Pratt) Algorithm

LPS (Longest Prefix Suffix) Array

def compute_lps(pattern: str) -> list[int]:
    """
    Compute LPS array for pattern
    lps[i] = length of longest proper prefix which is also suffix of pattern[0..i]
    
    Time: O(m), Space: O(m)
    """
    m = len(pattern)
    lps = [0] * m
    
    length = 0  # Length of previous longest prefix suffix
    i = 1
    
    while i < m:
        if pattern[i] == pattern[length]:
            length += 1
            lps[i] = length
            i += 1
        else:
            if length != 0:
                # Use previous LPS value
                length = lps[length - 1]
            else:
                lps[i] = 0
                i += 1
    
    return lps
def kmp_search(text: str, pattern: str) -> list[int]:
    """
    Find all occurrences of pattern in text using KMP
    Returns list of starting indices
    
    Time: O(n + m), Space: O(m)
    """
    n, m = len(text), len(pattern)
    if m == 0:
        return []
    if m > n:
        return []
    
    lps = compute_lps(pattern)
    result = []
    
    i = 0  # Index for text
    j = 0  # Index for pattern
    
    while i < n:
        if pattern[j] == text[i]:
            i += 1
            j += 1
        
        if j == m:
            # Found pattern at index i - j
            result.append(i - j)
            j = lps[j - 1]
        elif i < n and pattern[j] != text[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1
    
    return result


def kmp_count(text: str, pattern: str) -> int:
    """Count occurrences of pattern in text"""
    return len(kmp_search(text, pattern))


def kmp_first_occurrence(text: str, pattern: str) -> int:
    """Find first occurrence of pattern in text, -1 if not found"""
    matches = kmp_search(text, pattern)
    return matches[0] if matches else -1

KMP Applications

def shortest_palindrome(s: str) -> str:
    """
    LC 214: Find shortest palindrome by adding chars at beginning
    Use KMP: find longest prefix of s that is palindrome
    """
    if not s:
        return s
    
    # Build pattern: s + '#' + reverse(s)
    combined = s + '#' + s[::-1]
    lps = compute_lps(combined)
    
    # LPS of last char gives longest palindrome prefix
    palindrome_len = lps[-1]
    suffix_to_add = s[palindrome_len:][::-1]
    
    return suffix_to_add + s


def repeated_substring_pattern(s: str) -> bool:
    """
    LC 459: Check if s can be constructed by repeating a substring
    Use KMP: s is made of repeated pattern iff len(s) % (len(s) - lps[-1]) == 0
    """
    if not s:
        return False
    
    lps = compute_lps(s)
    n = len(s)
    pattern_len = n - lps[-1]
    
    return lps[-1] > 0 and n % pattern_len == 0

2️⃣ Z-Algorithm

Z-Array Computation

def z_function(s: str) -> list[int]:
    """
    Compute Z-array for string s
    z[i] = length of longest substring starting at i which is also a prefix of s
    
    Time: O(n), Space: O(n)
    """
    n = len(s)
    z = [0] * n
    z[0] = n  # Convention: z[0] = n (entire string matches itself)
    
    left, right = 0, 0  # Z-box [left, right]
    
    for i in range(1, n):
        if i < right:
            # i is within current Z-box
            z[i] = min(right - i, z[i - left])
        
        # Try to extend the match
        while i + z[i] < n and s[z[i]] == s[i + z[i]]:
            z[i] += 1
        
        # Update Z-box if needed
        if i + z[i] > right:
            left = i
            right = i + z[i]
    
    return z
def z_search(text: str, pattern: str) -> list[int]:
    """
    Find all occurrences of pattern in text using Z-algorithm
    
    Idea: Build combined = pattern + '$' + text
    z[i] = m means pattern found at position i - m - 1 in text
    """
    m = len(pattern)
    if m == 0:
        return []
    
    combined = pattern + '$' + text
    z = z_function(combined)
    
    result = []
    for i in range(m + 1, len(combined)):
        if z[i] == m:
            result.append(i - m - 1)
    
    return result


def z_search_all_patterns(text: str, patterns: list[str]) -> dict[str, list[int]]:
    """Find all occurrences of multiple patterns"""
    result = {}
    for pattern in patterns:
        result[pattern] = z_search(text, pattern)
    return result

Z-Algorithm Applications

def min_rotation(s: str) -> int:
    """
    Find starting position of lexicographically smallest rotation
    Use Z-function on s + s
    """
    doubled = s + s
    z = z_function(doubled)
    n = len(s)
    
    min_pos = 0
    for i in range(1, n):
        if doubled[i:] < doubled[min_pos:]:
            min_pos = i
    
    return min_pos


def longest_common_prefix_all(strings: list[str]) -> str:
    """
    Find longest common prefix of all strings
    Alternative to simple comparison - useful for specific patterns
    """
    if not strings:
        return ""
    
    combined = '\x00'.join(strings)
    z = z_function(combined)
    
    min_len = min(len(s) for s in strings)
    result = 0
    
    for length in range(1, min_len + 1):
        valid = True
        for s in strings:
            # Check if this prefix matches
            pass  # Implementation varies by use case
    
    return strings[0][:result]

3️⃣ Rabin-Karp Algorithm

Rolling Hash

class RollingHash:
    """
    Rolling hash for efficient substring hashing
    hash(s) = s[0] * base^(n-1) + s[1] * base^(n-2) + ... + s[n-1]
    """
    def __init__(self, s: str, base: int = 31, mod: int = 10**9 + 7):
        self.s = s
        self.n = len(s)
        self.base = base
        self.mod = mod
        
        # Precompute prefix hashes and powers
        self.prefix_hash = [0] * (self.n + 1)
        self.powers = [1] * (self.n + 1)
        
        for i in range(self.n):
            self.prefix_hash[i + 1] = (
                self.prefix_hash[i] * base + ord(s[i]) - ord('a') + 1
            ) % mod
            self.powers[i + 1] = (self.powers[i] * base) % mod
    
    def get_hash(self, left: int, right: int) -> int:
        """
        Get hash of substring s[left:right+1]
        O(1) query
        """
        return (
            self.prefix_hash[right + 1] - 
            self.prefix_hash[left] * self.powers[right - left + 1]
        ) % self.mod
def rabin_karp_search(text: str, pattern: str, base: int = 31, mod: int = 10**9 + 7) -> list[int]:
    """
    Find all occurrences of pattern in text using Rabin-Karp
    
    Time: O(n + m) average, O(nm) worst case (hash collisions)
    Space: O(1)
    """
    n, m = len(text), len(pattern)
    if m > n:
        return []
    
    result = []
    
    # Compute hash of pattern
    pattern_hash = 0
    for c in pattern:
        pattern_hash = (pattern_hash * base + ord(c) - ord('a') + 1) % mod
    
    # Compute highest power needed
    high_power = pow(base, m - 1, mod)
    
    # Compute initial window hash
    window_hash = 0
    for i in range(m):
        window_hash = (window_hash * base + ord(text[i]) - ord('a') + 1) % mod
    
    # Check first window
    if window_hash == pattern_hash and text[:m] == pattern:
        result.append(0)
    
    # Slide window
    for i in range(m, n):
        # Remove leftmost character
        window_hash = (window_hash - (ord(text[i - m]) - ord('a') + 1) * high_power) % mod
        # Add new character
        window_hash = (window_hash * base + ord(text[i]) - ord('a') + 1) % mod
        
        # Check for match (with verification to avoid false positives)
        if window_hash == pattern_hash and text[i - m + 1:i + 1] == pattern:
            result.append(i - m + 1)
    
    return result


def rabin_karp_double_hash(text: str, pattern: str) -> list[int]:
    """
    Rabin-Karp with two hash functions to reduce false positives
    """
    mods = [10**9 + 7, 10**9 + 9]
    bases = [31, 37]
    
    n, m = len(text), len(pattern)
    if m > n:
        return []
    
    result = []
    
    pattern_hashes = []
    for base, mod in zip(bases, mods):
        h = 0
        for c in pattern:
            h = (h * base + ord(c)) % mod
        pattern_hashes.append(h)
    
    window_hashes = [0, 0]
    high_powers = [pow(bases[i], m - 1, mods[i]) for i in range(2)]
    
    for i in range(m):
        for j in range(2):
            window_hashes[j] = (window_hashes[j] * bases[j] + ord(text[i])) % mods[j]
    
    if window_hashes == pattern_hashes:
        result.append(0)
    
    for i in range(m, n):
        for j in range(2):
            window_hashes[j] = (
                (window_hashes[j] - ord(text[i - m]) * high_powers[j]) * bases[j] + ord(text[i])
            ) % mods[j]
        
        if window_hashes == pattern_hashes:
            result.append(i - m + 1)
    
    return result

Rabin-Karp Applications

def longest_duplicate_substring(s: str) -> str:
    """
    LC 1044: Find longest duplicate substring using binary search + Rabin-Karp
    """
    n = len(s)
    base = 26
    mod = 2**63 - 1
    
    def search(length: int) -> int:
        """Check if there's a duplicate substring of given length"""
        if length == 0:
            return -1
        
        h = 0
        for i in range(length):
            h = (h * base + ord(s[i])) % mod
        
        seen = {h: 0}
        high_power = pow(base, length, mod)
        
        for i in range(1, n - length + 1):
            h = (h * base - ord(s[i - 1]) * high_power + ord(s[i + length - 1])) % mod
            if h in seen:
                # Verify to avoid collision
                if s[seen[h]:seen[h] + length] == s[i:i + length]:
                    return i
            seen[h] = i
        
        return -1
    
    # Binary search on length
    left, right = 0, n - 1
    result_pos = -1
    
    while left <= right:
        mid = (left + right) // 2
        pos = search(mid)
        if pos != -1:
            result_pos = pos
            left = mid + 1
        else:
            right = mid - 1
    
    if result_pos == -1:
        return ""
    return s[result_pos:result_pos + left - 1]


def count_distinct_substrings(s: str) -> int:
    """
    Count distinct substrings using rolling hash
    """
    n = len(s)
    seen = set()
    base = 31
    mod = 10**9 + 7
    
    for length in range(1, n + 1):
        h = 0
        for i in range(length):
            h = (h * base + ord(s[i]) - ord('a') + 1) % mod
        seen.add(h)
        
        high_power = pow(base, length - 1, mod)
        for i in range(length, n):
            h = ((h - (ord(s[i - length]) - ord('a') + 1) * high_power) * base + 
                 ord(s[i]) - ord('a') + 1) % mod
            seen.add(h)
    
    return len(seen)

4️⃣ Comparison

AlgorithmTimeSpaceUse Case
KMPO(n+m)O(m)Single pattern, need all matches
Z-AlgorithmO(n+m)O(n)Single pattern, prefix-based problems
Rabin-KarpO(n+m) avgO(1)Multiple patterns, substring problems
Built-inO(nm)O(1)Simple cases, small inputs

5️⃣ Implementation Tips

# Python built-in alternatives (for simple cases)
text.find(pattern)      # First occurrence, -1 if not found
text.count(pattern)     # Count occurrences
pattern in text         # Check existence

# When to use which:
# - KMP: Need LPS array for other purposes, guaranteed O(n+m)
# - Z-Algorithm: Prefix-related problems, simpler to understand
# - Rabin-Karp: Multiple patterns, 2D pattern matching, duplicate finding

📚 Practice Problems

#ProblemKey Concept
1LC 28: Find Index of First OccurrenceBasic pattern matching
2LC 214: Shortest PalindromeKMP + LPS
3LC 459: Repeated Substring PatternKMP property
4LC 686: Repeated String MatchPattern in repeated string
5LC 796: Rotate StringPattern in doubled string
6LC 1044: Longest Duplicate SubstringRabin-Karp + Binary Search
7LC 1392: Longest Happy PrefixZ-function / KMP
8LC 1397: Find All Good StringsKMP + Digit DP
9LC 1948: Delete Duplicate FoldersHashing subtrees
10LC 2223: Sum of Scores of Built StringsZ-function

Last Updated: 2024