Chapter 110

Home

Aho-Corasick Multi-pattern

KMP matches one pattern. Aho-Corasick matches all K patterns in O(N + total pattern length) — a trie with failure links (like KMP's LPS). Essential for malware signatures, bad-word filtering, and bioinformatics.

Aho-Corasick

C++ — Aho-Corasick

struct AhoCorasick {
    static const int ALPH = 26;
    struct Node {
        int next[ALPH], fail;
        int output;  // number of patterns ending here
        Node() : fail(0), output(0) {
            memset(next, 0, sizeof(next));
        }
    };
    vector trie;

    AhoCorasick() { trie.emplace_back(); }

    void insert(string& s) {
        int v = 0;
        for (char c : s) {
            int idx = c - 'a';
            if (!trie[v].next[idx]) {
                trie[v].next[idx] = trie.size();
                trie.emplace_back();
            }
            v = trie[v].next[idx];
        }
        trie[v].output++;
    }

    void build() {  // BFS failure links
        queue q;
        for (int c = 0; c < ALPH; c++)
            if (trie[0].next[c]) q.push(trie[0].next[c]);

        while (!q.empty()) {
            int v = q.front(); q.pop();
            for (int c = 0; c < ALPH; c++) {
                int u = trie[v].next[c];
                if (u) {
                    trie[u].fail = trie[trie[v].fail].next[c];
                    trie[u].output += trie[trie[u].fail].output;
                    q.push(u);
                } else {
                    trie[v].next[c] = trie[trie[v].fail].next[c];
                }
            }
        }
    }

    int count_matches(string& text) {
        int v = 0, ans = 0;
        for (char c : text) {
            v = trie[v].next[c - 'a'];
            ans += trie[v].output;
        }
        return ans;
    }
};

// Build: O(total pattern length). Query: O(text length).

Key Takeaways