后缀树(Suffix Tree)

这篇博客写得不错

Code

使用Ukkonen算法构建后缀树。

const int inf = 1 << 30;

//字符串下标从1开始
template<size_t maxn>
struct SuffixTree {
    int link[maxn << 1], length[maxn << 1], start[maxn << 1];
    int s[maxn << 1], ch[maxn << 1][27];
    /****************************************************************
        link[u]         结点u的后缀链接
        length[u]       到结点u的这条边所代表的串长
        start[u]        到结点u的这条边所代表的串在原串中的起始位置
        s               当前已插入的原串的前缀
    ****************************************************************/
    int n, tail, now, len;
    /****************************************************************
        n               当前已插入到原串的第n个字符
        tail            结点池的下标,即新建结点的编号
        now             当前的活动结点
        s[n-rem+1...n]  最长的被隐式包含的后缀,长度为rem
        (now,char,len)  沿着结点now的开头为char的出边走len个字符到达的位置,
                        从源点到该位置唯一表示的串等于最长的被隐式包含的后缀
                        rem=从源点到该位置的字符个数
                        char=s[n-rem+1]
                        所以只需要维护now和len
    ****************************************************************/

    //初始时令tail=1,n=0,len=0,now=1
    SuffixTree() :tail(1), n(0), len(0), now(1) { length[0] = inf; }

    int newNode(int st, int len) {
        link[++tail] = 1;       //后缀链接默认到源点1
        start[tail] = st;       //原串中的起始位置
        length[tail] = len;     //进入该点的边所代表的串长
        return tail;            //返回新建结点的编号
    }
    void extend(int x) {
        s[++n] = x;++len;       //新扩展一个字符x
        for (int last = 1;len;) {//如果len==0,结束这一轮插入
            //应该走now结点的字符为s[n-len+1]的出边
            //如果len超过now该出边的长度,不断向下更新now
            while (len > length[ch[now][s[n - len + 1]]])
                len -= length[now = ch[now][s[n - len + 1]]];
            //到达now的开头为s[n-len+1]的出边,即进入v的这条边
            int& v = ch[now][s[n - len + 1]];
            int c = s[start[v] + len - 1];
            //若不存在这条出边,新建一条出边指向一个新的结点,并将last的后缀链接指向now,将last更新为now
            //若存在这条出边,对比出边上第len个字符c和当前插入的字符x,
            //若x==c,则说明需要插入的后缀已经存在于后缀树之中,插入失败,修改后缀链接后退出本轮插入
            if (!v || x == c) {
                link[last] = now;last = now;
                if (!v) v = newNode(n, inf);
                else break;
            }
            //若x!=c,则新建一个点u对这条边进行分裂
            //now---->v     now--->u---->v
            //                     |
            //                     --->[n,#]
            else {
                int u = newNode(start[v], len - 1);
                ch[u][c] = v;ch[u][x] = newNode(n, inf);
                start[v] += len - 1;length[v] -= len - 1;
                link[last] = v = u;last = u;
            }
            if (now == 1) --len; //如果now==源点1,则成功插入一个后缀,--len
            else now = link[now];//如果now!=源点1,则令now跳到now的后缀链接指向的节点
        }
    }
};

洛谷P3804 【模板】后缀自动机(SAM)

传送门

给定一个只包含小写字母的字符串 \(S\), 请你求出 \(S\) 的所有出现次数不为 \(1\) 的子串的出现次数乘上该子串长度的最大值。

这题使用后缀树也可以做。从后缀树的根出发到达的每一个结点,路径上的边所代表的字符串依次相连,都是原串的子串。以一个结点为根的子树内的叶结点个数即为该子串在原串中出现的次数。要求出现次数大于1,所以只需要找子树内叶结点数目大于1的结点。子串的长度在dfs的过程中也容易维护。所以只要dfs一遍整棵后缀树,即可求得答案。

Code

#include <bits/stdc++.h>
using namespace std;

#define LL long long
const int inf = 1 << 30;

template<size_t maxn>
struct SuffixTree {
    int link[maxn << 1], length[maxn << 1], start[maxn << 1], sz[maxn << 1];
    int s[maxn << 1], ch[maxn << 1][27];
    int n, tail, now, len;
    SuffixTree() :tail(1), n(0), len(0), now(1) { length[0] = inf; }

    int newNode(int st, int len) {
        link[++tail] = 1;
        start[tail] = st;
        length[tail] = len;
        return tail;
    }
    void extend(int x) {
        s[++n] = x;++len;
        for (int last = 1;len;) {
            while (len > length[ch[now][s[n - len + 1]]])
                len -= length[now = ch[now][s[n - len + 1]]];
            int& v = ch[now][s[n - len + 1]];
            int c = s[start[v] + len - 1];
            if (!v || x == c) {
                link[last] = now;last = now;
                if (!v) v = newNode(n, inf);
                else break;
            }
            else {
                int u = newNode(start[v], len - 1);
                ch[u][c] = v;ch[u][x] = newNode(n, inf);
                start[v] += len - 1;length[v] -= len - 1;
                link[last] = v = u;last = u;
            }
            if (now == 1) --len;
            else now = link[now];
        }
    }
    LL ans = 0;

    int solve(int u, int curLen) {
        int leafNum = 0;
        bool flag = true;
        for (int x = 0;x < 27;++x)
            if (ch[u][x]) { leafNum += solve(ch[u][x], curLen + length[ch[u][x]]);flag = false; }
        if (leafNum > 1) ans = max(ans, 1LL * curLen * leafNum);
        if (flag) leafNum = 1;
        return leafNum;
    }
};

SuffixTree<1000010> suffixTree;
char s[1000010];

int main() {
    scanf("%s", s + 1);
    int len = strlen(s + 1);
    for (int i = 1;i <= len;++i)
        suffixTree.extend(s[i] - 'a');
    suffixTree.extend(26);
    suffixTree.solve(1, 0);
    printf("%lld\n", suffixTree.ans);

    return 0;
}

传送门

给定一个字符串,求本质不同排名第k小的子串。

后缀树的叶结点的dfs序就是后缀数组。所以我们只需要找dfs序为 \(k\) 的字符,从根结点到它所构成的串即为本质不同第 \(k\) 小的子串。可以预处理出后缀树的每个子树的dfs序的区间,这样在找排名为 \(k\) 的子串时就不用再遍历整棵后缀树,而是可以像平衡树找第 \(k\) 小值一样直接走正确的路径。

Code

#include <bits/stdc++.h>
using namespace std;
#pragma gcc optimze(3)

const int inf = 1 << 30;

int ask[505];
char path[90010];
int query, t, idx;
bool flag;

template<size_t maxn>
struct SuffixTree {
    int link[maxn << 1], length[maxn << 1], start[maxn << 1], sz[maxn << 1];
    int s[maxn << 1], ch[maxn << 1][27];
    int n, tail, now, len;
    SuffixTree() :tail(1), n(0), len(0), now(1) { length[0] = inf; }

    int newNode(int st, int len) {
        link[++tail] = 1;
        start[tail] = st;
        length[tail] = len;
        return tail;
    }
    void extend(int x) {
        s[++n] = x;++len;
        for (int last = 1;len;) {
            while (len > length[ch[now][s[n - len + 1]]])
                len -= length[now = ch[now][s[n - len + 1]]];
            int& v = ch[now][s[n - len + 1]];
            int c = s[start[v] + len - 1];
            if (!v || x == c) {
                link[last] = now;last = now;
                if (!v) v = newNode(n, inf);
                else break;
            }
            else {
                int u = newNode(start[v], len - 1);
                ch[u][c] = v;ch[u][x] = newNode(n, inf);
                start[v] += len - 1;length[v] -= len - 1;
                link[last] = v = u;last = u;
            }
            if (now == 1) --len;
            else now = link[now];
        }
    }
    void dfs(int u) {
        sz[u] = 0;
        for (int x = 0;x < 26;++x) {
            int v = ch[u][x];
            if (v) { dfs(v); sz[u] += sz[v] + min(length[v], n - start[v]); }
        }
        return;
    }
    int dfn;
    void solve(int u) {
        if (flag) return;
        for (int x = 0;x < 26;++x) {
            if (flag) return;
            int v = ch[u][x];
            if (v) {
                if (dfn + sz[v] + min(length[v], n - start[v]) < query) {
                    dfn += sz[v] + min(length[v], n - start[v]);
                    continue;
                }
                for (int i = start[v];i < min(start[v] + length[v], n);++i) {
                    ++dfn;
                    path[++idx] = (char)(s[i] + 'a');
                    if (dfn == query) {
                        path[++idx] = '\0';
                        printf("%s\n", path + 1);
                        flag = true;return;
                    }
                }
                solve(v);
                for (int i = start[v];i < min(start[v] + length[v], n);++i)
                    --idx;
            }
        }

    }
};

SuffixTree<90010> suffixTree;
char s[90010];
int n;

int main() {
    scanf("%s", s + 1);
    n = strlen(s + 1);
    for (int i = 1;i <= n;++i)
        suffixTree.extend(s[i] - 'a');
    suffixTree.extend(26);
    suffixTree.dfs(1);
    scanf("%d", &t);
    for (int i = 1;i <= t;++i) {
        scanf("%d", &query);
        suffixTree.dfn = 0;
        flag = false;idx = 0;
        suffixTree.solve(1);
    }
    return 0;
}

洛谷P3975 [TJOI2015]弦论

传送门

求本质不同第 \(k\) 小的子串,和同一个串出现在原串的不同位置算多次时第 \(k\) 小的子串。

第一问和SPOJ 7258相同。对于第二问,只需要修改刚刚dfs序的算法,我们同时还要预处理出每个子树内叶结点的个数,对于同一个字符串,它的dfs序要计算其子树内叶结点个数次。注意对于结尾符 $ 仍要计入叶结点数,但结尾符的长度视为0。

Code

#include <bits/stdc++.h>
using namespace std;

#define LL long long

const int inf = 1 << 30;

template<size_t maxn>
struct SuffixTree {
    int link[maxn << 1], length[maxn << 1], start[maxn << 1];
    LL sz[maxn << 1], sz2[maxn << 1], leafNum[maxn << 1];
    int s[maxn << 1], ch[maxn << 1][27];
    char buf[maxn];
    int n, tail, now, len;
    int bufIdx;LL dfn;
    bool flag;
    SuffixTree() :tail(1), n(0), len(0), now(1) { length[0] = inf; }

    int newNode(int st, int len) {
        link[++tail] = 1;
        start[tail] = st;
        length[tail] = len;
        return tail;
    }
    void extend(int x) {
        s[++n] = x;++len;
        for (int last = 1;len;) {
            while (len > length[ch[now][s[n - len + 1]]])
                len -= length[now = ch[now][s[n - len + 1]]];
            int& v = ch[now][s[n - len + 1]];
            int c = s[start[v] + len - 1];
            if (!v || x == c) {
                link[last] = now;last = now;
                if (!v) v = newNode(n, inf);
                else break;
            }
            else {
                int u = newNode(start[v], len - 1);
                ch[u][c] = v;ch[u][x] = newNode(n, inf);
                start[v] += len - 1;length[v] -= len - 1;
                link[last] = v = u;last = u;
            }
            if (now == 1) --len;
            else now = link[now];
        }
    }
    void dfs(int u) {
        bool flag = true;
        for (int x = 0;x < 26;++x) {
            int v = ch[u][x];
            if (v) {
                flag = false; dfs(v);
                //子树内考虑本质不同时的dfs序大小
                sz[u] += sz[v] + min(length[v], n - start[v]);
                //子树内不考虑本质不同时的dfs序大小
                sz2[u] += sz2[v] + min(length[v], n - start[v]) * leafNum[v];
                leafNum[u] += leafNum[v];
            }
        }
        if (flag) leafNum[u] = 1;
        if (ch[u][26]) ++leafNum[u];//注意终止符也要计入叶结点数
    }
    void dfs1(int u, LL k) {
        if (flag) return;
        for (int x = 0;x < 26;++x) {
            if (flag) return;
            int v = ch[u][x];
            if (!v) continue;
            int m = min(length[v], n - start[v]);
            if (dfn + sz[v] + m < k) { dfn += sz[v] + m; continue; }
            for (int i = start[v];i < start[v] + m;++i) {
                ++dfn;buf[++bufIdx] = s[i] + 'a';
                if (dfn == k) {
                    buf[++bufIdx] = '\0';printf("%s", buf + 1);
                    flag = true;return;
                }
            }
            dfs1(v, k);
        }
    }
    void FindKth1(LL k) {//本质不同第k小
        bufIdx = dfn = 0;flag = false;dfs1(1, k);
        if (!flag) printf("-1\n");
    }
    void dfs2(int u, LL k) {
        if (flag) return;
        for (int x = 0;x < 26;++x) {
            if (flag) return;
            int v = ch[u][x];
            if (!v) continue;
            int m = min(length[v], n - start[v]);
            if (dfn + sz2[v] + leafNum[v] * m < k) { dfn += sz2[v] + leafNum[v] * m; continue; }
            for (int i = start[v];i < start[v] + m;++i) {
                buf[++bufIdx] = s[i] + 'a';
                if (dfn < k && k <= dfn + leafNum[v]) {
                    buf[++bufIdx] = '\0';printf("%s", buf + 1);
                    flag = true;return;
                }
                dfn += leafNum[v];
            }
            dfs2(v, k);
        }
    }
    void FindKth2(LL k) {//不考虑本质不同的第k小
        bufIdx = dfn = 0;flag = false;dfs2(1, k);
        if (!flag) printf("-1\n");
    }
};
SuffixTree<500010> suffixTree;
char s[500010];

int main() {
    scanf("%s", s + 1);
    int n = strlen(s + 1), opt;LL k;
    scanf("%d%lld", &opt, &k);
    for (int i = 1;i <= n;++i)
        suffixTree.extend(s[i] - 'a');
    suffixTree.extend(26);
    suffixTree.dfs(1);
    if (opt == 0) suffixTree.FindKth1(k);
    else suffixTree.FindKth2(k);

    return 0;
}
上一篇:Git飞行规则(Flight Rules)


下一篇:Suffix Trie Construction