后缀数组好题选讲(1)

涉及到的处理问题套路

将字符串复制成两份并连在一起,求出每个字符的后缀排名,再求解

注意事项

注意拼接的字符串中间是否应该加分隔符

luogu P4051 字符加密

题意简述

给定一个长度为$n$的字符串,将其首尾相接,然后从每一个位置开始读出一个长度为$n$的字符串,得到$n$个不同的字符串,将其按字典序从小到大排序,将排好后的每一个字符串的最后一个字符连在一起形成一个新的字符串,输出该字符串。

$n\le 10^6$

Sol

$O(n^2\log n)$暴力

暴力求出每个字符串,排序,输出。

$O(n\log n)$正解

考虑暴力超时的原因所在:sort的每次比较就有$O(n)$的复杂度,我们需要寻找一种算法能在预处理后直接比较。

容易想到后缀数组,$sa[i]$表示后缀排名为$i$的字符串第一个字符的下标

但只用一个字符串对相同的字母没有办法比较,如$DCABDC$中的两个$DC$,按照$sa$数组排名是后面的$DC$在前,但真正的顺序是$DCABDC > DCDCAB$

所以我们把该串复制一遍,不加分隔符,然后比较第一个字符串的每个字符的$sa$的大小即可

后缀数组好题选讲(1)
#include <bits/stdc++.h>
using namespace std;
int n, m, num, x[200005], y[200005], c[200005], sa[200005];
char ch[200005];
void get_SA()
{
    m = 177;
    for (int i = 1; i <= n; i++)
        ++c[x[i] = ch[i]];
    for (int i = 2; i <= m; i++)
        c[i] += c[i - 1];
    for (int i = n; i >= 1; i--)
        sa[c[x[i]]--] = i;
    for (int k = 1; k <= n; k <<= 1)
    {
        num = 0;
        for (int i = n - k + 1; i <= n; i++)
            y[++num] = i;
        for (int i = 1; i <= n; i++)
            if (sa[i] > k)
                y[++num] = sa[i] - k;
        memset(c, 0, sizeof(c));
        for (int i = 1; i <= n; i++)
            c[x[i]]++;
        for (int i = 2; i <= m; i++)
            c[i] += c[i - 1];
        for (int i = n; i >= 1; i--)
        {
            sa[c[x[y[i]]]--] = y[i];
            y[i] = 0;
        }
        swap(x, y);
        x[sa[1]] = 1;
        num = 1;
        for (int i = 2; i <= n; i++)
        {
            if (y[sa[i]] == y[sa[i - 1]] && y[sa[i] + k] == y[sa[i - 1] + k])
                x[sa[i]] = num;
            else
                x[sa[i]] = ++num;
        }
        if (num == n)
            break;
        m = num;
    }
}
int main()
{
    scanf("%s", ch + 1);
    n = strlen(ch + 1);
    for (int i = n + 1; i <= 2 * n; i++)
    {
        ch[i] = ch[i - n];
    }
    n = n * 2;
    get_SA();
    for (int i = 1; i <= n; i++)
    {
        if (sa[i] <= n / 2)
            putchar(ch[sa[i] + n / 2 - 1]);
    }
}
P4051

luogu P2870 Best Cow Line G

题意简述

给定一个长度为$n$的字符串,每次讲最前面或者最后面的那个字符加入新队列,求新队列的最小字典序

$n\le 5\cdot10^5$

Sol

容易想到当首尾字符不同时贪心选取最小的那个加入队列

但是当首尾字符相同的时候呢?

若是$CD...EC$等后一个字符均大于前一个字符的情况还好做,无论取哪边结果都一样,但是如$CB...BC$的情况我们就无法很好选择

对于上面的选择,我们的考虑是选择最有"前途"的那个地方作为开始,比如对于$CADDEFBAC$来说,$CADDEFBAC$的排名小于翻转过来的$CABFEDDAC$,故选择后面的$C$更优

当然,如果相同的话任选即可

所以我们可以搞一个后缀数组,然后翻转过来再搞一个后缀数组,对于上面的这个例子,直接比较前面的$C$在第一个后缀数组中的排名与后面的$C$在第二个后缀数组中的排名

如何实现比较呢?

答案就是直接把原字符串与翻转的字符串拼在一起,中间加分隔符,跑后缀数组即可,位置$i$在后面字符串对应的位置为$len-i+1$

后缀数组好题选讲(1)
#include <bits/stdc++.h>
using namespace std;
int n, m, num, cnt = 0, x[1000005], y[1000005], c[1000005], sa[1000005], rnk[1000005];
char ch[1000005];
void get_SA()
{
    m = 122;
    for (int i = 1; i <= n; i++)
        ++c[x[i]];
    for (int i = 2; i <= m; i++)
        c[i] += c[i - 1];
    for (int i = n; i >= 1; i--)
    {
        sa[c[x[i]]--] = i;
    }
    for (int k = 1; k <= n; k <<= 1)
    {
        num = 0;
        for (int i = n - k + 1; i <= n; i++)
            y[++num] = i;
        for (int i = 1; i <= n; i++)
            if (sa[i] > k)
                y[++num] = sa[i] - k;
        memset(c, 0, sizeof(c));
        for (int i = 1; i <= n; i++)
            ++c[x[i]];
        for (int i = 2; i <= m; i++)
            c[i] += c[i - 1];
        for (int i = n; i >= 1; i--)
        {
            sa[c[x[y[i]]]--] = y[i];
            y[i] = 0;
        }
        swap(x, y);
        x[sa[1]] = 1;
        num = 1;
        for (int i = 2; i <= n; i++)
        {
            if (y[sa[i]] == y[sa[i - 1]] && y[sa[i] + k] == y[sa[i - 1] + k])
                x[sa[i]] = num;
            else
                x[sa[i]] = ++num;
        }
        if (num == n)
            break;
        m = num;
    }
}
int main()
{
    //  freopen("P2870_6.in", "r", stdin);
    //   freopen("P2870.out", "w", stdout);
    cin >> n;
    for (int i = 1; i <= n; i++)
    {
        ch[i] = getchar();
        while (!isalpha(ch[i]))
            ch[i] = getchar();
    }
    for (int i = 1; i <= n; i++)
        x[i] = x[2 * n + 1 - i] = ch[i];
    n = 2 * n;
    /*  for (int i = 1; i <= n; i++)
        putchar(x[i]);*/
    get_SA();
    for (int i = 1; i <= n; i++)
        rnk[sa[i]] = i;
    int l = 1, r = n = (n) / 2;
    while (l < r)
    {
        if (ch[l] < ch[r])
            ++cnt, l++, putchar(ch[l - 1]);
        else if (ch[l] > ch[r])
            ++cnt, r--, putchar(ch[r + 1]);
        else if (ch[l] == ch[r])
        {
            if (rnk[l] < rnk[2 * n + 1 - r])
                ++cnt, l++, putchar(ch[l - 1]);
            else
                ++cnt, r--, putchar(ch[r + 1]);
        }
        if (cnt % 80 == 0)
            puts("");
    }
    putchar(ch[l]);
}
P2870

SP1811 LCS

题意简述

给两个字符串,求它们的最长公共子串。

$len\le 2.5\cdot 10^6$

Sol

考虑将最长公共子串转化为最长重复子串,那么就可以把两个子串拼接在一起,求出$sa$与$height$数组

因为最长的重复子串长度为$height[i]$的最大值,所以直接枚举即可,但要注意枚举的两点一定要在不同的字符串里面。

证明:最长的重复子串长度都为$height[i]$的最大值

$height[i]$的定义为$LCP(rnk[i-1],rnk[i])$

我们设最长的子串为$LCP(rnk[i],rnk[i+k])$,由于$rnk$数组是按字典序排序,所以$LCP(rnk[i],rnk[i+j]),(j\in [1,k))$均包含$LCP(rnk[i],rnk[i+k])$,故选择前面的不会更差。

后缀数组好题选讲(1)
#include <bits/stdc++.h>
using namespace std;
int len, n, m, num, x[500005], y[500005], c[500005], sa[500005], rnk[500005], height[500005];
char ch[500005];
void get_SA()
{
    m = 200;
    for (int i = 1; i <= n; i++)
        ++c[x[i] = ch[i]];
    for (int i = 2; i <= m; i++)
        c[i] += c[i - 1];
    for (int i = n; i >= 1; i--)
        sa[c[x[i]]--] = i;
    for (int k = 1; k <= n; k <<= 1)
    {
        num = 0;
        for (int i = n - k + 1; i <= n; i++)
            y[++num] = i;
        for (int i = 1; i <= n; i++)
            if (sa[i] > k)
                y[++num] = sa[i] - k;
        memset(c, 0, sizeof(c));
        for (int i = 1; i <= n; i++)
            ++c[x[i]];
        for (int i = 2; i <= m; i++)
            c[i] += c[i - 1];
        for (int i = n; i >= 1; i--)
            sa[c[x[y[i]]]--] = y[i], y[i] = 0;
        swap(x, y);
        num = 1;
        x[sa[1]] = 1;
        for (int i = 2; i <= n; i++)
        {
            if (y[sa[i]] == y[sa[i - 1]] && y[sa[i] + k] == y[sa[i - 1] + k])
                x[sa[i]] = num;
            else
                x[sa[i]] = ++num;
        }
        if (num == n)
            break;
        m = num;
    }
    for (int i = 1; i <= n; i++)
        rnk[sa[i]] = i;
    int k = 0;
    for (int i = 1; i <= n; i++)
    {
        if (rnk[i] == 1)
            continue;
        if (k)
            k--;
        int j = sa[rnk[i] - 1];
        while (j + k <= n && i + k <= n && ch[j + k] == ch[i + k])
            k++;
        height[rnk[i]] = k;
    }
}
int main()
{
    scanf("%s", ch + 1);
    n = strlen(ch + 1);
    ch[n + 1] = '$';
    len = n + 1;
    scanf("%s", ch + n + 2);
    n = strlen(ch + 1);
    get_SA();
    int ans = 0;
    for (int i = 2; i <= n; i++)
    {
        if ((sa[i] < len && sa[i - 1] > len) || (sa[i] > len && sa[i - 1] < len))
            ans = max(ans, height[i]);
    }
    cout << ans << endl;
}
SP1811

luogu P2463 Sandy的卡片

题意简述

给定$n$组数字,每组数字包含$m_i$个数,求这$n$组数据的最长相同子串

相同子串的定义为:长度相同,且一个串的全部元素加上一个相同的数会变成另外的一个串

$n\le 1000,m_i\le 101$

Sol

首先将每一组数字转成其差分数组,问题就变成了找相同子串

方法与上一题大致相同,先把所有子串拼接在一起,求出其$sa$与$height$

观察到如果存在长度为$k$的公共子串,那么长度小于$k$的公共子串一定存在

我们二分长度$l$,观察在$[1,n]$中能否找到一个区间,使得该区间中的数的$height$全部$\ge l$且包含种类为$1-n$的所有字符串

时间复杂度$O(nm\log n)$

注意事项

差分数组里会出现负值,要全部转成正值后再跑$SA$.

后缀数组好题选讲(1)
#include <bits/stdc++.h>
using namespace std;
int T, n, m, num, x[200005], y[200005], c[200005], sa[200005], rnk[200005], height[200005];
int b[200005], cnt = 0, a[200005], id[200005], minn = 2147483647;
void get_SA()
{
    m = 10000;
    for (int i = 1; i <= n; i++)
        ++c[x[i] = a[i]];
    for (int i = 2; i <= m; i++)
        c[i] += c[i - 1];
    for (int i = n; i >= 1; i--)
        sa[c[x[i]]--] = i;
    for (int k = 1; k <= n; k <<= 1)
    {
        num = 0;
        for (int i = n - k + 1; i <= n; i++)
            y[++num] = i;
        for (int i = 1; i <= n; i++)
            if (sa[i] > k)
                y[++num] = sa[i] - k;
        memset(c, 0, sizeof(c));
        for (int i = 1; i <= n; i++)
            ++c[x[i]];
        for (int i = 2; i <= m; i++)
            c[i] += c[i - 1];
        for (int i = n; i >= 1; i--)
            sa[c[x[y[i]]]--] = y[i], y[i] = 0;
        swap(x, y);
        x[sa[1]] = 1;
        num = 1;
        for (int i = 2; i <= n; i++)
            if (y[sa[i]] == y[sa[i - 1]] && y[sa[i] + k] == y[sa[i - 1] + k])
                x[sa[i]] = num;
            else
                x[sa[i]] = ++num;
        if (num == n)
            break;
        m = num;
    }
    for (int i = 1; i <= n; i++)
        rnk[sa[i]] = i;
    int k = 0;
    for (int i = 1; i <= n; i++)
    {
        if (rnk[i] == 1)
            continue;
        if (k)
            k--;
        int j = sa[rnk[i] - 1];
        while (j + k <= n && i + k <= n && a[j + k] == a[i + k])
            k++;
        height[rnk[i]] = k;
    }
}
int stk[100005], vis[100005], tp = 0;
int chk(int x)
{
    while (tp)
        vis[stk[tp--]] = 0;
    for (int i = 1; i <= n; i++)
    {
        if (height[i] < x)
        {
            while (tp)
                vis[stk[tp--]] = 0;
        }
        if (!vis[id[sa[i]]])
        {
            vis[id[sa[i]]] = 1;
            stk[++tp] = id[sa[i]];
            if (tp == T)
                return 1;
        }
    }
    return 0;
}
int main()
{
    cin >> T;
    for (int Q = 1; Q <= T; Q++)
    {
        int k;
        scanf("%d", &k);
        for (int i = 1; i <= k; i++)
            scanf("%d", &b[i]);
        for (int i = 2; i <= k; i++)
            a[++n] = (b[i] - b[i - 1]), id[n] = Q;
        a[++n] = 5000;
    }
    for (int i = 1; i <= n; i++)
        a[i] = a[i] + 5000;
    get_SA();
    int l = 1, r = n, ans = 0;
    while (l <= r)
    {
        int mid = (l + r) >> 1;
        if (chk(mid))
            l = mid + 1, ans = mid;
        else
            r = mid - 1;
    }
    printf("%d\n", ans + 1);
}
P2463

 

上一篇:[noi1754]SA


下一篇:IPSec协议