CTSC2010性能优化 DFT循环卷积

[CTSC2010]性能优化 首先题目要求的是$AB^C$,乘法为循环卷积。众所周知DFT有循环卷积的性质,只需要快速求出DFT就行了。 题目里提到了$n+1$为质数,且$n$的质因子$\in \lbrace 2,3,5,7 \rbrace$。那么求出$n+1$的一个原根,考虑NTT。 在常用的NTT中,是这样做的: $$F(\omega_n^i)=F_0(\omega_{n/2}^i)+\omega_{n}^iF_1(w_{n/2}^i)$$ 实际上就是把$n$的一个因子提出来的操作 $$F(\omega_n^i)=\sum_{j < p} \omega_{n}^{ij} F_j(\omega_{n/p}^i) $$ 这样子,只需要把$n$分解,就可以实现DFT了。 考虑去掉递归,写成递推。在一般DFT中有蝶形变换,在这里也可以类似的计算数组的新位置。实际上实现很简单,直接枚举$p$,维护当前块大小,考虑每一块,在块内把元素按${\rm mod}\ p$分一下就可以 那么考虑DFT。对于一个元素,会存在一组模块大小相同的数对它贡献。精细计算贡献即可 (洛谷题解里第一篇代码太妙了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
const int N = 5e5 + 233;

int n, C, A[N], B[N], P, Gp;

inline int fpow(int x, int y) {
int ret = 1;
for ( ; y; y >>= 1, x = 1ll * x * x % P)
if (y & 1) ret = 1ll * ret * x % P;
return ret;
}

inline int get_root(int x) {
std::vector<int> fac;
for (int i = 2; i * i <= x - 1; ++i)
if (!((x - 1) % i)) {
fac.push_back(i);
if (i * i != x - 1)
fac.push_back((x - 1) / i);
}
for (int i = 2; ; ++i) {
int ok = 1;
for (int j : fac)
if (fpow(i, j) == 1) {
ok = 0; break;
}
if (ok) return i;
}
}

int prime[N], tot;

inline void get_prime(int x) {
for (int i = 2; i * i <= n; ++i) {
if (!(x % i))
while (!(x % i))
x /= i, prime[++tot] = i;
}
if (x > 1) prime[++tot] = x;
}

inline void reverse(int f[]) {
static int tmp[N];
for (int b = tot, block = n; b; --b) {
for (int i = 0, t = 0; i < n; i += block)
for (int j = 0; j < prime[b]; ++j)
for (int k = 0; k < block; k += prime[b])
tmp[t++] = f[i + j + k];
for (int i = 0; i < n; ++i)
f[i] = tmp[i];
block /= prime[b];
}
}

inline void DFT(int f[], int o) {
reverse(f);
static int tmp[N];
for (int i = 1, block = 1; i <= tot; ++i) {
int last = block;
block *= prime[i];
int T = fpow(Gp, n / block);
for (int j = 0; j < n; j += block) {
int wn = 1;
for (int k = 0; k < block; ++k) {
for (int l = k % last, w = 1; l < block; l += last, w = 1ll * w * wn % P)
tmp[j + k] = (tmp[j + k] + 1ll * w * f[j + l]) % P;
wn = 1ll * wn * T % P;
}
}
for (int j = 0; j < n; ++j)
f[j] = tmp[j], tmp[j] = 0;
}
if (o == -1) {
std::reverse(f + 1, f + n);
for (int i = 0; i < n; ++i)
f[i] = 1ll * f[i] * n % P;
}
}

int main() {
n = io.rd(), C = io.rd(), P = n + 1;
Gp = get_root(P); get_prime(n);
for (int i = 0; i < n; ++i) A[i] = io.rd();
for (int i = 0; i < n; ++i) B[i] = io.rd();
DFT(A, 1), DFT(B, 1);
for (int i = 0; i < n; ++i)
A[i] = 1ll * A[i] * fpow(B[i], C) % P;
DFT(A, -1);
for (int i = 0; i < n; ++i)
io.out(A[i]);
return 0;
}