DFT – Fast algorithms and C implementations - Part3
Radix-3 DFT
我在第一篇博客里分析了Radix-2 DFT的计算原理,并据此写出了Radix-2 DFT的代码,并验证了其有效性。现在我们有信心认为,只要有了公式,就能写出正确的程序,产生正确的结果!
下面我们推导一下Radix-3 DFT的公式,其实非常简单:
设N长序列的DFT为,且N能被3整除,则:
所以N长序列的DFT被分解成3个N/3长序列的DFT的线性组合。
Mixed Radix-2 & Radix-3 DFT
现在我们考虑一个更加复杂的问题,就是当需要交叉进行Radix-2与Radix-3变换时该怎么做。
比如当N=12时,显然它既不是2的指数倍,也不是3的指数倍,但是我们可以基于Radix-2和Radix-3思想对它进行分解:
上图显示的是先对原序列进行Radix-2分解,再对两个子序列分别使用Radix-3分解。
上图显示的是先对原序列进行Radix-3分解,再对3个子序列分别使用Radix-2分解。
显然,两种分解方式都可以将一个N=12的序列分解成最小长序列(N=1),从而我们可以通过反复运用Radix-2变换和Radix-3变换来求解出原序列的DFT变换。
上述DFT分解过程看起来简单,但如果我们希望像之前一样先对输入序列进行合理排序,然后循环调用Raidx-based DFT算法来获取结果,我们先要解决一个问题:怎样对数据进行重排序?
我在网上找了半天,并没有找到一种类似bit-reversal-ordering这样的通用算法,所以我只能回到最笨的那个方法,就是按照图解的思路一步步推演重排序列。
下面贴出混合基-2和基-3离散傅里叶变换的FFT代码,要求输入序列的长度必须能表达成这样的形式。
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
typedef struct { float re, im; } ComplexFloat;
#define YU_COMPLEX_MUL(A, B, C) \
C.re = A.re * B.re - A.im * B.im; \
C.im = A.re * B.im + A.im * B.re
#define YU_COMPLEX_ADD(A, B, C) \
C.re = A.re + B.re; C.im = A.im + B.im
#define YU_COMPLEX_SUB(A, B, C) \
C.re = A.re - B.re; C.im = A.im - B.im
#define YU_COMPLEX_CONJ(A, B) \
B.re = A.re; B.im = -A.im
#define YU_PI 3.1415926535897932384626433832795
int yuRadix2DFT(ComplexFloat *dst, const ComplexFloat *coef, int len, int folds, int N2)
{
/*N is current dft length; N2 is half of N;
* stride is len/N; len2 is N2*stride; */
int N = N2 * 2, stride = len / N, len2 = len / 2;
int k, j, i;
ComplexFloat *p, v;
for(k = 0; k < folds; k++) {
p = dst;
for(i = 0; i < stride; i++) {
/*radix-2 transform (loop N2 times)*/
for(j = 0; j < len2; j += stride, p++) {
YU_COMPLEX_MUL(p[N2], coef[j], v);
YU_COMPLEX_SUB(p[0], v, p[N2]);
YU_COMPLEX_ADD(p[0], v, p[0]);
}
p += N2;
}
N2 = N; N *= 2; stride /= 2;
}
return N2;
}
int yuRadix3DFT(ComplexFloat *dst, const ComplexFloat *coef, int len, int folds, int N3)
{
/*N is current dft length; N3 is one third of N;
* stride is len/N; len3 is N3*stride; */
int N = N3 * 3, stride = len / N, len3 = len / 3;
int k, j, i;
ComplexFloat *p, v, v1, v2, u1, u2, W31, W32;
W31 = coef[len3];
W32 = coef[len3 * 2]; /*YU_COMPLEX_CONJ(W31, W32);*/
for(k = 0; k < folds; k++) {
p = dst;
for(i = 0; i < stride; i++) {
/*radix-3 transform (loop N3 times)*/
for(j = 0; j < len3; j += stride, p++) {
v = p[0];
YU_COMPLEX_MUL(p[N3], coef[j], v1);
YU_COMPLEX_MUL(p[N3 * 2], coef[j * 2], v2);
/*p[0] = v + v1 + v2*/
YU_COMPLEX_ADD(v1, v2, u1);
YU_COMPLEX_ADD(v, u1, p[0]);
/*p[N3] = v + v1*W31 + v2*W32*/
YU_COMPLEX_MUL(v1, W31, u1);
YU_COMPLEX_MUL(v2, W32, u2);
YU_COMPLEX_ADD(u1, u2, u1);
YU_COMPLEX_ADD(v, u1, p[N3]);
/*p[N3*2] = v + v1*W32 + v2*W31*/
YU_COMPLEX_MUL(v1, W32, u1);
YU_COMPLEX_MUL(v2, W31, u2);
YU_COMPLEX_ADD(u1, u2, u1);
YU_COMPLEX_ADD(v, u1, p[N3 * 2]);
}
p += N3 * 2;
}
N3 = N; N *= 3; stride /= 3;
}
return N3;
}
/*return 0 if failed, return 1 if success*/
int yuFFT(const float *src0, float *dst0, int len)
{
const ComplexFloat *src = (ComplexFloat*)src0;
ComplexFloat *dst = (ComplexFloat*)dst0;
/*Step 1. Factorize N into power of 2 and power of 3*/
int powers2 = 0, powers3 = 0, i, j, k = len;
while(!(k % 2)) {
k /= 2;
powers2++;
}
while(!(k % 3)) {
k /= 3;
powers3++;
}
if(k != 1) {
printf("Error: len cannot be factorized into 2 and 3");
return 0;
}
/*Step 2. rearrange data in proper order*/
int *order = (int*)malloc(len * sizeof(int));
for(k = 0; k < len; k++)
order[k] = k;
int *tmp = (int*)malloc(len * sizeof(int));
int N = len, ndft = 1;
/*Step 2.1*/
for(i = 0; i < powers3; i++) {
for(j = 0; j < ndft; j++) {
for(k = 0; k < N / 3; k++) {
tmp[j * N + k] = order[j * N + k * 3];
tmp[j * N + N / 3 + k] = order[j * N + k * 3 + 1];
tmp[j * N + N * 2 / 3 + k] = order[j * N + k * 3 + 2];
}
}
memcpy(order, tmp, len * sizeof(int));
N /= 3; ndft *= 3;
}
/*Step 2.2*/
for(i = 0; i < powers2; i++) {
/*current dft length is N, and there are ndft arrays.
we want to separate the odd-indexed sub-array from the even-indexed sub-array*/
for(j = 0; j < ndft; j++) {
for(k = 0; k < N / 2; k++) {
tmp[j * N + k] = order[j * N + k * 2];
tmp[j * N + N / 2 + k] = order[j * N + k * 2 + 1];
}
}
memcpy(order, tmp, len * sizeof(int));
N /= 2; ndft *= 2;
}
/*dst = src[order]*/
for(k = 0; k < len; k++)
dst[k] = src[order[k]];
/*Step 3. prepare dft coefficients*/
ComplexFloat *coef = (ComplexFloat*)malloc(len * sizeof(ComplexFloat));
double theta;
for(k = 0; k < len; k++) {
theta = -2.0 * YU_PI * k / len;
coef[k].re = (float)cos(theta);
coef[k].im = (float)sin(theta);
}
/*Step 4. do radix dft*/
N = 1;
/*Step 4.1*/
N = yuRadix2DFT(dst, coef, len, powers2, N);
/*Step 4.2*/
N = yuRadix3DFT(dst, coef, len, powers3, N);
free(order);
free(tmp);
free(coef);
return 1;
}
#undef YU_COMPLEX_MUL
#undef YU_COMPLEX_ADD
#undef YU_COMPLEX_SUB
#undef YU_PI
在写程序的过程中,我先基于上一篇博客的代码,把Radix-2部分单独拎出来,封装成一个函数。然后模仿它写了Radix-3。在主函数yuFFT里面,我先把N分解成2的指数与3的指数的乘积,然后两个独立的循环分别调用Radix-2和Radix-3算法即可。
在yuFFT函数中,Step 2部分是推演重排序方法的,完全模拟了图解的过程,可能比较笨,但是比较好理解。并且我们可以提前计算好傅里叶系数以及重排序下标索引,建成一个表格,然后在实时运用中直接调用即可,而不是像上面的代码中那样,每次都从头算一遍。
需要指出的是,虽然上述代码里是先调用Radix-2再调用Radix-3,但实际上我们也可以反过来做:先调用Radix-3再调用Radix-2,理论上结果不应发生变化。实际测试情况也是如此:将Step 2.1和Step 2.2调换顺序,相应地把Step 4.1和Step 4.2也调换顺序,最终结果不发生改变。
验证程序
为了测试程序是否正确,我写了如下main函数,在随机生成的序列上计算dft,然后将结果与fftw的计算结果进行比较。
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <time.h>
#include "fftw3.h"
int yuFFT(const float *src, float *dst, int len);
int main()
{
const int maxLength = 1024, nTrials = 3;
/*Allocate memory*/
float * const src = (float*)malloc(maxLength * 2 * sizeof(float));
float * const dst = (float*)malloc(maxLength * 2 * sizeof(float));
fftw_complex * const dsrc = (fftw_complex*)fftw_malloc(maxLength * sizeof(fftw_complex));
fftw_complex * const ddst = (fftw_complex*)fftw_malloc(maxLength * sizeof(fftw_complex));
srand((unsigned int)time(0));
for(int len = 1; len <= maxLength; len++) {
/*Test if len can be factorized into power of 2 and power of 3*/
int k = len;
while(!(k % 2)) k /= 2;
while(!(k % 3)) k /= 3;
if(k != 1)
continue;
/*Build fftw_plan*/
int n[1] = { len };
fftw_plan plan = fftw_plan_many_dft(1, n, 1, dsrc, 0, 1, 1, ddst, 0, 1, 1, FFTW_FORWARD, FFTW_ESTIMATE);
for(int trial = 0; trial < nTrials; trial++) {
/*Generate randoms between 0~1*/
int *p = (int*)src, m = 0;
for(k = 0; k < len * 2; k++) {
p[k] = rand();
if(p[k] > m)
m = p[k];
}
if(!m)
continue;
float scale = 1.f / m;
for(k = 0; k < len * 2; k++)
src[k] = p[k] * scale;
/*Apply yufft*/
yuFFT(src, dst, (unsigned int)len);
/*Apply fftw to the same data (using double precision)*/
double *src1 = (double*)dsrc, *dst1 = (double*)ddst;
for(k = 0; k < len * 2; k++)
src1[k] = (double)src[k];
fftw_execute(plan);
/*Compute the difference*/
double re, im, dif = 0;
for(k = 0; k < len * 2;) {
re = dst1[k] - dst[k]; k++;
im = dst1[k] - dst[k]; k++;
re = re * re + im * im;
if(dif < re)
dif = re;
}
dif = sqrt(dif);
printf("len = %4d, trial = %2d, dif = %g\n", len, trial, dif);
}
fftw_destroy_plan(plan);
}
free(src);
free(dst);
fftw_free(dsrc);
fftw_free(ddst);
getchar();
return 0;
}
程序在我电脑上运行后在屏幕上打印了以下信息:
len = 1, trial = 0, dif = 0
len = 1, trial = 1, dif = 0
len = 1, trial = 2, dif = 0
len = 2, trial = 0, dif = 2.26601e-08
len = 2, trial = 1, dif = 5.96046e-08
len = 2, trial = 2, dif = 0
len = 3, trial = 0, dif = 4.21468e-08
len = 3, trial = 1, dif = 9.06402e-08
len = 3, trial = 2, dif = 1.78969e-07
len = 4, trial = 0, dif = 8.2041e-08
len = 4, trial = 1, dif = 7.45058e-08
len = 4, trial = 2, dif = 4.01226e-08
len = 6, trial = 0, dif = 9.87202e-08
len = 6, trial = 1, dif = 1.06676e-07
len = 6, trial = 2, dif = 8.20255e-08
len = 8, trial = 0, dif = 2.40105e-07
len = 8, trial = 1, dif = 1.19209e-07
len = 8, trial = 2, dif = 1.77098e-07
len = 9, trial = 0, dif = 3.80564e-07
len = 9, trial = 1, dif = 2.9578e-07
len = 9, trial = 2, dif = 2.82399e-07
len = 12, trial = 0, dif = 3.38711e-07
len = 12, trial = 1, dif = 5.27257e-07
len = 12, trial = 2, dif = 2.74954e-07
len = 16, trial = 0, dif = 6.43344e-07
len = 16, trial = 1, dif = 3.53334e-07
len = 16, trial = 2, dif = 4.21732e-07
len = 18, trial = 0, dif = 3.48184e-07
len = 18, trial = 1, dif = 3.94143e-07
len = 18, trial = 2, dif = 6.19363e-07
len = 24, trial = 0, dif = 8.69895e-07
len = 24, trial = 1, dif = 6.437e-07
len = 24, trial = 2, dif = 4.47097e-07
len = 27, trial = 0, dif = 5.06819e-07
len = 27, trial = 1, dif = 6.37192e-07
len = 27, trial = 2, dif = 8.66024e-07
len = 32, trial = 0, dif = 1.6421e-06
len = 32, trial = 1, dif = 1.01241e-06
len = 32, trial = 2, dif = 1.01715e-06
len = 36, trial = 0, dif = 1.54931e-06
len = 36, trial = 1, dif = 1.37023e-06
len = 36, trial = 2, dif = 1.24578e-06
len = 48, trial = 0, dif = 1.45634e-06
len = 48, trial = 1, dif = 1.60746e-06
len = 48, trial = 2, dif = 1.20504e-06
len = 54, trial = 0, dif = 1.01185e-06
len = 54, trial = 1, dif = 1.26829e-06
len = 54, trial = 2, dif = 2.09262e-06
len = 64, trial = 0, dif = 2.38896e-06
len = 64, trial = 1, dif = 2.07697e-06
len = 64, trial = 2, dif = 9.96666e-07
len = 72, trial = 0, dif = 2.56995e-06
len = 72, trial = 1, dif = 2.7394e-06
len = 72, trial = 2, dif = 1.41041e-06
len = 81, trial = 0, dif = 2.33831e-06
len = 81, trial = 1, dif = 2.76485e-06
len = 81, trial = 2, dif = 1.91473e-06
len = 96, trial = 0, dif = 2.72731e-06
len = 96, trial = 1, dif = 2.64419e-06
len = 96, trial = 2, dif = 2.89764e-06
len = 108, trial = 0, dif = 3.06442e-06
len = 108, trial = 1, dif = 2.36151e-06
len = 108, trial = 2, dif = 2.38387e-06
len = 128, trial = 0, dif = 1.84808e-06
len = 128, trial = 1, dif = 6.65461e-06
len = 128, trial = 2, dif = 1.84561e-06
len = 144, trial = 0, dif = 3.87558e-06
len = 144, trial = 1, dif = 3.0805e-06
len = 144, trial = 2, dif = 7.03167e-06
len = 162, trial = 0, dif = 8.56079e-06
len = 162, trial = 1, dif = 7.01387e-06
len = 162, trial = 2, dif = 4.59154e-06
len = 192, trial = 0, dif = 3.47296e-06
len = 192, trial = 1, dif = 7.76133e-06
len = 192, trial = 2, dif = 6.44599e-06
len = 216, trial = 0, dif = 5.48365e-06
len = 216, trial = 1, dif = 8.35908e-06
len = 216, trial = 2, dif = 8.87525e-06
len = 243, trial = 0, dif = 5.9614e-06
len = 243, trial = 1, dif = 5.46705e-06
len = 243, trial = 2, dif = 8.96566e-06
len = 256, trial = 0, dif = 1.15965e-05
len = 256, trial = 1, dif = 6.85758e-06
len = 256, trial = 2, dif = 5.0958e-06
len = 288, trial = 0, dif = 1.24003e-05
len = 288, trial = 1, dif = 8.42293e-06
len = 288, trial = 2, dif = 1.08131e-05
len = 324, trial = 0, dif = 6.82868e-06
len = 324, trial = 1, dif = 1.09608e-05
len = 324, trial = 2, dif = 1.11663e-05
len = 384, trial = 0, dif = 6.61297e-06
len = 384, trial = 1, dif = 2.31987e-05
len = 384, trial = 2, dif = 9.79988e-06
len = 432, trial = 0, dif = 1.82128e-05
len = 432, trial = 1, dif = 2.16808e-05
len = 432, trial = 2, dif = 1.35217e-05
len = 486, trial = 0, dif = 1.19002e-05
len = 486, trial = 1, dif = 9.25769e-06
len = 486, trial = 2, dif = 1.14542e-05
len = 512, trial = 0, dif = 7.7354e-06
len = 512, trial = 1, dif = 8.70772e-06
len = 512, trial = 2, dif = 1.67701e-05
len = 576, trial = 0, dif = 2.12049e-05
len = 576, trial = 1, dif = 1.80019e-05
len = 576, trial = 2, dif = 1.81657e-05
len = 648, trial = 0, dif = 2.52407e-05
len = 648, trial = 1, dif = 1.60892e-05
len = 648, trial = 2, dif = 2.56206e-05
len = 729, trial = 0, dif = 1.24652e-05
len = 729, trial = 1, dif = 2.3825e-05
len = 729, trial = 2, dif = 1.2219e-05
len = 768, trial = 0, dif = 2.42783e-05
len = 768, trial = 1, dif = 1.72344e-05
len = 768, trial = 2, dif = 2.45952e-05
len = 864, trial = 0, dif = 3.24563e-05
len = 864, trial = 1, dif = 2.2366e-05
len = 864, trial = 2, dif = 2.00084e-05
len = 972, trial = 0, dif = 3.12402e-05
len = 972, trial = 1, dif = 2.28148e-05
len = 972, trial = 2, dif = 3.02124e-05
len = 1024, trial = 0, dif = 4.21749e-05
len = 1024, trial = 1, dif = 2.85143e-05
len = 1024, trial = 2, dif = 3.80336e-05