Radix4時間間引きFFT

基数4の時間間引きの高速フーリエ変換
ちょっと汚いソースですが、資料があまりないので誰かの役に立つことを祈って
ちなみに速度は大浦氏のライブラリのほうが速いので、あくまでもLarningですが。

template <unsigned long N,long isgn>
struct W
{
    W()
    {
        double theta=isgn>0?-2*M_PI/N:2*M_PI/N;
        for(unsigned long loop=0;loop<N;++loop)
        {
            cos[loop]=::cos(theta*loop);
            sin[loop]=::sin(theta*loop);
        }
    }
    _declspec(align(16)) double cos[N];
    _declspec(align(16)) double sin[N];
};

template <unsigned long N>
struct B
{
    B()
    {
        int k,i=0;
        for(int j=1;j<N-1;++j)
        {
            for(k=N>>1;k>(i^=k);k>>=1);
            bit_res[j]=i;
        }
    }
    _declspec(align(16)) unsigned long bit_res[N];
};

//Radix4時間間引き
template <unsigned long n,long isgn>
void fftt(double* a)
{
    int m, mq, i, j, j1, j2, j3;
    double w1r, w1i, w2r, w2i, w3r, w3i;
    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    static W<n,isgn> Wn;
    static B<n> Bn;

    ///* ---- unscrambler ---- */
    for (j = 1; j < n - 1; j++) {
        i=Bn.bit_res[j];
        if (j < i) {
            x0r = a[j*2];
            x0i = a[j*2+1];
            a[j*2] = a[i*2];
            a[j*2+1] = a[i*2+1];
            a[i*2] = x0r;
            a[i*2+1] = x0i;
        }
    }

    unsigned long ldm=(n&0xAAAAAAAA)?1:0;

    /* ---- radix 2 butterflies (n % 4 == 2) ---- */
    if(ldm!=0)
    {
        for(j=0;j<n;j+=2)
        {
            x0r=a[j*2]-a[j*2+2];
            x0i=a[j*2+1]-a[j*2+3];
            a[j*2]+=a[j*2+2];
            a[j*2+1]+=a[j*2+3];
            a[j*2+2]=x0r;
            a[j*2+3]=x0i;
        }
    }

    ///* ---- radix 4 butterflies ---- */
    unsigned long theta=n/(1<<ldm);
    for(mq=(1<<ldm);(m=mq<<2)<=n;mq=m)
    {
        theta >>= 2;
        for(i=0;i<mq;++i)
        {
            w1r=Wn.cos[theta*i];
            w1i=Wn.sin[theta*i];
            w2r=Wn.cos[theta*i*2];
            w2i=Wn.sin[theta*i*2];
            w3r=Wn.cos[theta*i*3];
            w3i=Wn.sin[theta*i*3];

            for (j=i;j<n;j+=m)
            {
                j1=j+mq;
                j2=j1+mq;
                j3=j2+mq;

#ifndef COMPLEX_FFT
                x0r=a[j*2];
                x0i=a[j*2+1];

                x1r=a[j2*2]*w1r-a[j2*2+1]*w1i;//!!
                x1i=a[j2*2+1]*w1r+a[j2*2]*w1i;

                x2r=a[j1*2]*w2r-a[j1*2+1]*w2i;//!!
                x2i=a[j1*2+1]*w2r+a[j1*2]*w2i;

                x3r=a[j3*2]*w3r-a[j3*2+1]*w3i;
                x3i=a[j3*2+1]*w3r+a[j3*2]*w3i;                

                //(a0+a2) + (a1+a3);
                a[j*2]=(x0r+x2r)+(x1r+x3r);
                a[j*2+1]=(x0i+x2i)+(x1i+x3i);
                //(a0+a2) - (a1+a3);
                a[j2*2]=(x0r+x2r)-(x1r+x3r);
                a[j2*2+1]=(x0i+x2i)-(x1i+x3i);
                //(a0-a2) - Complex(0,is) * (a1-a3)
                a[j1*2]=(x0r-x2r)+(x1i-x3i)*(double)isgn;
                a[j1*2+1]=(x0i-x2i)-(x1r-x3r)*(double)isgn;
                //(a0-a2) + Complex(0,is) * (a1-a3)
                a[j3*2]=(x0r-x2r)-(x1i-x3i)*(double)isgn;
                a[j3*2+1]=(x0i-x2i)+(x1r-x3r)*(double)isgn;

#else
                Complex a0(a[j*2],a[j*2+1]);
                Complex a1(a[j2*2],a[j2*2+1]);//!!
                Complex a2(a[j1*2],a[j1*2+1]);//!!
                Complex a3(a[j3*2],a[j3*2+1]);
                Complex w1(w1r,w1i);
                Complex w2(w2r,w2i);
                Complex w3(w3r,w3i);

                a1=a1*w1;
                a2=a2*w2;
                a3=a3*w3;

                Complex t0 = (a0+a2) + (a1+a3);
                Complex t2 = (a0+a2) - (a1+a3);

                Complex t1 = (a0-a2) - Complex(0,(double)isgn) * (a1-a3);
                Complex t3 = (a0-a2) + Complex(0,(double)isgn) * (a1-a3);

                a[j*2] = t0.r;
                a[j*2+1] = t0.i;
                a[j1*2] = t1.r;
                a[j1*2+1] = t1.i;
                a[j2*2] = t2.r;
                a[j2*2+1] = t2.i;
                a[j3*2] = t3.r;
                a[j3*2+1] = t3.i;
#endif //COMPLEX_FFT
            }
        }
    }
}