diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c index 562ebc9..de1af3e 100644 --- a/pocketfft/pocketfft.c +++ b/pocketfft/pocketfft.c @@ -31,15 +31,6 @@ #define WARN_UNUSED_RESULT #endif -#if 0 -static void fracsincos(size_t m, size_t n, double *restrict res) - { - static const long double twopi=6.283185307179586476925286766559006L; - long double arg = twopi*(long double)m/((long double)n); - res[0] = (double)cosl(arg); res[1] = (double)sinl(arg); - } -#endif - // adapted from https://stackoverflow.com/questions/42792939/ // CAUTION: this function only works for arguments in the range [-0.25; 0.25]! static void my_sincosm1pi (double a, double *restrict res) @@ -162,26 +153,33 @@ NOINLINE static void fill_first_half(size_t n, double * restrict res) { size_t half = n>>1; if ((n&3)==0) - { res[half] = 0.; res[half+1] = 1.; } - for (size_t i=2, j=2*half-2; i= n */ -static size_t good_size(size_t n) +/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ +NOINLINE static size_t good_size(size_t n) { if (n<=6) return n; @@ -277,16 +279,17 @@ typedef struct cfftp_plan_i typedef struct cfftp_plan_i * cfftp_plan; #define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } -#define MPC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; } #define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } #define SCALEC(a,b) { a.r*=b; a.i*=b; } -#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } +#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } +#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; } #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] #define WA(x,i) wa[(i)-1+(x)*(ido-1)] /* a = b*c */ -#define MULPMC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; } -#define MULMPC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; } +#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; } +/* a = conj(b)*c*/ +#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; } #define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; } /* a = b*c */ @@ -294,8 +297,8 @@ typedef struct cfftp_plan_i * cfftp_plan; /* a *= b */ #define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; } -NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc, - cmplx * restrict ch, const cmplx * restrict wa, const int sign) +NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc, + cmplx * restrict ch, const cmplx * restrict wa) { const size_t cdim=2; @@ -310,7 +313,28 @@ NOINLINE static void pass2 (size_t ido, size_t l1, const cmplx * restrict cc, { cmplx t; PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k)) - MULPMSIGNC (CH(i,k,1),WA(0,i),t) + A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t) + } + } + } + +NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc, + cmplx * restrict ch, const cmplx * restrict wa) + { + const size_t cdim=2; + + if (ido==1) + for (size_t k=0; k0) - for (size_t k=0; k0) - for (size_t i=1; ilength==1) return 0; @@ -764,10 +883,18 @@ WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct, size_t ip=plan->fct[k1].fct; size_t l2=ip*l1; size_t ido = len/l2; - if (ip==4) pass4 (ido, l1, p1, p2, plan->fct[k1].tw, sign); - else if(ip==2) pass2 (ido, l1, p1, p2, plan->fct[k1].tw, sign); - else if(ip==3) pass3 (ido, l1, p1, p2, plan->fct[k1].tw, sign); - else if(ip==5) pass5 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + if (ip==4) + sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass4f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==2) + sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass2f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==3) + sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass3f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==5) + sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass5f (ido, l1, p1, p2, plan->fct[k1].tw); else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign); else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign); else @@ -802,29 +929,28 @@ WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct, } #undef PMSIGNC -#undef MULPMC -#undef MULMPC +#undef A_EQ_B_MUL_C +#undef A_EQ_CB_MUL_C #undef MULPMSIGNC #undef MULPMSIGNCEQ #undef WA #undef CC #undef CH -#undef CONJFLIPC +#undef ROT90 #undef SCALEC #undef ADDC -#undef MPC #undef PMC -WARN_UNUSED_RESULT +NOINLINE WARN_UNUSED_RESULT static int cfftp_forward(cfftp_plan plan, double c[], double fct) { return pass_all(plan,(cmplx *)c, fct, -1); } -WARN_UNUSED_RESULT +NOINLINE WARN_UNUSED_RESULT static int cfftp_backward(cfftp_plan plan, double c[], double fct) { return pass_all(plan,(cmplx *)c, fct, 1); } -WARN_UNUSED_RESULT +NOINLINE WARN_UNUSED_RESULT static int cfftp_factorize (cfftp_plan plan) { size_t length=plan->length; @@ -856,7 +982,7 @@ static int cfftp_factorize (cfftp_plan plan) return 0; } -static size_t cfftp_twsize (cfftp_plan plan) +NOINLINE static size_t cfftp_twsize (cfftp_plan plan) { size_t twsize=0, l1=1; for (size_t k=0; knfct; ++k) @@ -870,7 +996,7 @@ static size_t cfftp_twsize (cfftp_plan plan) return twsize; } -WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan) +NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan) { size_t length=plan->length; double *twid = RALLOC(double, 2*length); @@ -1685,12 +1811,12 @@ static size_t rfftp_twsize(rfftp_plan plan) return 0; } -WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan) +WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan) { size_t length=plan->length; double *twid = RALLOC(double, 2*length); if (!twid) return -1; - sincos_2pibyn(length, twid); + sincos_2pibyn_half(length, twid); size_t l1=1; double *ptr=plan->mem; for (size_t k=0; knfct; ++k) @@ -1709,10 +1835,14 @@ WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan) if (ip>5) // special factors required by *g functions { plan->fct[k].tws=ptr; ptr+=2*ip; - for (size_t i=0; ifct[k].tws[0] = 1.; + plan->fct[k].tws[1] = 0.; + for (size_t i=1; i<=(ip>>1); ++i) { plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)]; plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; + plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; + plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; } } l1*=ip; @@ -1721,7 +1851,7 @@ WARN_UNUSED_RESULT static int rfftp_comp_twiddle (rfftp_plan plan) return 0; } -static rfftp_plan make_rfftp_plan (size_t length) +NOINLINE static rfftp_plan make_rfftp_plan (size_t length) { if (length==0) return NULL; rfftp_plan plan = RALLOC(rfftp_plan_i,1); @@ -1741,7 +1871,7 @@ static rfftp_plan make_rfftp_plan (size_t length) return plan; } -static void destroy_rfftp_plan (rfftp_plan plan) +NOINLINE static void destroy_rfftp_plan (rfftp_plan plan) { DEALLOC(plan->mem); DEALLOC(plan); @@ -1756,7 +1886,7 @@ typedef struct fftblue_plan_i } fftblue_plan_i; typedef struct fftblue_plan_i * fftblue_plan; -static fftblue_plan make_fftblue_plan (size_t length) +NOINLINE static fftblue_plan make_fftblue_plan (size_t length) { fftblue_plan plan = RALLOC(fftblue_plan_i,1); if (!plan) return NULL; @@ -1804,14 +1934,14 @@ static fftblue_plan make_fftblue_plan (size_t length) return plan; } -static void destroy_fftblue_plan (fftblue_plan plan) +NOINLINE static void destroy_fftblue_plan (fftblue_plan plan) { DEALLOC(plan->mem); destroy_cfftp_plan(plan->plan); DEALLOC(plan); } -WARN_UNUSED_RESULT +NOINLINE WARN_UNUSED_RESULT static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct) { size_t n=plan->n;