From 8c854b705e191343b4096407140846f36a70499b Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 7 May 2019 13:56:08 +0200 Subject: [PATCH] prepare simple Fortran interface --- Makefile.am | 4 +- configure.ac | 5 +- fortran/sharp.f90 | 242 +++++ fortran/test_sharp.f90 | 56 + pocketfft/pocketfft.c | 2194 ++++++++++++++++++++++++++++++++++++++++ pocketfft/pocketfft.h | 50 + 6 files changed, 2546 insertions(+), 5 deletions(-) create mode 100644 fortran/sharp.f90 create mode 100644 fortran/test_sharp.f90 create mode 100644 pocketfft/pocketfft.c create mode 100644 pocketfft/pocketfft.h diff --git a/Makefile.am b/Makefile.am index 4eb08c6..b172233 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,6 +3,8 @@ ACLOCAL_AMFLAGS = -I m4 lib_LTLIBRARIES = libsharp.la libsharp_la_SOURCES = \ + pocketfft/pocketfft.c \ + pocketfft/pocketfft.h \ libsharp/sharp_utils.c \ libsharp/sharp_utils.h \ libsharp/sharp.c \ @@ -57,7 +59,7 @@ EXTRA_DIST = \ check_PROGRAMS = sharp_testsuite sharp_testsuite_SOURCES = test/sharp_testsuite.c test/memusage.c test/memusage.h -sharp_testsuite_LDADD = libsharp.la @POCKETFFT_LIBS@ +sharp_testsuite_LDADD = libsharp.la -lm TESTS = runtest.sh diff --git a/configure.ac b/configure.ac index f636f83..b020181 100644 --- a/configure.ac +++ b/configure.ac @@ -24,14 +24,11 @@ AC_PROG_CC_C99 AC_OPENMP AC_PROG_LIBTOOL -PKG_CHECK_MODULES([POCKETFFT], [pocketfft], , [POCKETFFT_LIBS="-lpocketfft -lm"]) - tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'` AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) -AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS $POCKETFFT_CFLAGS" +AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS" -LIBS=$POCKETFFT_LIBS PACKAGE_LIBS="-lsharp" PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS" PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS" diff --git a/fortran/sharp.f90 b/fortran/sharp.f90 new file mode 100644 index 0000000..d6b5eee --- /dev/null +++ b/fortran/sharp.f90 @@ -0,0 +1,242 @@ +module sharp + use iso_c_binding + implicit none + ! alm_info flags + integer, parameter :: SHARP_PACKED = 1 + + ! sharp job types + enum, bind(c) + enumerator :: SHARP_YtW = 0 + enumerator :: SHARP_Y = 1 + enumerator :: SHARP_Yt = 2 + enumerator :: SHARP_WY = 3 + enumerator :: SHARP_ALM2MAP_DERIV1 = 4 + end enum + + ! sharp job flags + integer, parameter :: SHARP_DP = ISHFT(1, 4) + integer, parameter :: SHARP_ADD = ISHFT(1, 5) + integer, parameter :: SHARP_REAL_HARMONICS = ISHFT(1, 6) + integer, parameter :: SHARP_NO_FFT = ISHFT(1, 7) + + type sharp_geom_info + type(c_ptr) :: handle + integer(c_intptr_t) :: n_local + end type sharp_geom_info + + type sharp_alm_info + type(c_ptr) :: handle + integer(c_intptr_t) :: n_local + end type sharp_alm_info + + interface + + ! alm_info + subroutine sharp_make_general_alm_info( & + lmax, nm, stride, mval, mvstart, flags, alm_info) bind(c) + use iso_c_binding + integer(c_int), value, intent(in) :: lmax, nm, stride, flags + integer(c_int), intent(in) :: mval(nm) + integer(c_intptr_t), intent(in) :: mvstart(nm) + type(c_ptr), intent(out) :: alm_info + end subroutine sharp_make_general_alm_info + + subroutine c_sharp_make_mmajor_real_packed_alm_info( & + lmax, stride, nm, ms, alm_info) bind(c, name='sharp_make_mmajor_real_packed_alm_info') + use iso_c_binding + integer(c_int), value, intent(in) :: lmax, nm, stride + integer(c_int), intent(in), optional :: ms(nm) + type(c_ptr), intent(out) :: alm_info + end subroutine c_sharp_make_mmajor_real_packed_alm_info + + function c_sharp_alm_count(alm_info) bind(c, name='sharp_alm_count') + use iso_c_binding + integer(c_intptr_t) :: c_sharp_alm_count + type(c_ptr), value, intent(in) :: alm_info + end function c_sharp_alm_count + + subroutine c_sharp_destroy_alm_info(alm_info) bind(c, name='sharp_destroy_alm_info') + use iso_c_binding + type(c_ptr), value :: alm_info + end subroutine c_sharp_destroy_alm_info + + ! geom_info + subroutine sharp_make_subset_healpix_geom_info ( & + nside, stride, nrings, rings, weight, geom_info) bind(c) + use iso_c_binding + integer(c_int), value, intent(in) :: nside, stride, nrings + integer(c_int), intent(in), optional :: rings(nrings) + real(c_double), intent(in), optional :: weight(2 * nside) + type(c_ptr), intent(out) :: geom_info + end subroutine sharp_make_subset_healpix_geom_info + + subroutine c_sharp_destroy_geom_info(geom_info) bind(c, name='sharp_destroy_geom_info') + use iso_c_binding + type(c_ptr), value :: geom_info + end subroutine c_sharp_destroy_geom_info + + function c_sharp_map_size(info) bind(c, name='sharp_map_size') + use iso_c_binding + integer(c_intptr_t) :: c_sharp_map_size + type(c_ptr), value :: info + end function c_sharp_map_size + + + ! execute + subroutine c_sharp_execute(type, spin, alm, map, geom_info, alm_info, & + flags, time, opcnt) bind(c, name='sharp_execute') + use iso_c_binding + integer(c_int), value :: type, spin, flags + type(c_ptr), value :: alm_info, geom_info + real(c_double), intent(out), optional :: time + integer(c_long_long), intent(out), optional :: opcnt + type(c_ptr), intent(in) :: alm(*), map(*) + end subroutine c_sharp_execute + + subroutine c_sharp_execute_mpi(comm, type, spin, alm, map, geom_info, alm_info, & + flags, time, opcnt) bind(c, name='sharp_execute_mpi_fortran') + use iso_c_binding + integer(c_int), value :: comm, type, spin, flags + type(c_ptr), value :: alm_info, geom_info + real(c_double), intent(out), optional :: time + integer(c_long_long), intent(out), optional :: opcnt + type(c_ptr), intent(in) :: alm(*), map(*) + end subroutine c_sharp_execute_mpi + end interface + + interface sharp_execute + module procedure sharp_execute_d + end interface + +contains + ! alm info + + ! if ms is not passed, we default to using m=0..lmax. + subroutine sharp_make_mmajor_real_packed_alm_info(lmax, ms, alm_info) + use iso_c_binding + integer(c_int), value, intent(in) :: lmax + integer(c_int), intent(in), optional :: ms(:) + type(sharp_alm_info), intent(out) :: alm_info + !-- + integer(c_int), allocatable :: ms_copy(:) + integer(c_int) :: nm + + if (present(ms)) then + nm = size(ms) + allocate(ms_copy(nm)) + ms_copy = ms + call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, nm, ms_copy, alm_info=alm_info%handle) + deallocate(ms_copy) + else + call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, lmax + 1, alm_info=alm_info%handle) + end if + alm_info%n_local = c_sharp_alm_count(alm_info%handle) + end subroutine sharp_make_mmajor_real_packed_alm_info + + subroutine sharp_destroy_alm_info(alm_info) + use iso_c_binding + type(sharp_alm_info), intent(inout) :: alm_info + call c_sharp_destroy_alm_info(alm_info%handle) + alm_info%handle = c_null_ptr + end subroutine sharp_destroy_alm_info + + + ! geom info + subroutine sharp_make_healpix_geom_info(nside, rings, weight, geom_info) + integer(c_int), value :: nside + integer(c_int), optional :: rings(:) + real(c_double), intent(in), optional :: weight(2 * nside) + type(sharp_geom_info), intent(out) :: geom_info + !-- + integer(c_int) :: nrings + integer(c_int), allocatable :: rings_copy(:) + + if (present(rings)) then + nrings = size(rings) + allocate(rings_copy(nrings)) + rings_copy = rings + call sharp_make_subset_healpix_geom_info(nside, 1, nrings, rings_copy, & + weight, geom_info%handle) + deallocate(rings_copy) + else + call sharp_make_subset_healpix_geom_info(nside, 1, nrings=4 * nside - 1, & + weight=weight, geom_info=geom_info%handle) + end if + geom_info%n_local = c_sharp_map_size(geom_info%handle) + end subroutine sharp_make_healpix_geom_info + + subroutine sharp_destroy_geom_info(geom_info) + use iso_c_binding + type(sharp_geom_info), intent(inout) :: geom_info + call c_sharp_destroy_geom_info(geom_info%handle) + geom_info%handle = c_null_ptr + end subroutine sharp_destroy_geom_info + + + ! Currently the only mode supported is stacked (not interleaved) maps. + ! + ! Note that passing the exact dimension of alm/map is necessary, it + ! prevents the caller from doing too crazy slicing prior to pass array + ! in... + ! + ! Usage: + ! + ! The alm array must have shape exactly alm(alm_info%n_local, nmaps) + ! The maps array must have shape exactly map(map_info%n_local, nmaps). + subroutine sharp_execute_d(type, spin, nmaps, alm, alm_info, map, geom_info, & + add, time, opcnt, comm) + use iso_c_binding + use mpi + implicit none + integer(c_int), value :: type, spin, nmaps + integer(c_int), optional :: comm + logical, value, optional :: add ! should add instead of replace out + + type(sharp_alm_info) :: alm_info + type(sharp_geom_info) :: geom_info + real(c_double), intent(out), optional :: time + integer(c_long_long), intent(out), optional :: opcnt + real(c_double), target, intent(inout) :: alm(0:alm_info%n_local - 1, 1:nmaps) + real(c_double), target, intent(inout) :: map(0:geom_info%n_local - 1, 1:nmaps) + !-- + integer(c_int) :: mod_flags, ntrans, k + type(c_ptr), target :: alm_ptr(nmaps) + type(c_ptr), target :: map_ptr(nmaps) + + mod_flags = SHARP_DP + if (present(add) .and. add) then + mod_flags = or(mod_flags, SHARP_ADD) + end if + + if (spin == 0) then + ntrans = nmaps + else + ntrans = nmaps / 2 + end if + if (ntrans/=1) print *, "ERROR: ntrans /= 1" + + ! Set up pointer table to access maps + alm_ptr(:) = c_null_ptr + map_ptr(:) = c_null_ptr + do k = 1, nmaps + if (alm_info%n_local > 0) alm_ptr(k) = c_loc(alm(0, k)) + if (geom_info%n_local > 0) map_ptr(k) = c_loc(map(0, k)) + end do + + if (present(comm)) then + call c_sharp_execute_mpi(comm, type, spin, alm_ptr, map_ptr, & + geom_info=geom_info%handle, & + alm_info=alm_info%handle, & + flags=mod_flags, & + time=time, & + opcnt=opcnt) + else + call c_sharp_execute(type, spin, alm_ptr, map_ptr, & + geom_info=geom_info%handle, & + alm_info=alm_info%handle, & + flags=mod_flags, & + time=time, & + opcnt=opcnt) + end if + end subroutine sharp_execute_d +end module diff --git a/fortran/test_sharp.f90 b/fortran/test_sharp.f90 new file mode 100644 index 0000000..78e6c16 --- /dev/null +++ b/fortran/test_sharp.f90 @@ -0,0 +1,56 @@ +program test_sharp + use mpi + use sharp + use iso_c_binding, only : c_ptr, c_double + implicit none + + integer, parameter :: lmax = 2, nside = 2 + type(sharp_alm_info) :: alm_info + type(sharp_geom_info) :: geom_info + + real(c_double), dimension(0:(lmax + 1)**2 - 1, 1:1) :: alm + real(c_double), dimension(0:12*nside**2 - 1, 1:1) :: map + + integer(c_int), dimension(1:lmax + 1) :: ms + integer(c_int), dimension(1:4 * nside - 1) :: rings + integer(c_int) :: nm, m, nrings, iring + integer :: nodecount, rank, ierr + + call MPI_Init(ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nodecount, ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) + + nm = 0 + do m = rank, lmax, nodecount + nm = nm + 1 + ms(nm) = m + end do + + nrings = 0 + do iring = rank + 1, 4 * nside - 1, nodecount + nrings = nrings + 1 + rings(nrings) = iring + end do + + alm = 0 + map = 0 + if (rank == 0) then + alm(0, 1) = 1 + end if + + print *, ms(1:nm) + call sharp_make_mmajor_real_packed_alm_info(lmax, ms=ms(1:nm), alm_info=alm_info) + print *, 'alm_info%n_local', alm_info%n_local + call sharp_make_healpix_geom_info(nside, rings=rings(1:nrings), geom_info=geom_info) + print *, 'geom_info%n_local', geom_info%n_local + print *, 'execute' + call sharp_execute(SHARP_Y, 0, 1, alm, alm_info, map, geom_info, comm=MPI_COMM_WORLD) + + print *, alm + print *, map + + call sharp_destroy_alm_info(alm_info) + call sharp_destroy_geom_info(geom_info) + print *, 'DONE' + call MPI_Finalize(ierr) +end program test_sharp diff --git a/pocketfft/pocketfft.c b/pocketfft/pocketfft.c new file mode 100644 index 0000000..56c9dc5 --- /dev/null +++ b/pocketfft/pocketfft.c @@ -0,0 +1,2194 @@ +/* + * This file is part of pocketfft. + * Licensed under a 3-clause BSD style license - see LICENSE.md + */ + +/* + * Main implementation file. + * + * Copyright (C) 2004-2019 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include + +#include "pocketfft/pocketfft.h" + +#define RALLOC(type,num) \ + ((type *)malloc((num)*sizeof(type))) +#define DEALLOC(ptr) \ + do { free(ptr); (ptr)=NULL; } while(0) + +#define SWAP(a,b,type) \ + do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0) + +#ifdef __GNUC__ +#define NOINLINE __attribute__((noinline)) +#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result)) +#else +#define NOINLINE +#define WARN_UNUSED_RESULT +#endif + +// adapted from https://stackoverflow.com/questions/42792939/ +// CAUTION: this function only works for arguments in the range [-0.25; 0.25]! +static void my_sincosm1pi (double a, double *restrict res) + { + double s = a * a; + /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ + double r = -1.0369917389758117e-4; + r = fma (r, s, 1.9294935641298806e-3); + r = fma (r, s, -2.5806887942825395e-2); + r = fma (r, s, 2.3533063028328211e-1); + r = fma (r, s, -1.3352627688538006e+0); + r = fma (r, s, 4.0587121264167623e+0); + r = fma (r, s, -4.9348022005446790e+0); + double c = r*s; + /* Approximate sin(pi*x) for x in [-0.25,0.25] */ + r = 4.6151442520157035e-4; + r = fma (r, s, -7.3700183130883555e-3); + r = fma (r, s, 8.2145868949323936e-2); + r = fma (r, s, -5.9926452893214921e-1); + r = fma (r, s, 2.5501640398732688e+0); + r = fma (r, s, -5.1677127800499516e+0); + s = s * a; + r = r * s; + s = fma (a, 3.1415926535897931e+0, r); + res[0] = c; + res[1] = s; + } + +NOINLINE static void calc_first_octant(size_t den, double * restrict res) + { + size_t n = (den+4)>>3; + if (n==0) return; + res[0]=1.; res[1]=0.; + if (n==1) return; + size_t l1=(size_t)sqrt(n); + for (size_t i=1; in) end = n-start; + for (size_t i=1; i>2; + size_t i=0, idx1=0, idx2=2*ndone-2; + for (; i+1>1; + double * p = res+n-1; + calc_first_octant(n<<2, p); + int i4=0, in=n, i=0; + for (; i4<=in-i4; ++i, i4+=4) // octant 0 + { + res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; + } + for (; i4-in <= 0; ++i, i4+=4) // octant 1 + { + int xm = in-i4; + res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; + } + for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 + { + int xm = i4-in; + res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; + } + for (; i>2; + if ((n&7)==0) + res[quart] = res[quart+1] = hsqt2; + for (size_t i=2, j=2*quart-2; i>1; + if ((n&3)==0) + for (size_t i=0; i>1))<<1)==n) + { res=2; n=tmp; } + + size_t limit=(size_t)sqrt(n+0.01); + for (size_t x=3; x<=limit; x+=2) + while (((tmp=(n/x))*x)==n) + { + res=x; + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) res=n; + + return res; + } + +NOINLINE static double cost_guess (size_t n) + { + const double lfp=1.1; // penalty for non-hardcoded larger factors + size_t ni=n; + double result=0.; + size_t tmp; + while (((tmp=(n>>1))<<1)==n) + { result+=2; n=tmp; } + + size_t limit=(size_t)sqrt(n+0.01); + for (size_t x=3; x<=limit; x+=2) + while ((tmp=(n/x))*x==n) + { + result+= (x<=5) ? x : lfp*x; // penalize larger prime factors + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) result+=(n<=5) ? n : lfp*n; + + return result*ni; + } + +/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ +NOINLINE static size_t good_size(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f2=1; f2=n) bestfac=f235711; + return bestfac; + } + +typedef struct cmplx { + double r,i; +} cmplx; + +#define NFCT 25 +typedef struct cfftp_fctdata + { + size_t fct; + cmplx *tw, *tws; + } cfftp_fctdata; + +typedef struct cfftp_plan_i + { + size_t length, nfct; + cmplx *mem; + cfftp_fctdata fct[NFCT]; + } cfftp_plan_i; +typedef struct cfftp_plan_i * cfftp_plan; + +#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } +#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } +#define SCALEC(a,b) { a.r*=b; a.i*=b; } +#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } +#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; } +#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] +#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] +#define WA(x,i) wa[(i)-1+(x)*(ido-1)] +/* a = b*c */ +#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; } +/* a = conj(b)*c*/ +#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; } + +#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; } +/* a = b*c */ +#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; } +/* a *= b */ +#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; } + +NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc, + cmplx * restrict ch, const cmplx * restrict wa) + { + const size_t cdim=2; + + if (ido==1) + for (size_t k=0; kip) iwal-=ip; + cmplx xwal=wal[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + cmplx xwal2=wal[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + cmplx xwal=wal[iwal]; + for (size_t ik=0; iklength==1) return 0; + size_t len=plan->length; + size_t l1=1, nf=plan->nfct; + cmplx *ch = RALLOC(cmplx, len); + if (!ch) return -1; + cmplx *p1=c, *p2=ch; + + for(size_t k1=0; k1fct[k1].fct; + size_t l2=ip*l1; + size_t ido = len/l2; + if (ip==4) + sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass4f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==2) + sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass2f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==3) + sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass3f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==5) + sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass5f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign); + else + { + if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign)) + { DEALLOC(ch); return -1; } + SWAP(p1,p2,cmplx *); + } + SWAP(p1,p2,cmplx *); + l1=l2; + } + if (p1!=c) + { + if (fct!=1.) + for (size_t i=0; ilength; + size_t nfct=0; + while ((length%4)==0) + { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } + if ((length%2)==0) + { + length>>=1; + // factor 2 should be at the front of the factor list + if (nfct>=NFCT) return -1; + plan->fct[nfct++].fct=2; + SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); + } + size_t maxl=(size_t)(sqrt((double)length))+1; + for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; + plan->fct[nfct++].fct=divisor; + length/=divisor; + } + maxl=(size_t)(sqrt((double)length))+1; + } + if (length>1) plan->fct[nfct++].fct=length; + plan->nfct=nfct; + return 0; + } + +NOINLINE static size_t cfftp_twsize (cfftp_plan plan) + { + size_t twsize=0, l1=1; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); + twsize+=(ip-1)*(ido-1); + if (ip>11) + twsize+=ip; + l1*=ip; + } + return twsize; + } + +NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan) + { + size_t length=plan->length; + double *twid = RALLOC(double, 2*length); + if (!twid) return -1; + sincos_2pibyn(length, twid); + size_t l1=1; + size_t memofs=0; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= length/(l1*ip); + plan->fct[k].tw=plan->mem+memofs; + memofs+=(ip-1)*(ido-1); + for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i]; + plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1]; + } + if (ip>11) + { + plan->fct[k].tws=plan->mem+memofs; + memofs+=ip; + for (size_t j=0; jfct[k].tws[j].r = twid[2*j*l1*ido]; + plan->fct[k].tws[j].i = twid[2*j*l1*ido+1]; + } + } + l1*=ip; + } + DEALLOC(twid); + return 0; + } + +static cfftp_plan make_cfftp_plan (size_t length) + { + if (length==0) return NULL; + cfftp_plan plan = RALLOC(cfftp_plan_i,1); + if (!plan) return NULL; + plan->length=length; + plan->nfct=0; + for (size_t i=0; ifct[i]=(cfftp_fctdata){0,0,0}; + plan->mem=0; + if (length==1) return plan; + if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } + size_t tws=cfftp_twsize(plan); + plan->mem=RALLOC(cmplx,tws); + if (!plan->mem) { DEALLOC(plan); return NULL; } + if (cfftp_comp_twiddle(plan)!=0) + { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + return plan; + } + +static void destroy_cfftp_plan (cfftp_plan plan) + { + DEALLOC(plan->mem); + DEALLOC(plan); + } + +typedef struct rfftp_fctdata + { + size_t fct; + double *tw, *tws; + } rfftp_fctdata; + +typedef struct rfftp_plan_i + { + size_t length, nfct; + double *mem; + rfftp_fctdata fct[NFCT]; + } rfftp_plan_i; +typedef struct rfftp_plan_i * rfftp_plan; + +#define WA(x,i) wa[(i)+(x)*(ido-1)] +#define PM(a,b,c,d) { a=c+d; b=c-d; } +/* (a+ib) = conj(c+id) * (e+if) */ +#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } + +#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] +#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] + +NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc, + double * restrict ch, const double * restrict wa) + { + const size_t cdim=2; + + for (size_t k=0; k1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + double ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; iklength==1) return 0; + size_t n=plan->length; + size_t l1=n, nf=plan->nfct; + double *ch = RALLOC(double, n); + if (!ch) return -1; + double *p1=c, *p2=ch; + + for(size_t k1=0; k1fct[k].fct; + size_t ido=n / l1; + l1 /= ip; + if(ip==4) + radf4(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==2) + radf2(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==3) + radf3(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==5) + radf5(ido, l1, p1, p2, plan->fct[k].tw); + else + { + radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); + SWAP (p1,p2,double *); + } + SWAP (p1,p2,double *); + } + copy_and_norm(c,p1,n,fct); + DEALLOC(ch); + return 0; + } + +WARN_UNUSED_RESULT +static int rfftp_backward(rfftp_plan plan, double c[], double fct) + { + if (plan->length==1) return 0; + size_t n=plan->length; + size_t l1=1, nf=plan->nfct; + double *ch = RALLOC(double, n); + if (!ch) return -1; + double *p1=c, *p2=ch; + + for(size_t k=0; kfct[k].fct, + ido= n/(ip*l1); + if(ip==4) + radb4(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==2) + radb2(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==3) + radb3(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==5) + radb5(ido, l1, p1, p2, plan->fct[k].tw); + else + radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); + SWAP (p1,p2,double *); + l1*=ip; + } + copy_and_norm(c,p1,n,fct); + DEALLOC(ch); + return 0; + } + +WARN_UNUSED_RESULT +static int rfftp_factorize (rfftp_plan plan) + { + size_t length=plan->length; + size_t nfct=0; + while ((length%4)==0) + { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } + if ((length%2)==0) + { + length>>=1; + // factor 2 should be at the front of the factor list + if (nfct>=NFCT) return -1; + plan->fct[nfct++].fct=2; + SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); + } + size_t maxl=(size_t)(sqrt((double)length))+1; + for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; + plan->fct[nfct++].fct=divisor; + length/=divisor; + } + maxl=(size_t)(sqrt((double)length))+1; + } + if (length>1) plan->fct[nfct++].fct=length; + plan->nfct=nfct; + return 0; + } + +static size_t rfftp_twsize(rfftp_plan plan) + { + size_t twsize=0, l1=1; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); + twsize+=(ip-1)*(ido-1); + if (ip>5) twsize+=2*ip; + l1*=ip; + } + return twsize; + return 0; + } + +WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan) + { + size_t length=plan->length; + double *twid = RALLOC(double, 2*length); + if (!twid) return -1; + sincos_2pibyn_half(length, twid); + size_t l1=1; + double *ptr=plan->mem; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido=length/(l1*ip); + if (knfct-1) // last factor doesn't need twiddles + { + plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1); + for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; + plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; + } + } + if (ip>5) // special factors required by *g functions + { + plan->fct[k].tws=ptr; ptr+=2*ip; + plan->fct[k].tws[0] = 1.; + plan->fct[k].tws[1] = 0.; + for (size_t i=1; i<=(ip>>1); ++i) + { + plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)]; + plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; + plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; + plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; + } + } + l1*=ip; + } + DEALLOC(twid); + return 0; + } + +NOINLINE static rfftp_plan make_rfftp_plan (size_t length) + { + if (length==0) return NULL; + rfftp_plan plan = RALLOC(rfftp_plan_i,1); + if (!plan) return NULL; + plan->length=length; + plan->nfct=0; + plan->mem=NULL; + for (size_t i=0; ifct[i]=(rfftp_fctdata){0,0,0}; + if (length==1) return plan; + if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } + size_t tws=rfftp_twsize(plan); + plan->mem=RALLOC(double,tws); + if (!plan->mem) { DEALLOC(plan); return NULL; } + if (rfftp_comp_twiddle(plan)!=0) + { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + return plan; + } + +NOINLINE static void destroy_rfftp_plan (rfftp_plan plan) + { + DEALLOC(plan->mem); + DEALLOC(plan); + } + +typedef struct fftblue_plan_i + { + size_t n, n2; + cfftp_plan plan; + double *mem; + double *bk, *bkf; + } fftblue_plan_i; +typedef struct fftblue_plan_i * fftblue_plan; + +NOINLINE static fftblue_plan make_fftblue_plan (size_t length) + { + fftblue_plan plan = RALLOC(fftblue_plan_i,1); + if (!plan) return NULL; + plan->n = length; + plan->n2 = good_size(plan->n*2-1); + plan->mem = RALLOC(double, 2*plan->n+2*plan->n2); + if (!plan->mem) { DEALLOC(plan); return NULL; } + plan->bk = plan->mem; + plan->bkf = plan->bk+2*plan->n; + +/* initialize b_k */ + double *tmp = RALLOC(double,4*plan->n); + if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + sincos_2pibyn(2*plan->n,tmp); + plan->bk[0] = 1; + plan->bk[1] = 0; + + size_t coeff=0; + for (size_t m=1; mn; ++m) + { + coeff+=2*m-1; + if (coeff>=2*plan->n) coeff-=2*plan->n; + plan->bk[2*m ] = tmp[2*coeff ]; + plan->bk[2*m+1] = tmp[2*coeff+1]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ + double xn2 = 1./plan->n2; + plan->bkf[0] = plan->bk[0]*xn2; + plan->bkf[1] = plan->bk[1]*xn2; + for (size_t m=2; m<2*plan->n; m+=2) + { + plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2; + plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2; + } + for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m) + plan->bkf[m]=0.; + plan->plan=make_cfftp_plan(plan->n2); + if (!plan->plan) + { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + if (cfftp_forward(plan->plan,plan->bkf,1.)!=0) + { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + DEALLOC(tmp); + + return plan; + } + +NOINLINE static void destroy_fftblue_plan (fftblue_plan plan) + { + DEALLOC(plan->mem); + destroy_cfftp_plan(plan->plan); + DEALLOC(plan); + } + +NOINLINE WARN_UNUSED_RESULT +static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct) + { + size_t n=plan->n; + size_t n2=plan->n2; + double *bk = plan->bk; + double *bkf = plan->bkf; + double *akf = RALLOC(double, 2*n2); + if (!akf) return -1; + +/* initialize a_k and FFT it */ + if (isign>0) + for (size_t m=0; m<2*n; m+=2) + { + akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1]; + akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m]; + } + else + for (size_t m=0; m<2*n; m+=2) + { + akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1]; + akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m]; + } + for (size_t m=2*n; m<2*n2; ++m) + akf[m]=0; + + if (cfftp_forward (plan->plan,akf,fct)!=0) + { DEALLOC(akf); return -1; } + +/* do the convolution */ + if (isign>0) + for (size_t m=0; m<2*n2; m+=2) + { + double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + else + for (size_t m=0; m<2*n2; m+=2) + { + double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + +/* inverse FFT */ + if (cfftp_backward (plan->plan,akf,1.)!=0) + { DEALLOC(akf); return -1; } + +/* multiply by b_k */ + if (isign>0) + for (size_t m=0; m<2*n; m+=2) + { + c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1]; + c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + else + for (size_t m=0; m<2*n; m+=2) + { + c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1]; + c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + DEALLOC(akf); + return 0; + } + +WARN_UNUSED_RESULT +static int cfftblue_backward(fftblue_plan plan, double c[], double fct) + { return fftblue_fft(plan,c,1,fct); } + +WARN_UNUSED_RESULT +static int cfftblue_forward(fftblue_plan plan, double c[], double fct) + { return fftblue_fft(plan,c,-1,fct); } + +WARN_UNUSED_RESULT +static int rfftblue_backward(fftblue_plan plan, double c[], double fct) + { + size_t n=plan->n; + double *tmp = RALLOC(double,2*n); + if (!tmp) return -1; + tmp[0]=c[0]; + tmp[1]=0.; + memcpy (tmp+2,c+1, (n-1)*sizeof(double)); + if ((n&1)==0) tmp[n+1]=0.; + for (size_t m=2; mn; + double *tmp = RALLOC(double,2*n); + if (!tmp) return -1; + for (size_t m=0; mblueplan=0; + plan->packplan=0; + if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + { + plan->packplan=make_cfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + return plan; + } + double comp1 = cost_guess(length); + double comp2 = 2*cost_guess(good_size(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2blueplan=make_fftblue_plan(length); + if (!plan->blueplan) { DEALLOC(plan); return NULL; } + } + else + { + plan->packplan=make_cfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + } + return plan; + } + +void pocketfft_delete_plan_c (pocketfft_plan_c plan) + { + if (plan->blueplan) + destroy_fftblue_plan(plan->blueplan); + if (plan->packplan) + destroy_cfftp_plan(plan->packplan); + DEALLOC(plan); + } + +WARN_UNUSED_RESULT +int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct) + { + if (plan->packplan) + return cfftp_backward(plan->packplan,c,fct); + // if (plan->blueplan) + return cfftblue_backward(plan->blueplan,c,fct); + } + +WARN_UNUSED_RESULT +int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct) + { + if (plan->packplan) + return cfftp_forward(plan->packplan,c,fct); + // if (plan->blueplan) + return cfftblue_forward(plan->blueplan,c,fct); + } + +typedef struct pocketfft_plan_r_i + { + rfftp_plan packplan; + fftblue_plan blueplan; + } pocketfft_plan_r_i; + +pocketfft_plan_r pocketfft_make_plan_r (size_t length) + { + if (length==0) return NULL; + pocketfft_plan_r plan = RALLOC(pocketfft_plan_r_i,1); + if (!plan) return NULL; + plan->blueplan=0; + plan->packplan=0; + if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + { + plan->packplan=make_rfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + return plan; + } + double comp1 = 0.5*cost_guess(length); + double comp2 = 2*cost_guess(good_size(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2blueplan=make_fftblue_plan(length); + if (!plan->blueplan) { DEALLOC(plan); return NULL; } + } + else + { + plan->packplan=make_rfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + } + return plan; + } + +void pocketfft_delete_plan_r (pocketfft_plan_r plan) + { + if (plan->blueplan) + destroy_fftblue_plan(plan->blueplan); + if (plan->packplan) + destroy_rfftp_plan(plan->packplan); + DEALLOC(plan); + } + +size_t pocketfft_length_r(pocketfft_plan_r plan) + { + if (plan->packplan) return plan->packplan->length; + return plan->blueplan->n; + } + +size_t pocketfft_length_c(pocketfft_plan_c plan) + { + if (plan->packplan) return plan->packplan->length; + return plan->blueplan->n; + } + +WARN_UNUSED_RESULT +int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct) + { + if (plan->packplan) + return rfftp_backward(plan->packplan,c,fct); + else // if (plan->blueplan) + return rfftblue_backward(plan->blueplan,c,fct); + } + +WARN_UNUSED_RESULT +int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct) + { + if (plan->packplan) + return rfftp_forward(plan->packplan,c,fct); + else // if (plan->blueplan) + return rfftblue_forward(plan->blueplan,c,fct); + } diff --git a/pocketfft/pocketfft.h b/pocketfft/pocketfft.h new file mode 100644 index 0000000..b1a02ee --- /dev/null +++ b/pocketfft/pocketfft.h @@ -0,0 +1,50 @@ +/* + * This file is part of pocketfft. + * Licensed under a 3-clause BSD style license - see LICENSE.md + */ + +/*! \file pocketfft.h + * Public interface of the pocketfft library + * + * Copyright (C) 2008-2019 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef POCKETFFT_H +#define POCKETFFT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct pocketfft_plan_c_i; +typedef struct pocketfft_plan_c_i * pocketfft_plan_c; +pocketfft_plan_c pocketfft_make_plan_c (size_t length); +void pocketfft_delete_plan_c (pocketfft_plan_c plan); +int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct); +int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct); +size_t pocketfft_length_c(pocketfft_plan_c plan); + +struct pocketfft_plan_r_i; +typedef struct pocketfft_plan_r_i * pocketfft_plan_r; +pocketfft_plan_r pocketfft_make_plan_r (size_t length); +void pocketfft_delete_plan_r (pocketfft_plan_r plan); +/*! Computes a real backward FFT on \a c, using \a plan + and assuming the FFTPACK storage scheme: + - on entry, \a c has the form r0, r1, i1, r2, i2, ... + - on exit, it has the form r0, r1, ..., r[length-1]. */ +int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct); +/*! Computes a real forward FFT on \a c, using \a plan + and assuming the FFTPACK storage scheme: + - on entry, \a c has the form r0, r1, ..., r[length-1]; + - on exit, it has the form r0, r1, i1, r2, i2, ... */ +int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct); +size_t pocketfft_length_r(pocketfft_plan_r plan); + +#ifdef __cplusplus +} +#endif + +#endif