From 0378ce155a288b38bb327b62aa9795b9d40533b4 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 3 Jan 2020 17:22:31 +0100 Subject: [PATCH 1/6] runs as C++, no vector support yet --- COMPILE | 2 +- Makefile.am | 42 +++++++-------- configure.ac | 15 ++---- libsharp2/{pocketfft.c => pocketfft.cc} | 0 libsharp2/{sharp.c => sharp.cc} | 52 +++++++++---------- ...sharp_almhelpers.c => sharp_almhelpers.cc} | 0 libsharp2/{sharp_core.c => sharp_core.cc} | 2 +- .../{sharp_core_inc.c => sharp_core_inc.cc} | 26 +++++----- ...arp_geomhelpers.c => sharp_geomhelpers.cc} | 0 libsharp2/sharp_internal.h | 12 ++--- ...gendre_roots.c => sharp_legendre_roots.cc} | 0 libsharp2/{sharp_mpi.c => sharp_mpi.cc} | 0 libsharp2/{sharp_utils.c => sharp_utils.cc} | 0 libsharp2/sharp_vecsupport.h | 18 ++++--- .../{sharp_ylmgen_c.c => sharp_ylmgen_c.cc} | 0 test/{memusage.c => memusage.cc} | 0 ...sharp2_testsuite.c => sharp2_testsuite.cc} | 17 +++--- 17 files changed, 90 insertions(+), 96 deletions(-) rename libsharp2/{pocketfft.c => pocketfft.cc} (100%) rename libsharp2/{sharp.c => sharp.cc} (95%) rename libsharp2/{sharp_almhelpers.c => sharp_almhelpers.cc} (100%) rename libsharp2/{sharp_core.c => sharp_core.cc} (98%) rename libsharp2/{sharp_core_inc.c => sharp_core_inc.cc} (98%) rename libsharp2/{sharp_geomhelpers.c => sharp_geomhelpers.cc} (100%) rename libsharp2/{sharp_legendre_roots.c => sharp_legendre_roots.cc} (100%) rename libsharp2/{sharp_mpi.c => sharp_mpi.cc} (100%) rename libsharp2/{sharp_utils.c => sharp_utils.cc} (100%) rename libsharp2/{sharp_ylmgen_c.c => sharp_ylmgen_c.cc} (100%) rename test/{memusage.c => memusage.cc} (100%) rename test/{sharp2_testsuite.c => sharp2_testsuite.cc} (98%) diff --git a/COMPILE b/COMPILE index e3a87b7..876a6b3 100644 --- a/COMPILE +++ b/COMPILE @@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the configure command line. At runtime OMP_NUM_THREADS should be set to the number of hardware threads (not physical cores) of the system. -(Usually this is already the default setting when OMP_NUM_THREADS is not +(Usually this is already the default setting when OMP_NUM_THREADS is not specified.) diff --git a/Makefile.am b/Makefile.am index e105ca0..7e27ca8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,16 +3,16 @@ ACLOCAL_AMFLAGS = -I m4 lib_LTLIBRARIES = libsharp2.la libsharp2_la_SOURCES = \ - libsharp2/pocketfft.c \ + libsharp2/pocketfft.cc \ libsharp2/pocketfft.h \ - libsharp2/sharp_utils.c \ + libsharp2/sharp_utils.cc \ libsharp2/sharp_utils.h \ - libsharp2/sharp.c \ - libsharp2/sharp_almhelpers.c \ - libsharp2/sharp_core.c \ - libsharp2/sharp_geomhelpers.c \ - libsharp2/sharp_legendre_roots.c \ - libsharp2/sharp_ylmgen_c.c \ + libsharp2/sharp.cc \ + libsharp2/sharp_almhelpers.cc \ + libsharp2/sharp_core.cc \ + libsharp2/sharp_geomhelpers.cc \ + libsharp2/sharp_legendre_roots.cc \ + libsharp2/sharp_ylmgen_c.cc \ libsharp2/sharp_internal.h \ libsharp2/sharp_legendre_roots.h \ libsharp2/sharp_vecsupport.h \ @@ -26,23 +26,23 @@ libsharp2_la_SOURCES = \ # ==> age <= current libsharp2_la_LDFLAGS = -version-info 0:0:0 -AM_CFLAGS = @AM_CFLAGS@ +AM_CXXFLAGS = @AM_CXXFLAGS@ if HAVE_MULTIARCH -libavx_la_SOURCES = libsharp2/sharp_core_inc.c -libavx2_la_SOURCES = libsharp2/sharp_core_inc.c -libfma_la_SOURCES = libsharp2/sharp_core_inc.c -libfma4_la_SOURCES = libsharp2/sharp_core_inc.c -libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c +libavx_la_SOURCES = libsharp2/sharp_core_inc.cc +libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc +libfma_la_SOURCES = libsharp2/sharp_core_inc.cc +libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc +libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la -libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx -libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 -libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma -libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 -libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f +libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx +libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2 +libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma +libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4 +libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la @@ -56,10 +56,10 @@ nobase_include_HEADERS = \ libsharp2/sharp_cxx.h EXTRA_DIST = \ - runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c + runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc check_PROGRAMS = sharp2_testsuite -sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h +sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h sharp2_testsuite_LDADD = libsharp2.la TESTS = runtest.sh diff --git a/configure.ac b/configure.ac index 3b40dc2..0b510b8 100644 --- a/configure.ac +++ b/configure.ac @@ -20,28 +20,21 @@ dnl m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) -AC_PROG_CC_C99 -AC_OPENMP - -# add math library -LIBS="-lm" +AC_PROG_CXX +AX_CXX_COMPILE_STDCXX([11]) AC_PROG_LIBTOOL -tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'` +tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'` AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) -AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS" - PACKAGE_LIBS="-lsharp2" -PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS" -PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS" dnl dnl Create pkgconfig .pc file. dnl AX_CREATE_PKGCONFIG_INFO(,,,,[]) -AC_SUBST([AM_CFLAGS]) +AC_SUBST([AM_CXXFLAGS]) AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/libsharp2/pocketfft.c b/libsharp2/pocketfft.cc similarity index 100% rename from libsharp2/pocketfft.c rename to libsharp2/pocketfft.cc diff --git a/libsharp2/sharp.c b/libsharp2/sharp.cc similarity index 95% rename from libsharp2/sharp.c rename to libsharp2/sharp.cc index 1e5ac03..0224fa2 100644 --- a/libsharp2/sharp.c +++ b/libsharp2/sharp.cc @@ -34,8 +34,8 @@ #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_geomhelpers.h" -typedef complex double dcmplx; -typedef complex float fcmplx; +typedef complex dcmplx; +typedef complex fcmplx; static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_two = 1.414213562373095145474621858739; @@ -105,7 +105,7 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou self->phi0_ = phi0; // FIXME: improve this by using sincos2pibyn(nph) etc. for (int m=0; m<=mmax; ++m) - self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); + self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0)); // double *tmp=(double *) self->shiftarr; // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); } @@ -120,12 +120,12 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou static int ringinfo_compare (const void *xa, const void *xb) { - const sharp_ringinfo *a=xa, *b=xb; + const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; } static int ringpair_compare (const void *xa, const void *xb) { - const sharp_ringpair *a=xa, *b=xb; + const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb; // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; if (a->r1.nph==b->r1.nph) return (a->r1.phi0 < b->r1.phi0) ? -1 : @@ -292,22 +292,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self, if (self->norot) for (int m=0; m<=mmax; ++m) { - data[2*m]=creal(phase[m*pstride])*wgt; - data[2*m+1]=cimag(phase[m*pstride])*wgt; + data[2*m]=phase[m*pstride].real()*wgt; + data[2*m+1]=phase[m*pstride].imag()*wgt; } else for (int m=0; m<=mmax; ++m) { dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; - data[2*m]=creal(tmp)*wgt; - data[2*m+1]=cimag(tmp)*wgt; + data[2*m]=tmp.real()*wgt; + data[2*m+1]=tmp.imag()*wgt; } for (int m=2*(mmax+1); mnorot) tmp*=self->shiftarr[m]; if (idx1<(nph+2)/2) { - data[2*idx1]+=creal(tmp); - data[2*idx1+1]+=cimag(tmp); + data[2*idx1]+=tmp.real(); + data[2*idx1+1]+=tmp.imag(); } if (idx2<(nph+2)/2) { - data[2*idx2]+=creal(tmp); - data[2*idx2+1]-=cimag(tmp); + data[2*idx2]+=tmp.real(); + data[2*idx2+1]-=tmp.imag(); } if (++idx1>=nph) idx1=0; if (--idx2<0) idx2=nph-1; @@ -357,11 +357,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self, { if (self->norot) for (int m=0; m<=maxidx; ++m) - phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt; + phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt; else for (int m=0; m<=maxidx; ++m) phase[m*pstride] = - (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt; + dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt; } else { @@ -370,9 +370,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self, int idx=m%nph; dcmplx val; if (idx<(nph-idx)) - val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt; + val = dcmplx(data[2*idx], data[2*idx+1]) * wgt; else - val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt; + val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt; if (!self->norot) val *= self->shiftarr[m]; phase[m*pstride]=val; @@ -577,7 +577,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) if (job->flags&SHARP_DP) COPY_LOOP(double, dcmplx, x*job->norm_l[l]) else - COPY_LOOP(float, fcmplx, x*job->norm_l[l]) + COPY_LOOP(float, fcmplx, x*float(job->norm_l[l])) } } } @@ -617,9 +617,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) if (m==0) { if (job->flags&SHARP_DP) - COPY_LOOP(double, double, creal(x)*norm_m0) + COPY_LOOP(double, double, x.real()*norm_m0) else - COPY_LOOP(float, float, crealf(x)*norm_m0) + COPY_LOOP(float, float, x.real()*norm_m0) } else { @@ -634,9 +634,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) if (m==0) { if (job->flags&SHARP_DP) - COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0) + COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0) else - COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0)) + COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0)) } else { @@ -721,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, for (int m=0; m<=mmax; ++m) phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : - ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt; + ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt); } } static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, @@ -934,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, job->flags|=SHARP_REAL_HARMONICS; job->time = 0.; job->opcnt = 0; - job->alm=alm; - job->map=map; + job->alm=(void **)alm; + job->map=(void **)map; } void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, diff --git a/libsharp2/sharp_almhelpers.c b/libsharp2/sharp_almhelpers.cc similarity index 100% rename from libsharp2/sharp_almhelpers.c rename to libsharp2/sharp_almhelpers.cc diff --git a/libsharp2/sharp_core.c b/libsharp2/sharp_core.cc similarity index 98% rename from libsharp2/sharp_core.c rename to libsharp2/sharp_core.cc index 6f41ad2..9a7ef20 100644 --- a/libsharp2/sharp_core.c +++ b/libsharp2/sharp_core.cc @@ -27,7 +27,7 @@ #define ARCH default #define GENERIC_ARCH -#include "libsharp2/sharp_core_inc.c" +#include "libsharp2/sharp_core_inc.cc" #undef GENERIC_ARCH #undef ARCH diff --git a/libsharp2/sharp_core_inc.c b/libsharp2/sharp_core_inc.cc similarity index 98% rename from libsharp2/sharp_core_inc.c rename to libsharp2/sharp_core_inc.cc index eaf34f0..79ca707 100644 --- a/libsharp2/sharp_core_inc.c +++ b/libsharp2/sharp_core_inc.cc @@ -34,7 +34,7 @@ #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) -#include +#include #include #include #include "libsharp2/sharp_vecsupport.h" @@ -49,7 +49,9 @@ // Unfortunately, most compilers don't act on this pragma yet. #pragma STDC FP_CONTRACT ON -typedef complex double dcmplx; +typedef complex dcmplx; +inline double creal(const dcmplx &v) {return v.real(); } +inline double cimag(const dcmplx &v) {return v.imag(); } #define nv0 (128/VLEN) #define nvx (64/VLEN) @@ -224,10 +226,10 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, { for (; l<=lmax-2; il+=2, l+=4) { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); - Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv ar1=vload(alm[l ].real()), ai1=vload(alm[l ].imag()); + Tv ar2=vload(alm[l+1].real()), ai2=vload(alm[l+1].imag()); + Tv ar3=vload(alm[l+2].real()), ai3=vload(alm[l+2].imag()); + Tv ar4=vload(alm[l+3].real()), ai4=vload(alm[l+3].imag()); Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); for (int i=0; is_th + mi*job->s_m; - complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, - r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; + complex r1(d.s.p1r[i], d.s.p1i[i]), + r2(d.s.p2r[i], d.s.p2i[i]); job->phase[phas_idx] = r1+r2; if (ispair[tgt]) job->phase[phas_idx+1] = r1-r2; @@ -1027,10 +1029,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, { int tgt=itgt[i]; int phas_idx = tgt*job->s_th + mi*job->s_m; - complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, - q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, - u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, - u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; + complex q1(d.s.p1pr[i], d.s.p1pi[i]), + q2(d.s.p2pr[i], d.s.p2pi[i]), + u1(d.s.p1mr[i], d.s.p1mi[i]), + u2(d.s.p2mr[i], d.s.p2mi[i]); job->phase[phas_idx ] = q1+q2; job->phase[phas_idx+2] = u1+u2; if (ispair[tgt]) diff --git a/libsharp2/sharp_geomhelpers.c b/libsharp2/sharp_geomhelpers.cc similarity index 100% rename from libsharp2/sharp_geomhelpers.c rename to libsharp2/sharp_geomhelpers.cc diff --git a/libsharp2/sharp_internal.h b/libsharp2/sharp_internal.h index 09e45e8..fef0c5d 100644 --- a/libsharp2/sharp_internal.h +++ b/libsharp2/sharp_internal.h @@ -28,14 +28,12 @@ #ifndef SHARP2_INTERNAL_H #define SHARP2_INTERNAL_H -#ifdef __cplusplus -#error This header file cannot be included from C++, only from C -#endif - -#include +#include #include "libsharp2/sharp.h" #include "libsharp2/sharp_ylmgen_c.h" +using std::complex; + typedef struct { sharp_jobtype type; @@ -45,9 +43,9 @@ typedef struct void **map; void **alm; int s_m, s_th; // strides in m and theta direction - complex double *phase; + complex *phase; double *norm_l; - complex double *almtmp; + complex *almtmp; const sharp_geom_info *ginfo; const sharp_alm_info *ainfo; double time; diff --git a/libsharp2/sharp_legendre_roots.c b/libsharp2/sharp_legendre_roots.cc similarity index 100% rename from libsharp2/sharp_legendre_roots.c rename to libsharp2/sharp_legendre_roots.cc diff --git a/libsharp2/sharp_mpi.c b/libsharp2/sharp_mpi.cc similarity index 100% rename from libsharp2/sharp_mpi.c rename to libsharp2/sharp_mpi.cc diff --git a/libsharp2/sharp_utils.c b/libsharp2/sharp_utils.cc similarity index 100% rename from libsharp2/sharp_utils.c rename to libsharp2/sharp_utils.cc diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index ca9bee7..221edef 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -28,7 +28,9 @@ #ifndef SHARP2_VECSUPPORT_H #define SHARP2_VECSUPPORT_H -#include +#include +#include +using std::complex; #ifndef VLEN @@ -73,8 +75,8 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } #define vallTrue(a) (a) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - _Complex double * restrict cc) - { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } + complex * restrict cc) + { cc[0] += complex(a,b); cc[1] += complex(c,d); } #endif @@ -121,9 +123,9 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vallTrue(a) (_mm_movemask_pd(a)==3) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, - Tv d, _Complex double * restrict cc) + Tv d, complex * restrict cc) { - union {Tv v; _Complex double c; } u1, u2; + union {Tv v; complex c; } u1, u2; #if defined(__SSE3__) u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); #else @@ -167,13 +169,13 @@ typedef __m256d Tm; #define vallTrue(a) (_mm256_movemask_pd(a)==15) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - _Complex double * restrict cc) + complex * restrict cc) { Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); tmp1=tmp3+tmp4; - union {Tv v; _Complex double c[2]; } u; + union {Tv v; complex c[2]; } u; u.v=tmp1; cc[0]+=u.c[0]; cc[1]+=u.c[1]; } @@ -209,7 +211,7 @@ typedef __mmask8 Tm; #define vallTrue(a) (a==255) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - _Complex double * restrict cc) + complex * restrict cc) { cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); diff --git a/libsharp2/sharp_ylmgen_c.c b/libsharp2/sharp_ylmgen_c.cc similarity index 100% rename from libsharp2/sharp_ylmgen_c.c rename to libsharp2/sharp_ylmgen_c.cc diff --git a/test/memusage.c b/test/memusage.cc similarity index 100% rename from test/memusage.c rename to test/memusage.cc diff --git a/test/sharp2_testsuite.c b/test/sharp2_testsuite.cc similarity index 98% rename from test/sharp2_testsuite.c rename to test/sharp2_testsuite.cc index 0e3302a..dc74da2 100644 --- a/test/sharp2_testsuite.c +++ b/test/sharp2_testsuite.cc @@ -26,7 +26,8 @@ #include #include -#include +#include +using std::complex; #ifdef USE_MPI #include "mpi.h" #include "libsharp2/sharp_mpi.h" @@ -38,7 +39,7 @@ #include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_utils.h" -#include "libsharp2/sharp_utils.c" +#include "libsharp2/sharp_utils.cc" #include "test/memusage.h" static void OpenMP_status(void) @@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected, exit(1); } -typedef complex double dcmplx; +typedef complex dcmplx; int ntasks, mytask; @@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt) { double rv = drand(-1,1,&state); double iv = (m==0) ? 0 : drand(-1,1,&state); - alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv; + alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv); } } } @@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp) sqsum[i]=0; for (ptrdiff_t j=0; jmaxdiff) maxdiff=sqr; } @@ -414,7 +413,7 @@ static void check_sign_scale(void) ALLOC2D(alm,dcmplx,2,nalms); for (int i=0; i<2; ++i) for (int j=0; j Date: Fri, 3 Jan 2020 18:01:34 +0100 Subject: [PATCH 2/6] vectors work with experimental::simd --- libsharp2/sharp_vecsupport.h | 182 +++-------------------------------- 1 file changed, 16 insertions(+), 166 deletions(-) diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index 221edef..5130e52 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -32,36 +32,24 @@ #include using std::complex; -#ifndef VLEN - -#if (defined(__AVX512F__)) -#define VLEN 8 -#elif (defined (__AVX__)) -#define VLEN 4 -#elif (defined (__SSE2__)) -#define VLEN 2 -#else -#define VLEN 1 -#endif - -#endif +#include +using std::experimental::native_simd; +using std::experimental::reduce; +static constexpr size_t VLEN=native_simd::size(); +using Tv=native_simd; +using Tm=Tv::mask_type; typedef double Ts; -#if (VLEN==1) - -typedef double Tv; -typedef int Tm; - #define vload(a) (a) #define vzero 0. #define vone 1. -#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b); -#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b); -#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b); +#define vaddeq_mask(mask,a,b) where(mask,a)+=b; +#define vsubeq_mask(mask,a,b) where(mask,a)-=b; +#define vmuleq_mask(mask,a,b) where(mask,a)*=b; #define vneg(a) (-(a)) -#define vabs(a) fabs(a) +#define vabs(a) abs(a) #define vsqrt(a) sqrt(a) #define vlt(a,b) ((a)<(b)) #define vgt(a,b) ((a)>(b)) @@ -69,154 +57,16 @@ typedef int Tm; #define vne(a,b) ((a)!=(b)) #define vand_mask(a,b) ((a)&&(b)) #define vor_mask(a,b) ((a)||(b)) -static inline Tv vmin (Tv a, Tv b) { return (ab) ? a : b; } -#define vanyTrue(a) (a) -#define vallTrue(a) (a) - -static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex * restrict cc) - { cc[0] += complex(a,b); cc[1] += complex(c,d); } - - -#endif - -#if (VLEN==2) - -#include - -#if defined (__SSE3__) -#include -#endif -#if defined (__SSE4_1__) -#include -#endif - -typedef __m128d Tv; -typedef __m128d Tm; - -#if defined(__SSE4_1__) -#define vblend__(m,a,b) _mm_blendv_pd(b,a,m) -#else -static inline Tv vblend__(Tv m, Tv a, Tv b) - { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); } -#endif -#define vload(a) _mm_set1_pd(a) -#define vzero _mm_setzero_pd() -#define vone vload(1.) - -#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero) -#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero) -#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone) -#define vneg(a) _mm_xor_pd(vload(-0.),a) -#define vabs(a) _mm_andnot_pd(vload(-0.),a) -#define vsqrt(a) _mm_sqrt_pd(a) -#define vlt(a,b) _mm_cmplt_pd(a,b) -#define vgt(a,b) _mm_cmpgt_pd(a,b) -#define vge(a,b) _mm_cmpge_pd(a,b) -#define vne(a,b) _mm_cmpneq_pd(a,b) -#define vand_mask(a,b) _mm_and_pd(a,b) -#define vor_mask(a,b) _mm_or_pd(a,b) -#define vmin(a,b) _mm_min_pd(a,b) -#define vmax(a,b) _mm_max_pd(a,b); -#define vanyTrue(a) (_mm_movemask_pd(a)!=0) -#define vallTrue(a) (_mm_movemask_pd(a)==3) - -static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, - Tv d, complex * restrict cc) - { - union {Tv v; complex c; } u1, u2; -#if defined(__SSE3__) - u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); -#else - u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + - _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); - u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + - _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); -#endif - cc[0]+=u1.c; cc[1]+=u2.c; - } - -#endif - -#if (VLEN==4) - -#include - -typedef __m256d Tv; -typedef __m256d Tm; - -#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m) -#define vload(a) _mm256_set1_pd(a) -#define vzero _mm256_setzero_pd() -#define vone vload(1.) - -#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero) -#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero) -#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone) -#define vneg(a) _mm256_xor_pd(vload(-0.),a) -#define vabs(a) _mm256_andnot_pd(vload(-0.),a) -#define vsqrt(a) _mm256_sqrt_pd(a) -#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) -#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ) -#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ) -#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) -#define vand_mask(a,b) _mm256_and_pd(a,b) -#define vor_mask(a,b) _mm256_or_pd(a,b) -#define vmin(a,b) _mm256_min_pd(a,b) -#define vmax(a,b) _mm256_max_pd(a,b) -#define vanyTrue(a) (_mm256_movemask_pd(a)!=0) -#define vallTrue(a) (_mm256_movemask_pd(a)==15) +static inline Tv vmin (Tv a, Tv b) { return min(a,b); } +static inline Tv vmax (Tv a, Tv b) { return max(a,b); } +#define vanyTrue(a) (any_of(a)) +#define vallTrue(a) (all_of(a)) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, complex * restrict cc) { - Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); - Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), - tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); - tmp1=tmp3+tmp4; - union {Tv v; complex c[2]; } u; - u.v=tmp1; - cc[0]+=u.c[0]; cc[1]+=u.c[1]; + cc[0] += complex(reduce(a,std::plus<>()),reduce(b,std::plus<>())); + cc[1] += complex(reduce(c,std::plus<>()),reduce(d,std::plus<>())); } #endif - -#if (VLEN==8) - -#include - -typedef __m512d Tv; -typedef __mmask8 Tm; - -#define vload(a) _mm512_set1_pd(a) -#define vzero _mm512_setzero_pd() -#define vone vload(1.) - -#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b); -#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b); -#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b); -#define vneg(a) _mm512_mul_pd(a,vload(-1.)) -#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a) -#define vsqrt(a) _mm512_sqrt_pd(a) -#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ) -#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ) -#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ) -#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ) -#define vand_mask(a,b) ((a)&(b)) -#define vor_mask(a,b) ((a)|(b)) -#define vmin(a,b) _mm512_min_pd(a,b) -#define vmax(a,b) _mm512_max_pd(a,b) -#define vanyTrue(a) (a!=0) -#define vallTrue(a) (a==255) - -static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex * restrict cc) - { - cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); - cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); - } - -#endif - -#endif From 54cbae0534df4773b5f5924e9d0665c80dbc935d Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 3 Jan 2020 18:26:53 +0100 Subject: [PATCH 3/6] multithreading demonstrated to work, but not safe yet --- Makefile.am | 2 +- libsharp2/sharp.cc | 123 ++++++++++++++++++----------------- libsharp2/sharp_vecsupport.h | 4 +- 3 files changed, 65 insertions(+), 64 deletions(-) diff --git a/Makefile.am b/Makefile.am index 7e27ca8..fe09aa6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ libsharp2_la_SOURCES = \ # any backward-compatible change: increase age # any backward-incompatible change: age=0 # ==> age <= current -libsharp2_la_LDFLAGS = -version-info 0:0:0 +libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread AM_CXXFLAGS = @AM_CXXFLAGS@ diff --git a/libsharp2/sharp.cc b/libsharp2/sharp.cc index 0224fa2..ef780d8 100644 --- a/libsharp2/sharp.cc +++ b/libsharp2/sharp.cc @@ -33,6 +33,7 @@ #include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_geomhelpers.h" +#include "mr_util/threading.h" typedef complex dcmplx; typedef complex fcmplx; @@ -760,31 +761,31 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) } else { -#pragma omp parallel -{ - ringhelper helper; - ringhelper_init(&helper); - int rstride=job->ginfo->nphmax+2; - double *ringtmp=RALLOC(double,job->nmaps*rstride); -#pragma omp for schedule(dynamic,1) - for (int ith=llim; iths_th*(ith-llim); - ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); - for (int i=0; inmaps; ++i) - ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); - if (job->ginfo->pair[ith].r2.nph>0) + ringhelper helper; + ringhelper_init(&helper); + int rstride=job->ginfo->nphmax+2; + double *ringtmp=RALLOC(double,job->nmaps*rstride); + + while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ithginfo->pair[ith].r2),ringtmp,rstride); + int dim2 = job->s_th*(ith-llim); + ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); for (int i=0; inmaps; ++i) - ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); + ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); + if (job->ginfo->pair[ith].r2.nph>0) + { + ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); + for (int i=0; inmaps; ++i) + ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); + } } - } - DEALLOC(ringtmp); - ringhelper_destroy(&helper); -} /* end of parallel region */ + DEALLOC(ringtmp); + ringhelper_destroy(&helper); + }); /* end of parallel region */ } } @@ -805,31 +806,31 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) } else { -#pragma omp parallel -{ - ringhelper helper; - ringhelper_init(&helper); - int rstride=job->ginfo->nphmax+2; - double *ringtmp=RALLOC(double,job->nmaps*rstride); -#pragma omp for schedule(dynamic,1) - for (int ith=llim; iths_th*(ith-llim); - for (int i=0; inmaps; ++i) - ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); - ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); - if (job->ginfo->pair[ith].r2.nph>0) + ringhelper helper; + ringhelper_init(&helper); + int rstride=job->ginfo->nphmax+2; + double *ringtmp=RALLOC(double,job->nmaps*rstride); + + while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; iths_th*(ith-llim); for (int i=0; inmaps; ++i) - ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); - ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); + ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); + ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); + if (job->ginfo->pair[ith].r2.nph>0) + { + for (int i=0; inmaps; ++i) + ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); + ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); + } } - } - DEALLOC(ringtmp); - ringhelper_destroy(&helper); -} /* end of parallel region */ + DEALLOC(ringtmp); + ringhelper_destroy(&helper); + }); /* end of parallel region */ } } @@ -871,32 +872,32 @@ NOINLINE static void sharp_execute_job (sharp_job *job) /* map->phase where necessary */ map2phase (job, mmax, llim, ulim); -#pragma omp parallel -{ - sharp_job ljob = *job; - ljob.opcnt=0; - sharp_Ylmgen_C generator; - sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin); - alloc_almtmp(&ljob,lmax); - -#pragma omp for schedule(dynamic,1) - for (int mi=0; miainfo->nm; ++mi) + mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched) { -/* alm->alm_tmp where necessary */ - alm2almtmp (&ljob, lmax, mi); + sharp_job ljob = *job; + ljob.opcnt=0; + sharp_Ylmgen_C generator; + sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin); + alloc_almtmp(&ljob,lmax); - inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); + while (auto rng=sched.getNext()) for(auto mi=rng.lo; mialm_tmp where necessary */ + alm2almtmp (&ljob, lmax, mi); + + inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); /* alm_tmp->alm where necessary */ - almtmp2alm (&ljob, lmax, mi); - } + almtmp2alm (&ljob, lmax, mi); + } - sharp_Ylmgen_destroy(&generator); - dealloc_almtmp(&ljob); + sharp_Ylmgen_destroy(&generator); + dealloc_almtmp(&ljob); -#pragma omp critical +//#pragma omp critical +//FIXME!!!! job->opcnt+=ljob.opcnt; -} /* end of parallel region */ + }); /* end of parallel region */ /* phase->map where necessary */ phase2map (job, mmax, llim, ulim); diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index 5130e52..79ce2a7 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -36,10 +36,10 @@ using std::complex; #include using std::experimental::native_simd; using std::experimental::reduce; -static constexpr size_t VLEN=native_simd::size(); using Tv=native_simd; using Tm=Tv::mask_type; -typedef double Ts; +using Ts=Tv::value_type; +static constexpr size_t VLEN=Tv::size(); #define vload(a) (a) #define vzero 0. From 75559e6894fd17c59b798cb004986b4f6fcbfc00 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 3 Jan 2020 22:47:49 +0100 Subject: [PATCH 4/6] vectorization cleanup --- libsharp2/sharp.cc | 6 +- libsharp2/sharp_core_inc.cc | 308 +++++++++++++++++------------------ libsharp2/sharp_vecsupport.h | 19 --- 3 files changed, 156 insertions(+), 177 deletions(-) diff --git a/libsharp2/sharp.cc b/libsharp2/sharp.cc index ef780d8..2b6e722 100644 --- a/libsharp2/sharp.cc +++ b/libsharp2/sharp.cc @@ -853,6 +853,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job) &nchunks,&chunksize); //FIXME: needs to be changed to "nm" alloc_phase (job,mmax+1,chunksize); + std::atomic opcnt = 0; /* chunk loop */ for (int chunk=0; chunkopcnt+=ljob.opcnt; + opcnt+=ljob.opcnt; }); /* end of parallel region */ /* phase->map where necessary */ @@ -910,6 +909,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job) DEALLOC(job->norm_l); dealloc_phase (job); + job->opcnt = opcnt; job->time=sharp_wallTime()-timer; } diff --git a/libsharp2/sharp_core_inc.cc b/libsharp2/sharp_core_inc.cc index 79ca707..a4b57fc 100644 --- a/libsharp2/sharp_core_inc.cc +++ b/libsharp2/sharp_core_inc.cc @@ -50,8 +50,6 @@ #pragma STDC FP_CONTRACT ON typedef complex dcmplx; -inline double creal(const dcmplx &v) {return v.real(); } -inline double cimag(const dcmplx &v) {return v.imag(); } #define nv0 (128/VLEN) #define nvx (64/VLEN) @@ -99,32 +97,32 @@ typedef union static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, double maxval) { - const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); - const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); - Tm mask = vgt(vabs(*val),vfmax); - while (vanyTrue(mask)) + const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval; + const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig; + auto mask = abs(*val)>vfmax; + while (any_of(mask)) { - vmuleq_mask(mask,*val,vfsmall); - vaddeq_mask(mask,*scale,vone); - mask = vgt(vabs(*val),vfmax); + where(mask,*val)*=vfsmall; + where(mask,*scale)+=1; + mask = abs(*val)>vfmax; } - mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero)); - while (vanyTrue(mask)) + mask = (abs(*val)>=1); *resd=res; - *ress=vzero; + *ress=0; } else { - Tv scale=vzero, scaleint=vzero, res=vone; + Tv scale=0, scaleint=0, res=1; Tvnormalize(&val,&scaleint,sharp_fbighalf); do { @@ -171,47 +169,47 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac, *corfac=corf.v; } -static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps) +static inline bool rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps) { - Tm mask = vgt(vabs(*v2),eps); - if (vanyTrue(mask)) + auto mask = abs(*v2)>eps; + if (any_of(mask)) { - vmuleq_mask(mask,*v1,vload(sharp_fsmall)); - vmuleq_mask(mask,*v2,vload(sharp_fsmall)); - vaddeq_mask(mask,*s,vone); - return 1; + where(mask,*v1)*=sharp_fsmall; + where(mask,*v2)*=sharp_fsmall; + where(mask,*s)+=1; + return true; } - return 0; + return false; } NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2) { int l=gen->m, il=0; - Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - Tv limscale=vload(sharp_limscale); + Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]; + Tv limscale=sharp_limscale; int below_limit = 1; for (int i=0; ilam1[i]=vzero; + d->lam1[i]=0; mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); d->lam2[i] *= mfac; Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); - below_limit &= vallTrue(vlt(d->scale[i],limscale)); + below_limit &= all_of(d->scale[i]gen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; - Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b); - Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b); + Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b; + Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b; for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) - below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale))); + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) + below_limit &= all_of(d->scale[i]p1r[i] += d->lam2[i]*ar1; @@ -251,12 +249,12 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, { for (; l<=lmax-2; il+=2, l+=4) { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); - Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); - Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); - Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); + Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); + Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); + Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag(); + Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag(); + Tv a1=coef[il ].a, b1=coef[il ].b; + Tv a2=coef[il+1].a, b2=coef[il+1].b; for (int i=0; ip1r[i] += d->lam2[i]*ar1; @@ -274,9 +272,9 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, } for (; l<=lmax; ++il, l+=2) { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv a=vload(coef[il].a), b=vload(coef[il].b); + Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); + Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); + Tv a=coef[il].a, b=coef[il].b; for (int i=0; ip1r[i] += d->lam2[i]*ar1; @@ -306,14 +304,14 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job, for (int i=0; iscale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scale[i]>=sharp_minscale); } while((!full_ieee) && (l<=lmax)) { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv a=vload(coef[il].a), b=vload(coef[il].b); + Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); + Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); + Tv a=coef[il].a, b=coef[il].b; full_ieee=1; for (int i=0; icsq[i] + b)*d->lam2[i] + d->lam1[i]; d->lam1[i] = d->lam2[i]; d->lam2[i] = tmp; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) getCorfac(d->scale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scale[i]>=sharp_minscale); } l+=2; ++il; } @@ -346,10 +344,10 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d, { for (; l<=lmax-2; il+=2, l+=4) { - Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); - Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); - Tv atmp1[4] = {vzero, vzero, vzero, vzero}; - Tv atmp2[4] = {vzero, vzero, vzero, vzero}; + Tv a1=coef[il ].a, b1=coef[il ].b; + Tv a2=coef[il+1].a, b2=coef[il+1].b; + Tv atmp1[4] = {0,0,0,0}; + Tv atmp2[4] = {0,0,0,0}; for (int i=0; ilam2[i]*d->p1r[i]; @@ -368,8 +366,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d, } for (; l<=lmax; ++il, l+=2) { - Tv a=vload(coef[il].a), b=vload(coef[il].b); - Tv atmp[4] = {vzero, vzero, vzero, vzero}; + Tv a=coef[il].a, b=coef[il].b; + Tv atmp[4] = {0,0,0,0}; for (int i=0; ilam2[i]*d->p1r[i]; @@ -400,13 +398,13 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job, for (int i=0; iscale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scale[i]>=sharp_minscale); } while((!full_ieee) && (l<=lmax)) { - Tv a=vload(coef[il].a), b=vload(coef[il].b); - Tv atmp[4] = {vzero, vzero, vzero, vzero}; + Tv a=coef[il].a, b=coef[il].b; + Tv atmp[4] = {0,0,0,0}; full_ieee=1; for (int i=0; icsq[i] + b)*d->lam2[i] + d->lam1[i]; d->lam1[i] = d->lam2[i]; d->lam2[i] = tmp; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) getCorfac(d->scale[i], &d->corfac[i], gen->cf); - full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scale[i]>=sharp_minscale); } vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); l+=2; ++il; @@ -438,17 +436,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int * restrict l_, int nv2) { const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - Tv prefac=vload(gen->prefac[gen->m]), - prescale=vload(gen->fscale[gen->m]); - Tv limscale=vload(sharp_limscale); + Tv prefac=gen->prefac[gen->m], + prescale=gen->fscale[gen->m]; + Tv limscale=sharp_limscale; int below_limit=1; for (int i=0; icth[i])*vload(0.5))); - Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); - Tm mask=vlt(d->sth[i],vzero); - vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); - vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); + Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5)); + Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5)); + auto mask=d->sth[i]<0; + where(mask&(d->cth[i]<0),cth2)*=-1.; + where(mask&(d->cth[i]<0),sth2)*=-1.; Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); @@ -456,8 +454,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); - d->l1p[i] = vzero; - d->l1m[i] = vzero; + d->l1p[i] = 0; + d->l1m[i] = 0; d->l2p[i] = prefac*ccp; d->scp[i] = prescale+ccps; d->l2m[i] = prefac*csp; @@ -469,17 +467,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, d->l2m[i] *= scp; d->scm[i] += scps; if (gen->preMinus_p) - d->l2p[i] = vneg(d->l2p[i]); + d->l2p[i] = -d->l2p[i]; if (gen->preMinus_m) - d->l2m[i] = vneg(d->l2m[i]); + d->l2m[i] = -d->l2m[i]; if (gen->s&1) - d->l2p[i] = vneg(d->l2p[i]); + d->l2p[i] = -d->l2p[i]; Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); - below_limit &= vallTrue(vlt(d->scm[i],limscale)) && - vallTrue(vlt(d->scp[i],limscale)); + below_limit &= all_of(d->scm[i]scp[i]mhi; @@ -488,18 +486,18 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || - rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) - below_limit &= vallTrue(vlt(d->scp[i],limscale)) && - vallTrue(vlt(d->scm[i],limscale)); + if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) || + rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol)) + below_limit &= all_of(d->scp[i]scm[i]l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -539,12 +537,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, l=lsave; while (l<=lmax) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), - acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); - Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), - acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), + acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); + Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), + acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; @@ -580,18 +578,18 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, { getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && - vallTrue(vge(d->scm[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scp[i]>=sharp_minscale) && + all_of(d->scm[i]>=sharp_minscale); } while((!full_ieee) && (l<=lmax)) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), - acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); - Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), - acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), + acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); + Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), + acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); full_ieee=1; for (int i=0; il2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + full_ieee &= all_of(d->scp[i]>=sharp_minscale); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scm[i]>=sharp_minscale); } l+=2; } @@ -650,10 +648,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, int lsave=l; while (l<=lmax) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; - Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv agr1=0, agi1=0, acr1=0, aci1=0; + Tv agr2=0, agi2=0, acr2=0, aci2=0; for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -674,10 +672,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, l=lsave; while (l<=lmax) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; - Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv agr1=0, agi1=0, acr1=0, aci1=0; + Tv agr2=0, agi2=0, acr2=0, aci2=0; for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; @@ -714,8 +712,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, { getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && - vallTrue(vge(d->scm[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scp[i]>=sharp_minscale) && + all_of(d->scm[i]>=sharp_minscale); } for (int i=0; il2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + full_ieee &= all_of(d->scp[i]>=sharp_minscale); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scm[i]>=sharp_minscale); } vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); @@ -781,10 +779,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, int lsave=l; while (l<=lmax) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), - ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), + ar2=alm[l+1].real(), ai2=alm[l+1].imag(); for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -800,10 +798,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, l=lsave; while (l<=lmax) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), - ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), + ar2=alm[l+1].real(), ai2=alm[l+1].imag(); for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; @@ -835,16 +833,16 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, { getCorfac(d->scp[i], &d->cfp[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && - vallTrue(vge(d->scm[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scp[i]>=sharp_minscale) && + all_of(d->scm[i]>=sharp_minscale); } while((!full_ieee) && (l<=lmax)) { - Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); - Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), - ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), + ar2=alm[l+1].real(), ai2=alm[l+1].imag(); full_ieee=1; for (int i=0; il2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) + full_ieee &= all_of(d->scp[i]>=sharp_minscale); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); + full_ieee &= all_of(d->scm[i]>=sharp_minscale); } l+=2; } @@ -1085,8 +1083,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, int phas_idx = ith*job->s_th + mi*job->s_m; dcmplx ph1=job->phase[phas_idx]; dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; - d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2); - d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2); + d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag(); + d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag(); //adjust for new algorithm d.s.p2r[nth]*=cth_[ith]; d.s.p2i[nth]*=cth_[ith]; @@ -1140,10 +1138,10 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; if ((gen->mhi-gen->m+gen->s)&1) { p2Q=-p2Q; p2U=-p2U; } - d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q); - d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U); - d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q); - d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U); + d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag(); + d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag(); + d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag(); + d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag(); ++nth; } ++ith; diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index 79ce2a7..2c3d6d2 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -42,25 +42,6 @@ using Ts=Tv::value_type; static constexpr size_t VLEN=Tv::size(); #define vload(a) (a) -#define vzero 0. -#define vone 1. - -#define vaddeq_mask(mask,a,b) where(mask,a)+=b; -#define vsubeq_mask(mask,a,b) where(mask,a)-=b; -#define vmuleq_mask(mask,a,b) where(mask,a)*=b; -#define vneg(a) (-(a)) -#define vabs(a) abs(a) -#define vsqrt(a) sqrt(a) -#define vlt(a,b) ((a)<(b)) -#define vgt(a,b) ((a)>(b)) -#define vge(a,b) ((a)>=(b)) -#define vne(a,b) ((a)!=(b)) -#define vand_mask(a,b) ((a)&&(b)) -#define vor_mask(a,b) ((a)||(b)) -static inline Tv vmin (Tv a, Tv b) { return min(a,b); } -static inline Tv vmax (Tv a, Tv b) { return max(a,b); } -#define vanyTrue(a) (any_of(a)) -#define vallTrue(a) (all_of(a)) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, complex * restrict cc) From f8a9c96acf6043f388d6fcb4b31040abdc0f6c9d Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 3 Jan 2020 22:55:52 +0100 Subject: [PATCH 5/6] use MRUTIL macros --- libsharp2/sharp.cc | 39 ++++++++-------- libsharp2/sharp_core_inc.cc | 90 ++++++++++++++++++------------------ libsharp2/sharp_utils.h | 6 --- libsharp2/sharp_vecsupport.h | 7 +-- 4 files changed, 69 insertions(+), 73 deletions(-) diff --git a/libsharp2/sharp.cc b/libsharp2/sharp.cc index 2b6e722..a49c106 100644 --- a/libsharp2/sharp.cc +++ b/libsharp2/sharp.cc @@ -34,6 +34,7 @@ #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_geomhelpers.h" #include "mr_util/threading.h" +#include "mr_util/useful_macros.h" typedef complex dcmplx; typedef complex fcmplx; @@ -58,7 +59,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) *nchunks = (ndata+(*chunksize)-1)/(*chunksize); } -NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) +MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) { double ofs=lmax*0.01; if (ofs<100.) ofs=100.; @@ -95,7 +96,7 @@ static void ringhelper_destroy (ringhelper *self) ringhelper_init(self); } -NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) +MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) { self->norot = (fabs(phi0)<1e-14); if (!(self->norot)) @@ -276,7 +277,7 @@ static int sharp_get_mmax (int *mval, int nm) return nm-1; } -NOINLINE static void ringhelper_phase2ring (ringhelper *self, +MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, int pstride, int flags) { @@ -334,7 +335,7 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self, pocketfft_backward_r (self->plan, &(data[1]), 1.); } -NOINLINE static void ringhelper_ring2phase (ringhelper *self, +MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, int pstride, int flags) { @@ -384,7 +385,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self, phase[m*pstride]=0.; } -NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, +MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, int flags) { if (flags & SHARP_NO_FFT) @@ -441,7 +442,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, } } -NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, +MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) { #define CLEARLOOP(real_t,body) \ @@ -478,7 +479,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, } } -NOINLINE static void init_output (sharp_job *job) +MRUTIL_NOINLINE static void init_output (sharp_job *job) { if (job->flags&SHARP_ADD) return; if (job->type == SHARP_MAP2ALM) @@ -489,7 +490,7 @@ NOINLINE static void init_output (sharp_job *job) clear_map (job->ginfo,job->map[i],job->flags); } -NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) +MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) { if (job->type==SHARP_MAP2ALM) { @@ -515,7 +516,7 @@ static void alloc_almtmp (sharp_job *job, int lmax) static void dealloc_almtmp (sharp_job *job) { DEALLOC(job->almtmp); } -NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) +MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, source_t, expr_of_x) \ @@ -589,7 +590,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) +MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, target_t, expr_of_x) \ @@ -651,7 +652,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, +MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, const double *ringtmp, int rstride) { if (job->flags & SHARP_DP) @@ -659,8 +660,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double **dmap = (double **)job->map; for (int i=0; inmaps; ++i) { - double *restrict p1=&dmap[i][ri->ofs]; - const double *restrict p2=&ringtmp[i*rstride+1]; + double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs]; + const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1]; if (ri->stride==1) { if (job->flags&SHARP_ADD) @@ -683,14 +684,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, } } -NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, +MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, int rstride) { if (job->flags & SHARP_DP) for (int i=0; inmaps; ++i) { - double *restrict p1=&ringtmp[i*rstride+1], - *restrict p2=&(((double *)(job->map[i]))[ri->ofs]); + double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1], + *MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]); if (ri->stride==1) memcpy(p1,p2,ri->nph*sizeof(double)); else @@ -744,7 +745,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, } //FIXME: set phase to zero if not SHARP_MAP2ALM? -NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) +MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) { if (job->type != SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -789,7 +790,7 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) } } -NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) +MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) { if (job->type == SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -834,7 +835,7 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) } } -NOINLINE static void sharp_execute_job (sharp_job *job) +MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job) { double timer=sharp_wallTime(); job->opcnt=0; diff --git a/libsharp2/sharp_core_inc.cc b/libsharp2/sharp_core_inc.cc index a4b57fc..10c4322 100644 --- a/libsharp2/sharp_core_inc.cc +++ b/libsharp2/sharp_core_inc.cc @@ -94,7 +94,7 @@ typedef union sxdata_s s; } sxdata_u; -static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, +static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale, double maxval) { const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval; @@ -115,8 +115,8 @@ static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, } } -static void mypow(Tv val, int npow, const double * restrict powlimit, - Tv * restrict resd, Tv * restrict ress) +static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit, + Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress) { Tv vminv=powlimit[npow]; auto mask = abs(val)eps; if (any_of(mask)) @@ -182,8 +182,8 @@ static inline bool rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, return false; } -NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, - s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2) +MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, + s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2) { int l=gen->m, il=0; Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]; @@ -216,8 +216,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, *l_=l; *il_=il; } -NOINLINE static void alm2map_kernel(s0data_v * restrict d, - const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, +MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm, int l, int il, int lmax, int nv2) { if (nv2==nv0) @@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, } } -NOINLINE static void calc_alm2map (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) +MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job, + const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth) { int l,il,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -298,8 +298,8 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * restrict coef = gen->coef; - const dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; + const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ilmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -392,8 +392,8 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * restrict coef = gen->coef; - dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; icoef; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; Tv prefac=gen->prefac[gen->m], prescale=gen->fscale[gen->m]; Tv limscale=sharp_limscale; @@ -505,8 +505,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, *l_=l; } -NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, - const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, +MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm, int l, int lmax, int nv2) { int lsave = l; @@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, } } -NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) +MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job, + const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) { int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -571,8 +571,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - const dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; + const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ilmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -705,8 +705,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ilmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -826,8 +826,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 15*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - const dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; + const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ispin==0) { //adjust the a_lm for the new algorithm - dcmplx * restrict alm=job->almtmp; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) { dcmplx al = alm[l]; @@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, } } -NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, +MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { @@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, } } //adjust the a_lm for the new algorithm - dcmplx * restrict alm=job->almtmp; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; dcmplx alm2 = 0.; double alold=0; for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) diff --git a/libsharp2/sharp_utils.h b/libsharp2/sharp_utils.h index d2a1c74..a3c9190 100644 --- a/libsharp2/sharp_utils.h +++ b/libsharp2/sharp_utils.h @@ -122,10 +122,4 @@ double sharp_wallTime(void); } #endif -#ifdef __GNUC__ -#define NOINLINE __attribute__((noinline)) -#else -#define NOINLINE -#endif - #endif diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index 2c3d6d2..efc7c66 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -31,11 +31,12 @@ #include #include using std::complex; - - #include using std::experimental::native_simd; using std::experimental::reduce; + +#include "mr_util/useful_macros.h" + using Tv=native_simd; using Tm=Tv::mask_type; using Ts=Tv::value_type; @@ -44,7 +45,7 @@ static constexpr size_t VLEN=Tv::size(); #define vload(a) (a) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex * restrict cc) + complex * MRUTIL_RESTRICT cc) { cc[0] += complex(reduce(a,std::plus<>()),reduce(b,std::plus<>())); cc[1] += complex(reduce(c,std::plus<>()),reduce(d,std::plus<>())); From b27ce30cdea9bc4fd3bc6df81170ae991da9e7b7 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 4 Jan 2020 12:59:59 +0100 Subject: [PATCH 6/6] switch to new FFT --- Makefile.am | 2 - libsharp2/pocketfft.cc | 2198 -------------------------------- libsharp2/pocketfft.h | 50 - libsharp2/sharp.cc | 19 +- libsharp2/sharp_geomhelpers.cc | 14 +- 5 files changed, 14 insertions(+), 2269 deletions(-) delete mode 100644 libsharp2/pocketfft.cc delete mode 100644 libsharp2/pocketfft.h diff --git a/Makefile.am b/Makefile.am index fe09aa6..444163b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,8 +3,6 @@ ACLOCAL_AMFLAGS = -I m4 lib_LTLIBRARIES = libsharp2.la libsharp2_la_SOURCES = \ - libsharp2/pocketfft.cc \ - libsharp2/pocketfft.h \ libsharp2/sharp_utils.cc \ libsharp2/sharp_utils.h \ libsharp2/sharp.cc \ diff --git a/libsharp2/pocketfft.cc b/libsharp2/pocketfft.cc deleted file mode 100644 index 50ee2e8..0000000 --- a/libsharp2/pocketfft.cc +++ /dev/null @@ -1,2198 +0,0 @@ -/* - * This file is part of pocketfft. - * Licensed under a 3-clause BSD style license - see LICENSE.md - */ - -/* - * Main implementation file. - * - * Copyright (C) 2004-2019 Max-Planck-Society - * \author Martin Reinecke - */ - -#include -#include - -#include "libsharp2/pocketfft.h" - -#define RALLOC(type,num) \ - ((type *)malloc((num)*sizeof(type))) -#define DEALLOC(ptr) \ - do { free(ptr); (ptr)=NULL; } while(0) - -#define SWAP(a,b,type) \ - do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0) - -#ifdef __GNUC__ -#define NOINLINE __attribute__((noinline)) -#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result)) -#else -#define NOINLINE -#define WARN_UNUSED_RESULT -#endif - -#pragma GCC visibility push(hidden) - -// adapted from https://stackoverflow.com/questions/42792939/ -// CAUTION: this function only works for arguments in the range [-0.25; 0.25]! -static void my_sincosm1pi (double a, double *restrict res) - { - double s = a * a; - /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ - double r = -1.0369917389758117e-4; - r = fma (r, s, 1.9294935641298806e-3); - r = fma (r, s, -2.5806887942825395e-2); - r = fma (r, s, 2.3533063028328211e-1); - r = fma (r, s, -1.3352627688538006e+0); - r = fma (r, s, 4.0587121264167623e+0); - r = fma (r, s, -4.9348022005446790e+0); - double c = r*s; - /* Approximate sin(pi*x) for x in [-0.25,0.25] */ - r = 4.6151442520157035e-4; - r = fma (r, s, -7.3700183130883555e-3); - r = fma (r, s, 8.2145868949323936e-2); - r = fma (r, s, -5.9926452893214921e-1); - r = fma (r, s, 2.5501640398732688e+0); - r = fma (r, s, -5.1677127800499516e+0); - s = s * a; - r = r * s; - s = fma (a, 3.1415926535897931e+0, r); - res[0] = c; - res[1] = s; - } - -NOINLINE static void calc_first_octant(size_t den, double * restrict res) - { - size_t n = (den+4)>>3; - if (n==0) return; - res[0]=1.; res[1]=0.; - if (n==1) return; - size_t l1=(size_t)sqrt(n); - for (size_t i=1; in) end = n-start; - for (size_t i=1; i>2; - size_t i=0, idx1=0, idx2=2*ndone-2; - for (; i+1>1; - double * p = res+n-1; - calc_first_octant(n<<2, p); - int i4=0, in=n, i=0; - for (; i4<=in-i4; ++i, i4+=4) // octant 0 - { - res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; - } - for (; i4-in <= 0; ++i, i4+=4) // octant 1 - { - int xm = in-i4; - res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; - } - for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 - { - int xm = i4-in; - res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; - } - for (; i>2; - if ((n&7)==0) - res[quart] = res[quart+1] = hsqt2; - for (size_t i=2, j=2*quart-2; i>1; - if ((n&3)==0) - for (size_t i=0; i>1))<<1)==n) - { res=2; n=tmp; } - - size_t limit=(size_t)sqrt(n+0.01); - for (size_t x=3; x<=limit; x+=2) - while (((tmp=(n/x))*x)==n) - { - res=x; - n=tmp; - limit=(size_t)sqrt(n+0.01); - } - if (n>1) res=n; - - return res; - } - -NOINLINE static double cost_guess (size_t n) - { - const double lfp=1.1; // penalty for non-hardcoded larger factors - size_t ni=n; - double result=0.; - size_t tmp; - while (((tmp=(n>>1))<<1)==n) - { result+=2; n=tmp; } - - size_t limit=(size_t)sqrt(n+0.01); - for (size_t x=3; x<=limit; x+=2) - while ((tmp=(n/x))*x==n) - { - result+= (x<=5) ? x : lfp*x; // penalize larger prime factors - n=tmp; - limit=(size_t)sqrt(n+0.01); - } - if (n>1) result+=(n<=5) ? n : lfp*n; - - return result*ni; - } - -/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ -NOINLINE static size_t good_size(size_t n) - { - if (n<=6) return n; - - size_t bestfac=2*n; - for (size_t f2=1; f2=n) bestfac=f235711; - return bestfac; - } - -typedef struct cmplx { - double r,i; -} cmplx; - -#define NFCT 25 -typedef struct cfftp_fctdata - { - size_t fct; - cmplx *tw, *tws; - } cfftp_fctdata; - -typedef struct cfftp_plan_i - { - size_t length, nfct; - cmplx *mem; - cfftp_fctdata fct[NFCT]; - } cfftp_plan_i; -typedef struct cfftp_plan_i * cfftp_plan; - -#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } -#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } -#define SCALEC(a,b) { a.r*=b; a.i*=b; } -#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } -#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; } -#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] -#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] -#define WA(x,i) wa[(i)-1+(x)*(ido-1)] -/* a = b*c */ -#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; } -/* a = conj(b)*c*/ -#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; } - -#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; } -/* a = b*c */ -#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; } -/* a *= b */ -#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; } - -NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc, - cmplx * restrict ch, const cmplx * restrict wa) - { - const size_t cdim=2; - - if (ido==1) - for (size_t k=0; kip) iwal-=ip; - cmplx xwal=wal[iwal]; - iwal+=l; if (iwal>ip) iwal-=ip; - cmplx xwal2=wal[iwal]; - for (size_t ik=0; ikip) iwal-=ip; - cmplx xwal=wal[iwal]; - for (size_t ik=0; iklength==1) return 0; - size_t len=plan->length; - size_t l1=1, nf=plan->nfct; - cmplx *ch = RALLOC(cmplx, len); - if (!ch) return -1; - cmplx *p1=c, *p2=ch; - - for(size_t k1=0; k1fct[k1].fct; - size_t l2=ip*l1; - size_t ido = len/l2; - if (ip==4) - sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw) - : pass4f (ido, l1, p1, p2, plan->fct[k1].tw); - else if(ip==2) - sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw) - : pass2f (ido, l1, p1, p2, plan->fct[k1].tw); - else if(ip==3) - sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw) - : pass3f (ido, l1, p1, p2, plan->fct[k1].tw); - else if(ip==5) - sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw) - : pass5f (ido, l1, p1, p2, plan->fct[k1].tw); - else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign); - else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign); - else - { - if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign)) - { DEALLOC(ch); return -1; } - SWAP(p1,p2,cmplx *); - } - SWAP(p1,p2,cmplx *); - l1=l2; - } - if (p1!=c) - { - if (fct!=1.) - for (size_t i=0; ilength; - size_t nfct=0; - while ((length%4)==0) - { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } - if ((length%2)==0) - { - length>>=1; - // factor 2 should be at the front of the factor list - if (nfct>=NFCT) return -1; - plan->fct[nfct++].fct=2; - SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); - } - size_t maxl=(size_t)(sqrt((double)length))+1; - for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; - plan->fct[nfct++].fct=divisor; - length/=divisor; - } - maxl=(size_t)(sqrt((double)length))+1; - } - if (length>1) plan->fct[nfct++].fct=length; - plan->nfct=nfct; - return 0; - } - -NOINLINE static size_t cfftp_twsize (cfftp_plan plan) - { - size_t twsize=0, l1=1; - for (size_t k=0; knfct; ++k) - { - size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); - twsize+=(ip-1)*(ido-1); - if (ip>11) - twsize+=ip; - l1*=ip; - } - return twsize; - } - -NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan) - { - size_t length=plan->length; - double *twid = RALLOC(double, 2*length); - if (!twid) return -1; - sincos_2pibyn(length, twid); - size_t l1=1; - size_t memofs=0; - for (size_t k=0; knfct; ++k) - { - size_t ip=plan->fct[k].fct, ido= length/(l1*ip); - plan->fct[k].tw=plan->mem+memofs; - memofs+=(ip-1)*(ido-1); - for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i]; - plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1]; - } - if (ip>11) - { - plan->fct[k].tws=plan->mem+memofs; - memofs+=ip; - for (size_t j=0; jfct[k].tws[j].r = twid[2*j*l1*ido]; - plan->fct[k].tws[j].i = twid[2*j*l1*ido+1]; - } - } - l1*=ip; - } - DEALLOC(twid); - return 0; - } - -static cfftp_plan make_cfftp_plan (size_t length) - { - if (length==0) return NULL; - cfftp_plan plan = RALLOC(cfftp_plan_i,1); - if (!plan) return NULL; - plan->length=length; - plan->nfct=0; - for (size_t i=0; ifct[i]=(cfftp_fctdata){0,0,0}; - plan->mem=0; - if (length==1) return plan; - if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } - size_t tws=cfftp_twsize(plan); - plan->mem=RALLOC(cmplx,tws); - if (!plan->mem) { DEALLOC(plan); return NULL; } - if (cfftp_comp_twiddle(plan)!=0) - { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } - return plan; - } - -static void destroy_cfftp_plan (cfftp_plan plan) - { - DEALLOC(plan->mem); - DEALLOC(plan); - } - -typedef struct rfftp_fctdata - { - size_t fct; - double *tw, *tws; - } rfftp_fctdata; - -typedef struct rfftp_plan_i - { - size_t length, nfct; - double *mem; - rfftp_fctdata fct[NFCT]; - } rfftp_plan_i; -typedef struct rfftp_plan_i * rfftp_plan; - -#define WA(x,i) wa[(i)+(x)*(ido-1)] -#define PM(a,b,c,d) { a=c+d; b=c-d; } -/* (a+ib) = conj(c+id) * (e+if) */ -#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } - -#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] -#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] - -NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc, - double * restrict ch, const double * restrict wa) - { - const size_t cdim=2; - - for (size_t k=0; k1) - { - for (size_t j=1, jc=ip-1; j=ip) iang-=ip; - double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; - iang+=l; if (iang>=ip) iang-=ip; - double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; - iang+=l; if (iang>=ip) iang-=ip; - double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; - iang+=l; if (iang>=ip) iang-=ip; - double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; - for (size_t ik=0; ik=ip) iang-=ip; - double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; - iang+=l; if (iang>=ip) iang-=ip; - double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; - for (size_t ik=0; ik=ip) iang-=ip; - double ar=csarr[2*iang], ai=csarr[2*iang+1]; - for (size_t ik=0; ikip) iang-=ip; - double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; - iang+=l; if(iang>ip) iang-=ip; - double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; - iang+=l; if(iang>ip) iang-=ip; - double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; - iang+=l; if(iang>ip) iang-=ip; - double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; - for (size_t ik=0; ikip) iang-=ip; - double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; - iang+=l; if(iang>ip) iang-=ip; - double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; - for (size_t ik=0; ikip) iang-=ip; - double war=csarr[2*iang], wai=csarr[2*iang+1]; - for (size_t ik=0; iklength==1) return 0; - size_t n=plan->length; - size_t l1=n, nf=plan->nfct; - double *ch = RALLOC(double, n); - if (!ch) return -1; - double *p1=c, *p2=ch; - - for(size_t k1=0; k1fct[k].fct; - size_t ido=n / l1; - l1 /= ip; - if(ip==4) - radf4(ido, l1, p1, p2, plan->fct[k].tw); - else if(ip==2) - radf2(ido, l1, p1, p2, plan->fct[k].tw); - else if(ip==3) - radf3(ido, l1, p1, p2, plan->fct[k].tw); - else if(ip==5) - radf5(ido, l1, p1, p2, plan->fct[k].tw); - else - { - radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); - SWAP (p1,p2,double *); - } - SWAP (p1,p2,double *); - } - copy_and_norm(c,p1,n,fct); - DEALLOC(ch); - return 0; - } - -WARN_UNUSED_RESULT -static int rfftp_backward(rfftp_plan plan, double c[], double fct) - { - if (plan->length==1) return 0; - size_t n=plan->length; - size_t l1=1, nf=plan->nfct; - double *ch = RALLOC(double, n); - if (!ch) return -1; - double *p1=c, *p2=ch; - - for(size_t k=0; kfct[k].fct, - ido= n/(ip*l1); - if(ip==4) - radb4(ido, l1, p1, p2, plan->fct[k].tw); - else if(ip==2) - radb2(ido, l1, p1, p2, plan->fct[k].tw); - else if(ip==3) - radb3(ido, l1, p1, p2, plan->fct[k].tw); - else if(ip==5) - radb5(ido, l1, p1, p2, plan->fct[k].tw); - else - radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); - SWAP (p1,p2,double *); - l1*=ip; - } - copy_and_norm(c,p1,n,fct); - DEALLOC(ch); - return 0; - } - -WARN_UNUSED_RESULT -static int rfftp_factorize (rfftp_plan plan) - { - size_t length=plan->length; - size_t nfct=0; - while ((length%4)==0) - { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } - if ((length%2)==0) - { - length>>=1; - // factor 2 should be at the front of the factor list - if (nfct>=NFCT) return -1; - plan->fct[nfct++].fct=2; - SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); - } - size_t maxl=(size_t)(sqrt((double)length))+1; - for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; - plan->fct[nfct++].fct=divisor; - length/=divisor; - } - maxl=(size_t)(sqrt((double)length))+1; - } - if (length>1) plan->fct[nfct++].fct=length; - plan->nfct=nfct; - return 0; - } - -static size_t rfftp_twsize(rfftp_plan plan) - { - size_t twsize=0, l1=1; - for (size_t k=0; knfct; ++k) - { - size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); - twsize+=(ip-1)*(ido-1); - if (ip>5) twsize+=2*ip; - l1*=ip; - } - return twsize; - return 0; - } - -WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan) - { - size_t length=plan->length; - double *twid = RALLOC(double, 2*length); - if (!twid) return -1; - sincos_2pibyn_half(length, twid); - size_t l1=1; - double *ptr=plan->mem; - for (size_t k=0; knfct; ++k) - { - size_t ip=plan->fct[k].fct, ido=length/(l1*ip); - if (knfct-1) // last factor doesn't need twiddles - { - plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1); - for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; - plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; - } - } - if (ip>5) // special factors required by *g functions - { - plan->fct[k].tws=ptr; ptr+=2*ip; - plan->fct[k].tws[0] = 1.; - plan->fct[k].tws[1] = 0.; - for (size_t i=1; i<=(ip>>1); ++i) - { - plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)]; - plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; - plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; - plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; - } - } - l1*=ip; - } - DEALLOC(twid); - return 0; - } - -NOINLINE static rfftp_plan make_rfftp_plan (size_t length) - { - if (length==0) return NULL; - rfftp_plan plan = RALLOC(rfftp_plan_i,1); - if (!plan) return NULL; - plan->length=length; - plan->nfct=0; - plan->mem=NULL; - for (size_t i=0; ifct[i]=(rfftp_fctdata){0,0,0}; - if (length==1) return plan; - if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } - size_t tws=rfftp_twsize(plan); - plan->mem=RALLOC(double,tws); - if (!plan->mem) { DEALLOC(plan); return NULL; } - if (rfftp_comp_twiddle(plan)!=0) - { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } - return plan; - } - -NOINLINE static void destroy_rfftp_plan (rfftp_plan plan) - { - DEALLOC(plan->mem); - DEALLOC(plan); - } - -typedef struct fftblue_plan_i - { - size_t n, n2; - cfftp_plan plan; - double *mem; - double *bk, *bkf; - } fftblue_plan_i; -typedef struct fftblue_plan_i * fftblue_plan; - -NOINLINE static fftblue_plan make_fftblue_plan (size_t length) - { - fftblue_plan plan = RALLOC(fftblue_plan_i,1); - if (!plan) return NULL; - plan->n = length; - plan->n2 = good_size(plan->n*2-1); - plan->mem = RALLOC(double, 2*plan->n+2*plan->n2); - if (!plan->mem) { DEALLOC(plan); return NULL; } - plan->bk = plan->mem; - plan->bkf = plan->bk+2*plan->n; - -/* initialize b_k */ - double *tmp = RALLOC(double,4*plan->n); - if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } - sincos_2pibyn(2*plan->n,tmp); - plan->bk[0] = 1; - plan->bk[1] = 0; - - size_t coeff=0; - for (size_t m=1; mn; ++m) - { - coeff+=2*m-1; - if (coeff>=2*plan->n) coeff-=2*plan->n; - plan->bk[2*m ] = tmp[2*coeff ]; - plan->bk[2*m+1] = tmp[2*coeff+1]; - } - - /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ - double xn2 = 1./plan->n2; - plan->bkf[0] = plan->bk[0]*xn2; - plan->bkf[1] = plan->bk[1]*xn2; - for (size_t m=2; m<2*plan->n; m+=2) - { - plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2; - plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2; - } - for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m) - plan->bkf[m]=0.; - plan->plan=make_cfftp_plan(plan->n2); - if (!plan->plan) - { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } - if (cfftp_forward(plan->plan,plan->bkf,1.)!=0) - { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } - DEALLOC(tmp); - - return plan; - } - -NOINLINE static void destroy_fftblue_plan (fftblue_plan plan) - { - DEALLOC(plan->mem); - destroy_cfftp_plan(plan->plan); - DEALLOC(plan); - } - -NOINLINE WARN_UNUSED_RESULT -static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct) - { - size_t n=plan->n; - size_t n2=plan->n2; - double *bk = plan->bk; - double *bkf = plan->bkf; - double *akf = RALLOC(double, 2*n2); - if (!akf) return -1; - -/* initialize a_k and FFT it */ - if (isign>0) - for (size_t m=0; m<2*n; m+=2) - { - akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1]; - akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m]; - } - else - for (size_t m=0; m<2*n; m+=2) - { - akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1]; - akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m]; - } - for (size_t m=2*n; m<2*n2; ++m) - akf[m]=0; - - if (cfftp_forward (plan->plan,akf,fct)!=0) - { DEALLOC(akf); return -1; } - -/* do the convolution */ - if (isign>0) - for (size_t m=0; m<2*n2; m+=2) - { - double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; - akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1]; - akf[m+1] = im; - } - else - for (size_t m=0; m<2*n2; m+=2) - { - double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; - akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1]; - akf[m+1] = im; - } - -/* inverse FFT */ - if (cfftp_backward (plan->plan,akf,1.)!=0) - { DEALLOC(akf); return -1; } - -/* multiply by b_k */ - if (isign>0) - for (size_t m=0; m<2*n; m+=2) - { - c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1]; - c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1]; - } - else - for (size_t m=0; m<2*n; m+=2) - { - c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1]; - c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1]; - } - DEALLOC(akf); - return 0; - } - -WARN_UNUSED_RESULT -static int cfftblue_backward(fftblue_plan plan, double c[], double fct) - { return fftblue_fft(plan,c,1,fct); } - -WARN_UNUSED_RESULT -static int cfftblue_forward(fftblue_plan plan, double c[], double fct) - { return fftblue_fft(plan,c,-1,fct); } - -WARN_UNUSED_RESULT -static int rfftblue_backward(fftblue_plan plan, double c[], double fct) - { - size_t n=plan->n; - double *tmp = RALLOC(double,2*n); - if (!tmp) return -1; - tmp[0]=c[0]; - tmp[1]=0.; - memcpy (tmp+2,c+1, (n-1)*sizeof(double)); - if ((n&1)==0) tmp[n+1]=0.; - for (size_t m=2; mn; - double *tmp = RALLOC(double,2*n); - if (!tmp) return -1; - for (size_t m=0; mblueplan=0; - plan->packplan=0; - if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) - { - plan->packplan=make_cfftp_plan(length); - if (!plan->packplan) { DEALLOC(plan); return NULL; } - return plan; - } - double comp1 = cost_guess(length); - double comp2 = 2*cost_guess(good_size(2*length-1)); - comp2*=1.5; /* fudge factor that appears to give good overall performance */ - if (comp2blueplan=make_fftblue_plan(length); - if (!plan->blueplan) { DEALLOC(plan); return NULL; } - } - else - { - plan->packplan=make_cfftp_plan(length); - if (!plan->packplan) { DEALLOC(plan); return NULL; } - } - return plan; - } - -void pocketfft_delete_plan_c (pocketfft_plan_c plan) - { - if (plan->blueplan) - destroy_fftblue_plan(plan->blueplan); - if (plan->packplan) - destroy_cfftp_plan(plan->packplan); - DEALLOC(plan); - } - -WARN_UNUSED_RESULT -int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct) - { - if (plan->packplan) - return cfftp_backward(plan->packplan,c,fct); - // if (plan->blueplan) - return cfftblue_backward(plan->blueplan,c,fct); - } - -WARN_UNUSED_RESULT -int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct) - { - if (plan->packplan) - return cfftp_forward(plan->packplan,c,fct); - // if (plan->blueplan) - return cfftblue_forward(plan->blueplan,c,fct); - } - -typedef struct pocketfft_plan_r_i - { - rfftp_plan packplan; - fftblue_plan blueplan; - } pocketfft_plan_r_i; - -pocketfft_plan_r pocketfft_make_plan_r (size_t length) - { - if (length==0) return NULL; - pocketfft_plan_r plan = RALLOC(pocketfft_plan_r_i,1); - if (!plan) return NULL; - plan->blueplan=0; - plan->packplan=0; - if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) - { - plan->packplan=make_rfftp_plan(length); - if (!plan->packplan) { DEALLOC(plan); return NULL; } - return plan; - } - double comp1 = 0.5*cost_guess(length); - double comp2 = 2*cost_guess(good_size(2*length-1)); - comp2*=1.5; /* fudge factor that appears to give good overall performance */ - if (comp2blueplan=make_fftblue_plan(length); - if (!plan->blueplan) { DEALLOC(plan); return NULL; } - } - else - { - plan->packplan=make_rfftp_plan(length); - if (!plan->packplan) { DEALLOC(plan); return NULL; } - } - return plan; - } - -void pocketfft_delete_plan_r (pocketfft_plan_r plan) - { - if (plan->blueplan) - destroy_fftblue_plan(plan->blueplan); - if (plan->packplan) - destroy_rfftp_plan(plan->packplan); - DEALLOC(plan); - } - -size_t pocketfft_length_r(pocketfft_plan_r plan) - { - if (plan->packplan) return plan->packplan->length; - return plan->blueplan->n; - } - -size_t pocketfft_length_c(pocketfft_plan_c plan) - { - if (plan->packplan) return plan->packplan->length; - return plan->blueplan->n; - } - -WARN_UNUSED_RESULT -int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct) - { - if (plan->packplan) - return rfftp_backward(plan->packplan,c,fct); - else // if (plan->blueplan) - return rfftblue_backward(plan->blueplan,c,fct); - } - -WARN_UNUSED_RESULT -int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct) - { - if (plan->packplan) - return rfftp_forward(plan->packplan,c,fct); - else // if (plan->blueplan) - return rfftblue_forward(plan->blueplan,c,fct); - } - -#pragma GCC visibility pop diff --git a/libsharp2/pocketfft.h b/libsharp2/pocketfft.h deleted file mode 100644 index b1a02ee..0000000 --- a/libsharp2/pocketfft.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * This file is part of pocketfft. - * Licensed under a 3-clause BSD style license - see LICENSE.md - */ - -/*! \file pocketfft.h - * Public interface of the pocketfft library - * - * Copyright (C) 2008-2019 Max-Planck-Society - * \author Martin Reinecke - */ - -#ifndef POCKETFFT_H -#define POCKETFFT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct pocketfft_plan_c_i; -typedef struct pocketfft_plan_c_i * pocketfft_plan_c; -pocketfft_plan_c pocketfft_make_plan_c (size_t length); -void pocketfft_delete_plan_c (pocketfft_plan_c plan); -int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct); -int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct); -size_t pocketfft_length_c(pocketfft_plan_c plan); - -struct pocketfft_plan_r_i; -typedef struct pocketfft_plan_r_i * pocketfft_plan_r; -pocketfft_plan_r pocketfft_make_plan_r (size_t length); -void pocketfft_delete_plan_r (pocketfft_plan_r plan); -/*! Computes a real backward FFT on \a c, using \a plan - and assuming the FFTPACK storage scheme: - - on entry, \a c has the form r0, r1, i1, r2, i2, ... - - on exit, it has the form r0, r1, ..., r[length-1]. */ -int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct); -/*! Computes a real forward FFT on \a c, using \a plan - and assuming the FFTPACK storage scheme: - - on entry, \a c has the form r0, r1, ..., r[length-1]; - - on exit, it has the form r0, r1, i1, r2, i2, ... */ -int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct); -size_t pocketfft_length_r(pocketfft_plan_r plan); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/libsharp2/sharp.cc b/libsharp2/sharp.cc index a49c106..86d7ca5 100644 --- a/libsharp2/sharp.cc +++ b/libsharp2/sharp.cc @@ -25,9 +25,10 @@ * \author Martin Reinecke \author Dag Sverre Seljebotn */ -#include -#include -#include "libsharp2/pocketfft.h" +#include +#include +#include +#include "mr_util/fft.h" #include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_internal.h" #include "libsharp2/sharp_utils.h" @@ -78,7 +79,7 @@ typedef struct double phi0_; dcmplx *shiftarr; int s_shift; - pocketfft_plan_r plan; + mr::detail_fft::rfftp *plan; int length; int norot; } ringhelper; @@ -91,7 +92,7 @@ static void ringhelper_init (ringhelper *self) static void ringhelper_destroy (ringhelper *self) { - if (self->plan) pocketfft_delete_plan_r(self->plan); + delete self->plan; DEALLOC(self->shiftarr); ringhelper_init(self); } @@ -114,8 +115,8 @@ MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mm // if (!self->plan) self->plan=pocketfft_make_plan_r(nph); if (nph!=(int)self->length) { - if (self->plan) pocketfft_delete_plan_r(self->plan); - self->plan=pocketfft_make_plan_r(nph); + if (self->plan) delete self->plan; + self->plan=new mr::detail_fft::rfftp(nph); self->length=nph; } } @@ -332,7 +333,7 @@ MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self, } } data[1]=data[0]; - pocketfft_backward_r (self->plan, &(data[1]), 1.); + self->plan->exec(&(data[1]), 1., false); } MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, @@ -351,7 +352,7 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, if (flags&SHARP_REAL_HARMONICS) wgt *= sqrt_two; - pocketfft_forward_r (self->plan, &(data[1]), 1.); + self->plan->exec (&(data[1]), 1., true); data[0]=data[1]; data[1]=data[nph+1]=0.; diff --git a/libsharp2/sharp_geomhelpers.cc b/libsharp2/sharp_geomhelpers.cc index db459bc..cf838e2 100644 --- a/libsharp2/sharp_geomhelpers.cc +++ b/libsharp2/sharp_geomhelpers.cc @@ -29,7 +29,7 @@ #include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_legendre_roots.h" #include "libsharp2/sharp_utils.h" -#include "libsharp2/pocketfft.h" +#include "mr_util/fft.h" void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings, const int *rings, const double *weight, sharp_geom_info **geom_info) @@ -155,9 +155,7 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0, weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings); } if ((nrings&1)==0) weight[nrings-1]=0.; - pocketfft_plan_r plan = pocketfft_make_plan_r(nrings); - pocketfft_backward_r(plan,weight,1.); - pocketfft_delete_plan_r(plan); + mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); for (int m=0; m<(nrings+1)/2; ++m) { @@ -202,9 +200,7 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0, for (int k=1; k<=(n/2-1); ++k) weight[2*k-1]=2./(1.-4.*k*k) + dw; weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1); - pocketfft_plan_r plan = pocketfft_make_plan_r(n); - pocketfft_backward_r(plan,weight,1.); - pocketfft_delete_plan_r(plan); + mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); weight[n]=weight[0]; for (int m=0; m<(nrings+1)/2; ++m) @@ -250,9 +246,7 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0, for (int k=1; k<=(n/2-1); ++k) weight[2*k-1]=2./(1.-4.*k*k); weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.; - pocketfft_plan_r plan = pocketfft_make_plan_r(n); - pocketfft_backward_r(plan,weight,1.); - pocketfft_delete_plan_r(plan); + mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); for (int m=0; m