diff --git a/COMPILE b/COMPILE index 876a6b3..e3a87b7 100644 --- a/COMPILE +++ b/COMPILE @@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the configure command line. At runtime OMP_NUM_THREADS should be set to the number of hardware threads (not physical cores) of the system. -(Usually this is already the default setting when OMP_NUM_THREADS is not +(Usually this is already the default setting when OMP_NUM_THREADS is not specified.) diff --git a/Makefile.am b/Makefile.am index 444163b..e105ca0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,14 +3,16 @@ ACLOCAL_AMFLAGS = -I m4 lib_LTLIBRARIES = libsharp2.la libsharp2_la_SOURCES = \ - libsharp2/sharp_utils.cc \ + libsharp2/pocketfft.c \ + libsharp2/pocketfft.h \ + libsharp2/sharp_utils.c \ libsharp2/sharp_utils.h \ - libsharp2/sharp.cc \ - libsharp2/sharp_almhelpers.cc \ - libsharp2/sharp_core.cc \ - libsharp2/sharp_geomhelpers.cc \ - libsharp2/sharp_legendre_roots.cc \ - libsharp2/sharp_ylmgen_c.cc \ + libsharp2/sharp.c \ + libsharp2/sharp_almhelpers.c \ + libsharp2/sharp_core.c \ + libsharp2/sharp_geomhelpers.c \ + libsharp2/sharp_legendre_roots.c \ + libsharp2/sharp_ylmgen_c.c \ libsharp2/sharp_internal.h \ libsharp2/sharp_legendre_roots.h \ libsharp2/sharp_vecsupport.h \ @@ -22,25 +24,25 @@ libsharp2_la_SOURCES = \ # any backward-compatible change: increase age # any backward-incompatible change: age=0 # ==> age <= current -libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread +libsharp2_la_LDFLAGS = -version-info 0:0:0 -AM_CXXFLAGS = @AM_CXXFLAGS@ +AM_CFLAGS = @AM_CFLAGS@ if HAVE_MULTIARCH -libavx_la_SOURCES = libsharp2/sharp_core_inc.cc -libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc -libfma_la_SOURCES = libsharp2/sharp_core_inc.cc -libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc -libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc +libavx_la_SOURCES = libsharp2/sharp_core_inc.c +libavx2_la_SOURCES = libsharp2/sharp_core_inc.c +libfma_la_SOURCES = libsharp2/sharp_core_inc.c +libfma4_la_SOURCES = libsharp2/sharp_core_inc.c +libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la -libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx -libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2 -libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma -libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4 -libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f +libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx +libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 +libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma +libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 +libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la @@ -54,10 +56,10 @@ nobase_include_HEADERS = \ libsharp2/sharp_cxx.h EXTRA_DIST = \ - runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc + runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c check_PROGRAMS = sharp2_testsuite -sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h +sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h sharp2_testsuite_LDADD = libsharp2.la TESTS = runtest.sh diff --git a/README.md b/README.md index c526224..c9f64e4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,13 @@ +NOTICE +====== + +Active development of this package has stopped. The package will receive bug fixes +if necessary, but otherwise the code has been integrated into the `ducc0` +package (https://gitlab.mpcdf.mpg.de/mtr/ducc), and further development is +taking place there. + +Please prefer `ducc0` over `libsharp2` if you are starting a new project! + # Libsharp2 Library for efficient spherical harmonic transforms at arbitrary spins, diff --git a/configure.ac b/configure.ac index 0b510b8..9503fc1 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([libsharp2], [1.0.0]) +AC_INIT([libsharp2],[1.0.0]) AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror]) AM_MAINTAINER_MODE([enable]) @@ -20,21 +20,28 @@ dnl m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) -AC_PROG_CXX -AX_CXX_COMPILE_STDCXX([11]) +m4_version_prereq(2.70, [AC_PROG_CC], [AC_PROG_CC_C99]) +AC_OPENMP -AC_PROG_LIBTOOL +# add math library +LIBS="-lm" -tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'` +LT_INIT + +tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'` AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0]) +AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS" + PACKAGE_LIBS="-lsharp2" +PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS" +PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS" dnl dnl Create pkgconfig .pc file. dnl AX_CREATE_PKGCONFIG_INFO(,,,,[]) -AC_SUBST([AM_CXXFLAGS]) +AC_SUBST([AM_CFLAGS]) AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/libsharp2/pocketfft.c b/libsharp2/pocketfft.c new file mode 100644 index 0000000..50ee2e8 --- /dev/null +++ b/libsharp2/pocketfft.c @@ -0,0 +1,2198 @@ +/* + * This file is part of pocketfft. + * Licensed under a 3-clause BSD style license - see LICENSE.md + */ + +/* + * Main implementation file. + * + * Copyright (C) 2004-2019 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include + +#include "libsharp2/pocketfft.h" + +#define RALLOC(type,num) \ + ((type *)malloc((num)*sizeof(type))) +#define DEALLOC(ptr) \ + do { free(ptr); (ptr)=NULL; } while(0) + +#define SWAP(a,b,type) \ + do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0) + +#ifdef __GNUC__ +#define NOINLINE __attribute__((noinline)) +#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result)) +#else +#define NOINLINE +#define WARN_UNUSED_RESULT +#endif + +#pragma GCC visibility push(hidden) + +// adapted from https://stackoverflow.com/questions/42792939/ +// CAUTION: this function only works for arguments in the range [-0.25; 0.25]! +static void my_sincosm1pi (double a, double *restrict res) + { + double s = a * a; + /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ + double r = -1.0369917389758117e-4; + r = fma (r, s, 1.9294935641298806e-3); + r = fma (r, s, -2.5806887942825395e-2); + r = fma (r, s, 2.3533063028328211e-1); + r = fma (r, s, -1.3352627688538006e+0); + r = fma (r, s, 4.0587121264167623e+0); + r = fma (r, s, -4.9348022005446790e+0); + double c = r*s; + /* Approximate sin(pi*x) for x in [-0.25,0.25] */ + r = 4.6151442520157035e-4; + r = fma (r, s, -7.3700183130883555e-3); + r = fma (r, s, 8.2145868949323936e-2); + r = fma (r, s, -5.9926452893214921e-1); + r = fma (r, s, 2.5501640398732688e+0); + r = fma (r, s, -5.1677127800499516e+0); + s = s * a; + r = r * s; + s = fma (a, 3.1415926535897931e+0, r); + res[0] = c; + res[1] = s; + } + +NOINLINE static void calc_first_octant(size_t den, double * restrict res) + { + size_t n = (den+4)>>3; + if (n==0) return; + res[0]=1.; res[1]=0.; + if (n==1) return; + size_t l1=(size_t)sqrt(n); + for (size_t i=1; in) end = n-start; + for (size_t i=1; i>2; + size_t i=0, idx1=0, idx2=2*ndone-2; + for (; i+1>1; + double * p = res+n-1; + calc_first_octant(n<<2, p); + int i4=0, in=n, i=0; + for (; i4<=in-i4; ++i, i4+=4) // octant 0 + { + res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; + } + for (; i4-in <= 0; ++i, i4+=4) // octant 1 + { + int xm = in-i4; + res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; + } + for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 + { + int xm = i4-in; + res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; + } + for (; i>2; + if ((n&7)==0) + res[quart] = res[quart+1] = hsqt2; + for (size_t i=2, j=2*quart-2; i>1; + if ((n&3)==0) + for (size_t i=0; i>1))<<1)==n) + { res=2; n=tmp; } + + size_t limit=(size_t)sqrt(n+0.01); + for (size_t x=3; x<=limit; x+=2) + while (((tmp=(n/x))*x)==n) + { + res=x; + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) res=n; + + return res; + } + +NOINLINE static double cost_guess (size_t n) + { + const double lfp=1.1; // penalty for non-hardcoded larger factors + size_t ni=n; + double result=0.; + size_t tmp; + while (((tmp=(n>>1))<<1)==n) + { result+=2; n=tmp; } + + size_t limit=(size_t)sqrt(n+0.01); + for (size_t x=3; x<=limit; x+=2) + while ((tmp=(n/x))*x==n) + { + result+= (x<=5) ? x : lfp*x; // penalize larger prime factors + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) result+=(n<=5) ? n : lfp*n; + + return result*ni; + } + +/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ +NOINLINE static size_t good_size(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f2=1; f2=n) bestfac=f235711; + return bestfac; + } + +typedef struct cmplx { + double r,i; +} cmplx; + +#define NFCT 25 +typedef struct cfftp_fctdata + { + size_t fct; + cmplx *tw, *tws; + } cfftp_fctdata; + +typedef struct cfftp_plan_i + { + size_t length, nfct; + cmplx *mem; + cfftp_fctdata fct[NFCT]; + } cfftp_plan_i; +typedef struct cfftp_plan_i * cfftp_plan; + +#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } +#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } +#define SCALEC(a,b) { a.r*=b; a.i*=b; } +#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } +#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; } +#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] +#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] +#define WA(x,i) wa[(i)-1+(x)*(ido-1)] +/* a = b*c */ +#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; } +/* a = conj(b)*c*/ +#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; } + +#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; } +/* a = b*c */ +#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; } +/* a *= b */ +#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; } + +NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc, + cmplx * restrict ch, const cmplx * restrict wa) + { + const size_t cdim=2; + + if (ido==1) + for (size_t k=0; kip) iwal-=ip; + cmplx xwal=wal[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + cmplx xwal2=wal[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + cmplx xwal=wal[iwal]; + for (size_t ik=0; iklength==1) return 0; + size_t len=plan->length; + size_t l1=1, nf=plan->nfct; + cmplx *ch = RALLOC(cmplx, len); + if (!ch) return -1; + cmplx *p1=c, *p2=ch; + + for(size_t k1=0; k1fct[k1].fct; + size_t l2=ip*l1; + size_t ido = len/l2; + if (ip==4) + sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass4f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==2) + sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass2f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==3) + sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass3f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==5) + sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw) + : pass5f (ido, l1, p1, p2, plan->fct[k1].tw); + else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign); + else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign); + else + { + if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign)) + { DEALLOC(ch); return -1; } + SWAP(p1,p2,cmplx *); + } + SWAP(p1,p2,cmplx *); + l1=l2; + } + if (p1!=c) + { + if (fct!=1.) + for (size_t i=0; ilength; + size_t nfct=0; + while ((length%4)==0) + { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } + if ((length%2)==0) + { + length>>=1; + // factor 2 should be at the front of the factor list + if (nfct>=NFCT) return -1; + plan->fct[nfct++].fct=2; + SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); + } + size_t maxl=(size_t)(sqrt((double)length))+1; + for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; + plan->fct[nfct++].fct=divisor; + length/=divisor; + } + maxl=(size_t)(sqrt((double)length))+1; + } + if (length>1) plan->fct[nfct++].fct=length; + plan->nfct=nfct; + return 0; + } + +NOINLINE static size_t cfftp_twsize (cfftp_plan plan) + { + size_t twsize=0, l1=1; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); + twsize+=(ip-1)*(ido-1); + if (ip>11) + twsize+=ip; + l1*=ip; + } + return twsize; + } + +NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan) + { + size_t length=plan->length; + double *twid = RALLOC(double, 2*length); + if (!twid) return -1; + sincos_2pibyn(length, twid); + size_t l1=1; + size_t memofs=0; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= length/(l1*ip); + plan->fct[k].tw=plan->mem+memofs; + memofs+=(ip-1)*(ido-1); + for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i]; + plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1]; + } + if (ip>11) + { + plan->fct[k].tws=plan->mem+memofs; + memofs+=ip; + for (size_t j=0; jfct[k].tws[j].r = twid[2*j*l1*ido]; + plan->fct[k].tws[j].i = twid[2*j*l1*ido+1]; + } + } + l1*=ip; + } + DEALLOC(twid); + return 0; + } + +static cfftp_plan make_cfftp_plan (size_t length) + { + if (length==0) return NULL; + cfftp_plan plan = RALLOC(cfftp_plan_i,1); + if (!plan) return NULL; + plan->length=length; + plan->nfct=0; + for (size_t i=0; ifct[i]=(cfftp_fctdata){0,0,0}; + plan->mem=0; + if (length==1) return plan; + if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } + size_t tws=cfftp_twsize(plan); + plan->mem=RALLOC(cmplx,tws); + if (!plan->mem) { DEALLOC(plan); return NULL; } + if (cfftp_comp_twiddle(plan)!=0) + { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + return plan; + } + +static void destroy_cfftp_plan (cfftp_plan plan) + { + DEALLOC(plan->mem); + DEALLOC(plan); + } + +typedef struct rfftp_fctdata + { + size_t fct; + double *tw, *tws; + } rfftp_fctdata; + +typedef struct rfftp_plan_i + { + size_t length, nfct; + double *mem; + rfftp_fctdata fct[NFCT]; + } rfftp_plan_i; +typedef struct rfftp_plan_i * rfftp_plan; + +#define WA(x,i) wa[(i)+(x)*(ido-1)] +#define PM(a,b,c,d) { a=c+d; b=c-d; } +/* (a+ib) = conj(c+id) * (e+if) */ +#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } + +#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] +#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] + +NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc, + double * restrict ch, const double * restrict wa) + { + const size_t cdim=2; + + for (size_t k=0; k1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + double ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + double ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + double war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; iklength==1) return 0; + size_t n=plan->length; + size_t l1=n, nf=plan->nfct; + double *ch = RALLOC(double, n); + if (!ch) return -1; + double *p1=c, *p2=ch; + + for(size_t k1=0; k1fct[k].fct; + size_t ido=n / l1; + l1 /= ip; + if(ip==4) + radf4(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==2) + radf2(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==3) + radf3(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==5) + radf5(ido, l1, p1, p2, plan->fct[k].tw); + else + { + radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); + SWAP (p1,p2,double *); + } + SWAP (p1,p2,double *); + } + copy_and_norm(c,p1,n,fct); + DEALLOC(ch); + return 0; + } + +WARN_UNUSED_RESULT +static int rfftp_backward(rfftp_plan plan, double c[], double fct) + { + if (plan->length==1) return 0; + size_t n=plan->length; + size_t l1=1, nf=plan->nfct; + double *ch = RALLOC(double, n); + if (!ch) return -1; + double *p1=c, *p2=ch; + + for(size_t k=0; kfct[k].fct, + ido= n/(ip*l1); + if(ip==4) + radb4(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==2) + radb2(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==3) + radb3(ido, l1, p1, p2, plan->fct[k].tw); + else if(ip==5) + radb5(ido, l1, p1, p2, plan->fct[k].tw); + else + radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws); + SWAP (p1,p2,double *); + l1*=ip; + } + copy_and_norm(c,p1,n,fct); + DEALLOC(ch); + return 0; + } + +WARN_UNUSED_RESULT +static int rfftp_factorize (rfftp_plan plan) + { + size_t length=plan->length; + size_t nfct=0; + while ((length%4)==0) + { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; } + if ((length%2)==0) + { + length>>=1; + // factor 2 should be at the front of the factor list + if (nfct>=NFCT) return -1; + plan->fct[nfct++].fct=2; + SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t); + } + size_t maxl=(size_t)(sqrt((double)length))+1; + for (size_t divisor=3; (length>1)&&(divisor=NFCT) return -1; + plan->fct[nfct++].fct=divisor; + length/=divisor; + } + maxl=(size_t)(sqrt((double)length))+1; + } + if (length>1) plan->fct[nfct++].fct=length; + plan->nfct=nfct; + return 0; + } + +static size_t rfftp_twsize(rfftp_plan plan) + { + size_t twsize=0, l1=1; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip); + twsize+=(ip-1)*(ido-1); + if (ip>5) twsize+=2*ip; + l1*=ip; + } + return twsize; + return 0; + } + +WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan) + { + size_t length=plan->length; + double *twid = RALLOC(double, 2*length); + if (!twid) return -1; + sincos_2pibyn_half(length, twid); + size_t l1=1; + double *ptr=plan->mem; + for (size_t k=0; knfct; ++k) + { + size_t ip=plan->fct[k].fct, ido=length/(l1*ip); + if (knfct-1) // last factor doesn't need twiddles + { + plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1); + for (size_t j=1; jfct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; + plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; + } + } + if (ip>5) // special factors required by *g functions + { + plan->fct[k].tws=ptr; ptr+=2*ip; + plan->fct[k].tws[0] = 1.; + plan->fct[k].tws[1] = 0.; + for (size_t i=1; i<=(ip>>1); ++i) + { + plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)]; + plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; + plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; + plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; + } + } + l1*=ip; + } + DEALLOC(twid); + return 0; + } + +NOINLINE static rfftp_plan make_rfftp_plan (size_t length) + { + if (length==0) return NULL; + rfftp_plan plan = RALLOC(rfftp_plan_i,1); + if (!plan) return NULL; + plan->length=length; + plan->nfct=0; + plan->mem=NULL; + for (size_t i=0; ifct[i]=(rfftp_fctdata){0,0,0}; + if (length==1) return plan; + if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; } + size_t tws=rfftp_twsize(plan); + plan->mem=RALLOC(double,tws); + if (!plan->mem) { DEALLOC(plan); return NULL; } + if (rfftp_comp_twiddle(plan)!=0) + { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + return plan; + } + +NOINLINE static void destroy_rfftp_plan (rfftp_plan plan) + { + DEALLOC(plan->mem); + DEALLOC(plan); + } + +typedef struct fftblue_plan_i + { + size_t n, n2; + cfftp_plan plan; + double *mem; + double *bk, *bkf; + } fftblue_plan_i; +typedef struct fftblue_plan_i * fftblue_plan; + +NOINLINE static fftblue_plan make_fftblue_plan (size_t length) + { + fftblue_plan plan = RALLOC(fftblue_plan_i,1); + if (!plan) return NULL; + plan->n = length; + plan->n2 = good_size(plan->n*2-1); + plan->mem = RALLOC(double, 2*plan->n+2*plan->n2); + if (!plan->mem) { DEALLOC(plan); return NULL; } + plan->bk = plan->mem; + plan->bkf = plan->bk+2*plan->n; + +/* initialize b_k */ + double *tmp = RALLOC(double,4*plan->n); + if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + sincos_2pibyn(2*plan->n,tmp); + plan->bk[0] = 1; + plan->bk[1] = 0; + + size_t coeff=0; + for (size_t m=1; mn; ++m) + { + coeff+=2*m-1; + if (coeff>=2*plan->n) coeff-=2*plan->n; + plan->bk[2*m ] = tmp[2*coeff ]; + plan->bk[2*m+1] = tmp[2*coeff+1]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ + double xn2 = 1./plan->n2; + plan->bkf[0] = plan->bk[0]*xn2; + plan->bkf[1] = plan->bk[1]*xn2; + for (size_t m=2; m<2*plan->n; m+=2) + { + plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2; + plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2; + } + for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m) + plan->bkf[m]=0.; + plan->plan=make_cfftp_plan(plan->n2); + if (!plan->plan) + { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + if (cfftp_forward(plan->plan,plan->bkf,1.)!=0) + { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; } + DEALLOC(tmp); + + return plan; + } + +NOINLINE static void destroy_fftblue_plan (fftblue_plan plan) + { + DEALLOC(plan->mem); + destroy_cfftp_plan(plan->plan); + DEALLOC(plan); + } + +NOINLINE WARN_UNUSED_RESULT +static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct) + { + size_t n=plan->n; + size_t n2=plan->n2; + double *bk = plan->bk; + double *bkf = plan->bkf; + double *akf = RALLOC(double, 2*n2); + if (!akf) return -1; + +/* initialize a_k and FFT it */ + if (isign>0) + for (size_t m=0; m<2*n; m+=2) + { + akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1]; + akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m]; + } + else + for (size_t m=0; m<2*n; m+=2) + { + akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1]; + akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m]; + } + for (size_t m=2*n; m<2*n2; ++m) + akf[m]=0; + + if (cfftp_forward (plan->plan,akf,fct)!=0) + { DEALLOC(akf); return -1; } + +/* do the convolution */ + if (isign>0) + for (size_t m=0; m<2*n2; m+=2) + { + double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + else + for (size_t m=0; m<2*n2; m+=2) + { + double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + +/* inverse FFT */ + if (cfftp_backward (plan->plan,akf,1.)!=0) + { DEALLOC(akf); return -1; } + +/* multiply by b_k */ + if (isign>0) + for (size_t m=0; m<2*n; m+=2) + { + c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1]; + c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + else + for (size_t m=0; m<2*n; m+=2) + { + c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1]; + c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + DEALLOC(akf); + return 0; + } + +WARN_UNUSED_RESULT +static int cfftblue_backward(fftblue_plan plan, double c[], double fct) + { return fftblue_fft(plan,c,1,fct); } + +WARN_UNUSED_RESULT +static int cfftblue_forward(fftblue_plan plan, double c[], double fct) + { return fftblue_fft(plan,c,-1,fct); } + +WARN_UNUSED_RESULT +static int rfftblue_backward(fftblue_plan plan, double c[], double fct) + { + size_t n=plan->n; + double *tmp = RALLOC(double,2*n); + if (!tmp) return -1; + tmp[0]=c[0]; + tmp[1]=0.; + memcpy (tmp+2,c+1, (n-1)*sizeof(double)); + if ((n&1)==0) tmp[n+1]=0.; + for (size_t m=2; mn; + double *tmp = RALLOC(double,2*n); + if (!tmp) return -1; + for (size_t m=0; mblueplan=0; + plan->packplan=0; + if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + { + plan->packplan=make_cfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + return plan; + } + double comp1 = cost_guess(length); + double comp2 = 2*cost_guess(good_size(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2blueplan=make_fftblue_plan(length); + if (!plan->blueplan) { DEALLOC(plan); return NULL; } + } + else + { + plan->packplan=make_cfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + } + return plan; + } + +void pocketfft_delete_plan_c (pocketfft_plan_c plan) + { + if (plan->blueplan) + destroy_fftblue_plan(plan->blueplan); + if (plan->packplan) + destroy_cfftp_plan(plan->packplan); + DEALLOC(plan); + } + +WARN_UNUSED_RESULT +int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct) + { + if (plan->packplan) + return cfftp_backward(plan->packplan,c,fct); + // if (plan->blueplan) + return cfftblue_backward(plan->blueplan,c,fct); + } + +WARN_UNUSED_RESULT +int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct) + { + if (plan->packplan) + return cfftp_forward(plan->packplan,c,fct); + // if (plan->blueplan) + return cfftblue_forward(plan->blueplan,c,fct); + } + +typedef struct pocketfft_plan_r_i + { + rfftp_plan packplan; + fftblue_plan blueplan; + } pocketfft_plan_r_i; + +pocketfft_plan_r pocketfft_make_plan_r (size_t length) + { + if (length==0) return NULL; + pocketfft_plan_r plan = RALLOC(pocketfft_plan_r_i,1); + if (!plan) return NULL; + plan->blueplan=0; + plan->packplan=0; + if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + { + plan->packplan=make_rfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + return plan; + } + double comp1 = 0.5*cost_guess(length); + double comp2 = 2*cost_guess(good_size(2*length-1)); + comp2*=1.5; /* fudge factor that appears to give good overall performance */ + if (comp2blueplan=make_fftblue_plan(length); + if (!plan->blueplan) { DEALLOC(plan); return NULL; } + } + else + { + plan->packplan=make_rfftp_plan(length); + if (!plan->packplan) { DEALLOC(plan); return NULL; } + } + return plan; + } + +void pocketfft_delete_plan_r (pocketfft_plan_r plan) + { + if (plan->blueplan) + destroy_fftblue_plan(plan->blueplan); + if (plan->packplan) + destroy_rfftp_plan(plan->packplan); + DEALLOC(plan); + } + +size_t pocketfft_length_r(pocketfft_plan_r plan) + { + if (plan->packplan) return plan->packplan->length; + return plan->blueplan->n; + } + +size_t pocketfft_length_c(pocketfft_plan_c plan) + { + if (plan->packplan) return plan->packplan->length; + return plan->blueplan->n; + } + +WARN_UNUSED_RESULT +int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct) + { + if (plan->packplan) + return rfftp_backward(plan->packplan,c,fct); + else // if (plan->blueplan) + return rfftblue_backward(plan->blueplan,c,fct); + } + +WARN_UNUSED_RESULT +int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct) + { + if (plan->packplan) + return rfftp_forward(plan->packplan,c,fct); + else // if (plan->blueplan) + return rfftblue_forward(plan->blueplan,c,fct); + } + +#pragma GCC visibility pop diff --git a/libsharp2/pocketfft.h b/libsharp2/pocketfft.h new file mode 100644 index 0000000..b1a02ee --- /dev/null +++ b/libsharp2/pocketfft.h @@ -0,0 +1,50 @@ +/* + * This file is part of pocketfft. + * Licensed under a 3-clause BSD style license - see LICENSE.md + */ + +/*! \file pocketfft.h + * Public interface of the pocketfft library + * + * Copyright (C) 2008-2019 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef POCKETFFT_H +#define POCKETFFT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct pocketfft_plan_c_i; +typedef struct pocketfft_plan_c_i * pocketfft_plan_c; +pocketfft_plan_c pocketfft_make_plan_c (size_t length); +void pocketfft_delete_plan_c (pocketfft_plan_c plan); +int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct); +int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct); +size_t pocketfft_length_c(pocketfft_plan_c plan); + +struct pocketfft_plan_r_i; +typedef struct pocketfft_plan_r_i * pocketfft_plan_r; +pocketfft_plan_r pocketfft_make_plan_r (size_t length); +void pocketfft_delete_plan_r (pocketfft_plan_r plan); +/*! Computes a real backward FFT on \a c, using \a plan + and assuming the FFTPACK storage scheme: + - on entry, \a c has the form r0, r1, i1, r2, i2, ... + - on exit, it has the form r0, r1, ..., r[length-1]. */ +int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct); +/*! Computes a real forward FFT on \a c, using \a plan + and assuming the FFTPACK storage scheme: + - on entry, \a c has the form r0, r1, ..., r[length-1]; + - on exit, it has the form r0, r1, i1, r2, i2, ... */ +int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct); +size_t pocketfft_length_r(pocketfft_plan_r plan); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp2/sharp.cc b/libsharp2/sharp.c similarity index 81% rename from libsharp2/sharp.cc rename to libsharp2/sharp.c index 86d7ca5..1e5ac03 100644 --- a/libsharp2/sharp.cc +++ b/libsharp2/sharp.c @@ -25,20 +25,17 @@ * \author Martin Reinecke \author Dag Sverre Seljebotn */ -#include -#include -#include -#include "mr_util/fft.h" +#include +#include +#include "libsharp2/pocketfft.h" #include "libsharp2/sharp_ylmgen_c.h" #include "libsharp2/sharp_internal.h" #include "libsharp2/sharp_utils.h" #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_geomhelpers.h" -#include "mr_util/threading.h" -#include "mr_util/useful_macros.h" -typedef complex dcmplx; -typedef complex fcmplx; +typedef complex double dcmplx; +typedef complex float fcmplx; static const double sqrt_one_half = 0.707106781186547572737310929369; static const double sqrt_two = 1.414213562373095145474621858739; @@ -60,7 +57,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) *nchunks = (ndata+(*chunksize)-1)/(*chunksize); } -MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) +NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) { double ofs=lmax*0.01; if (ofs<100.) ofs=100.; @@ -79,7 +76,7 @@ typedef struct double phi0_; dcmplx *shiftarr; int s_shift; - mr::detail_fft::rfftp *plan; + pocketfft_plan_r plan; int length; int norot; } ringhelper; @@ -92,12 +89,12 @@ static void ringhelper_init (ringhelper *self) static void ringhelper_destroy (ringhelper *self) { - delete self->plan; + if (self->plan) pocketfft_delete_plan_r(self->plan); DEALLOC(self->shiftarr); ringhelper_init(self); } -MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) +NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) { self->norot = (fabs(phi0)<1e-14); if (!(self->norot)) @@ -108,27 +105,27 @@ MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mm self->phi0_ = phi0; // FIXME: improve this by using sincos2pibyn(nph) etc. for (int m=0; m<=mmax; ++m) - self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0)); + self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); // double *tmp=(double *) self->shiftarr; // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); } // if (!self->plan) self->plan=pocketfft_make_plan_r(nph); if (nph!=(int)self->length) { - if (self->plan) delete self->plan; - self->plan=new mr::detail_fft::rfftp(nph); + if (self->plan) pocketfft_delete_plan_r(self->plan); + self->plan=pocketfft_make_plan_r(nph); self->length=nph; } } static int ringinfo_compare (const void *xa, const void *xb) { - const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb; + const sharp_ringinfo *a=xa, *b=xb; return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; } static int ringpair_compare (const void *xa, const void *xb) { - const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb; + const sharp_ringpair *a=xa, *b=xb; // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; if (a->r1.nph==b->r1.nph) return (a->r1.phi0 < b->r1.phi0) ? -1 : @@ -278,7 +275,7 @@ static int sharp_get_mmax (int *mval, int nm) return nm-1; } -MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self, +NOINLINE static void ringhelper_phase2ring (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, int pstride, int flags) { @@ -295,22 +292,22 @@ MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self, if (self->norot) for (int m=0; m<=mmax; ++m) { - data[2*m]=phase[m*pstride].real()*wgt; - data[2*m+1]=phase[m*pstride].imag()*wgt; + data[2*m]=creal(phase[m*pstride])*wgt; + data[2*m+1]=cimag(phase[m*pstride])*wgt; } else for (int m=0; m<=mmax; ++m) { dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; - data[2*m]=tmp.real()*wgt; - data[2*m+1]=tmp.imag()*wgt; + data[2*m]=creal(tmp)*wgt; + data[2*m+1]=cimag(tmp)*wgt; } for (int m=2*(mmax+1); mnorot) tmp*=self->shiftarr[m]; if (idx1<(nph+2)/2) { - data[2*idx1]+=tmp.real(); - data[2*idx1+1]+=tmp.imag(); + data[2*idx1]+=creal(tmp); + data[2*idx1+1]+=cimag(tmp); } if (idx2<(nph+2)/2) { - data[2*idx2]+=tmp.real(); - data[2*idx2+1]-=tmp.imag(); + data[2*idx2]+=creal(tmp); + data[2*idx2+1]-=cimag(tmp); } if (++idx1>=nph) idx1=0; if (--idx2<0) idx2=nph-1; } } data[1]=data[0]; - self->plan->exec(&(data[1]), 1., false); + pocketfft_backward_r (self->plan, &(data[1]), 1.); } -MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, +NOINLINE static void ringhelper_ring2phase (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, int pstride, int flags) { @@ -352,7 +349,7 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, if (flags&SHARP_REAL_HARMONICS) wgt *= sqrt_two; - self->plan->exec (&(data[1]), 1., true); + pocketfft_forward_r (self->plan, &(data[1]), 1.); data[0]=data[1]; data[1]=data[nph+1]=0.; @@ -360,11 +357,11 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, { if (self->norot) for (int m=0; m<=maxidx; ++m) - phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt; + phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt; else for (int m=0; m<=maxidx; ++m) phase[m*pstride] = - dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt; + (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt; } else { @@ -373,9 +370,9 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, int idx=m%nph; dcmplx val; if (idx<(nph-idx)) - val = dcmplx(data[2*idx], data[2*idx+1]) * wgt; + val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt; else - val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt; + val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt; if (!self->norot) val *= self->shiftarr[m]; phase[m*pstride]=val; @@ -386,7 +383,7 @@ MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, phase[m*pstride]=0.; } -MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, +NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, int flags) { if (flags & SHARP_NO_FFT) @@ -443,7 +440,7 @@ MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, } } -MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, +NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) { #define CLEARLOOP(real_t,body) \ @@ -480,7 +477,7 @@ MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, } } -MRUTIL_NOINLINE static void init_output (sharp_job *job) +NOINLINE static void init_output (sharp_job *job) { if (job->flags&SHARP_ADD) return; if (job->type == SHARP_MAP2ALM) @@ -491,7 +488,7 @@ MRUTIL_NOINLINE static void init_output (sharp_job *job) clear_map (job->ginfo,job->map[i],job->flags); } -MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) +NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) { if (job->type==SHARP_MAP2ALM) { @@ -517,7 +514,7 @@ static void alloc_almtmp (sharp_job *job, int lmax) static void dealloc_almtmp (sharp_job *job) { DEALLOC(job->almtmp); } -MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) +NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, source_t, expr_of_x) \ @@ -580,7 +577,7 @@ MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) if (job->flags&SHARP_DP) COPY_LOOP(double, dcmplx, x*job->norm_l[l]) else - COPY_LOOP(float, fcmplx, x*float(job->norm_l[l])) + COPY_LOOP(float, fcmplx, x*job->norm_l[l]) } } } @@ -591,7 +588,7 @@ MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) +NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, target_t, expr_of_x) \ @@ -620,9 +617,9 @@ MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) if (m==0) { if (job->flags&SHARP_DP) - COPY_LOOP(double, double, x.real()*norm_m0) + COPY_LOOP(double, double, creal(x)*norm_m0) else - COPY_LOOP(float, float, x.real()*norm_m0) + COPY_LOOP(float, float, crealf(x)*norm_m0) } else { @@ -637,9 +634,9 @@ MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) if (m==0) { if (job->flags&SHARP_DP) - COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0) + COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0) else - COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0)) + COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0)) } else { @@ -653,7 +650,7 @@ MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, +NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, const double *ringtmp, int rstride) { if (job->flags & SHARP_DP) @@ -661,8 +658,8 @@ MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double **dmap = (double **)job->map; for (int i=0; inmaps; ++i) { - double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs]; - const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1]; + double *restrict p1=&dmap[i][ri->ofs]; + const double *restrict p2=&ringtmp[i*rstride+1]; if (ri->stride==1) { if (job->flags&SHARP_ADD) @@ -685,14 +682,14 @@ MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, } } -MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, +NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, int rstride) { if (job->flags & SHARP_DP) for (int i=0; inmaps; ++i) { - double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1], - *MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]); + double *restrict p1=&ringtmp[i*rstride+1], + *restrict p2=&(((double *)(job->map[i]))[ri->ofs]); if (ri->stride==1) memcpy(p1,p2,ri->nph*sizeof(double)); else @@ -724,7 +721,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, for (int m=0; m<=mmax; ++m) phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ? ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt : - ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt); + ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt; } } static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, @@ -746,7 +743,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, } //FIXME: set phase to zero if not SHARP_MAP2ALM? -MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) +NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) { if (job->type != SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -763,35 +760,35 @@ MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int u } else { - mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched) +#pragma omp parallel +{ + ringhelper helper; + ringhelper_init(&helper); + int rstride=job->ginfo->nphmax+2; + double *ringtmp=RALLOC(double,job->nmaps*rstride); +#pragma omp for schedule(dynamic,1) + for (int ith=llim; ithginfo->nphmax+2; - double *ringtmp=RALLOC(double,job->nmaps*rstride); - - while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; iths_th*(ith-llim); + ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); + for (int i=0; inmaps; ++i) + ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); + if (job->ginfo->pair[ith].r2.nph>0) { - int dim2 = job->s_th*(ith-llim); - ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); + ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); for (int i=0; inmaps; ++i) - ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); - if (job->ginfo->pair[ith].r2.nph>0) - { - ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); - for (int i=0; inmaps; ++i) - ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); - } + ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); } - DEALLOC(ringtmp); - ringhelper_destroy(&helper); - }); /* end of parallel region */ + } + DEALLOC(ringtmp); + ringhelper_destroy(&helper); +} /* end of parallel region */ } } -MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) +NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) { if (job->type == SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -808,35 +805,35 @@ MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int u } else { - mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched) +#pragma omp parallel +{ + ringhelper helper; + ringhelper_init(&helper); + int rstride=job->ginfo->nphmax+2; + double *ringtmp=RALLOC(double,job->nmaps*rstride); +#pragma omp for schedule(dynamic,1) + for (int ith=llim; ithginfo->nphmax+2; - double *ringtmp=RALLOC(double,job->nmaps*rstride); - - while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; iths_th*(ith-llim); + for (int i=0; inmaps; ++i) + ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); + ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); + if (job->ginfo->pair[ith].r2.nph>0) { - int dim2 = job->s_th*(ith-llim); for (int i=0; inmaps; ++i) - ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags); - ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride); - if (job->ginfo->pair[ith].r2.nph>0) - { - for (int i=0; inmaps; ++i) - ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2), - &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); - ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); - } + ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2), + &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags); + ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride); } - DEALLOC(ringtmp); - ringhelper_destroy(&helper); - }); /* end of parallel region */ + } + DEALLOC(ringtmp); + ringhelper_destroy(&helper); +} /* end of parallel region */ } } -MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job) +NOINLINE static void sharp_execute_job (sharp_job *job) { double timer=sharp_wallTime(); job->opcnt=0; @@ -855,7 +852,6 @@ MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job) &nchunks,&chunksize); //FIXME: needs to be changed to "nm" alloc_phase (job,mmax+1,chunksize); - std::atomic opcnt = 0; /* chunk loop */ for (int chunk=0; chunkphase where necessary */ map2phase (job, mmax, llim, ulim); - mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched) +#pragma omp parallel +{ + sharp_job ljob = *job; + ljob.opcnt=0; + sharp_Ylmgen_C generator; + sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin); + alloc_almtmp(&ljob,lmax); + +#pragma omp for schedule(dynamic,1) + for (int mi=0; miainfo->nm; ++mi) { - sharp_job ljob = *job; - ljob.opcnt=0; - sharp_Ylmgen_C generator; - sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin); - alloc_almtmp(&ljob,lmax); - - while (auto rng=sched.getNext()) for(auto mi=rng.lo; mialm_tmp where necessary */ - alm2almtmp (&ljob, lmax, mi); + alm2almtmp (&ljob, lmax, mi); - inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); + inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim); /* alm_tmp->alm where necessary */ - almtmp2alm (&ljob, lmax, mi); - } + almtmp2alm (&ljob, lmax, mi); + } - sharp_Ylmgen_destroy(&generator); - dealloc_almtmp(&ljob); + sharp_Ylmgen_destroy(&generator); + dealloc_almtmp(&ljob); - opcnt+=ljob.opcnt; - }); /* end of parallel region */ +#pragma omp critical + job->opcnt+=ljob.opcnt; +} /* end of parallel region */ /* phase->map where necessary */ phase2map (job, mmax, llim, ulim); @@ -911,7 +909,6 @@ MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job) DEALLOC(job->norm_l); dealloc_phase (job); - job->opcnt = opcnt; job->time=sharp_wallTime()-timer; } @@ -937,8 +934,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, job->flags|=SHARP_REAL_HARMONICS; job->time = 0.; job->opcnt = 0; - job->alm=(void **)alm; - job->map=(void **)map; + job->alm=alm; + job->map=map; } void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map, diff --git a/libsharp2/sharp_almhelpers.cc b/libsharp2/sharp_almhelpers.c similarity index 100% rename from libsharp2/sharp_almhelpers.cc rename to libsharp2/sharp_almhelpers.c diff --git a/libsharp2/sharp_core.cc b/libsharp2/sharp_core.c similarity index 98% rename from libsharp2/sharp_core.cc rename to libsharp2/sharp_core.c index 9a7ef20..6f41ad2 100644 --- a/libsharp2/sharp_core.cc +++ b/libsharp2/sharp_core.c @@ -27,7 +27,7 @@ #define ARCH default #define GENERIC_ARCH -#include "libsharp2/sharp_core_inc.cc" +#include "libsharp2/sharp_core_inc.c" #undef GENERIC_ARCH #undef ARCH diff --git a/libsharp2/sharp_core_inc.cc b/libsharp2/sharp_core_inc.c similarity index 70% rename from libsharp2/sharp_core_inc.cc rename to libsharp2/sharp_core_inc.c index 10c4322..eaf34f0 100644 --- a/libsharp2/sharp_core_inc.cc +++ b/libsharp2/sharp_core_inc.c @@ -34,7 +34,7 @@ #define XCONCATX2(a,b) XCONCATX(a,b) #define XARCH(a) XCONCATX2(a,ARCH) -#include +#include #include #include #include "libsharp2/sharp_vecsupport.h" @@ -49,7 +49,7 @@ // Unfortunately, most compilers don't act on this pragma yet. #pragma STDC FP_CONTRACT ON -typedef complex dcmplx; +typedef complex double dcmplx; #define nv0 (128/VLEN) #define nvx (64/VLEN) @@ -94,35 +94,35 @@ typedef union sxdata_s s; } sxdata_u; -static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale, +static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, double maxval) { - const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval; - const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig; - auto mask = abs(*val)>vfmax; - while (any_of(mask)) + const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); + const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); + Tm mask = vgt(vabs(*val),vfmax); + while (vanyTrue(mask)) { - where(mask,*val)*=vfsmall; - where(mask,*scale)+=1; - mask = abs(*val)>vfmax; + vmuleq_mask(mask,*val,vfsmall); + vaddeq_mask(mask,*scale,vone); + mask = vgt(vabs(*val),vfmax); } - mask = (abs(*val)>=1); *resd=res; - *ress=0; + *ress=vzero; } else { - Tv scale=0, scaleint=0, res=1; + Tv scale=vzero, scaleint=vzero, res=vone; Tvnormalize(&val,&scaleint,sharp_fbighalf); do { @@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit, } } -static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac, - const double * MRUTIL_RESTRICT cf) +static inline void getCorfac(Tv scale, Tv * restrict corfac, + const double * restrict cf) { typedef union { Tv v; double s[VLEN]; } Tvu; @@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac, *corfac=corf.v; } -static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps) +static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps) { - auto mask = abs(*v2)>eps; - if (any_of(mask)) + Tm mask = vgt(vabs(*v2),eps); + if (vanyTrue(mask)) { - where(mask,*v1)*=sharp_fsmall; - where(mask,*v2)*=sharp_fsmall; - where(mask,*s)+=1; - return true; + vmuleq_mask(mask,*v1,vload(sharp_fsmall)); + vmuleq_mask(mask,*v2,vload(sharp_fsmall)); + vaddeq_mask(mask,*s,vone); + return 1; } - return false; + return 0; } -MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, - s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2) +NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, + s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2) { int l=gen->m, il=0; - Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]; - Tv limscale=sharp_limscale; + Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); + Tv limscale=vload(sharp_limscale); int below_limit = 1; for (int i=0; ilam1[i]=0; + d->lam1[i]=vzero; mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]); d->lam2[i] *= mfac; Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol); - below_limit &= all_of(d->scale[i]scale[i],limscale)); } while (below_limit) { if (l+4>gen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; - Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b; - Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b; + Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b); + Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b); for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) - below_limit &= all_of(d->scale[i]lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) + below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale))); } l+=4; il+=2; } *l_=l; *il_=il; } -MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm, +NOINLINE static void alm2map_kernel(s0data_v * restrict d, + const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, int l, int il, int lmax, int nv2) { if (nv2==nv0) { for (; l<=lmax-2; il+=2, l+=4) { - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); - Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); - Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag(); - Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag(); - Tv a1=coef[il ].a, b1=coef[il ].b; - Tv a2=coef[il+1].a, b2=coef[il+1].b; + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); + Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); + Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); for (int i=0; ip1r[i] += d->lam2[i]*ar1; @@ -249,12 +249,12 @@ MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, { for (; l<=lmax-2; il+=2, l+=4) { - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); - Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); - Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag(); - Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag(); - Tv a1=coef[il ].a, b1=coef[il ].b; - Tv a2=coef[il+1].a, b2=coef[il+1].b; + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); + Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); + Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); for (int i=0; ip1r[i] += d->lam2[i]*ar1; @@ -272,9 +272,9 @@ MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, } for (; l<=lmax; ++il, l+=2) { - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); - Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); - Tv a=coef[il].a, b=coef[il].b; + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv a=vload(coef[il].a), b=vload(coef[il].b); for (int i=0; ip1r[i] += d->lam2[i]*ar1; @@ -288,8 +288,8 @@ MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, } } -MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job, - const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth) +NOINLINE static void calc_alm2map (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) { int l,il,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -298,20 +298,20 @@ MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; - const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + const sharp_ylmgen_dbl2 * restrict coef = gen->coef; + const dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; iscale[i], &d->corfac[i], gen->cf); - full_ieee &= all_of(d->scale[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } while((!full_ieee) && (l<=lmax)) { - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); - Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); - Tv a=coef[il].a, b=coef[il].b; + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv a=vload(coef[il].a), b=vload(coef[il].b); full_ieee=1; for (int i=0; icsq[i] + b)*d->lam2[i] + d->lam1[i]; d->lam1[i] = d->lam2[i]; d->lam2[i] = tmp; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) getCorfac(d->scale[i], &d->corfac[i], gen->cf); - full_ieee &= all_of(d->scale[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } l+=2; ++il; } @@ -338,16 +338,16 @@ MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job, alm2map_kernel(d, coef, alm, l, il, lmax, nv2); } -MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d, - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l, +NOINLINE static void map2alm_kernel(s0data_v * restrict d, + const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l, int il, int lmax, int nv2) { for (; l<=lmax-2; il+=2, l+=4) { - Tv a1=coef[il ].a, b1=coef[il ].b; - Tv a2=coef[il+1].a, b2=coef[il+1].b; - Tv atmp1[4] = {0,0,0,0}; - Tv atmp2[4] = {0,0,0,0}; + Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b); + Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b); + Tv atmp1[4] = {vzero, vzero, vzero, vzero}; + Tv atmp2[4] = {vzero, vzero, vzero, vzero}; for (int i=0; ilam2[i]*d->p1r[i]; @@ -366,8 +366,8 @@ MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d, } for (; l<=lmax; ++il, l+=2) { - Tv a=coef[il].a, b=coef[il].b; - Tv atmp[4] = {0,0,0,0}; + Tv a=vload(coef[il].a), b=vload(coef[il].b); + Tv atmp[4] = {vzero, vzero, vzero, vzero}; for (int i=0; ilam2[i]*d->p1r[i]; @@ -382,8 +382,8 @@ MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d, } } -MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job, - const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth) +NOINLINE static void calc_map2alm (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) { int l,il,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -392,19 +392,19 @@ MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; - dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + const sharp_ylmgen_dbl2 * restrict coef = gen->coef; + dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; iscale[i], &d->corfac[i], gen->cf); - full_ieee &= all_of(d->scale[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } while((!full_ieee) && (l<=lmax)) { - Tv a=coef[il].a, b=coef[il].b; - Tv atmp[4] = {0,0,0,0}; + Tv a=vload(coef[il].a), b=vload(coef[il].b); + Tv atmp[4] = {vzero, vzero, vzero, vzero}; full_ieee=1; for (int i=0; icsq[i] + b)*d->lam2[i] + d->lam1[i]; d->lam1[i] = d->lam2[i]; d->lam2[i] = tmp; - if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol)) + if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol))) getCorfac(d->scale[i], &d->corfac[i], gen->cf); - full_ieee &= all_of(d->scale[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale))); } vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); l+=2; ++il; @@ -432,21 +432,21 @@ MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job, map2alm_kernel(d, coef, alm, l, il, lmax, nv2); } -MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, - sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2) +NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, + sxdata_v * restrict d, int * restrict l_, int nv2) { - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; - Tv prefac=gen->prefac[gen->m], - prescale=gen->fscale[gen->m]; - Tv limscale=sharp_limscale; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + Tv prefac=vload(gen->prefac[gen->m]), + prescale=vload(gen->fscale[gen->m]); + Tv limscale=vload(sharp_limscale); int below_limit=1; for (int i=0; icth[i])*0.5)); - Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5)); - auto mask=d->sth[i]<0; - where(mask&(d->cth[i]<0),cth2)*=-1.; - where(mask&(d->cth[i]<0),sth2)*=-1.; + Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5))); + Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5))); + Tm mask=vlt(d->sth[i],vzero); + vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.)); + vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.)); Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps; mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); @@ -454,8 +454,8 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); - d->l1p[i] = 0; - d->l1m[i] = 0; + d->l1p[i] = vzero; + d->l1m[i] = vzero; d->l2p[i] = prefac*ccp; d->scp[i] = prescale+ccps; d->l2m[i] = prefac*csp; @@ -467,17 +467,17 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES d->l2m[i] *= scp; d->scm[i] += scps; if (gen->preMinus_p) - d->l2p[i] = -d->l2p[i]; + d->l2p[i] = vneg(d->l2p[i]); if (gen->preMinus_m) - d->l2m[i] = -d->l2m[i]; + d->l2m[i] = vneg(d->l2m[i]); if (gen->s&1) - d->l2p[i] = -d->l2p[i]; + d->l2p[i] = vneg(d->l2p[i]); Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol); Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol); - below_limit &= all_of(d->scm[i]scp[i]scm[i],limscale)) && + vallTrue(vlt(d->scp[i],limscale)); } int l=gen->mhi; @@ -486,18 +486,18 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) || - rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol)) - below_limit &= all_of(d->scp[i]scm[i]l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) || + rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol))) + below_limit &= vallTrue(vlt(d->scp[i],limscale)) && + vallTrue(vlt(d->scm[i],limscale)); } l+=2; } @@ -505,19 +505,19 @@ MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RES *l_=l; } -MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm, +NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, int l, int lmax, int nv2) { int lsave = l; while (l<=lmax) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), - acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); - Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), - acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -537,12 +537,12 @@ MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, l=lsave; while (l<=lmax) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), - acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); - Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), - acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; @@ -561,8 +561,8 @@ MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, } } -MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job, - const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) +NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) { int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -571,25 +571,25 @@ MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; - const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + const dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; iscp[i], &d->cfp[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= all_of(d->scp[i]>=sharp_minscale) && - all_of(d->scm[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); } while((!full_ieee) && (l<=lmax)) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), - acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); - Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), - acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), + acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); + Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), + acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); full_ieee=1; for (int i=0; il2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= all_of(d->scp[i]>=sharp_minscale); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= all_of(d->scm[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); } l+=2; } @@ -641,17 +641,17 @@ MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job, } } -MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm, +NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm, int l, int lmax, int nv2) { int lsave=l; while (l<=lmax) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv agr1=0, agi1=0, acr1=0, aci1=0; - Tv agr2=0, agi2=0, acr2=0, aci2=0; + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -672,10 +672,10 @@ MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, l=lsave; while (l<=lmax) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv agr1=0, agi1=0, acr1=0, aci1=0; - Tv agr2=0, agi2=0, acr2=0, aci2=0; + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; @@ -695,8 +695,8 @@ MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, } } -MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job, - const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) +NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) { int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -705,15 +705,15 @@ MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; - dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; iscp[i], &d->cfp[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= all_of(d->scp[i]>=sharp_minscale) && - all_of(d->scm[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); } for (int i=0; il2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= all_of(d->scp[i]>=sharp_minscale); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= all_of(d->scm[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); } vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); @@ -772,17 +772,17 @@ MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job, } -MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d, - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm, +NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, + const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, int l, int lmax, int nv2) { int lsave=l; while (l<=lmax) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), - ar2=alm[l+1].real(), ai2=alm[l+1].imag(); + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -798,10 +798,10 @@ MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d, l=lsave; while (l<=lmax) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), - ar2=alm[l+1].real(), ai2=alm[l+1].imag(); + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; @@ -816,8 +816,8 @@ MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d, } } -MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job, - const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) +NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, + const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) { int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -826,23 +826,23 @@ MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 15*nth; - const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; - const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; + const dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; iscp[i], &d->cfp[i], gen->cf); getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= all_of(d->scp[i]>=sharp_minscale) && - all_of(d->scm[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && + vallTrue(vge(d->scm[i],vload(sharp_minscale))); } while((!full_ieee) && (l<=lmax)) { - Tv fx10=fx[l+1].a,fx11=fx[l+1].b; - Tv fx20=fx[l+2].a,fx21=fx[l+2].b; - Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), - ar2=alm[l+1].real(), ai2=alm[l+1].imag(); + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); full_ieee=1; for (int i=0; il2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol)) + if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= all_of(d->scp[i]>=sharp_minscale); - if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol)) + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); + if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= all_of(d->scm[i]>=sharp_minscale); + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); } l+=2; } @@ -897,7 +897,7 @@ MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job, #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) -MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, +NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { @@ -912,7 +912,7 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, if (job->spin==0) { //adjust the a_lm for the new algorithm - dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + dcmplx * restrict alm=job->almtmp; for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) { dcmplx al = alm[l]; @@ -963,8 +963,8 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, d.s.p2r[i]*=cth_[tgt]; d.s.p2i[i]*=cth_[tgt]; int phas_idx = tgt*job->s_th + mi*job->s_m; - complex r1(d.s.p1r[i], d.s.p1i[i]), - r2(d.s.p2r[i], d.s.p2i[i]); + complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I, + r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I; job->phase[phas_idx] = r1+r2; if (ispair[tgt]) job->phase[phas_idx+1] = r1-r2; @@ -1027,10 +1027,10 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, { int tgt=itgt[i]; int phas_idx = tgt*job->s_th + mi*job->s_m; - complex q1(d.s.p1pr[i], d.s.p1pi[i]), - q2(d.s.p2pr[i], d.s.p2pi[i]), - u1(d.s.p1mr[i], d.s.p1mi[i]), - u2(d.s.p2mr[i], d.s.p2mi[i]); + complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I, + q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I, + u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I, + u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I; job->phase[phas_idx ] = q1+q2; job->phase[phas_idx+2] = u1+u2; if (ispair[tgt]) @@ -1056,7 +1056,7 @@ MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, } } -MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, +NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { @@ -1083,8 +1083,8 @@ MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, int phas_idx = ith*job->s_th + mi*job->s_m; dcmplx ph1=job->phase[phas_idx]; dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.; - d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag(); - d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag(); + d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2); + d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2); //adjust for new algorithm d.s.p2r[nth]*=cth_[ith]; d.s.p2i[nth]*=cth_[ith]; @@ -1105,7 +1105,7 @@ MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, } } //adjust the a_lm for the new algorithm - dcmplx * MRUTIL_RESTRICT alm=job->almtmp; + dcmplx * restrict alm=job->almtmp; dcmplx alm2 = 0.; double alold=0; for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) @@ -1138,10 +1138,10 @@ MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, p2U=ispair[ith] ? job->phase[phas_idx+3]:0.; if ((gen->mhi-gen->m+gen->s)&1) { p2Q=-p2Q; p2U=-p2U; } - d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag(); - d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag(); - d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag(); - d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag(); + d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q); + d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U); + d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q); + d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U); ++nth; } ++ith; diff --git a/libsharp2/sharp_geomhelpers.cc b/libsharp2/sharp_geomhelpers.c similarity index 95% rename from libsharp2/sharp_geomhelpers.cc rename to libsharp2/sharp_geomhelpers.c index cf838e2..db459bc 100644 --- a/libsharp2/sharp_geomhelpers.cc +++ b/libsharp2/sharp_geomhelpers.c @@ -29,7 +29,7 @@ #include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_legendre_roots.h" #include "libsharp2/sharp_utils.h" -#include "mr_util/fft.h" +#include "libsharp2/pocketfft.h" void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings, const int *rings, const double *weight, sharp_geom_info **geom_info) @@ -155,7 +155,9 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0, weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings); } if ((nrings&1)==0) weight[nrings-1]=0.; - mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); + pocketfft_plan_r plan = pocketfft_make_plan_r(nrings); + pocketfft_backward_r(plan,weight,1.); + pocketfft_delete_plan_r(plan); for (int m=0; m<(nrings+1)/2; ++m) { @@ -200,7 +202,9 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0, for (int k=1; k<=(n/2-1); ++k) weight[2*k-1]=2./(1.-4.*k*k) + dw; weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1); - mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); + pocketfft_plan_r plan = pocketfft_make_plan_r(n); + pocketfft_backward_r(plan,weight,1.); + pocketfft_delete_plan_r(plan); weight[n]=weight[0]; for (int m=0; m<(nrings+1)/2; ++m) @@ -246,7 +250,9 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0, for (int k=1; k<=(n/2-1); ++k) weight[2*k-1]=2./(1.-4.*k*k); weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.; - mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.); + pocketfft_plan_r plan = pocketfft_make_plan_r(n); + pocketfft_backward_r(plan,weight,1.); + pocketfft_delete_plan_r(plan); for (int m=0; m +#ifdef __cplusplus +#error This header file cannot be included from C++, only from C +#endif + +#include #include "libsharp2/sharp.h" #include "libsharp2/sharp_ylmgen_c.h" -using std::complex; - typedef struct { sharp_jobtype type; @@ -43,9 +45,9 @@ typedef struct void **map; void **alm; int s_m, s_th; // strides in m and theta direction - complex *phase; + complex double *phase; double *norm_l; - complex *almtmp; + complex double *almtmp; const sharp_geom_info *ginfo; const sharp_alm_info *ainfo; double time; diff --git a/libsharp2/sharp_legendre_roots.cc b/libsharp2/sharp_legendre_roots.c similarity index 100% rename from libsharp2/sharp_legendre_roots.cc rename to libsharp2/sharp_legendre_roots.c diff --git a/libsharp2/sharp_mpi.cc b/libsharp2/sharp_mpi.c similarity index 100% rename from libsharp2/sharp_mpi.cc rename to libsharp2/sharp_mpi.c diff --git a/libsharp2/sharp_utils.cc b/libsharp2/sharp_utils.c similarity index 100% rename from libsharp2/sharp_utils.cc rename to libsharp2/sharp_utils.c diff --git a/libsharp2/sharp_utils.h b/libsharp2/sharp_utils.h index a3c9190..d2a1c74 100644 --- a/libsharp2/sharp_utils.h +++ b/libsharp2/sharp_utils.h @@ -122,4 +122,10 @@ double sharp_wallTime(void); } #endif +#ifdef __GNUC__ +#define NOINLINE __attribute__((noinline)) +#else +#define NOINLINE +#endif + #endif diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index efc7c66..ca9bee7 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -28,27 +28,193 @@ #ifndef SHARP2_VECSUPPORT_H #define SHARP2_VECSUPPORT_H -#include -#include -using std::complex; -#include -using std::experimental::native_simd; -using std::experimental::reduce; +#include -#include "mr_util/useful_macros.h" +#ifndef VLEN -using Tv=native_simd; -using Tm=Tv::mask_type; -using Ts=Tv::value_type; -static constexpr size_t VLEN=Tv::size(); +#if (defined(__AVX512F__)) +#define VLEN 8 +#elif (defined (__AVX__)) +#define VLEN 4 +#elif (defined (__SSE2__)) +#define VLEN 2 +#else +#define VLEN 1 +#endif + +#endif + +typedef double Ts; + +#if (VLEN==1) + +typedef double Tv; +typedef int Tm; #define vload(a) (a) +#define vzero 0. +#define vone 1. + +#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b); +#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b); +#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b); +#define vneg(a) (-(a)) +#define vabs(a) fabs(a) +#define vsqrt(a) sqrt(a) +#define vlt(a,b) ((a)<(b)) +#define vgt(a,b) ((a)>(b)) +#define vge(a,b) ((a)>=(b)) +#define vne(a,b) ((a)!=(b)) +#define vand_mask(a,b) ((a)&&(b)) +#define vor_mask(a,b) ((a)||(b)) +static inline Tv vmin (Tv a, Tv b) { return (ab) ? a : b; } +#define vanyTrue(a) (a) +#define vallTrue(a) (a) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex * MRUTIL_RESTRICT cc) + _Complex double * restrict cc) + { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } + + +#endif + +#if (VLEN==2) + +#include + +#if defined (__SSE3__) +#include +#endif +#if defined (__SSE4_1__) +#include +#endif + +typedef __m128d Tv; +typedef __m128d Tm; + +#if defined(__SSE4_1__) +#define vblend__(m,a,b) _mm_blendv_pd(b,a,m) +#else +static inline Tv vblend__(Tv m, Tv a, Tv b) + { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); } +#endif +#define vload(a) _mm_set1_pd(a) +#define vzero _mm_setzero_pd() +#define vone vload(1.) + +#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero) +#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero) +#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone) +#define vneg(a) _mm_xor_pd(vload(-0.),a) +#define vabs(a) _mm_andnot_pd(vload(-0.),a) +#define vsqrt(a) _mm_sqrt_pd(a) +#define vlt(a,b) _mm_cmplt_pd(a,b) +#define vgt(a,b) _mm_cmpgt_pd(a,b) +#define vge(a,b) _mm_cmpge_pd(a,b) +#define vne(a,b) _mm_cmpneq_pd(a,b) +#define vand_mask(a,b) _mm_and_pd(a,b) +#define vor_mask(a,b) _mm_or_pd(a,b) +#define vmin(a,b) _mm_min_pd(a,b) +#define vmax(a,b) _mm_max_pd(a,b); +#define vanyTrue(a) (_mm_movemask_pd(a)!=0) +#define vallTrue(a) (_mm_movemask_pd(a)==3) + +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, + Tv d, _Complex double * restrict cc) { - cc[0] += complex(reduce(a,std::plus<>()),reduce(b,std::plus<>())); - cc[1] += complex(reduce(c,std::plus<>()),reduce(d,std::plus<>())); + union {Tv v; _Complex double c; } u1, u2; +#if defined(__SSE3__) + u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); +#else + u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)); + u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) + + _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)); +#endif + cc[0]+=u1.c; cc[1]+=u2.c; } #endif + +#if (VLEN==4) + +#include + +typedef __m256d Tv; +typedef __m256d Tm; + +#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m) +#define vload(a) _mm256_set1_pd(a) +#define vzero _mm256_setzero_pd() +#define vone vload(1.) + +#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero) +#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero) +#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone) +#define vneg(a) _mm256_xor_pd(vload(-0.),a) +#define vabs(a) _mm256_andnot_pd(vload(-0.),a) +#define vsqrt(a) _mm256_sqrt_pd(a) +#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) +#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ) +#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ) +#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) +#define vand_mask(a,b) _mm256_and_pd(a,b) +#define vor_mask(a,b) _mm256_or_pd(a,b) +#define vmin(a,b) _mm256_min_pd(a,b) +#define vmax(a,b) _mm256_max_pd(a,b) +#define vanyTrue(a) (_mm256_movemask_pd(a)!=0) +#define vallTrue(a) (_mm256_movemask_pd(a)==15) + +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, + _Complex double * restrict cc) + { + Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); + Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), + tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); + tmp1=tmp3+tmp4; + union {Tv v; _Complex double c[2]; } u; + u.v=tmp1; + cc[0]+=u.c[0]; cc[1]+=u.c[1]; + } + +#endif + +#if (VLEN==8) + +#include + +typedef __m512d Tv; +typedef __mmask8 Tm; + +#define vload(a) _mm512_set1_pd(a) +#define vzero _mm512_setzero_pd() +#define vone vload(1.) + +#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b); +#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b); +#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b); +#define vneg(a) _mm512_mul_pd(a,vload(-1.)) +#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a) +#define vsqrt(a) _mm512_sqrt_pd(a) +#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ) +#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ) +#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ) +#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ) +#define vand_mask(a,b) ((a)&(b)) +#define vor_mask(a,b) ((a)|(b)) +#define vmin(a,b) _mm512_min_pd(a,b) +#define vmax(a,b) _mm512_max_pd(a,b) +#define vanyTrue(a) (a!=0) +#define vallTrue(a) (a==255) + +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, + _Complex double * restrict cc) + { + cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); + cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); + } + +#endif + +#endif diff --git a/libsharp2/sharp_ylmgen_c.cc b/libsharp2/sharp_ylmgen_c.c similarity index 100% rename from libsharp2/sharp_ylmgen_c.cc rename to libsharp2/sharp_ylmgen_c.c diff --git a/test/memusage.cc b/test/memusage.c similarity index 100% rename from test/memusage.cc rename to test/memusage.c diff --git a/test/sharp2_testsuite.cc b/test/sharp2_testsuite.c similarity index 97% rename from test/sharp2_testsuite.cc rename to test/sharp2_testsuite.c index dc74da2..46390a4 100644 --- a/test/sharp2_testsuite.cc +++ b/test/sharp2_testsuite.c @@ -26,8 +26,7 @@ #include #include -#include -using std::complex; +#include #ifdef USE_MPI #include "mpi.h" #include "libsharp2/sharp_mpi.h" @@ -39,7 +38,7 @@ using std::complex; #include "libsharp2/sharp_geomhelpers.h" #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_utils.h" -#include "libsharp2/sharp_utils.cc" +#include "libsharp2/sharp_utils.c" #include "test/memusage.h" static void OpenMP_status(void) @@ -95,7 +94,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected, exit(1); } -typedef complex dcmplx; +typedef complex double dcmplx; int ntasks, mytask; @@ -123,7 +122,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt) { double rv = drand(-1,1,&state); double iv = (m==0) ? 0 : drand(-1,1,&state); - alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv); + alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv; } } } @@ -231,7 +230,8 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp) sqsum[i]=0; for (ptrdiff_t j=0; jmaxdiff) maxdiff=sqr; } @@ -413,7 +414,7 @@ static void check_sign_scale(void) ALLOC2D(alm,dcmplx,2,nalms); for (int i=0; i<2; ++i) for (int j=0; j