diff --git a/libsharp2/sharp.cc b/libsharp2/sharp.cc index 2b6e722..a49c106 100644 --- a/libsharp2/sharp.cc +++ b/libsharp2/sharp.cc @@ -34,6 +34,7 @@ #include "libsharp2/sharp_almhelpers.h" #include "libsharp2/sharp_geomhelpers.h" #include "mr_util/threading.h" +#include "mr_util/useful_macros.h" typedef complex dcmplx; typedef complex fcmplx; @@ -58,7 +59,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) *nchunks = (ndata+(*chunksize)-1)/(*chunksize); } -NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) +MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) { double ofs=lmax*0.01; if (ofs<100.) ofs=100.; @@ -95,7 +96,7 @@ static void ringhelper_destroy (ringhelper *self) ringhelper_init(self); } -NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) +MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) { self->norot = (fabs(phi0)<1e-14); if (!(self->norot)) @@ -276,7 +277,7 @@ static int sharp_get_mmax (int *mval, int nm) return nm-1; } -NOINLINE static void ringhelper_phase2ring (ringhelper *self, +MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, int pstride, int flags) { @@ -334,7 +335,7 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self, pocketfft_backward_r (self->plan, &(data[1]), 1.); } -NOINLINE static void ringhelper_ring2phase (ringhelper *self, +MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, int pstride, int flags) { @@ -384,7 +385,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self, phase[m*pstride]=0.; } -NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, +MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, int flags) { if (flags & SHARP_NO_FFT) @@ -441,7 +442,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, } } -NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, +MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) { #define CLEARLOOP(real_t,body) \ @@ -478,7 +479,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, } } -NOINLINE static void init_output (sharp_job *job) +MRUTIL_NOINLINE static void init_output (sharp_job *job) { if (job->flags&SHARP_ADD) return; if (job->type == SHARP_MAP2ALM) @@ -489,7 +490,7 @@ NOINLINE static void init_output (sharp_job *job) clear_map (job->ginfo,job->map[i],job->flags); } -NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) +MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) { if (job->type==SHARP_MAP2ALM) { @@ -515,7 +516,7 @@ static void alloc_almtmp (sharp_job *job, int lmax) static void dealloc_almtmp (sharp_job *job) { DEALLOC(job->almtmp); } -NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) +MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, source_t, expr_of_x) \ @@ -589,7 +590,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) +MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, target_t, expr_of_x) \ @@ -651,7 +652,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, +MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, const double *ringtmp, int rstride) { if (job->flags & SHARP_DP) @@ -659,8 +660,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double **dmap = (double **)job->map; for (int i=0; inmaps; ++i) { - double *restrict p1=&dmap[i][ri->ofs]; - const double *restrict p2=&ringtmp[i*rstride+1]; + double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs]; + const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1]; if (ri->stride==1) { if (job->flags&SHARP_ADD) @@ -683,14 +684,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, } } -NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, +MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, int rstride) { if (job->flags & SHARP_DP) for (int i=0; inmaps; ++i) { - double *restrict p1=&ringtmp[i*rstride+1], - *restrict p2=&(((double *)(job->map[i]))[ri->ofs]); + double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1], + *MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]); if (ri->stride==1) memcpy(p1,p2,ri->nph*sizeof(double)); else @@ -744,7 +745,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, } //FIXME: set phase to zero if not SHARP_MAP2ALM? -NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) +MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) { if (job->type != SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -789,7 +790,7 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) } } -NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) +MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) { if (job->type == SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -834,7 +835,7 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) } } -NOINLINE static void sharp_execute_job (sharp_job *job) +MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job) { double timer=sharp_wallTime(); job->opcnt=0; diff --git a/libsharp2/sharp_core_inc.cc b/libsharp2/sharp_core_inc.cc index a4b57fc..10c4322 100644 --- a/libsharp2/sharp_core_inc.cc +++ b/libsharp2/sharp_core_inc.cc @@ -94,7 +94,7 @@ typedef union sxdata_s s; } sxdata_u; -static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, +static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale, double maxval) { const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval; @@ -115,8 +115,8 @@ static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale, } } -static void mypow(Tv val, int npow, const double * restrict powlimit, - Tv * restrict resd, Tv * restrict ress) +static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit, + Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress) { Tv vminv=powlimit[npow]; auto mask = abs(val)eps; if (any_of(mask)) @@ -182,8 +182,8 @@ static inline bool rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, return false; } -NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, - s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2) +MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, + s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2) { int l=gen->m, il=0; Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]; @@ -216,8 +216,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, *l_=l; *il_=il; } -NOINLINE static void alm2map_kernel(s0data_v * restrict d, - const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, +MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d, + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm, int l, int il, int lmax, int nv2) { if (nv2==nv0) @@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, } } -NOINLINE static void calc_alm2map (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth) +MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job, + const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth) { int l,il,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -298,8 +298,8 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * restrict coef = gen->coef; - const dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; + const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ilmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -392,8 +392,8 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * restrict coef = gen->coef; - dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; icoef; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; Tv prefac=gen->prefac[gen->m], prescale=gen->fscale[gen->m]; Tv limscale=sharp_limscale; @@ -505,8 +505,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, *l_=l; } -NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, - const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, +MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d, + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm, int l, int lmax, int nv2) { int lsave = l; @@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, } } -NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, - const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth) +MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job, + const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth) { int l,lmax=gen->lmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -571,8 +571,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - const dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; + const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ilmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -705,8 +705,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ilmax; int nv2 = (nth+VLEN-1)/VLEN; @@ -826,8 +826,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 15*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->coef; - const dcmplx * restrict alm=job->almtmp; + const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef; + const dcmplx * MRUTIL_RESTRICT alm=job->almtmp; int full_ieee=1; for (int i=0; ispin==0) { //adjust the a_lm for the new algorithm - dcmplx * restrict alm=job->almtmp; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) { dcmplx al = alm[l]; @@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, } } -NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, +MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { @@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair, } } //adjust the a_lm for the new algorithm - dcmplx * restrict alm=job->almtmp; + dcmplx * MRUTIL_RESTRICT alm=job->almtmp; dcmplx alm2 = 0.; double alold=0; for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2) diff --git a/libsharp2/sharp_utils.h b/libsharp2/sharp_utils.h index d2a1c74..a3c9190 100644 --- a/libsharp2/sharp_utils.h +++ b/libsharp2/sharp_utils.h @@ -122,10 +122,4 @@ double sharp_wallTime(void); } #endif -#ifdef __GNUC__ -#define NOINLINE __attribute__((noinline)) -#else -#define NOINLINE -#endif - #endif diff --git a/libsharp2/sharp_vecsupport.h b/libsharp2/sharp_vecsupport.h index 2c3d6d2..efc7c66 100644 --- a/libsharp2/sharp_vecsupport.h +++ b/libsharp2/sharp_vecsupport.h @@ -31,11 +31,12 @@ #include #include using std::complex; - - #include using std::experimental::native_simd; using std::experimental::reduce; + +#include "mr_util/useful_macros.h" + using Tv=native_simd; using Tm=Tv::mask_type; using Ts=Tv::value_type; @@ -44,7 +45,7 @@ static constexpr size_t VLEN=Tv::size(); #define vload(a) (a) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, - complex * restrict cc) + complex * MRUTIL_RESTRICT cc) { cc[0] += complex(reduce(a,std::plus<>()),reduce(b,std::plus<>())); cc[1] += complex(reduce(c,std::plus<>()),reduce(d,std::plus<>()));