use MRUTIL macros

This commit is contained in:
Martin Reinecke 2020-01-03 22:55:52 +01:00
parent 75559e6894
commit f8a9c96acf
4 changed files with 69 additions and 73 deletions

View file

@ -34,6 +34,7 @@
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h"
#include "mr_util/threading.h"
#include "mr_util/useful_macros.h"
typedef complex<double> dcmplx;
typedef complex<float> fcmplx;
@ -58,7 +59,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
*nchunks = (ndata+(*chunksize)-1)/(*chunksize);
}
NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
{
double ofs=lmax*0.01;
if (ofs<100.) ofs=100.;
@ -95,7 +96,7 @@ static void ringhelper_destroy (ringhelper *self)
ringhelper_init(self);
}
NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{
self->norot = (fabs(phi0)<1e-14);
if (!(self->norot))
@ -276,7 +277,7 @@ static int sharp_get_mmax (int *mval, int nm)
return nm-1;
}
NOINLINE static void ringhelper_phase2ring (ringhelper *self,
MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
int pstride, int flags)
{
@ -334,7 +335,7 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
pocketfft_backward_r (self->plan, &(data[1]), 1.);
}
NOINLINE static void ringhelper_ring2phase (ringhelper *self,
MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
int pstride, int flags)
{
@ -384,7 +385,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
phase[m*pstride]=0.;
}
NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
int flags)
{
if (flags & SHARP_NO_FFT)
@ -441,7 +442,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
}
}
NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
int flags)
{
#define CLEARLOOP(real_t,body) \
@ -478,7 +479,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
}
}
NOINLINE static void init_output (sharp_job *job)
MRUTIL_NOINLINE static void init_output (sharp_job *job)
{
if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM)
@ -489,7 +490,7 @@ NOINLINE static void init_output (sharp_job *job)
clear_map (job->ginfo,job->map[i],job->flags);
}
NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{
if (job->type==SHARP_MAP2ALM)
{
@ -515,7 +516,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); }
NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
{
#define COPY_LOOP(real_t, source_t, expr_of_x) \
@ -589,7 +590,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP
}
NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
{
#define COPY_LOOP(real_t, target_t, expr_of_x) \
@ -651,7 +652,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP
}
NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
const double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
@ -659,8 +660,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
double **dmap = (double **)job->map;
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&dmap[i][ri->ofs];
const double *restrict p2=&ringtmp[i*rstride+1];
double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs];
const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1];
if (ri->stride==1)
{
if (job->flags&SHARP_ADD)
@ -683,14 +684,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
}
}
NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&ringtmp[i*rstride+1],
*restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1],
*MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]);
if (ri->stride==1)
memcpy(p1,p2,ri->nph*sizeof(double));
else
@ -744,7 +745,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
}
//FIXME: set phase to zero if not SHARP_MAP2ALM?
NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type != SHARP_MAP2ALM) return;
int pstride = job->s_m;
@ -789,7 +790,7 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
}
}
NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type == SHARP_MAP2ALM) return;
int pstride = job->s_m;
@ -834,7 +835,7 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
}
}
NOINLINE static void sharp_execute_job (sharp_job *job)
MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
{
double timer=sharp_wallTime();
job->opcnt=0;

View file

@ -94,7 +94,7 @@ typedef union
sxdata_s s;
} sxdata_u;
static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
double maxval)
{
const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
@ -115,8 +115,8 @@ static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
}
}
static void mypow(Tv val, int npow, const double * restrict powlimit,
Tv * restrict resd, Tv * restrict ress)
static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
{
Tv vminv=powlimit[npow];
auto mask = abs(val)<vminv;
@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
}
}
static inline void getCorfac(Tv scale, Tv * restrict corfac,
const double * restrict cf)
static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
const double * MRUTIL_RESTRICT cf)
{
typedef union
{ Tv v; double s[VLEN]; } Tvu;
@ -169,7 +169,7 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac,
*corfac=corf.v;
}
static inline bool rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
{
auto mask = abs(*v2)>eps;
if (any_of(mask))
@ -182,8 +182,8 @@ static inline bool rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s,
return false;
}
NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2)
{
int l=gen->m, il=0;
Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m];
@ -216,8 +216,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
*l_=l; *il_=il;
}
NOINLINE static void alm2map_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm,
int l, int il, int lmax, int nv2)
{
if (nv2==nv0)
@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
}
}
NOINLINE static void calc_alm2map (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{
int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -298,8 +298,8 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -338,8 +338,8 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
}
NOINLINE static void map2alm_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l,
int il, int lmax, int nv2)
{
for (; l<=lmax-2; il+=2, l+=4)
@ -382,8 +382,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
}
}
NOINLINE static void calc_map2alm (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{
int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -392,8 +392,8 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -432,10 +432,10 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
}
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
sxdata_v * restrict d, int * restrict l_, int nv2)
MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2)
{
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
Tv prefac=gen->prefac[gen->m],
prescale=gen->fscale[gen->m];
Tv limscale=sharp_limscale;
@ -505,8 +505,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
*l_=l;
}
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave = l;
@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -571,8 +571,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -641,8 +641,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
}
}
NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave=l;
@ -695,8 +695,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -705,8 +705,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -772,8 +772,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
}
NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave=l;
@ -816,8 +816,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -826,8 +826,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 15*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -897,7 +897,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -912,7 +912,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
if (job->spin==0)
{
//adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
{
dcmplx al = alm[l];
@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
}
}
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
}
}
//adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
dcmplx alm2 = 0.;
double alold=0;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)

View file

@ -122,10 +122,4 @@ double sharp_wallTime(void);
}
#endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
#endif

View file

@ -31,11 +31,12 @@
#include <cmath>
#include <complex>
using std::complex;
#include <experimental/simd>
using std::experimental::native_simd;
using std::experimental::reduce;
#include "mr_util/useful_macros.h"
using Tv=native_simd<double>;
using Tm=Tv::mask_type;
using Ts=Tv::value_type;
@ -44,7 +45,7 @@ static constexpr size_t VLEN=Tv::size();
#define vload(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
complex<double> * restrict cc)
complex<double> * MRUTIL_RESTRICT cc)
{
cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>()));
cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>()));