diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index 60fbc6f..fa1f429 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -33,27 +33,27 @@ typedef struct { Tv v[nvec]; } Tb; typedef union - { Tb b; double s[VLEN*nvec]; } Y(Tbu); + { Tb b; double s[VLEN*nvec]; } Tbu; typedef struct - { Tb r, i; } Y(Tbri); + { Tb r, i; } Tbri; typedef struct - { Tb qr, qi, ur, ui; } Y(Tbqu); + { Tb qr, qi, ur, ui; } Tbqu; typedef struct - { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri); + { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri; typedef struct - { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu); + { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu; typedef union - { Y(Tbri) b; Y(Tsri)s; } Y(Tburi); + { Tbri b; Tsri s; } Tburi; typedef union - { Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu); + { Tbqu b; Tsqu s; } Tbuqu; -static inline Tb Y(Tbconst)(double val) +static inline Tb Tbconst(double val) { Tv v=vload(val); Tb res; @@ -61,16 +61,16 @@ static inline Tb Y(Tbconst)(double val) return res; } -static inline void Y(Tbmuleq1)(Tb * restrict a, double b) +static inline void Tbmuleq1(Tb * restrict a, double b) { Tv v=vload(b); for (int i=0; iv[i],v); } -static inline Tb Y(Tbprod)(Tb a, Tb b) +static inline Tb Tbprod(Tb a, Tb b) { Tb r; for (int i=0; iv[i],b.v[i]); } -static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, +static void Tbnormalize (Tb * restrict val, Tb * restrict scale, double maxval) { const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); @@ -94,7 +94,7 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, } } -NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit, +NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit, Tb * restrict resd, Tb * restrict ress) { Tv vminv=vload(powlimit[npow]); @@ -103,7 +103,7 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv)); if (!vanyTrue(mask)) // no underflows possible, use quick algoritm { - Tb res=Y(Tbconst)(1.); + Tb res=Tbconst(1.); do { if (npow&1) @@ -118,12 +118,12 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi } while(npow>>=1); *resd=res; - *ress=Y(Tbconst)(0.); + *ress=Tbconst(0.); } else { - Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); - Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); + Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.); + Tbnormalize(&val,&scaleint,sharp_fbighalf); do { if (npow&1) @@ -133,14 +133,14 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi vmuleq(res.v[i],val.v[i]); vaddeq(scale.v[i],scaleint.v[i]); } - Y(Tbnormalize)(&res,&scale,sharp_fbighalf); + Tbnormalize(&res,&scale,sharp_fbighalf); } for (int i=0; i>=1); *resd=res; @@ -148,7 +148,7 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi } } -static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2, +static inline int rescale(Tb * restrict lam1, Tb * restrict lam2, Tb * restrict scale) { int did_scale=0; @@ -166,7 +166,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2, return did_scale; } -static inline int Y(TballLt)(Tb a,double b) +static inline int TballLt(Tb a,double b) { Tv vb=vload(b); Tm res=vlt(a.v[0],vb); @@ -174,7 +174,7 @@ static inline int Y(TballLt)(Tb a,double b) res=vand_mask(res,vlt(a.v[i],vb)); return vallTrue(res); } -static inline int Y(TballGt)(Tb a,double b) +static inline int TballGt(Tb a,double b) { Tv vb=vload(b); Tm res=vgt(a.v[0],vb); @@ -182,7 +182,7 @@ static inline int Y(TballGt)(Tb a,double b) res=vand_mask(res,vgt(a.v[i],vb)); return vallTrue(res); } -static inline int Y(TballGe)(Tb a,double b) +static inline int TballGe(Tb a,double b) { Tv vb=vload(b); Tm res=vge(a.v[0],vb); @@ -191,10 +191,10 @@ static inline int Y(TballGe)(Tb a,double b) return vallTrue(res); } -static void Y(getCorfac)(Tb scale, Tb * restrict corfac, +static void getCorfac(Tb scale, Tb * restrict corfac, const double * restrict cf) { - Y(Tbu) sc, corf; + Tbu sc, corf; sc.b=scale; for (int i=0; im; - Tb lam_1=Y(Tbconst)(0.), lam_2, scale; - Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale); - Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - Y(Tbnormalize)(&lam_2,&scale,sharp_ftol); + Tb lam_1=Tbconst(0.), lam_2, scale; + mypow(sth,l,gen->powlimit,&lam_2,&scale); + Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); + Tbnormalize(&lam_2,&scale,sharp_ftol); - int below_limit = Y(TballLt)(scale,sharp_limscale); + int below_limit = TballLt(scale,sharp_limscale); while (below_limit) { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} @@ -223,14 +223,14 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) - vload(gen->rf[l+1].f[1])*lam_2.v[i]; } - if (Y(rescale)(&lam_1,&lam_2,&scale)) - below_limit = Y(TballLt)(scale,sharp_limscale); + if (rescale(&lam_1,&lam_2,&scale)) + below_limit = TballLt(scale,sharp_limscale); l+=2; } *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale; } -static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm, +static inline void rec_step(Tb * restrict rxp, Tb * restrict rxm, Tb * restrict ryp, Tb * restrict rym, const Tb cth, const sharp_ylmgen_dbl3 fx) { @@ -242,7 +242,7 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm, } } -static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_, +static void iter_to_ieee_spin(const Tb cth, const Tb sth, int *l_, Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_, Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen) { @@ -262,13 +262,13 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_, } Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps; - Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); - Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps); - Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps); - Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps); + mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); + mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps); + mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps); + mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps); Tb rec2p, rec2m, scalep, scalem; - Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.); + Tb rec1p=Tbconst(0.), rec1m=Tbconst(0.); Tv prefac=vload(gen->prefac[gen->m]), prescale=vload(gen->fscale[gen->m]); for (int i=0; is&1) rec2p.v[i]=vneg(rec2p.v[i]); } - Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol); - Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol); + Tbnormalize(&rec2m,&scalem,sharp_ftol); + Tbnormalize(&rec2p,&scalep,sharp_ftol); int l=gen->mhi; - int below_limit = Y(TballLt)(scalep,sharp_limscale) - && Y(TballLt)(scalem,sharp_limscale); + int below_limit = TballLt(scalep,sharp_limscale) + && TballLt(scalem,sharp_limscale); while (below_limit) { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]); - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) - below_limit = Y(TballLt)(scalep,sharp_limscale) - && Y(TballLt)(scalem,sharp_limscale); + rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]); + rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]); + if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem)) + below_limit = TballLt(scalep,sharp_limscale) + && TballLt(scalem,sharp_limscale); l+=2; } @@ -317,8 +317,8 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_, } -NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, - Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, +NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1, + Tbri * restrict p2, Tb lam_1, Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, int l, int lmax) { @@ -341,8 +341,8 @@ NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, } } -NOINLINE static void Y(map2alm_kernel) (const Tb cth, - const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, +NOINLINE static void map2alm_kernel (const Tb cth, + const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp) { while (l<=lmax) @@ -362,22 +362,22 @@ NOINLINE static void Y(map2alm_kernel) (const Tb cth, } } -NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, - Y(Tbri) * restrict p2) +NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, + const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1, + Tbri * restrict p2) { int l,lmax=gen->lmax; - Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; - Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; + iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; if (l>lmax) return; job->opcnt += (lmax+1-l) * 8*VLEN*nvec; Tb corfac; - Y(getCorfac)(scale,&corfac,gen->cf); + getCorfac(scale,&corfac,gen->cf); const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scale,sharp_minscale); + int full_ieee = TballGe(scale,sharp_minscale); while (!full_ieee) { { @@ -406,34 +406,34 @@ NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth, r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); for (int i=0; icf); - full_ieee = Y(TballGe)(scale,sharp_minscale); + getCorfac(scale,&corfac,gen->cf); + full_ieee = TballGe(scale,sharp_minscale); } } if (l>lmax) return; - Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); + Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac); + alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); } -NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth, - const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2, Tv *restrict atmp) +NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, + const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1, + const Tbri * restrict p2, Tv *restrict atmp) { int lmax=gen->lmax; - Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; + Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; int l=gen->m; - Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; if (l>lmax) return; job->opcnt += (lmax+1-l) * 8*VLEN*nvec; const sharp_ylmgen_dbl2 * restrict rf = gen->rf; Tb corfac; - Y(getCorfac)(scale,&corfac,gen->cf); - int full_ieee = Y(TballGe)(scale,sharp_minscale); + getCorfac(scale,&corfac,gen->cf); + int full_ieee = TballGe(scale,sharp_minscale); while (!full_ieee) { for (int i=0; icf); - full_ieee = Y(TballGe)(scale,sharp_minscale); + getCorfac(scale,&corfac,gen->cf); + full_ieee = TballGe(scale,sharp_minscale); } } - Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp); + Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac); + map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp); } -static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, +static inline void saddstep(Tbqu * restrict px, Tbqu * restrict py, const Tb rxp, const Tb rxm, const dcmplx * restrict alm) { Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])), @@ -486,7 +486,7 @@ static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, } } -static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, +static inline void saddstepb(Tbqu * restrict p1, Tbqu * restrict p2, const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, const dcmplx * restrict alm1, const dcmplx * restrict alm2) { @@ -511,8 +511,8 @@ static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, } } -static inline void Y(saddstep2) (const Y(Tbqu) * restrict px, - const Y(Tbqu) * restrict py, const Tb * restrict rxp, +static inline void saddstep2(const Tbqu * restrict px, + const Tbqu * restrict py, const Tb * restrict rxp, const Tb * restrict rxm, dcmplx * restrict alm) { Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero; @@ -532,8 +532,8 @@ static inline void Y(saddstep2) (const Y(Tbqu) * restrict px, vhsum_cmplx_special(agr,agi,acr,aci,alm); } -NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, - Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, +NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1, + Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax) { @@ -546,7 +546,7 @@ NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i]; rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i]; } - Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l], + saddstepb(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l], &alm[2*(l+1)]); fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx2=vload(fx[l+2].f[2]); @@ -558,11 +558,11 @@ NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, l+=2; } if (l==lmax) - Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]); + saddstep(p1, p2, rec2p, rec2m, &alm[2*l]); } -NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1, - const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, +NOINLINE static void map2alm_spin_kernel(Tb cth, const Tbqu * restrict p1, + const Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax) { while (llmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) + iter_to_ieee_spin (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; if (l>lmax) return; @@ -607,45 +607,45 @@ NOINLINE static void Y(calc_alm2map_spin) (const Tb cth, const Tb sth, const sharp_ylmgen_dbl3 * restrict fx = gen->fx; Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); + getCorfac(scalep,&corfacp,gen->cf); + getCorfac(scalem,&corfacm,gen->cf); const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); + int full_ieee = TballGe(scalep,sharp_minscale) + && TballGe(scalem,sharp_minscale); while (!full_ieee) { - Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + saddstep(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm), &alm[2*l]); if (++l>lmax) break; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + saddstep(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm), &alm[2*l]); if (++l>lmax) break; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem)) { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); + getCorfac(scalep,&corfacp,gen->cf); + getCorfac(scalem,&corfacm,gen->cf); + full_ieee = TballGe(scalep,sharp_minscale) + && TballGe(scalem,sharp_minscale); } } if (l>lmax) return; - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp); + Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm); + alm2map_spin_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax); } -NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth, +NOINLINE static void calc_map2alm_spin (Tb cth, Tb sth, const sharp_Ylmgen_C * restrict gen, sharp_job *job, - const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2) + const Tbqu * restrict p1, const Tbqu * restrict p2) { int l, lmax=gen->lmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) + iter_to_ieee_spin (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; if (l>lmax) return; @@ -653,36 +653,36 @@ NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth, const sharp_ylmgen_dbl3 * restrict fx = gen->fx; Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); + getCorfac(scalep,&corfacp,gen->cf); + getCorfac(scalem,&corfacm,gen->cf); dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); + int full_ieee = TballGe(scalep,sharp_minscale) + && TballGe(scalem,sharp_minscale); while (!full_ieee) { - Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); - Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]); + Tb t1=Tbprod(rec2p,corfacp), t2=Tbprod(rec2m,corfacm); + saddstep2(p1, p2, &t1, &t2, &alm[2*l]); if (++l>lmax) return; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); - Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]); + rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + t1=Tbprod(rec1p,corfacp); t2=Tbprod(rec1m,corfacm); + saddstep2(p2, p1, &t1, &t2, &alm[2*l]); if (++l>lmax) return; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem)) { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); + getCorfac(scalep,&corfacp,gen->cf); + getCorfac(scalem,&corfacm,gen->cf); + full_ieee = TballGe(scalep,sharp_minscale) + && TballGe(scalem,sharp_minscale); } } - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); + Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp); + Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm); + map2alm_spin_kernel(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); } -static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, +static inline void saddstep_d(Tbqu * restrict px, Tbqu * restrict py, const Tb rxp, const Tb rxm, const dcmplx * restrict alm) { Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0])); @@ -697,8 +697,8 @@ static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, } } -NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, - Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, +NOINLINE static void alm2map_deriv1_kernel(Tb cth, Tbqu * restrict p1, + Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax) { @@ -713,8 +713,8 @@ NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])), vmul(fx2,rec1m.v[i])); } - Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]); - Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]); + saddstep_d(p1,p2,rec2p,rec2m,&alm[l]); + saddstep_d(p2,p1,rec1p,rec1m,&alm[l+1]); fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); fx2=vload(fx[l+2].f[2]); for (int i=0; ilmax; Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; - Y(iter_to_ieee_spin) + iter_to_ieee_spin (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); job->opcnt += (l-gen->m) * 10*VLEN*nvec; if (l>lmax) return; @@ -744,42 +744,42 @@ NOINLINE static void Y(calc_alm2map_deriv1) (const Tb cth, const Tb sth, const sharp_ylmgen_dbl3 * restrict fx = gen->fx; Tb corfacp,corfacm; - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); + getCorfac(scalep,&corfacp,gen->cf); + getCorfac(scalem,&corfacm,gen->cf); const dcmplx * restrict alm=job->almtmp; - int full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); + int full_ieee = TballGe(scalep,sharp_minscale) + && TballGe(scalem,sharp_minscale); while (!full_ieee) { - Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + saddstep_d(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm), &alm[l]); if (++l>lmax) break; - Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); - Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + saddstep_d(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm), &alm[l]); if (++l>lmax) break; - Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); - if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem)) { - Y(getCorfac)(scalep,&corfacp,gen->cf); - Y(getCorfac)(scalem,&corfacm,gen->cf); - full_ieee = Y(TballGe)(scalep,sharp_minscale) - && Y(TballGe)(scalem,sharp_minscale); + getCorfac(scalep,&corfacp,gen->cf); + getCorfac(scalem,&corfacm,gen->cf); + full_ieee = TballGe(scalep,sharp_minscale) + && TballGe(scalem,sharp_minscale); } } if (l>lmax) return; - Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); - Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); - Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp); + Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm); + alm2map_deriv1_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax); } #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) -NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair, +NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { @@ -796,8 +796,8 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair, { for (int ith=0; ithtype==SHARP_ALM2MAP) ? - Y(calc_alm2map_spin ) + calc_alm2map_spin (cth.b,sth.b,gen,job,&p1.b,&p2.b) : - Y(calc_alm2map_deriv1) + calc_alm2map_deriv1 (cth.b,sth.b,gen,job,&p1.b,&p2.b); for (int i=0; ilmax+2-m)*sizeof(Tv)); for (int ith=0; ithlmax+1; @@ -934,8 +934,8 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair, { for (int ith=0; ithtype==SHARP_MAP2ALM) ? - Y(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : - Y(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); + inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) : + inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim); } #undef VZERO diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c index 1139ef8..b209cae 100644 --- a/libsharp/sharp_core_inc0.c +++ b/libsharp/sharp_core_inc0.c @@ -42,21 +42,17 @@ typedef complex double dcmplx; #define XCONCATX(a,b) a##b #define CONCATX(a,b) XCONCATX(a,b) -#define XCONCAT2(a,b) a##_##b -#define CONCAT2(a,b) XCONCAT2(a,b) #define nvec 6 -#define Tb CONCAT2(Tb,nvec) -#define Y(arg) CONCAT2(arg,nvec) +#define Y(arg) arg #include "sharp_core_inc.c" #undef Y -#undef Tb #undef nvec void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth, const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim) { - inner_loop_6(job, ispair,cth,sth,llim,ulim,gen,mi,mlim); + inner_loop_(job, ispair,cth,sth,llim,ulim,gen,mi,mlim); }