de-macroizing

This commit is contained in:
Martin Reinecke 2018-12-11 11:12:34 +01:00
parent cdc09826a1
commit 1e39d436c6
2 changed files with 179 additions and 183 deletions

View file

@ -33,27 +33,27 @@ typedef struct
{ Tv v[nvec]; } Tb;
typedef union
{ Tb b; double s[VLEN*nvec]; } Y(Tbu);
{ Tb b; double s[VLEN*nvec]; } Tbu;
typedef struct
{ Tb r, i; } Y(Tbri);
{ Tb r, i; } Tbri;
typedef struct
{ Tb qr, qi, ur, ui; } Y(Tbqu);
{ Tb qr, qi, ur, ui; } Tbqu;
typedef struct
{ double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
{ double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
typedef struct
{ double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
{ double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu;
typedef union
{ Y(Tbri) b; Y(Tsri)s; } Y(Tburi);
{ Tbri b; Tsri s; } Tburi;
typedef union
{ Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu);
{ Tbqu b; Tsqu s; } Tbuqu;
static inline Tb Y(Tbconst)(double val)
static inline Tb Tbconst(double val)
{
Tv v=vload(val);
Tb res;
@ -61,16 +61,16 @@ static inline Tb Y(Tbconst)(double val)
return res;
}
static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
static inline void Tbmuleq1(Tb * restrict a, double b)
{ Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
static inline Tb Y(Tbprod)(Tb a, Tb b)
static inline Tb Tbprod(Tb a, Tb b)
{ Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
static inline void Tbmuleq(Tb * restrict a, Tb b)
{ for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
static void Tbnormalize (Tb * restrict val, Tb * restrict scale,
double maxval)
{
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
@ -94,7 +94,7 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
}
}
NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit,
Tb * restrict resd, Tb * restrict ress)
{
Tv vminv=vload(powlimit[npow]);
@ -103,7 +103,7 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
{
Tb res=Y(Tbconst)(1.);
Tb res=Tbconst(1.);
do
{
if (npow&1)
@ -118,12 +118,12 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
}
while(npow>>=1);
*resd=res;
*ress=Y(Tbconst)(0.);
*ress=Tbconst(0.);
}
else
{
Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
Tb scale=Tbconst(0.), scaleint=Tbconst(0.), res=Tbconst(1.);
Tbnormalize(&val,&scaleint,sharp_fbighalf);
do
{
if (npow&1)
@ -133,14 +133,14 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
vmuleq(res.v[i],val.v[i]);
vaddeq(scale.v[i],scaleint.v[i]);
}
Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
Tbnormalize(&res,&scale,sharp_fbighalf);
}
for (int i=0; i<nvec; ++i)
{
vmuleq(val.v[i],val.v[i]);
vaddeq(scaleint.v[i],scaleint.v[i]);
}
Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
Tbnormalize(&val,&scaleint,sharp_fbighalf);
}
while(npow>>=1);
*resd=res;
@ -148,7 +148,7 @@ NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimi
}
}
static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
static inline int rescale(Tb * restrict lam1, Tb * restrict lam2,
Tb * restrict scale)
{
int did_scale=0;
@ -166,7 +166,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
return did_scale;
}
static inline int Y(TballLt)(Tb a,double b)
static inline int TballLt(Tb a,double b)
{
Tv vb=vload(b);
Tm res=vlt(a.v[0],vb);
@ -174,7 +174,7 @@ static inline int Y(TballLt)(Tb a,double b)
res=vand_mask(res,vlt(a.v[i],vb));
return vallTrue(res);
}
static inline int Y(TballGt)(Tb a,double b)
static inline int TballGt(Tb a,double b)
{
Tv vb=vload(b);
Tm res=vgt(a.v[0],vb);
@ -182,7 +182,7 @@ static inline int Y(TballGt)(Tb a,double b)
res=vand_mask(res,vgt(a.v[i],vb));
return vallTrue(res);
}
static inline int Y(TballGe)(Tb a,double b)
static inline int TballGe(Tb a,double b)
{
Tv vb=vload(b);
Tm res=vge(a.v[0],vb);
@ -191,10 +191,10 @@ static inline int Y(TballGe)(Tb a,double b)
return vallTrue(res);
}
static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
static void getCorfac(Tb scale, Tb * restrict corfac,
const double * restrict cf)
{
Y(Tbu) sc, corf;
Tbu sc, corf;
sc.b=scale;
for (int i=0; i<VLEN*nvec; ++i)
corf.s[i] = (sc.s[i]<sharp_minscale) ?
@ -202,17 +202,17 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
*corfac=corf.b;
}
NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
const sharp_Ylmgen_C * restrict gen)
{
int l=gen->m;
Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
Tb lam_1=Tbconst(0.), lam_2, scale;
mypow(sth,l,gen->powlimit,&lam_2,&scale);
Tbmuleq1(&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Tbnormalize(&lam_2,&scale,sharp_ftol);
int below_limit = Y(TballLt)(scale,sharp_limscale);
int below_limit = TballLt(scale,sharp_limscale);
while (below_limit)
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
@ -223,14 +223,14 @@ NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
- vload(gen->rf[l+1].f[1])*lam_2.v[i];
}
if (Y(rescale)(&lam_1,&lam_2,&scale))
below_limit = Y(TballLt)(scale,sharp_limscale);
if (rescale(&lam_1,&lam_2,&scale))
below_limit = TballLt(scale,sharp_limscale);
l+=2;
}
*l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
}
static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
static inline void rec_step(Tb * restrict rxp, Tb * restrict rxm,
Tb * restrict ryp, Tb * restrict rym, const Tb cth,
const sharp_ylmgen_dbl3 fx)
{
@ -242,7 +242,7 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
}
}
static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
static void iter_to_ieee_spin(const Tb cth, const Tb sth, int *l_,
Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
{
@ -262,13 +262,13 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
}
Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
Tb rec2p, rec2m, scalep, scalem;
Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
Tb rec1p=Tbconst(0.), rec1m=Tbconst(0.);
Tv prefac=vload(gen->prefac[gen->m]),
prescale=vload(gen->fscale[gen->m]);
for (int i=0; i<nvec; ++i)
@ -278,8 +278,8 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
rec2m.v[i]=vmul(prefac,csp.v[i]);
scalem.v[i]=vadd(prescale,csps.v[i]);
}
Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf);
Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf);
Tbnormalize(&rec2m,&scalem,sharp_fbighalf);
Tbnormalize(&rec2p,&scalep,sharp_fbighalf);
for (int i=0; i<nvec; ++i)
{
rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
@ -293,21 +293,21 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
if (gen->s&1)
rec2p.v[i]=vneg(rec2p.v[i]);
}
Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol);
Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol);
Tbnormalize(&rec2m,&scalem,sharp_ftol);
Tbnormalize(&rec2p,&scalep,sharp_ftol);
int l=gen->mhi;
int below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
int below_limit = TballLt(scalep,sharp_limscale)
&& TballLt(scalem,sharp_limscale);
while (below_limit)
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
below_limit = Y(TballLt)(scalep,sharp_limscale)
&& Y(TballLt)(scalem,sharp_limscale);
rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
below_limit = TballLt(scalep,sharp_limscale)
&& TballLt(scalem,sharp_limscale);
l+=2;
}
@ -317,8 +317,8 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
}
NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
Tbri * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
int l, int lmax)
{
@ -341,8 +341,8 @@ NOINLINE static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
}
}
NOINLINE static void Y(map2alm_kernel) (const Tb cth,
const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
NOINLINE static void map2alm_kernel (const Tb cth,
const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
{
while (l<=lmax)
@ -362,22 +362,22 @@ NOINLINE static void Y(map2alm_kernel) (const Tb cth,
}
}
NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
Y(Tbri) * restrict p2)
NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
Tbri * restrict p2)
{
int l,lmax=gen->lmax;
Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
getCorfac(scale,&corfac,gen->cf);
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scale,sharp_minscale);
int full_ieee = TballGe(scale,sharp_minscale);
while (!full_ieee)
{
{
@ -406,34 +406,34 @@ NOINLINE static void Y(calc_alm2map) (const Tb cth, const Tb sth,
r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
if (Y(rescale)(&lam_1,&lam_2,&scale))
if (rescale(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
getCorfac(scale,&corfac,gen->cf);
full_ieee = TballGe(scale,sharp_minscale);
}
}
if (l>lmax) return;
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
}
NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
const Y(Tbri) * restrict p2, Tv *restrict atmp)
NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
const Tbri * restrict p2, Tv *restrict atmp)
{
int lmax=gen->lmax;
Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
int l=gen->m;
Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen);
job->opcnt += (l-gen->m) * 4*VLEN*nvec;
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
Tb corfac;
Y(getCorfac)(scale,&corfac,gen->cf);
int full_ieee = Y(TballGe)(scale,sharp_minscale);
getCorfac(scale,&corfac,gen->cf);
int full_ieee = TballGe(scale,sharp_minscale);
while (!full_ieee)
{
for (int i=0; i<nvec; ++i)
@ -455,18 +455,18 @@ NOINLINE static void Y(calc_map2alm) (const Tb cth, const Tb sth,
for (int i=0; i<nvec; ++i)
lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
- vload(rf[l-1].f[1])*lam_2.v[i];
if (Y(rescale)(&lam_1,&lam_2,&scale))
if (rescale(&lam_1,&lam_2,&scale))
{
Y(getCorfac)(scale,&corfac,gen->cf);
full_ieee = Y(TballGe)(scale,sharp_minscale);
getCorfac(scale,&corfac,gen->cf);
full_ieee = TballGe(scale,sharp_minscale);
}
}
Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
}
static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
static inline void saddstep(Tbqu * restrict px, Tbqu * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
{
Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
@ -486,7 +486,7 @@ static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
}
}
static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
static inline void saddstepb(Tbqu * restrict p1, Tbqu * restrict p2,
const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
const dcmplx * restrict alm1, const dcmplx * restrict alm2)
{
@ -511,8 +511,8 @@ static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
}
}
static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
const Y(Tbqu) * restrict py, const Tb * restrict rxp,
static inline void saddstep2(const Tbqu * restrict px,
const Tbqu * restrict py, const Tb * restrict rxp,
const Tb * restrict rxm, dcmplx * restrict alm)
{
Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
@ -532,8 +532,8 @@ static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
vhsum_cmplx_special(agr,agi,acr,aci,alm);
}
NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
NOINLINE static void alm2map_spin_kernel(Tb cth, Tbqu * restrict p1,
Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{
@ -546,7 +546,7 @@ NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
}
Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
saddstepb(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
&alm[2*(l+1)]);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
@ -558,11 +558,11 @@ NOINLINE static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
l+=2;
}
if (l==lmax)
Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]);
saddstep(p1, p2, rec2p, rec2m, &alm[2*l]);
}
NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
NOINLINE static void map2alm_spin_kernel(Tb cth, const Tbqu * restrict p1,
const Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
{
while (l<lmax)
@ -576,8 +576,8 @@ NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
saddstep2(p1, p2, &rec2p, &rec2m, &alm[2*l]);
saddstep2(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
@ -590,16 +590,16 @@ NOINLINE static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
l+=2;
}
if (l==lmax)
Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
saddstep2(p1, p2, &rec2p, &rec2m, &alm[2*l]);
}
NOINLINE static void Y(calc_alm2map_spin) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2)
NOINLINE static void calc_alm2map_spin(const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Tbqu * restrict p1,
Tbqu * restrict p2)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin)
iter_to_ieee_spin
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) return;
@ -607,45 +607,45 @@ NOINLINE static void Y(calc_alm2map_spin) (const Tb cth, const Tb sth,
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
getCorfac(scalep,&corfacp,gen->cf);
getCorfac(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
int full_ieee = TballGe(scalep,sharp_minscale)
&& TballGe(scalem,sharp_minscale);
while (!full_ieee)
{
Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
saddstep(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm),
&alm[2*l]);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
saddstep(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm),
&alm[2*l]);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
getCorfac(scalep,&corfacp,gen->cf);
getCorfac(scalem,&corfacm,gen->cf);
full_ieee = TballGe(scalep,sharp_minscale)
&& TballGe(scalem,sharp_minscale);
}
}
if (l>lmax) return;
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
alm2map_spin_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax);
}
NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth,
NOINLINE static void calc_map2alm_spin (Tb cth, Tb sth,
const sharp_Ylmgen_C * restrict gen, sharp_job *job,
const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2)
const Tbqu * restrict p1, const Tbqu * restrict p2)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin)
iter_to_ieee_spin
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) return;
@ -653,36 +653,36 @@ NOINLINE static void Y(calc_map2alm_spin) (Tb cth, Tb sth,
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
getCorfac(scalep,&corfacp,gen->cf);
getCorfac(scalem,&corfacm,gen->cf);
dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
int full_ieee = TballGe(scalep,sharp_minscale)
&& TballGe(scalem,sharp_minscale);
while (!full_ieee)
{
Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]);
Tb t1=Tbprod(rec2p,corfacp), t2=Tbprod(rec2m,corfacm);
saddstep2(p1, p2, &t1, &t2, &alm[2*l]);
if (++l>lmax) return;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]);
rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
t1=Tbprod(rec1p,corfacp); t2=Tbprod(rec1m,corfacm);
saddstep2(p2, p1, &t1, &t2, &alm[2*l]);
if (++l>lmax) return;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
getCorfac(scalep,&corfacp,gen->cf);
getCorfac(scalem,&corfacm,gen->cf);
full_ieee = TballGe(scalep,sharp_minscale)
&& TballGe(scalem,sharp_minscale);
}
}
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
map2alm_spin_kernel(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
}
static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
static inline void saddstep_d(Tbqu * restrict px, Tbqu * restrict py,
const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
{
Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
@ -697,8 +697,8 @@ static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
}
}
NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
NOINLINE static void alm2map_deriv1_kernel(Tb cth, Tbqu * restrict p1,
Tbqu * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
int lmax)
{
@ -713,8 +713,8 @@ NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
vmul(fx2,rec1m.v[i]));
}
Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]);
Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]);
saddstep_d(p1,p2,rec2p,rec2m,&alm[l]);
saddstep_d(p2,p1,rec1p,rec1m,&alm[l+1]);
fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
fx2=vload(fx[l+2].f[2]);
for (int i=0; i<nvec; ++i)
@ -727,16 +727,16 @@ NOINLINE static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
l+=2;
}
if (l==lmax)
Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[l]);
saddstep_d(p1, p2, rec2p, rec2m, &alm[l]);
}
NOINLINE static void Y(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
Y(Tbqu) * restrict p2)
NOINLINE static void calc_alm2map_deriv1(const Tb cth, const Tb sth,
const sharp_Ylmgen_C *gen, sharp_job *job, Tbqu * restrict p1,
Tbqu * restrict p2)
{
int l, lmax=gen->lmax;
Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
Y(iter_to_ieee_spin)
iter_to_ieee_spin
(cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
job->opcnt += (l-gen->m) * 10*VLEN*nvec;
if (l>lmax) return;
@ -744,42 +744,42 @@ NOINLINE static void Y(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
Tb corfacp,corfacm;
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
getCorfac(scalep,&corfacp,gen->cf);
getCorfac(scalem,&corfacm,gen->cf);
const dcmplx * restrict alm=job->almtmp;
int full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
int full_ieee = TballGe(scalep,sharp_minscale)
&& TballGe(scalem,sharp_minscale);
while (!full_ieee)
{
Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
saddstep_d(p1, p2, Tbprod(rec2p,corfacp), Tbprod(rec2m,corfacm),
&alm[l]);
if (++l>lmax) break;
Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
rec_step(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
saddstep_d(p2, p1, Tbprod(rec1p,corfacp), Tbprod(rec1m,corfacm),
&alm[l]);
if (++l>lmax) break;
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
rec_step(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
if (rescale(&rec1p,&rec2p,&scalep) | rescale(&rec1m,&rec2m,&scalem))
{
Y(getCorfac)(scalep,&corfacp,gen->cf);
Y(getCorfac)(scalem,&corfacm,gen->cf);
full_ieee = Y(TballGe)(scalep,sharp_minscale)
&& Y(TballGe)(scalem,sharp_minscale);
getCorfac(scalep,&corfacp,gen->cf);
getCorfac(scalem,&corfacm,gen->cf);
full_ieee = TballGe(scalep,sharp_minscale)
&& TballGe(scalem,sharp_minscale);
}
}
if (l>lmax) return;
Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
Tbmuleq(&rec1p,corfacp); Tbmuleq(&rec2p,corfacp);
Tbmuleq(&rec1m,corfacm); Tbmuleq(&rec2m,corfacm);
alm2map_deriv1_kernel(cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
lmax);
}
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -796,8 +796,8 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
{
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tburi) p1,p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
Tburi p1,p2; VZERO(p1); VZERO(p2);
Tbu cth, sth;
int skip=1;
for (int i=0; i<nval; ++i)
@ -808,7 +808,7 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
}
if (!skip)
Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b);
for (int i=0; i<nval; ++i)
{
@ -829,8 +829,8 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
{
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tbuqu) p1,p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
Tbuqu p1,p2; VZERO(p1); VZERO(p2);
Tbu cth, sth;
int skip=1;
for (int i=0; i<nval; ++i)
@ -842,9 +842,9 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
}
if (!skip)
(job->type==SHARP_ALM2MAP) ?
Y(calc_alm2map_spin )
calc_alm2map_spin
(cth.b,sth.b,gen,job,&p1.b,&p2.b) :
Y(calc_alm2map_deriv1)
calc_alm2map_deriv1
(cth.b,sth.b,gen,job,&p1.b,&p2.b);
for (int i=0; i<nval; ++i)
@ -882,7 +882,7 @@ NOINLINE static void Y(inner_loop_a2m) (sharp_job *job, const int *ispair,
}
}
NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -900,8 +900,8 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
Tburi p1, p2; VZERO(p1); VZERO(p2);
Tbu cth, sth;
int skip=1;
for (int i=0; i<nval; ++i)
@ -920,7 +920,7 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
}
}
if (!skip)
Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
}
{
int istart=m, istop=gen->lmax+1;
@ -934,8 +934,8 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
{
for (int ith=0; ith<ulim-llim; ith+=nval)
{
Y(Tbuqu) p1, p2; VZERO(p1); VZERO(p2);
Y(Tbu) cth, sth;
Tbuqu p1, p2; VZERO(p1); VZERO(p2);
Tbu cth, sth;
int skip=1;
for (int i=0; i<nval; ++i)
@ -960,7 +960,7 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
}
}
if (!skip)
Y(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
calc_map2alm_spin (cth.b,sth.b,gen,job,&p1.b,&p2.b);
}
}
break;
@ -973,13 +973,13 @@ NOINLINE static void Y(inner_loop_m2a) (sharp_job *job, const int *ispair,
}
}
static void Y(inner_loop) (sharp_job *job, const int *ispair,
static void inner_loop_ (sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
(job->type==SHARP_MAP2ALM) ?
Y(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
Y(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
inner_loop_m2a(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
inner_loop_a2m(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
}
#undef VZERO

View file

@ -42,21 +42,17 @@ typedef complex double dcmplx;
#define XCONCATX(a,b) a##b
#define CONCATX(a,b) XCONCATX(a,b)
#define XCONCAT2(a,b) a##_##b
#define CONCAT2(a,b) XCONCAT2(a,b)
#define nvec 6
#define Tb CONCAT2(Tb,nvec)
#define Y(arg) CONCAT2(arg,nvec)
#define Y(arg) arg
#include "sharp_core_inc.c"
#undef Y
#undef Tb
#undef nvec
void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *cth,
const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
const int *mlim)
{
inner_loop_6(job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
inner_loop_(job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
}