diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index eda81f4..f13d83d 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -54,21 +54,12 @@ typedef union typedef struct { Tb r, i; } Tbri; -typedef struct - { Tb qr, qi, ur, ui; } Tbqu; - typedef struct { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri; -typedef struct - { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Tsqu; - typedef union { Tbri b; Tsri s; } Tburi; -typedef union - { Tbqu b; Tsqu s; } Tbuqu; - static inline Tb Tbconst(double val) { Tv v=vload(val); @@ -80,9 +71,6 @@ static inline Tb Tbconst(double val) static inline void Tbmuleq1(Tb * restrict a, double b) { Tv v=vload(b); for (int i=0; iv[i],v); } -static inline Tb Tbprod(Tb a, Tb b) - { Tb r; for (int i=0; iv[i],b.v[i]); } @@ -107,86 +95,56 @@ static void Tbnormalize (Tv * restrict val, Tv * restrict scale, } } -NOINLINE static void mypow (Tb val, int npow, const double * restrict powlimit, - Tb * restrict resd, Tb * restrict ress) +static void mypow(Tv val, int npow, const double * restrict powlimit, + Tv * restrict resd, Tv * restrict ress) { Tv vminv=vload(powlimit[npow]); - int npsave=npow; - for (int i=0;i>=1); - resd->v[i]=res; - ress->v[i]=vzero; + if (npow&1) + vmuleq(res,val); + vmuleq(val,val); } - else + while(npow>>=1); + *resd=res; + *ress=vzero; + } + else + { + Tv scale=vzero, scaleint=vzero, res=vone; + Tbnormalize(&val,&scaleint,sharp_fbighalf); + do { - Tv scale=vzero, scaleint=vzero, res=vone; - Tbnormalize(&val.v[i],&scaleint,sharp_fbighalf); - do + if (npow&1) { - if (npow&1) - { - vmuleq(res,val.v[i]); - vaddeq(scale,scaleint); - Tbnormalize(&res,&scale,sharp_fbighalf); - } - vmuleq(val.v[i],val.v[i]); - vaddeq(scaleint,scaleint); - Tbnormalize(&val.v[i],&scaleint,sharp_fbighalf); + vmuleq(res,val); + vaddeq(scale,scaleint); + Tbnormalize(&res,&scale,sharp_fbighalf); } - while(npow>>=1); - resd->v[i]=res; - ress->v[i]=scale; + vmuleq(val,val); + vaddeq(scaleint,scaleint); + Tbnormalize(&val,&scaleint,sharp_fbighalf); } + while(npow>>=1); + *resd=res; + *ress=scale; } } -static inline int TballLt(Tb a,double b) - { - Tv vb=vload(b); - Tm res=vlt(a.v[0],vb); - for (int i=1; im; - for (int i=0; iv[i]=vzero; - mypow(sth,l,gen->powlimit,lam_2,scale); - Tbmuleq1(lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); - for (int i=0; iv[i],&scale->v[i],sharp_ftol); + Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale); + int below_limit = 1; + for (int i=0; iv[i]=vzero; + mypow(sth.v[i],l,gen->powlimit,&lam_2->v[i],&scale->v[i]); + lam_2->v[i] *= mfac; + Tbnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol); + below_limit &= vallTrue(vlt(scale->v[i],limscale)); + } - int below_limit = TballLt(*scale,sharp_limscale); while (below_limit) { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} @@ -279,17 +241,22 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, Tbri * restrict p2) { int l,lmax=gen->lmax; - Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; + Tb lam_1,lam_2,scale; iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; if (l>lmax) return; job->opcnt += (lmax+1-l) * 8*VLEN*nvec; - Tb corfac; - getCorfac(scale,&corfac,gen->cf); const sharp_ylmgen_dbl2 * restrict rf = gen->rf; const dcmplx * restrict alm=job->almtmp; - int full_ieee = TballGe(scale,sharp_minscale); + int full_ieee=1; + Tb corfac; + for (int i=0; icf); + full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); + } + while((!full_ieee) && (l<=lmax)) { Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); @@ -309,12 +276,7 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth, vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall)); vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall)); vaddeq_mask(mask,scale.v[i],vone); - Tvu sc, corf; - sc.v=scale.v[i]; - for (int j=0; jcf[(int)(sc.s[j])-sharp_minscale]; - corfac.v[i]=corf.v; + getCorfac(scale.v[i], &corfac.v[i], gen->cf); full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); } vfmaeq(p2->r.v[i],lam_1.v[i]*corfac.v[i],ar2); @@ -333,7 +295,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, const Tbri * restrict p2) { int lmax=gen->lmax; - Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale; + Tb lam_1,lam_2,scale; int l=gen->m; iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; @@ -342,9 +304,14 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, const sharp_ylmgen_dbl2 * restrict rf = gen->rf; dcmplx * restrict alm=job->almtmp; + int full_ieee=1; Tb corfac; - getCorfac(scale,&corfac,gen->cf); - int full_ieee = TballGe(scale,sharp_minscale); + for (int i=0; icf); + full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); + } + while ((!full_ieee) && (l<=lmax)) { full_ieee=1; @@ -363,12 +330,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth, vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall)); vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall)); vaddeq_mask(mask,scale.v[i],vone); - Tvu sc, corf; - sc.v=scale.v[i]; - for (int j=0; jcf[(int)(sc.s[j])-sharp_minscale]; - corfac.v[i]=corf.v; + getCorfac(scale.v[i], &corfac.v[i], gen->cf); full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale))); } vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]);