cleanup
This commit is contained in:
parent
cec84e5853
commit
716cd9e558
1 changed files with 17 additions and 21 deletions
|
@ -253,21 +253,23 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
|
||||||
|
|
||||||
NOINLINE static void map2alm_kernel (const Tb cth,
|
NOINLINE static void map2alm_kernel (const Tb cth,
|
||||||
const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
|
const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
|
||||||
const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
|
const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
|
||||||
{
|
{
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]),
|
Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]),
|
||||||
f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
|
f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
|
||||||
|
Tv atmp[4] = {vzero, vzero, vzero, vzero};
|
||||||
for (int i=0; i<nvec; ++i)
|
for (int i=0; i<nvec; ++i)
|
||||||
{
|
{
|
||||||
lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
|
lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
|
||||||
vfmaeq(atmp[2*l ],lam_2.v[i],p1->r.v[i]);
|
vfmaeq(atmp[0],lam_2.v[i],p1->r.v[i]);
|
||||||
vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
|
vfmaeq(atmp[1],lam_2.v[i],p1->i.v[i]);
|
||||||
lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
|
lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
|
||||||
vfmaeq(atmp[2*(l+1) ],lam_1.v[i],p2->r.v[i]);
|
vfmaeq(atmp[2],lam_1.v[i],p2->r.v[i]);
|
||||||
vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
|
vfmaeq(atmp[3],lam_1.v[i],p2->i.v[i]);
|
||||||
}
|
}
|
||||||
|
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
|
||||||
l+=2;
|
l+=2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -328,7 +330,7 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
|
||||||
|
|
||||||
NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
|
NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
|
||||||
const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
|
const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
|
||||||
const Tbri * restrict p2, Tv *restrict atmp)
|
const Tbri * restrict p2)
|
||||||
{
|
{
|
||||||
int lmax=gen->lmax;
|
int lmax=gen->lmax;
|
||||||
Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
|
Tb lam_1=Tbconst(0.),lam_2=Tbconst(0.),scale;
|
||||||
|
@ -339,6 +341,7 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
|
||||||
job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
|
job->opcnt += (lmax+1-l) * 8*VLEN*nvec;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
|
const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
|
||||||
|
dcmplx * restrict alm=job->almtmp;
|
||||||
Tb corfac;
|
Tb corfac;
|
||||||
getCorfac(scale,&corfac,gen->cf);
|
getCorfac(scale,&corfac,gen->cf);
|
||||||
int full_ieee = TballGe(scale,sharp_minscale);
|
int full_ieee = TballGe(scale,sharp_minscale);
|
||||||
|
@ -347,11 +350,12 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]),
|
Tv f10=vload(rf[l ].f[0]), f11=vload(rf[l ].f[1]),
|
||||||
f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
|
f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
|
||||||
|
Tv atmp[4] = {vzero, vzero, vzero, vzero};
|
||||||
for (int i=0; i<nvec; ++i)
|
for (int i=0; i<nvec; ++i)
|
||||||
{
|
{
|
||||||
lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
|
lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
|
||||||
vfmaeq(atmp[2*l ],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
|
vfmaeq(atmp[0],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
|
||||||
vfmaeq(atmp[2*l+1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
|
vfmaeq(atmp[1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
|
||||||
lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
|
lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
|
||||||
Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
|
Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
|
||||||
if (vanyTrue(mask))
|
if (vanyTrue(mask))
|
||||||
|
@ -367,14 +371,15 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
|
||||||
corfac.v[i]=corf.v;
|
corfac.v[i]=corf.v;
|
||||||
full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
|
full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
|
||||||
}
|
}
|
||||||
vfmaeq(atmp[2*(l+1) ],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
|
vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
|
||||||
vfmaeq(atmp[2*(l+1)+1],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
|
vfmaeq(atmp[3],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
|
||||||
}
|
}
|
||||||
|
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
|
||||||
l+=2;
|
l+=2;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
|
Tbmuleq(&lam_1,corfac); Tbmuleq(&lam_2,corfac);
|
||||||
map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
|
map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -454,8 +459,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||||
{
|
{
|
||||||
if (job->spin==0)
|
if (job->spin==0)
|
||||||
{
|
{
|
||||||
Tv atmp[2*(gen->lmax+2)];
|
|
||||||
memset (&atmp[2*m],0,2*(gen->lmax+2-m)*sizeof(Tv));
|
|
||||||
for (int ith=0; ith<ulim-llim; ith+=nval)
|
for (int ith=0; ith<ulim-llim; ith+=nval)
|
||||||
{
|
{
|
||||||
Tburi p1, p2; VZERO(p1); VZERO(p2);
|
Tburi p1, p2; VZERO(p1); VZERO(p2);
|
||||||
|
@ -478,15 +481,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!skip)
|
if (!skip)
|
||||||
calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
|
calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b);
|
||||||
}
|
}
|
||||||
{
|
|
||||||
int istart=m, istop=gen->lmax+1;
|
|
||||||
for(; istart<istop-2; istart+=2)
|
|
||||||
vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
|
|
||||||
for(; istart<istop; istart++)
|
|
||||||
job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue