spin>0 not yet working
This commit is contained in:
parent
0b8222393f
commit
07a708dbc6
2 changed files with 306 additions and 8 deletions
|
@ -201,7 +201,7 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
|
|||
*l_=l;
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
static inline void rec_step (Tv * restrict rxp, Tv * restrict rxm,
|
||||
Tv * restrict ryp, Tv * restrict rym, const Tv cth,
|
||||
const sharp_ylmgen_dbl3 fx)
|
||||
|
@ -261,7 +261,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
|||
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
|
||||
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
|
||||
|
||||
below_limit &= vallTrue(vand_mask(vlt(d->scm[i],limscale),vlt(d->scp[i],limscale)));
|
||||
below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
|
||||
vallTrue(vlt(d->scp[i],limscale));
|
||||
}
|
||||
|
||||
int l=gen->mhi;
|
||||
|
@ -298,7 +299,7 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
|||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
||||
for (int i=0; i<nvec; ++i)
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
||||
|
@ -319,8 +320,209 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
|||
}
|
||||
l+=2;
|
||||
}
|
||||
// if (l==lmax)
|
||||
// Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
|
||||
}
|
||||
|
||||
NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
|
||||
const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
|
||||
int l, int lmax, int nv2)
|
||||
{
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
|
||||
fx12=vload(fx[l+1].f[2]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
|
||||
fx22=vload(fx[l+2].f[2]);
|
||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
||||
Tv lw = d->l2p[i] + d->l2m[i];
|
||||
agr1 += d->p1pr[i]*lw;
|
||||
agi1 += d->p1pi[i]*lw;
|
||||
acr1 += d->p1mr[i]*lw;
|
||||
aci1 += d->p1mi[i]*lw;
|
||||
Tv lx = d->l2m[i] - d->l2p[i];
|
||||
agr1 -= d->p2mi[i]*lx;
|
||||
agi1 += d->p2mr[i]*lx;
|
||||
acr1 += d->p2pi[i]*lx;
|
||||
aci1 -= d->p2pr[i]*lx;
|
||||
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
|
||||
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
|
||||
lw = d->l1p[i] + d->l1m[i];
|
||||
agr2 += d->p2pr[i]*lw;
|
||||
agi2 += d->p2pi[i]*lw;
|
||||
acr2 += d->p2mr[i]*lw;
|
||||
aci2 += d->p2mi[i]*lw;
|
||||
lx = d->l1m[i] - d->l1p[i];
|
||||
agr2 -= d->p1mi[i]*lx;
|
||||
agi2 += d->p1mr[i]*lx;
|
||||
acr2 += d->p1pi[i]*lx;
|
||||
aci2 -= d->p1pr[i]*lx;
|
||||
}
|
||||
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
|
||||
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
|
||||
l+=2;
|
||||
}
|
||||
}
|
||||
|
||||
NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
|
||||
{
|
||||
int l,lmax=gen->lmax;
|
||||
int nv2 = (nth+VLEN-1)/VLEN;
|
||||
iter_to_ieee_spin(gen, d, &l, nv2);
|
||||
job->opcnt += (l-gen->m) * 10*nth;
|
||||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 28*nth;
|
||||
|
||||
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
|
||||
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
||||
}
|
||||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
|
||||
fx12=vload(fx[l+1].f[2]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
|
||||
fx22=vload(fx[l+2].f[2]);
|
||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
||||
full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
||||
Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
|
||||
Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
|
||||
d->p1pr[i] += agr1*lw1 - aci2*lx2;
|
||||
d->p1pi[i] += agi1*lw1 + acr2*lx2;
|
||||
d->p1mr[i] += acr1*lw1 + agi2*lx2;
|
||||
d->p1mi[i] += aci1*lw1 - agr2*lx2;
|
||||
Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
|
||||
Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
|
||||
d->p2pr[i] -= agr2*lw2 - aci1*lx1;
|
||||
d->p2pi[i] += agi2*lw2 + acr1*lx1;
|
||||
d->p2mr[i] += acr2*lw2 + agi1*lx1;
|
||||
d->p2mi[i] -= aci2*lw2 - agr1*lx1;
|
||||
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
|
||||
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
|
||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
||||
{
|
||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
||||
}
|
||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
|
||||
{
|
||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
||||
}
|
||||
}
|
||||
l+=2;
|
||||
}
|
||||
if (l>lmax) return;
|
||||
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] *= d->cfp[i];
|
||||
d->l2p[i] *= d->cfp[i];
|
||||
d->l1m[i] *= d->cfm[i];
|
||||
d->l2m[i] *= d->cfm[i];
|
||||
}
|
||||
alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);
|
||||
}
|
||||
|
||||
NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
||||
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
|
||||
{
|
||||
int l,lmax=gen->lmax;
|
||||
int nv2 = (nth+VLEN-1)/VLEN;
|
||||
iter_to_ieee_spin(gen, d, &l, nv2);
|
||||
job->opcnt += (l-gen->m) * 10*nth;
|
||||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 28*nth;
|
||||
|
||||
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
|
||||
dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
|
||||
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
||||
}
|
||||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
|
||||
fx12=vload(fx[l+1].f[2]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
|
||||
fx22=vload(fx[l+2].f[2]);
|
||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
||||
full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
||||
Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
|
||||
agr1 += d->p1pr[i]*lw;
|
||||
agi1 += d->p1pi[i]*lw;
|
||||
acr1 += d->p1mr[i]*lw;
|
||||
aci1 += d->p1mi[i]*lw;
|
||||
Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
|
||||
agr1 -= d->p2mi[i]*lx;
|
||||
agi1 += d->p2mr[i]*lx;
|
||||
acr1 += d->p2pi[i]*lx;
|
||||
aci1 -= d->p2pr[i]*lx;
|
||||
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
|
||||
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
|
||||
lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
|
||||
agr2 += d->p2pr[i]*lw;
|
||||
agi2 += d->p2pi[i]*lw;
|
||||
acr2 += d->p2mr[i]*lw;
|
||||
aci2 += d->p2mi[i]*lw;
|
||||
lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
|
||||
agr2 -= d->p1mi[i]*lx;
|
||||
agi2 += d->p1mr[i]*lx;
|
||||
acr2 += d->p1pi[i]*lx;
|
||||
aci2 -= d->p1pr[i]*lx;
|
||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
||||
{
|
||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
||||
}
|
||||
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
|
||||
{
|
||||
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
|
||||
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
|
||||
}
|
||||
}
|
||||
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
|
||||
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
|
||||
l+=2;
|
||||
}
|
||||
if (l>lmax) return;
|
||||
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] *= d->cfp[i];
|
||||
d->l2p[i] *= d->cfp[i];
|
||||
d->l1m[i] *= d->cfm[i];
|
||||
d->l2m[i] *= d->cfm[i];
|
||||
}
|
||||
map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -521,6 +723,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
|||
{
|
||||
d.s.cth[i]=d.s.cth[nth-1];
|
||||
d.s.sth[i]=d.s.sth[nth-1];
|
||||
d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
|
||||
}
|
||||
calc_alm2map (job, gen, &d.v, nth);
|
||||
for (int i=0; i<nth; ++i)
|
||||
|
@ -538,7 +741,63 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
|
|||
}
|
||||
else
|
||||
{
|
||||
UTIL_FAIL("only spin==0 allowed at the moment");
|
||||
int ith=0;
|
||||
int itgt[nvec*VLEN];
|
||||
while (ith<ulim-llim)
|
||||
{
|
||||
sxdata_u d;
|
||||
VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
|
||||
VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
|
||||
int nth=0;
|
||||
while ((nth<nval)&&(ith<ulim-llim))
|
||||
{
|
||||
if (mlim[ith]>=m)
|
||||
{
|
||||
itgt[nth] = ith;
|
||||
d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
|
||||
++nth;
|
||||
}
|
||||
else
|
||||
{
|
||||
int phas_idx = ith*job->s_th + mi*job->s_m;
|
||||
job->phase[phas_idx ] = job->phase[phas_idx+1] = 0;
|
||||
job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0;
|
||||
}
|
||||
++ith;
|
||||
}
|
||||
if (nth>0)
|
||||
{
|
||||
int i2=((nth+VLEN-1)/VLEN)*VLEN;
|
||||
for (int i=nth; i<i2; ++i)
|
||||
{
|
||||
d.s.cth[i]=d.s.cth[nth-1];
|
||||
d.s.sth[i]=d.s.sth[nth-1];
|
||||
d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
|
||||
d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
|
||||
}
|
||||
calc_alm2map_spin (job, gen, &d.v, nth);
|
||||
for (int i=0; i<nth; ++i)
|
||||
{
|
||||
int tgt=itgt[i];
|
||||
int phas_idx = tgt*job->s_th + mi*job->s_m;
|
||||
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
|
||||
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
|
||||
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
|
||||
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
|
||||
job->phase[phas_idx ] = q1+q2;
|
||||
job->phase[phas_idx+2] = u1+u2;
|
||||
if (ispair[tgt])
|
||||
{
|
||||
dcmplx *phQ = &(job->phase[phas_idx+1]),
|
||||
*phU = &(job->phase[phas_idx+3]);
|
||||
*phQ = q1-q2;
|
||||
*phU = u1-u2;
|
||||
if ((gen->mhi-gen->m+gen->s)&1)
|
||||
{ *phQ=-(*phQ); *phU=-(*phU); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -599,7 +858,46 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
|
|||
}
|
||||
else
|
||||
{
|
||||
UTIL_FAIL("only spin==0 allowed at the moment");
|
||||
int ith=0;
|
||||
while (ith<ulim-llim)
|
||||
{
|
||||
sxdata_u d;
|
||||
VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
|
||||
VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
|
||||
int nth=0;
|
||||
while ((nth<nval)&&(ith<ulim-llim))
|
||||
{
|
||||
if (mlim[ith]>=m)
|
||||
{
|
||||
d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
|
||||
int phas_idx = ith*job->s_th + mi*job->s_m;
|
||||
dcmplx p1Q=job->phase[phas_idx],
|
||||
p1U=job->phase[phas_idx+2],
|
||||
p2Q=ispair[ith] ? job->phase[phas_idx+1]:0.,
|
||||
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
|
||||
if ((gen->mhi-gen->m+gen->s)&1)
|
||||
{ p2Q=-p2Q; p2U=-p2U; }
|
||||
d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
|
||||
d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
|
||||
d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
|
||||
d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
|
||||
++nth;
|
||||
}
|
||||
++ith;
|
||||
}
|
||||
if (nth>0)
|
||||
{
|
||||
int i2=((nth+VLEN-1)/VLEN)*VLEN;
|
||||
for (int i=nth; i<i2; ++i)
|
||||
{
|
||||
d.s.cth[i]=d.s.cth[nth-1];
|
||||
d.s.sth[i]=d.s.sth[nth-1];
|
||||
d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
|
||||
d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
|
||||
}
|
||||
calc_map2alm_spin(job, gen, &d.v, nth);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -375,7 +375,7 @@ static void check_sign_scale(void)
|
|||
UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
|
||||
"error");
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
|
||||
NULL,NULL);
|
||||
UTIL_ASSERT(FAPPROX(map[0][0 ], 2.750897760535633285e+00,1e-12),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue