simplify Ylm data structures
This commit is contained in:
parent b0b0875def
commit 253b253467
3 changed files with 52 additions and 55 deletions
|
@@ -190,8 +190,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
|
|||
{
|
||||
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
|
||||
below_limit=1;
|
||||
Tv a1=vload(gen->ab[il ].f[0]), b1=vload(gen->ab[il ].f[1]);
|
||||
Tv a2=vload(gen->ab[il+1].f[0]), b2=vload(gen->ab[il+1].f[1]);
|
||||
Tv a1=vload(gen->coef[il ][0]), b1=vload(gen->coef[il ][1]);
|
||||
Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]);
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
|
||||
|
@@ -205,7 +205,7 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
|
|||
}
|
||||
|
||||
NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
||||
const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm,
|
||||
const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
|
||||
int l, int il, int lmax, int nv2)
|
||||
{
|
||||
if (nv2==nv0)
|
||||
|
@@ -216,8 +216,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
|||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
|
||||
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
|
||||
Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]);
|
||||
Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
|
||||
Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]);
|
||||
Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
|
||||
for (int i=0; i<nv0; ++i)
|
||||
{
|
||||
d->p1r[i] += d->lam2[i]*ar1;
|
||||
|
@@ -241,8 +241,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
|||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
|
||||
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
|
||||
Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]);
|
||||
Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
|
||||
Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]);
|
||||
Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->p1r[i] += d->lam2[i]*ar1;
|
||||
|
@@ -262,7 +262,7 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
|
|||
{
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
|
||||
Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->p1r[i] += d->lam2[i]*ar1;
|
||||
|
@@ -286,7 +286,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
|||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 6*nth;
|
||||
|
||||
const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
|
||||
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -299,7 +299,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
|||
{
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
|
||||
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
|
||||
Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
|
||||
full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
|
@@ -323,17 +323,17 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
|
|||
d->lam1[i] *= d->corfac[i];
|
||||
d->lam2[i] *= d->corfac[i];
|
||||
}
|
||||
alm2map_kernel(d, ab, alm, l, il, lmax, nv2);
|
||||
alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
|
||||
}
|
||||
|
||||
NOINLINE static void map2alm_kernel(s0data_v * restrict d,
|
||||
const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l,
|
||||
const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
|
||||
int il, int lmax, int nv2)
|
||||
{
|
||||
for (; l<=lmax-2; il+=2, l+=4)
|
||||
{
|
||||
Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]);
|
||||
Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]);
|
||||
Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]);
|
||||
Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]);
|
||||
Tv atmp1[4] = {vzero, vzero, vzero, vzero};
|
||||
Tv atmp2[4] = {vzero, vzero, vzero, vzero};
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -354,7 +354,7 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
|
|||
}
|
||||
for (; l<=lmax; ++il, l+=2)
|
||||
{
|
||||
Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
|
||||
Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
|
||||
Tv atmp[4] = {vzero, vzero, vzero, vzero};
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
|
@@ -380,7 +380,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
|||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 6*nth;
|
||||
|
||||
const sharp_ylmgen_dbl2 * restrict ab = gen->ab;
|
||||
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
|
||||
dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -391,7 +391,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
|||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]);
|
||||
Tv a=vload(coef[il][0]), b=vload(coef[il][1]);
|
||||
Tv atmp[4] = {vzero, vzero, vzero, vzero};
|
||||
full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -417,13 +417,13 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
|||
d->lam1[i] *= d->corfac[i];
|
||||
d->lam2[i] *= d->corfac[i];
|
||||
}
|
||||
map2alm_kernel(d, ab, alm, l, il, lmax, nv2);
|
||||
map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
|
||||
}
|
||||
|
||||
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||
sxdata_v * restrict d, int * restrict l_, int nv2)
|
||||
{
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
||||
Tv prefac=vload(gen->prefac[gen->m]),
|
||||
prescale=vload(gen->fscale[gen->m]);
|
||||
Tv limscale=vload(sharp_limscale);
|
||||
|
@@ -474,8 +474,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
|||
{
|
||||
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
|
||||
below_limit=1;
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
for (int i=0; i<nv2; ++i)
|
||||
{
|
||||
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||
|
@@ -500,8 +500,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
|||
int lsave = l;
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||
|
@@ -525,8 +525,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
|||
l=lsave;
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||
|
@@ -559,7 +559,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
|||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 23*nth;
|
||||
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -572,8 +572,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
|||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||
|
@@ -636,8 +636,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
|
|||
int lsave=l;
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -660,8 +660,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
|
|||
l=lsave;
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -693,7 +693,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
|||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 23*nth;
|
||||
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
||||
dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -714,8 +714,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
|
|||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
|
||||
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
|
||||
full_ieee=1;
|
||||
|
@@ -766,8 +766,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
|
|||
{
|
||||
while (l<=lmax)
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -803,7 +803,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
|||
if (l>lmax) return;
|
||||
job->opcnt += (lmax+1-l) * 17*nth;
|
||||
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->fx;
|
||||
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
|
||||
const dcmplx * restrict alm=job->almtmp;
|
||||
int full_ieee=1;
|
||||
for (int i=0; i<nv2; ++i)
|
||||
|
@@ -816,8 +816,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
|
|||
|
||||
while((!full_ieee) && (l<=lmax))
|
||||
{
|
||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||
Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]);
|
||||
Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]);
|
||||
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
|
||||
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
|
||||
full_ieee=1;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue