first try, not working
This commit is contained in:
parent
abf3b053d7
commit
c89efbec62
3 changed files with 42 additions and 27 deletions
|
@ -392,7 +392,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
|
||||||
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
sxdata_v * restrict d, int * restrict l_, int nv2)
|
sxdata_v * restrict d, int * restrict l_, int nv2)
|
||||||
{
|
{
|
||||||
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
|
const sharp_ylmgen_dbl2 * restrict fxx = gen->fxx;
|
||||||
Tv prefac=vload(gen->prefac[gen->m]),
|
Tv prefac=vload(gen->prefac[gen->m]),
|
||||||
prescale=vload(gen->fscale[gen->m]);
|
prescale=vload(gen->fscale[gen->m]);
|
||||||
Tv limscale=vload(sharp_limscale);
|
Tv limscale=vload(sharp_limscale);
|
||||||
|
@ -443,16 +443,14 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
{
|
{
|
||||||
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
|
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
|
||||||
below_limit=1;
|
below_limit=1;
|
||||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
|
Tv fx10=vload(fxx[l+1].f[0]),fx11=vload(fxx[l+1].f[1]);
|
||||||
fx12=vload(fx[l+1].f[2]);
|
Tv fx20=vload(fxx[l+2].f[0]),fx21=vload(fxx[l+2].f[1]);
|
||||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
|
|
||||||
fx22=vload(fx[l+2].f[2]);
|
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
|
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
|
||||||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
|
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
|
||||||
below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
|
below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
|
||||||
|
@ -465,23 +463,21 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
|
||||||
}
|
}
|
||||||
|
|
||||||
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
||||||
const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
|
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
|
||||||
int l, int lmax, int nv2)
|
int l, int lmax, int nv2)
|
||||||
{
|
{
|
||||||
while (l<=lmax)
|
while (l<=lmax)
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
|
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||||
fx12=vload(fx[l+1].f[2]);
|
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
|
|
||||||
fx22=vload(fx[l+2].f[2]);
|
|
||||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
||||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||||
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
Tv lw1=d->l2p[i]+d->l2m[i];
|
Tv lw1=d->l2p[i]+d->l2m[i];
|
||||||
Tv lx2=d->l1m[i]-d->l1p[i];
|
Tv lx2=d->l1m[i]-d->l1p[i];
|
||||||
d->p1pr[i] += agr1*lw1 - aci2*lx2;
|
d->p1pr[i] += agr1*lw1 - aci2*lx2;
|
||||||
|
@ -494,8 +490,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
|
||||||
d->p2pi[i] += agi2*lw2 + acr1*lx1;
|
d->p2pi[i] += agi2*lw2 + acr1*lx1;
|
||||||
d->p2mr[i] += acr2*lw2 + agi1*lx1;
|
d->p2mr[i] += acr2*lw2 + agi1*lx1;
|
||||||
d->p2mi[i] += aci2*lw2 - agr1*lx1;
|
d->p2mi[i] += aci2*lw2 - agr1*lx1;
|
||||||
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
}
|
}
|
||||||
l+=2;
|
l+=2;
|
||||||
}
|
}
|
||||||
|
@ -511,7 +507,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
if (l>lmax) return;
|
if (l>lmax) return;
|
||||||
job->opcnt += (lmax+1-l) * 28*nth;
|
job->opcnt += (lmax+1-l) * 28*nth;
|
||||||
|
|
||||||
const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
|
const sharp_ylmgen_dbl2 * restrict fx = gen->fxx;
|
||||||
const dcmplx * restrict alm=job->almtmp;
|
const dcmplx * restrict alm=job->almtmp;
|
||||||
int full_ieee=1;
|
int full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
|
@ -524,10 +520,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
|
|
||||||
while((!full_ieee) && (l<=lmax))
|
while((!full_ieee) && (l<=lmax))
|
||||||
{
|
{
|
||||||
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
|
Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]);
|
||||||
fx12=vload(fx[l+1].f[2]);
|
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]);
|
||||||
Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
|
|
||||||
fx22=vload(fx[l+2].f[2]);
|
|
||||||
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
|
||||||
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
|
||||||
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
|
||||||
|
@ -535,8 +529,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
full_ieee=1;
|
full_ieee=1;
|
||||||
for (int i=0; i<nv2; ++i)
|
for (int i=0; i<nv2; ++i)
|
||||||
{
|
{
|
||||||
d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
|
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
|
||||||
d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
|
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
|
||||||
Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
|
Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
|
||||||
Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
|
Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
|
||||||
d->p1pr[i] += agr1*lw1 - aci2*lx2;
|
d->p1pr[i] += agr1*lw1 - aci2*lx2;
|
||||||
|
@ -549,8 +543,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
|
||||||
d->p2pi[i] += agi2*lw2 + acr1*lx1;
|
d->p2pi[i] += agi2*lw2 + acr1*lx1;
|
||||||
d->p2mr[i] += acr2*lw2 + agi1*lx1;
|
d->p2mr[i] += acr2*lw2 + agi1*lx1;
|
||||||
d->p2mi[i] += aci2*lw2 - agr1*lx1;
|
d->p2mi[i] += aci2*lw2 - agr1*lx1;
|
||||||
d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
|
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
|
||||||
d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
|
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
|
||||||
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
|
||||||
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
|
||||||
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
|
||||||
|
|
|
@ -88,6 +88,8 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
|
||||||
{
|
{
|
||||||
gen->m=gen->mlo=gen->mhi=-1234567890;
|
gen->m=gen->mlo=gen->mhi=-1234567890;
|
||||||
ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
|
ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
|
||||||
|
ALLOC(gen->alpha,double,gen->lmax+3);
|
||||||
|
ALLOC(gen->fxx,sharp_ylmgen_dbl2,gen->lmax+3);
|
||||||
for (int m=0; m<gen->lmax+3; ++m)
|
for (int m=0; m<gen->lmax+3; ++m)
|
||||||
gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
|
gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
|
||||||
ALLOC(gen->inv,double,gen->lmax+2);
|
ALLOC(gen->inv,double,gen->lmax+2);
|
||||||
|
@ -145,6 +147,8 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
DEALLOC(gen->fx);
|
DEALLOC(gen->fx);
|
||||||
|
DEALLOC(gen->alpha);
|
||||||
|
DEALLOC(gen->fxx);
|
||||||
DEALLOC(gen->prefac);
|
DEALLOC(gen->prefac);
|
||||||
DEALLOC(gen->fscale);
|
DEALLOC(gen->fscale);
|
||||||
DEALLOC(gen->flm1);
|
DEALLOC(gen->flm1);
|
||||||
|
@ -199,6 +203,21 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
|
||||||
*gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
|
*gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
|
||||||
gen->fx[l+1].f[2]=t*l1*gen->inv[l];
|
gen->fx[l+1].f[2]=t*l1*gen->inv[l];
|
||||||
}
|
}
|
||||||
|
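// calculate alpha[l]: stride-2 running product of the third recurrence coefficient fx[l+1].f[2], seeded with 1 at mhi and mhi+1; used below to rescale fx into the two-coefficient fxx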
|
||||||
|
gen->alpha[gen->mhi]=gen->alpha[gen->mhi+1]=1.;
|
||||||
|
for (int l=gen->mhi+2; l<gen->lmax; ++l)
|
||||||
|
{
|
||||||
|
gen->alpha[l] = gen->alpha[l-2]*gen->fx[l+1].f[2];
|
||||||
|
// printf("%d %e %e\n", l, gen->fx[l].f[2], gen->alpha[l]);
|
||||||
|
}
|
||||||
|
gen->fxx[gen->mhi].f[0] = 0;
|
||||||
|
gen->fxx[gen->mhi].f[0] = 0;
|
||||||
|
for (int l=gen->mhi+1; l<gen->lmax+1; ++l)
|
||||||
|
{
|
||||||
|
gen->fxx[l].f[0] = gen->fx[l].f[0]*gen->alpha[l-1]/gen->alpha[l];
|
||||||
|
gen->fxx[l].f[1] = gen->fx[l].f[1]*gen->fxx[l].f[0];
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
gen->preMinus_p = gen->preMinus_m = 0;
|
gen->preMinus_p = gen->preMinus_m = 0;
|
||||||
|
|
|
@ -63,6 +63,8 @@ typedef struct
|
||||||
double *prefac;
|
double *prefac;
|
||||||
int *fscale;
|
int *fscale;
|
||||||
sharp_ylmgen_dbl3 *fx;
|
sharp_ylmgen_dbl3 *fx;
|
||||||
|
//double *alpha;
|
||||||
|
sharp_ylmgen_dbl2 *fxx;
|
||||||
|
|
||||||
/* internal usage only */
|
/* internal usage only */
|
||||||
/* used if s==0 */
|
/* used if s==0 */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue