From d2cd9a08550bfa6f67d0417be556b05bb790e14f Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Thu, 10 Jan 2019 18:38:46 +0100 Subject: [PATCH] tweaks --- libsharp/sharp_core.c | 110 ++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 68 deletions(-) diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 7d6709c..f0d6354 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -508,18 +508,17 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->p1pr[i] += agr1*d->l2p[i]; d->p1pi[i] += agi1*d->l2p[i]; d->p1mr[i] += acr1*d->l2p[i]; d->p1mi[i] += aci1*d->l2p[i]; - d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->p1pr[i] += aci2*d->l1p[i]; d->p1pi[i] -= acr2*d->l1p[i]; d->p1mr[i] -= agi2*d->l1p[i]; d->p1mi[i] += agr2*d->l1p[i]; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; } l+=2; } @@ -534,18 +533,17 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3])); for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; d->p2pr[i] -= aci1*d->l2m[i]; d->p2pi[i] += acr1*d->l2m[i]; d->p2mr[i] += agi1*d->l2m[i]; d->p2mi[i] -= agr1*d->l2m[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; - d->p2pr[i] += agr2*d->l1m[i]; d->p2pi[i] += agi2*d->l1m[i]; d->p2mr[i] += acr2*d->l1m[i]; d->p2mi[i] += aci2*d->l1m[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; } l+=2; } @@ -571,7 +569,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && vallTrue(vge(d->scm[i],vload(sharp_minscale))); } -full_ieee=0; + while((!full_ieee) && (l<=lmax)) { Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); @@ -617,11 +615,10 @@ full_ieee=0; if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) getCorfac(d->scm[i], &d->cfm[i], gen->cf); full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); - full_ieee=0; } l+=2; } - if (l>lmax) return; +// if (l>lmax) return; for (int i=0; il1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - agr1 += d->p1pr[i]*d->l2p[i]; - agr1 += d->p1pr[i]*d->l2m[i]; - agr1 -= d->p2mi[i]*d->l2m[i]; agr1 += d->p2mi[i]*d->l2p[i]; - agi1 += d->p1pi[i]*d->l2p[i]; - agi1 += d->p1pi[i]*d->l2m[i]; - agi1 += d->p2mr[i]*d->l2m[i]; agi1 -= d->p2mr[i]*d->l2p[i]; - acr1 += d->p1mr[i]*d->l2p[i]; - acr1 += d->p1mr[i]*d->l2m[i]; - acr1 += d->p2pi[i]*d->l2m[i]; acr1 -= d->p2pi[i]*d->l2p[i]; - aci1 += d->p1mi[i]*d->l2p[i]; - aci1 += d->p1mi[i]*d->l2m[i]; - aci1 -= d->p2pr[i]*d->l2m[i]; aci1 += d->p2pr[i]*d->l2p[i]; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; agr2 += d->p2pr[i]*d->l1p[i]; - agr2 += d->p2pr[i]*d->l1m[i]; - agr2 -= d->p1mi[i]*d->l1m[i]; - agr2 += d->p1mi[i]*d->l1p[i]; agi2 += d->p2pi[i]*d->l1p[i]; - agi2 += d->p2pi[i]*d->l1m[i]; - agi2 += d->p1mr[i]*d->l1m[i]; - agi2 -= d->p1mr[i]*d->l1p[i]; acr2 += d->p2mr[i]*d->l1p[i]; - acr2 += d->p2mr[i]*d->l1m[i]; - acr2 += d->p1pi[i]*d->l1m[i]; - acr2 -= d->p1pi[i]*d->l1p[i]; aci2 += d->p2mi[i]*d->l1p[i]; - aci2 += d->p2mi[i]*d->l1m[i]; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + } + vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); + vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); + l+=2; + } + l=lsave; + while (l<=lmax) + { + Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); + Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; + Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; + for (int i=0; il1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + agr1 += d->p1pr[i]*d->l2m[i]; + agi1 += d->p1pi[i]*d->l2m[i]; + acr1 += d->p1mr[i]*d->l2m[i]; + aci1 += d->p1mi[i]*d->l2m[i]; + agr2 -= d->p1mi[i]*d->l1m[i]; + agi2 += d->p1mr[i]*d->l1m[i]; + acr2 += d->p1pi[i]*d->l1m[i]; aci2 -= d->p1pr[i]*d->l1m[i]; - aci2 += d->p1pr[i]*d->l1p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; } vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); @@ -705,7 +701,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, iter_to_ieee_spin(gen, d, &l, nv2); job->opcnt += (l-gen->mhi) * 7*nth; if (l>lmax) return; - job->opcnt += (lmax+1-l) * 25*nth; + job->opcnt += (lmax+1-l) * 23*nth; const sharp_ylmgen_dbl2 * restrict fx = gen->fx; dcmplx * restrict alm=job->almtmp; @@ -717,8 +713,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) && vallTrue(vge(d->scm[i],vload(sharp_minscale))); } + for (int i=0; ip1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp; + tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp; + tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp; + tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp; + } -full_ieee=0; while((!full_ieee) && (l<=lmax)) { Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); @@ -732,60 +735,31 @@ full_ieee=0; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; Tv l2p = d->l2p[i]*d->cfp[i], l2m = d->l2m[i]*d->cfm[i]; Tv l1p = d->l1p[i]*d->cfp[i], l1m = d->l1m[i]*d->cfm[i]; - agr1 += d->p1pr[i]*l2p; agr1 += d->p1pr[i]*l2m; - agr1 -= d->p2mi[i]*l2m; agr1 += d->p2mi[i]*l2p; - agi1 += d->p1pi[i]*l2p; agi1 += d->p1pi[i]*l2m; - agi1 += d->p2mr[i]*l2m; agi1 -= d->p2mr[i]*l2p; - acr1 += d->p1mr[i]*l2p; acr1 += d->p1mr[i]*l2m; - acr1 += d->p2pi[i]*l2m; acr1 -= d->p2pi[i]*l2p; - aci1 += d->p1mi[i]*l2p; aci1 += d->p1mi[i]*l2m; - aci1 -= d->p2pr[i]*l2m; aci1 += d->p2pr[i]*l2p; agr2 += d->p2pr[i]*l1p; - agr2 += d->p2pr[i]*l1m; agr2 -= d->p1mi[i]*l1m; - agr2 += d->p1mi[i]*l1p; agi2 += d->p2pi[i]*l1p; - agi2 += d->p2pi[i]*l1m; agi2 += d->p1mr[i]*l1m; - agi2 -= d->p1mr[i]*l1p; acr2 += d->p2mr[i]*l1p; - acr2 += d->p2mr[i]*l1m; acr2 += d->p1pi[i]*l1m; - acr2 -= d->p1pi[i]*l1p; aci2 += d->p2mi[i]*l1p; - aci2 += d->p2mi[i]*l1m; aci2 -= d->p1pr[i]*l1m; - aci2 += d->p1pr[i]*l1p; -// Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i]; -// Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i]; -// agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx; -// agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx; -// acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx; -// aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; -// lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i]; -// lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i]; -// agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx; -// agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx; -// acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx; -// aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) getCorfac(d->scp[i], &d->cfp[i], gen->cf); full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) getCorfac(d->scm[i], &d->cfm[i], gen->cf); full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); - full_ieee=0; } vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);