diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 5bf74a8..7307202 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -209,22 +209,46 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm, int l, int il, int lmax, int nv2) { - for (; l<=lmax-2; il+=2, l+=4) + if (nv2==nv0) { - Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); - Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); - Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); - Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]); - Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]); - for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; - d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3; - d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3; - d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4; - d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4; - d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); + Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]); + Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]); + for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; + d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3; + d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3; + d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4; + d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4; + d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + } + } + } + else + { + for (; l<=lmax-2; il+=2, l+=4) + { + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); + Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); + Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); + Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]); + Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]); + for (int i=0; ilam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; + d->p1r[i] += d->lam2[i]*ar1 + d->lam1[i]*ar3; + d->p1i[i] += d->lam2[i]*ai1 + d->lam1[i]*ai3; + d->p2r[i] += d->lam2[i]*ar2 + d->lam1[i]*ar4; + d->p2i[i] += d->lam2[i]*ai2 + d->lam1[i]*ai4; + d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i]; + } } } for (; l<=lmax; ++il, l+=2) @@ -486,12 +510,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, d->p1mi[i] += aci1*lw1 - agr2*lx2; Tv lx1=d->l2m[i]-d->l2p[i]; Tv lw2=d->l1p[i]+d->l1m[i]; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; d->p2pr[i] += agr2*lw2 - aci1*lx1; d->p2pi[i] += agi2*lw2 + acr1*lx1; d->p2mr[i] += acr2*lw2 + agi1*lx1; d->p2mi[i] += aci2*lw2 - agr1*lx1; - d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; - d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; } l+=2; }