works now

2018-12-17 10:32:49 +01:00 · 2018-12-17 10:32:49 +01:00 · 8c98d4624e
commit 8c98d4624e
parent 846b37c231
2 changed files with 30 additions and 36 deletions
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@ -40,7 +40,7 @@

 typedef complex double dcmplx;

-#define nvec (256/VLEN)
+#define nvec (128/VLEN)

 typedef union
  { Tv v; double s[VLEN]; } Tvu;
@ -215,42 +215,36 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
  sxdata_v * restrict d, int * restrict l_, int nv2)
  {
  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tbv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
  Tv prefac=vload(gen->prefac[gen->m]),
     prescale=vload(gen->fscale[gen->m]);
  Tv limscale=vload(sharp_limscale);
  int below_limit=1;
  for (int i=0; i<nv2; ++i)
    {
-    Tv cth2, sth2;
-    cth2=vsqrt((vone+d->cth[i])*vload(0.5));
-    cth2=vmax(cth2,vload(1e-15));
-    sth2=vsqrt((vone-d->cth[i])*vload(0.5));
-    sth2=vmax(sth2,vload(1e-15));
+    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
+    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
    Tm mask=vlt(d->sth[i],vzero);
-    Tm cmask=vand_mask(mask,vlt(d->cth[i],vzero));
-    vmuleq_mask(cmask,cth2,vload(-1.));
-    Tm smask=vand_mask(mask,vgt(d->cth[i],vzero));
-    vmuleq_mask(smask,sth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));

-    mypow(cth2,gen->cosPow,gen->powlimit,&ccp[i],&ccps[i]);
-    mypow(sth2,gen->sinPow,gen->powlimit,&ssp[i],&ssps[i]);
-    mypow(cth2,gen->sinPow,gen->powlimit,&csp[i],&csps[i]);
-    mypow(sth2,gen->cosPow,gen->powlimit,&scp[i],&scps[i]);
+    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);

    d->l1p[i] = vzero;
    d->l1m[i] = vzero;
-    d->l2p[i] = prefac*ccp[i];
-    d->scp[i] = prescale+ccps[i];
-    d->l2m[i] = prefac*csp[i];
-    d->scm[i] = prescale+csps[i];
+    d->l2p[i] = prefac*ccp;
+    d->scp[i] = prescale+ccps;
+    d->l2m[i] = prefac*csp;
+    d->scm[i] = prescale+csps;
    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
-
-    d->l2p[i] *= ssp[i];
-    d->scp[i] += ssps[i];
-    d->l2m[i] *= scp[i];
-    d->scm[i] += scps[i];
+    d->l2p[i] *= ssp;
+    d->scp[i] += ssps;
+    d->l2m[i] *= scp;
+    d->scm[i] += scps;
    if (gen->preMinus_p)
      d->l2p[i] = vneg(d->l2p[i]);
    if (gen->preMinus_m)
@ -277,8 +271,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
      rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]);
      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
-      below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
-                     vallTrue(vlt(d->scm[i],limscale));
+        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
+                       vallTrue(vlt(d->scm[i],limscale));
      }
    l+=2;
    }
@ -312,10 +306,10 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
      d->p1mi[i] += aci1*lw1 - agr2*lx2;
      Tv lx1=d->l2m[i]-d->l2p[i];
      Tv lw2=d->l1p[i]+d->l1m[i];
-      d->p2pr[i] -= agr2*lw2 - aci1*lx1;
+      d->p2pr[i] += agr2*lw2 - aci1*lx1;
      d->p2pi[i] += agi2*lw2 + acr1*lx1;
      d->p2mr[i] += acr2*lw2 + agi1*lx1;
-      d->p2mi[i] -= aci2*lw2 - agr1*lx1;
+      d->p2mi[i] += aci2*lw2 - agr1*lx1;
      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
      }
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@ -85,15 +85,15 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
  else
    {
    gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
-    for (int m=0; m<gen->lmax+2; ++m)
+    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
+    for (int m=0; m<gen->lmax+3; ++m)
      gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
-    ALLOC(gen->inv,double,gen->lmax+1);
+    ALLOC(gen->inv,double,gen->lmax+2);
    gen->inv[0]=0;
-    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
-    ALLOC(gen->flm1,double,2*gen->lmax+1);
-    ALLOC(gen->flm2,double,2*gen->lmax+1);
-    for (int m=0; m<2*gen->lmax+1; ++m)
+    for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
+    ALLOC(gen->flm1,double,2*gen->lmax+3);
+    ALLOC(gen->flm2,double,2*gen->lmax+3);
+    for (int m=0; m<2*gen->lmax+3; ++m)
      {
      gen->flm1[m] = sqrt(1./(m+1.));
      gen->flm2[m] = sqrt(m/(m+1.));
@ -176,7 +176,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)

    if (!ms_similar)
      {
-      for (int l=gen->mhi; l<gen->lmax; ++l)
+      for (int l=gen->mhi; l<gen->lmax+1; ++l)
        {
        double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
                  *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];