diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index f232018..1e476d0 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -40,7 +40,7 @@
 
 typedef complex double dcmplx;
 
-#define nvec (256/VLEN)
+#define nvec (128/VLEN)
 
 typedef union
   { Tv v; double s[VLEN]; } Tvu;
@@ -215,42 +215,36 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
   sxdata_v * restrict d, int * restrict l_, int nv2)
   {
   const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tbv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
   Tv prefac=vload(gen->prefac[gen->m]),
      prescale=vload(gen->fscale[gen->m]);
   Tv limscale=vload(sharp_limscale);
   int below_limit=1;
   for (int i=0; i<nv2; ++i)
     {
-    Tv cth2, sth2;
-    cth2=vsqrt((vone+d->cth[i])*vload(0.5));
-    cth2=vmax(cth2,vload(1e-15));
-    sth2=vsqrt((vone-d->cth[i])*vload(0.5));
-    sth2=vmax(sth2,vload(1e-15));
+    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
+    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
     Tm mask=vlt(d->sth[i],vzero);
-    Tm cmask=vand_mask(mask,vlt(d->cth[i],vzero));
-    vmuleq_mask(cmask,cth2,vload(-1.));
-    Tm smask=vand_mask(mask,vgt(d->cth[i],vzero));
-    vmuleq_mask(smask,sth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
 
-    mypow(cth2,gen->cosPow,gen->powlimit,&ccp[i],&ccps[i]);
-    mypow(sth2,gen->sinPow,gen->powlimit,&ssp[i],&ssps[i]);
-    mypow(cth2,gen->sinPow,gen->powlimit,&csp[i],&csps[i]);
-    mypow(sth2,gen->cosPow,gen->powlimit,&scp[i],&scps[i]);
+    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
 
     d->l1p[i] = vzero;
     d->l1m[i] = vzero;
-    d->l2p[i] = prefac*ccp[i];
-    d->scp[i] = prescale+ccps[i];
-    d->l2m[i] = prefac*csp[i];
-    d->scm[i] = prescale+csps[i];
+    d->l2p[i] = prefac*ccp;
+    d->scp[i] = prescale+ccps;
+    d->l2m[i] = prefac*csp;
+    d->scm[i] = prescale+csps;
     Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
     Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
-
-    d->l2p[i] *= ssp[i];
-    d->scp[i] += ssps[i];
-    d->l2m[i] *= scp[i];
-    d->scm[i] += scps[i];
+    d->l2p[i] *= ssp;
+    d->scp[i] += ssps;
+    d->l2m[i] *= scp;
+    d->scm[i] += scps;
     if (gen->preMinus_p)
       d->l2p[i] = vneg(d->l2p[i]);
     if (gen->preMinus_m)
@@ -277,8 +271,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
       rec_step(&d->l2p[i],&d->l2m[i],&d->l1p[i],&d->l1m[i],d->cth[i],fx[l+2]);
       if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
           rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
-      below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
-                     vallTrue(vlt(d->scm[i],limscale));
+        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
+                       vallTrue(vlt(d->scm[i],limscale));
       }
     l+=2;
     }
@@ -312,10 +306,10 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
       d->p1mi[i] += aci1*lw1 - agr2*lx2;
       Tv lx1=d->l2m[i]-d->l2p[i];
       Tv lw2=d->l1p[i]+d->l1m[i];
-      d->p2pr[i] -= agr2*lw2 - aci1*lx1;
+      d->p2pr[i] += agr2*lw2 - aci1*lx1;
       d->p2pi[i] += agi2*lw2 + acr1*lx1;
       d->p2mr[i] += acr2*lw2 + agi1*lx1;
-      d->p2mi[i] -= aci2*lw2 - agr1*lx1;
+      d->p2mi[i] += aci2*lw2 - agr1*lx1;
       d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
       d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
       }
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index e967773..9cb9dbb 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -85,15 +85,15 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
   else
     {
     gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
-    for (int m=0; m<gen->lmax+2; ++m)
+    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
+    for (int m=0; m<gen->lmax+3; ++m)
       gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
-    ALLOC(gen->inv,double,gen->lmax+1);
+    ALLOC(gen->inv,double,gen->lmax+2);
     gen->inv[0]=0;
-    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
-    ALLOC(gen->flm1,double,2*gen->lmax+1);
-    ALLOC(gen->flm2,double,2*gen->lmax+1);
-    for (int m=0; m<2*gen->lmax+1; ++m)
+    for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
+    ALLOC(gen->flm1,double,2*gen->lmax+3);
+    ALLOC(gen->flm2,double,2*gen->lmax+3);
+    for (int m=0; m<2*gen->lmax+3; ++m)
       {
       gen->flm1[m] = sqrt(1./(m+1.));
       gen->flm2[m] = sqrt(m/(m+1.));
@@ -176,7 +176,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
 
     if (!ms_similar)
       {
-      for (int l=gen->mhi; l<gen->lmax; ++l)
+      for (int l=gen->mhi; l<gen->lmax+1; ++l)
         {
         double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
                   *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];