cleanup

2018-12-10 16:32:12 +01:00 · 2018-12-10 16:32:12 +01:00 · 10c5b5f7a9
commit 10c5b5f7a9
parent c56747d36e
3 changed files with 220 additions and 369 deletions
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@ -82,22 +82,22 @@ void CONCATX(inner_loop,ARCH) (sharp_job *job, const int *ispair,const double *c
  switch (nv)
    {
    case 0x1:
-      CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
      return;
    case 0x2:
-      CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
      return;
    case 0x3:
-      CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
      return;
    case 0x4:
-      CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
      return;
    case 0x5:
-      CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
      return;
    case 0x6:
-      CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
+      CONCAT2(inner_loop,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
      return;
    }
  UTIL_FAIL("Incorrect vector parameters");
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@ -32,149 +32,89 @@
 NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax NJ1)
+  int l, int lmax)
  {
-if (njobs>1)
-  {
-  while (l<lmax-2)
-    {
-    Tb lam_3, lam_4;
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-      lam_3.v[i] = vabmc(vmul(cth.v[i],lam_2.v[i]),r0,vmul(lam_1.v[i],r1));
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
-      lam_4.v[i] = vabmc(vmul(cth.v[i],lam_3.v[i]),r0,vmul(lam_2.v[i],r1));
-    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
-      lam_1.v[i] = vabmc(vmul(cth.v[i],lam_4.v[i]),r0,vmul(lam_3.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar2=vload(creal(alm[njobs*l+j])),
-         ai2=vload(cimag(alm[njobs*l+j])),
-         ar4=vload(creal(alm[njobs*(l+2)+j])),
-         ai4=vload(cimag(alm[njobs*(l+2)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
-        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
-        }
-      Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
-         ai3=vload(cimag(alm[njobs*(l+1)+j])),
-         ar1=vload(creal(alm[njobs*(l+3)+j])),
-         ai1=vload(cimag(alm[njobs*(l+3)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
-        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
-        }
-      }
-    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
-    for (int i=0; i<nvec; ++i)
-//      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
-      lam_2.v[i] = vabmc(vmul(cth.v[i],lam_1.v[i]),r0,vmul(lam_4.v[i],r1));
-    l+=4;
-    }
-  }
  while (l<lmax)
    {
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l].f[1])*lam_1.v[i];
-    for (int j=0; j<njobs; ++j)
+    {
+    Tv ar=vload(creal(alm[l])),
+       ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
      {
-      Tv ar=vload(creal(alm[njobs*l+j])),
-         ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        p1[j].r.v[i] += lam_2.v[i]*ar;
-        p1[j].i.v[i] += lam_2.v[i]*ai;
-        }
+      p1->r.v[i] += lam_2.v[i]*ar;
+      p1->i.v[i] += lam_2.v[i]*ai;
      }
+    }
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l+1].f[1])*lam_2.v[i];
-    for (int j=0; j<njobs; ++j)
+    {
+    Tv ar=vload(creal(alm[l+1])),
+       ai=vload(cimag(alm[l+1]));
+    for (int i=0; i<nvec; ++i)
      {
-      Tv ar=vload(creal(alm[njobs*(l+1)+j])),
-         ai=vload(cimag(alm[njobs*(l+1)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        p2[j].r.v[i] += lam_1.v[i]*ar;
-        p2[j].i.v[i] += lam_1.v[i]*ai;
-        }
+      p2->r.v[i] += lam_1.v[i]*ar;
+      p2->i.v[i] += lam_1.v[i]*ai;
+      }
      }
    l+=2;
    }
  if (l==lmax)
    {
-    for (int j=0; j<njobs; ++j)
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        p1[j].r.v[i] += lam_2.v[i]*ar;
-        p1[j].i.v[i] += lam_2.v[i]*ai;
-        }
+      p1->r.v[i] += lam_2.v[i]*ar;
+      p1->i.v[i] += lam_2.v[i]*ai;
      }
    }
  }

 NOINLINE static void Z(map2alm_kernel) (const Tb cth,
  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
-  NJ1)
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp)
  {
  while (l<lmax)
    {
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vabmc(vload(rf[l].f[0]),vmul(cth.v[i],lam_2.v[i]),
                   vmul(vload(rf[l].f[1]),lam_1.v[i]));
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(atmp[2*(l*njobs+j)],lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(atmp[2*(l*njobs+j)+1],lam_2.v[i],p1[j].i.v[i]);
-//        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
-//        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i)
+      {
+      vfmaeq(atmp[2*l  ],lam_2.v[i],p1->r.v[i]);
+      vfmaeq(atmp[2*l+1],lam_2.v[i],p1->i.v[i]);
+      }
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l+1].f[1])*lam_2.v[i];
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(atmp[2*((l+1)*njobs+j)],lam_1.v[i],p2[j].r.v[i]);
-        vfmaeq(atmp[2*((l+1)*njobs+j)+1],lam_1.v[i],p2[j].i.v[i]);
-//        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
-//        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i)
+      {
+      vfmaeq(atmp[2*(l+1)  ],lam_1.v[i],p2->r.v[i]);
+      vfmaeq(atmp[2*(l+1)+1],lam_1.v[i],p2->i.v[i]);
+      }
    l+=2;
    }
  if (l==lmax)
-    {
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
-        atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
-        }
-    }
+    for (int i=0; i<nvec; ++i)
+      {
+      atmp[2*l  ] += lam_2.v[i]*p1->r.v[i];
+      atmp[2*l+1] += lam_2.v[i]*p1->i.v[i];
+      }
  }

 NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2 NJ1)
+  Y(Tbri) * restrict p2)
  {
  int l,lmax=gen->lmax;
  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;

  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
@ -183,30 +123,28 @@ NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
  int full_ieee = Y(TballGe)(scale,sharp_minscale);
  while (!full_ieee)
    {
-    for (int j=0; j<njobs; ++j)
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(p1[j].r.v[i],tmp,ar);
-        vfmaeq(p1[j].i.v[i],tmp,ai);
-        }
+      Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+      vfmaeq(p1->r.v[i],tmp,ar);
+      vfmaeq(p1->i.v[i],tmp,ai);
      }
+    }
    if (++l>lmax) break;
    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
+    {
+    Tv ar=vload(creal(alm[l])),ai=vload(cimag(alm[l]));
+    for (int i=0; i<nvec; ++i)
      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(p2[j].r.v[i],tmp,ar);
-        vfmaeq(p2[j].i.v[i],tmp,ai);
-        }
+      Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+      vfmaeq(p2->r.v[i],tmp,ar);
+      vfmaeq(p2->i.v[i],tmp,ai);
      }
+    }
    if (++l>lmax) break;
    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
@ -220,12 +158,12 @@ NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
  if (l>lmax) return;

  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
+  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
  }

 NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
+  const Y(Tbri) * restrict p2, Tv *restrict atmp)
  {
  int lmax=gen->lmax;
  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
@ -233,7 +171,7 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 8*VLEN*nvec;

  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corfac;
@ -241,24 +179,22 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  int full_ieee = Y(TballGe)(scale,sharp_minscale);
  while (!full_ieee)
    {
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=lam_2.v[i]*corfac.v[i];
-        atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
-        atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=lam_2.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p1->r.v[i];
+      atmp[2*l+1]+=tmp*p1->i.v[i];
+      }
    if (++l>lmax) return;
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l-1].f[1])*lam_1.v[i];
-    for (int j=0; j<njobs; ++j)
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=lam_1.v[i]*corfac.v[i];
-        atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
-        atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
-        }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv tmp=lam_1.v[i]*corfac.v[i];
+      atmp[2*l  ]+=tmp*p2->r.v[i];
+      atmp[2*l+1]+=tmp*p2->i.v[i];
+      }
    if (++l>lmax) return;
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
@ -271,97 +207,88 @@ NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
    }

  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp);
  }

 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
  {
-  for (int j=0; j<njobs; ++j)
+  Tv agr=vload(creal(alm[0])), agi=vload(cimag(alm[0])),
+     acr=vload(creal(alm[1])), aci=vload(cimag(alm[1]));
+  for (int i=0; i<nvec; ++i)
    {
-    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
-       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],agr,lw);
-      vfmaeq(px[j].qi.v[i],agi,lw);
-      vfmaeq(px[j].ur.v[i],acr,lw);
-      vfmaeq(px[j].ui.v[i],aci,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmseq(py[j].qr.v[i],aci,lx);
-      vfmaeq(py[j].qi.v[i],acr,lx);
-      vfmaeq(py[j].ur.v[i],agi,lx);
-      vfmseq(py[j].ui.v[i],agr,lx);
-      }
+    Tv lw=vadd(rxp.v[i],rxm.v[i]);
+    vfmaeq(px->qr.v[i],agr,lw);
+    vfmaeq(px->qi.v[i],agi,lw);
+    vfmaeq(px->ur.v[i],acr,lw);
+    vfmaeq(px->ui.v[i],aci,lw);
+    }
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lx=vsub(rxm.v[i],rxp.v[i]);
+    vfmseq(py->qr.v[i],aci,lx);
+    vfmaeq(py->qi.v[i],acr,lx);
+    vfmaeq(py->ur.v[i],agi,lx);
+    vfmseq(py->ui.v[i],agr,lx);
    }
  }

 static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
+  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
  {
-  for (int j=0; j<njobs; ++j)
+  Tv agr1=vload(creal(alm1[0])), agi1=vload(cimag(alm1[0])),
+     acr1=vload(creal(alm1[1])), aci1=vload(cimag(alm1[1]));
+  Tv agr2=vload(creal(alm2[0])), agi2=vload(cimag(alm2[0])),
+     acr2=vload(creal(alm2[1])), aci2=vload(cimag(alm2[1]));
+  for (int i=0; i<nvec; ++i)
    {
-    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
-       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
-    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
-       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw1=r2p.v[i]+r2m.v[i];
-      Tv lx2=r1m.v[i]-r1p.v[i];
-      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
-      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
-      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
-      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx1=r2m.v[i]-r2p.v[i];
-      Tv lw2=r1p.v[i]+r1m.v[i];
-      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
-      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
-      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
-      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
-      }
+    Tv lw1=r2p.v[i]+r2m.v[i];
+    Tv lx2=r1m.v[i]-r1p.v[i];
+    vfmaseq(p1->qr.v[i],agr1,lw1,aci2,lx2);
+    vfmaaeq(p1->qi.v[i],agi1,lw1,acr2,lx2);
+    vfmaaeq(p1->ur.v[i],acr1,lw1,agi2,lx2);
+    vfmaseq(p1->ui.v[i],aci1,lw1,agr2,lx2);
+    }
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lx1=r2m.v[i]-r2p.v[i];
+    Tv lw2=r1p.v[i]+r1m.v[i];
+    vfmaseq(p2->qr.v[i],agr2,lw2,aci1,lx1);
+    vfmaaeq(p2->qi.v[i],agi2,lw2,acr1,lx1);
+    vfmaaeq(p2->ur.v[i],acr2,lw2,agi1,lx1);
+    vfmaseq(p2->ui.v[i],aci2,lw2,agr1,lx1);
    }
  }

 static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm NJ1)
+  const Tb * restrict rxm, dcmplx * restrict alm)
  {
-  for (int j=0; j<njobs; ++j)
+  Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
+  for (int i=0; i<nvec; ++i)
    {
-    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp->v[i],rxm->v[i]);
-      vfmaeq(agr,px[j].qr.v[i],lw);
-      vfmaeq(agi,px[j].qi.v[i],lw);
-      vfmaeq(acr,px[j].ur.v[i],lw);
-      vfmaeq(aci,px[j].ui.v[i],lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm->v[i],rxp->v[i]);
-      vfmseq(agr,py[j].ui.v[i],lx);
-      vfmaeq(agi,py[j].ur.v[i],lx);
-      vfmaeq(acr,py[j].qi.v[i],lx);
-      vfmseq(aci,py[j].qr.v[i],lx);
-      }
-    vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
+    Tv lw=vadd(rxp->v[i],rxm->v[i]);
+    vfmaeq(agr,px->qr.v[i],lw);
+    vfmaeq(agi,px->qi.v[i],lw);
+    vfmaeq(acr,px->ur.v[i],lw);
+    vfmaeq(aci,px->ui.v[i],lw);
    }
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lx=vsub(rxm->v[i],rxp->v[i]);
+    vfmseq(agr,py->ui.v[i],lx);
+    vfmaeq(agi,py->ur.v[i],lx);
+    vfmaeq(acr,py->qi.v[i],lx);
+    vfmseq(aci,py->qr.v[i],lx);
+    }
+  vhsum_cmplx_special(agr,agi,acr,aci,alm);
  }

 NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax NJ1)
+  int lmax)
  {
  while (l<lmax)
    {
@ -372,8 +299,8 @@ NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
      }
-    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
-      &alm[2*njobs*(l+1)] NJ2);
+    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*l],
+      &alm[2*(l+1)]);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
@ -384,13 +311,12 @@ NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
    l+=2;
    }
  if (l==lmax)
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
+    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*l]);
  }

 NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
-  NJ1)
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
  {
  while (l<lmax)
    {
@ -403,8 +329,8 @@ NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
-    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
+    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*(l+1)]);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
@ -417,12 +343,12 @@ NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1
    l+=2;
    }
  if (l==lmax)
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*l]);
  }

 NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2 NJ1)
+  Y(Tbqu) * restrict p2)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
@ -430,7 +356,7 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
@ -442,11 +368,11 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
  while (!full_ieee)
    {
    Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[2*njobs*l] NJ2);
+      &alm[2*l]);
    if (++l>lmax) break;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[2*njobs*l] NJ2);
+      &alm[2*l]);
    if (++l>lmax) break;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -463,12 +389,12 @@ NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
  Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax NJ2);
+    lmax);
  }

 NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
-  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
+  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
@ -476,7 +402,7 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 28*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
@ -488,11 +414,11 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
  while (!full_ieee)
    {
    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*l]);
    if (++l>lmax) return;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*l]);
    if (++l>lmax) return;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -506,34 +432,31 @@ NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
+  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
  }

 static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
  {
-  for (int j=0; j<njobs; ++j)
+  Tv ar=vload(creal(alm[0])), ai=vload(cimag(alm[0]));
+  for (int i=0; i<nvec; ++i)
    {
-    Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],ar,lw);
-      vfmaeq(px[j].qi.v[i],ai,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmaeq(py[j].ur.v[i],ai,lx);
-      vfmseq(py[j].ui.v[i],ar,lx);
-      }
+    Tv lw=vadd(rxp.v[i],rxm.v[i]);
+    vfmaeq(px->qr.v[i],ar,lw);
+    vfmaeq(px->qi.v[i],ai,lw);
+    }
+  for (int i=0; i<nvec; ++i)
+    {
+    Tv lx=vsub(rxm.v[i],rxp.v[i]);
+    vfmaeq(py->ur.v[i],ai,lx);
+    vfmseq(py->ui.v[i],ar,lx);
    }
  }

 NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax NJ1)
+  int lmax)
  {
  while (l<lmax)
    {
@ -546,8 +469,8 @@ NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
-    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
-    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
+    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[l]);
+    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[l+1]);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
@ -560,12 +483,12 @@ NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
    l+=2;
    }
  if (l==lmax)
-    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
+    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[l]);
  }

 NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2 NJ1)
+  Y(Tbqu) * restrict p2)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
@ -573,7 +496,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
+  job->opcnt += (lmax+1-l) * 20*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
@ -585,11 +508,11 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
  while (!full_ieee)
    {
    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[njobs*l] NJ2);
+      &alm[l]);
    if (++l>lmax) break;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[njobs*l] NJ2);
+      &alm[l]);
    if (++l>lmax) break;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -606,7 +529,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax NJ2);
+    lmax);
  }


@ -614,7 +537,7 @@ NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,

 NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];
@ -629,7 +552,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
        {
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tburi) p1,p2; VZERO(p1); VZERO(p2);
          Y(Tbu) cth, sth;

          int skip=1;
@ -641,22 +564,19 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
            }
          if (!skip)
-            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b);

          for (int i=0; i<nval; ++i)
            {
            int itot=i+ith;
            if (itot<ulim-llim)
              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
-                complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
-                               r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
-                job->phase[phas_idx] = r1+r2;
-                if (ispair[itot])
-                  job->phase[phas_idx+1] = r1-r2;
-                }
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
+                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
+              job->phase[phas_idx] = r1+r2;
+              if (ispair[itot])
+                job->phase[phas_idx+1] = r1-r2;
              }
            }
          }
@ -665,7 +585,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
        {
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbuqu) p1,p2; VZERO(p1); VZERO(p2);
          Y(Tbu) cth, sth;
          int skip=1;

@ -679,33 +599,30 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
          if (!skip)
            (job->type==SHARP_ALM2MAP) ?
              Z(calc_alm2map_spin  )
-                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
+                (cth.b,sth.b,gen,job,&p1.b,&p2.b) :
              Z(calc_alm2map_deriv1)
-                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+                (cth.b,sth.b,gen,job,&p1.b,&p2.b);

          for (int i=0; i<nval; ++i)
            {
            int itot=i+ith;
            if (itot<ulim-llim)
              {
-              for (int j=0; j<njobs; ++j)
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              complex double q1 = p1.s.qr[i] + p1.s.qi[i]*_Complex_I,
+                             q2 = p2.s.qr[i] + p2.s.qi[i]*_Complex_I,
+                             u1 = p1.s.ur[i] + p1.s.ui[i]*_Complex_I,
+                             u2 = p2.s.ur[i] + p2.s.ui[i]*_Complex_I;
+              job->phase[phas_idx] = q1+q2;
+              job->phase[phas_idx+2] = u1+u2;
+              if (ispair[itot])
                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
-                complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
-                               q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
-                               u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
-                               u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
-                job->phase[phas_idx] = q1+q2;
-                job->phase[phas_idx+2] = u1+u2;
-                if (ispair[itot])
-                  {
-                  dcmplx *phQ = &(job->phase[phas_idx+1]),
-                         *phU = &(job->phase[phas_idx+3]);
-                  *phQ = q1-q2;
-                  *phU = u1-u2;
-                  if ((gen->mhi-gen->m+gen->s)&1)
-                    { *phQ=-(*phQ); *phU=-(*phU); }
-                  }
+                dcmplx *phQ = &(job->phase[phas_idx+1]),
+                       *phU = &(job->phase[phas_idx+3]);
+                *phQ = q1-q2;
+                *phU = u1-u2;
+                if ((gen->mhi-gen->m+gen->s)&1)
+                  { *phQ=-(*phQ); *phU=-(*phU); }
                }
              }
            }
@ -723,7 +640,7 @@ NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,

 NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];
@ -735,11 +652,11 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
      {
      if (job->spin==0)
        {
-        Tv atmp[2*njobs*(gen->lmax+1)];
-        memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
+        Tv atmp[2*(gen->lmax+1)];
+        memset (&atmp[2*m],0,2*(gen->lmax+1-m)*sizeof(Tv));
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tburi) p1, p2; VZERO(p1); VZERO(p2);
          Y(Tbu) cth, sth;
          int skip=1;

@ -751,21 +668,18 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
-                dcmplx ph1=job->phase[phas_idx];
-                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-                p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
-                p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
-                }
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx ph1=job->phase[phas_idx];
+              dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
+              p1.s.r[i]=creal(ph1+ph2); p1.s.i[i]=cimag(ph1+ph2);
+              p2.s.r[i]=creal(ph1-ph2); p2.s.i[i]=cimag(ph1-ph2);
              }
            }
          if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
+            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b, atmp);
          }
        {
-        int istart=m*njobs, istop=(gen->lmax+1)*njobs;
+        int istart=m, istop=gen->lmax+1;
        for(; istart<istop-2; istart+=2)
          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
        for(; istart<istop; istart++)
@ -776,7 +690,7 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
        {
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbuqu) p1, p2; VZERO(p1); VZERO(p2);
          Y(Tbu) cth, sth;
          int skip=1;

@ -788,24 +702,21 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
            if (i+ith<ulim-llim)
              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
-                dcmplx p1Q=job->phase[phas_idx],
-                       p1U=job->phase[phas_idx+2],
-                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
-                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { p2Q=-p2Q; p2U=-p2U; }
-                p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
-                p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
-                p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
-                p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
-                }
+              int phas_idx = itot*job->s_th + mi*job->s_m;
+              dcmplx p1Q=job->phase[phas_idx],
+                     p1U=job->phase[phas_idx+2],
+                     p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
+                     p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
+              if ((gen->mhi-gen->m+gen->s)&1)
+                { p2Q=-p2Q; p2U=-p2U; }
+              p1.s.qr[i]=creal(p1Q+p2Q); p1.s.qi[i]=cimag(p1Q+p2Q);
+              p1.s.ur[i]=creal(p1U+p2U); p1.s.ui[i]=cimag(p1U+p2U);
+              p2.s.qr[i]=creal(p1Q-p2Q); p2.s.qi[i]=cimag(p1Q-p2Q);
+              p2.s.ur[i]=creal(p1U-p2U); p2.s.ui[i]=cimag(p1U-p2U);
              }
            }
          if (!skip)
-            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1.b,&p2.b);
          }
        }
      break;
@ -820,11 +731,11 @@ NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,

 static void Z(inner_loop) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim)
  {
  (job->type==SHARP_MAP2ALM) ?
-    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
-    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
+    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim) :
+    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim);
  }

 #undef VZERO
--- a/libsharp/sharp_core_inchelper.c
+++ b/libsharp/sharp_core_inchelper.c
@ -2,69 +2,9 @@
 #define Y(arg) CONCAT2(arg,nvec)
 #include "sharp_core_inc.c"

-#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
-#define NJ1 , int njobs
-#define NJ2 , njobs
 #define Z(arg) CONCAT2(arg,nvec)
 #include "sharp_core_inc2.c"
 #undef Z
-#undef NJ1
-#undef NJ2
-#endif
-
-#define NJ1
-#define NJ2
-
-#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
-#define njobs 1
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
-#define njobs 2
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
-#define njobs 3
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
-#define njobs 4
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
-#define njobs 5
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
-#define njobs 6
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#undef NJ1
-#undef NJ2

 #undef Y
 #undef Tb