sharpification

2012-07-05 16:14:28 +02:00 · 2012-07-05 16:14:28 +02:00 · 82dc2a541f
commit 82dc2a541f
parent 84c0a7de18
14 changed files with 145 additions and 123 deletions
--- a/2
+++ b/2
@ -32,7 +32,7 @@ compile_all: $(all_cbin) hdrcopy

 autotune: sharp_bench
 	$(BINDIR)/sharp_bench
-	mv oracle.inc $(SRCROOT)/libsharp
+	mv sharp_oracle.inc $(SRCROOT)/libsharp
 	$(MAKE)

 hdrclean:
--- a/libsharp/libsharp.dox
+++ b/libsharp/libsharp.dox
@ -61,11 +61,11 @@
  <ul>
  <li>scalar a_lm to map</li>
  <li>scalar map to a_lm</li>
-<!--   <li>polarised a_lm to map</li>
-  <li>polarised map to a_lm</li> -->
+<!--  <li>polarised a_lm to map</li>
+  <li>polarised map to a_lm</li> !-->
  <li>spin a_lm to map</li>
  <li>spin map to a_lm</li>
-<!--  <li>scalar a_lm to maps of first derivatives</li> -->
+  <li>scalar a_lm to maps of first derivatives</li>
  </ul>

  SHARP supports shared-memory parallelisation via OpenMP; this feature will
--- a/libsharp/oracle.inc
+++ b/libsharp/oracle.inc
@ -1,9 +0,0 @@
-static const int maxtr = 6;
-static const int nv_opt[6][2][3] = {
-{{4,2,-1},{2,1,1}},
-{{4,2,-1},{2,1,1}},
-{{5,2,-1},{5,2,1}},
-{{5,2,-1},{5,2,1}},
-{{5,2,-1},{5,2,1}},
-{{5,2,-1},{5,2,1}}
-};
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@ -15,7 +15,7 @@ ALLOBJ:=$(ALLOBJ:%=$(OD)/%)

 ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
 $(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
-$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/oracle.inc
+$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/sharp_oracle.inc
 BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)

 $(LIB_$(PKG)): $(LIBOBJ)
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@ -464,7 +464,7 @@ void sharp_execute_job (sharp_job *job)
  int lmax = job->ainfo->lmax,
      mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);

-  job->norm_l = Ylmgen_get_norm (lmax, job->spin);
+  job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);

 /* clear output arrays if requested */
  init_output (job);
@ -501,8 +501,8 @@ void sharp_execute_job (sharp_job *job)
 {
    sharp_job ljob = *job;
    ljob.opcnt=0;
-    Ylmgen_C generator;
-    Ylmgen_init (&generator,lmax,mmax,ljob.spin);
+    sharp_Ylmgen_C generator;
+    sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
    alloc_almtmp(&ljob,lmax);

 #pragma omp for schedule(dynamic,1)
@ -517,7 +517,7 @@ void sharp_execute_job (sharp_job *job)
      almtmp2alm (&ljob, lmax, mi);
      }

-    Ylmgen_destroy(&generator);
+    sharp_Ylmgen_destroy(&generator);
    dealloc_almtmp(&ljob);

 #pragma omp critical
@ -589,7 +589,7 @@ int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
  if (type==ALM2MAP_DERIV1) spin=1;
  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
  UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
-#include "oracle.inc"
+#include "sharp_oracle.inc"

  return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
  }
--- a/libsharp/sharp_bench.c
+++ b/libsharp/sharp_bench.c
@ -98,7 +98,7 @@ int main(void)
  sharp_module_startup("sharp_bench",1,1,"",1);

  printf("Benchmarking SHTs.\n\n");
-  FILE *fp=fopen("oracle.inc","w");
+  FILE *fp=fopen("sharp_oracle.inc","w");
  UTIL_ASSERT(fp, "failed to open oracle file for writing");
  fprintf(fp,"static const int maxtr = 6;\n");
  fprintf(fp,"static const int nv_opt[6][2][3] = {\n");
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@ -73,7 +73,8 @@ typedef complex double dcmplx;
 #undef nvec

 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx)
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *idx)
  {
  int njobs=job->ntrans;
  if (njobs<=MAXJOB_SPECIAL)
--- a/libsharp/sharp_core.h
+++ b/libsharp/sharp_core.h
@ -40,7 +40,8 @@ extern "C" {
 #endif

 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx);
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *idx);

 #ifdef __cplusplus
 }
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@ -83,8 +83,8 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
        {
        vmuleq(res.v[i],val.v[i]);
        vaddeq(scale.v[i],scaleint.v[i]);
-        Tv mask=vlt(vabs(res.v[i]),vload(fsmall));
-        vmuleq(res.v[i],vblend(mask,vload(fbig),vone));
+        Tv mask=vlt(vabs(res.v[i]),vload(sharp_fsmall));
+        vmuleq(res.v[i],vblend(mask,vload(sharp_fbig),vone));
        vsubeq(scale.v[i],vblend(mask,vone,vzero));
        }
      }
@ -92,8 +92,8 @@ static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
      {
      vmuleq(val.v[i],val.v[i]);
      vaddeq(scaleint.v[i],scaleint.v[i]);
-      Tv mask = vlt(vabs(val.v[i]),vload(fsmall));
-      vmuleq(val.v[i],vblend(mask,vload(fbig),vone));
+      Tv mask = vlt(vabs(val.v[i]),vload(sharp_fsmall));
+      vmuleq(val.v[i],vblend(mask,vload(sharp_fbig),vone));
      vsubeq(scaleint.v[i],vblend(mask,vone,vzero));
      }
    }
@ -113,7 +113,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
    if (vanyTrue(mask))
      {
      did_scale=1;
-      Tv fact = vblend(mask,vload(fsmall),vone);
+      Tv fact = vblend(mask,vload(sharp_fsmall),vone);
      vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
      vaddeq(scale->v[i],vblend(mask,vone,vzero));
      }
@ -123,7 +123,7 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,

 static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale)
  {
-  const Tv vfsmall=vload(fsmall), vfbig=vload(fbig);
+  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
  for (int i=0;i<nvec; ++i)
    {
    Tv mask = vgt(vabs(val->v[i]),vone);
@ -168,13 +168,14 @@ static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
  Y(Tbu) sc, corf;
  sc.b=scale;
  for (int i=0; i<VLEN*nvec; ++i)
-    corf.s[i] = (sc.s[i]<minscale) ? 0. : cf[(int)(sc.s[i])-minscale];
+    corf.s[i] = (sc.s[i]<sharp_minscale) ?
+      0. : cf[(int)(sc.s[i])-sharp_minscale];
  *corfac=corf.b;
  }

 static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
-  const Ylmgen_C * restrict gen)
+  const sharp_Ylmgen_C * restrict gen)
  {
  int l=gen->m;
  Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
@ -182,7 +183,7 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
  Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
  Y(normalize)(&lam_2,&scale);

-  int below_limit = Y(TballLt)(scale,limscale);
+  int below_limit = Y(TballLt)(scale,sharp_limscale);
  while (below_limit)
    {
    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
@ -193,14 +194,15 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      below_limit = Y(TballLt)(scale,limscale);
+      below_limit = Y(TballLt)(scale,sharp_limscale);
    l+=2;
    }
  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
  }

 static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
-  Tb * restrict ryp, Tb * restrict rym, const Tb cth, const ylmgen_dbl3 fx)
+  Tb * restrict ryp, Tb * restrict rym, const Tb cth,
+  const sharp_ylmgen_dbl3 fx)
  {
  Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
  for (int i=0; i<nvec; ++i)
@ -214,9 +216,9 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,

 static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
  Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
-  Tb * scalep_, Tb * scalem_, const Ylmgen_C * restrict gen)
+  Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
  {
-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb cth2, sth2;
  for (int i=0; i<nvec; ++i)
    {
@ -251,14 +253,16 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,

  int l=gen->mhi;

-  int below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
+  int below_limit = Y(TballLt)(scalep,sharp_limscale)
+                 && Y(TballLt)(scalem,sharp_limscale);
  while (below_limit)
    {
    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
+      below_limit = Y(TballLt)(scalep,sharp_limscale)
+                 && Y(TballLt)(scalem,sharp_limscale);
    l+=2;
    }

--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@ -40,7 +40,7 @@ typedef union

 static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
  Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
-  const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
  int l, int lmax)
  {
 #if (njobs>1)
@ -126,7 +126,7 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,

 static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
  const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
-  const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
  {
  while (l<lmax)
    {
@ -168,8 +168,9 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
    }
  }

-static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
-  sharp_job *job, Z(Tbrij) * restrict p1, Z(Tbrij) * restrict p2, int *done)
+static void Z(calc_alm2map) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1,
+  Z(Tbrij) * restrict p2, int *done)
  {
  int l,lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
@ -180,9 +181,9 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,

  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
-  const ylmgen_dbl2 * restrict rf = gen->rf;
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scale,minscale);
+  int full_ieee = Y(TballGt)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
@ -216,7 +217,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGt)(scale,minscale);
+      full_ieee = Y(TballGt)(scale,sharp_minscale);
      }
    }
  if (l>lmax) { *done=1; return; }
@ -226,7 +227,7 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
  }

 static void Z(calc_map2alm) (const Tb cth, const Tb sth,
-  const Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
+  const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
  const Z(Tbrij) * restrict p2, int *done)
  {
  int lmax=gen->lmax;
@ -237,11 +238,11 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  if (l>lmax) { *done=1; return; }
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

-  const ylmgen_dbl2 * restrict rf = gen->rf;
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scale,minscale);
+  int full_ieee = Y(TballGt)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
@ -277,7 +278,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGt)(scale,minscale);
+      full_ieee = Y(TballGt)(scale,sharp_minscale);
      }
    }

@ -371,7 +372,8 @@ static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,

 static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax)
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  int lmax)
  {
  while (l<lmax)
    {
@ -408,7 +410,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,

 static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
  const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
  {
  while (l<lmax)
    {
@ -438,7 +440,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
  }

-static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
+static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
  sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
  {
  int l, lmax=gen->lmax;
@ -449,12 +451,13 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
   { *done=1; return; }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  int full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Z(saddstep)(p1, p2,
@ -469,7 +472,8 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
      }
    }

@ -482,7 +486,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
    rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
  }

-static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
+static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
  sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
  int *done)
  {
@ -493,12 +497,13 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
  if (l>lmax) { *done=1; return; }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  int full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
@ -513,7 +518,8 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
      }
    }

@ -522,8 +528,9 @@ static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
  Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
  }

-static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
+static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
+  Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm,
+  const dcmplx * restrict alm)
  {
  for (int j=0; j<njobs; ++j)
    {
@ -545,7 +552,7 @@ static inline void Z(saddstep_d) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py

 static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax)
  {
  while (l<lmax)
@ -576,7 +583,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]);
  }

-static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
+static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
  sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
  {
  int l, lmax=gen->lmax;
@ -587,12 +594,13 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
   { *done=1; return; }
  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;

-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  int full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -607,7 +615,8 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
      }
    }

@ -623,12 +632,12 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)

 static void Z(inner_loop) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
-  int mi, const int *idx)
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *idx)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];
-  Ylmgen_prepare (gen, m);
+  sharp_Ylmgen_prepare (gen, m);

  switch (job->type)
    {
--- a/libsharp/sharp_core_inc3.c
+++ b/libsharp/sharp_core_inc3.c
@ -31,7 +31,7 @@

 static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
  int l, int lmax, int njobs)
  {
  while (l<lmax-2)
@ -115,7 +115,7 @@ static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,

 static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
  int njobs)
  {
  while (l<lmax)
@ -158,9 +158,9 @@ static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
    }
  }

-static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
-  sharp_job *job, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, int njobs,
-  int *done)
+static void Y(calc_alm2map) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
+  Y(Tbri) * restrict p2, int njobs, int *done)
  {
  int l,lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
@ -171,9 +171,9 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,

  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
-  const ylmgen_dbl2 * restrict rf = gen->rf;
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scale,minscale);
+  int full_ieee = Y(TballGt)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
@ -207,7 +207,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGt)(scale,minscale);
+      full_ieee = Y(TballGt)(scale,sharp_minscale);
      }
    }
  if (l>lmax) { *done=1; return; }
@ -217,7 +217,7 @@ static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
  }

 static void Y(calc_map2alm) (const Tb cth, const Tb sth,
-  const Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
+  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
  const Y(Tbri) * restrict p2, int njobs, int *done)
  {
  int lmax=gen->lmax;
@ -228,11 +228,11 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth,
  if (l>lmax) { *done=1; return; }
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

-  const ylmgen_dbl2 * restrict rf = gen->rf;
+  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scale,minscale);
+  int full_ieee = Y(TballGt)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
@ -268,7 +268,7 @@ static void Y(calc_map2alm) (const Tb cth, const Tb sth,
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGt)(scale,minscale);
+      full_ieee = Y(TballGt)(scale,sharp_minscale);
      }
    }

@ -362,7 +362,7 @@ static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,

 static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax, int njobs)
  {
  while (l<lmax)
@ -395,7 +395,7 @@ static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,

 static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
  int njobs)
  {
  while (l<lmax)
@ -426,7 +426,7 @@ static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
  }

-static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
+static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
  int *done)
  {
@ -438,12 +438,13 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
   { *done=1; return; }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  int full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -458,7 +459,8 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
      }
    }

@ -471,7 +473,7 @@ static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
    lmax, njobs);
  }

-static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
+static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
  sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
  int njobs, int *done)
  {
@ -482,12 +484,13 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
  if (l>lmax) { *done=1; return; }
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  int full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
@ -502,7 +505,8 @@ static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
      }
    }

@ -534,7 +538,7 @@ static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,

 static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax, int njobs)
  {
  while (l<lmax)
@ -565,7 +569,7 @@ static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
    Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs);
  }

-static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
+static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
  int *done)
  {
@ -577,12 +581,13 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
   { *done=1; return; }
  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;

-  const ylmgen_dbl3 * restrict fx = gen->fx;
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  int full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
@ -597,7 +602,8 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      full_ieee = Y(TballGt)(scalep,sharp_minscale)
+               && Y(TballGt)(scalem,sharp_minscale);
      }
    }

@ -614,12 +620,12 @@ static void Y(calc_alm2map_deriv1) (const Tb cth, const Ylmgen_C *gen,
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)

 static void Y(inner_loop) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
-  int mi, const int *idx, int njobs)
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];
-  Ylmgen_prepare (gen, m);
+  sharp_Ylmgen_prepare (gen, m);

  switch (job->type)
    {
--- a/libsharp/sharp_oracle.inc
+++ b/libsharp/sharp_oracle.inc
@ -0,0 +1,9 @@
+static const int maxtr = 6;
+static const int nv_opt[6][2][3] = {
+{{4,2,-1},{2,1,1}},
+{{5,2,-1},{2,1,1}},
+{{5,2,-1},{5,2,2}},
+{{5,2,-1},{5,2,2}},
+{{5,2,-1},{5,2,2}},
+{{5,2,-1},{5,2,2}}
+};
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@ -34,7 +34,7 @@
 #include "sharp_ylmgen_c.h"
 #include "c_utils.h"

-void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
+void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
  {
  const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;

@ -42,18 +42,19 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
  gen->mmax = m_max;
  UTIL_ASSERT(spin>=0,"incorrect spin");
  gen->s = spin;
-  UTIL_ASSERT((minscale<=0)&&(maxscale>0),"bad value for min/maxscale");
-  gen->cf=RALLOC(double,maxscale-minscale+1);
-  gen->cf[-minscale]=1.;
-  for (int m=-minscale-1; m>=0; --m)
-    gen->cf[m]=gen->cf[m+1]*fsmall;
-  for (int m=-minscale+1; m<(maxscale-minscale+1); ++m)
-    gen->cf[m]=gen->cf[m-1]*fbig;
+  UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0),
+    "bad value for min/maxscale");
+  gen->cf=RALLOC(double,sharp_maxscale-sharp_minscale+1);
+  gen->cf[-sharp_minscale]=1.;
+  for (int m=-sharp_minscale-1; m>=0; --m)
+    gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
+  for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
+    gen->cf[m]=gen->cf[m-1]*sharp_fbig;

  gen->m = -1;
  if (spin==0)
    {
-    gen->rf = RALLOC(ylmgen_dbl2,gen->lmax+1);
+    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
    gen->mfac = RALLOC(double,gen->mmax+1);
    gen->mfac[0] = inv_sqrt4pi;
    for (int m=1; m<=gen->mmax; ++m)
@ -69,7 +70,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
  else
    {
    gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,ylmgen_dbl3,gen->lmax+2);
+    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
    for (int m=0; m<gen->lmax+2; ++m)
      gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
    ALLOC(gen->inv,double,gen->lmax+1);
@ -91,7 +92,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
      {
      fac[m]=fac[m-1]*sqrt(m);
      facscale[m]=facscale[m-1];
-      if (fac[m]>1.) { fac[m]*=fsmall; ++facscale[m]; }
+      if (fac[m]>1.) { fac[m]*=sharp_fsmall; ++facscale[m]; }
      }
    for (int m=0; m<=gen->mmax; ++m)
      {
@ -105,7 +106,7 @@ void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
    }
  }

-void Ylmgen_destroy (Ylmgen_C *gen)
+void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
  {
  DEALLOC(gen->cf);
  if (gen->s==0)
@ -126,7 +127,7 @@ void Ylmgen_destroy (Ylmgen_C *gen)
    }
  }

-void Ylmgen_prepare (Ylmgen_C *gen, int m)
+void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
  {
  if (m==gen->m) return;
  UTIL_ASSERT(m>=0,"incorrect m");
@ -181,7 +182,7 @@ void Ylmgen_prepare (Ylmgen_C *gen, int m)
    }
  }

-double *Ylmgen_get_norm (int lmax, int spin)
+double *sharp_Ylmgen_get_norm (int lmax, int spin)
  {
  const double pi = 3.141592653589793238462643383279502884197;
  double *res=RALLOC(double,lmax+1);
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@ -36,11 +36,11 @@
 extern "C" {
 #endif

-enum { minscale=-8, limscale=-3, maxscale=5 };
-static const double fbig=0x1p+90,fsmall=0x1p-90;
+enum { sharp_minscale=-8, sharp_limscale=-3, sharp_maxscale=5 };
+static const double sharp_fbig=0x1p+90,sharp_fsmall=0x1p-90;

-typedef struct { double f[2]; } ylmgen_dbl2;
-typedef struct { double f[3]; } ylmgen_dbl3;
+typedef struct { double f[2]; } sharp_ylmgen_dbl2;
+typedef struct { double f[3]; } sharp_ylmgen_dbl3;

 typedef struct
  {
@ -53,13 +53,13 @@ typedef struct

 /* used if s==0 */
  double *mfac;
-  ylmgen_dbl2 *rf;
+  sharp_ylmgen_dbl2 *rf;

 /* used if s!=0 */
  int sinPow, cosPow, preMinus_p, preMinus_m;
  double *prefac;
  int *fscale;
-  ylmgen_dbl3 *fx;
+  sharp_ylmgen_dbl3 *fx;

 /* internal usage only */
 /* used if s==0 */
@ -68,22 +68,22 @@ typedef struct
 /* used if s!=0 */
  double *flm1, *flm2, *inv;
  int mlo, mhi;
-  } Ylmgen_C;
+  } sharp_Ylmgen_C;

 /*! Creates a generator which will calculate helper data for Y_lm calculation
    up to \a l=l_max and \a m=m_max. */
-void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin);
+void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin);

 /*! Deallocates a generator previously initialised by Ylmgen_init(). */
-void Ylmgen_destroy (Ylmgen_C *gen);
+void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen);

 /*! Prepares the object for the calculation at \a m. */
-void Ylmgen_prepare (Ylmgen_C *gen, int m);
+void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m);

 /*! Returns a pointer to an array with \a lmax+1 entries containing
    normalisation factors that must be applied to Y_lm values computed for
    \a spin. The array must be deallocated (using free()) by the user. */
-double *Ylmgen_get_norm (int lmax, int spin);
+double *sharp_Ylmgen_get_norm (int lmax, int spin);

 #ifdef __cplusplus
 }