updates

2018-10-26 14:36:25 +02:00 · 2018-10-26 14:36:25 +02:00 · 18c82762c3
commit 18c82762c3
parent dce3c2b430
15 changed files with 424 additions and 219 deletions
--- a/c_utils/c_utils.c
+++ b/c_utils/c_utils.c
@ -25,7 +25,7 @@
 /*
 *  Convenience functions
 *
- *  Copyright (C) 2008, 2009, 2010, 2011, 2012 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
 *  Author: Martin Reinecke
 */
@ -44,7 +44,7 @@ void util_warn_ (const char *file, int line, const char *func, const char *msg)
 /* This function tries to avoid allocations with a total size close to a high
   power of two (called the "critical stride" here), by adding a few more bytes
-   if necssary. This lowers the probability that two arrays differ by a multiple
+   if necessary. This lowers the probability that two arrays differ by a multiple
   of the critical stride in their starting address, which in turn lowers the
   risk of cache line contention. */
 static size_t manipsize(size_t sz)
@ -61,7 +61,7 @@ void *util_malloc_ (size_t sz)
  {
  void *res;
  if (sz==0) return NULL;
-  res = _mm_malloc(manipsize(sz),16);
+  res = _mm_malloc(manipsize(sz),32);
  UTIL_ASSERT(res,"_mm_malloc() failed");
  return res;
  }
--- a/c_utils/c_utils.h
+++ b/c_utils/c_utils.h
@ -25,7 +25,7 @@
 /*! \file c_utils.h
 *  Convenience functions
 *
- *  Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
 *  \author Martin Reinecke
 *  \note This file should only be included from .c files, NOT from .h files.
 */
@ -144,4 +144,10 @@ void util_free_ (void *ptr);
 }
 #endif
 #ifdef __GNUC__
 #define NOINLINE __attribute__((noinline))
 #else
 #define NOINLINE
 #endif
 #endif
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@ -25,11 +25,12 @@
 /*! \file sharp.c
 *  Spherical transform library
 *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2016 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */
 #include <math.h>
 #include <string.h>
 #include "pocketfft/pocketfft.h"
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
@ -63,7 +64,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
  *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
  }
-int sharp_get_mlim (int lmax, int spin, double sth, double cth)
+NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
  {
  double ofs=lmax*0.01;
  if (ofs<100.) ofs=100.;
@ -83,12 +84,13 @@ typedef struct
  dcmplx *shiftarr;
  int s_shift;
  rfft_plan plan;
  int length;
  int norot;
  } ringhelper;
 static void ringhelper_init (ringhelper *self)
  {
-  static ringhelper rh_null = { 0, NULL, 0, NULL, 0 };
+  static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
  *self = rh_null;
  }
@ -99,7 +101,7 @@ static void ringhelper_destroy (ringhelper *self)
  ringhelper_init(self);
  }
-static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
+NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
  {
  self->norot = (fabs(phi0)<1e-14);
  if (!(self->norot))
@ -110,12 +112,15 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
      self->phi0_ = phi0;
      for (int m=0; m<=mmax; ++m)
        self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
 //      double *tmp=(double *) self->shiftarr;
 //      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
      }
  if (!self->plan) self->plan=make_rfft_plan(nph);
-  if (nph!=(int)rfft_length(self->plan))
+  if (nph!=(int)self->length)
    {
    destroy_rfft_plan(self->plan);
    self->plan=make_rfft_plan(nph);
    self->length=nph;
    }
  }
@ -127,6 +132,7 @@ static int ringinfo_compare (const void *xa, const void *xb)
 static int ringpair_compare (const void *xa, const void *xb)
  {
  const sharp_ringpair *a=xa, *b=xb;
 //  return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
  if (a->r1.nph==b->r1.nph)
    return (a->r1.phi0 < b->r1.phi0) ? -1 :
      ((a->r1.phi0 > b->r1.phi0) ? 1 :
@ -261,6 +267,7 @@ void sharp_destroy_geom_info (sharp_geom_info *geom_info)
   distribution are permissible. */
 static int sharp_get_mmax (int *mval, int nm)
  {
  //FIXME: if gaps are allowed, we have to search the maximum m in the array
  int *mcheck=RALLOC(int,nm);
  SET_ARRAY(mcheck,0,nm,0);
  for (int i=0; i<nm; ++i)
@ -274,7 +281,7 @@ static int sharp_get_mmax (int *mval, int nm)
  return nm-1;
  }
-static void ringhelper_phase2ring (ringhelper *self,
+NOINLINE static void ringhelper_phase2ring (ringhelper *self,
  const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
  int pstride, int flags)
  {
@ -288,13 +295,19 @@ static void ringhelper_phase2ring (ringhelper *self,
  if (nph>=2*mmax+1)
    {
-    for (int m=0; m<=mmax; ++m)
+    if (self->norot)
-      {
+      for (int m=0; m<=mmax; ++m)
-      dcmplx tmp = phase[m*pstride]*wgt;
+        {
-      if(!self->norot) tmp*=self->shiftarr[m];
+        data[2*m]=creal(phase[m*pstride])*wgt;
-      data[2*m]=creal(tmp);
+        data[2*m+1]=cimag(phase[m*pstride])*wgt;
-      data[2*m+1]=cimag(tmp);
+        }
-      }
+    else
      for (int m=0; m<=mmax; ++m)
        {
        dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
        data[2*m]=creal(tmp)*wgt;
        data[2*m+1]=cimag(tmp)*wgt;
        }
    for (int m=2*(mmax+1); m<nph+2; ++m)
      data[m]=0.;
    }
@ -326,7 +339,7 @@ static void ringhelper_phase2ring (ringhelper *self,
  rfft_backward (self->plan, &(data[1]), 1.);
  }
-static void ringhelper_ring2phase (ringhelper *self,
+NOINLINE static void ringhelper_ring2phase (ringhelper *self,
  const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
  int pstride, int flags)
  {
@ -376,7 +389,7 @@ static void ringhelper_ring2phase (ringhelper *self,
    phase[m*pstride]=0.;
  }
-static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
+NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
  int flags)
  {
  if (flags & SHARP_NO_FFT)
@ -386,50 +399,55 @@ static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
      if (flags&SHARP_DP)
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
+          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
            =value;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
+          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
            =value;
        }
      else
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
+          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
            =(float)value;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
+          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
            =(float)value;
        }
      }
    }
  else
    {
-    for (int j=0;j<ginfo->npairs;++j)
+    if (flags&SHARP_DP)
      {
-      if (flags&SHARP_DP)
+      for (int j=0;j<ginfo->npairs;++j)
        {
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
+        double *dmap=(double *)map;
-          ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
+        if (ginfo->pair[j].r1.stride==1)
-            =value;
+          memset(&dmap[ginfo->pair[j].r1.ofs],0,
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
+            ginfo->pair[j].r1.nph*sizeof(double));
-          ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
+        else
-            =value;
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
            dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
          memset(&dmap[ginfo->pair[j].r2.ofs],0,
            ginfo->pair[j].r2.nph*sizeof(double));
        else
          for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
            dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
-      else
+      }
    else
      {
      for (int j=0;j<ginfo->npairs;++j)
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
+          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
            =(float)value;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
+          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
            =(float)value;
        }
      }
    }
  }
-static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
+NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
  int flags)
  {
 #define CLEARLOOP(real_t,body)             \
      {                                    \
@ -465,7 +483,7 @@ static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
    }
  }
-static void init_output (sharp_job *job)
+NOINLINE static void init_output (sharp_job *job)
  {
  if (job->flags&SHARP_ADD) return;
  if (job->type == SHARP_MAP2ALM)
@ -473,21 +491,21 @@ static void init_output (sharp_job *job)
      clear_alm (job->ainfo,job->alm[i],job->flags);
  else
    for (int i=0; i<job->ntrans*job->nmaps; ++i)
-      fill_map (job->ginfo,job->map[i],0.,job->flags);
+      clear_map (job->ginfo,job->map[i],job->flags);
  }
-static void alloc_phase (sharp_job *job, int nm, int ntheta)
+NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
  {
  if (job->type==SHARP_MAP2ALM)
    {
    if ((nm&1023)==0) nm+=3; // hack to avoid critical strides
    job->s_m=2*job->ntrans*job->nmaps;
    if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
    job->s_th=job->s_m*nm;
    }
  else
    {
    if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides
    job->s_th=2*job->ntrans*job->nmaps;
    if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
    job->s_m=job->s_th*ntheta;
    }
  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
@ -502,22 +520,28 @@ static void alloc_almtmp (sharp_job *job, int lmax)
 static void dealloc_almtmp (sharp_job *job)
  { DEALLOC(job->almtmp); }
-static void alm2almtmp (sharp_job *job, int lmax, int mi)
+NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
  {
-#define COPY_LOOP(real_t, source_t, expr_of_x)                      \
+#define COPY_LOOP(real_t, source_t, expr_of_x)              \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)            \
+  {                                                         \
  for (int l=m; l<lmin; ++l)                                \
    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
      job->almtmp[job->ntrans*job->nalm*l+i] = 0;           \
  for (int l=lmin; l<=lmax; ++l)                            \
    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
      {                                                     \
-        source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
+      source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
-        job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
+      job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x;   \
-      }
+      }                                                     \
  }
  if (job->type!=SHARP_MAP2ALM)
    {
    ptrdiff_t ofs=job->ainfo->mvstart[mi];
    int stride=job->ainfo->stride;
    int m=job->ainfo->mval[mi];
    int lmin=(m<job->spin) ? job->spin : m;
    /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
       coefficients by sqrt_one_half; here we must compensate to avoid scaling
       m=0 */
@ -562,17 +586,17 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
      }
    }
  else
-    SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi],
+    memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
-              job->ntrans*job->nalm*(lmax+1),0.);
+      job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
 #undef COPY_LOOP
  }
-static void almtmp2alm (sharp_job *job, int lmax, int mi)
+NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
  {
 #define COPY_LOOP(real_t, target_t, expr_of_x)               \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)             \
+  for (int l=lmin; l<=lmax; ++l)                             \
    for (int i=0; i<job->ntrans*job->nalm; ++i)              \
      {                                                      \
        dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i];   \
@ -583,6 +607,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
  ptrdiff_t ofs=job->ainfo->mvstart[mi];
  int stride=job->ainfo->stride;
  int m=job->ainfo->mval[mi];
  int lmin=(m<job->spin) ? job->spin : m;
  /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
     coefficients by sqrt_two; here we must compensate to avoid scaling
     m=0 */
@ -629,27 +654,56 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
 #undef COPY_LOOP
  }
-static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
+NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
-  int rstride)
+  const double *ringtmp, int rstride)
  {
-  double **dmap = (double **)job->map;
+  if (job->flags & SHARP_DP)
-  float  **fmap = (float  **)job->map;
+    {
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    double **dmap = (double **)job->map;
-    for (int m=0; m<ri->nph; ++m)
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
-      if (job->flags & SHARP_DP)
+      {
-        dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1];
+      double *restrict p1=&dmap[i][ri->ofs];
      const double *restrict p2=&ringtmp[i*rstride+1];
      if (ri->stride==1)
        {
        if (job->flags&SHARP_ADD)
          for (int m=0; m<ri->nph; ++m)
            p1[m] += p2[m];
        else
          memcpy(p1,p2,ri->nph*sizeof(double));
        }
      else
        for (int m=0; m<ri->nph; ++m)
          p1[m*ri->stride] += p2[m];
      }
    }
  else
    {
    float  **fmap = (float  **)job->map;
    for (int i=0; i<job->ntrans*job->nmaps; ++i)
      for (int m=0; m<ri->nph; ++m)
        fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
    }
  }
-static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
+NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
-  int rstride)
+  double *ringtmp, int rstride)
  {
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
+  if (job->flags & SHARP_DP)
-    for (int m=0; m<ri->nph; ++m)
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
-      ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ?
+      {
-        ((double *)(job->map[i]))[ri->ofs+m*ri->stride] :
+      double *restrict p1=&ringtmp[i*rstride+1],
-        ((float  *)(job->map[i]))[ri->ofs+m*ri->stride];
+             *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
      if (ri->stride==1)
        memcpy(p1,p2,ri->nph*sizeof(double));
      else
        for (int m=0; m<ri->nph; ++m)
          p1[m] = p2[m*ri->stride];
      }
  else
    for (int i=0; i<job->ntrans*job->nmaps; ++i)
      for (int m=0; m<ri->nph; ++m)
        ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
  }
 static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -693,7 +747,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
  }
 //FIXME: set phase to zero if not SHARP_MAP2ALM?
-static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
  {
  if (job->type != SHARP_MAP2ALM) return;
  int pstride = job->s_m;
@ -738,7 +792,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
    }
  }
-static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
  {
  if (job->type == SHARP_MAP2ALM) return;
  int pstride = job->s_m;
@ -783,7 +837,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
    }
  }
-static void sharp_execute_job (sharp_job *job)
+NOINLINE static void sharp_execute_job (sharp_job *job)
  {
  double timer=wallTime();
  job->opcnt=0;
@ -800,6 +854,7 @@ static void sharp_execute_job (sharp_job *job)
  int nchunks, chunksize;
  get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
    &chunksize);
 //FIXME: needs to be changed to "nm"
  alloc_phase (job,mmax+1,chunksize);
 /* chunk loop */
--- a/libsharp/sharp_almhelpers.c
+++ b/libsharp/sharp_almhelpers.c
@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.c
 *  Spherical transform library
 *
- *  Copyright (C) 2008-2013 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
 *  \author Martin Reinecke
 */
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.h
 *  SHARP helper function for the creation of a_lm data structures
 *
- *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
 *  \author Martin Reinecke
 */
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@ -25,7 +25,7 @@
 /*  \file sharp_complex_hacks.h
 *  support for converting vector types and complex numbers
 *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
 *  Author: Martin Reinecke
 */
@ -51,6 +51,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
 #endif
 #if (VLEN==2)
@ -94,6 +98,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
 #endif
  }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 #endif
 #if (VLEN==4)
@ -130,6 +138,23 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
 #endif
  }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  {
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
  tmp1=vadd(tmp3,tmp4);
 #ifdef UNSAFE_CODE
  _mm256_storeu_pd((double *)cc,
    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
 #else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp1;
  cc[0]+=u.c[0]; cc[1]+=u.c[1];
 #endif
  }
 #endif
 #if (VLEN==8)
@ -144,6 +169,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
  }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 #endif
 #endif
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@ -25,7 +25,7 @@
 /*! \file sharp_core_inc.c
 *  Type-dependent code for the computational core
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
 *  \author Martin Reinecke
 */
@ -73,8 +73,8 @@ static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
 static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
  double maxval)
  {
  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
  for (int i=0;i<nvec; ++i)
    {
    Tm mask = vgt(vabs(val->v[i]),vfmax);
@ -94,35 +94,58 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
    }
  }
-static void Y(mypow) (Tb val, int npow, Tb * restrict resd,
+NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
-  Tb * restrict ress)
+  Tb * restrict resd, Tb * restrict ress)
  {
-  Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
+  Tv vminv=vload(powlimit[npow]);
-
+  Tm mask = vlt(vabs(val.v[0]),vminv);
-  Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+  for (int i=1;i<nvec; ++i)
-
+    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
-  do
+  if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
    {
-    if (npow&1)
+    Tb res=Y(Tbconst)(1.);
    do
      {
      if (npow&1)
        for (int i=0; i<nvec; ++i)
          {
          vmuleq(res.v[i],val.v[i]);
          vmuleq(val.v[i],val.v[i]);
          }
      else
        for (int i=0; i<nvec; ++i)
          vmuleq(val.v[i],val.v[i]);
      }
    while(npow>>=1);
    *resd=res;
    *ress=Y(Tbconst)(0.);
    }
  else
    {
    Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
    do
      {
      if (npow&1)
        {
        for (int i=0; i<nvec; ++i)
          {
          vmuleq(res.v[i],val.v[i]);
          vaddeq(scale.v[i],scaleint.v[i]);
          }
        Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
        }
      for (int i=0; i<nvec; ++i)
        {
-        vmuleq(res.v[i],val.v[i]);
+        vmuleq(val.v[i],val.v[i]);
-        vaddeq(scale.v[i],scaleint.v[i]);
+        vaddeq(scaleint.v[i],scaleint.v[i]);
        }
-      Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
+      Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
      }
-    for (int i=0; i<nvec; ++i)
+    while(npow>>=1);
-      {
+    *resd=res;
-      vmuleq(val.v[i],val.v[i]);
+    *ress=scale;
      vaddeq(scaleint.v[i],scaleint.v[i]);
      }
    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
    }
  while(npow>>=1);
  *resd=res;
  *ress=scale;
  }
 static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
@ -179,13 +202,13 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
  *corfac=corf.b;
  }
-static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
+NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
  const sharp_Ylmgen_C * restrict gen)
  {
  int l=gen->m;
  Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
-  Y(mypow) (sth,l,&lam_2,&scale);
+  Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
  Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
  Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
@ -193,12 +216,12 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
  while (below_limit)
    {
    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
    Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
-    r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
+                 - vload(gen->rf[l].f[1])*lam_1.v[i];
    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      below_limit = Y(TballLt)(scale,sharp_limscale);
    l+=2;
@ -213,10 +236,8 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
  Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
  for (int i=0; i<nvec; ++i)
    {
-    rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])),
+    rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
-                vmul(fx2,rxp->v[i]));
+    rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
    rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
                vmul(fx2,rxm->v[i]));
    }
  }
@ -240,8 +261,10 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
    }
  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
-  Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
+  Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
-  Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
+  Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
  Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
  Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
  Tb rec2p, rec2m, scalep, scalem;
  Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@ -25,7 +25,7 @@
 /*! \file sharp_core_inc0.c
 *  Computational core
 *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
 *  \author Martin Reinecke
 */
@ -34,7 +34,6 @@
 #include <string.h>
 #include "sharp_vecsupport.h"
 #include "sharp_complex_hacks.h"
 #include "sharp_ylmgen_c.h"
 #include "sharp.h"
 #include "sharp_core.h"
 #include "c_utils.h"
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@ -25,11 +25,11 @@
 /*! \file sharp_core_inc2.c
 *  Type-dependent code for the computational core
 *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
 *  \author Martin Reinecke
 */
-static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
+NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
  int l, int lmax NJ1)
@ -77,29 +77,32 @@ if (njobs>1)
  }
  while (l<lmax)
    {
    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l].f[1])*lam_1.v[i];
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*l+j])),
         ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
+        p1[j].r.v[i] += lam_2.v[i]*ar;
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        p1[j].i.v[i] += lam_2.v[i]*ai;
        }
      ar=vload(creal(alm[njobs*(l+1)+j]));
      ai=vload(cimag(alm[njobs*(l+1)+j]));
      for (int i=0; i<nvec; ++i)
        {
        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
        }
      }
    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l+1].f[1])*lam_2.v[i];
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*(l+1)+j])),
         ai=vload(cimag(alm[njobs*(l+1)+j]));
      for (int i=0; i<nvec; ++i)
        {
        p2[j].r.v[i] += lam_1.v[i]*ar;
        p2[j].i.v[i] += lam_1.v[i]*ai;
        }
      }
    l+=2;
    }
  if (l==lmax)
@ -109,64 +112,57 @@ if (njobs>1)
      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
+        p1[j].r.v[i] += lam_2.v[i]*ar;
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        p1[j].i.v[i] += lam_2.v[i]*ai;
        }
      }
    }
  }
-static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
+NOINLINE static void Z(map2alm_kernel) (const Tb cth,
-  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
  NJ1)
  {
  while (l<lmax)
    {
    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l].f[1])*lam_1.v[i];
    for (int j=0; j<njobs; ++j)
      {
      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
+        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
-        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
        }
      for (int i=0; i<nvec; ++i)
        {
        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
        }
      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
      }
    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l+1].f[1])*lam_2.v[i];
    for (int j=0; j<njobs; ++j)
      for (int i=0; i<nvec; ++i)
        {
        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
        }
    l+=2;
    }
  if (l==lmax)
    {
    for (int j=0; j<njobs; ++j)
      {
      Tv tre=vzero, tim=vzero;
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
+        atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
-        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
        }
      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
      }
    }
  }
-static void Z(calc_alm2map) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
  Y(Tbri) * restrict p2 NJ1)
  {
  int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax) return;
@ -219,12 +215,12 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
  }
-static void Z(calc_map2alm) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2 NJ1)
+  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
  {
  int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
  int l=gen->m;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
@ -234,40 +230,31 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
  dcmplx * restrict alm=job->almtmp;
  int full_ieee = Y(TballGe)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
      {
      Tv tre=vzero, tim=vzero;
      for (int i=0; i<nvec; ++i)
        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+        Tv tmp=lam_2.v[i]*corfac.v[i];
-        vfmaeq(tre,tmp,p1[j].r.v[i]);
+        atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
-        vfmaeq(tim,tmp,p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
        }
      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
      }
    if (++l>lmax) return;
    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l-1].f[1])*lam_1.v[i];
    for (int j=0; j<njobs; ++j)
      {
      Tv tre=vzero, tim=vzero;
      for (int i=0; i<nvec; ++i)
        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+        Tv tmp=lam_1.v[i]*corfac.v[i];
-        vfmaeq(tre,tmp,p2[j].r.v[i]);
+        atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
-        vfmaeq(tim,tmp,p2[j].i.v[i]);
+        atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
        }
      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
      }
    if (++l>lmax) return;
    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l-1].f[1])*lam_2.v[i];
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
@ -276,7 +263,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
    }
  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
  }
 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
@ -317,8 +304,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
    for (int i=0; i<nvec; ++i)
      {
-      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
+      Tv lw1=r2p.v[i]+r2m.v[i];
-      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
+      Tv lx2=r1m.v[i]-r1p.v[i];
      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
@ -326,8 +313,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
      }
    for (int i=0; i<nvec; ++i)
      {
-      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
+      Tv lx1=r2m.v[i]-r2p.v[i];
-      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
+      Tv lw2=r1p.v[i]+r1m.v[i];
      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
@ -359,11 +346,11 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
      vfmaeq(acr,py[j].qi.v[i],lx);
      vfmseq(aci,py[j].qr.v[i],lx);
      }
-    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
+    vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
    }
  }
-static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax NJ1)
@ -374,10 +361,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
       fx2=vload(fx[l+1].f[2]);
    for (int i=0; i<nvec; ++i)
      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
-                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
      &alm[2*njobs*(l+1)] NJ2);
@ -385,10 +370,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
-                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
                        vmul(fx2,rec2m.v[i]));
      }
    l+=2;
    }
@ -396,7 +379,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
  }
-static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
+NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
  NJ1)
@ -429,7 +412,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
  }
-static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2 NJ1)
  {
@ -475,7 +458,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
    lmax NJ2);
  }
-static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
+NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
  {
@ -539,7 +522,7 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
    }
  }
-static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
  int lmax NJ1)
@ -572,7 +555,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
  }
-static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2 NJ1)
  {
@ -621,7 +604,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
-static void Z(inner_loop) (sharp_job *job, const int *ispair,
+NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
@ -722,10 +705,30 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
        }
      break;
      }
    default:
      {
      UTIL_FAIL("must not happen");
      break;
      }
    }
  }
 NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];
  sharp_Ylmgen_prepare (gen, m);
  switch (job->type)
    {
    case SHARP_MAP2ALM:
      {
      if (job->spin==0)
        {
        Tv atmp[2*njobs*(gen->lmax+1)];
        memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
@ -751,8 +754,15 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
              }
            }
          if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
          }
        {
        int istart=m*njobs, istop=(gen->lmax+1)*njobs;
        for(; istart<istop-2; istart+=2)
          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
        for(; istart<istop; istart++)
          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
        }
        }
      else
        {
@ -800,4 +810,13 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
    }
  }
 static void Z(inner_loop) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
  (job->type==SHARP_MAP2ALM) ?
    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
  }
 #undef VZERO
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@ -25,13 +25,14 @@
 /*! \file sharp_cxx.h
 *  Spherical transform library
 *
- *  Copyright (C) 2012-2015 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
 *  \author Martin Reinecke
 */
 #ifndef PLANCK_SHARP_CXX_H
 #define PLANCK_SHARP_CXX_H
 #include <complex>
 #include "sharp_lowlevel.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
@ -107,19 +108,30 @@ template<typename T> class sharp_cxxjob: public sharp_base
  private:
    static void *conv (T *ptr)
      { return reinterpret_cast<void *>(ptr); }
    static void *conv (std::complex<T> *ptr)
      { return reinterpret_cast<void *>(ptr); }
    static void *conv (const T *ptr)
      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
    static void *conv (const std::complex<T> *ptr)
      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
  public:
-    void alm2map (const T *alm, T *map, bool add)
+    void alm2map (const T *alm, T *map, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
        flags,0,0);
      }
-    void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2,
+    void alm2map (const std::complex<T> *alm, T *map, bool add) const
-      int spin, bool add)
+      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
        flags,0,0);
      }
    void alm2map_spin (const T *alm1, const T *alm2,
      T *map1, T *map2, int spin, bool add) const
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
@ -127,21 +139,65 @@ template<typename T> class sharp_cxxjob: public sharp_base
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
-    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add)
+    void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2,
      T *map1, T *map2, int spin, bool add) const
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const
      {
      void *aptr=conv(alm), *mptr[2];
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
-    void map2alm (const T *map, T *alm, bool add)
+    void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add)
      const
      {
      void *aptr=conv(alm), *mptr[2];
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
    void alm2map_adjoint (const T *map, T *alm, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
      }
    void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
      }
    void map2alm (const T *map, T *alm, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
      }
    void map2alm (const T *map, std::complex<T> *alm, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
      }
    void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
-      int spin, bool add)
+      int spin, bool add) const
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
    void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1,
      std::complex<T> *alm2, int spin, bool add) const
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@ -25,9 +25,8 @@
 /*! \file sharp_geomhelpers.c
 *  Spherical transform library
 *
- *  Copyright (C) 2006-2012 Max-Planck-Society<br>
+ *  Copyright (C) 2006-2018 Max-Planck-Society<br>
- *  Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl)
+ *  \author Martin Reinecke
 *  \author Martin Reinecke \author Pavel Holoborodko
 */
 #include <math.h>
@ -35,7 +34,6 @@
 #include "sharp_legendre_roots.h"
 #include "c_utils.h"
 #include "pocketfft/pocketfft.h"
 #include <stdio.h>
 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
  const int *rings, const double *weight, sharp_geom_info **geom_info)
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@ -23,7 +23,7 @@
 */
 /*  \file sharp_testsuite.c
- * 
+ *
 *  Copyright (C) 2012-2013 Max-Planck-Society
 *  \author Martin Reinecke
 */
@ -50,9 +50,9 @@ typedef complex double dcmplx;
 int ntasks, mytask;
-static double drand (double min, double max, int *state)
+static double drand (double min, double max, unsigned *state)
  {
-  *state = (((*state) * 1103515245) + 12345) & 0x7fffffff;
+  *state = (((*state) * 1103515245u) + 12345u) & 0x7fffffffu;
  return min + (max-min)*(*state)/(0x7fffffff+1.0);
  }
@ -65,7 +65,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
  for (mi=0;mi<helper->nm; ++mi)
    {
    int m=helper->mval[mi];
-    int state=1234567*cnt+8912*m; // random seed
+    unsigned state=1234567u*(unsigned)cnt+8912u*(unsigned)m; // random seed
    for (int l=m;l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
 *  Convenience functions for vector arithmetics
 *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
 *  Author: Martin Reinecke
 */
@ -72,6 +72,7 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
 #define vor_mask(a,b) ((a)||(b))
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)
@ -138,6 +139,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vge(a,b) _mm_cmpge_pd(a,b)
 #define vne(a,b) _mm_cmpneq_pd(a,b)
 #define vand_mask(a,b) _mm_and_pd(a,b)
 #define vor_mask(a,b) _mm_or_pd(a,b)
 #define vmin(a,b) _mm_min_pd(a,b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
@ -183,6 +185,13 @@ typedef __m256d Tm;
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #if (USE_FMA)
 #define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
@ -191,6 +200,7 @@ typedef __m256d Tm;
 #define vfmaseq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
 #endif
 #endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
 #define vload_s(a) _mm256_set1_ps(a)
@ -201,6 +211,7 @@ typedef __m256d Tm;
 #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
 #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
 #define vand_mask(a,b) _mm256_and_pd(a,b)
 #define vor_mask(a,b) _mm256_or_pd(a,b)
 #define vmin(a,b) _mm256_min_pd(a,b)
 #define vmax(a,b) _mm256_max_pd(a,b)
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
@ -242,6 +253,7 @@ typedef __mmask8 Tm;
 #define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
 #define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
 #define vand_mask(a,b) ((a)&(b))
 #define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)
 #define vmax(a,b) _mm512_max_pd(a,b)
 #define vanyTrue(a) (a!=0)
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@ -25,7 +25,7 @@
 /*
 *  Helper code for efficient calculation of Y_lm(theta,phi=0)
 *
- *  Copyright (C) 2005-2014 Max-Planck-Society
+ *  Copyright (C) 2005-2016 Max-Planck-Society
 *  Author: Martin Reinecke
 */
@ -59,6 +59,12 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
    gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
  for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
    gen->cf[m]=gen->cf[m-1]*sharp_fbig;
  gen->powlimit=RALLOC(double,m_max+spin+1);
  gen->powlimit[0]=0.;
  const double ln2 = 0.6931471805599453094172321214581766;
  const double expo=-400*ln2;
  for (int m=1; m<=m_max+spin; ++m)
    gen->powlimit[m]=exp(expo/m);
  gen->m = -1;
  if (spin==0)
@ -124,6 +130,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
 void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
  {
  DEALLOC(gen->cf);
  DEALLOC(gen->powlimit);
  if (gen->s==0)
    {
    DEALLOC(gen->rf);
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@ -25,7 +25,7 @@
 /*! \file sharp_ylmgen_c.h
 *  Code for efficient calculation of Y_lm(phi=0,theta)
 *
- *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  Copyright (C) 2005-2016 Max-Planck-Society
 *  \author Martin Reinecke
 */
@ -49,6 +49,7 @@ typedef struct
 /* for public use; immutable during lifetime */
  int lmax, mmax, s;
  double *cf;
  double *powlimit;
 /* for public use; will typically change after call to Ylmgen_prepare() */
  int m;