diff --git a/Makefile.am b/Makefile.am
index c738f29..6370d95 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,17 +16,14 @@ src_sharp = \
   libsharp/sharp_ylmgen_c.c \
   libsharp/sharp_announce.h \
   libsharp/sharp_complex_hacks.h \
-  libsharp/sharp_core.h \
   libsharp/sharp_internal.h \
   libsharp/sharp_legendre_roots.h \
-  libsharp/sharp_lowlevel.h \
   libsharp/sharp_vecsupport.h \
   libsharp/sharp_vecutil.h \
   libsharp/sharp_ylmgen_c.h
 
 include_HEADERS = \
   libsharp/sharp.h \
-  libsharp/sharp_lowlevel.h \
   libsharp/sharp_geomhelpers.h \
   libsharp/sharp_almhelpers.h \
   libsharp/sharp_cxx.h
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index f312fc3..bbb3872 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -35,7 +35,6 @@
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
-#include "sharp_core.h"
 #include "walltime_c.h"
 #include "sharp_almhelpers.h"
 #include "sharp_geomhelpers.h"
@@ -588,7 +587,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
     }
   else
     memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
-      job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
+      job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx));
 
 #undef COPY_LOOP
   }
@@ -960,9 +959,6 @@ void sharp_set_chunksize_min(int new_chunksize_min)
 void sharp_set_nchunks_max(int new_nchunks_max)
   { nchunks_max=new_nchunks_max; }
 
-int sharp_get_nv_max (void)
-{ return 6; }
-
 #ifdef USE_MPI
 #include "sharp_mpi.c"
 
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
index 9c5dd57..35a0cb5 100644
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@@ -23,21 +23,248 @@
  */
 
 /*! \file sharp.h
- *  Interface for the spherical transform library.
+ *  Portable interface for the spherical transform library.
  *
- *  Copyright (C) 2006-2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
-#ifndef PLANCK_SHARP_H
-#define PLANCK_SHARP_H
+#ifndef PLANCK_SHARP_LOWLEVEL_H
+#define PLANCK_SHARP_LOWLEVEL_H
+
+#include <stddef.h>
 
 #ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
+extern "C" {
 #endif
 
-#include <complex.h>
+/*! \internal
+    Helper type containing information about a single ring. */
+typedef struct
+  {
+  double theta, phi0, weight, cth, sth;
+  ptrdiff_t ofs;
+  int nph, stride;
+  } sharp_ringinfo;
 
-#include "sharp_lowlevel.h"
+/*! \internal
+    Helper type containing information about a pair of rings with colatitudes
+    symmetric around the equator. */
+typedef struct
+  {
+  sharp_ringinfo r1,r2;
+  } sharp_ringpair;
+
+/*! \internal
+    Type holding all required information about a map geometry. */
+typedef struct
+  {
+  sharp_ringpair *pair;
+  int npairs, nphmax;
+  } sharp_geom_info;
+
+/*! \defgroup almgroup Helpers for dealing with a_lm */
+/*! \{ */
+
+/*! \internal
+    Helper type for index calculation in a_lm arrays. */
+typedef struct
+  {
+  /*! Maximum \a l index of the array */
+  int lmax;
+  /*! Number of different \a m values in this object */
+  int nm;
+  /*! Array with \a nm entries containing the individual m values */
+  int *mval;
+  /*! Combination of flags from sharp_almflags */
+  int flags;
+  /*! Array with \a nm entries containing the (hypothetical) indices of
+      the coefficients with quantum numbers 0,\a mval[i] */
+  ptrdiff_t *mvstart;
+  /*! Stride between a_lm and a_(l+1),m */
+  ptrdiff_t stride;
+  } sharp_alm_info;
+
+/*! alm_info flags */
+typedef enum { SHARP_PACKED = 1,
+               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
+                    not present. mvstart is in units of *real* float/double for all
+                    m; stride is in units of reals for m=0 and complex for m!=0 */
+               SHARP_REAL_HARMONICS  = 1<<6
+               /*!< Use the real spherical harmonic convention. For
+                    m==0, the alm are treated exactly the same as in
+                    the complex case.  For m!=0, alm[i] represent a
+                    pair (+abs(m), -abs(m)) instead of (real, imag),
+                    and the coefficients are scaled by a factor of
+                    sqrt(2) relative to the complex case.  In other
+                    words, (sqrt(.5) * alm[i]) recovers the
+                    corresponding complex coefficient (when accessed
+                    as complex).
+                */
+             } sharp_almflags;
+
+
+
+/*! Creates an a_lm data structure from the following parameters:
+    \param lmax maximum \a l quantum number (>=0)
+    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
+    \param mstart the index of the (hypothetical) coefficient with the
+      quantum numbers 0,\a m. Must have \a mmax+1 entries.
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_alm_info (int lmax, int mmax, int stride,
+  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
+/*! Creates an a_lm data structure which from the following parameters:
+    \param lmax maximum \a l quantum number (\a >=0)
+    \param nm number of different \a m (\a 0<=nm<=lmax+1)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
+    \param mval array with \a nm entries containing the individual m values
+    \param mvstart array with \a nm entries containing the (hypothetical)
+      indices of the coefficients with the quantum numbers 0,\a mval[i]
+    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
+  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
+/*! Returns the index of the coefficient with quantum numbers \a l,
+    \a mval[mi].
+    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
+    the index for the coefficient with the quantum numbers \a l, \a mi. */
+ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
+/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
+    flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
+    otherwise it is the number of complex coefficients (with m>=0). */
+ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
+/*! Deallocates the a_lm info object. */
+void sharp_destroy_alm_info (sharp_alm_info *info);
+
+/*! \} */
+
+/*! \defgroup geominfogroup Functions for dealing with geometry information */
+/*! \{ */
+
+/*! Creates a geometry information from a set of ring descriptions.
+    All arrays passed to this function must have \a nrings elements.
+    \param nrings the number of rings in the map
+    \param nph the number of pixels in each ring
+    \param ofs the index of the first pixel in each ring in the map array
+    \param stride the stride between consecutive pixels
+    \param phi0 the azimuth (in radians) of the first pixel in each ring
+    \param theta the colatitude (in radians) of each ring
+    \param wgt the pixel weight to be used for the ring in map2alm
+      and adjoint map2alm transforms.
+      Pass NULL to use 1.0 as weight for all rings.
+    \param geom_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
+  const int *stride, const double *phi0, const double *theta,
+  const double *wgt, sharp_geom_info **geom_info);
+
+/*! Counts the number of grid points needed for (the local part of) a map described
+    by \a info.
+ */
+ptrdiff_t sharp_map_size(const sharp_geom_info *info);
+
+/*! Deallocates the geometry information in \a info. */
+void sharp_destroy_geom_info (sharp_geom_info *info);
+
+/*! \} */
+
+/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
+/*! \{ */
+
+/*! Enumeration of SHARP job types. */
+typedef enum { SHARP_YtW=0,               /*!< analysis */
+               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
+               SHARP_Y=1,                 /*!< synthesis */
+               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
+               SHARP_Yt=2,                /*!< adjoint synthesis */
+               SHARP_WY=3,                /*!< adjoint analysis */
+               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
+             } sharp_jobtype;
+
+/*! Job flags */
+typedef enum { SHARP_DP              = 1<<4,
+               /*!< map and a_lm are in double precision */
+               SHARP_ADD             = 1<<5,
+               /*!< results are added to the output arrays, instead of
+                    overwriting them */
+
+               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
+                  but its use here is deprecated in favor of having it in the sharp_alm_info */
+
+               SHARP_NO_FFT          = 1<<7,
+
+               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
+               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
+             } sharp_jobflags;
+
+/*! Performs a libsharp SHT job. The interface deliberately does not use
+  the C99 "complex" data type, in order to be callable from C89 and C++.
+  \param type the type of SHT
+  \param spin the spin of the quantities to be transformed
+  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
+    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
+    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
+    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
+    depends on whether the SHARP_DP flag is set.
+  \param map contains pointers to the maps. If \a spin==0,
+    map[0] points to the map of the first SHT, map[1] to that of the second
+    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
+    point to the maps of the first SHT, map[2] and map[3] to those of the
+    second, etc. The exact data type of \a map depends on whether the SHARP_DP
+    flag is set.
+  \param geom_info A \c sharp_geom_info object compatible with the provided
+    \a map arrays.
+  \param alm_info A \c sharp_alm_info object compatible with the provided
+    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
+    exactly once.
+  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
+    \a alm is expected to have the type "complex double **" and \a map is
+    expected to have the type "double **"; otherwise, the expected
+    types are "complex float **" and "float **", respectively.
+  \param time If not NULL, the wall clock time required for this SHT
+    (in seconds) will be written here.
+  \param opcnt If not NULL, a conservative estimate of the total floating point
+    operation count for this SHT will be written here. */
+void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
+  int flags, double *time, unsigned long long *opcnt);
+
+void sharp_set_chunksize_min(int new_chunksize_min);
+void sharp_set_nchunks_max(int new_nchunks_max);
+
+
+typedef enum { SHARP_ERROR_NO_MPI = 1,
+               /*!< libsharp not compiled with MPI support */
+              } sharp_errors;
+
+/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
+    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
+
+    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
+    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
+    is *(MPI_Comm*)pcomm.
+
+    Other parameters are the same as sharp_execute_mpi.
+
+    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
+    (in which case nothing is done).
+ */
+int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int flags, double *time,
+  unsigned long long *opcnt);
+
+
+
+/*! \} */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index 67016d7..c17028a 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -32,7 +32,7 @@
 #ifndef PLANCK_SHARP_ALMHELPERS_H
 #define PLANCK_SHARP_ALMHELPERS_H
 
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_announce.c b/libsharp/sharp_announce.c
index 7027167..a028258 100644
--- a/libsharp/sharp_announce.c
+++ b/libsharp/sharp_announce.c
@@ -40,7 +40,7 @@
 #endif
 
 #include "sharp_announce.h"
-#include "sharp_core.h"
+#include "sharp_internal.h"
 
 static void OpenMP_status(void)
   {
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
index 86d1153..6ec27bb 100644
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@@ -25,131 +25,126 @@
 /*  \file sharp_complex_hacks.h
  *  support for converting vector types and complex numbers
  *
- *  Copyright (C) 2012-2016 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
 #ifndef SHARP_COMPLEX_HACKS_H
 #define SHARP_COMPLEX_HACKS_H
 
-#ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
-#endif
-
 #include <math.h>
-#include <complex.h>
 #include "sharp_vecsupport.h"
 
 #define UNSAFE_CODE
 
 #if (VLEN==1)
 
-static inline complex double vhsum_cmplx(Tv a, Tv b)
+static inline _Complex double vhsum_cmplx(Tv a, Tv b)
   { return a+_Complex_I*b; }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
   { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
 
 #endif
 
 #if (VLEN==2)
 
-static inline complex double vhsum_cmplx (Tv a, Tv b)
+static inline _Complex double vhsum_cmplx (Tv a, Tv b)
   {
 #if defined(__SSE3__)
   Tv tmp = _mm_hadd_pd(a,b);
 #else
-  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
+  Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+           _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
 #endif
-  union {Tv v; complex double c; } u;
+  union {Tv v; _Complex double c; } u;
   u.v=tmp; return u.c;
   }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
-  Tv d, complex double * restrict c1, complex double * restrict c2)
+  Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
   {
 #ifdef UNSAFE_CODE
 #if defined(__SSE3__)
-  vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
-  vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
+  *((__m128d *)c1) += _mm_hadd_pd(a,b);
+  *((__m128d *)c2) += _mm_hadd_pd(c,d);
 #else
-  vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-                               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
-  vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
-                               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
+  *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+                      _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
+  *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
+                      _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
 #endif
 #else
-  union {Tv v; complex double c; } u1, u2;
+  union {Tv v; _Complex double c; } u1, u2;
 #if defined(__SSE3__)
   u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
 #else
-  u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-              _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
-  u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
-              _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
 #endif
   *c1+=u1.c; *c2+=u2.c;
 #endif
   }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 
 #endif
 
 #if (VLEN==4)
 
-static inline complex double vhsum_cmplx (Tv a, Tv b)
+static inline _Complex double vhsum_cmplx (Tv a, Tv b)
   {
   Tv tmp=_mm256_hadd_pd(a,b);
   Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
   tmp=_mm256_add_pd(tmp,tmp2);
 #ifdef UNSAFE_CODE
-  complex double ret;
+  _Complex double ret;
   *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
   return ret;
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
   u.v=tmp; return u.c[0];
 #endif
   }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
   {
   Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
   Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
      tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
+  tmp1=tmp3+tmp4;
 #ifdef UNSAFE_CODE
   *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
   *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
   u.v=tmp1;
   *c1+=u.c[0]; *c2+=u.c[1];
 #endif
   }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   {
   Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
   Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
      tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
+  tmp1=tmp3+tmp4;
 #ifdef UNSAFE_CODE
   _mm256_storeu_pd((double *)cc,
     _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
   u.v=tmp1;
   cc[0]+=u.c[0]; cc[1]+=u.c[1];
 #endif
@@ -159,18 +154,18 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
 
 #if (VLEN==8)
 
-static inline complex double vhsum_cmplx(Tv a, Tv b)
+static inline _Complex double vhsum_cmplx(Tv a, Tv b)
   { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
 
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
   {
   *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
   *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
   }
 
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
   { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 
 #endif
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 57a3ecd..2dcee90 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -35,32 +35,55 @@
 #include "sharp_vecsupport.h"
 #include "sharp_complex_hacks.h"
 #include "sharp.h"
-#include "sharp_core.h"
+#include "sharp_internal.h"
 #include "c_utils.h"
 
 typedef complex double dcmplx;
 
-#define nvec (256/VLEN)
+#define nv0 (128/VLEN)
+#define nvx (64/VLEN)
 
-typedef union
-  { Tv v; double s[VLEN]; } Tvu;
+typedef Tv Tbv0[nv0];
+typedef double Tbs0[nv0*VLEN];
 
 typedef struct
-  { Tv v[nvec]; } Tb;
-
-typedef union
-  { Tb b; double s[VLEN*nvec]; } Tbu;
+  {
+  Tbv0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  } s0data_v;
 
 typedef struct
-  { Tb r, i; } Tbri;
-
-typedef struct
-  { double r[VLEN*nvec], i[VLEN*nvec]; } Tsri;
+  {
+  Tbs0 sth, corfac, scale, lam1, lam2, cth, p1r, p1i, p2r, p2i;
+  } s0data_s;
 
 typedef union
-  { Tbri b; Tsri s; } Tburi;
+  {
+  s0data_v v;
+  s0data_s s;
+  } s0data_u;
 
-static void Tvnormalize (Tv * restrict val, Tv * restrict scale,
+typedef Tv Tbvx[nvx];
+typedef double Tbsx[nvx*VLEN];
+
+typedef struct
+  {
+  Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  } sxdata_v;
+
+typedef struct
+  {
+  Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
+       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
+  } sxdata_s;
+
+typedef union
+  {
+  sxdata_v v;
+  sxdata_s s;
+  } sxdata_u;
+
+static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
   double maxval)
   {
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
@@ -85,7 +108,6 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
   Tv * restrict resd, Tv * restrict ress)
   {
   Tv vminv=vload(powlimit[npow]);
-  Tv res=vone;
   Tm mask = vlt(vabs(val),vminv);
   if (!vanyTrue(mask)) // no underflows possible, use quick algoritm
     {
@@ -93,8 +115,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
     do
       {
       if (npow&1)
-        vmuleq(res,val);
-      vmuleq(val,val);
+        res*=val;
+      val*=val;
       }
     while(npow>>=1);
     *resd=res;
@@ -108,12 +130,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
       {
       if (npow&1)
         {
-        vmuleq(res,val);
-        vaddeq(scale,scaleint);
+        res*=val;
+        scale+=scaleint;
         Tvnormalize(&res,&scale,sharp_fbighalf);
         }
-      vmuleq(val,val);
-      vaddeq(scaleint,scaleint);
+      val*=val;
+      scaleint+=scaleint;
       Tvnormalize(&val,&scaleint,sharp_fbighalf);
       }
     while(npow>>=1);
@@ -122,9 +144,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
     }
   }
 
-static void getCorfac(Tv scale, Tv * restrict corfac,
+static inline void getCorfac(Tv scale, Tv * restrict corfac,
   const double * restrict cf)
   {
+  typedef union
+    { Tv v; double s[VLEN]; } Tvu;
+
   Tvu sc, corf;
   sc.v=scale;
   for (int i=0; i<VLEN; ++i)
@@ -133,21 +158,33 @@ static void getCorfac(Tv scale, Tv * restrict corfac,
   *corfac=corf.v;
   }
 
-NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
-  Tb * restrict lam_1, Tb * restrict lam_2, Tb * restrict scale,
-  const sharp_Ylmgen_C * restrict gen, int nv2)
+static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
+  {
+  Tm mask = vgt(vabs(*v2),eps);
+  if (vanyTrue(mask))
+    {
+    vmuleq_mask(mask,*v1,vload(sharp_fsmall));
+    vmuleq_mask(mask,*v2,vload(sharp_fsmall));
+    vaddeq_mask(mask,*s,vone);
+    return 1;
+    }
+  return 0;
+  }
+
+NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
+  s0data_v * restrict d, int * restrict l_, int nv2)
   {
   int l=gen->m;
   Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
-  Tv fsmall=vload(sharp_fsmall), limscale=vload(sharp_limscale);
+  Tv limscale=vload(sharp_limscale);
   int below_limit = 1;
   for (int i=0; i<nv2; ++i)
     {
-    lam_1->v[i]=vzero;
-    mypow(sth.v[i],l,gen->powlimit,&lam_2->v[i],&scale->v[i]);
-    lam_2->v[i] *= mfac;
-    Tvnormalize(&lam_2->v[i],&scale->v[i],sharp_ftol);
-    below_limit &= vallTrue(vlt(scale->v[i],limscale));
+    d->lam1[i]=vzero;
+    mypow(d->sth[i],l,gen->powlimit,&d->lam2[i],&d->scale[i]);
+    d->lam2[i] *= mfac;
+    Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
+    below_limit &= vallTrue(vlt(d->scale[i],limscale));
     }
 
   while (below_limit)
@@ -158,25 +195,17 @@ NOINLINE static void iter_to_ieee (const Tb sth, Tb cth, int *l_,
        r20=vload(gen->rf[l+1].f[0]), r21=vload(gen->rf[l+1].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      lam_1->v[i] = r10*cth.v[i]*lam_2->v[i] - r11*lam_1->v[i];
-      lam_2->v[i] = r20*cth.v[i]*lam_1->v[i] - r21*lam_2->v[i];
-      Tm mask = vgt(vabs(lam_2->v[i]),vload(sharp_ftol));
-      if (vanyTrue(mask))
-        {
-        vmuleq_mask(mask,lam_1->v[i],fsmall);
-        vmuleq_mask(mask,lam_2->v[i],fsmall);
-        vaddeq_mask(mask,scale->v[i],vone);
-        below_limit &= vallTrue(vlt(scale->v[i],limscale));
-        }
+      d->lam1[i] = r10*d->cth[i]*d->lam2[i] - r11*d->lam1[i];
+      d->lam2[i] = r20*d->cth[i]*d->lam1[i] - r21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
+        below_limit &= vallTrue(vlt(d->scale[i],limscale));
       }
     l+=2;
     }
   *l_=l;
   }
 
-
-NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
-  Tbri * restrict p2, Tb lam_1, Tb lam_2,
+NOINLINE static void alm2map_kernel(s0data_v * restrict d,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax, int nv2)
   {
@@ -188,49 +217,23 @@ NOINLINE static void alm2map_kernel(const Tb cth, Tbri * restrict p1,
        f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(p1->r.v[i],lam_2.v[i],ar1);
-      vfmaeq(p1->i.v[i],lam_2.v[i],ai1);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      vfmaeq(p2->r.v[i],lam_1.v[i],ar2);
-      vfmaeq(p2->i.v[i],lam_1.v[i],ai2);
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      d->p1r[i] += d->lam2[i]*ar1;
+      d->p1i[i] += d->lam2[i]*ai1;
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      d->p2r[i] += d->lam1[i]*ar2;
+      d->p2i[i] += d->lam1[i]*ai2;
       }
     l+=2;
     }
   }
 
-NOINLINE static void map2alm_kernel (const Tb cth,
-  const Tbri * restrict p1, const Tbri * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
-  int lmax, int nv2)
-  {
-  while (l<=lmax)
-    {
-    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
-       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
-    Tv atmp[4] = {vzero, vzero, vzero, vzero};
-    for (int i=0; i<nv2; ++i)
-      {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(atmp[0],lam_2.v[i],p1->r.v[i]);
-      vfmaeq(atmp[1],lam_2.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      vfmaeq(atmp[2],lam_1.v[i],p2->r.v[i]);
-      vfmaeq(atmp[3],lam_1.v[i],p2->i.v[i]);
-      }
-    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
-    l+=2;
-    }
-  }
-
-NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Tbri * restrict p1,
-  Tbri * restrict p2, int nth)
+NOINLINE static void calc_alm2map (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
   {
   int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
   int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2);
+  iter_to_ieee(gen, d, &l, nv2);
   job->opcnt += (l-gen->m) * 4*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*nth;
@@ -238,11 +241,10 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   const dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
-  Tb corfac;
   for (int i=0; i<nv2; ++i)
     {
-    getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-    full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
     }
 
   while((!full_ieee) && (l<=lmax))
@@ -254,21 +256,17 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
     full_ieee=1;
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(p1->r.v[i],lam_2.v[i]*corfac.v[i],ar1);
-      vfmaeq(p1->i.v[i],lam_2.v[i]*corfac.v[i],ai1);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
-      if (vanyTrue(mask))
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      d->p1r[i] += d->lam2[i]*d->corfac[i]*ar1;
+      d->p1i[i] += d->lam2[i]*d->corfac[i]*ai1;
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
         {
-        vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
-        vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
-        vaddeq_mask(mask,scale.v[i],vone);
-        getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-        full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
         }
-      vfmaeq(p2->r.v[i],lam_1.v[i]*corfac.v[i],ar2);
-      vfmaeq(p2->i.v[i],lam_1.v[i]*corfac.v[i],ai2);
+      d->p2r[i] += d->lam1[i]*d->corfac[i]*ar2;
+      d->p2i[i] += d->lam1[i]*d->corfac[i]*ai2;
       }
     l+=2;
     }
@@ -276,21 +274,42 @@ NOINLINE static void calc_alm2map (const Tb cth, const Tb sth,
 
   for (int i=0; i<nv2; ++i)
     {
-    lam_1.v[i] *= corfac.v[i];
-    lam_2.v[i] *= corfac.v[i];
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
     }
-  alm2map_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2);
+  alm2map_kernel(d, rf, alm, l, lmax, nv2);
   }
 
-NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Tbri * restrict p1,
-  const Tbri * restrict p2, int nth)
+NOINLINE static void map2alm_kernel(s0data_v * restrict d,
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l,
+  int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv f10=vload(rf[l  ].f[0]), f11=vload(rf[l  ].f[1]),
+       f20=vload(rf[l+1].f[0]), f21=vload(rf[l+1].f[1]);
+    Tv atmp[4] = {vzero, vzero, vzero, vzero};
+    for (int i=0; i<nv2; ++i)
+      {
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      atmp[0] += d->lam2[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->p1i[i];
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      atmp[2] += d->lam1[i]*d->p2r[i];
+      atmp[3] += d->lam1[i]*d->p2i[i];
+      }
+    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_map2alm(sharp_job * restrict job,
+  const sharp_Ylmgen_C *gen, s0data_v * restrict d, int nth)
   {
   int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
   int l=gen->m;
   int nv2 = (nth+VLEN-1)/VLEN;
-  iter_to_ieee(sth,cth,&l,&lam_1,&lam_2,&scale,gen,nv2);
+  iter_to_ieee(gen, d, &l, nv2);
   job->opcnt += (l-gen->m) * 4*nth;
   if (l>lmax) return;
   job->opcnt += (lmax+1-l) * 8*nth;
@@ -298,11 +317,10 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   dcmplx * restrict alm=job->almtmp;
   int full_ieee=1;
-  Tb corfac;
   for (int i=0; i<nv2; ++i)
     {
-    getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-    full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+    getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
     }
 
   while ((!full_ieee) && (l<=lmax))
@@ -313,21 +331,17 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
     Tv atmp[4] = {vzero, vzero, vzero, vzero};
     for (int i=0; i<nv2; ++i)
       {
-      lam_1.v[i] = f10*cth.v[i]*lam_2.v[i] - f11*lam_1.v[i];
-      vfmaeq(atmp[0],lam_2.v[i]*corfac.v[i],p1->r.v[i]);
-      vfmaeq(atmp[1],lam_2.v[i]*corfac.v[i],p1->i.v[i]);
-      lam_2.v[i] = f20*cth.v[i]*lam_1.v[i] - f21*lam_2.v[i];
-      Tm mask = vgt(vabs(lam_2.v[i]),vload(sharp_ftol));
-      if (vanyTrue(mask))
+      d->lam1[i] = f10*d->cth[i]*d->lam2[i] - f11*d->lam1[i];
+      atmp[0] += d->lam2[i]*d->corfac[i]*d->p1r[i];
+      atmp[1] += d->lam2[i]*d->corfac[i]*d->p1i[i];
+      d->lam2[i] = f20*d->cth[i]*d->lam1[i] - f21*d->lam2[i];
+      if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
         {
-        vmuleq_mask(mask,lam_1.v[i],vload(sharp_fsmall));
-        vmuleq_mask(mask,lam_2.v[i],vload(sharp_fsmall));
-        vaddeq_mask(mask,scale.v[i],vone);
-        getCorfac(scale.v[i], &corfac.v[i], gen->cf);
-        full_ieee &= vallTrue(vge(scale.v[i],vload(sharp_minscale)));
+        getCorfac(d->scale[i], &d->corfac[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
         }
-      vfmaeq(atmp[2],lam_1.v[i]*corfac.v[i],p2->r.v[i]);
-      vfmaeq(atmp[3],lam_1.v[i]*corfac.v[i],p2->i.v[i]);
+      atmp[2] += d->lam1[i]*d->corfac[i]*d->p2r[i];
+      atmp[3] += d->lam1[i]*d->corfac[i]*d->p2i[i];
       }
     vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
     l+=2;
@@ -335,10 +349,418 @@ NOINLINE static void calc_map2alm(const Tb cth, const Tb sth,
 
   for (int i=0; i<nv2; ++i)
     {
-    lam_1.v[i] *= corfac.v[i];
-    lam_2.v[i] *= corfac.v[i];
+    d->lam1[i] *= d->corfac[i];
+    d->lam2[i] *= d->corfac[i];
     }
-  map2alm_kernel(cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, nv2);
+  map2alm_kernel(d, rf, alm, l, lmax, nv2);
+  }
+
+NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
+  sxdata_v * restrict d, int * restrict l_, int nv2)
+  {
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  Tv prefac=vload(gen->prefac[gen->m]),
+     prescale=vload(gen->fscale[gen->m]);
+  Tv limscale=vload(sharp_limscale);
+  int below_limit=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
+    Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
+    Tm mask=vlt(d->sth[i],vzero);
+    vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
+    vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
+
+    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
+    mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+    mypow(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+    mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+    mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
+
+    d->l1p[i] = vzero;
+    d->l1m[i] = vzero;
+    d->l2p[i] = prefac*ccp;
+    d->scp[i] = prescale+ccps;
+    d->l2m[i] = prefac*csp;
+    d->scm[i] = prescale+csps;
+    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_fbighalf);
+    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_fbighalf);
+    d->l2p[i] *= ssp;
+    d->scp[i] += ssps;
+    d->l2m[i] *= scp;
+    d->scm[i] += scps;
+    if (gen->preMinus_p)
+      d->l2p[i] = vneg(d->l2p[i]);
+    if (gen->preMinus_m)
+      d->l2m[i] = vneg(d->l2m[i]);
+    if (gen->s&1)
+      d->l2p[i] = vneg(d->l2p[i]);
+
+    Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
+    Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
+
+    below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
+                   vallTrue(vlt(d->scp[i],limscale));
+    }
+
+  int l=gen->mhi;
+
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
+    below_limit=1;
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
+          rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
+        below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
+                       vallTrue(vlt(d->scm[i],limscale));
+      }
+    l+=2;
+    }
+
+  *l_=l;
+  }
+
+NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw1=d->l2p[i]+d->l2m[i];
+      Tv lx2=d->l1m[i]-d->l1p[i];
+      d->p1pr[i] += agr1*lw1 - aci2*lx2;
+      d->p1pi[i] += agi1*lw1 + acr2*lx2;
+      d->p1mr[i] += acr1*lw1 + agi2*lx2;
+      d->p1mi[i] += aci1*lw1 - agr2*lx2;
+      Tv lx1=d->l2m[i]-d->l2p[i];
+      Tv lw2=d->l1p[i]+d->l1m[i];
+      d->p2pr[i] += agr2*lw2 - aci1*lx1;
+      d->p2pi[i] += agi2*lw2 + acr1*lx1;
+      d->p2mr[i] += acr2*lw2 + agi1*lx1;
+      d->p2mi[i] += aci2*lw2 - agr1*lx1;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 10*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 28*nth;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vload(creal(alm[2*l  ])), agi1=vload(cimag(alm[2*l  ])),
+       acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
+    Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
+       acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw1=d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
+      Tv lx2=d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
+      d->p1pr[i] += agr1*lw1 - aci2*lx2;
+      d->p1pi[i] += agi1*lw1 + acr2*lx2;
+      d->p1mr[i] += acr1*lw1 + agi2*lx2;
+      d->p1mi[i] += aci1*lw1 - agr2*lx2;
+      Tv lx1=d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
+      Tv lw2=d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
+      d->p2pr[i] += agr2*lw2 - aci1*lx1;
+      d->p2pi[i] += agi2*lw2 + acr1*lx1;
+      d->p2mr[i] += acr2*lw2 + agi1*lx1;
+      d->p2mi[i] += aci2*lw2 - agr1*lx1;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+        }
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+        }
+      }
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);
+  }
+
+NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw = d->l2p[i] + d->l2m[i];
+      Tv lx = d->l2m[i] - d->l2p[i];
+      agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;;
+      agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
+      acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
+      aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      lw = d->l1p[i] + d->l1m[i];
+      lx = d->l1m[i] - d->l1p[i];
+      agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
+      agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
+      acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
+      aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 10*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 28*nth;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
+    Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw = d->l2p[i]*d->cfp[i] + d->l2m[i]*d->cfm[i];
+      Tv lx = d->l2m[i]*d->cfm[i] - d->l2p[i]*d->cfp[i];
+      agr1 += d->p1pr[i]*lw - d->p2mi[i]*lx;
+      agi1 += d->p1pi[i]*lw + d->p2mr[i]*lx;
+      acr1 += d->p1mr[i]*lw + d->p2pi[i]*lx;
+      aci1 += d->p1mi[i]*lw - d->p2pr[i]*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      lw = d->l1p[i]*d->cfp[i] + d->l1m[i]*d->cfm[i];
+      lx = d->l1m[i]*d->cfm[i] - d->l1p[i]*d->cfp[i];
+      agr2 += d->p2pr[i]*lw - d->p1mi[i]*lx;
+      agi2 += d->p2pi[i]*lw + d->p1mr[i]*lx;
+      acr2 += d->p2mr[i]*lw + d->p1pi[i]*lx;
+      aci2 += d->p2mi[i]*lw - d->p1pr[i]*lx;
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+        }
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+        }
+      }
+    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
+    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
+  }
+
+
+NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
+  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
+  int l, int lmax, int nv2)
+  {
+  while (l<=lmax)
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
+       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw=d->l2p[i]+d->l2m[i];
+      d->p1pr[i] += ar1*lw;
+      d->p1pi[i] += ai1*lw;
+      Tv lx=d->l2m[i]-d->l2p[i];
+      d->p2mr[i] += ai1*lx;
+      d->p2mi[i] -= ar1*lx;
+      lw=d->l1p[i]+d->l1m[i];
+      d->p2pr[i] += ar2*lw;
+      d->p2pi[i] += ai2*lw;
+      lx=d->l1m[i]-d->l1p[i];
+      d->p1mr[i] += ai2*lx;
+      d->p1mi[i] -= ar2*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      }
+    l+=2;
+    }
+  }
+
+NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
+  const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
+  {
+  int l,lmax=gen->lmax;
+  int nv2 = (nth+VLEN-1)/VLEN;
+  iter_to_ieee_spin(gen, d, &l, nv2);
+  job->opcnt += (l-gen->mhi) * 10*nth;
+  if (l>lmax) return;
+  job->opcnt += (lmax+1-l) * 20*nth;
+
+  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee=1;
+  for (int i=0; i<nv2; ++i)
+    {
+    getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+    getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+    full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
+                 vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+    }
+
+  while((!full_ieee) && (l<=lmax))
+    {
+    Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]),
+       fx12=vload(fx[l+1].f[2]);
+    Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]),
+       fx22=vload(fx[l+2].f[2]);
+    Tv ar1=vload(creal(alm[l  ])), ai1=vload(cimag(alm[l  ])),
+       ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
+    full_ieee=1;
+    for (int i=0; i<nv2; ++i)
+      {
+      d->l1p[i] = (d->cth[i]-fx11)*fx10*d->l2p[i] - fx12*d->l1p[i];
+      d->l1m[i] = (d->cth[i]+fx11)*fx10*d->l2m[i] - fx12*d->l1m[i];
+      Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i];
+      d->p1pr[i] += ar1*lw;
+      d->p1pi[i] += ai1*lw;
+      Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i];
+      d->p2mr[i] += ai1*lx;
+      d->p2mi[i] -= ar1*lx;
+      lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i];
+      d->p2pr[i] += ar2*lw;
+      d->p2pi[i] += ai2*lw;
+      lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i];
+      d->p1mr[i] += ai2*lx;
+      d->p1mi[i] -= ar2*lx;
+      d->l2p[i] = (d->cth[i]-fx21)*fx20*d->l1p[i] - fx22*d->l2p[i];
+      d->l2m[i] = (d->cth[i]+fx21)*fx20*d->l1m[i] - fx22*d->l2m[i];
+      if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scp[i], &d->cfp[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
+        }
+      if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
+        {
+        getCorfac(d->scm[i], &d->cfm[i], gen->cf);
+        full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
+        }
+      }
+    l+=2;
+    }
+  if (l>lmax) return;
+
+  for (int i=0; i<nv2; ++i)
+    {
+    d->l1p[i] *= d->cfp[i];
+    d->l2p[i] *= d->cfp[i];
+    d->l1m[i] *= d->cfm[i];
+    d->l2m[i] *= d->cfm[i];
+    }
+  alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2);
   }
 
 
@@ -348,7 +770,6 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
-  const int nval=nvec*VLEN;
   const int m = job->ainfo->mval[mi];
   sharp_Ylmgen_prepare (gen, m);
 
@@ -359,19 +780,20 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
+        const int nval=nv0*VLEN;
         int ith=0;
-        int itgt[nvec*VLEN];
+        int itgt[nval];
         while (ith<ulim-llim)
           {
-          Tburi p1,p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
+          s0data_u d;
+          VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
           int nth=0;
           while ((nth<nval)&&(ith<ulim-llim))
             {
             if (mlim[ith]>=m)
               {
               itgt[nth] = ith;
-              cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
               ++nth;
               }
             else
@@ -386,16 +808,17 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
             int i2=((nth+VLEN-1)/VLEN)*VLEN;
             for (int i=nth; i<i2; ++i)
               {
-              cth.s[i]=cth.s[nth-1];
-              sth.s[i]=sth.s[nth-1];
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_alm2map (cth.b,sth.b,gen,job,&p1.b,&p2.b,nth);
+            calc_alm2map (job, gen, &d.v, nth);
             for (int i=0; i<nth; ++i)
               {
               int tgt=itgt[i];
               int phas_idx = tgt*job->s_th + mi*job->s_m;
-              complex double r1 = p1.s.r[i] + p1.s.i[i]*_Complex_I,
-                             r2 = p2.s.r[i] + p2.s.i[i]*_Complex_I;
+              complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
+                             r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
               job->phase[phas_idx] = r1+r2;
               if (ispair[tgt])
                 job->phase[phas_idx+1] = r1-r2;
@@ -405,7 +828,66 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
         }
       else
         {
-        UTIL_FAIL("only spin==0 allowed at the moment");
+        const int nval=nvx*VLEN;
+        int ith=0;
+        int itgt[nval];
+        while (ith<ulim-llim)
+          {
+          sxdata_u d;
+          VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
+          VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim))
+            {
+            if (mlim[ith]>=m)
+              {
+              itgt[nth] = ith;
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              ++nth;
+              }
+            else
+              {
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              job->phase[phas_idx  ] = job->phase[phas_idx+1] = 0;
+              job->phase[phas_idx+2] = job->phase[phas_idx+3] = 0;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN;
+            for (int i=nth; i<i2; ++i)
+              {
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
+              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
+              }
+            (job->type==SHARP_ALM2MAP) ?
+              calc_alm2map_spin  (job, gen, &d.v, nth) :
+              calc_alm2map_deriv1(job, gen, &d.v, nth);
+            for (int i=0; i<nth; ++i)
+              {
+              int tgt=itgt[i];
+              int phas_idx = tgt*job->s_th + mi*job->s_m;
+              complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
+                             q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
+                             u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
+                             u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
+              job->phase[phas_idx  ] = q1+q2;
+              job->phase[phas_idx+2] = u1+u2;
+              if (ispair[tgt])
+                {
+                dcmplx *phQ = &(job->phase[phas_idx+1]),
+                       *phU = &(job->phase[phas_idx+3]);
+                *phQ = q1-q2;
+                *phU = u1-u2;
+                if ((gen->mhi-gen->m+gen->s)&1)
+                  { *phQ=-(*phQ); *phU=-(*phU); }
+                }
+              }
+            }
+          }
         }
       break;
       }
@@ -421,7 +903,6 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim)
   {
-  const int nval=nvec*VLEN;
   const int m = job->ainfo->mval[mi];
   sharp_Ylmgen_prepare (gen, m);
 
@@ -431,22 +912,22 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
       {
       if (job->spin==0)
         {
+        const int nval=nv0*VLEN;
         int ith=0;
         while (ith<ulim-llim)
           {
-          Tburi p1,p2; VZERO(p1); VZERO(p2);
-          Tbu cth, sth;
+          s0data_u d;
           int nth=0;
           while ((nth<nval)&&(ith<ulim-llim))
             {
             if (mlim[ith]>=m)
               {
-              cth.s[nth]=cth_[ith]; sth.s[nth]=sth_[ith];
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
               int phas_idx = ith*job->s_th + mi*job->s_m;
               dcmplx ph1=job->phase[phas_idx];
               dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
-              p1.s.r[nth]=creal(ph1+ph2); p1.s.i[nth]=cimag(ph1+ph2);
-              p2.s.r[nth]=creal(ph1-ph2); p2.s.i[nth]=cimag(ph1-ph2);
+              d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
+              d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
               ++nth;
               }
             ++ith;
@@ -456,17 +937,55 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
             int i2=((nth+VLEN-1)/VLEN)*VLEN;
             for (int i=nth; i<i2; ++i)
               {
-              cth.s[i]=cth.s[nth-1];
-              sth.s[i]=sth.s[nth-1];
-              p1.s.r[i]=p1.s.i[i]=p2.s.r[i]=p2.s.i[i]=0.;
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
               }
-            calc_map2alm(cth.b,sth.b,gen,job,&p1.b,&p2.b, nth);
+            calc_map2alm (job, gen, &d.v, nth);
             }
           }
         }
       else
         {
-        UTIL_FAIL("only spin==0 allowed at the moment");
+        const int nval=nvx*VLEN;
+        int ith=0;
+        while (ith<ulim-llim)
+          {
+          sxdata_u d;
+          int nth=0;
+          while ((nth<nval)&&(ith<ulim-llim))
+            {
+            if (mlim[ith]>=m)
+              {
+              d.s.cth[nth]=cth_[ith]; d.s.sth[nth]=sth_[ith];
+              int phas_idx = ith*job->s_th + mi*job->s_m;
+              dcmplx p1Q=job->phase[phas_idx],
+                     p1U=job->phase[phas_idx+2],
+                     p2Q=ispair[ith] ? job->phase[phas_idx+1]:0.,
+                     p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
+              if ((gen->mhi-gen->m+gen->s)&1)
+                { p2Q=-p2Q; p2U=-p2U; }
+              d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
+              d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
+              d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
+              d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
+              ++nth;
+              }
+            ++ith;
+            }
+          if (nth>0)
+            {
+            int i2=((nth+VLEN-1)/VLEN)*VLEN;
+            for (int i=nth; i<i2; ++i)
+              {
+              d.s.cth[i]=d.s.cth[nth-1];
+              d.s.sth[i]=d.s.sth[nth-1];
+              d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
+              d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
+              }
+            calc_map2alm_spin(job, gen, &d.v, nth);
+            }
+          }
         }
       break;
       }
@@ -496,5 +1015,5 @@ int sharp_veclen(void)
 
 int sharp_max_nvec(void)
   {
-  return nvec;
+  return nv0;
   }
diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h
deleted file mode 100644
index a9e509b..0000000
--- a/libsharp/sharp_core.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core.h
- *  Interface for the computational core
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_SHARP_CORE_H
-#define PLANCK_SHARP_CORE_H
-
-#include "sharp_internal.h"
-#include "sharp_ylmgen_c.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inner_loop (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *mlim);
-
-int sharp_veclen(void);
-int sharp_max_nvec(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index f0c2738..6d5a6e4 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -33,7 +33,7 @@
 #define PLANCK_SHARP_CXX_H
 
 #include <complex>
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
 
diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h
index b7f98c4..1c77e27 100644
--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@@ -32,7 +32,7 @@
 #ifndef PLANCK_SHARP_GEOMHELPERS_H
 #define PLANCK_SHARP_GEOMHELPERS_H
 
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_internal.h b/libsharp/sharp_internal.h
index 11f23cb..635aeb8 100644
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@@ -25,7 +25,7 @@
 /*! \file sharp_internal.h
  *  Internally used functionality for the spherical transform library.
  *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2018 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
@@ -36,7 +36,9 @@
 #error This header file cannot be included from C++, only from C
 #endif
 
+#include <complex.h>
 #include "sharp.h"
+#include "sharp_ylmgen_c.h"
 
 #define SHARP_MAXTRANS 100
 
@@ -58,8 +60,13 @@ typedef struct
   unsigned long long opcnt;
   } sharp_job;
 
-int sharp_get_nv_max (void);
-int sharp_nv_oracle (sharp_jobtype type, int spin);
 int sharp_get_mlim (int lmax, int spin, double sth, double cth);
 
+void inner_loop (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *mlim);
+
+int sharp_veclen(void);
+int sharp_max_nvec(void);
+
 #endif
diff --git a/libsharp/sharp_lowlevel.h b/libsharp/sharp_lowlevel.h
deleted file mode 100644
index 2e7ab24..0000000
--- a/libsharp/sharp_lowlevel.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_lowlevel.h
- *  Low-level, portable interface for the spherical transform library.
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke \author Dag Sverre Seljebotn
- */
-
-#ifndef PLANCK_SHARP_LOWLEVEL_H
-#define PLANCK_SHARP_LOWLEVEL_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! \internal
-    Helper type containing information about a single ring. */
-typedef struct
-  {
-  double theta, phi0, weight, cth, sth;
-  ptrdiff_t ofs;
-  int nph, stride;
-  } sharp_ringinfo;
-
-/*! \internal
-    Helper type containing information about a pair of rings with colatitudes
-    symmetric around the equator. */
-typedef struct
-  {
-  sharp_ringinfo r1,r2;
-  } sharp_ringpair;
-
-/*! \internal
-    Type holding all required information about a map geometry. */
-typedef struct
-  {
-  sharp_ringpair *pair;
-  int npairs, nphmax;
-  } sharp_geom_info;
-
-/*! \defgroup almgroup Helpers for dealing with a_lm */
-/*! \{ */
-
-/*! \internal
-    Helper type for index calculation in a_lm arrays. */
-typedef struct
-  {
-  /*! Maximum \a l index of the array */
-  int lmax;
-  /*! Number of different \a m values in this object */
-  int nm;
-  /*! Array with \a nm entries containing the individual m values */
-  int *mval;
-  /*! Combination of flags from sharp_almflags */
-  int flags;
-  /*! Array with \a nm entries containing the (hypothetical) indices of
-      the coefficients with quantum numbers 0,\a mval[i] */
-  ptrdiff_t *mvstart;
-  /*! Stride between a_lm and a_(l+1),m */
-  ptrdiff_t stride;
-  } sharp_alm_info;
-
-/*! alm_info flags */
-typedef enum { SHARP_PACKED = 1,
-               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
-                    not present. mvstart is in units of *real* float/double for all
-                    m; stride is in units of reals for m=0 and complex for m!=0 */
-               SHARP_REAL_HARMONICS  = 1<<6
-               /*!< Use the real spherical harmonic convention. For
-                    m==0, the alm are treated exactly the same as in
-                    the complex case.  For m!=0, alm[i] represent a
-                    pair (+abs(m), -abs(m)) instead of (real, imag),
-                    and the coefficients are scaled by a factor of
-                    sqrt(2) relative to the complex case.  In other
-                    words, (sqrt(.5) * alm[i]) recovers the
-                    corresponding complex coefficient (when accessed
-                    as complex).
-                */
-             } sharp_almflags;
-
-
-
-/*! Creates an a_lm data structure from the following parameters:
-    \param lmax maximum \a l quantum number (>=0)
-    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
-    \param stride the stride between entries with identical \a m, and \a l
-      differing by 1.
-    \param mstart the index of the (hypothetical) coefficient with the
-      quantum numbers 0,\a m. Must have \a mmax+1 entries.
-    \param alm_info will hold a pointer to the newly created data structure
- */
-void sharp_make_alm_info (int lmax, int mmax, int stride,
-  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
-/*! Creates an a_lm data structure which from the following parameters:
-    \param lmax maximum \a l quantum number (\a >=0)
-    \param nm number of different \a m (\a 0<=nm<=lmax+1)
-    \param stride the stride between entries with identical \a m, and \a l
-      differing by 1.
-    \param mval array with \a nm entries containing the individual m values
-    \param mvstart array with \a nm entries containing the (hypothetical)
-      indices of the coefficients with the quantum numbers 0,\a mval[i]
-    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
-    \param alm_info will hold a pointer to the newly created data structure
- */
-void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
-  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
-/*! Returns the index of the coefficient with quantum numbers \a l,
-    \a mval[mi].
-    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
-    the index for the coefficient with the quantum numbers \a l, \a mi. */
-ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
-/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
-    flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
-    otherwise it is the number of complex coefficients (with m>=0). */
-ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
-/*! Deallocates the a_lm info object. */
-void sharp_destroy_alm_info (sharp_alm_info *info);
-
-/*! \} */
-
-/*! \defgroup geominfogroup Functions for dealing with geometry information */
-/*! \{ */
-
-/*! Creates a geometry information from a set of ring descriptions.
-    All arrays passed to this function must have \a nrings elements.
-    \param nrings the number of rings in the map
-    \param nph the number of pixels in each ring
-    \param ofs the index of the first pixel in each ring in the map array
-    \param stride the stride between consecutive pixels
-    \param phi0 the azimuth (in radians) of the first pixel in each ring
-    \param theta the colatitude (in radians) of each ring
-    \param wgt the pixel weight to be used for the ring in map2alm
-      and adjoint map2alm transforms.
-      Pass NULL to use 1.0 as weight for all rings.
-    \param geom_info will hold a pointer to the newly created data structure
- */
-void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
-  const int *stride, const double *phi0, const double *theta,
-  const double *wgt, sharp_geom_info **geom_info);
-
-/*! Counts the number of grid points needed for (the local part of) a map described
-    by \a info.
- */
-ptrdiff_t sharp_map_size(const sharp_geom_info *info);
-
-/*! Deallocates the geometry information in \a info. */
-void sharp_destroy_geom_info (sharp_geom_info *info);
-
-/*! \} */
-
-/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
-/*! \{ */
-
-/*! Enumeration of SHARP job types. */
-typedef enum { SHARP_YtW=0,               /*!< analysis */
-               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
-               SHARP_Y=1,                 /*!< synthesis */
-               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
-               SHARP_Yt=2,                /*!< adjoint synthesis */
-               SHARP_WY=3,                /*!< adjoint analysis */
-               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
-             } sharp_jobtype;
-
-/*! Job flags */
-typedef enum { SHARP_DP              = 1<<4,
-               /*!< map and a_lm are in double precision */
-               SHARP_ADD             = 1<<5,
-               /*!< results are added to the output arrays, instead of
-                    overwriting them */
-
-               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
-                  but its use here is deprecated in favor of having it in the sharp_alm_info */
-
-               SHARP_NO_FFT          = 1<<7,
-
-               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
-               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
-             } sharp_jobflags;
-
-/*! Performs a libsharp SHT job. The interface deliberately does not use
-  the C99 "complex" data type, in order to be callable from C89 and C++.
-  \param type the type of SHT
-  \param spin the spin of the quantities to be transformed
-  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
-    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
-    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
-    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
-    depends on whether the SHARP_DP flag is set.
-  \param map contains pointers to the maps. If \a spin==0,
-    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
-    point to the maps of the first SHT, map[2] and map[3] to those of the
-    second, etc. The exact data type of \a map depends on whether the SHARP_DP
-    flag is set.
-  \param geom_info A \c sharp_geom_info object compatible with the provided
-    \a map arrays.
-  \param alm_info A \c sharp_alm_info object compatible with the provided
-    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
-    exactly once.
-  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
-    \a alm is expected to have the type "complex double **" and \a map is
-    expected to have the type "double **"; otherwise, the expected
-    types are "complex float **" and "float **", respectively.
-  \param time If not NULL, the wall clock time required for this SHT
-    (in seconds) will be written here.
-  \param opcnt If not NULL, a conservative estimate of the total floating point
-    operation count for this SHT will be written here. */
-void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
-  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
-  int flags, double *time, unsigned long long *opcnt);
-
-void sharp_set_chunksize_min(int new_chunksize_min);
-void sharp_set_nchunks_max(int new_nchunks_max);
-
-
-typedef enum { SHARP_ERROR_NO_MPI = 1,
-               /*!< libsharp not compiled with MPI support */
-              } sharp_errors;
-
-/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
-    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
-
-    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
-    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
-    is *(MPI_Comm*)pcomm.
-
-    Other parameters are the same as sharp_execute_mpi.
-
-    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
-    (in which case nothing is done).
- */
-int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
-  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int flags, double *time,
-  unsigned long long *opcnt);
-
-
-
-/*! \} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
index df07117..73a8aa0 100644
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@@ -33,7 +33,7 @@
 #define PLANCK_SHARP_MPI_H
 
 #include <mpi.h>
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 4b124af..f712ae4 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -24,7 +24,7 @@
 
 /*  \file sharp_testsuite.c
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -375,7 +375,6 @@ static void check_sign_scale(void)
   UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
     "error");
 
-#if 0
   sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
     NULL,NULL);
   UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),
@@ -420,7 +419,6 @@ static void check_sign_scale(void)
     "error");
   UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
     "error");
-#endif
 
   DEALLOC2D(map);
   DEALLOC2D(alm);
@@ -430,7 +428,7 @@ static void check_sign_scale(void)
   }
 
 static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int nv, double **err_abs, double **err_rel,
+  int spin, double **err_abs, double **err_rel,
   double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
   unsigned long long *op_m2a)
   {
@@ -450,20 +448,20 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
 
 #ifdef USE_MPI
   sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
-    ainfo, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
+    ainfo, SHARP_DP|SHARP_ADD,t_a2m,op_a2m);
 #else
   sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
-    SHARP_DP|nv,t_a2m,op_a2m);
+    SHARP_DP,t_a2m,op_a2m);
 #endif
   if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
   if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m);
   double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
 #ifdef USE_MPI
   sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
-    ainfo,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+    ainfo,SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
 #else
   sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
-    SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+    SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
 #endif
   if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
   if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a);
@@ -475,11 +473,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
   }
 
 static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int nv)
+  int spin)
   {
   int ncomp = (spin==0) ? 1 : 2;
   double *err_abs, *err_rel;
-  do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel, NULL, NULL,
+  do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, NULL, NULL,
     NULL, NULL);
   for (int i=0; i<ncomp; ++i)
     UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
@@ -501,16 +499,11 @@ static void sharp_acctest(void)
   sharp_alm_info *ainfo;
   int lmax=127, mmax=127, nlat=128, nlon=256;
   get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
-  for (int nv=1; nv<=6; ++nv)
-    {
-    check_accuracy(ginfo,ainfo,0,nv);
-#if 0
-    check_accuracy(ginfo,ainfo,1,nv);
-    check_accuracy(ginfo,ainfo,2,nv);
-    check_accuracy(ginfo,ainfo,3,nv);
-    check_accuracy(ginfo,ainfo,30,nv);
-#endif
-    }
+  check_accuracy(ginfo,ainfo,0);
+  check_accuracy(ginfo,ainfo,1);
+  check_accuracy(ginfo,ainfo,2);
+  check_accuracy(ginfo,ainfo,3);
+  check_accuracy(ginfo,ainfo,30);
   sharp_destroy_alm_info(ainfo);
   sharp_destroy_geom_info(ginfo);
   if (mytask==0) printf("Passed.\n\n");
@@ -544,7 +537,7 @@ static void sharp_test (int argc, const char **argv)
     {
     ++nrpt;
     double ta2m2, tm2a2;
-    do_sht (ginfo, ainfo, spin, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
+    do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, &ta2m2, &tm2a2,
       &op_a2m, &op_m2a);
     if (ta2m2<t_a2m) t_a2m=ta2m2;
     if (tm2a2<t_m2a) t_m2a=tm2a2;
@@ -594,7 +587,7 @@ static void sharp_test (int argc, const char **argv)
     }
 
   if (mytask==0)
-    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
+    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %.2e %7.2f %.2e %7.2f"
            " %9.2f %6.2f %.2e %.2e\n",
       getenv("HOST"),argv[2],spin,sharp_veclen(),nomp,ntasks,lmax,mmax,gpar1,gpar2,
       t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
@@ -604,68 +597,6 @@ static void sharp_test (int argc, const char **argv)
   DEALLOC(err_rel);
   }
 
-static void sharp_bench (int argc, const char **argv)
-  {
-  if (mytask==0) sharp_announce("sharp_bench");
-  UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
-  int lmax=atoi(argv[3]);
-  int mmax=atoi(argv[4]);
-  int gpar1=atoi(argv[5]);
-  int gpar2=atoi(argv[6]);
-  int spin=atoi(argv[7]);
-
-  if (mytask==0) printf("Testing map analysis accuracy.\n");
-  if (mytask==0) printf("spin=%d\n", spin);
-
-  sharp_geom_info *ginfo;
-  sharp_alm_info *ainfo;
-  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
-
-  double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30;
-  unsigned long long opa2m_min=0, opm2a_min=0;
-  int nvmin_a2m=-1, nvmin_m2a=-1;
-  for (int nv=0; nv<=6; ++nv)
-    {
-    int ntries=0;
-    double tacc=0;
-    do
-      {
-      double t_a2m, t_m2a;
-      unsigned long long op_a2m, op_m2a;
-      double *err_abs,*err_rel;
-      do_sht (ginfo, ainfo, spin, nv, &err_abs, &err_rel,
-        &t_a2m, &t_m2a, &op_a2m, &op_m2a);
-
-      DEALLOC(err_abs);
-      DEALLOC(err_rel);
-      tacc+=t_a2m+t_m2a;
-      ++ntries;
-      if (nv==0)
-        {
-        if (t_a2m<ta2m_auto) ta2m_auto=t_a2m;
-        if (t_m2a<tm2a_auto) tm2a_auto=t_m2a;
-        }
-      else
-        {
-        if (t_a2m<ta2m_min) { nvmin_a2m=nv; ta2m_min=t_a2m; opa2m_min=op_a2m; }
-        if (t_m2a<tm2a_min) { nvmin_m2a=nv; tm2a_min=t_m2a; opm2a_min=op_m2a; }
-        }
-      } while((ntries<2)||(tacc<3.));
-    }
-  if (mytask==0)
-    {
-    printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
-      nvmin_a2m,ta2m_min,100.*(ta2m_auto-ta2m_min)/ta2m_auto,
-      1e-9*opa2m_min/ta2m_min);
-    printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
-      nvmin_m2a,tm2a_min,100.*(tm2a_auto-tm2a_min)/tm2a_auto,
-      1e-9*opm2a_min/tm2a_min);
-    }
-
-  sharp_destroy_alm_info(ainfo);
-  sharp_destroy_geom_info(ginfo);
-  }
-
 int main(int argc, const char **argv)
   {
 #ifdef USE_MPI
@@ -682,8 +613,6 @@ int main(int argc, const char **argv)
     sharp_acctest();
   else if (strcmp(argv[1],"test")==0)
     sharp_test(argc,argv);
-  else if (strcmp(argv[1],"bench")==0)
-    sharp_bench(argc,argv);
   else
     UTIL_FAIL("unknown command");
 
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index ff3f573..942e290 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -40,32 +40,13 @@ typedef double Ts;
 #if (VLEN==1)
 
 typedef double Tv;
-typedef float Tv_s;
 typedef int Tm;
 
-#define vadd(a,b) ((a)+(b))
-#define vadd_s(a,b) ((a)+(b))
-#define vaddeq(a,b) ((a)+=(b))
 #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
-#define vsub(a,b) ((a)-(b))
-#define vsub_s(a,b) ((a)-(b))
-#define vsubeq(a,b) ((a)-=(b))
 #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
-#define vmul(a,b) ((a)*(b))
-#define vmul_s(a,b) ((a)*(b))
-#define vmuleq(a,b) ((a)*=(b))
 #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
-#define vfmaeq(a,b,c) ((a)+=(b)*(c))
-#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
-#define vfmseq(a,b,c) ((a)-=(b)*(c))
-#define vabmc(a,b,c) ((a)*(b)-(c))
-#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
-#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
 #define vneg(a) (-(a))
 #define vload(a) (a)
-#define vload_s(a) (a)
-#define vloadu(p) (*(p))
-#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -74,8 +55,6 @@ typedef int Tm;
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
 #define vor_mask(a,b) ((a)||(b))
-#define vstoreu(p, a) (*(p)=a)
-#define vstoreu_s(p, a) (*(p)=a)
 
 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
@@ -99,7 +78,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #endif
 
 typedef __m128d Tv;
-typedef __m128 Tv_s;
 typedef __m128d Tm;
 
 #if defined(__SSE4_1__)
@@ -111,29 +89,11 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vzero _mm_setzero_pd()
 #define vone _mm_set1_pd(1.)
 
-#define vadd(a,b) _mm_add_pd(a,b)
-#define vadd_s(a,b) _mm_add_ps(a,b)
-#define vaddeq(a,b) a=_mm_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
-#define vsub(a,b) _mm_sub_pd(a,b)
-#define vsub_s(a,b) _mm_sub_ps(a,b)
-#define vsubeq(a,b) a=_mm_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
-#define vmul(a,b) _mm_mul_pd(a,b)
-#define vmul_s(a,b) _mm_mul_ps(a,b)
-#define vmuleq(a,b) a=_mm_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
-#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
-#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
-#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
-#define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
-#define vload_s(a) _mm_set1_ps(a)
 #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
@@ -146,10 +106,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
-#define vloadu(p) _mm_loadu_pd(p)
-#define vloadu_s(p) _mm_loadu_ps(p)
-#define vstoreu(p, v) _mm_storeu_pd(p, v)
-#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
 
 #endif
 
@@ -161,54 +117,17 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #endif
 
 typedef __m256d Tv;
-typedef __m256 Tv_s;
 typedef __m256d Tm;
 
 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
 #define vzero _mm256_setzero_pd()
 #define vone _mm256_set1_pd(1.)
 
-#define vadd(a,b) _mm256_add_pd(a,b)
-#define vadd_s(a,b) _mm256_add_ps(a,b)
-#define vaddeq(a,b) a=_mm256_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
-#define vsub(a,b) _mm256_sub_pd(a,b)
-#define vsub_s(a,b) _mm256_sub_ps(a,b)
-#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
-#define vmul(a,b) _mm256_mul_pd(a,b)
-#define vmul_s(a,b) _mm256_mul_ps(a,b)
-#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#if (USE_FMA4)
-#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
-#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
-#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
-#define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
-#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
-#else
-#if (USE_FMA)
-#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
-#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
-#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
-#define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
-#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
-#else
-#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
-#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
-#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
-#define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
-#endif
-#endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
-#define vload_s(a) _mm256_set1_ps(a)
 #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
@@ -222,11 +141,6 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)
 
-#define vloadu(p) _mm256_loadu_pd(p)
-#define vloadu_s(p) _mm256_loadu_ps(p)
-#define vstoreu(p, v) _mm256_storeu_pd(p, v)
-#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
-
 #endif
 
 #if (VLEN==8)
@@ -236,20 +150,8 @@ typedef __m256d Tm;
 typedef __m512d Tv;
 typedef __mmask8 Tm;
 
-#define vadd(a,b) _mm512_add_pd(a,b)
-#define vaddeq(a,b) a=_mm512_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
-#define vsub(a,b) _mm512_sub_pd(a,b)
-#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
-#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
-#define vmul(a,b) _mm512_mul_pd(a,b)
-#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
-#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
-//#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
-//#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
-#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
 #define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
 #define vload(a) _mm512_set1_pd(a)
 #define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
diff --git a/libsharp/sharp_vecutil.h b/libsharp/sharp_vecutil.h
index 24a2e94..522cc5f 100644
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@@ -25,7 +25,7 @@
 /*! \file sharp_vecutil.h
  *  Functionality related to vector instruction support
  *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 548f061..e2247c8 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -85,15 +85,15 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
   else
     {
     gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
-    for (int m=0; m<gen->lmax+2; ++m)
+    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+3);
+    for (int m=0; m<gen->lmax+3; ++m)
       gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
-    ALLOC(gen->inv,double,gen->lmax+1);
+    ALLOC(gen->inv,double,gen->lmax+2);
     gen->inv[0]=0;
-    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
-    ALLOC(gen->flm1,double,2*gen->lmax+1);
-    ALLOC(gen->flm2,double,2*gen->lmax+1);
-    for (int m=0; m<2*gen->lmax+1; ++m)
+    for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
+    ALLOC(gen->flm1,double,2*gen->lmax+3);
+    ALLOC(gen->flm2,double,2*gen->lmax+3);
+    for (int m=0; m<2*gen->lmax+3; ++m)
       {
       gen->flm1[m] = sqrt(1./(m+1.));
       gen->flm2[m] = sqrt(m/(m+1.));
@@ -176,7 +176,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)
 
     if (!ms_similar)
       {
-      for (int l=gen->mhi; l<gen->lmax; ++l)
+      for (int l=gen->mhi; l<gen->lmax+1; ++l)
         {
         double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
                   *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];