Compare commits


6 commits
master...cpp

Author           SHA1        Message                                                 Date
Martin Reinecke  b27ce30cde  switch to new FFT                                       2020-01-04 12:59:59 +01:00
Martin Reinecke  f8a9c96acf  use MRUTIL macros                                       2020-01-03 22:55:52 +01:00
Martin Reinecke  75559e6894  vectorization cleanup                                   2020-01-03 22:47:49 +01:00
Martin Reinecke  54cbae0534  multithreading demonstrated to work, but not safe yet   2020-01-03 18:26:53 +01:00
Martin Reinecke  0f8051fc28  vectors work with experimental::simd                    2020-01-03 18:01:34 +01:00
Martin Reinecke  0378ce155a  runs as C++, no vector support yet                      2020-01-03 17:22:31 +01:00
19 changed files with 382 additions and 2817 deletions

README

@ -49,7 +49,7 @@ It can be disabled at configuration time by specifying "--disable-openmp" at the
configure command line.
At runtime OMP_NUM_THREADS should be set to the number of hardware threads
(not physical cores) of the system.
(Usually this is already the default setting when OMP_NUM_THREADS is not
specified.)

Makefile.am

@ -3,16 +3,14 @@ ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES = libsharp2.la
libsharp2_la_SOURCES = \
libsharp2/pocketfft.c \
libsharp2/pocketfft.h \
libsharp2/sharp_utils.c \
libsharp2/sharp_utils.cc \
libsharp2/sharp_utils.h \
libsharp2/sharp.c \
libsharp2/sharp_almhelpers.c \
libsharp2/sharp_core.c \
libsharp2/sharp_geomhelpers.c \
libsharp2/sharp_legendre_roots.c \
libsharp2/sharp_ylmgen_c.c \
libsharp2/sharp.cc \
libsharp2/sharp_almhelpers.cc \
libsharp2/sharp_core.cc \
libsharp2/sharp_geomhelpers.cc \
libsharp2/sharp_legendre_roots.cc \
libsharp2/sharp_ylmgen_c.cc \
libsharp2/sharp_internal.h \
libsharp2/sharp_legendre_roots.h \
libsharp2/sharp_vecsupport.h \
@ -24,25 +22,25 @@ libsharp2_la_SOURCES = \
# any backward-compatible change: increase age
# any backward-incompatible change: age=0
# ==> age <= current
libsharp2_la_LDFLAGS = -version-info 0:0:0
libsharp2_la_LDFLAGS = -version-info 0:0:0 -lpthread
AM_CFLAGS = @AM_CFLAGS@
AM_CXXFLAGS = @AM_CXXFLAGS@
if HAVE_MULTIARCH
libavx_la_SOURCES = libsharp2/sharp_core_inc.c
libavx2_la_SOURCES = libsharp2/sharp_core_inc.c
libfma_la_SOURCES = libsharp2/sharp_core_inc.c
libfma4_la_SOURCES = libsharp2/sharp_core_inc.c
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.c
libavx_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx2_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma_la_SOURCES = libsharp2/sharp_core_inc.cc
libfma4_la_SOURCES = libsharp2/sharp_core_inc.cc
libavx512f_la_SOURCES = libsharp2/sharp_core_inc.cc
noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
libavx_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx -DARCH=avx
libavx2_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx2 -DARCH=avx2
libfma_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma -DARCH=fma
libfma4_la_CXXFLAGS = ${AM_CXXFLAGS} -mfma4 -DARCH=fma4
libavx512f_la_CXXFLAGS = ${AM_CXXFLAGS} -mavx512f -DARCH=avx512f
libsharp2_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
@ -56,10 +54,10 @@ nobase_include_HEADERS = \
libsharp2/sharp_cxx.h
EXTRA_DIST = \
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.c
runtest.sh fortran/sharp.f90 fortran/test_sharp.f90 libsharp2/sharp_mpi.cc
check_PROGRAMS = sharp2_testsuite
sharp2_testsuite_SOURCES = test/sharp2_testsuite.c test/memusage.c test/memusage.h
sharp2_testsuite_SOURCES = test/sharp2_testsuite.cc test/memusage.cc test/memusage.h
sharp2_testsuite_LDADD = libsharp2.la
TESTS = runtest.sh

configure.ac

@ -20,28 +20,21 @@ dnl
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_PROG_CC_C99
AC_OPENMP
# add math library
LIBS="-lm"
AC_PROG_CXX
AX_CXX_COMPILE_STDCXX([11])
AC_PROG_LIBTOOL
tmpval=`echo $CFLAGS | grep -c '\-DMULTIARCH'`
tmpval=`echo $CXXFLAGS | grep -c '\-DMULTIARCH'`
AM_CONDITIONAL([HAVE_MULTIARCH], [test $tmpval -gt 0])
AM_CFLAGS="$AM_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LIBS="-lsharp2"
PACKAGE_CFLAGS="$PACKAGE_CFLAGS $OPENMP_CFLAGS"
PACKAGE_LDFLAGS="$PACKAGE_LDFLAGS $OPENMP_CFLAGS"
dnl
dnl Create pkgconfig .pc file.
dnl
AX_CREATE_PKGCONFIG_INFO(,,,,[])
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_CONFIG_FILES([Makefile])
AC_OUTPUT

File diff suppressed because it is too large.

libsharp2/pocketfft.h (deleted)

@ -1,50 +0,0 @@
/*
* This file is part of pocketfft.
* Licensed under a 3-clause BSD style license - see LICENSE.md
*/
/*! \file pocketfft.h
* Public interface of the pocketfft library
*
* Copyright (C) 2008-2019 Max-Planck-Society
* \author Martin Reinecke
*/
#ifndef POCKETFFT_H
#define POCKETFFT_H
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
struct pocketfft_plan_c_i;
typedef struct pocketfft_plan_c_i * pocketfft_plan_c;
pocketfft_plan_c pocketfft_make_plan_c (size_t length);
void pocketfft_delete_plan_c (pocketfft_plan_c plan);
int pocketfft_backward_c(pocketfft_plan_c plan, double c[], double fct);
int pocketfft_forward_c(pocketfft_plan_c plan, double c[], double fct);
size_t pocketfft_length_c(pocketfft_plan_c plan);
struct pocketfft_plan_r_i;
typedef struct pocketfft_plan_r_i * pocketfft_plan_r;
pocketfft_plan_r pocketfft_make_plan_r (size_t length);
void pocketfft_delete_plan_r (pocketfft_plan_r plan);
/*! Computes a real backward FFT on \a c, using \a plan
and assuming the FFTPACK storage scheme:
- on entry, \a c has the form <tt>r0, r1, i1, r2, i2, ...</tt>
- on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
int pocketfft_backward_r(pocketfft_plan_r plan, double c[], double fct);
/*! Computes a real forward FFT on \a c, using \a plan
and assuming the FFTPACK storage scheme:
- on entry, \a c has the form <tt>r0, r1, ..., r[length-1]</tt>;
- on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt> */
int pocketfft_forward_r(pocketfft_plan_r plan, double c[], double fct);
size_t pocketfft_length_r(pocketfft_plan_r plan);
#ifdef __cplusplus
}
#endif
#endif
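
Note: this commit deletes the plan-based C pocketfft interface; its call sites in sharp.cc (below) move to mr_util's C++ FFT classes. A minimal sketch of the mapping, using only calls that appear in this diff; the example() wrapper is illustrative, and the raw new/delete mirrors the ringhelper code below (a std::unique_ptr would be more idiomatic):

#include <cstddef>
#include "mr_util/fft.h"

// old C API (removed here):
//   pocketfft_plan_r plan = pocketfft_make_plan_r(nph);
//   pocketfft_backward_r(plan, &data[1], 1.);
//   pocketfft_delete_plan_r(plan);
void example(double *data, std::size_t nph)
  {
  auto plan = new mr::detail_fft::rfftp<double>(nph); // replaces pocketfft_make_plan_r
  plan->exec(&data[1], 1., false); // last argument: true = forward, false = backward
  delete plan;                     // replaces pocketfft_delete_plan_r
  }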

libsharp2/sharp.cc

@ -25,17 +25,20 @@
* \author Martin Reinecke \author Dag Sverre Seljebotn
*/
#include <math.h>
#include <string.h>
#include "libsharp2/pocketfft.h"
#include <cmath>
#include <cstring>
#include <atomic>
#include "mr_util/fft.h"
#include "libsharp2/sharp_ylmgen_c.h"
#include "libsharp2/sharp_internal.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_geomhelpers.h"
#include "mr_util/threading.h"
#include "mr_util/useful_macros.h"
typedef complex double dcmplx;
typedef complex float fcmplx;
typedef complex<double> dcmplx;
typedef complex<float> fcmplx;
static const double sqrt_one_half = 0.707106781186547572737310929369;
static const double sqrt_two = 1.414213562373095145474621858739;
@ -57,7 +60,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
*nchunks = (ndata+(*chunksize)-1)/(*chunksize);
}
NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
MRUTIL_NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
{
double ofs=lmax*0.01;
if (ofs<100.) ofs=100.;
@ -76,7 +79,7 @@ typedef struct
double phi0_;
dcmplx *shiftarr;
int s_shift;
pocketfft_plan_r plan;
mr::detail_fft::rfftp<double> *plan;
int length;
int norot;
} ringhelper;
@ -89,12 +92,12 @@ static void ringhelper_init (ringhelper *self)
static void ringhelper_destroy (ringhelper *self)
{
if (self->plan) pocketfft_delete_plan_r(self->plan);
delete self->plan;
DEALLOC(self->shiftarr);
ringhelper_init(self);
}
NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
MRUTIL_NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
{
self->norot = (fabs(phi0)<1e-14);
if (!(self->norot))
@ -105,27 +108,27 @@ NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, dou
self->phi0_ = phi0;
// FIXME: improve this by using sincos2pibyn(nph) etc.
for (int m=0; m<=mmax; ++m)
self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
self->shiftarr[m] = dcmplx(cos(m*phi0),sin(m*phi0));
// double *tmp=(double *) self->shiftarr;
// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
}
// if (!self->plan) self->plan=pocketfft_make_plan_r(nph);
if (nph!=(int)self->length)
{
if (self->plan) pocketfft_delete_plan_r(self->plan);
self->plan=pocketfft_make_plan_r(nph);
if (self->plan) delete self->plan;
self->plan=new mr::detail_fft::rfftp<double>(nph);
self->length=nph;
}
}
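
Note: with C's _Complex_I gone, the phase factor exp(i*m*phi0) is built from an explicit real/imaginary pair. A standalone restatement (phase_shift is a hypothetical helper; std::polar is an equivalent the commit itself does not use):

#include <cmath>
#include <complex>
using dcmplx = std::complex<double>;

inline dcmplx phase_shift(int m, double phi0)
  { return dcmplx(std::cos(m*phi0), std::sin(m*phi0)); } // == std::polar(1.0, m*phi0)
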
static int ringinfo_compare (const void *xa, const void *xb)
{
const sharp_ringinfo *a=xa, *b=xb;
const sharp_ringinfo *a=(const sharp_ringinfo *)xa, *b=(const sharp_ringinfo *)xb;
return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
}
static int ringpair_compare (const void *xa, const void *xb)
{
const sharp_ringpair *a=xa, *b=xb;
const sharp_ringpair *a=(const sharp_ringpair *)xa, *b=(const sharp_ringpair *)xb;
// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
if (a->r1.nph==b->r1.nph)
return (a->r1.phi0 < b->r1.phi0) ? -1 :
@ -275,7 +278,7 @@ static int sharp_get_mmax (int *mval, int nm)
return nm-1;
}
NOINLINE static void ringhelper_phase2ring (ringhelper *self,
MRUTIL_NOINLINE static void ringhelper_phase2ring (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
int pstride, int flags)
{
@ -292,22 +295,22 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if (self->norot)
for (int m=0; m<=mmax; ++m)
{
data[2*m]=creal(phase[m*pstride])*wgt;
data[2*m+1]=cimag(phase[m*pstride])*wgt;
data[2*m]=phase[m*pstride].real()*wgt;
data[2*m+1]=phase[m*pstride].imag()*wgt;
}
else
for (int m=0; m<=mmax; ++m)
{
dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
data[2*m]=creal(tmp)*wgt;
data[2*m+1]=cimag(tmp)*wgt;
data[2*m]=tmp.real()*wgt;
data[2*m+1]=tmp.imag()*wgt;
}
for (int m=2*(mmax+1); m<nph+2; ++m)
data[m]=0.;
}
else
{
data[0]=creal(phase[0])*wgt;
data[0]=phase[0].real()*wgt;
SET_ARRAY(data,1,nph+2,0.);
int idx1=1, idx2=nph-1;
@ -317,23 +320,23 @@ NOINLINE static void ringhelper_phase2ring (ringhelper *self,
if(!self->norot) tmp*=self->shiftarr[m];
if (idx1<(nph+2)/2)
{
data[2*idx1]+=creal(tmp);
data[2*idx1+1]+=cimag(tmp);
data[2*idx1]+=tmp.real();
data[2*idx1+1]+=tmp.imag();
}
if (idx2<(nph+2)/2)
{
data[2*idx2]+=creal(tmp);
data[2*idx2+1]-=cimag(tmp);
data[2*idx2]+=tmp.real();
data[2*idx2+1]-=tmp.imag();
}
if (++idx1>=nph) idx1=0;
if (--idx2<0) idx2=nph-1;
}
}
data[1]=data[0];
pocketfft_backward_r (self->plan, &(data[1]), 1.);
self->plan->exec(&(data[1]), 1., false);
}
NOINLINE static void ringhelper_ring2phase (ringhelper *self,
MRUTIL_NOINLINE static void ringhelper_ring2phase (ringhelper *self,
const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
int pstride, int flags)
{
@ -349,7 +352,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
if (flags&SHARP_REAL_HARMONICS)
wgt *= sqrt_two;
pocketfft_forward_r (self->plan, &(data[1]), 1.);
self->plan->exec (&(data[1]), 1., true);
data[0]=data[1];
data[1]=data[nph+1]=0.;
@ -357,11 +360,11 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
{
if (self->norot)
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
phase[m*pstride] = dcmplx(data[2*m], data[2*m+1]) * wgt;
else
for (int m=0; m<=maxidx; ++m)
phase[m*pstride] =
(data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
dcmplx(data[2*m], data[2*m+1]) * self->shiftarr[m] * wgt;
}
else
{
@ -370,9 +373,9 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
int idx=m%nph;
dcmplx val;
if (idx<(nph-idx))
val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
val = dcmplx(data[2*idx], data[2*idx+1]) * wgt;
else
val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
val = dcmplx(data[2*(nph-idx)], -data[2*(nph-idx)+1]) * wgt;
if (!self->norot)
val *= self->shiftarr[m];
phase[m*pstride]=val;
@ -383,7 +386,7 @@ NOINLINE static void ringhelper_ring2phase (ringhelper *self,
phase[m*pstride]=0.;
}
NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
MRUTIL_NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
int flags)
{
if (flags & SHARP_NO_FFT)
@ -440,7 +443,7 @@ NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
}
}
NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
MRUTIL_NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
int flags)
{
#define CLEARLOOP(real_t,body) \
@ -477,7 +480,7 @@ NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
}
}
NOINLINE static void init_output (sharp_job *job)
MRUTIL_NOINLINE static void init_output (sharp_job *job)
{
if (job->flags&SHARP_ADD) return;
if (job->type == SHARP_MAP2ALM)
@ -488,7 +491,7 @@ NOINLINE static void init_output (sharp_job *job)
clear_map (job->ginfo,job->map[i],job->flags);
}
NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
MRUTIL_NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
{
if (job->type==SHARP_MAP2ALM)
{
@ -514,7 +517,7 @@ static void alloc_almtmp (sharp_job *job, int lmax)
static void dealloc_almtmp (sharp_job *job)
{ DEALLOC(job->almtmp); }
NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
MRUTIL_NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
{
#define COPY_LOOP(real_t, source_t, expr_of_x) \
@ -577,7 +580,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
if (job->flags&SHARP_DP)
COPY_LOOP(double, dcmplx, x*job->norm_l[l])
else
COPY_LOOP(float, fcmplx, x*job->norm_l[l])
COPY_LOOP(float, fcmplx, x*float(job->norm_l[l]))
}
}
}
@ -588,7 +591,7 @@ NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP
}
NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
MRUTIL_NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
{
#define COPY_LOOP(real_t, target_t, expr_of_x) \
@ -617,9 +620,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0)
{
if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*norm_m0)
COPY_LOOP(double, double, x.real()*norm_m0)
else
COPY_LOOP(float, float, crealf(x)*norm_m0)
COPY_LOOP(float, float, x.real()*norm_m0)
}
else
{
@ -634,9 +637,9 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
if (m==0)
{
if (job->flags&SHARP_DP)
COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
COPY_LOOP(double, double, x.real()*job->norm_l[l]*norm_m0)
else
COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
COPY_LOOP(float, fcmplx, (float)(x.real()*job->norm_l[l]*norm_m0))
}
else
{
@ -650,7 +653,7 @@ NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
#undef COPY_LOOP
}
NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
MRUTIL_NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
const double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
@ -658,8 +661,8 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
double **dmap = (double **)job->map;
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&dmap[i][ri->ofs];
const double *restrict p2=&ringtmp[i*rstride+1];
double *MRUTIL_RESTRICT p1=&dmap[i][ri->ofs];
const double *MRUTIL_RESTRICT p2=&ringtmp[i*rstride+1];
if (ri->stride==1)
{
if (job->flags&SHARP_ADD)
@ -682,14 +685,14 @@ NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
}
}
NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
MRUTIL_NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
double *ringtmp, int rstride)
{
if (job->flags & SHARP_DP)
for (int i=0; i<job->nmaps; ++i)
{
double *restrict p1=&ringtmp[i*rstride+1],
*restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
double *MRUTIL_RESTRICT p1=&ringtmp[i*rstride+1],
*MRUTIL_RESTRICT p2=&(((double *)(job->map[i]))[ri->ofs]);
if (ri->stride==1)
memcpy(p1,p2,ri->nph*sizeof(double));
else
@ -721,7 +724,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
for (int m=0; m<=mmax; ++m)
phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*float(wgt);
}
}
static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -743,7 +746,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
}
//FIXME: set phase to zero if not SHARP_MAP2ALM?
NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
MRUTIL_NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type != SHARP_MAP2ALM) return;
int pstride = job->s_m;
@ -760,35 +763,35 @@ NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
}
else
{
#pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
{
int dim2 = job->s_th*(ith-llim);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
int dim2 = job->s_th*(ith-llim);
ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
if (job->ginfo->pair[ith].r2.nph>0)
{
ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
for (int i=0; i<job->nmaps; ++i)
ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
}
}
}
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
} /* end of parallel region */
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
}); /* end of parallel region */
}
}
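
Note: this hunk shows the commit's general OpenMP-to-mr_util translation (commit 54cbae0534, "multithreading demonstrated to work, but not safe yet"). A self-contained skeleton of the pattern; the 0 and 1 arguments are carried over from the calls above and are presumably a thread limit and a chunk size (not documented in this excerpt):

#include <cstdio>
#include "mr_util/threading.h"

void run_all(int nwork)
  {
  // replaces "#pragma omp parallel"
  mr::execDynamic(nwork, 0, 1, [&](mr::Scheduler &sched)
    {
    // per-thread setup (ringhelper, scratch buffers) went at the top of the
    // old parallel region and now goes at the top of the lambda
    while (auto rng = sched.getNext()) // replaces "#pragma omp for schedule(dynamic,1)"
      for (auto i = rng.lo; i < rng.hi; ++i)
        std::printf("item %d handled\n", int(i)); // placeholder work item
    // per-thread teardown runs before the lambda returns
    });
  }
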
NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
MRUTIL_NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
{
if (job->type == SHARP_MAP2ALM) return;
int pstride = job->s_m;
@ -805,35 +808,35 @@ NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
}
else
{
#pragma omp parallel
{
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
#pragma omp for schedule(dynamic,1)
for (int ith=llim; ith<ulim; ++ith)
mr::execDynamic(ulim-llim, 0, 1, [&](mr::Scheduler &sched)
{
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
ringhelper helper;
ringhelper_init(&helper);
int rstride=job->ginfo->nphmax+2;
double *ringtmp=RALLOC(double,job->nmaps*rstride);
while (auto rng=sched.getNext()) for(auto ith=rng.lo+llim; ith<rng.hi+llim; ++ith)
{
int dim2 = job->s_th*(ith-llim);
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
if (job->ginfo->pair[ith].r2.nph>0)
{
for (int i=0; i<job->nmaps; ++i)
ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
&ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
}
}
}
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
} /* end of parallel region */
DEALLOC(ringtmp);
ringhelper_destroy(&helper);
}); /* end of parallel region */
}
}
NOINLINE static void sharp_execute_job (sharp_job *job)
MRUTIL_NOINLINE static void sharp_execute_job (sharp_job *job)
{
double timer=sharp_wallTime();
job->opcnt=0;
@ -852,6 +855,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
&nchunks,&chunksize);
//FIXME: needs to be changed to "nm"
alloc_phase (job,mmax+1,chunksize);
std::atomic<size_t> opcnt{0};
/* chunk loop */
for (int chunk=0; chunk<nchunks; ++chunk)
@ -871,32 +875,30 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
/* map->phase where necessary */
map2phase (job, mmax, llim, ulim);
#pragma omp parallel
{
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
#pragma omp for schedule(dynamic,1)
for (int mi=0; mi<job->ainfo->nm; ++mi)
mr::execDynamic(job->ainfo->nm, 0, 1, [&](mr::Scheduler &sched)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
sharp_job ljob = *job;
ljob.opcnt=0;
sharp_Ylmgen_C generator;
sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
alloc_almtmp(&ljob,lmax);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi<rng.hi; ++mi)
{
/* alm->alm_tmp where necessary */
alm2almtmp (&ljob, lmax, mi);
inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
/* alm_tmp->alm where necessary */
almtmp2alm (&ljob, lmax, mi);
}
almtmp2alm (&ljob, lmax, mi);
}
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
sharp_Ylmgen_destroy(&generator);
dealloc_almtmp(&ljob);
#pragma omp critical
job->opcnt+=ljob.opcnt;
} /* end of parallel region */
opcnt+=ljob.opcnt;
}); /* end of parallel region */
/* phase->map where necessary */
phase2map (job, mmax, llim, ulim);
@ -909,6 +911,7 @@ NOINLINE static void sharp_execute_job (sharp_job *job)
DEALLOC(job->norm_l);
dealloc_phase (job);
job->opcnt = opcnt;
job->time=sharp_wallTime()-timer;
}
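
Note: the per-thread operation counter now accumulates into a std::atomic instead of an OpenMP critical section. A self-contained sketch of the pattern, with std::thread standing in for mr_util's thread pool:

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main()
  {
  std::atomic<std::size_t> opcnt{0}; // shared counter, no critical section needed
  std::vector<std::thread> pool;
  for (int t=0; t<4; ++t)
    pool.emplace_back([&]{
      std::size_t local=0;           // accumulate thread-privately ...
      for (int i=0; i<1000; ++i) local += std::size_t(i);
      opcnt += local;                // ... then do one atomic add per thread
      });
  for (auto &th : pool) th.join();
  std::printf("%zu\n", opcnt.load());
  }
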
@ -934,8 +937,8 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
job->flags|=SHARP_REAL_HARMONICS;
job->time = 0.;
job->opcnt = 0;
job->alm=alm;
job->map=map;
job->alm=(void **)alm;
job->map=(void **)map;
}
void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,

libsharp2/sharp_core.cc

@ -27,7 +27,7 @@
#define ARCH default
#define GENERIC_ARCH
#include "libsharp2/sharp_core_inc.c"
#include "libsharp2/sharp_core_inc.cc"
#undef GENERIC_ARCH
#undef ARCH

libsharp2/sharp_core_inc.cc

@ -34,7 +34,7 @@
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h>
#include <complex>
#include <math.h>
#include <string.h>
#include "libsharp2/sharp_vecsupport.h"
@ -49,7 +49,7 @@
// Unfortunately, most compilers don't act on this pragma yet.
#pragma STDC FP_CONTRACT ON
typedef complex double dcmplx;
typedef complex<double> dcmplx;
#define nv0 (128/VLEN)
#define nvx (64/VLEN)
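
Note: this file is compiled once per instruction set (see the per-ARCH CXXFLAGS in Makefile.am above); XARCH pastes the ARCH token onto each entry point so the objects export distinct symbols. XCONCATX itself lies just outside this hunk, so its usual two-step-paste definition is assumed here:

#define XCONCATX(a,b) a##_##b        // assumed definition, not shown in the excerpt
#define XCONCATX2(a,b) XCONCATX(a,b) // extra level so ARCH is macro-expanded before pasting
#define XARCH(a) XCONCATX2(a,ARCH)
// with -DARCH=avx2:  XARCH(inner_loop) -> inner_loop_avx2   (function name illustrative)
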
@ -94,35 +94,35 @@ typedef union
sxdata_s s;
} sxdata_u;
static inline void Tvnormalize (Tv * restrict val, Tv * restrict scale,
static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
double maxval)
{
const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
Tm mask = vgt(vabs(*val),vfmax);
while (vanyTrue(mask))
const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig;
auto mask = abs(*val)>vfmax;
while (any_of(mask))
{
vmuleq_mask(mask,*val,vfsmall);
vaddeq_mask(mask,*scale,vone);
mask = vgt(vabs(*val),vfmax);
where(mask,*val)*=vfsmall;
where(mask,*scale)+=1;
mask = abs(*val)>vfmax;
}
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
while (vanyTrue(mask))
mask = (abs(*val)<vfmin) & (*val!=0);
while (any_of(mask))
{
vmuleq_mask(mask,*val,vfbig);
vsubeq_mask(mask,*scale,vone);
mask = vand_mask(vlt(vabs(*val),vfmin),vne(*val,vzero));
where(mask,*val)*=vfbig;
where(mask,*scale)-=1;
mask = (abs(*val)<vfmin) & (*val!=0);
}
}
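
Note: the hand-rolled mask macros (vgt, vmuleq_mask, vanyTrue, ...) become std::experimental::simd where-expressions, as above. A minimal standalone sketch of the idiom (flush_tiny_to_zero and the 1e-10 threshold are illustrative):

#include <experimental/simd>
namespace stdx = std::experimental;
using Tv = stdx::native_simd<double>;

void flush_tiny_to_zero(Tv &val)
  {
  auto mask = abs(val) < 1e-10; // element-wise compare yields a simd_mask
  if (any_of(mask))             // found via ADL, as in the code above
    where(mask, val) = 0.;      // masked assignment replaces the vmuleq_mask-style macros
  }
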
static void mypow(Tv val, int npow, const double * restrict powlimit,
Tv * restrict resd, Tv * restrict ress)
static void mypow(Tv val, int npow, const double * MRUTIL_RESTRICT powlimit,
Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
{
Tv vminv=vload(powlimit[npow]);
Tm mask = vlt(vabs(val),vminv);
if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
Tv vminv=powlimit[npow];
auto mask = abs(val)<vminv;
if (none_of(mask)) // no underflows possible, use quick algorithm
{
Tv res=vone;
Tv res=1;
do
{
if (npow&1)
@ -131,11 +131,11 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
}
while(npow>>=1);
*resd=res;
*ress=vzero;
*ress=0;
}
else
{
Tv scale=vzero, scaleint=vzero, res=vone;
Tv scale=0, scaleint=0, res=1;
Tvnormalize(&val,&scaleint,sharp_fbighalf);
do
{
@ -155,8 +155,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
}
}
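
Note: mypow is binary exponentiation plus a dynamic-rescaling branch for values that would underflow. Its fast path, restated as scalar code for reference (the rescaling branch is omitted; pow_by_squaring is a hypothetical name):

double pow_by_squaring(double val, int npow) // npow >= 0
  {
  double res = 1.;
  do
    {
    if (npow & 1) res *= val;
    val *= val;
    } while (npow >>= 1);
  return res;
  }
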
static inline void getCorfac(Tv scale, Tv * restrict corfac,
const double * restrict cf)
static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
const double * MRUTIL_RESTRICT cf)
{
typedef union
{ Tv v; double s[VLEN]; } Tvu;
@ -169,67 +169,67 @@ static inline void getCorfac(Tv scale, Tv * restrict corfac,
*corfac=corf.v;
}
static inline int rescale(Tv * restrict v1, Tv * restrict v2, Tv * restrict s, Tv eps)
static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
{
Tm mask = vgt(vabs(*v2),eps);
if (vanyTrue(mask))
auto mask = abs(*v2)>eps;
if (any_of(mask))
{
vmuleq_mask(mask,*v1,vload(sharp_fsmall));
vmuleq_mask(mask,*v2,vload(sharp_fsmall));
vaddeq_mask(mask,*s,vone);
return 1;
where(mask,*v1)*=sharp_fsmall;
where(mask,*v2)*=sharp_fsmall;
where(mask,*s)+=1;
return true;
}
return 0;
return false;
}
NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen,
s0data_v * restrict d, int * restrict l_, int * restrict il_, int nv2)
MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
s0data_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int * MRUTIL_RESTRICT il_, int nv2)
{
int l=gen->m, il=0;
Tv mfac = vload((gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
Tv limscale=vload(sharp_limscale);
Tv mfac = (gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m];
Tv limscale=sharp_limscale;
int below_limit = 1;
for (int i=0; i<nv2; ++i)
{
d->lam1[i]=vzero;
d->lam1[i]=0;
mypow(d->sth[i],gen->m,gen->powlimit,&d->lam2[i],&d->scale[i]);
d->lam2[i] *= mfac;
Tvnormalize(&d->lam2[i],&d->scale[i],sharp_ftol);
below_limit &= vallTrue(vlt(d->scale[i],limscale));
below_limit &= all_of(d->scale[i]<limscale);
}
while (below_limit)
{
if (l+4>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1;
Tv a1=vload(gen->coef[il ].a), b1=vload(gen->coef[il ].b);
Tv a2=vload(gen->coef[il+1].a), b2=vload(gen->coef[il+1].b);
Tv a1=gen->coef[il ].a, b1=gen->coef[il ].b;
Tv a2=gen->coef[il+1].a, b2=gen->coef[il+1].b;
for (int i=0; i<nv2; ++i)
{
d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i];
d->lam2[i] = (a2*d->csq[i] + b2)*d->lam1[i] + d->lam2[i];
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
below_limit &= vallTrue(vlt(d->scale[i],vload(sharp_limscale)));
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
below_limit &= all_of(d->scale[i]<sharp_limscale);
}
l+=4; il+=2;
}
*l_=l; *il_=il;
}
NOINLINE static void alm2map_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, const dcmplx * MRUTIL_RESTRICT alm,
int l, int il, int lmax, int nv2)
{
if (nv2==nv0)
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=coef[il+1].a, b2=coef[il+1].b;
for (int i=0; i<nv0; ++i)
{
d->p1r[i] += d->lam2[i]*ar1;
@ -249,12 +249,12 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2]));
Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3]));
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=coef[il+1].a, b2=coef[il+1].b;
for (int i=0; i<nv2; ++i)
{
d->p1r[i] += d->lam2[i]*ar1;
@ -272,9 +272,9 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
}
for (; l<=lmax; ++il, l+=2)
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv a=coef[il].a, b=coef[il].b;
for (int i=0; i<nv2; ++i)
{
d->p1r[i] += d->lam2[i]*ar1;
@ -288,8 +288,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d,
}
}
NOINLINE static void calc_alm2map (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{
int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -298,20 +298,20 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ]));
Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv ar1=alm[l ].real(), ai1=alm[l ].imag();
Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Tv a=coef[il].a, b=coef[il].b;
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -322,9 +322,9 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
l+=2; ++il;
}
@ -338,16 +338,16 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job,
alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
}
NOINLINE static void map2alm_kernel(s0data_v * restrict d,
const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l,
MRUTIL_NOINLINE static void map2alm_kernel(s0data_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef, dcmplx * MRUTIL_RESTRICT alm, int l,
int il, int lmax, int nv2)
{
for (; l<=lmax-2; il+=2, l+=4)
{
Tv a1=vload(coef[il ].a), b1=vload(coef[il ].b);
Tv a2=vload(coef[il+1].a), b2=vload(coef[il+1].b);
Tv atmp1[4] = {vzero, vzero, vzero, vzero};
Tv atmp2[4] = {vzero, vzero, vzero, vzero};
Tv a1=coef[il ].a, b1=coef[il ].b;
Tv a2=coef[il+1].a, b2=coef[il+1].b;
Tv atmp1[4] = {0,0,0,0};
Tv atmp2[4] = {0,0,0,0};
for (int i=0; i<nv2; ++i)
{
atmp1[0] += d->lam2[i]*d->p1r[i];
@ -366,8 +366,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
}
for (; l<=lmax; ++il, l+=2)
{
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv atmp[4] = {vzero, vzero, vzero, vzero};
Tv a=coef[il].a, b=coef[il].b;
Tv atmp[4] = {0,0,0,0};
for (int i=0; i<nv2; ++i)
{
atmp[0] += d->lam2[i]*d->p1r[i];
@ -382,8 +382,8 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d,
}
}
NOINLINE static void calc_map2alm (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, s0data_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_map2alm (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, s0data_v * MRUTIL_RESTRICT d, int nth)
{
int l,il,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -392,19 +392,19 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 6*nth;
const sharp_ylmgen_dbl2 * restrict coef = gen->coef;
dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT coef = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv a=vload(coef[il].a), b=vload(coef[il].b);
Tv atmp[4] = {vzero, vzero, vzero, vzero};
Tv a=coef[il].a, b=coef[il].b;
Tv atmp[4] = {0,0,0,0};
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -415,9 +415,9 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
Tv tmp = (a*d->csq[i] + b)*d->lam2[i] + d->lam1[i];
d->lam1[i] = d->lam2[i];
d->lam2[i] = tmp;
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], vload(sharp_ftol)))
if (rescale(&d->lam1[i], &d->lam2[i], &d->scale[i], sharp_ftol))
getCorfac(d->scale[i], &d->corfac[i], gen->cf);
full_ieee &= vallTrue(vge(d->scale[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scale[i]>=sharp_minscale);
}
vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
l+=2; ++il;
@ -432,21 +432,21 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job,
map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
}
NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
sxdata_v * restrict d, int * restrict l_, int nv2)
MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * MRUTIL_RESTRICT gen,
sxdata_v * MRUTIL_RESTRICT d, int * MRUTIL_RESTRICT l_, int nv2)
{
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
Tv prefac=vload(gen->prefac[gen->m]),
prescale=vload(gen->fscale[gen->m]);
Tv limscale=vload(sharp_limscale);
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
Tv prefac=gen->prefac[gen->m],
prescale=gen->fscale[gen->m];
Tv limscale=sharp_limscale;
int below_limit=1;
for (int i=0; i<nv2; ++i)
{
Tv cth2=vmax(vload(1e-15),vsqrt((vone+d->cth[i])*vload(0.5)));
Tv sth2=vmax(vload(1e-15),vsqrt((vone-d->cth[i])*vload(0.5)));
Tm mask=vlt(d->sth[i],vzero);
vmuleq_mask(vand_mask(mask,vlt(d->cth[i],vzero)),cth2,vload(-1.));
vmuleq_mask(vand_mask(mask,vgt(d->cth[i],vzero)),sth2,vload(-1.));
Tv cth2=max(Tv(1e-15),sqrt((1.+d->cth[i])*0.5));
Tv sth2=max(Tv(1e-15),sqrt((1.-d->cth[i])*0.5));
auto mask=d->sth[i]<0;
where(mask&(d->cth[i]<0),cth2)*=-1.;
where(mask&(d->cth[i]>0),sth2)*=-1.;
Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
mypow(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
@ -454,8 +454,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
mypow(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
mypow(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
d->l1p[i] = vzero;
d->l1m[i] = vzero;
d->l1p[i] = 0;
d->l1m[i] = 0;
d->l2p[i] = prefac*ccp;
d->scp[i] = prescale+ccps;
d->l2m[i] = prefac*csp;
@ -467,17 +467,17 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
d->l2m[i] *= scp;
d->scm[i] += scps;
if (gen->preMinus_p)
d->l2p[i] = vneg(d->l2p[i]);
d->l2p[i] = -d->l2p[i];
if (gen->preMinus_m)
d->l2m[i] = vneg(d->l2m[i]);
d->l2m[i] = -d->l2m[i];
if (gen->s&1)
d->l2p[i] = vneg(d->l2p[i]);
d->l2p[i] = -d->l2p[i];
Tvnormalize(&d->l2m[i],&d->scm[i],sharp_ftol);
Tvnormalize(&d->l2p[i],&d->scp[i],sharp_ftol);
below_limit &= vallTrue(vlt(d->scm[i],limscale)) &&
vallTrue(vlt(d->scp[i],limscale));
below_limit &= all_of(d->scm[i]<limscale) &&
all_of(d->scp[i]<limscale);
}
int l=gen->mhi;
@ -486,18 +486,18 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
{
if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
below_limit=1;
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],vload(sharp_ftol)) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],vload(sharp_ftol)))
below_limit &= vallTrue(vlt(d->scp[i],limscale)) &&
vallTrue(vlt(d->scm[i],limscale));
if (rescale(&d->l1p[i],&d->l2p[i],&d->scp[i],sharp_ftol) ||
rescale(&d->l1m[i],&d->l2m[i],&d->scm[i],sharp_ftol))
below_limit &= all_of(d->scp[i]<limscale) &&
all_of(d->scm[i]<limscale);
}
l+=2;
}
@ -505,19 +505,19 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen,
*l_=l;
}
NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave = l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -537,12 +537,12 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -561,8 +561,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -571,25 +571,25 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
all_of(d->scm[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])),
acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1]));
Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])),
acr2=vload(creal(alm[2*l+3])), aci2=vload(cimag(alm[2*l+3]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(),
acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -611,12 +611,12 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
}
l+=2;
}
@ -641,17 +641,17 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job,
}
}
NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, dcmplx * restrict alm,
MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave=l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=0, agi2=0, acr2=0, aci2=0;
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -672,10 +672,10 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=0, agi2=0, acr2=0, aci2=0;
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -695,8 +695,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -705,15 +705,15 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 23*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
all_of(d->scm[i]>=sharp_minscale);
}
for (int i=0; i<nv2; ++i)
{
@ -726,10 +726,10 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
while((!full_ieee) && (l<=lmax))
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero;
Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero;
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv agr1=0, agi1=0, acr1=0, aci1=0;
Tv agr2=0, agi2=0, acr2=0, aci2=0;
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -748,12 +748,12 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
}
vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
@ -772,17 +772,17 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job,
}
NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v * MRUTIL_RESTRICT d,
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx, const dcmplx * MRUTIL_RESTRICT alm,
int l, int lmax, int nv2)
{
int lsave=l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
@ -798,10 +798,10 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
@ -816,8 +816,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
}
}
NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int nth)
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job * MRUTIL_RESTRICT job,
const sharp_Ylmgen_C * MRUTIL_RESTRICT gen, sxdata_v * MRUTIL_RESTRICT d, int nth)
{
int l,lmax=gen->lmax;
int nv2 = (nth+VLEN-1)/VLEN;
@ -826,23 +826,23 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 15*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
const sharp_ylmgen_dbl2 * MRUTIL_RESTRICT fx = gen->coef;
const dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
int full_ieee=1;
for (int i=0; i<nv2; ++i)
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))) &&
vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scp[i]>=sharp_minscale) &&
all_of(d->scm[i]>=sharp_minscale);
}
while((!full_ieee) && (l<=lmax))
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Tv ar1=alm[l ].real(), ai1=alm[l ].imag(),
ar2=alm[l+1].real(), ai2=alm[l+1].imag();
full_ieee=1;
for (int i=0; i<nv2; ++i)
{
@ -864,12 +864,12 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], sharp_ftol))
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
full_ieee &= all_of(d->scp[i]>=sharp_minscale);
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], sharp_ftol))
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
full_ieee &= all_of(d->scm[i]>=sharp_minscale);
}
l+=2;
}
@ -897,7 +897,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -912,7 +912,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
if (job->spin==0)
{
//adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
{
dcmplx al = alm[l];
@ -963,8 +963,8 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
d.s.p2r[i]*=cth_[tgt];
d.s.p2i[i]*=cth_[tgt];
int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double r1 = d.s.p1r[i] + d.s.p1i[i]*_Complex_I,
r2 = d.s.p2r[i] + d.s.p2i[i]*_Complex_I;
complex<double> r1(d.s.p1r[i], d.s.p1i[i]),
r2(d.s.p2r[i], d.s.p2i[i]);
job->phase[phas_idx] = r1+r2;
if (ispair[tgt])
job->phase[phas_idx+1] = r1-r2;
@ -1027,10 +1027,10 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
{
int tgt=itgt[i];
int phas_idx = tgt*job->s_th + mi*job->s_m;
complex double q1 = d.s.p1pr[i] + d.s.p1pi[i]*_Complex_I,
q2 = d.s.p2pr[i] + d.s.p2pi[i]*_Complex_I,
u1 = d.s.p1mr[i] + d.s.p1mi[i]*_Complex_I,
u2 = d.s.p2mr[i] + d.s.p2mi[i]*_Complex_I;
complex<double> q1(d.s.p1pr[i], d.s.p1pi[i]),
q2(d.s.p2pr[i], d.s.p2pi[i]),
u1(d.s.p1mr[i], d.s.p1mi[i]),
u2(d.s.p2mr[i], d.s.p2mi[i]);
job->phase[phas_idx ] = q1+q2;
job->phase[phas_idx+2] = u1+u2;
if (ispair[tgt])
@ -1056,7 +1056,7 @@ NOINLINE static void inner_loop_a2m(sharp_job *job, const int *ispair,
}
}
NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
MRUTIL_NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
const double *cth_, const double *sth_, int llim, int ulim,
sharp_Ylmgen_C *gen, int mi, const int *mlim)
{
@ -1083,8 +1083,8 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
int phas_idx = ith*job->s_th + mi*job->s_m;
dcmplx ph1=job->phase[phas_idx];
dcmplx ph2=ispair[ith] ? job->phase[phas_idx+1] : 0.;
d.s.p1r[nth]=creal(ph1+ph2); d.s.p1i[nth]=cimag(ph1+ph2);
d.s.p2r[nth]=creal(ph1-ph2); d.s.p2i[nth]=cimag(ph1-ph2);
d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag();
d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag();
//adjust for new algorithm
d.s.p2r[nth]*=cth_[ith];
d.s.p2i[nth]*=cth_[ith];
@ -1105,7 +1105,7 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
}
}
//adjust the a_lm for the new algorithm
dcmplx * restrict alm=job->almtmp;
dcmplx * MRUTIL_RESTRICT alm=job->almtmp;
dcmplx alm2 = 0.;
double alold=0;
for (int il=0, l=gen->m; l<=gen->lmax; ++il,l+=2)
@ -1138,10 +1138,10 @@ NOINLINE static void inner_loop_m2a(sharp_job *job, const int *ispair,
p2U=ispair[ith] ? job->phase[phas_idx+3]:0.;
if ((gen->mhi-gen->m+gen->s)&1)
{ p2Q=-p2Q; p2U=-p2U; }
d.s.p1pr[nth]=creal(p1Q+p2Q); d.s.p1pi[nth]=cimag(p1Q+p2Q);
d.s.p1mr[nth]=creal(p1U+p2U); d.s.p1mi[nth]=cimag(p1U+p2U);
d.s.p2pr[nth]=creal(p1Q-p2Q); d.s.p2pi[nth]=cimag(p1Q-p2Q);
d.s.p2mr[nth]=creal(p1U-p2U); d.s.p2mi[nth]=cimag(p1U-p2U);
d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag();
d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag();
d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag();
d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag();
++nth;
}
++ith;

libsharp2/sharp_geomhelpers.cc

@ -29,7 +29,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_legendre_roots.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/pocketfft.h"
#include "mr_util/fft.h"
void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
const int *rings, const double *weight, sharp_geom_info **geom_info)
@ -155,9 +155,7 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
weight[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
}
if ((nrings&1)==0) weight[nrings-1]=0.;
pocketfft_plan_r plan = pocketfft_make_plan_r(nrings);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(nrings)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
for (int m=0; m<(nrings+1)/2; ++m)
{
@ -202,9 +200,7 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k) + dw;
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
weight[n]=weight[0];
for (int m=0; m<(nrings+1)/2; ++m)
@ -250,9 +246,7 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
for (int k=1; k<=(n/2-1); ++k)
weight[2*k-1]=2./(1.-4.*k*k);
weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
pocketfft_plan_r plan = pocketfft_make_plan_r(n);
pocketfft_backward_r(plan,weight,1.);
pocketfft_delete_plan_r(plan);
mr::r2r_fftpack({size_t(n)}, {sizeof(double)}, {sizeof(double)}, {0}, false, false, weight, weight, 1.);
for (int m=0; m<nrings; ++m)
weight[m]=weight[m+1];
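
Note: all three quadrature-weight computations drop their plan objects in favour of a single r2r_fftpack call, in place and in the same FFTPACK ("halfcomplex") storage the old backward transform used. The argument interpretation below is inferred from the call sites, not from mr_util documentation:

#include <cstddef>
#include "mr_util/fft.h"

// hypothetical wrapper mirroring the three call sites above:
void fftpack_backward_inplace(double *weight, int n)
  {
  mr::r2r_fftpack({std::size_t(n)},      // shape
    {sizeof(double)}, {sizeof(double)},  // input/output strides, apparently in bytes
    {0}, false, false,                   // axis list and two direction/format flags
    weight, weight, 1.);                 // in place, scale factor 1
  }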

libsharp2/sharp_internal.h

@ -28,14 +28,12 @@
#ifndef SHARP2_INTERNAL_H
#define SHARP2_INTERNAL_H
#ifdef __cplusplus
#error This header file cannot be included from C++, only from C
#endif
#include <complex.h>
#include <complex>
#include "libsharp2/sharp.h"
#include "libsharp2/sharp_ylmgen_c.h"
using std::complex;
typedef struct
{
sharp_jobtype type;
@ -45,9 +43,9 @@ typedef struct
void **map;
void **alm;
int s_m, s_th; // strides in m and theta direction
complex double *phase;
complex<double> *phase;
double *norm_l;
complex double *almtmp;
complex<double> *almtmp;
const sharp_geom_info *ginfo;
const sharp_alm_info *ainfo;
double time;

libsharp2/sharp_utils.h

@ -122,10 +122,4 @@ double sharp_wallTime(void);
}
#endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
#endif
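
Note: the GCC-only NOINLINE macro removed here is superseded by MRUTIL_NOINLINE (and plain C99 "restrict" by MRUTIL_RESTRICT) from "mr_util/useful_macros.h". Their definitions are not part of this diff; presumably they follow the same idea, along these lines:

#ifdef __GNUC__
#define MRUTIL_NOINLINE __attribute__((noinline))
#define MRUTIL_RESTRICT __restrict__
#else
#define MRUTIL_NOINLINE
#define MRUTIL_RESTRICT
#endif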

libsharp2/sharp_vecsupport.h

@ -28,193 +28,27 @@
#ifndef SHARP2_VECSUPPORT_H
#define SHARP2_VECSUPPORT_H
#include <math.h>
#include <cmath>
#include <complex>
using std::complex;
#include <experimental/simd>
using std::experimental::native_simd;
using std::experimental::reduce;
#ifndef VLEN
#include "mr_util/useful_macros.h"
#if (defined(__AVX512F__))
#define VLEN 8
#elif (defined (__AVX__))
#define VLEN 4
#elif (defined (__SSE2__))
#define VLEN 2
#else
#define VLEN 1
#endif
#endif
typedef double Ts;
#if (VLEN==1)
typedef double Tv;
typedef int Tm;
using Tv=native_simd<double>;
using Tm=Tv::mask_type;
using Ts=Tv::value_type;
static constexpr size_t VLEN=Tv::size();
#define vload(a) (a)
#define vzero 0.
#define vone 1.
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vneg(a) (-(a))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
#define vgt(a,b) ((a)>(b))
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
#define vanyTrue(a) (a)
#define vallTrue(a) (a)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{ cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
#endif
#if (VLEN==2)
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128d Tv;
typedef __m128d Tm;
#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
{ return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vload(a) _mm_set1_pd(a)
#define vzero _mm_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm_xor_pd(vload(-0.),a)
#define vabs(a) _mm_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b);
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
Tv d, _Complex double * restrict cc)
complex<double> * MRUTIL_RESTRICT cc)
{
union {Tv v; _Complex double c; } u1, u2;
#if defined(__SSE3__)
u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
_mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
#endif
cc[0]+=u1.c; cc[1]+=u2.c;
cc[0] += complex<double>(reduce(a,std::plus<>()),reduce(b,std::plus<>()));
cc[1] += complex<double>(reduce(c,std::plus<>()),reduce(d,std::plus<>()));
}
#endif
#if (VLEN==4)
#include <immintrin.h>
typedef __m256d Tv;
typedef __m256d Tm;
#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vload(a) _mm256_set1_pd(a)
#define vzero _mm256_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
#define vneg(a) _mm256_xor_pd(vload(-0.),a)
#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
tmp1=tmp3+tmp4;
union {Tv v; _Complex double c[2]; } u;
u.v=tmp1;
cc[0]+=u.c[0]; cc[1]+=u.c[1];
}
#endif
#if (VLEN==8)
#include <immintrin.h>
typedef __m512d Tv;
typedef __mmask8 Tm;
#define vload(a) _mm512_set1_pd(a)
#define vzero _mm512_setzero_pd()
#define vone vload(1.)
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vneg(a) _mm512_mul_pd(a,vload(-1.))
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
_Complex double * restrict cc)
{
cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
}
#endif
#endif
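
This hunk is the heart of the vectorization cleanup: four hand-written back ends (scalar, SSE2, AVX, AVX-512) are replaced by the Parallelism TS v2 types, so VLEN is now simply whatever native_simd<double> provides on the target. A minimal sketch of the replacement idioms — where() expressions instead of the v*eq_mask macros, and reduce() instead of the per-ISA horizontal sums — assuming an <experimental/simd> implementation is available:

#include <complex>
#include <experimental/simd>
namespace stdx = std::experimental;

using Tv = stdx::native_simd<double>;
using Tm = Tv::mask_type;

// Masked in-place add: replaces vaddeq_mask (vsubeq_mask and vmuleq_mask
// work the same way with -= and *=).
static void addeq_mask(Tm mask, Tv &a, Tv b)
  { where(mask, a) += b; }

// Horizontal sums into two complex accumulators: replaces the per-ISA
// vhsum_cmplx_special variants, matching the new code above.
static void hsum_cmplx(Tv a, Tv b, Tv c, Tv d, std::complex<double> *cc)
  {
  cc[0] += std::complex<double>(reduce(a, std::plus<>()), reduce(b, std::plus<>()));
  cc[1] += std::complex<double>(reduce(c, std::plus<>()), reduce(d, std::plus<>()));
  }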

View file

@@ -26,7 +26,8 @@
#include <stdio.h>
#include <string.h>
#include <complex.h>
#include <complex>
using std::complex;
#ifdef USE_MPI
#include "mpi.h"
#include "libsharp2/sharp_mpi.h"
@@ -38,7 +39,7 @@
#include "libsharp2/sharp_geomhelpers.h"
#include "libsharp2/sharp_almhelpers.h"
#include "libsharp2/sharp_utils.h"
#include "libsharp2/sharp_utils.c"
#include "libsharp2/sharp_utils.cc"
#include "test/memusage.h"
static void OpenMP_status(void)
@@ -94,7 +95,7 @@ static void sharp_module_startup (const char *name, int argc, int argc_expected,
exit(1);
}
typedef complex double dcmplx;
typedef complex<double> dcmplx;
int ntasks, mytask;
@@ -122,7 +123,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
{
double rv = drand(-1,1,&state);
double iv = (m==0) ? 0 : drand(-1,1,&state);
alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
alm[sharp_alm_index(helper,l,mi)] = dcmplx(rv,iv);
}
}
}
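
Worth noting in the hunk above: C's rv+_Complex_I*iv has no direct std::complex counterpart, since C++ provides no _Complex_I macro (C++14's 1.0i literal would be an alternative); the two-argument constructor is the drop-in replacement, and the same change recurs in check_sign_scale below. A tiny sketch:

#include <complex>
using dcmplx = std::complex<double>;

// was: alm[...] = rv + _Complex_I*iv;
static dcmplx make_alm(double rv, double iv)
  { return dcmplx(rv, iv); }
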
@@ -230,8 +231,7 @@ static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
sqsum[i]=0;
for (ptrdiff_t j=0; j<nalms; ++j)
{
sqsum[i]+=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
sqsum[i]+=norm(alm[i][j]);
alm[i][j]=-alm[i][j];
}
}
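
std::norm() returns the squared magnitude re*re+im*im — despite the name, no square root is involved — so it is an exact replacement for the hand-written sums here and in get_errors() below. A quick self-contained check, as a sketch:

#include <cassert>
#include <complex>

int main()
  {
  std::complex<double> z(3., 4.);
  assert(std::norm(z) == 25.);  // re*re + im*im, not the magnitude 5
  }
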
@@ -253,8 +253,7 @@ static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
for (ptrdiff_t j=0; j<nalms; ++j)
{
double sqr=creal(alm[i][j])*creal(alm[i][j])
+cimag(alm[i][j])*cimag(alm[i][j]);
double sqr=norm(alm[i][j]);
sum+=sqr;
if (sqr>maxdiff) maxdiff=sqr;
}
@@ -414,7 +413,7 @@ static void check_sign_scale(void)
ALLOC2D(alm,dcmplx,2,nalms);
for (int i=0; i<2; ++i)
for (int j=0; j<nalms; ++j)
alm[i][j]=1.+_Complex_I;
alm[i][j]=dcmplx(1.,1.);
sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
NULL,NULL);