cleanup

2018-12-15 13:02:56 +01:00 · 2018-12-15 13:02:56 +01:00 · 0b8222393f
commit 0b8222393f
parent c750162e2b
12 changed files with 281 additions and 428 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -19,14 +19,12 @@ src_sharp = \
  libsharp/sharp_core.h \
  libsharp/sharp_internal.h \
  libsharp/sharp_legendre_roots.h \
  libsharp/sharp_lowlevel.h \
  libsharp/sharp_vecsupport.h \
  libsharp/sharp_vecutil.h \
  libsharp/sharp_ylmgen_c.h
 include_HEADERS = \
  libsharp/sharp.h \
  libsharp/sharp_lowlevel.h \
  libsharp/sharp_geomhelpers.h \
  libsharp/sharp_almhelpers.h \
  libsharp/sharp_cxx.h
--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@ -23,21 +23,248 @@
 */
 /*! \file sharp.h
- *  Interface for the spherical transform library.
+ *  Portable interface for the spherical transform library.
 *
- *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
- *  \author Martin Reinecke
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */
-#ifndef PLANCK_SHARP_H
+#ifndef PLANCK_SHARP_LOWLEVEL_H
-#define PLANCK_SHARP_H
+#define PLANCK_SHARP_LOWLEVEL_H
 #include <stddef.h>
 #ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
+extern "C" {
 #endif
-#include <complex.h>
+/*! \internal
    Helper type containing information about a single ring. */
 typedef struct
  {
  double theta, phi0, weight, cth, sth;
  ptrdiff_t ofs;
  int nph, stride;
  } sharp_ringinfo;
-#include "sharp_lowlevel.h"
+/*! \internal
    Helper type containing information about a pair of rings with colatitudes
    symmetric around the equator. */
 typedef struct
  {
  sharp_ringinfo r1,r2;
  } sharp_ringpair;
 /*! \internal
    Type holding all required information about a map geometry. */
 typedef struct
  {
  sharp_ringpair *pair;
  int npairs, nphmax;
  } sharp_geom_info;
 /*! \defgroup almgroup Helpers for dealing with a_lm */
 /*! \{ */
 /*! \internal
    Helper type for index calculation in a_lm arrays. */
 typedef struct
  {
  /*! Maximum \a l index of the array */
  int lmax;
  /*! Number of different \a m values in this object */
  int nm;
  /*! Array with \a nm entries containing the individual m values */
  int *mval;
  /*! Combination of flags from sharp_almflags */
  int flags;
  /*! Array with \a nm entries containing the (hypothetical) indices of
      the coefficients with quantum numbers 0,\a mval[i] */
  ptrdiff_t *mvstart;
  /*! Stride between a_lm and a_(l+1),m */
  ptrdiff_t stride;
  } sharp_alm_info;
 /*! alm_info flags */
 typedef enum { SHARP_PACKED = 1,
               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
                    not present. mvstart is in units of *real* float/double for all
                    m; stride is in units of reals for m=0 and complex for m!=0 */
               SHARP_REAL_HARMONICS  = 1<<6
               /*!< Use the real spherical harmonic convention. For
                    m==0, the alm are treated exactly the same as in
                    the complex case.  For m!=0, alm[i] represent a
                    pair (+abs(m), -abs(m)) instead of (real, imag),
                    and the coefficients are scaled by a factor of
                    sqrt(2) relative to the complex case.  In other
                    words, (sqrt(.5) * alm[i]) recovers the
                    corresponding complex coefficient (when accessed
                    as complex).
                */
             } sharp_almflags;
 /*! Creates an a_lm data structure from the following parameters:
    \param lmax maximum \a l quantum number (>=0)
    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
    \param stride the stride between entries with identical \a m, and \a l
      differing by 1.
    \param mstart the index of the (hypothetical) coefficient with the
      quantum numbers 0,\a m. Must have \a mmax+1 entries.
    \param alm_info will hold a pointer to the newly created data structure
 */
 void sharp_make_alm_info (int lmax, int mmax, int stride,
  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
 /*! Creates an a_lm data structure which from the following parameters:
    \param lmax maximum \a l quantum number (\a >=0)
    \param nm number of different \a m (\a 0<=nm<=lmax+1)
    \param stride the stride between entries with identical \a m, and \a l
      differing by 1.
    \param mval array with \a nm entries containing the individual m values
    \param mvstart array with \a nm entries containing the (hypothetical)
      indices of the coefficients with the quantum numbers 0,\a mval[i]
    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
    \param alm_info will hold a pointer to the newly created data structure
 */
 void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
 /*! Returns the index of the coefficient with quantum numbers \a l,
    \a mval[mi].
    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
    the index for the coefficient with the quantum numbers \a l, \a mi. */
 ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
 /*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
    flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
    otherwise it is the number of complex coefficients (with m>=0). */
 ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
 /*! Deallocates the a_lm info object. */
 void sharp_destroy_alm_info (sharp_alm_info *info);
 /*! \} */
 /*! \defgroup geominfogroup Functions for dealing with geometry information */
 /*! \{ */
 /*! Creates a geometry information from a set of ring descriptions.
    All arrays passed to this function must have \a nrings elements.
    \param nrings the number of rings in the map
    \param nph the number of pixels in each ring
    \param ofs the index of the first pixel in each ring in the map array
    \param stride the stride between consecutive pixels
    \param phi0 the azimuth (in radians) of the first pixel in each ring
    \param theta the colatitude (in radians) of each ring
    \param wgt the pixel weight to be used for the ring in map2alm
      and adjoint map2alm transforms.
      Pass NULL to use 1.0 as weight for all rings.
    \param geom_info will hold a pointer to the newly created data structure
 */
 void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
  const int *stride, const double *phi0, const double *theta,
  const double *wgt, sharp_geom_info **geom_info);
 /*! Counts the number of grid points needed for (the local part of) a map described
    by \a info.
 */
 ptrdiff_t sharp_map_size(const sharp_geom_info *info);
 /*! Deallocates the geometry information in \a info. */
 void sharp_destroy_geom_info (sharp_geom_info *info);
 /*! \} */
 /*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
 /*! \{ */
 /*! Enumeration of SHARP job types. */
 typedef enum { SHARP_YtW=0,               /*!< analysis */
               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
               SHARP_Y=1,                 /*!< synthesis */
               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
               SHARP_Yt=2,                /*!< adjoint synthesis */
               SHARP_WY=3,                /*!< adjoint analysis */
               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
             } sharp_jobtype;
 /*! Job flags */
 typedef enum { SHARP_DP              = 1<<4,
               /*!< map and a_lm are in double precision */
               SHARP_ADD             = 1<<5,
               /*!< results are added to the output arrays, instead of
                    overwriting them */
               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
                  but its use here is deprecated in favor of having it in the sharp_alm_info */
               SHARP_NO_FFT          = 1<<7,
               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
             } sharp_jobflags;
 /*! Performs a libsharp SHT job. The interface deliberately does not use
  the C99 "complex" data type, in order to be callable from C89 and C++.
  \param type the type of SHT
  \param spin the spin of the quantities to be transformed
  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
    depends on whether the SHARP_DP flag is set.
  \param map contains pointers to the maps. If \a spin==0,
    map[0] points to the map of the first SHT, map[1] to that of the second
    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
    point to the maps of the first SHT, map[2] and map[3] to those of the
    second, etc. The exact data type of \a map depends on whether the SHARP_DP
    flag is set.
  \param geom_info A \c sharp_geom_info object compatible with the provided
    \a map arrays.
  \param alm_info A \c sharp_alm_info object compatible with the provided
    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
    exactly once.
  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
    \a alm is expected to have the type "complex double **" and \a map is
    expected to have the type "double **"; otherwise, the expected
    types are "complex float **" and "float **", respectively.
  \param time If not NULL, the wall clock time required for this SHT
    (in seconds) will be written here.
  \param opcnt If not NULL, a conservative estimate of the total floating point
    operation count for this SHT will be written here. */
 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
  int flags, double *time, unsigned long long *opcnt);
 void sharp_set_chunksize_min(int new_chunksize_min);
 void sharp_set_nchunks_max(int new_nchunks_max);
 typedef enum { SHARP_ERROR_NO_MPI = 1,
               /*!< libsharp not compiled with MPI support */
              } sharp_errors;
 /*! Works like sharp_execute_mpi, but is always present whether or not libsharp
    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
    is *(MPI_Comm*)pcomm.
    Other parameters are the same as sharp_execute_mpi.
    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
    (in which case nothing is done).
 */
 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt);
 /*! \} */
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@ -32,7 +32,7 @@
 #ifndef PLANCK_SHARP_ALMHELPERS_H
 #define PLANCK_SHARP_ALMHELPERS_H
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 #ifdef __cplusplus
 extern "C" {
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@ -25,131 +25,126 @@
 /*  \file sharp_complex_hacks.h
 *  support for converting vector types and complex numbers
 *
- *  Copyright (C) 2012-2016 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
 *  Author: Martin Reinecke
 */
 #ifndef SHARP_COMPLEX_HACKS_H
 #define SHARP_COMPLEX_HACKS_H
 #ifdef __cplusplus
 #error This header file cannot be included from C++, only from C
 #endif
 #include <math.h>
 #include <complex.h>
 #include "sharp_vecsupport.h"
 #define UNSAFE_CODE
 #if (VLEN==1)
-static inline complex double vhsum_cmplx(Tv a, Tv b)
+static inline _Complex double vhsum_cmplx(Tv a, Tv b)
  { return a+_Complex_I*b; }
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
  { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
 #endif
 #if (VLEN==2)
-static inline complex double vhsum_cmplx (Tv a, Tv b)
+static inline _Complex double vhsum_cmplx (Tv a, Tv b)
  {
 #if defined(__SSE3__)
  Tv tmp = _mm_hadd_pd(a,b);
 #else
-  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
+  Tv tmp = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
+           _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
 #endif
-  union {Tv v; complex double c; } u;
+  union {Tv v; _Complex double c; } u;
  u.v=tmp; return u.c;
  }
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
-  Tv d, complex double * restrict c1, complex double * restrict c2)
+  Tv d, _Complex double * restrict c1, _Complex double * restrict c2)
  {
 #ifdef UNSAFE_CODE
 #if defined(__SSE3__)
-  vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
+  *((__m128d *)c1) += _mm_hadd_pd(a,b);
-  vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
+  *((__m128d *)c2) += _mm_hadd_pd(c,d);
 #else
-  vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
+  *((__m128d *)c1) += _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-                               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
+                      _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
+  *((__m128d *)c2) += _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-                               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
+                      _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
 #endif
 #else
-  union {Tv v; complex double c; } u1, u2;
+  union {Tv v; _Complex double c; } u1, u2;
 #if defined(__SSE3__)
  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
 #else
-  u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
-              _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
-  u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
-              _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
 #endif
  *c1+=u1.c; *c2+=u2.c;
 #endif
  }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 #endif
 #if (VLEN==4)
-static inline complex double vhsum_cmplx (Tv a, Tv b)
+static inline _Complex double vhsum_cmplx (Tv a, Tv b)
  {
  Tv tmp=_mm256_hadd_pd(a,b);
  Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
  tmp=_mm256_add_pd(tmp,tmp2);
 #ifdef UNSAFE_CODE
-  complex double ret;
+  _Complex double ret;
  *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
  return ret;
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
  u.v=tmp; return u.c[0];
 #endif
  }
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
  {
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
+  tmp1=tmp3+tmp4;
 #ifdef UNSAFE_CODE
  *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
  *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
  u.v=tmp1;
  *c1+=u.c[0]; *c2+=u.c[1];
 #endif
  }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
  {
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
+  tmp1=tmp3+tmp4;
 #ifdef UNSAFE_CODE
  _mm256_storeu_pd((double *)cc,
    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
 #else
-  union {Tv v; complex double c[2]; } u;
+  union {Tv v; _Complex double c[2]; } u;
  u.v=tmp1;
  cc[0]+=u.c[0]; cc[1]+=u.c[1];
 #endif
@ -159,18 +154,18 @@ static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
 #if (VLEN==8)
-static inline complex double vhsum_cmplx(Tv a, Tv b)
+static inline _Complex double vhsum_cmplx(Tv a, Tv b)
  { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
+  _Complex double * restrict c1, _Complex double * restrict c2)
  {
  *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
  }
 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict cc)
+  _Complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
 #endif
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@ -114,8 +114,8 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
    do
      {
      if (npow&1)
-        vmuleq(res,val);
+        res*=val;
-      vmuleq(val,val);
+      val*=val;
      }
    while(npow>>=1);
    *resd=res;
@ -129,12 +129,12 @@ static void mypow(Tv val, int npow, const double * restrict powlimit,
      {
      if (npow&1)
        {
-        vmuleq(res,val);
+        res*=val;
-        vaddeq(scale,scaleint);
+        scale+=scaleint;
        Tvnormalize(&res,&scale,sharp_fbighalf);
        }
-      vmuleq(val,val);
+      val*=val;
-      vaddeq(scaleint,scaleint);
+      scaleint+=scaleint;
      Tvnormalize(&val,&scaleint,sharp_fbighalf);
      }
    while(npow>>=1);
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@ -33,7 +33,7 @@
 #define PLANCK_SHARP_CXX_H
 #include <complex>
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@ -32,7 +32,7 @@
 #ifndef PLANCK_SHARP_GEOMHELPERS_H
 #define PLANCK_SHARP_GEOMHELPERS_H
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 #ifdef __cplusplus
 extern "C" {
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@ -25,7 +25,7 @@
 /*! \file sharp_internal.h
 *  Internally used functionality for the spherical transform library.
 *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2018 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */
@ -36,6 +36,7 @@
 #error This header file cannot be included from C++, only from C
 #endif
 #include <complex.h>
 #include "sharp.h"
 #define SHARP_MAXTRANS 100
--- a/libsharp/sharp_lowlevel.h
+++ b/libsharp/sharp_lowlevel.h
@ -1,270 +0,0 @@
 /*
 *  This file is part of libsharp.
 *
 *  libsharp is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libsharp is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libsharp; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
 /*
 *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
 *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 *  (DLR).
 */
 /*! \file sharp_lowlevel.h
 *  Low-level, portable interface for the spherical transform library.
 *
 *  Copyright (C) 2012-2013 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */
 #ifndef PLANCK_SHARP_LOWLEVEL_H
 #define PLANCK_SHARP_LOWLEVEL_H
 #include <stddef.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*! \internal
    Helper type containing information about a single ring. */
 typedef struct
  {
  double theta, phi0, weight, cth, sth;
  ptrdiff_t ofs;
  int nph, stride;
  } sharp_ringinfo;
 /*! \internal
    Helper type containing information about a pair of rings with colatitudes
    symmetric around the equator. */
 typedef struct
  {
  sharp_ringinfo r1,r2;
  } sharp_ringpair;
 /*! \internal
    Type holding all required information about a map geometry. */
 typedef struct
  {
  sharp_ringpair *pair;
  int npairs, nphmax;
  } sharp_geom_info;
 /*! \defgroup almgroup Helpers for dealing with a_lm */
 /*! \{ */
 /*! \internal
    Helper type for index calculation in a_lm arrays. */
 typedef struct
  {
  /*! Maximum \a l index of the array */
  int lmax;
  /*! Number of different \a m values in this object */
  int nm;
  /*! Array with \a nm entries containing the individual m values */
  int *mval;
  /*! Combination of flags from sharp_almflags */
  int flags;
  /*! Array with \a nm entries containing the (hypothetical) indices of
      the coefficients with quantum numbers 0,\a mval[i] */
  ptrdiff_t *mvstart;
  /*! Stride between a_lm and a_(l+1),m */
  ptrdiff_t stride;
  } sharp_alm_info;
 /*! alm_info flags */
 typedef enum { SHARP_PACKED = 1,
               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
                    not present. mvstart is in units of *real* float/double for all
                    m; stride is in units of reals for m=0 and complex for m!=0 */
               SHARP_REAL_HARMONICS  = 1<<6
               /*!< Use the real spherical harmonic convention. For
                    m==0, the alm are treated exactly the same as in
                    the complex case.  For m!=0, alm[i] represent a
                    pair (+abs(m), -abs(m)) instead of (real, imag),
                    and the coefficients are scaled by a factor of
                    sqrt(2) relative to the complex case.  In other
                    words, (sqrt(.5) * alm[i]) recovers the
                    corresponding complex coefficient (when accessed
                    as complex).
                */
             } sharp_almflags;
 /*! Creates an a_lm data structure from the following parameters:
    \param lmax maximum \a l quantum number (>=0)
    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
    \param stride the stride between entries with identical \a m, and \a l
      differing by 1.
    \param mstart the index of the (hypothetical) coefficient with the
      quantum numbers 0,\a m. Must have \a mmax+1 entries.
    \param alm_info will hold a pointer to the newly created data structure
 */
 void sharp_make_alm_info (int lmax, int mmax, int stride,
  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
 /*! Creates an a_lm data structure which from the following parameters:
    \param lmax maximum \a l quantum number (\a >=0)
    \param nm number of different \a m (\a 0<=nm<=lmax+1)
    \param stride the stride between entries with identical \a m, and \a l
      differing by 1.
    \param mval array with \a nm entries containing the individual m values
    \param mvstart array with \a nm entries containing the (hypothetical)
      indices of the coefficients with the quantum numbers 0,\a mval[i]
    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
    \param alm_info will hold a pointer to the newly created data structure
 */
 void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
 /*! Returns the index of the coefficient with quantum numbers \a l,
    \a mval[mi].
    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
    the index for the coefficient with the quantum numbers \a l, \a mi. */
 ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
 /*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
    flag is set, this is number of "real" coeffecients (for m < 0 and m >= 0),
    otherwise it is the number of complex coefficients (with m>=0). */
 ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
 /*! Deallocates the a_lm info object. */
 void sharp_destroy_alm_info (sharp_alm_info *info);
 /*! \} */
 /*! \defgroup geominfogroup Functions for dealing with geometry information */
 /*! \{ */
 /*! Creates a geometry information from a set of ring descriptions.
    All arrays passed to this function must have \a nrings elements.
    \param nrings the number of rings in the map
    \param nph the number of pixels in each ring
    \param ofs the index of the first pixel in each ring in the map array
    \param stride the stride between consecutive pixels
    \param phi0 the azimuth (in radians) of the first pixel in each ring
    \param theta the colatitude (in radians) of each ring
    \param wgt the pixel weight to be used for the ring in map2alm
      and adjoint map2alm transforms.
      Pass NULL to use 1.0 as weight for all rings.
    \param geom_info will hold a pointer to the newly created data structure
 */
 void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
  const int *stride, const double *phi0, const double *theta,
  const double *wgt, sharp_geom_info **geom_info);
 /*! Counts the number of grid points needed for (the local part of) a map described
    by \a info.
 */
 ptrdiff_t sharp_map_size(const sharp_geom_info *info);
 /*! Deallocates the geometry information in \a info. */
 void sharp_destroy_geom_info (sharp_geom_info *info);
 /*! \} */
 /*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
 /*! \{ */
 /*! Enumeration of SHARP job types. */
 typedef enum { SHARP_YtW=0,               /*!< analysis */
               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
               SHARP_Y=1,                 /*!< synthesis */
               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
               SHARP_Yt=2,                /*!< adjoint synthesis */
               SHARP_WY=3,                /*!< adjoint analysis */
               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
             } sharp_jobtype;
 /*! Job flags */
 typedef enum { SHARP_DP              = 1<<4,
               /*!< map and a_lm are in double precision */
               SHARP_ADD             = 1<<5,
               /*!< results are added to the output arrays, instead of
                    overwriting them */
               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
                  but its use here is deprecated in favor of having it in the sharp_alm_info */
               SHARP_NO_FFT          = 1<<7,
               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
             } sharp_jobflags;
 /*! Performs a libsharp SHT job. The interface deliberately does not use
  the C99 "complex" data type, in order to be callable from C89 and C++.
  \param type the type of SHT
  \param spin the spin of the quantities to be transformed
  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
    depends on whether the SHARP_DP flag is set.
  \param map contains pointers to the maps. If \a spin==0,
    map[0] points to the map of the first SHT, map[1] to that of the second
    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
    point to the maps of the first SHT, map[2] and map[3] to those of the
    second, etc. The exact data type of \a map depends on whether the SHARP_DP
    flag is set.
  \param geom_info A \c sharp_geom_info object compatible with the provided
    \a map arrays.
  \param alm_info A \c sharp_alm_info object compatible with the provided
    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
    exactly once.
  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
    \a alm is expected to have the type "complex double **" and \a map is
    expected to have the type "double **"; otherwise, the expected
    types are "complex float **" and "float **", respectively.
  \param time If not NULL, the wall clock time required for this SHT
    (in seconds) will be written here.
  \param opcnt If not NULL, a conservative estimate of the total floating point
    operation count for this SHT will be written here. */
 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
  int flags, double *time, unsigned long long *opcnt);
 void sharp_set_chunksize_min(int new_chunksize_min);
 void sharp_set_nchunks_max(int new_nchunks_max);
 typedef enum { SHARP_ERROR_NO_MPI = 1,
               /*!< libsharp not compiled with MPI support */
              } sharp_errors;
 /*! Works like sharp_execute_mpi, but is always present whether or not libsharp
    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
    is *(MPI_Comm*)pcomm.
    Other parameters are the same as sharp_execute_mpi.
    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
    (in which case nothing is done).
 */
 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt);
 /*! \} */
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@ -33,7 +33,7 @@
 #define PLANCK_SHARP_MPI_H
 #include <mpi.h>
-#include "sharp_lowlevel.h"
+#include "sharp.h"
 #ifdef __cplusplus
 extern "C" {
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@ -40,32 +40,13 @@ typedef double Ts;
 #if (VLEN==1)
 typedef double Tv;
 typedef float Tv_s;
 typedef int Tm;
 #define vadd(a,b) ((a)+(b))
 #define vadd_s(a,b) ((a)+(b))
 #define vaddeq(a,b) ((a)+=(b))
 #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
 #define vsub(a,b) ((a)-(b))
 #define vsub_s(a,b) ((a)-(b))
 #define vsubeq(a,b) ((a)-=(b))
 #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
 #define vmul(a,b) ((a)*(b))
 #define vmul_s(a,b) ((a)*(b))
 #define vmuleq(a,b) ((a)*=(b))
 #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
 #define vfmaeq(a,b,c) ((a)+=(b)*(c))
 #define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
 #define vfmseq(a,b,c) ((a)-=(b)*(c))
 #define vabmc(a,b,c) ((a)*(b)-(c))
 #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
 #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
 #define vneg(a) (-(a))
 #define vload(a) (a)
 #define vload_s(a) (a)
 #define vloadu(p) (*(p))
 #define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@ -74,8 +55,6 @@ typedef int Tm;
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
 #define vor_mask(a,b) ((a)||(b))
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)
 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
@ -99,7 +78,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #endif
 typedef __m128d Tv;
 typedef __m128 Tv_s;
 typedef __m128d Tm;
 #if defined(__SSE4_1__)
@ -111,29 +89,11 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vzero _mm_setzero_pd()
 #define vone _mm_set1_pd(1.)
 #define vadd(a,b) _mm_add_pd(a,b)
 #define vadd_s(a,b) _mm_add_ps(a,b)
 #define vaddeq(a,b) a=_mm_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
 #define vsub(a,b) _mm_sub_pd(a,b)
 #define vsub_s(a,b) _mm_sub_ps(a,b)
 #define vsubeq(a,b) a=_mm_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
 #define vmul(a,b) _mm_mul_pd(a,b)
 #define vmul_s(a,b) _mm_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
 #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
 #define vabmc(a,b,c) _mm_sub_pd(_mm_mul_pd(a,b),c)
 #define vfmaaeq(a,b,c,d,e) \
  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vfmaseq(a,b,c,d,e) \
  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
 #define vload_s(a) _mm_set1_ps(a)
 #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
@ -146,10 +106,6 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
 #define vloadu(p) _mm_loadu_pd(p)
 #define vloadu_s(p) _mm_loadu_ps(p)
 #define vstoreu(p, v) _mm_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm_storeu_ps(p, v)
 #endif
@ -161,54 +117,17 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #endif
 typedef __m256d Tv;
 typedef __m256 Tv_s;
 typedef __m256d Tm;
 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
 #define vzero _mm256_setzero_pd()
 #define vone _mm256_set1_pd(1.)
 #define vadd(a,b) _mm256_add_pd(a,b)
 #define vadd_s(a,b) _mm256_add_ps(a,b)
 #define vaddeq(a,b) a=_mm256_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
 #define vsub(a,b) _mm256_sub_pd(a,b)
 #define vsub_s(a,b) _mm256_sub_ps(a,b)
 #define vsubeq(a,b) a=_mm256_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
 #define vmul(a,b) _mm256_mul_pd(a,b)
 #define vmul_s(a,b) _mm256_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
 #if (USE_FMA4)
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
 #define vabmc(a,b,c) _mm256_msub_pd(a,b,c)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #if (USE_FMA)
 #define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
 #define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
 #define vabmc(a,b,c) _mm256_fmsub_pd(a,b,c)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
 #define vabmc(a,b,c) _mm256_sub_pd(_mm256_mul_pd(a,b),c)
 #define vfmaaeq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
 #define vfmaseq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
 #endif
 #endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
 #define vload_s(a) _mm256_set1_ps(a)
 #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
@ -222,11 +141,6 @@ typedef __m256d Tm;
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)
 #define vloadu(p) _mm256_loadu_pd(p)
 #define vloadu_s(p) _mm256_loadu_ps(p)
 #define vstoreu(p, v) _mm256_storeu_pd(p, v)
 #define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
 #endif
 #if (VLEN==8)
@ -236,20 +150,8 @@ typedef __m256d Tm;
 typedef __m512d Tv;
 typedef __mmask8 Tm;
 #define vadd(a,b) _mm512_add_pd(a,b)
 #define vaddeq(a,b) a=_mm512_add_pd(a,b)
 #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
 #define vsub(a,b) _mm512_sub_pd(a,b)
 #define vsubeq(a,b) a=_mm512_sub_pd(a,b)
 #define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
 #define vmul(a,b) _mm512_mul_pd(a,b)
 #define vmuleq(a,b) a=_mm512_mul_pd(a,b)
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
 #define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
 //#define vabmc(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
 //#define vfms(a,b,c) _mm512_fnmadd_pd(b,c,a)
 #define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
 #define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
 #define vload(a) _mm512_set1_pd(a)
 #define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@ -25,7 +25,7 @@
 /*! \file sharp_vecutil.h
 *  Functionality related to vector instruction support
 *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
 *  \author Martin Reinecke
 */