Updated libsharp to commit 0787838ab3ec8afc0c28b98479a321ffba388980

2016-11-04 18:14:49 +01:00 · 2016-11-04 18:14:49 +01:00 · 23aa450a77
commit 23aa450a77
parent a933430c60
62 changed files with 5075 additions and 11205 deletions
--- a/external/sharp/libsharp/libsharp.dox
+++ b/external/sharp/libsharp/libsharp.dox
@ -7,10 +7,9 @@

 /*! \page introduction Introduction to libsharp

-  "SHARP" is an acronym for <i>Performant Spherical Harmonic Transforms</i>.
+  "SHARP" is an acronym for <i>Spherical HARmonic Package</i>.
  All user-visible data types and functions in this library start with
-  the prefix "sharp_", or with "sharps_" and "sharpd_" for single- and
-  double precision variants, respectively.
+  the prefix "sharp_" to avoid pollution of the global C namespace.

  <i>libsharp</i>'s main functionality is the conversion between <i>maps</i>
  on the sphere and <i>spherical harmonic coefficients</i> (or <i>a_lm</i>).
@ -57,7 +56,7 @@
  for generating often-used pixelisations like ECP grids, Gaussian grids,
  and Healpix grids.

-  Currently, SHARP supports the following kinds of transforms:
+  Currently, libsharp supports the following kinds of transforms:
  <ul>
  <li>scalar a_lm to map</li>
  <li>scalar map to a_lm</li>
@ -68,10 +67,10 @@
  <li>scalar a_lm to maps of first derivatives</li>
  </ul>

-  SHARP supports shared-memory parallelisation via OpenMP; this feature will
+  libsharp supports shared-memory parallelisation via OpenMP; this feature will
  be automatically enabled if the compiler supports it.

-  SHARP will also make use of SSE2 and AVX instructions when compiled for a
+  libsharp will also make use of SSE2 and AVX instructions when compiled for a
  platform known to support them.

  Support for MPI-parallel transforms is also available; in this mode,
@ -83,12 +82,4 @@
  single-precision transforms will most likely not be faster than their
  double-precision counterparts, but they will require significantly less
  memory.
-
-  Two example and benchmark programs are distributed with SHARP:
-  <ul>
-  <li>sharp_test.c checks the accuracy of the (iterative) map analysis
-      algorithm</li>
-  <li>sharp_bench.c determines the quickest transform strategy for a given
-      SHT</li>
-  </ul>
 */
--- a/external/sharp/libsharp/planck.make
+++ b/external/sharp/libsharp/planck.make
@ -7,14 +7,14 @@ FULL_INCLUDE+= -I$(SD)

 HDR_$(PKG):=$(SD)/*.h
 LIB_$(PKG):=$(LIBDIR)/libsharp.a
-BIN:=sharp_test sharp_acctest sharp_test_mpi sharp_bench sharp_bench2
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o
-ALLOBJ:=$(LIBOBJ) sharp_test.o sharp_acctest.o sharp_test_mpi.o sharp_bench.o sharp_bench2.o
+BIN:=sharp_testsuite
+LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o
+ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
 LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
 ALLOBJ:=$(ALLOBJ:%=$(OD)/%)

 ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
-$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
+$(OD)/sharp_core.o: $(SD)/sharp_core_inchelper.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c
 $(OD)/sharp.o: $(SD)/sharp_mpi.c
 BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)

--- a/external/sharp/libsharp/sharp.c
+++ b/external/sharp/libsharp/sharp.c
--- a/external/sharp/libsharp/sharp.h
+++ b/external/sharp/libsharp/sharp.h
@ -39,5 +39,7 @@
 #include <complex.h>

 #include "sharp_lowlevel.h"
+#include "sharp_legendre.h"
+#include "sharp_legendre_roots.h"

 #endif
--- a/external/sharp/libsharp/sharp_acctest.c
+++ b/external/sharp/libsharp/sharp_acctest.c
@ -1,267 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_acctest.c
-    Systematic accuracy test for libsharp.
-
-    Copyright (C) 2006-2012 Max-Planck-Society
-    \author Martin Reinecke
-*/
-
-#include <stdio.h>
-#include <string.h>
-#ifdef USE_MPI
-#include "mpi.h"
-#endif
-#include "sharp.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
-#include "sharp_announce.h"
-#include "sharp_core.h"
-
-typedef complex double dcmplx;
-
-static double drand (double min, double max)
-  { return min + (max-min)*rand()/(RAND_MAX+1.0); }
-
-static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
-  {
-  for (int mi=0;mi<helper->nm; ++mi)
-    {
-    int m=helper->mval[mi];
-    for (int l=m;l<=helper->lmax; ++l)
-      {
-      if ((l<spin)&&(m<spin))
-        alm[sharp_alm_index(helper,l,mi)] = 0.;
-      else
-        {
-        double rv = drand(-1,1);
-        double iv = (m==0) ? 0 : drand(-1,1);
-        alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
-        }
-      }
-    }
-  }
-
-static void measure_errors (dcmplx **alm, dcmplx **alm2,
-  ptrdiff_t nalms, int ncomp)
-  {
-  for (int i=0; i<ncomp; ++i)
-    {
-    double sum=0, sum2=0, maxdiff=0;
-    for (ptrdiff_t m=0; m<nalms; ++m)
-      {
-      double x=creal(alm[i][m])-creal(alm2[i][m]),
-             y=cimag(alm[i][m])-cimag(alm2[i][m]);
-      sum+=x*x+y*y;
-      sum2+=creal(alm[i][m])*creal(alm[i][m])+cimag(alm[i][m])*cimag(alm[i][m]);
-      if (fabs(x)>maxdiff) maxdiff=fabs(x);
-      if (fabs(y)>maxdiff) maxdiff=fabs(y);
-      }
-    sum=sqrt(sum/nalms);
-    sum2=sqrt(sum2/nalms);
-    UTIL_ASSERT((maxdiff<1e-10)&&(sum/sum2<1e-10),"error");
-    }
-  }
-
-static void check_sign_scale(void)
-  {
-  int lmax=50;
-  int mmax=lmax;
-  sharp_geom_info *tinfo;
-  int nrings=lmax+1;
-  int ppring=2*lmax+2;
-  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
-
-  /* flip theta to emulate the "old" Gaussian grid geometry */
-  for (int i=0; i<tinfo->npairs; ++i)
-    {
-    const double pi=3.141592653589793238462643383279502884197;
-    tinfo->pair[i].r1.cth=-tinfo->pair[i].r1.cth;
-    tinfo->pair[i].r2.cth=-tinfo->pair[i].r2.cth;
-    tinfo->pair[i].r1.theta=pi-tinfo->pair[i].r1.theta;
-    tinfo->pair[i].r2.theta=pi-tinfo->pair[i].r2.theta;
-    }
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-
-  for (int ntrans=1; ntrans<10; ++ntrans)
-    {
-    double **map;
-    ALLOC2D(map,double,2*ntrans,npix);
-
-    dcmplx **alm;
-    ALLOC2D(alm,dcmplx,2*ntrans,nalms);
-    for (int i=0; i<2*ntrans; ++i)
-      for (int j=0; j<nalms; ++j)
-        alm[i][j]=1.+_Complex_I;
-
-    sharp_execute(SHARP_ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
-      NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[it][0     ], 3.588246976618616912e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
-        "error");
-      }
-    sharp_execute(SHARP_ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
-      NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ], 2.750897760535633285e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2], 3.137704477368562905e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-8.405730859837063917e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-2.398026536095463346e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
-        "error");
-      }
-
-    sharp_execute(SHARP_ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,NULL,
-      NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-1.398186224727334448e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.456676000884031197e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.516249174408820863e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-3.173406200299964119e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
-        "error");
-      }
-
-    sharp_execute(SHARP_ALM2MAP_DERIV1,1,0,&alm[0],&map[0],tinfo,alms,ntrans,1,
-      0,NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-6.859393905369091105e-01,1e-11),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.103947835973212364e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.092463246472086439e+03,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-1.411433220713928165e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
-        "error");
-      }
-
-    DEALLOC2D(map);
-    DEALLOC2D(alm);
-    }
-
-  sharp_destroy_alm_info(alms);
-  sharp_destroy_geom_info(tinfo);
-  }
-
-static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
-  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int nv)
-  {
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  srand(4);
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  for (int i=0; i<ncomp; ++i)
-    random_alm(alm[i],alms,spin);
-
-  dcmplx **alm2;
-  ALLOC2D(alm2,dcmplx,ncomp,nalms);
-
-  sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,nv,
-    NULL,NULL);
-  sharp_execute(SHARP_MAP2ALM,spin,0,&alm2[0],&map[0],tinfo,alms,ntrans,1,nv,
-    NULL,NULL);
-  measure_errors(alm,alm2,nalms,ncomp);
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-  DEALLOC2D(alm2);
-
-  sharp_destroy_alm_info(alms);
-  }
-
-int main(void)
-  {
-#ifdef USE_MPI
-  MPI_Init(NULL,NULL);
-#endif
-  sharp_module_startup("sharp_acctest",1,1,"",1);
-
-  int lmax=127;
-
-  printf("Checking signs and scales.\n");
-  check_sign_scale();
-  printf("Passed.\n\n");
-
-  printf("Testing map analysis accuracy.\n");
-
-  sharp_geom_info *tinfo;
-  int nrings=lmax+1;
-  int ppring=2*lmax+2;
-  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
-  for (int nv=1; nv<=6; ++nv)
-    for (int ntrans=1; ntrans<=6; ++ntrans)
-      {
-      check_accuracy(tinfo,lmax,lmax,npix,0,ntrans,nv);
-      check_accuracy(tinfo,lmax,lmax,npix,1,ntrans,nv);
-      check_accuracy(tinfo,lmax,lmax,npix,2,ntrans,nv);
-      check_accuracy(tinfo,lmax,lmax,npix,3,ntrans,nv);
-      check_accuracy(tinfo,lmax,lmax,npix,30,ntrans,nv);
-      }
-  sharp_destroy_geom_info(tinfo);
-  printf("Passed.\n\n");
-
-#ifdef USE_MPI
-  MPI_Finalize();
-#endif
-  return 0;
-  }
--- a/external/sharp/libsharp/sharp_almhelpers.c
+++ b/external/sharp/libsharp/sharp_almhelpers.c
@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.c
 *  Spherical transform library
 *
- *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  Copyright (C) 2008-2013 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -41,7 +41,8 @@ void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
  info->mval = RALLOC(int,mmax+1);
  info->mvstart = RALLOC(ptrdiff_t,mmax+1);
  info->stride = stride;
-  int tval = 2*lmax+1;
+  info->flags = 0;
+  ptrdiff_t tval = 2*lmax+1;
  for (ptrdiff_t m=0; m<=mmax; ++m)
    {
    info->mval[m] = m;
@ -59,6 +60,7 @@ void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
  info->mval = RALLOC(int,mmax+1);
  info->mvstart = RALLOC(ptrdiff_t,mmax+1);
  info->stride = stride;
+  info->flags = 0;
  for (ptrdiff_t m=0; m<=mmax; ++m)
    {
    info->mval[m] = m;
@ -66,3 +68,27 @@ void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
    }
  *alm_info = info;
  }
+
+void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
+  int nm, const int *ms, sharp_alm_info **alm_info)
+  {
+  ptrdiff_t idx;
+  int f;
+  sharp_alm_info *info = RALLOC(sharp_alm_info,1);
+  info->lmax = lmax;
+  info->nm = nm;
+  info->mval = RALLOC(int,nm);
+  info->mvstart = RALLOC(ptrdiff_t,nm);
+  info->stride = stride;
+  info->flags = SHARP_PACKED | SHARP_REAL_HARMONICS;
+  idx = 0;  /* tracks the number of 'consumed' elements so far; need to correct by m */
+  for (int im=0; im!=nm; ++im)
+    {
+    int m=(ms==NULL)?im:ms[im];
+    f = (m==0) ? 1 : 2;
+    info->mval[im] = m;
+    info->mvstart[im] = stride * (idx - f * m);
+    idx += f * (lmax + 1 - m);
+    }
+  *alm_info = info;
+  }
--- a/external/sharp/libsharp/sharp_almhelpers.h
+++ b/external/sharp/libsharp/sharp_almhelpers.h
@ -50,6 +50,14 @@ void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
 void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
  sharp_alm_info **alm_info);

+/*! Initialises alm_info for mmajor, real, packed spherical harmonics.
+    Pass mmax + 1 to \a nm and NULL to \a ms in order to use everything;
+    otherwise you can pick a subset of m to process (should only be used
+    for MPI parallelization).
+    \ingroup almgroup */
+void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
+  int nm, const int *ms, sharp_alm_info **alm_info);
+
 #ifdef __cplusplus
 }
 #endif
--- a/external/sharp/libsharp/sharp_bench.c
+++ b/external/sharp/libsharp/sharp_bench.c
@ -1,149 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_bench.c
-    Copyright (C) 2012 Max-Planck-Society
-    \author Martin Reinecke
-*/
-
-#include <stdio.h>
-#include <string.h>
-#ifdef USE_MPI
-#include "mpi.h"
-#endif
-#include "sharp.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
-#include "sharp_announce.h"
-#include "sharp_core.h"
-
-typedef complex double dcmplx;
-
-static void bench_sht (int spin, int nv, sharp_jobtype type,
-  int ntrans, double *time, unsigned long long *opcnt)
-  {
-  int lmax=2047;
-  int mmax=128;
-  int nrings=512;
-  int ppring=1024;
-  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-  sharp_geom_info *tinfo;
-  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
-
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-  SET_ARRAY(map[0],0,npix*ncomp,0.);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
-
-  int nruns=0;
-  *time=1e30;
-  *opcnt=1000000000000000;
-  do
-    {
-    double jtime;
-    unsigned long long jopcnt;
-    sharp_execute(type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,nv,&jtime,
-      &jopcnt);
-
-    if (jopcnt<*opcnt) *opcnt=jopcnt;
-    if (jtime<*time) *time=jtime;
-    }
-  while (++nruns < 4);
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-
-  sharp_destroy_alm_info(alms);
-  sharp_destroy_geom_info(tinfo);
-  }
-
-int main(void)
-  {
-#ifdef USE_MPI
-  MPI_Init(NULL,NULL);
-#endif
-  sharp_module_startup("sharp_bench",1,1,"",1);
-
-  printf("Benchmarking SHTs.\n\n");
-  FILE *fp=fopen("sharp_oracle.inc","w");
-  UTIL_ASSERT(fp, "failed to open oracle file for writing");
-  fprintf(fp,"static const int maxtr = 6;\n");
-  fprintf(fp,"static const int nv_opt[6][2][3] = {\n");
-
-  const char *shtname[]={"map2alm","alm2map","a2mder1"};
-
-  for (int ntr=1; ntr<=6; ++ntr)
-    {
-    fprintf(fp,"{");
-    for (int spin=0; spin<=2; spin+=2)
-      {
-      fprintf(fp,"{");
-      for (sharp_jobtype type=SHARP_MAP2ALM; type<=SHARP_ALM2MAP_DERIV1; ++type)
-        {
-        if ((type==SHARP_ALM2MAP_DERIV1) && (spin==0))
-          fprintf(fp,"-1");
-        else
-          {
-          int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
-          unsigned long long opmin=1000000000000000, op;
-          double tmin=1e30;
-          double *time=RALLOC(double,sharp_get_nv_max()+1);
-          for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
-            {
-            bench_sht (spin,nv,type,ntr,&time[nv],&op);
-            if (op<opmin) opmin=op;
-            if (time[nv]<tmin)
-              { tmin=time[nv]; nvbest=nv; }
-            }
-          printf("nt: %d  %s  spin: %d   nv: %d   time: %6.3f   perf: %6.3f"
-            "   dev[%d]: %6.2f%%\n",ntr,shtname[type],
-            spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
-            (time[nvoracle]-tmin)/tmin*100.);
-          DEALLOC(time);
-          fprintf(fp,"%d",nvbest);
-          }
-        if (type!=SHARP_ALM2MAP_DERIV1) fprintf(fp,",");
-        }
-      fprintf(fp,(spin==0)?"},":"}");
-      printf("\n");
-      }
-    fprintf(fp,(ntr<6)?"},\n":"}\n");
-    }
-  fprintf(fp,"};\n");
-  fclose(fp);
-#ifdef USE_MPI
-  MPI_Finalize();
-#endif
-  return 0;
-  }
--- a/external/sharp/libsharp/sharp_bench2.c
+++ b/external/sharp/libsharp/sharp_bench2.c
@ -1,223 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_bench2.c
-    Copyright (C) 2012 Max-Planck-Society
-    \author Martin Reinecke
-*/
-
-#include <stdio.h>
-
-#if (defined(_OPENMP) && defined(USE_MPI))
-
-#include <stdlib.h>
-#include <string.h>
-#include <omp.h>
-#include <mpi.h>
-#include "sharp_mpi.h"
-#include "sharp.h"
-#include "sharp_vecutil.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
-#include "sharp_announce.h"
-#include "sharp_core.h"
-#include "memusage.h"
-
-typedef complex double dcmplx;
-
-int ntasks, mytask;
-
-static unsigned long long totalops (unsigned long long val)
-  {
-  unsigned long long tmp;
-  MPI_Allreduce (&val, &tmp,1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
-  return tmp;
-  }
-
-static double maxTime (double val)
-  {
-  double tmp;
-  MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-  return tmp;
-  }
-
-static double totalMem (double val)
-  {
-  double tmp;
-  MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  return tmp;
-  }
-
-static void reduce_alm_info(sharp_alm_info *ainfo)
-  {
-  int nmnew=0;
-  ptrdiff_t ofs = 0;
-  for (int i=mytask; i<ainfo->nm; i+=ntasks,++nmnew)
-    {
-    ainfo->mval[nmnew]=ainfo->mval[i];
-    ainfo->mvstart[nmnew]=ofs-ainfo->mval[nmnew];
-    ofs+=ainfo->lmax-ainfo->mval[nmnew]+1;
-    }
-  ainfo->nm=nmnew;
-  }
-
-static void reduce_geom_info(sharp_geom_info *ginfo)
-  {
-  int npairsnew=0;
-  ptrdiff_t ofs = 0;
-  for (int i=mytask; i<ginfo->npairs; i+=ntasks,++npairsnew)
-    {
-    ginfo->pair[npairsnew]=ginfo->pair[i];
-    ginfo->pair[npairsnew].r1.ofs=ofs;
-    ofs+=ginfo->pair[npairsnew].r1.nph;
-    ginfo->pair[npairsnew].r2.ofs=ofs;
-    if (ginfo->pair[npairsnew].r2.nph>0) ofs+=ginfo->pair[npairsnew].r2.nph;
-    }
-  ginfo->npairs=npairsnew;
-  }
-
-static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
-  {
-  ptrdiff_t res=0;
-  for (int i=0; i<ainfo->nm; ++i)
-    res += ainfo->lmax-ainfo->mval[i]+1;
-  return res;
-  }
-
-static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
-  {
-  ptrdiff_t res=0;
-  for (int i=0; i<ginfo->npairs; ++i)
-    {
-    res += ginfo->pair[i].r1.nph;
-    if (ginfo->pair[i].r2.nph>0) res += ginfo->pair[i].r2.nph;
-    }
-  return res;
-  }
-
-int main(int argc, char **argv)
-  {
-  MPI_Init(NULL,NULL);
-  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
-  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
-  int master=(mytask==0);
-
-  sharp_module_startup("sharp_bench2",argc,7,
-    "<healpix|ecp|gauss> <lmax> <nside|nphi> <a2m/m2a> <spin> <ntrans>",0);
-
-  int lmax=atoi(argv[2]);
-  sharp_jobtype jtype = (strcmp(argv[4],"a2m")==0) ?
-    SHARP_ALM2MAP : SHARP_MAP2ALM;
-  int spin=atoi(argv[5]);
-  int ntrans=atoi(argv[6]);
-
-  sharp_geom_info *tinfo;
-  ptrdiff_t npix=0;
-  int geom2=0;
-  if (strcmp(argv[1],"gauss")==0)
-    {
-    int nrings=geom2=lmax+1;
-    int ppring=atoi(argv[3]);
-    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
-    }
-  else if (strcmp(argv[1],"ecp")==0)
-    {
-    int nrings=geom2=2*lmax+2;
-    int ppring=atoi(argv[3]);
-    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
-    }
-  else if (strcmp(argv[1],"healpix")==0)
-    {
-    int nside=atoi(argv[3]);
-    if (nside<1) nside=1;
-    geom2=4*nside-1;
-    sharp_make_healpix_geom_info (nside, 1, &tinfo);
-    }
-  else
-    UTIL_FAIL("unknown grid geometry");
-
-  reduce_geom_info(tinfo);
-  npix=get_npix(tinfo);
-
-  int mmax=lmax;
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  reduce_alm_info(alms);
-  ptrdiff_t nalms=get_nalms(alms);
-
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-
-  for (int n=0; n<ncomp; ++n)
-    {
-    for (int i=0; i<npix; ++i) map[n][i]=1;
-    for (int i=0; i<nalms; ++i) alm[n][i]=1;
-    }
-
-  double time=1e20;
-  unsigned long long opcnt=0;
-  for (int ntries=0; (ntries<2)||(ntries*time<5); ++ntries)
-    {
-    double ltime;
-    unsigned long long lopcnt;
-    sharp_execute_mpi(MPI_COMM_WORLD,jtype,spin,0,&alm[0],&map[0],
-      tinfo,alms,ntrans,1,0,&ltime,&lopcnt);
-
-    ltime=maxTime(ltime);
-    if (ltime<time) { time=ltime; opcnt=totalops(lopcnt); }
-    }
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-
-  sharp_destroy_alm_info(alms);
-  sharp_destroy_geom_info(tinfo);
-
-  double mHWM=totalMem(VmHWM());
-
-  int nomp=omp_get_max_threads();
-
-  if (master)
-    printf("%-12s %-7s %-3s %2d %d %2d %3d %5d %5d %1d %.2e %7.2f %9.2f\n",
-      getenv("HOST"),argv[1],argv[4],spin,VLEN,nomp,ntasks,lmax,geom2,ntrans,
-      time,opcnt/(time*1e9),mHWM/(1<<20));
-
-  MPI_Finalize();
-  return 0;
-  }
-
-#else
-
-#include "c_utils.h"
-
-int main(void)
-  { UTIL_FAIL("Need OpenMP and MPI"); return 1; }
-
-#endif
--- a/external/sharp/libsharp/sharp_complex_hacks.h
+++ b/external/sharp/libsharp/sharp_complex_hacks.h
@ -25,7 +25,7 @@
 /*  \file sharp_complex_hacks.h
 *  support for converting vector types and complex numbers
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012,2013 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -132,4 +132,18 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,

 #endif

+#if (VLEN==8)
+
+static inline complex double vhsum_cmplx(Tv a, Tv b)
+  { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
+
+static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict c1, complex double * restrict c2)
+  {
+  *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
+  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
+  }
+
+#endif
+
 #endif
--- a/external/sharp/libsharp/sharp_core.c
+++ b/external/sharp/libsharp/sharp_core.c
@ -25,7 +25,7 @@
 /*! \file sharp_core.c
 *  Computational core
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2013 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -41,6 +41,7 @@

 typedef complex double dcmplx;

+// must be in the range [0;6]
 #define MAXJOB_SPECIAL 2

 #define XCONCAT2(a,b) a##_##b
@ -49,188 +50,188 @@ typedef complex double dcmplx;
 #define CONCAT3(a,b,c) XCONCAT3(a,b,c)

 #define nvec 1
-#include "sharp_inchelper1.inc.c"
+#include "sharp_core_inchelper.c"
 #undef nvec

 #define nvec 2
-#include "sharp_inchelper1.inc.c"
+#include "sharp_core_inchelper.c"
 #undef nvec

 #define nvec 3
-#include "sharp_inchelper1.inc.c"
+#include "sharp_core_inchelper.c"
 #undef nvec

 #define nvec 4
-#include "sharp_inchelper1.inc.c"
+#include "sharp_core_inchelper.c"
 #undef nvec

 #define nvec 5
-#include "sharp_inchelper1.inc.c"
+#include "sharp_core_inchelper.c"
 #undef nvec

 #define nvec 6
-#include "sharp_inchelper1.inc.c"
+#include "sharp_core_inchelper.c"
 #undef nvec

 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *idx)
+  const int *mlim)
  {
-  int njobs=job->ntrans;
+  int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
  if (njobs<=MAXJOB_SPECIAL)
    {
-    switch (njobs*16+job->nv)
+    switch (njobs*16+nv)
      {
-#if (MAXJOB_SPECIAL>=1)
+#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
      case 0x11:
-        CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x12:
-        CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x13:
-        CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x14:
-        CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x15:
-        CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x16:
-        CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
 #endif
-#if (MAXJOB_SPECIAL>=2)
+#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
      case 0x21:
-        CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x22:
-        CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x23:
-        CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x24:
-        CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x25:
-        CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x26:
-        CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
 #endif
-#if (MAXJOB_SPECIAL>=3)
+#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
      case 0x31:
-        CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x32:
-        CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x33:
-        CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x34:
-        CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x35:
-        CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x36:
-        CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
 #endif
-#if (MAXJOB_SPECIAL>=4)
+#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
      case 0x41:
-        CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x42:
-        CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x43:
-        CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x44:
-        CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x45:
-        CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x46:
-        CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
 #endif
-#if (MAXJOB_SPECIAL>=5)
+#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
      case 0x51:
-        CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x52:
-        CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x53:
-        CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x54:
-        CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x55:
-        CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x56:
-        CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
 #endif
-#if (MAXJOB_SPECIAL>=6)
+#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
      case 0x61:
-        CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x62:
-        CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x63:
-        CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x64:
-        CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x65:
-        CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
      case 0x66:
-        CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx);
+        CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
        return;
 #endif
      }
    }
-#if (MAXJOB_SPECIAL<6)
+#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
  else
    {
-    switch (job->nv)
+    switch (nv)
      {
      case 1:
        CONCAT2(inner_loop,1)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
        return;
      case 2:
        CONCAT2(inner_loop,2)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
        return;
      case 3:
        CONCAT2(inner_loop,3)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
        return;
      case 4:
        CONCAT2(inner_loop,4)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
        return;
      case 5:
        CONCAT2(inner_loop,5)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
        return;
      case 6:
        CONCAT2(inner_loop,6)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans);
+          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
        return;
      }
    }
--- a/external/sharp/libsharp/sharp_core.h
+++ b/external/sharp/libsharp/sharp_core.h
@ -25,7 +25,7 @@
 /*! \file sharp_core.h
 *  Interface for the computational core
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2013 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -41,7 +41,7 @@ extern "C" {

 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *idx);
+  const int *mlim);

 #ifdef __cplusplus
 }
--- a/external/sharp/libsharp/sharp_core_inc.c
+++ b/external/sharp/libsharp/sharp_core_inc.c
@ -70,31 +70,31 @@ static inline Tb Y(Tbprod)(Tb a, Tb b)
 static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
  { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }

-static inline void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
+static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
  double maxval)
  {
  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
  for (int i=0;i<nvec; ++i)
    {
-    Tv mask = vgt(vabs(val->v[i]),vfmax);
+    Tm mask = vgt(vabs(val->v[i]),vfmax);
    while (vanyTrue(mask))
      {
-      vmuleq(val->v[i],vblend(mask,vfsmall,vone));
-      vaddeq(scale->v[i],vblend(mask,vone,vzero));
+      vmuleq_mask(mask,val->v[i],vfsmall);
+      vaddeq_mask(mask,scale->v[i],vone);
      mask = vgt(vabs(val->v[i]),vfmax);
      }
-    mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
+    mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
    while (vanyTrue(mask))
      {
-      vmuleq(val->v[i],vblend(mask,vfbig,vone));
-      vsubeq(scale->v[i],vblend(mask,vone,vzero));
-      mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
+      vmuleq_mask(mask,val->v[i],vfbig);
+      vsubeq_mask(mask,scale->v[i],vone);
+      mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
      }
    }
  }

-static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
+static void Y(mypow) (Tb val, int npow, Tb * restrict resd,
  Tb * restrict ress)
  {
  Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
@ -131,13 +131,13 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
  int did_scale=0;
  for (int i=0;i<nvec; ++i)
    {
-    Tv mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
+    Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
    if (vanyTrue(mask))
      {
      did_scale=1;
-      Tv fact = vblend(mask,vload(sharp_fsmall),vone);
-      vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
-      vaddeq(scale->v[i],vblend(mask,vone,vzero));
+      vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
+      vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
+      vaddeq_mask(mask,scale->v[i],vone);
      }
    }
  return did_scale;
@ -146,29 +146,29 @@ static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
 static inline int Y(TballLt)(Tb a,double b)
  {
  Tv vb=vload(b);
-  Tv res=vlt(a.v[0],vb);
+  Tm res=vlt(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
-    res=vand(res,vlt(a.v[i],vb));
+    res=vand_mask(res,vlt(a.v[i],vb));
  return vallTrue(res);
  }
 static inline int Y(TballGt)(Tb a,double b)
  {
  Tv vb=vload(b);
-  Tv res=vgt(a.v[0],vb);
+  Tm res=vgt(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
-    res=vand(res,vgt(a.v[i],vb));
+    res=vand_mask(res,vgt(a.v[i],vb));
  return vallTrue(res);
  }
 static inline int Y(TballGe)(Tb a,double b)
  {
  Tv vb=vload(b);
-  Tv res=vge(a.v[0],vb);
+  Tm res=vge(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
-    res=vand(res,vge(a.v[i],vb));
+    res=vand_mask(res,vge(a.v[i],vb));
  return vallTrue(res);
  }

-static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
+static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
  const double * restrict cf)
  {
  Y(Tbu) sc, corf;
@ -220,7 +220,7 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
    }
  }

-static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
+static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
  Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
  Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
  {
@ -232,6 +232,11 @@ static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
    cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
    sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
    sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
+    Tm mask=vlt(sth.v[i],vzero);
+    Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
+    vmuleq_mask(cmask,cth2.v[i],vload(-1.));
+    Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
+    vmuleq_mask(smask,sth2.v[i],vload(-1.));
    }

  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
--- a/external/sharp/libsharp/sharp_core_inc2.c
+++ b/external/sharp/libsharp/sharp_core_inc2.c
@ -25,25 +25,17 @@
 /*! \file sharp_core_inc2.c
 *  Type-dependent code for the computational core
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2013 Max-Planck-Society
 *  \author Martin Reinecke
 */

-typedef struct
-  { Y(Tbri) j[njobs]; } Z(Tbrij);
-typedef union
-  { Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij);
-typedef struct
-  { Y(Tbqu) j[njobs]; } Z(Tbquj);
-typedef union
-  { Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj);
-
-static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
-  Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
+static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
+  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax)
+  int l, int lmax NJ1)
+  {
+if (njobs>1)
  {
-#if (njobs>1)
  while (l<lmax-2)
    {
    Tb lam_3, lam_4;
@ -64,8 +56,8 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
         ai4=vload(cimag(alm[njobs*(l+2)+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaaeq(p1->j[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
-        vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
+        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
+        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
        }
      Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
         ai3=vload(cimag(alm[njobs*(l+1)+j])),
@ -73,8 +65,8 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
         ai1=vload(cimag(alm[njobs*(l+3)+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaaeq(p2->j[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
-        vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
+        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
+        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
        }
      }
    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
@ -82,7 +74,7 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
    l+=4;
    }
-#endif
+  }
  while (l<lmax)
    {
    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
@ -94,15 +86,15 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
         ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
+        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
+        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
        }
      ar=vload(creal(alm[njobs*(l+1)+j]));
      ai=vload(cimag(alm[njobs*(l+1)+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(p2->j[j].r.v[i],lam_1.v[i],ar);
-        vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai);
+        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
+        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
        }
      }
    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
@ -117,16 +109,17 @@ static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
+        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
+        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
        }
      }
    }
  }

-static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
-  const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
+static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
+  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
+  NJ1)
  {
  while (l<lmax)
    {
@ -138,13 +131,13 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(tr1,lam_2.v[i],p1->j[j].r.v[i]);
-        vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]);
+        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
+        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
        }
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(tr2,lam_1.v[i],p2->j[j].r.v[i]);
-        vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]);
+        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
+        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
        }
      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
      }
@ -160,8 +153,8 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
      Tv tre=vzero, tim=vzero;
      for (int i=0; i<nvec; ++i)
        {
-        vfmaeq(tre,lam_2.v[i],p1->j[j].r.v[i]);
-        vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]);
+        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
+        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
        }
      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
      }
@ -169,14 +162,14 @@ static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
  }

 static void Z(calc_alm2map) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Z(Tbrij) * restrict p1,
-  Z(Tbrij) * restrict p2, int *done)
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
+  Y(Tbri) * restrict p2 NJ1)
  {
  int l,lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) { *done=1; return; }
+  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  Tb corfac;
@ -192,8 +185,8 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(p1->j[j].r.v[i],tmp,ar);
-        vfmaeq(p1->j[j].i.v[i],tmp,ai);
+        vfmaeq(p1[j].r.v[i],tmp,ar);
+        vfmaeq(p1[j].i.v[i],tmp,ai);
        }
      }
    if (++l>lmax) break;
@ -206,8 +199,8 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(p2->j[j].r.v[i],tmp,ar);
-        vfmaeq(p2->j[j].i.v[i],tmp,ai);
+        vfmaeq(p2[j].r.v[i],tmp,ar);
+        vfmaeq(p2[j].i.v[i],tmp,ai);
        }
      }
    if (++l>lmax) break;
@ -220,22 +213,22 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
      full_ieee = Y(TballGe)(scale,sharp_minscale);
      }
    }
-  if (l>lmax) { *done=1; return; }
+  if (l>lmax) return;

  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
  }

 static void Z(calc_map2alm) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
-  const Z(Tbrij) * restrict p2, int *done)
+  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
+  const Y(Tbri) * restrict p2 NJ1)
  {
  int lmax=gen->lmax;
  Tb lam_1,lam_2,scale;
  int l=gen->m;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) { *done=1; return; }
+  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
@ -251,12 +244,12 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p1->j[j].r.v[i]);
-        vfmaeq(tim,tmp,p1->j[j].i.v[i]);
+        vfmaeq(tre,tmp,p1[j].r.v[i]);
+        vfmaeq(tim,tmp,p1[j].i.v[i]);
        }
      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
      }
-    if (++l>lmax) { *done=1; return; }
+    if (++l>lmax) return;
    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
@ -266,12 +259,12 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p2->j[j].r.v[i]);
-        vfmaeq(tim,tmp,p2->j[j].i.v[i]);
+        vfmaeq(tre,tmp,p2[j].r.v[i]);
+        vfmaeq(tim,tmp,p2[j].i.v[i]);
        }
      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
      }
-    if (++l>lmax) { *done=1; return; }
+    if (++l>lmax) return;
    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
@ -283,11 +276,11 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
    }

  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
  }

-static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
+static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
@ -296,25 +289,25 @@ static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
    for (int i=0; i<nvec; ++i)
      {
      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px->j[j].qr.v[i],agr,lw);
-      vfmaeq(px->j[j].qi.v[i],agi,lw);
-      vfmaeq(px->j[j].ur.v[i],acr,lw);
-      vfmaeq(px->j[j].ui.v[i],aci,lw);
+      vfmaeq(px[j].qr.v[i],agr,lw);
+      vfmaeq(px[j].qi.v[i],agi,lw);
+      vfmaeq(px[j].ur.v[i],acr,lw);
+      vfmaeq(px[j].ui.v[i],aci,lw);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmseq(py->j[j].qr.v[i],aci,lx);
-      vfmaeq(py->j[j].qi.v[i],acr,lx);
-      vfmaeq(py->j[j].ur.v[i],agi,lx);
-      vfmseq(py->j[j].ui.v[i],agr,lx);
+      vfmseq(py[j].qr.v[i],aci,lx);
+      vfmaeq(py[j].qi.v[i],acr,lx);
+      vfmaeq(py[j].ur.v[i],agi,lx);
+      vfmseq(py[j].ui.v[i],agr,lx);
      }
    }
  }

-static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2,
+static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
+  const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
@ -326,26 +319,26 @@ static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2,
      {
      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
-      vfmaseq(p1->j[j].qr.v[i],agr1,lw1,aci2,lx2);
-      vfmaaeq(p1->j[j].qi.v[i],agi1,lw1,acr2,lx2);
-      vfmaaeq(p1->j[j].ur.v[i],acr1,lw1,agi2,lx2);
-      vfmaseq(p1->j[j].ui.v[i],aci1,lw1,agr2,lx2);
+      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
+      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
+      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
+      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
-      vfmaseq(p2->j[j].qr.v[i],agr2,lw2,aci1,lx1);
-      vfmaaeq(p2->j[j].qi.v[i],agi2,lw2,acr1,lx1);
-      vfmaaeq(p2->j[j].ur.v[i],acr2,lw2,agi1,lx1);
-      vfmaseq(p2->j[j].ui.v[i],aci2,lw2,agr1,lx1);
+      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
+      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
+      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
+      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
      }
    }
  }

-static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
-  const Z(Tbquj) * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm)
+static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
+  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
+  const Tb * restrict rxm, dcmplx * restrict alm NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
@ -353,27 +346,27 @@ static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
    for (int i=0; i<nvec; ++i)
      {
      Tv lw=vadd(rxp->v[i],rxm->v[i]);
-      vfmaeq(agr,px->j[j].qr.v[i],lw);
-      vfmaeq(agi,px->j[j].qi.v[i],lw);
-      vfmaeq(acr,px->j[j].ur.v[i],lw);
-      vfmaeq(aci,px->j[j].ui.v[i],lw);
+      vfmaeq(agr,px[j].qr.v[i],lw);
+      vfmaeq(agi,px[j].qi.v[i],lw);
+      vfmaeq(acr,px[j].ur.v[i],lw);
+      vfmaeq(aci,px[j].ui.v[i],lw);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx=vsub(rxm->v[i],rxp->v[i]);
-      vfmseq(agr,py->j[j].ui.v[i],lx);
-      vfmaeq(agi,py->j[j].ur.v[i],lx);
-      vfmaeq(acr,py->j[j].qi.v[i],lx);
-      vfmseq(aci,py->j[j].qr.v[i],lx);
+      vfmseq(agr,py[j].ui.v[i],lx);
+      vfmaeq(agi,py[j].ur.v[i],lx);
+      vfmaeq(acr,py[j].qi.v[i],lx);
+      vfmseq(aci,py[j].qr.v[i],lx);
      }
    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
    }
  }

-static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
-  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax)
+  int lmax NJ1)
  {
  while (l<lmax)
    {
@ -386,13 +379,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
-#if (njobs>1)
    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
-      &alm[2*njobs*(l+1)]);
-#else
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
-    Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]);
-#endif
+      &alm[2*njobs*(l+1)] NJ2);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
@ -405,12 +393,13 @@ static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
    l+=2;
    }
  if (l==lmax)
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
+    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
  }

-static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
-  const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
+static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
+  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
+  NJ1)
  {
  while (l<lmax)
    {
@ -423,8 +412,8 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
-    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)]);
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
+    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
@ -437,18 +426,19 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
    l+=2;
    }
  if (l==lmax)
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
  }

-static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
-  sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
+static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2 NJ1)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  Y(iter_to_ieee_spin)
+    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax)
-   { *done=1; return; }
+  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
@ -460,12 +450,12 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
               && Y(TballGe)(scalem,sharp_minscale);
  while (!full_ieee)
    {
-    Z(saddstep)(p1, p2,
-      Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), &alm[2*njobs*l]);
+    Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+      &alm[2*njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Z(saddstep)(p2, p1,
-      Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), &alm[2*njobs*l]);
+    Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+      &alm[2*njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -477,24 +467,24 @@ static void Z(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
      }
    }

-  if (l>lmax)
-    { *done=1; return; }
+  if (l>lmax) return;

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(alm2map_spin_kernel) (cth,p1,p2,
-    rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
+  Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+    lmax NJ2);
  }

-static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
-  sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
-  int *done)
+static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
+  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
+  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  Y(iter_to_ieee_spin)
+    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) { *done=1; return; }
+  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
@ -507,12 +497,12 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
  while (!full_ieee)
    {
    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l]);
-    if (++l>lmax) { *done=1; return; }
+    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
+    if (++l>lmax) return;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l]);
-    if (++l>lmax) { *done=1; return; }
+    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
+    if (++l>lmax) return;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
      {
@ -525,12 +515,11 @@ static void Z(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
+  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
  }

-static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
-  Z(Tbquj) * restrict py, const Tb rxp, const Tb rxm,
-  const dcmplx * restrict alm)
+static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
@ -538,22 +527,22 @@ static inline void Z(saddstep_d) (Z(Tbquj) * restrict px,
    for (int i=0; i<nvec; ++i)
      {
      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px->j[j].qr.v[i],ar,lw);
-      vfmaeq(px->j[j].qi.v[i],ai,lw);
+      vfmaeq(px[j].qr.v[i],ar,lw);
+      vfmaeq(px[j].qi.v[i],ai,lw);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmaeq(py->j[j].ur.v[i],ai,lx);
-      vfmseq(py->j[j].ui.v[i],ar,lx);
+      vfmaeq(py[j].ur.v[i],ai,lx);
+      vfmseq(py[j].ui.v[i],ar,lx);
      }
    }
  }

-static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
-  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax)
+  int lmax NJ1)
  {
  while (l<lmax)
    {
@ -566,8 +555,8 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
-    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l]);
-    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)]);
+    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
+    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
@ -580,18 +569,19 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Z(Tbquj) * restrict p1,
    l+=2;
    }
  if (l==lmax)
-    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l]);
+    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
  }

-static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
-  sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
+static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
+  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2 NJ1)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  Y(iter_to_ieee_spin)
+    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax)
-   { *done=1; return; }
+  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
@ -604,11 +594,11 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
  while (!full_ieee)
    {
    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[njobs*l]);
+      &alm[njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[njobs*l]);
+      &alm[njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
@ -620,20 +610,20 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
      }
    }

-  if (l>lmax)
-    { *done=1; return; }
+  if (l>lmax) return;

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax);
+    lmax NJ2);
  }

+
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)

 static void Z(inner_loop) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *idx)
+  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];
@ -646,35 +636,32 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
      {
      if (job->spin==0)
        {
-        int done=0;
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Z(Tburij) p1,p2; VZERO(p1); VZERO(p2);
-          if (!done)
-            {
-            Y(Tbu) cth, sth;
+          Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;

-            for (int i=0; i<nval; ++i)
-              {
-              int itot=i+ith;
-              if (itot>=ulim-llim) itot=ulim-llim-1;
-              itot=idx[itot];
-              cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-              }
-            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
+          int skip=1;
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
            }
+          if (!skip)
+            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);

          for (int i=0; i<nval; ++i)
            {
            int itot=i+ith;
            if (itot<ulim-llim)
              {
-              itot=idx[itot];
              for (int j=0; j<njobs; ++j)
                {
-                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
-                complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I,
-                               r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I;
+                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
+                complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
+                               r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
                job->phase[phas_idx] = r1+r2;
                if (ispair[itot])
                  job->phase[phas_idx+1] = r1-r2;
@ -685,39 +672,38 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
        }
      else
        {
-        int done=0;
        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Z(Tbuquj) p1,p2; VZERO(p1); VZERO(p2);
-          if (!done)
-            {
-            Y(Tbu) cth;
+          Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+          int skip=1;

-            for (int i=0; i<nval; ++i)
-              {
-              int itot=i+ith;
-              if (itot>=ulim-llim) itot=ulim-llim-1;
-              itot=idx[itot];
-              cth.s[i]=cth_[itot];
-              }
-            (job->type==SHARP_ALM2MAP) ?
-              Z(calc_alm2map_spin  ) (cth.b,gen,job,&p1.b,&p2.b,&done) :
-              Z(calc_alm2map_deriv1) (cth.b,gen,job,&p1.b,&p2.b,&done);
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
            }
+          if (!skip)
+            (job->type==SHARP_ALM2MAP) ?
+              Z(calc_alm2map_spin  )
+                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
+              Z(calc_alm2map_deriv1)
+                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);

          for (int i=0; i<nval; ++i)
            {
            int itot=i+ith;
            if (itot<ulim-llim)
              {
-              itot=idx[itot];
              for (int j=0; j<njobs; ++j)
                {
-                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
-                complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I,
-                               q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I,
-                               u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I,
-                               u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I;
+                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
+                complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
+                               q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
+                               u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
+                               u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
                job->phase[phas_idx] = q1+q2;
                job->phase[phas_idx+2] = u1+u2;
                if (ispair[itot])
@ -740,70 +726,77 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
      {
      if (job->spin==0)
        {
-        int done=0;
-        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
+        for (int ith=0; ith<ulim-llim; ith+=nval)
          {
-          Z(Tburij) p1, p2; VZERO(p1); VZERO(p2);
+          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
          Y(Tbu) cth, sth;
+          int skip=1;

          for (int i=0; i<nval; ++i)
            {
            int itot=i+ith;
            if (itot>=ulim-llim) itot=ulim-llim-1;
-            itot=idx[itot];
+            if (mlim[itot]>=m) skip=0;
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
+              {
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
+                dcmplx ph1=job->phase[phas_idx];
+                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
+                p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
+                p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
+                }
+              }
+            }
+          if (!skip)
+            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+          }
+        }
+      else
+        {
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+          int skip=1;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1;
+            if (mlim[itot]>=m) skip=0;
            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
            if (i+ith<ulim-llim)
              {
              for (int j=0; j<njobs; ++j)
                {
-                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
-                dcmplx ph1=job->phase[phas_idx];
-                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-                p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2);
-                p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2);
-                }
-              }
-            }
-          Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
-          }
-        }
-      else
-        {
-        int done=0;
-        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
-          {
-          Z(Tbuquj) p1, p2; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            itot=idx[itot];
-            cth.s[i]=cth_[itot];
-            if (i+ith<ulim-llim)
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
+                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
                dcmplx p1Q=job->phase[phas_idx],
                       p1U=job->phase[phas_idx+2],
                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
                if ((gen->mhi-gen->m+gen->s)&1)
                  { p2Q=-p2Q; p2U=-p2U; }
-                p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q);
-                p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U);
-                p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q);
-                p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U);
+                p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
+                p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
+                p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
+                p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
                }
              }
            }
-          Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
+          if (!skip)
+            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
          }
        }
      break;
      }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
    }
  }

--- a/external/sharp/libsharp/sharp_core_inc3.c
+++ b/external/sharp/libsharp/sharp_core_inc3.c
@ -1,800 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core_inc3.c
- *  Type-dependent code for the computational core
- *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax, int njobs)
-  {
-  while (l<lmax-2)
-    {
-    Tb lam_3, lam_4;
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
-    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar2=vload(creal(alm[njobs*l+j])),
-         ai2=vload(cimag(alm[njobs*l+j])),
-         ar4=vload(creal(alm[njobs*(l+2)+j])),
-         ai4=vload(cimag(alm[njobs*(l+2)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
-        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
-        }
-      Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
-         ai3=vload(cimag(alm[njobs*(l+1)+j])),
-         ar1=vload(creal(alm[njobs*(l+3)+j])),
-         ai1=vload(cimag(alm[njobs*(l+3)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
-        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
-        }
-      }
-    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
-    l+=4;
-    }
-  while (l<lmax)
-    {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),
-         ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
-        }
-      ar=vload(creal(alm[njobs*(l+1)+j]));
-      ai=vload(cimag(alm[njobs*(l+1)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
-        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
-        }
-      }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    l+=2;
-    }
-  if (l==lmax)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
-        }
-      }
-    }
-  }
-
-static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
-  int njobs)
-  {
-  while (l<lmax)
-    {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
-        }
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
-        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
-        }
-      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
-      }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    l+=2;
-    }
-  if (l==lmax)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
-        }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
-    }
-  }
-
-static void Y(calc_alm2map) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2, int njobs, int *done)
-  {
-  int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) { *done=1; return; }
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
-
-  Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(p1[j].r.v[i],tmp,ar);
-        vfmaeq(p1[j].i.v[i],tmp,ai);
-        }
-      }
-    if (++l>lmax) break;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(p2[j].r.v[i],tmp,ar);
-        vfmaeq(p2[j].i.v[i],tmp,ai);
-        }
-      }
-    if (++l>lmax) break;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
-      }
-    }
-  if (l>lmax) { *done=1; return; }
-
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
-  }
-
-static void Y(calc_map2alm) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, int njobs, int *done)
-  {
-  int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
-  int l=gen->m;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) { *done=1; return; }
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p1[j].r.v[i]);
-        vfmaeq(tim,tmp,p1[j].i.v[i]);
-        }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
-    if (++l>lmax) { *done=1; return; }
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p2[j].r.v[i]);
-        vfmaeq(tim,tmp,p2[j].i.v[i]);
-        }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
-    if (++l>lmax) { *done=1; return; }
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
-      }
-    }
-
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
-  }
-
-static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
-       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],agr,lw);
-      vfmaeq(px[j].qi.v[i],agi,lw);
-      vfmaeq(px[j].ur.v[i],acr,lw);
-      vfmaeq(px[j].ui.v[i],aci,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmseq(py[j].qr.v[i],aci,lx);
-      vfmaeq(py[j].qi.v[i],acr,lx);
-      vfmaeq(py[j].ur.v[i],agi,lx);
-      vfmseq(py[j].ui.v[i],agr,lx);
-      }
-    }
-  }
-
-static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
-  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2, int njobs)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
-       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
-    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
-       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
-      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
-      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
-      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
-      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
-      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
-      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
-      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
-      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
-      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
-      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
-      }
-    }
-  }
-
-static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
-  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm, int njobs)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp->v[i],rxm->v[i]);
-      vfmaeq(agr,px[j].qr.v[i],lw);
-      vfmaeq(agi,px[j].qi.v[i],lw);
-      vfmaeq(acr,px[j].ur.v[i],lw);
-      vfmaeq(aci,px[j].ui.v[i],lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm->v[i],rxp->v[i]);
-      vfmseq(agr,py[j].ui.v[i],lx);
-      vfmaeq(agi,py[j].ur.v[i],lx);
-      vfmaeq(acr,py[j].qi.v[i],lx);
-      vfmseq(aci,py[j].qr.v[i],lx);
-      }
-    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
-    }
-  }
-
-static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax, int njobs)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
-      &alm[2*njobs*(l+1)], njobs);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l], njobs);
-  }
-
-static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
-  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
-  int njobs)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l],njobs);
-    Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)],njobs);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
-  }
-
-static void Y(calc_alm2map_spin) (const Tb cth, const sharp_Ylmgen_C *gen,
-  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
-  int *done)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax)
-   { *done=1; return; }
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[2*njobs*l],njobs);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[2*njobs*l], njobs);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax)
-    { *done=1; return; }
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax, njobs);
-  }
-
-static void Y(calc_map2alm_spin) (Tb cth, const sharp_Ylmgen_C * restrict gen,
-  sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
-  int njobs, int *done)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) { *done=1; return; }
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l], njobs);
-    if (++l>lmax) { *done=1; return; }
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l], njobs);
-    if (++l>lmax) { *done=1; return; }
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
-  }
-
-static inline void Y(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],ar,lw);
-      vfmaeq(px[j].qi.v[i],ai,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmaeq(py[j].ur.v[i],ai,lx);
-      vfmseq(py[j].ui.v[i],ar,lx);
-      }
-    }
-  }
-
-static void Y(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax, int njobs)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Y(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l],njobs);
-    Y(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)],njobs);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Y(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l], njobs);
-  }
-
-static void Y(calc_alm2map_deriv1) (const Tb cth, const sharp_Ylmgen_C *gen,
-  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
-  int *done)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax)
-   { *done=1; return; }
-  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Y(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[njobs*l],njobs);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Y(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[njobs*l], njobs);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax)
-    { *done=1; return; }
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Y(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax, njobs);
-  }
-
-
-#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
-
-static void Y(inner_loop) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *idx, int njobs)
-  {
-  const int nval=nvec*VLEN;
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_ALM2MAP:
-    case SHARP_ALM2MAP_DERIV1:
-      {
-      if (job->spin==0)
-        {
-        int done=0;
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
-          if (!done)
-            {
-            Y(Tbu) cth, sth;
-
-            for (int i=0; i<nval; ++i)
-              {
-              int itot=i+ith;
-              if (itot>=ulim-llim) itot=ulim-llim-1;
-              itot=idx[itot];
-              cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-              }
-            Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
-            }
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              itot=idx[itot];
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
-                complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
-                               r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
-                job->phase[phas_idx] = r1+r2;
-                if (ispair[itot])
-                  job->phase[phas_idx+1] = r1-r2;
-                }
-              }
-            }
-          }
-        }
-      else
-        {
-        int done=0;
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
-          if (!done)
-            {
-            Y(Tbu) cth;
-
-            for (int i=0; i<nval; ++i)
-              {
-              int itot=i+ith;
-              if (itot>=ulim-llim) itot=ulim-llim-1;
-              itot=idx[itot];
-              cth.s[i]=cth_[itot];
-              }
-            (job->type==SHARP_ALM2MAP) ?
-              Y(calc_alm2map_spin  )
-                (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done) :
-              Y(calc_alm2map_deriv1)
-                (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
-            }
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              itot=idx[itot];
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
-                complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
-                               q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
-                               u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
-                               u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
-                job->phase[phas_idx] = q1+q2;
-                job->phase[phas_idx+2] = u1+u2;
-                if (ispair[itot])
-                  {
-                  dcmplx *phQ = &(job->phase[phas_idx+1]),
-                         *phU = &(job->phase[phas_idx+3]);
-                  *phQ = q1-q2;
-                  *phU = u1-u2;
-                  if ((gen->mhi-gen->m+gen->s)&1)
-                    { *phQ=-(*phQ); *phU=-(*phU); }
-                  }
-                }
-              }
-            }
-          }
-        }
-      break;
-      }
-    case SHARP_MAP2ALM:
-      {
-      if (job->spin==0)
-        {
-        int done=0;
-        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
-          {
-          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            itot=idx[itot];
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if (i+ith<ulim-llim)
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
-                dcmplx ph1=job->phase[phas_idx];
-                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-                p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
-                p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
-                }
-              }
-            }
-          Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
-          }
-        }
-      else
-        {
-        int done=0;
-        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval)
-          {
-          Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            itot=idx[itot];
-            cth.s[i]=cth_[itot];
-            if (i+ith<ulim-llim)
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
-                dcmplx p1Q=job->phase[phas_idx],
-                       p1U=job->phase[phas_idx+2],
-                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
-                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { p2Q=-p2Q; p2U=-p2U; }
-                p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
-                p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
-                p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
-                p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
-                }
-              }
-            }
-          Y(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
-          }
-        }
-      break;
-      }
-    }
-  }
-
-#undef VZERO
--- a/external/sharp/libsharp/sharp_core_inchelper.c
+++ b/external/sharp/libsharp/sharp_core_inchelper.c
@ -1,11 +1,21 @@
 #define Tb CONCAT2(Tb,nvec)
 #define Y(arg) CONCAT2(arg,nvec)
 #include "sharp_core_inc.c"
-#if (MAXJOB_SPECIAL<6)
-#include "sharp_core_inc3.c"
+
+#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
+#define NJ1 , int njobs
+#define NJ2 , njobs
+#define Z(arg) CONCAT2(arg,nvec)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef NJ1
+#undef NJ2
 #endif

-#if (MAXJOB_SPECIAL>=1)
+#define NJ1
+#define NJ2
+
+#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
 #define njobs 1
 #define Z(arg) CONCAT3(arg,nvec,njobs)
 #include "sharp_core_inc2.c"
@ -13,7 +23,7 @@
 #undef njobs
 #endif

-#if (MAXJOB_SPECIAL>=2)
+#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
 #define njobs 2
 #define Z(arg) CONCAT3(arg,nvec,njobs)
 #include "sharp_core_inc2.c"
@ -21,7 +31,7 @@
 #undef njobs
 #endif

-#if (MAXJOB_SPECIAL>=3)
+#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
 #define njobs 3
 #define Z(arg) CONCAT3(arg,nvec,njobs)
 #include "sharp_core_inc2.c"
@ -29,7 +39,7 @@
 #undef njobs
 #endif

-#if (MAXJOB_SPECIAL>=4)
+#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
 #define njobs 4
 #define Z(arg) CONCAT3(arg,nvec,njobs)
 #include "sharp_core_inc2.c"
@ -37,7 +47,7 @@
 #undef njobs
 #endif

-#if (MAXJOB_SPECIAL>=5)
+#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
 #define njobs 5
 #define Z(arg) CONCAT3(arg,nvec,njobs)
 #include "sharp_core_inc2.c"
@ -45,7 +55,7 @@
 #undef njobs
 #endif

-#if (MAXJOB_SPECIAL>=6)
+#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
 #define njobs 6
 #define Z(arg) CONCAT3(arg,nvec,njobs)
 #include "sharp_core_inc2.c"
@ -53,5 +63,8 @@
 #undef njobs
 #endif

+#undef NJ1
+#undef NJ2
+
 #undef Y
 #undef Tb
--- a/external/sharp/libsharp/sharp_cxx.h
+++ b/external/sharp/libsharp/sharp_cxx.h
@ -25,7 +25,7 @@
 /*! \file sharp_cxx.h
 *  Spherical transform library
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2015 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -35,7 +35,6 @@
 #include "sharp_lowlevel.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
-#include "xcomplex.h"

 class sharp_base
  {
@ -54,32 +53,50 @@ class sharp_base

    void set_general_geometry (int nrings, const int *nph, const ptrdiff_t *ofs,
      const int *stride, const double *phi0, const double *theta,
-      const double *weight)
+      const double *wgt)
      {
-      sharp_make_geom_info (nrings, nph, ofs, stride, phi0, theta, weight,
-        &ginfo);
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      sharp_make_geom_info (nrings, nph, ofs, stride, phi0, theta, wgt, &ginfo);
      }

    void set_ECP_geometry (int nrings, int nphi)
-      { sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &ginfo); }
+      {
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      sharp_make_ecp_geom_info (nrings, nphi, 0., 1, nphi, &ginfo);
+      }

    void set_Gauss_geometry (int nrings, int nphi)
-      { sharp_make_gauss_geom_info (nrings, nphi, 1, nphi, &ginfo); }
+      {
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      sharp_make_gauss_geom_info (nrings, nphi, 0., 1, nphi, &ginfo);
+      }

    void set_Healpix_geometry (int nside)
-      { sharp_make_healpix_geom_info (nside, 1, &ginfo); }
+      {
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      sharp_make_healpix_geom_info (nside, 1, &ginfo);
+      }

    void set_weighted_Healpix_geometry (int nside, const double *weight)
-      { sharp_make_weighted_healpix_geom_info (nside, 1, weight, &ginfo); }
+      {
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      sharp_make_weighted_healpix_geom_info (nside, 1, weight, &ginfo);
+      }

    void set_triangular_alm_info (int lmax, int mmax)
-      { sharp_make_triangular_alm_info (lmax, mmax, 1, &ainfo); }
+      {
+      if (ainfo) sharp_destroy_alm_info(ainfo);
+      sharp_make_triangular_alm_info (lmax, mmax, 1, &ainfo);
+      }
+
+    const sharp_geom_info* get_geom_info() const { return ginfo; }
+    const sharp_alm_info* get_alm_info() const { return ainfo; }
  };

 template<typename T> struct cxxjobhelper__ {};

 template<> struct cxxjobhelper__<double>
-  { enum {val=1}; };
+  { enum {val=SHARP_DP}; };

 template<> struct cxxjobhelper__<float>
  { enum {val=0}; };
@ -88,52 +105,49 @@ template<> struct cxxjobhelper__<float>
 template<typename T> class sharp_cxxjob: public sharp_base
  {
  private:
-    static void *conv (xcomplex<T> *ptr)
-      { return reinterpret_cast<void *>(ptr); }
-    static void *conv (const xcomplex<T> *ptr)
-      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
    static void *conv (T *ptr)
      { return reinterpret_cast<void *>(ptr); }
    static void *conv (const T *ptr)
      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }

  public:
-    void alm2map (const xcomplex<T> *alm, T *map, bool add)
+    void alm2map (const T *alm, T *map, bool add)
      {
      void *aptr=conv(alm), *mptr=conv(map);
-      sharp_execute (SHARP_ALM2MAP, 0, add, &aptr, &mptr, ginfo, ainfo, 1,
-        cxxjobhelper__<T>::val,0,0,0);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
+        flags,0,0);
      }
-    void alm2map_spin (const xcomplex<T> *alm1, const xcomplex<T> *alm2,
-      T *map1, T *map2, int spin, bool add)
+    void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2,
+      int spin, bool add)
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
      mptr[0]=conv(map1); mptr[1]=conv(map2);
-      sharp_execute (SHARP_ALM2MAP, spin, add, aptr, mptr, ginfo, ainfo, 1,
-        cxxjobhelper__<T>::val,0,0,0);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
-    void alm2map_der1 (const xcomplex<T> *alm, T *map1, T *map2, bool add)
+    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add)
      {
      void *aptr=conv(alm), *mptr[2];
      mptr[0]=conv(map1); mptr[1]=conv(map2);
-      sharp_execute (SHARP_ALM2MAP_DERIV1, 1, add,&aptr, mptr, ginfo, ainfo,
-        1, cxxjobhelper__<T>::val,0,0,0);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
-    void map2alm (const T *map, xcomplex<T> *alm, bool add)
+    void map2alm (const T *map, T *alm, bool add)
      {
      void *aptr=conv(alm), *mptr=conv(map);
-      sharp_execute (SHARP_MAP2ALM, 0, add, &aptr, &mptr, ginfo, ainfo, 1,
-        cxxjobhelper__<T>::val,0,0,0);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
      }
-    void map2alm_spin (const T *map1, const T *map2, xcomplex<T> *alm1,
-      xcomplex<T> *alm2, int spin, bool add)
+    void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
+      int spin, bool add)
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
      mptr[0]=conv(map1); mptr[1]=conv(map2);
-      sharp_execute (SHARP_MAP2ALM, spin, add, aptr, mptr, ginfo, ainfo, 1,
-        cxxjobhelper__<T>::val,0,0,0);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
      }
  };

--- a/external/sharp/libsharp/sharp_geomhelpers.c
+++ b/external/sharp/libsharp/sharp_geomhelpers.c
@ -25,30 +25,24 @@
 /*! \file sharp_geomhelpers.c
 *  Spherical transform library
 *
- *  Copyright (C) 2006-2011 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2006-2012 Max-Planck-Society<br>
+ *  Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl)
+ *  \author Martin Reinecke \author Pavel Holoborodko
 */

 #include <math.h>
 #include "sharp_geomhelpers.h"
+#include "sharp_legendre_roots.h"
 #include "c_utils.h"
+#include "ls_fft.h"
+#include <stdio.h>

-void sharp_make_healpix_geom_info (int nside, int stride,
-  sharp_geom_info **geom_info)
-  {
-  double *weight=RALLOC(double,2*nside);
-  SET_ARRAY(weight,0,2*nside,1);
-  sharp_make_weighted_healpix_geom_info (nside, stride, weight, geom_info);
-  DEALLOC(weight);
-  }
-
-void sharp_make_weighted_healpix_geom_info (int nside, int stride,
-  const double *weight, sharp_geom_info **geom_info)
+void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
+  const int *rings, const double *weight, sharp_geom_info **geom_info)
  {
  const double pi=3.141592653589793238462643383279502884197;
  ptrdiff_t npix=(ptrdiff_t)nside*nside*12;
  ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1);
-  int nrings=4*nside-1;

  double *theta=RALLOC(double,nrings);
  double *weight_=RALLOC(double,nrings);
@ -56,9 +50,10 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
  double *phi0=RALLOC(double,nrings);
  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
  int *stride_=RALLOC(int,nrings);
+  ptrdiff_t curofs=0, checkofs; /* checkofs used for assertion introduced when adding rings arg */
  for (int m=0; m<nrings; ++m)
    {
-    int ring=m+1;
+    int ring = (rings==NULL)? (m+1) : rings[m];
    ptrdiff_t northring = (ring>2*nside) ? 4*nside-ring : ring;
    stride_[m] = stride;
    if (northring < nside)
@ -66,7 +61,7 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
      theta[m] = 2*asin(northring/(sqrt(6.)*nside));
      nph[m] = 4*northring;
      phi0[m] = pi/nph[m];
-      ofs[m] = 2*northring*(northring-1)*stride;
+      checkofs = 2*northring*(northring-1)*stride;
      }
    else
      {
@ -78,14 +73,21 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
        phi0[m] = 0;
      else
        phi0[m] = pi/nph[m];
-      ofs[m] = (ncap + (northring-nside)*nph[m])*stride;
+      checkofs = (ncap + (northring-nside)*nph[m])*stride;
+      ofs[m] = curofs;
      }
    if (northring != ring) /* southern hemisphere */
      {
      theta[m] = pi-theta[m];
-      ofs[m] = (npix - nph[m])*stride - ofs[m];
+      checkofs = (npix - nph[m])*stride - checkofs;
+      ofs[m] = curofs;
      }
-    weight_[m]=4.*pi/npix*weight[northring-1];
+    weight_[m]=4.*pi/npix*((weight==NULL) ? 1. : weight[northring-1]);
+    if (rings==NULL) {
+        UTIL_ASSERT(curofs==checkofs, "Bug in computing ofs[m]");
+    }
+    ofs[m] = curofs;
+    curofs+=nph[m];
    }

  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight_,
@ -99,93 +101,13 @@ void sharp_make_weighted_healpix_geom_info (int nside, int stride,
  DEALLOC(stride_);
  }

-static void gauleg (double x1, double x2, double *x, double *w, int n)
+void sharp_make_weighted_healpix_geom_info (int nside, int stride,
+  const double *weight, sharp_geom_info **geom_info)
  {
-  const double pi = 3.141592653589793238462643383279502884197;
-  const double eps = 3.0E-14;
-
-  int m = (n+1)/2;
-  double xm = 0.5*(x2+x1);
-  double xl = 0.5*(x2-x1);
-  for(int i=1; i<=m; ++i)
-    {
-    double z = cos(pi*(i-0.25)/(n+0.5));
-    double pp;
-    int dobreak=0;
-    while(1)
-      {
-      double p1 = 1.0, p2 = 0.0;
-      double z1 = z;
-      int j;
-      for(j=1; j<=n; ++j)
-        {
-        double p3 = p2;
-        p2 = p1;
-        p1 = ((2*j-1)*z*p2-(j-1)*p3)/j;
-        }
-      pp = n*(z*p1-p2)/(z*z-1);
-      z = z1 - p1/pp;
-      if (dobreak) break;
-      if (fabs(z-z1) <= eps) dobreak=1;
-      }
-    x[i-1] = xm - xl*z;
-    x[n-i] = xm + xl*z;
-    w[i-1] = w[n-i] = 2*xl/((1-z*z)*pp*pp);
-    }
+  sharp_make_subset_healpix_geom_info(nside, stride, 4 * nside - 1, NULL, weight, geom_info);
  }

-static void makeweights (int bw, double *weights)
-  {
-  const double pi = 3.141592653589793238462643383279502884197;
-  const double fudge = pi/(4*bw);
-  for (int j=0; j<2*bw; ++j)
-    {
-    double tmpsum = 0;
-    for (int k=0; k<bw; ++k)
-      tmpsum += 1./(2*k+1) * sin((2*j+1)*(2*k+1)*fudge);
-    tmpsum *= sin((2*j+1)*fudge);
-    tmpsum *= 2./bw;
-    weights[j] = tmpsum;
-    /* weights[j + 2*bw] = tmpsum * sin((2*j+1)*fudge); */
-    }
-  }
-
-void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
-  int stride_lat, sharp_geom_info **geom_info)
-  {
-  const double pi=3.141592653589793238462643383279502884197;
-
-  double *theta=RALLOC(double,nrings);
-  double *weight=RALLOC(double,nrings);
-  int *nph=RALLOC(int,nrings);
-  double *phi0=RALLOC(double,nrings);
-  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
-  int *stride_=RALLOC(int,nrings);
-
-  gauleg(-1,1,theta,weight,nrings);
-
-  for (int m=0; m<nrings; ++m)
-    {
-    theta[m] = acos(-theta[m]);
-    nph[m]=nphi;
-    phi0[m]=0;
-    ofs[m]=(ptrdiff_t)m*stride_lat;
-    stride_[m]=stride_lon;
-    weight[m]*=2*pi/nphi;
-    }
-
-  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight,
-    geom_info);
-
-  DEALLOC(theta);
-  DEALLOC(weight);
-  DEALLOC(nph);
-  DEALLOC(phi0);
-  DEALLOC(ofs);
-  DEALLOC(stride_);
-  }
-
-void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
+void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
  {
  const double pi=3.141592653589793238462643383279502884197;
@ -197,12 +119,10 @@ void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
  int *stride_=RALLOC(int,nrings);

-  UTIL_ASSERT((nrings&1)==0,
-    "Even number of rings needed for equidistant grid!");
-  makeweights(nrings/2,weight);
+  sharp_legendre_roots(nrings,theta,weight);
  for (int m=0; m<nrings; ++m)
    {
-    theta[m] = (m+0.5)*pi/nrings;
+    theta[m] = acos(-theta[m]);
    nph[m]=nphi;
    phi0_[m]=phi0;
    ofs[m]=(ptrdiff_t)m*stride_lat;
@ -220,3 +140,178 @@ void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
  DEALLOC(ofs);
  DEALLOC(stride_);
  }
+
+/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
+void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
+  {
+  const double pi=3.141592653589793238462643383279502884197;
+
+  double *theta=RALLOC(double,nrings);
+  double *weight=RALLOC(double,nrings);
+  int *nph=RALLOC(int,nrings);
+  double *phi0_=RALLOC(double,nrings);
+  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
+  int *stride_=RALLOC(int,nrings);
+
+  weight[0]=2.;
+  for (int k=1; k<=(nrings-1)/2; ++k)
+    {
+    weight[2*k-1]=2./(1.-4.*k*k)*cos((k*pi)/nrings);
+    weight[2*k  ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
+    }
+  if ((nrings&1)==0) weight[nrings-1]=0.;
+  real_plan plan = make_real_plan(nrings);
+  real_plan_backward_fftpack(plan,weight);
+  kill_real_plan(plan);
+
+  for (int m=0; m<(nrings+1)/2; ++m)
+    {
+    theta[m]=pi*(m+0.5)/nrings;
+    theta[nrings-1-m]=pi-theta[m];
+    nph[m]=nph[nrings-1-m]=ppring;
+    phi0_[m]=phi0_[nrings-1-m]=phi0;
+    ofs[m]=(ptrdiff_t)m*stride_lat;
+    ofs[nrings-1-m]=(ptrdiff_t)((nrings-1-m)*stride_lat);
+    stride_[m]=stride_[nrings-1-m]=stride_lon;
+    weight[m]=weight[nrings-1-m]=weight[m]*2*pi/(nrings*nph[m]);
+    }
+
+  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
+    geom_info);
+
+  DEALLOC(theta);
+  DEALLOC(weight);
+  DEALLOC(nph);
+  DEALLOC(phi0_);
+  DEALLOC(ofs);
+  DEALLOC(stride_);
+  }
+
+/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
+void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
+  {
+  const double pi=3.141592653589793238462643383279502884197;
+
+  double *theta=RALLOC(double,nrings);
+  double *weight=RALLOC(double,nrings);
+  int *nph=RALLOC(int,nrings);
+  double *phi0_=RALLOC(double,nrings);
+  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
+  int *stride_=RALLOC(int,nrings);
+
+  int n=nrings-1;
+  SET_ARRAY(weight,0,nrings,0.);
+  double dw=-1./(n*n-1.+(n&1));
+  weight[0]=2.+dw;
+  for (int k=1; k<=(n/2-1); ++k)
+    weight[2*k-1]=2./(1.-4.*k*k) + dw;
+  weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
+  real_plan plan = make_real_plan(n);
+  real_plan_backward_fftpack(plan,weight);
+  kill_real_plan(plan);
+  weight[n]=weight[0];
+
+  for (int m=0; m<(nrings+1)/2; ++m)
+    {
+    theta[m]=pi*m/(nrings-1.);
+    if (theta[m]<1e-15) theta[m]=1e-15;
+    theta[nrings-1-m]=pi-theta[m];
+    nph[m]=nph[nrings-1-m]=ppring;
+    phi0_[m]=phi0_[nrings-1-m]=phi0;
+    ofs[m]=(ptrdiff_t)m*stride_lat;
+    ofs[nrings-1-m]=(ptrdiff_t)((nrings-1-m)*stride_lat);
+    stride_[m]=stride_[nrings-1-m]=stride_lon;
+    weight[m]=weight[nrings-1-m]=weight[m]*2*pi/(n*nph[m]);
+    }
+
+  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
+    geom_info);
+
+  DEALLOC(theta);
+  DEALLOC(weight);
+  DEALLOC(nph);
+  DEALLOC(phi0_);
+  DEALLOC(ofs);
+  DEALLOC(stride_);
+  }
+
+/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
+void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
+  {
+  const double pi=3.141592653589793238462643383279502884197;
+
+  double *theta=RALLOC(double,nrings);
+  double *weight=RALLOC(double,nrings+1);
+  int *nph=RALLOC(int,nrings);
+  double *phi0_=RALLOC(double,nrings);
+  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
+  int *stride_=RALLOC(int,nrings);
+
+  int n=nrings+1;
+  SET_ARRAY(weight,0,n,0.);
+  weight[0]=2.;
+  for (int k=1; k<=(n/2-1); ++k)
+    weight[2*k-1]=2./(1.-4.*k*k);
+  weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
+  real_plan plan = make_real_plan(n);
+  real_plan_backward_fftpack(plan,weight);
+  kill_real_plan(plan);
+  for (int m=0; m<nrings; ++m)
+    weight[m]=weight[m+1];
+
+  for (int m=0; m<(nrings+1)/2; ++m)
+    {
+    theta[m]=pi*(m+1)/(nrings+1.);
+    theta[nrings-1-m]=pi-theta[m];
+    nph[m]=nph[nrings-1-m]=ppring;
+    phi0_[m]=phi0_[nrings-1-m]=phi0;
+    ofs[m]=(ptrdiff_t)m*stride_lat;
+    ofs[nrings-1-m]=(ptrdiff_t)((nrings-1-m)*stride_lat);
+    stride_[m]=stride_[nrings-1-m]=stride_lon;
+    weight[m]=weight[nrings-1-m]=weight[m]*2*pi/(n*nph[m]);
+    }
+
+  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, weight,
+    geom_info);
+
+  DEALLOC(theta);
+  DEALLOC(weight);
+  DEALLOC(nph);
+  DEALLOC(phi0_);
+  DEALLOC(ofs);
+  DEALLOC(stride_);
+  }
+
+void sharp_make_mw_geom_info (int nrings, int ppring, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
+  {
+  const double pi=3.141592653589793238462643383279502884197;
+
+  double *theta=RALLOC(double,nrings);
+  int *nph=RALLOC(int,nrings);
+  double *phi0_=RALLOC(double,nrings);
+  ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings);
+  int *stride_=RALLOC(int,nrings);
+
+  for (int m=0; m<nrings; ++m)
+    {
+    theta[m]=pi*(2.*m+1.)/(2.*nrings-1.);
+    if (theta[m]>pi-1e-15) theta[m]=pi-1e-15;
+    nph[m]=ppring;
+    phi0_[m]=phi0;
+    ofs[m]=(ptrdiff_t)m*stride_lat;
+    stride_[m]=stride_lon;
+    }
+
+  sharp_make_geom_info (nrings, nph, ofs, stride_, phi0_, theta, NULL,
+    geom_info);
+
+  DEALLOC(theta);
+  DEALLOC(nph);
+  DEALLOC(phi0_);
+  DEALLOC(ofs);
+  DEALLOC(stride_);
+  }
--- a/external/sharp/libsharp/sharp_geomhelpers.h
+++ b/external/sharp/libsharp/sharp_geomhelpers.h
@ -25,7 +25,7 @@
 /*! \file sharp_geomhelpers.h
 *  SHARP helper function for the creation of grid geometries
 *
- *  Copyright (C) 2006-2011 Max-Planck-Society
+ *  Copyright (C) 2006-2013 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -39,26 +39,41 @@ extern "C" {
 #endif

 /*! Creates a geometry information describing a HEALPix map with an
-    Nside parameter \a nside.
+    Nside parameter \a nside. \a weight contains the relative ring
+    weights and must have \a 2*nside entries. The rings array contains
+    the indices of the rings, with 1 being the first ring at the north
+    pole; if NULL then we take them to be sequential. Pass 4 * nside - 1
+    as nrings and NULL to rings to get the full HEALPix grid.
+    \note if \a weight is a null pointer, all weights are assumed to be 1.
+    \note if \a rings is a null pointer, rings 1 through \a nrings are used.
    \ingroup geominfogroup */
-void sharp_make_healpix_geom_info (int nside, int stride,
-  sharp_geom_info **geom_info);
+void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
+  const int *rings, const double *weight, sharp_geom_info **geom_info);

 /*! Creates a geometry information describing a HEALPix map with an
    Nside parameter \a nside. \a weight contains the relative ring
    weights and must have \a 2*nside entries.
+    \note if \a weight is a null pointer, all weights are assumed to be 1.
    \ingroup geominfogroup */
 void sharp_make_weighted_healpix_geom_info (int nside, int stride,
  const double *weight, sharp_geom_info **geom_info);

+/*! Creates a geometry information describing a HEALPix map with an
+    Nside parameter \a nside.
+    \ingroup geominfogroup */
+static inline void sharp_make_healpix_geom_info (int nside, int stride,
+  sharp_geom_info **geom_info)
+  { sharp_make_weighted_healpix_geom_info (nside, stride, NULL, geom_info); }
+
 /*! Creates a geometry information describing a Gaussian map with \a nrings
    iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
-    pixel in each ring is 0. The index difference between two adjacent pixels
-    in an iso-latitude ring is \a stride_lon, the index difference between the
-    two start pixels in consecutive iso-latitude rings is \a stride_lat.
+    pixel in each ring is \a phi0 (in radians). The index difference between
+    two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
+    difference between the two start pixels in consecutive iso-latitude rings
+    is \a stride_lat.
    \ingroup geominfogroup */
-void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
-  int stride_lat, sharp_geom_info **geom_info);
+void sharp_make_gauss_geom_info (int nrings, int nphi, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info);

 /*! Creates a geometry information describing an ECP map with \a nrings
    iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
@ -68,11 +83,67 @@ void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
    is \a stride_lat.
    \note The spacing of pixel centers is equidistant in colatitude and
      longitude.
-    \note \a nrings must be an even number.
    \note The sphere is pixelized in a way that the colatitude of the first ring
-      is \a 0.5*(pi/nrings). There are no pixel centers at the poles.
+      is \a 0.5*(pi/nrings) and the colatitude of the last ring is
+      \a pi-0.5*(pi/nrings). There are no pixel centers at the poles.
+    \note This grid corresponds to Fejer's first rule.
    \ingroup geominfogroup */
-void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
+void sharp_make_fejer1_geom_info (int nrings, int nphi, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info);
+
+/*! Old name for sharp_make_fejer1_geom_info()
+    \ingroup geominfogroup */
+static inline void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
+  {
+  sharp_make_fejer1_geom_info (nrings, nphi, phi0, stride_lon, stride_lat,
+  geom_info);
+  }
+
+/*! Creates a geometry information describing an ECP map with \a nrings
+    iso-latitude rings and \a ppring pixels per ring. The azimuth of the first
+    pixel in each ring is \a phi0 (in radians). The index difference between
+    two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
+    difference between the two start pixels in consecutive iso-latitude rings
+    is \a stride_lat.
+    \note The spacing of pixel centers is equidistant in colatitude and
+      longitude.
+    \note The sphere is pixelized in a way that the colatitude of the first ring
+      is \a 0 and that of the last ring is \a pi.
+    \note This grid corresponds to Clenshaw-Curtis integration.
+    \ingroup geominfogroup */
+void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info);
+
+/*! Creates a geometry information describing an ECP map with \a nrings
+    iso-latitude rings and \a ppring pixels per ring. The azimuth of the first
+    pixel in each ring is \a phi0 (in radians). The index difference between
+    two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
+    difference between the two start pixels in consecutive iso-latitude rings
+    is \a stride_lat.
+    \note The spacing of pixel centers is equidistant in colatitude and
+      longitude.
+    \note The sphere is pixelized in a way that the colatitude of the first ring
+      is \a pi/(nrings+1) and that of the last ring is \a pi-pi/(nrings+1).
+    \note This grid corresponds to Fejer's second rule.
+    \ingroup geominfogroup */
+void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info);
+
+/*! Creates a geometry information describing a map with \a nrings
+    iso-latitude rings and \a ppring pixels per ring. The azimuth of the first
+    pixel in each ring is \a phi0 (in radians). The index difference between
+    two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
+    difference between the two start pixels in consecutive iso-latitude rings
+    is \a stride_lat.
+    \note The spacing of pixel centers is equidistant in colatitude and
+      longitude.
+    \note The sphere is pixelized in a way that the colatitude of the first ring
+      is \a pi/(2*nrings-1) and that of the last ring is \a pi.
+    \note This is the grid introduced by McEwen & Wiaux 2011.
+    \note This function does \e not define any quadrature weights.
+    \ingroup geominfogroup */
+void sharp_make_mw_geom_info (int nrings, int ppring, double phi0,
  int stride_lon, int stride_lat, sharp_geom_info **geom_info);

 #ifdef __cplusplus
--- a/external/sharp/libsharp/sharp_internal.h
+++ b/external/sharp/libsharp/sharp_internal.h
@ -25,8 +25,8 @@
 /*! \file sharp_internal.h
 *  Internally used functionality for the spherical transform library.
 *
- *  Copyright (C) 2006-2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

 #ifndef PLANCK_SHARP_INTERNAL_H
@ -38,23 +38,22 @@

 #include "sharp.h"

-typedef enum { FLOAT, DOUBLE } sharp_fde;
+#define SHARP_MAXTRANS 100

 typedef struct
  {
  sharp_jobtype type;
  int spin;
-  int add_output;
  int nmaps, nalm;
-  sharp_fde fde;
+  int flags;
  void **map;
  void **alm;
+  int s_m, s_th; // strides in m and theta direction
  complex double *phase;
  double *norm_l;
  complex double *almtmp;
  const sharp_geom_info *ginfo;
  const sharp_alm_info *ainfo;
-  int nv;
  double time;
  int ntrans;
  unsigned long long opcnt;
@ -62,5 +61,6 @@ typedef struct

 int sharp_get_nv_max (void);
 int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
+int sharp_get_mlim (int lmax, int spin, double sth, double cth);

 #endif
--- a/external/sharp/libsharp/sharp_legendre.c
+++ b/external/sharp/libsharp/sharp_legendre.c
--- a/external/sharp/libsharp/sharp_legendre.c.in
+++ b/external/sharp/libsharp/sharp_legendre.c.in
@ -0,0 +1,176 @@
+/*
+
+    NOTE NOTE NOTE
+
+    This file is edited in sharp_legendre.c.in which is then preprocessed.
+    Do not make manual  modifications to sharp_legendre.c.
+
+    NOTE NOTE NOTE
+
+*/
+
+
+/*
+ *  This file is part of libsharp.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file sharp_legendre.c.in
+ *
+ *  Copyright (C) 2015 University of Oslo
+ *  \author Dag Sverre Seljebotn
+ */
+
+#ifndef NO_LEGENDRE
+#if (VLEN==8)
+#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
+/* ...or test it (it probably works) and remove this check */
+#endif
+
+#ifndef SHARP_LEGENDRE_CS
+#define SHARP_LEGENDRE_CS 4
+#endif
+
+#define MAX_CS 6
+#if (SHARP_LEGENDRE_CS > MAX_CS)
+#error (SHARP_LEGENDRE_CS > MAX_CS)
+#endif
+
+#include "sharp_legendre.h"
+#include "sharp_vecsupport.h"
+
+#include <malloc.h>
+
+/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
+/*{ for cs in range(1, 7) }*/
+static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax,
+                                              {{scalar}} xarr[({{cs}}) * VLEN{{T}}],
+                                              {{scalar}} out[({{cs}}) * VLEN{{T}}]) {
+    /*{ for i in range(cs) }*/
+    Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}};
+    /*{ endfor }*/
+    Tv{{T}} W1, W2, b, R;
+    ptrdiff_t l;
+
+    /*{ for i in range(cs) }*/
+    x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
+    Pm1_{{i}} = vload{{T}}(1.0);
+    P_{{i}} = x{{i}};
+    b = vload{{T}}(*bl);
+    y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
+    /*{ endfor }*/
+    
+    b = vload{{T}}(*(bl + 1));
+    /*{ for i in range(cs) }*/
+    vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
+    /*{ endfor }*/
+
+    for (l = 2; l <= lmax; ++l) {
+        b = vload{{T}}(*(bl + l));
+        R = vload{{T}}(*(recfacs + l));
+        
+        /* 
+           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
+        */
+        /*{ for i in range(cs) }*/
+        Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}};
+        W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
+        W2 = W1;
+        W2 = vsub{{T}}(W2, Pm2_{{i}});
+        P_{{i}} = W1;
+        vfmaeq{{T}}(P_{{i}}, W2, R);
+        vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
+        /*{ endfor }*/
+
+    }
+    /*{ for i in range(cs) }*/
+    vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}});
+    /*{ endfor }*/
+}
+/*{ endfor }*/
+/*{ endfor }*/
+
+
+/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
+void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
+    /* (l - 1) / l, for l >= 2 */
+    ptrdiff_t l;
+    r[0] = 0;
+    r[1] = 1;
+    for (l = 2; l <= lmax; ++l) {
+        r[l] = ({{scalar}})(l - 1) / ({{scalar}})l;
+    }
+}
+/*{ endfor }*/
+
+/*
+  Compute sum_l b_l P_l(x_i) for all i. 
+ */
+
+#define LEN (SHARP_LEGENDRE_CS * VLEN)
+#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
+
+/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
+void sharp_legendre_transform{{T}}({{scalar}} *bl,
+                                   {{scalar}} *recfac,
+                                   ptrdiff_t lmax,
+                                   {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) {
+    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}];
+    int compute_recfac;
+    ptrdiff_t i, j, len;
+
+    compute_recfac = (recfac == NULL);
+    if (compute_recfac) {
+        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
+        sharp_legendre_transform_recfac{{T}}(recfac, lmax);
+    }
+
+    for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0;
+
+    for (i = 0; i < nx; i += LEN{{T}}) {
+        len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i);
+        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
+        switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) {
+          case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+          case 1:
+          case 0:
+              legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
+        }
+        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
+    }
+    if (compute_recfac) {
+        free(recfac);
+    }
+}
+/*{ endfor }*/
+
+#endif
--- a/external/sharp/libsharp/sharp_legendre.h
+++ b/external/sharp/libsharp/sharp_legendre.h
@ -0,0 +1,62 @@
+/*
+ *  This file is part of libsharp.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file sharp_legendre.h
+ *  Interface for the Legendre transform parts of the spherical transform library.
+ *
+ *  Copyright (C) 2015 University of Oslo
+ *  \author Dag Sverre Seljebotn
+ */
+
+#ifndef SHARP_LEGENDRE_H
+#define SHARP_LEGENDRE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NO_LEGENDRE
+/* Legendre transforms in double precision and, with the "_s" suffix, in
+   single precision.
+
+   sharp_legendre_transform, sharp_legendre_transform_s:
+     compute sum_l b_l P_l(x_i) for the nx points in x and store the results
+     in out (nx entries). bl holds the coefficients b_l. recfac holds the
+     recurrence factors produced by the matching *_recfac function; if it is
+     NULL, the factors are computed internally for this call.
+
+   sharp_legendre_transform_recfac, sharp_legendre_transform_recfac_s:
+     fill r, which must provide lmax+1 entries, with the recurrence factors
+     expected as recfac above. */
+void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
+                              double *out, ptrdiff_t nx);
+void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
+                                float *out, ptrdiff_t nx);
+void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
+void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/external/sharp/libsharp/sharp_legendre_roots.c
+++ b/external/sharp/libsharp/sharp_legendre_roots.c
@ -0,0 +1,67 @@
+/* Function adapted from GNU GSL file glfixed.c
+   Original author: Pavel Holoborodko (http://www.holoborodko.com)
+
+   Adjustments by M. Reinecke
+    - adjusted interface (keep epsilon internal, return full number of points)
+    - removed precomputed tables
+    - tweaked Newton iteration to obtain higher accuracy */
+
+#include <math.h>
+#include "sharp_legendre_roots.h"
+#include "c_utils.h"
+/* Returns 1-x*x. For |x|>0.1 the algebraically equivalent product
+   (1+x)*(1-x) is evaluated instead (presumably to limit cancellation
+   error when |x| is close to 1). */
+static inline double one_minus_x2 (double x)
+  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
+/* Computes the roots of the Legendre polynomial of degree n and the
+   corresponding Gaussian quadrature weights.
+   n: degree of the polynomial, i.e. the number of roots
+   x: output array with n entries for the roots; filled in mirrored pairs
+      x[i-1] = -x0, x[n-i] = x0
+   w: output array with n entries; w[k] is the weight belonging to x[k]
+
+   Only (n+1)/2 roots are iterated; each one starts from a closed-form guess
+   and is refined by Newton's method on P_n. Once a Newton step is no larger
+   than eps, one further step is taken before the loop is left. */
+void sharp_legendre_roots(int n, double *x, double *w)
+  {
+  const double pi = 3.141592653589793238462643383279502884197;
+  const double eps = 3e-14; /* Newton step size below which a root counts as converged */
+  int m = (n+1)>>1; /* number of roots that are iterated; the others follow by symmetry */
+
+  double t0 = 1 - (1-1./n) / (8.*n*n); /* n-dependent scale factor of the starting guess */
+  double t1 = 1./(4.*n+2.);
+
+#pragma omp parallel
+{
+  int i;
+#pragma omp for schedule(dynamic,100)
+  for (i=1; i<=m; ++i)
+    {
+    double x0 = cos(pi * ((i<<2)-1) * t1) * t0; /* starting guess cos(pi*(4i-1)/(4n+2))*t0 */
+
+    int dobreak=0; /* set once the step is small enough; one more iteration follows */
+    int j=0; /* iteration counter, limited by the assertion below */
+    double dpdx;
+    while(1)
+      {
+      double P_1 = 1.0;
+      double P0 = x0;
+      double dx, x1;
+
+      for (int k=2; k<=n; k++) /* recurrence in the degree k; afterwards P0=P_n(x0), P_1=P_(n-1)(x0) */
+        {
+        double P_2 = P_1;
+        P_1 = P0;
+//        P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
+        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2); /* same recurrence as the commented-out line, rearranged */
+        }
+
+      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0); /* derivative of P_n at x0 */
+
+      /* Newton step */
+      x1 = x0 - P0/dpdx;
+      dx = x0-x1;
+      x0 = x1;
+      if (dobreak) break; /* this was the extra iteration after convergence */
+
+      if (fabs(dx)<=eps) dobreak=1;
+      UTIL_ASSERT(++j<100,"convergence problem"); /* NOTE(review): the increment sits inside the macro
+        argument, so this relies on UTIL_ASSERT always evaluating its condition -- confirm in c_utils.h */
+      }
+
+    x[i-1] = -x0; /* for odd n and i==m both stores hit the same element; the second one wins */
+    x[n-i] = x0;
+    w[i-1] = w[n-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx); /* dpdx is the value from the last iteration */
+    }
+} // end of parallel region
+  }
--- a/external/sharp/libsharp/sharp_legendre_roots.h
+++ b/external/sharp/libsharp/sharp_legendre_roots.h
@ -0,0 +1,50 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_legendre_roots.h
+ *
+ *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef SHARP_LEGENDRE_ROOTS_H
+#define SHARP_LEGENDRE_ROOTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Computes roots and Gaussian quadrature weights for Legendre polynomial
+    of degree \a n.
+    \param n Order of Legendre polynomial
+    \param x Array of length \a n for output (root position)
+    \param w Array of length \a n for output (weight for Gaussian quadrature)
+ */
+void sharp_legendre_roots(int n, double *x, double *w);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/external/sharp/libsharp/sharp_lowlevel.h
+++ b/external/sharp/libsharp/sharp_lowlevel.h
@ -25,8 +25,8 @@
 /*! \file sharp_lowlevel.h
 *  Low-level, portable interface for the spherical transform library.
 *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

 #ifndef PLANCK_SHARP_LOWLEVEL_H
@ -60,7 +60,7 @@ typedef struct
 typedef struct
  {
  sharp_ringpair *pair;
-  int npairs;
+  int npairs, nphmax;
  } sharp_geom_info;

 /*! \defgroup almgroup Helpers for dealing with a_lm */
@ -76,6 +76,8 @@ typedef struct
  int nm;
  /*! Array with \a nm entries containing the individual m values */
  int *mval;
+  /*! Combination of flags from sharp_almflags */
+  int flags;
  /*! Array with \a nm entries containing the (hypothetical) indices of
      the coefficients with quantum numbers 0,\a mval[i] */
  ptrdiff_t *mvstart;
@ -83,30 +85,59 @@ typedef struct
  ptrdiff_t stride;
  } sharp_alm_info;

-/*! Creates an Alm data structure information from the following parameters:
+/*! Flags describing how a set of a_lm is stored; they are combined in the
+    \a flags member of sharp_alm_info. */
+typedef enum { SHARP_PACKED = 1,
+               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
+                    not present. mvstart is in units of *real* float/double for all
+                    m; stride is in units of reals for m=0 and complex for m!=0 */
+               SHARP_REAL_HARMONICS  = 1<<6
+               /*!< Use the real spherical harmonic convention. For
+                    m==0, the alm are treated exactly the same as in
+                    the complex case.  For m!=0, alm[i] represent a
+                    pair (+abs(m), -abs(m)) instead of (real, imag),
+                    and the coefficients are scaled by a factor of
+                    sqrt(2) relative to the complex case.  In other
+                    words, (sqrt(.5) * alm[i]) recovers the
+                    corresponding complex coefficient (when accessed
+                    as complex).
+                */
+             } sharp_almflags;
+
+
+
+/*! Creates an a_lm data structure from the following parameters:
    \param lmax maximum \a l quantum number (>=0)
    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
-    \param stride the stride between consecutive a_lm entries
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
    \param mstart the index of the (hypothetical) coefficient with the
      quantum numbers 0,\a m. Must have \a mmax+1 entries.
    \param alm_info will hold a pointer to the newly created data structure
 */
 void sharp_make_alm_info (int lmax, int mmax, int stride,
  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
-/*! Creates an Alm data structure information from the following parameters:
-    \param lmax maximum \a l quantum number (>=0)
-    \param nm number of different \a m (<=\a lmax+1)
-    \param stride the stride between consecutive a_lm entries
+/*! Creates an a_lm data structure from the following parameters:
+    \param lmax maximum \a l quantum number (\a >=0)
+    \param nm number of different \a m (\a 0<=nm<=lmax+1)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
    \param mval array with \a nm entries containing the individual m values
    \param mvstart array with \a nm entries containing the (hypothetical)
      indices of the coefficients with the quantum numbers 0,\a mval[i]
+    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
    \param alm_info will hold a pointer to the newly created data structure
 */
 void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
-  const ptrdiff_t *mvstart, sharp_alm_info **alm_info);
+  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
 /*! Returns the index of the coefficient with quantum numbers \a l,
-    \a mval[mi]. */
+    \a mval[mi].
+    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
+    the index for the coefficient with the quantum numbers \a l, \a mi. */
 ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
+/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
+    flag is set, this is the number of "real" coefficients (for m < 0 and m >= 0),
+    otherwise it is the number of complex coefficients (with m>=0). */
+ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
 /*! Deallocates the a_lm info object. */
 void sharp_destroy_alm_info (sharp_alm_info *info);

@ -123,12 +154,19 @@ void sharp_destroy_alm_info (sharp_alm_info *info);
    \param stride the stride between consecutive pixels
    \param phi0 the azimuth (in radians) of the first pixel in each ring
    \param theta the colatitude (in radians) of each ring
-    \param weight the pixel weight to be used for the ring
+    \param wgt the pixel weight to be used for the ring in map2alm
+      and adjoint map2alm transforms.
+      Pass NULL to use 1.0 as weight for all rings.
    \param geom_info will hold a pointer to the newly created data structure
 */
 void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
  const int *stride, const double *phi0, const double *theta,
-  const double *weight, sharp_geom_info **geom_info);
+  const double *wgt, sharp_geom_info **geom_info);
+
+/*! Counts the number of grid points needed for (the local part of) a map described
+    by \a info.
+ */
+ptrdiff_t sharp_map_size(const sharp_geom_info *info);

 /*! Deallocates the geometry information in \a info. */
 void sharp_destroy_geom_info (sharp_geom_info *info);
@ -139,45 +177,91 @@ void sharp_destroy_geom_info (sharp_geom_info *info);
 /*! \{ */

 /*! Enumeration of SHARP job types. */
-typedef enum { SHARP_MAP2ALM,       /*!< analysis */
-               SHARP_ALM2MAP,       /*!< synthesis */
-               SHARP_ALM2MAP_DERIV1 /*!< synthesis of first derivatives */
+typedef enum { SHARP_YtW=0,               /*!< analysis */
+               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
+               SHARP_Y=1,                 /*!< synthesis */
+               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
+               SHARP_Yt=2,                /*!< adjoint synthesis */
+               SHARP_WY=3,                /*!< adjoint analysis */
+               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
             } sharp_jobtype;

+/*! Job flags; a bitwise OR of these values is passed as the \a flags
+    argument of sharp_execute(). */
+typedef enum { SHARP_DP              = 1<<4,
+               /*!< map and a_lm are in double precision */
+               SHARP_ADD             = 1<<5,
+               /*!< results are added to the output arrays, instead of
+                    overwriting them */
+
+               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
+                  but its use here is deprecated in favor of having it in the sharp_alm_info */
+
+               SHARP_NO_FFT          = 1<<7, /* NOTE(review): no description given -- document its effect */
+
+               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
+               SHARP_NO_OPENMP       = 1<<21,    /* internal use only; tested in the if() clause of the
+                                                    "omp parallel" region of the MPI job driver */
+               SHARP_NVMAX           = (1<<4)-1 /* internal use only; equals 15, the bits below SHARP_DP */
+             } sharp_jobflags;
+
 /*! Performs a libsharp SHT job. The interface deliberately does not use
-  the C99 "complex" data type, in order to be callable from C. 
+  the C99 "complex" data type, in order to be callable from C89 and C++.
  \param type the type of SHT
  \param spin the spin of the quantities to be transformed
-  \param add_output if 0, the output arrays will be overwritten,
-    else the result will be added to the output arrays.
  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
-    depends on the \a dp parameter.
+    depends on whether the SHARP_DP flag is set.
  \param map contains pointers to the maps. If \a spin==0,
    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT,
-    map[2] and map[3] to those of the second, etc. The exact data type of \a map
-    depends on the \a dp parameter.
+    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
+    point to the maps of the first SHT, map[2] and map[3] to those of the
+    second, etc. The exact data type of \a map depends on whether the SHARP_DP
+    flag is set.
  \param geom_info A \c sharp_geom_info object compatible with the provided
    \a map arrays.
  \param alm_info A \c sharp_alm_info object compatible with the provided
    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
    exactly once.
  \param ntrans the number of simultaneous SHTs
-  \param dp if 0, the \a alm is expected to have the type "complex float **"
-    and \a map is expected to have the type "float **"; otherwise the expected
-    types are "complex double **" and "double **", respectively.
-  \param nv Internally used SHT parameter. Set to 0 unless you know what you are
-    doing.
+  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
+    \a alm is expected to have the type "complex double **" and \a map is
+    expected to have the type "double **"; otherwise, the expected
+    types are "complex float **" and "float **", respectively.
  \param time If not NULL, the wall clock time required for this SHT
-    (in seconds)will be written here.
+    (in seconds) will be written here.
  \param opcnt If not NULL, a conservative estimate of the total floating point
    operation count for this SHT will be written here. */
-void sharp_execute (sharp_jobtype type, int spin, int add_output, void *alm,
-  void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
-  int ntrans, int dp, int nv, double *time, unsigned long long *opcnt);
+void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
+  int flags, double *time, unsigned long long *opcnt);
+
+void sharp_set_chunksize_min(int new_chunksize_min);
+void sharp_set_nchunks_max(int new_nchunks_max);
+
+/*! Error codes; SHARP_ERROR_NO_MPI is what sharp_execute_mpi_maybe() returns
+    when MPI is not available. */
+typedef enum { SHARP_ERROR_NO_MPI = 1,
+               /*!< libsharp not compiled with MPI support */
+              } sharp_errors; /* NOTE(review): the enumerator list ends with a trailing comma, which
+                                 C89 and C++98 reject (C99 and C++11 accept it) -- confirm which
+                                 dialects this header has to support */
+
+/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
+    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
+
+    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
+    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
+    is *(MPI_Comm*)pcomm.
+
+    Other parameters are the same as sharp_execute_mpi.
+
+    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
+    (in which case nothing is done).
+ */
+int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  unsigned long long *opcnt);
+
+

 /*! \} */

--- a/external/sharp/libsharp/sharp_mpi.c
+++ b/external/sharp/libsharp/sharp_mpi.c
@ -25,8 +25,8 @@
 /*! \file sharp_mpi.c
 *  Functionality only needed for MPI-parallel transforms
 *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

 #ifdef USE_MPI
@ -185,116 +185,161 @@ static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
  ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
    (ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
+  job->s_m=2*job->ntrans*job->nmaps;
+  job->s_th = job->s_m * ((job->type==SHARP_MAP2ALM) ? nmfull : nm);
  }

 static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo)
  {
  if (job->type != SHARP_MAP2ALM)
+    {
    sharp_communicate_alm2map (minfo,&job->phase);
+    job->s_th=job->s_m*minfo->nmtotal;
+    }
  }

 static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo)
  {
  if (job->type == SHARP_MAP2ALM)
+    {
    sharp_communicate_map2alm (minfo,&job->phase);
+    job->s_th=job->s_m*minfo->nm[minfo->mytask];
+    }
  }

 static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
  {
-  double timer=wallTime();
  int ntasks;
  MPI_Comm_size(comm, &ntasks);
  if (ntasks==1) /* fall back to scalar implementation */
    { sharp_execute_job (job); return; }

-  int lmax = job->ainfo->lmax;
-
-  job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);
-
+  MPI_Barrier(comm);
+  double timer=wallTime();
+  job->opcnt=0;
  sharp_mpi_info minfo;
  sharp_make_mpi_info(comm, job, &minfo);

-/* clear output arrays if requested */
-  init_output (job);
-
-  alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
-    minfo.npairtotal);
-
-  double *cth = RALLOC(double,minfo.npairtotal),
-         *sth = RALLOC(double,minfo.npairtotal);
-  idxhelper *stmp = RALLOC(idxhelper,minfo.npairtotal);
-  for (int i=0; i<minfo.npairtotal; ++i)
+  if (minfo.npairtotal>minfo.ntasks*300)
    {
-    cth[i] = cos(minfo.theta[i]);
-    sth[i] = sin(minfo.theta[i]);
-    stmp[i].s=sth[i];
-    stmp[i].i=i;
+    int nsub=(minfo.npairtotal+minfo.ntasks*200-1)/(minfo.ntasks*200);
+    for (int isub=0; isub<nsub; ++isub)
+      {
+      sharp_job ljob=*job;
+      // When creating a_lm, every sub-job produces a complete set of
+      // coefficients; they need to be added up.
+      if ((isub>0)&&(job->type==SHARP_MAP2ALM)) ljob.flags|=SHARP_ADD;
+      sharp_geom_info lginfo;
+      lginfo.pair=RALLOC(sharp_ringpair,(job->ginfo->npairs/nsub)+1);
+      lginfo.npairs=0;
+      lginfo.nphmax = job->ginfo->nphmax;
+      while (lginfo.npairs*nsub+isub<job->ginfo->npairs)
+        {
+        lginfo.pair[lginfo.npairs]=job->ginfo->pair[lginfo.npairs*nsub+isub];
+        ++lginfo.npairs;
+        }
+      ljob.ginfo=&lginfo;
+      sharp_execute_job_mpi (&ljob,comm);
+      job->opcnt+=ljob.opcnt;
+      DEALLOC(lginfo.pair);
+      }
    }
-  qsort (stmp,minfo.npairtotal,sizeof(idxhelper),idx_compare);
-  int *idx = RALLOC(int,minfo.npairtotal);
-  for (int i=0; i<minfo.npairtotal; ++i)
-    idx[i]=stmp[i].i;
-  DEALLOC(stmp);
+  else
+    {
+    int lmax = job->ainfo->lmax;
+    job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);

-/* map->phase where necessary */
-  map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
+    /* clear output arrays if requested */
+    init_output (job);

-  map2alm_comm (job, &minfo);
+    alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
+      minfo.npairtotal);

-#pragma omp parallel
+    double *cth = RALLOC(double,minfo.npairtotal),
+          *sth = RALLOC(double,minfo.npairtotal);
+    int *mlim = RALLOC(int,minfo.npairtotal);
+    for (int i=0; i<minfo.npairtotal; ++i)
+      {
+      cth[i] = cos(minfo.theta[i]);
+      sth[i] = sin(minfo.theta[i]);
+      mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i]);
+      }
+
+    /* map->phase where necessary */
+    map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
+
+    map2alm_comm (job, &minfo);
+
+#pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
 {
-  sharp_job ljob = *job;
-  sharp_Ylmgen_C generator;
-  sharp_Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
-  alloc_almtmp(&ljob,lmax);
+    sharp_job ljob = *job;
+    sharp_Ylmgen_C generator;
+    sharp_Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
+    alloc_almtmp(&ljob,lmax);

 #pragma omp for schedule(dynamic,1)
-  for (int mi=0; mi<job->ainfo->nm; ++mi)
-    {
-/* alm->alm_tmp where necessary */
-    alm2almtmp (&ljob, lmax, mi);
+    for (int mi=0; mi<job->ainfo->nm; ++mi)
+      {
+  /* alm->alm_tmp where necessary */
+      alm2almtmp (&ljob, lmax, mi);

-/* inner conversion loop */
-    inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
-      &generator, mi, idx);
+  /* inner conversion loop */
+      inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
+        &generator, mi, mlim);

-/* alm_tmp->alm where necessary */
-    almtmp2alm (&ljob, lmax, mi);
-    }
+  /* alm_tmp->alm where necessary */
+      almtmp2alm (&ljob, lmax, mi);
+      }

-  sharp_Ylmgen_destroy(&generator);
-  dealloc_almtmp(&ljob);
+    sharp_Ylmgen_destroy(&generator);
+    dealloc_almtmp(&ljob);

 #pragma omp critical
-  job->opcnt+=ljob.opcnt;
+    job->opcnt+=ljob.opcnt;
 } /* end of parallel region */

-  alm2map_comm (job, &minfo);
+    alm2map_comm (job, &minfo);

-/* phase->map where necessary */
-  phase2map (job, minfo.mmax, 0, job->ginfo->npairs);
+  /* phase->map where necessary */
+    phase2map (job, minfo.mmax, 0, job->ginfo->npairs);

-  DEALLOC(cth);
-  DEALLOC(sth);
-  DEALLOC(idx);
-  DEALLOC(job->norm_l);
-  dealloc_phase (job);
+    DEALLOC(mlim);
+    DEALLOC(cth);
+    DEALLOC(sth);
+    DEALLOC(job->norm_l);
+    dealloc_phase (job);
+    }
  sharp_destroy_mpi_info(&minfo);
  job->time=wallTime()-timer;
  }

 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
-  int add_output, void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int dp, int nv, double *time,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
  unsigned long long *opcnt)
  {
  sharp_job job;
-  sharp_build_job_common (&job, type, spin, add_output, alm, map, geom_info,
-    alm_info, ntrans, dp, nv);
+  sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
+    ntrans, flags);

  sharp_execute_job_mpi (&job, comm);
  if (time!=NULL) *time = job.time;
  if (opcnt!=NULL) *opcnt = job.opcnt;
  }

+/* Variant of sharp_execute_mpi() that takes a Fortran communicator handle
+   (MPI_Fint): the handle is converted with MPI_Comm_f2c() and all other
+   arguments are forwarded unchanged.
+   The prototype is given here in the C file only, so that the symbol exists
+   for Fortran wrappers without being declared in a C header; it is not meant
+   to be called from C code. */
+void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  unsigned long long *opcnt);
+void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  unsigned long long *opcnt)
+  {
+  sharp_execute_mpi(MPI_Comm_f2c(comm), type, spin, alm, map, geom_info,
+                    alm_info, ntrans, flags, time, opcnt);
+  }
+
 #endif
--- a/external/sharp/libsharp/sharp_mpi.h
+++ b/external/sharp/libsharp/sharp_mpi.h
@ -26,14 +26,14 @@
 *  Interface for the spherical transform library with MPI support.
 *
 *  Copyright (C) 2011,2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

 #ifndef PLANCK_SHARP_MPI_H
 #define PLANCK_SHARP_MPI_H

 #include <mpi.h>
-#include "sharp.h"
+#include "sharp_lowlevel.h"

 #ifdef __cplusplus
 extern "C" {
@ -44,18 +44,17 @@ extern "C" {
  \param comm the MPI communicator to be used for this SHT
  \param type the type of SHT
  \param spin the spin of the quantities to be transformed
-  \param add_output if 0, the output arrays will be overwritten,
-    else the result will be added to the output arrays.
  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
-    depends on the \a dp parameter.
+    depends on whether the SHARP_DP flag is set.
  \param map contains pointers to the maps. If \a spin==0,
    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT,
-    map[2] and map[3] to those of the second, etc. The exact data type of \a map
-    depends on the \a dp parameter.
+    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
+    point to the maps of the first SHT, map[2] and map[3] to those of the
+    second, etc. The exact data type of \a map depends on whether the SHARP_DP
+    flag is set.
  \param geom_info A \c sharp_geom_info object compatible with the provided
    \a map arrays. The total map geometry is the union of all \a geom_info
    objects over the participating MPI tasks.
@ -64,18 +63,17 @@ extern "C" {
    exactly once in the union of all \a alm_info objects over the participating
    MPI tasks.
  \param ntrans the number of simultaneous SHTs
-  \param dp if 0, the \a alm is expected to have the type "complex float **"
-    and \a map is expected to have the type "float **"; otherwise the expected
-    types are "complex double **" and "double **", respectively.
-  \param nv Internally used SHT parameter. Set to 0 unless you know what you are
-    doing.
+  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
+    \a alm is expected to have the type "complex double **" and \a map is
+    expected to have the type "double **"; otherwise, the expected
+    types are "complex float **" and "float **", respectively.
  \param time If not NULL, the wall clock time required for this SHT
-    (in seconds)will be written here.
+    (in seconds) will be written here.
  \param opcnt If not NULL, a conservative estimate of the total floating point
    operation count for this SHT will be written here. */
 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
-  int add_output, void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int dp, int nv, double *time,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
  unsigned long long *opcnt);

 #ifdef __cplusplus
--- a/external/sharp/libsharp/sharp_test.c
+++ b/external/sharp/libsharp/sharp_test.c
@ -1,249 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_test.c
-    Accuracy test for libsharp's map analysis.
-
-    This program first generates a_lm coefficients up to
-    a user-specified lmax (with mmax=lmax); where applicable, the
-    real and imaginary parts of the coefficients are uniform
-    random numbers of the interval [-1;1[.
-    Afterwards, the random a_lm are converted to a map.
-    This map is analyzed (optionally using an iterative scheme
-    with a user-supplied number of steps).
-    After every iteration, the code then outputs the RMS of the residual a_lm
-    (i.e. the difference between the current and original a_lm), divided by
-    the RMS of the original a_lm, as well as the maximum absolute change of any
-    real or imaginary part between the current and original a_lm.
-
-    This operation can be performed for several different pixelisations:
-      - a Gaussian with the minimal number of rings for exact analysis
-        and a user-defined ring resolution
-      - an ECP grid with the minimal number of rings for exact analysis
-        and a user-defined ring resolution
-      - a Healpix grid with a user-defined Nside parameter.
-
-    The user can specify the spin of the desired transform.
-
-    Copyright (C) 2006-2012 Max-Planck-Society
-    \author Martin Reinecke
-*/
-
-#include <stdio.h>
-#include <string.h>
-#ifdef USE_MPI
-#include "mpi.h"
-#endif
-#include "sharp.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
-#include "sharp_announce.h"
-#include "sharp_core.h"
-#include "memusage.h"
-
-typedef complex double dcmplx;
-
-static double drand (double min, double max)
-  { return min + (max-min)*rand()/(RAND_MAX+1.0); }
-
-static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
-  {
-  for (int mi=0;mi<helper->nm; ++mi)
-    {
-    int m=helper->mval[mi];
-    for (int l=m;l<=helper->lmax; ++l)
-      {
-      if ((l<spin)&&(m<spin))
-        alm[sharp_alm_index(helper,l,mi)] = 0.;
-      else
-        {
-        double rv = drand(-1,1);
-        double iv = (m==0) ? 0 : drand(-1,1);
-        alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
-        }
-      }
-    }
-  }
-
-static void measure_errors (dcmplx **alm, dcmplx **alm2,
-  ptrdiff_t nalms, int ncomp)
-  {
-  for (int i=0; i<ncomp; ++i)
-    {
-    double sum=0, sum2=0, maxdiff=0;
-    for (ptrdiff_t m=0; m<nalms; ++m)
-      {
-      double x=creal(alm[i][m])-creal(alm2[i][m]),
-             y=cimag(alm[i][m])-cimag(alm2[i][m]);
-      sum+=x*x+y*y;
-      sum2+=creal(alm[i][m])*creal(alm[i][m])+cimag(alm[i][m])*cimag(alm[i][m]);
-      if (fabs(x)>maxdiff) maxdiff=fabs(x);
-      if (fabs(y)>maxdiff) maxdiff=fabs(y);
-      }
-    sum=sqrt(sum/nalms);
-    sum2=sqrt(sum2/nalms);
-    printf("component %i: rms %e, maxerr %e\n",i, sum/sum2, maxdiff);
-    }
-  }
-
-static void map2alm_iter (sharp_geom_info *tinfo, double **map,
-  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
-  ptrdiff_t npix, ptrdiff_t nalms, int spin, int ntrans, int niter)
-  {
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  double time;
-  unsigned long long opcnt;
-  sharp_execute(SHARP_MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,
-    &time,&opcnt);
-  printf("wall time for map2alm: %fs\n",time);
-  printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
-  measure_errors(alm_orig,alm,nalms,ncomp);
-
-  for (int iter=0; iter<niter; ++iter)
-    {
-    double **map2;
-    ALLOC2D(map2,double,ncomp,npix);
-    printf ("\niteration %i:\n", iter+1);
-    sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&map2[0],tinfo,alms,ntrans,1,0,
-      &time,&opcnt);
-    printf("wall time for alm2map: %fs\n",time);
-    printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
-    for (int i=0; i<ncomp; ++i)
-      for (ptrdiff_t m=0; m<npix; ++m)
-        map2[i][m] = map[i][m]-map2[i][m];
-
-    sharp_execute(SHARP_MAP2ALM,spin,1,&alm[0],&map2[0],tinfo,alms,ntrans,1,0,
-      &time,&opcnt);
-    printf("wall time for map2alm: %fs\n",time);
-    printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
-    DEALLOC2D(map2);
-    measure_errors(alm_orig,alm,nalms,ncomp);
-    }
-
-  sharp_destroy_alm_info(alms);
-  }
-
-static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
-  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
-  {
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  srand(4);
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  for (int i=0; i<ncomp; ++i)
-    random_alm(alm[i],alms,spin);
-
-  dcmplx **alm2;
-  ALLOC2D(alm2,dcmplx,ncomp,nalms);
-
-  double time;
-  unsigned long long opcnt;
-  printf ("\niteration 0:\n");
-  sharp_execute(SHARP_ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans,1,0,
-    &time,&opcnt);
-  printf("wall time for alm2map: %fs\n",time);
-  printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/time);
-
-  map2alm_iter(tinfo,map,alm,alm2,lmax,mmax,npix,nalms,spin,ntrans,niter);
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-  DEALLOC2D(alm2);
-
-  sharp_destroy_alm_info(alms);
-  }
-
-int main(int argc, char **argv)
-  {
-#ifdef USE_MPI
-  MPI_Init(NULL,NULL);
-#endif
-  sharp_module_startup("sharp_test",argc,7,
-    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",1);
-
-  int lmax=atoi(argv[2]);
-  int niter=atoi(argv[4]);
-  int spin=atoi(argv[5]);
-  int ntrans=atoi(argv[6]);
-
-  printf("Testing map analysis accuracy.\n");
-  printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
-
-  sharp_geom_info *tinfo;
-  if (strcmp(argv[1],"gauss")==0)
-    {
-    int nrings=lmax+1;
-    int ppring=atoi(argv[3]);
-    ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-    printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
-          nrings,ppring,(long)npix);
-    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
-    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
-    sharp_destroy_geom_info(tinfo);
-    }
-  else if (strcmp(argv[1],"ecp")==0)
-    {
-    int nrings=2*lmax+2;
-    int ppring=atoi(argv[3]);
-    ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-    printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
-          nrings,ppring,(long)npix);
-    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
-    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
-    sharp_destroy_geom_info(tinfo);
-    }
-  else if (strcmp(argv[1],"healpix")==0)
-    {
-    int nside=atoi(argv[3]);
-    if (nside<1) nside=1;
-    ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
-    printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
-          nside,(long)npix);
-    sharp_make_healpix_geom_info (nside, 1, &tinfo);
-    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
-    sharp_destroy_geom_info(tinfo);
-    }
-  else
-    UTIL_FAIL("unknown grid geometry");
-
-  printf("\nMemory high water mark: %.2f MB\n",VmHWM()/(1<<20));
-
-#ifdef USE_MPI
-  MPI_Finalize();
-#endif
-  return 0;
-  }
--- a/external/sharp/libsharp/sharp_test_mpi.c
+++ b/external/sharp/libsharp/sharp_test_mpi.c
@ -1,359 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_test_mpi.c
-    Accuracy test for libsharp's map analysis with MPI support.
-
-    This program first generates a_lm coefficients up to
-    a user-specified lmax (with mmax=lmax); where applicable, the
-    real and imaginary parts of the coefficients are uniform
-    random numbers of the interval [-1;1[.
-    Afterwards, the random a_lm are converted to a map.
-    This map is analyzed (optionally using an iterative scheme
-    with a user-supplied number of steps).
-    After every iteration, the code then outputs the RMS of the residual a_lm
-    (i.e. the difference between the current and original a_lm), divided by
-    the RMS of the original a_lm, as well as the maximum absolute change of any
-    real or imaginary part between the current and original a_lm.
-
-    This operation can be performed for several different pixelisations:
-      - a Gaussian with the minimal number of rings for exact analysis
-        and a user-defined ring resolution
-      - an ECP grid with the minimal number of rings for exact analysis
-        and a user-defined ring resolution
-      - a Healpix grid with a user-defined Nside parameter.
-
-    The user can specify the spin of the desired transform.
-
-    Copyright (C) 2006-2012 Max-Planck-Society
-    \author Martin Reinecke
-*/
-
-#ifdef USE_MPI
-
-#include <stdio.h>
-#include <string.h>
-#include "sharp_mpi.h"
-#include "sharp_geomhelpers.h"
-#include "sharp_almhelpers.h"
-#include "c_utils.h"
-#include "walltime_c.h"
-#include "sharp_announce.h"
-#include "sharp_core.h"
-
-typedef complex double dcmplx;
-
-int ntasks, mytask;
-
-static unsigned long long totalops (unsigned long long val)
-  {
-  unsigned long long tmp;
-  MPI_Allreduce (&val, &tmp,1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
-  return tmp;
-  }
-
-static double maxTime (double val)
-  {
-  double tmp;
-  MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-  return tmp;
-  }
-
-static double drand (double min, double max)
-  { return min + (max-min)*rand()/(RAND_MAX+1.0); }
-
-static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
-  {
-  ptrdiff_t res=0;
-  for (int i=0; i<ainfo->nm; ++i)
-    res += ainfo->lmax-ainfo->mval[i]+1;
-  return res;
-  }
-
-static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
-  {
-  ptrdiff_t res=0;
-  for (int i=0; i<ginfo->npairs; ++i)
-    {
-    res += ginfo->pair[i].r1.nph;
-    if (ginfo->pair[i].r2.nph>0) res += ginfo->pair[i].r2.nph;
-    }
-  return res;
-  }
-
-static void reduce_alm_info(sharp_alm_info *ainfo)
-  {
-  int nmnew=0;
-  ptrdiff_t ofs = 0;
-  for (int i=mytask; i<ainfo->nm; i+=ntasks,++nmnew)
-    {
-    ainfo->mval[nmnew]=ainfo->mval[i];
-    ainfo->mvstart[nmnew]=ofs-ainfo->mval[nmnew];
-    ofs+=ainfo->lmax-ainfo->mval[nmnew]+1;
-    }
-  ainfo->nm=nmnew;
-  }
-
-static void reduce_geom_info(sharp_geom_info *ginfo)
-  {
-  int npairsnew=0;
-  ptrdiff_t ofs = 0;
-  for (int i=mytask; i<ginfo->npairs; i+=ntasks,++npairsnew)
-    {
-    ginfo->pair[npairsnew]=ginfo->pair[i];
-    ginfo->pair[npairsnew].r1.ofs=ofs;
-    ofs+=ginfo->pair[npairsnew].r1.nph;
-    ginfo->pair[npairsnew].r2.ofs=ofs;
-    if (ginfo->pair[npairsnew].r2.nph>0) ofs+=ginfo->pair[npairsnew].r2.nph;
-    }
-  ginfo->npairs=npairsnew;
-  }
-
-static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
-  {
-  static int cnt=0;
-  ++cnt;
-  for (int mi=0;mi<helper->nm; ++mi)
-    {
-    int m=helper->mval[mi];
-    srand(1234567*cnt+8912*m);
-    for (int l=m;l<=helper->lmax; ++l)
-      {
-      if ((l<spin)&&(m<spin))
-        alm[sharp_alm_index(helper,l,mi)] = 0.;
-      else
-        {
-        double rv = drand(-1,1);
-        double iv = (m==0) ? 0 : drand(-1,1);
-        alm[sharp_alm_index(helper,l,mi)] = rv+_Complex_I*iv;
-        }
-      }
-    }
-  }
-
-static void measure_errors (dcmplx **alm, dcmplx **alm2,
-  const sharp_alm_info *ainfo, int ncomp)
-  {
-  long nalms=get_nalms(ainfo), nalms_tot;
-  MPI_Allreduce(&nalms,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);
-
-  for (int i=0; i<ncomp; ++i)
-    {
-    double sum=0, sum2=0, maxdiff=0, sumtot, sum2tot, maxdifftot;
-    for (int mi=0; mi<ainfo->nm; ++mi)
-      {
-      int m=ainfo->mval[mi];
-      for (int l=m; l<=ainfo->lmax; ++l)
-        {
-        ptrdiff_t idx=sharp_alm_index(ainfo,l,mi);
-        double x=creal(alm[i][idx])-creal(alm2[i][idx]),
-               y=cimag(alm[i][idx])-cimag(alm2[i][idx]);
-        sum+=x*x+y*y;
-        sum2+=creal(alm[i][idx])*creal(alm[i][idx])
-             +cimag(alm[i][idx])*cimag(alm[i][idx]);
-        if (fabs(x)>maxdiff) maxdiff=fabs(x);
-        if (fabs(y)>maxdiff) maxdiff=fabs(y);
-        }
-      }
-
-    MPI_Allreduce(&sum,&sumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-    MPI_Allreduce(&sum2,&sum2tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-    MPI_Allreduce(&maxdiff,&maxdifftot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
-    sumtot=sqrt(sumtot/nalms_tot);
-    sum2tot=sqrt(sum2tot/nalms_tot);
-    if (mytask==0)
-      printf("component %i: rms %e, maxerr %e\n",i, sumtot/sum2tot, maxdifftot);
-    }
-  }
-
-static void map2alm_iter (sharp_geom_info *tinfo, double **map,
-  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
-  ptrdiff_t npix, int spin, int ntrans, int niter)
-  {
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-  reduce_alm_info(alms);
-
-  double jtime;
-  unsigned long long jopcnt;
-
-  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,0,&alm[0],&map[0],
-    tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
-  unsigned long long opcnt=totalops(jopcnt);
-  double timer=maxTime(jtime);
-  if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
-  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
-  measure_errors(alm_orig,alm,alms,ncomp);
-
-  for (int iter=0; iter<niter; ++iter)
-    {
-    double **map2;
-    ALLOC2D(map2,double,ncomp,npix);
-    if (mytask==0) printf ("\niteration %i:\n", iter+1);
-    sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,0,&alm[0],&map2[0],
-      tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
-    opcnt=totalops(jopcnt);
-    timer=maxTime(jtime);
-    if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
-    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
-    for (int i=0; i<ncomp; ++i)
-      for (ptrdiff_t m=0; m<npix; ++m)
-        map2[i][m] = map[i][m]-map2[i][m];
-
-    sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,1,&alm[0],&map2[0],
-      tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
-    opcnt=totalops(jopcnt);
-    timer=maxTime(jtime);
-    if (mytask==0) printf("wall time for map2alm: %fs\n",wallTime()-timer);
-    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
-    DEALLOC2D(map2);
-    measure_errors(alm_orig,alm,alms,ncomp);
-    }
-
-  sharp_destroy_alm_info(alms);
-  }
-
-static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
-  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
-  {
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-
-  double jtime;
-  unsigned long long jopcnt;
-
-  sharp_alm_info *alms;
-  ptrdiff_t nalms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-  reduce_alm_info(alms);
-  nalms=get_nalms(alms);
-
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  srand(4);
-  for (int i=0; i<ncomp; ++i)
-    random_alm(alm[i],alms,spin);
-
-  dcmplx **alm2;
-  ALLOC2D(alm2,dcmplx,ncomp,nalms);
-
-  if (mytask==0) printf ("\niteration 0:\n");
-  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,0,&alm[0],&map[0],
-    tinfo,alms,ntrans,1,0,&jtime,&jopcnt);
-  unsigned long long opcnt=totalops(jopcnt);
-  double timer=maxTime(jtime);
-  if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
-  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
-
-  map2alm_iter(tinfo, map, alm, alm2, lmax, mmax, npix, spin, ntrans, niter);
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-  DEALLOC2D(alm2);
-
-  sharp_destroy_alm_info(alms);
-  }
-
-int main(int argc, char **argv)
-  {
-  MPI_Init(NULL,NULL);
-  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
-  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
-
-  sharp_module_startup("sharp_test_mpi",argc,7,
-    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",
-    mytask==0);
-  int lmax=atoi(argv[2]);
-  int niter=atoi(argv[4]);
-  int spin=atoi(argv[5]);
-  int ntrans=atoi(argv[6]);
-
-  if (mytask==0)
-    {
-    printf("Testing map analysis accuracy.\n");
-    printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
-    }
-
-  sharp_geom_info *tinfo;
-  if (strcmp(argv[1],"gauss")==0)
-    {
-    int nrings=lmax+1;
-    int ppring=atoi(argv[3]);
-    ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-    if (mytask==0)
-      printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
-             nrings,ppring,(long)npix);
-    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
-    reduce_geom_info(tinfo);
-    npix=get_npix(tinfo);
-    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
-    sharp_destroy_geom_info(tinfo);
-    }
-  else if (strcmp(argv[1],"ecp")==0)
-    {
-    int nrings=2*lmax+2;
-    int ppring=atoi(argv[3]);
-    ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-    if (mytask==0)
-      printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
-             nrings,ppring,(long)npix);
-    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
-    reduce_geom_info(tinfo);
-    npix=get_npix(tinfo);
-    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
-    sharp_destroy_geom_info(tinfo);
-    }
-  else if (strcmp(argv[1],"healpix")==0)
-    {
-    int nside=atoi(argv[3]);
-    if (nside<1) nside=1;
-    ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
-    if (mytask==0)
-      printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
-             nside,(long)npix);
-    sharp_make_healpix_geom_info (nside, 1, &tinfo);
-    reduce_geom_info(tinfo);
-    npix=get_npix(tinfo);
-    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
-    sharp_destroy_geom_info(tinfo);
-    }
-  else
-    UTIL_FAIL("unknown grid geometry");
-
-  MPI_Finalize();
-  return 0;
-  }
-
-#else
-
-#include "c_utils.h"
-
-int main(void)
-  { UTIL_FAIL("MPI support not enabled."); return 1; }
-
-#endif
--- a/external/sharp/libsharp/sharp_testsuite.c
+++ b/external/sharp/libsharp/sharp_testsuite.c
@ -0,0 +1,708 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*  \file sharp_testsuite.c
+ * 
+ *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <stdio.h>
+#include <string.h>
+#ifdef USE_MPI
+#include "mpi.h"
+#include "sharp_mpi.h"
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "sharp.h"
+#include "sharp_internal.h"
+#include "sharp_geomhelpers.h"
+#include "sharp_almhelpers.h"
+#include "c_utils.h"
+#include "sharp_announce.h"
+#include "memusage.h"
+#include "sharp_vecsupport.h"
+
+/* Shorthand for a double-precision complex number. */
+typedef complex double dcmplx;
+
+/* Task count and index of the current task. Below they are used to stride
+   over m values and ring pairs (i=mytask; i<n; i+=ntasks) and to restrict
+   printing to task 0.
+   NOTE(review): they are assigned outside this excerpt, presumably from
+   MPI_Comm_size/MPI_Comm_rank -- confirm. */
+int ntasks, mytask;
+
+/* Small self-contained linear congruential generator.
+   Advances *state by one step (multiplier 1103515245, increment 12345,
+   result masked to 31 bits) and maps the new state linearly onto the
+   half-open interval [min;max[.
+   \param min lower bound of the returned value (inclusive)
+   \param max upper bound of the returned value (exclusive)
+   \param state generator state; read and overwritten on every call
+   \return a pseudo-random number in [min;max[ */
+static double drand (double min, double max, int *state)
+  {
+  /* The multiplication exceeds INT_MAX for almost every state, and signed
+     overflow is undefined behaviour. Doing the step in unsigned arithmetic
+     yields the same low 31 bits with well-defined wrap-around. */
+  unsigned int next = (unsigned int)(*state)*1103515245u + 12345u;
+  *state = (int)(next & 0x7fffffffu);
+  return min + (max-min)*(*state)/(0x7fffffff+1.0);
+  }
+
+/* Fills \a alm with reproducible pseudo-random coefficients.
+   For every m listed in \a helper and every l in [m;lmax], the entry at
+   sharp_alm_index(helper,l,mi) is set to zero if both l and m are smaller
+   than \a spin; otherwise its real and imaginary parts are drawn from
+   drand(-1,1,...), with the imaginary part forced to 0 for m==0.
+   The generator state is re-seeded from (\a cnt, m) for every m, so the
+   values do not depend on how the m loop is distributed over threads. */
+static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
+  {
+  int mi;
+#pragma omp parallel for schedule (dynamic,100)
+  for (mi=0; mi<helper->nm; ++mi)
+    {
+    const int m=helper->mval[mi];
+    int rng=1234567*cnt+8912*m; // random seed
+    for (int l=m; l<=helper->lmax; ++l)
+      {
+      const ptrdiff_t idx=sharp_alm_index(helper,l,mi);
+      if ((l>=spin)||(m>=spin))
+        {
+        /* real part is always drawn first, to keep the sequence stable */
+        const double re = drand(-1,1,&rng);
+        const double im = (m==0) ? 0 : drand(-1,1,&rng);
+        alm[idx] = re+_Complex_I*im;
+        }
+      else
+        alm[idx] = 0.;
+      }
+    }
+  }
+
+/* Returns the sum of \a val over all tasks in MPI_COMM_WORLD.
+   Without MPI support this is simply \a val. */
+static unsigned long long totalops (unsigned long long val)
+  {
+  unsigned long long total=val;
+#ifdef USE_MPI
+  MPI_Allreduce (&val, &total, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM,
+    MPI_COMM_WORLD);
+#endif
+  return total;
+  }
+
+/* Returns the maximum of \a val over all tasks in MPI_COMM_WORLD.
+   Without MPI support this is simply \a val. */
+static double maxTime (double val)
+  {
+  double longest=val;
+#ifdef USE_MPI
+  MPI_Allreduce (&val, &longest, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#endif
+  return longest;
+  }
+
+/* Returns the sum of \a val over all tasks in MPI_COMM_WORLD.
+   Without MPI support this is simply \a val. */
+static double allreduceSumDouble (double val)
+  {
+  double total=val;
+#ifdef USE_MPI
+  MPI_Allreduce (&val, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#endif
+  return total;
+  }
+
+/* Returns the value reported by VmHWM() for this process, summed over all
+   tasks in MPI_COMM_WORLD when MPI support is enabled.
+   Declared with (void) so that the definition is a proper prototype; an
+   empty parameter list leaves the arguments unchecked in C. */
+static double totalMem(void)
+  {
+  double local=VmHWM();
+#ifdef USE_MPI
+  double total;
+  MPI_Allreduce (&local, &total,1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  return total;
+#else
+  return local;
+#endif
+  }
+
+#ifdef USE_MPI
+/* Restricts \a ainfo in place to the m values owned by this task.
+   Entries mytask, mytask+ntasks, mytask+2*ntasks, ... are moved to the front
+   of mval, mvstart is recomputed so that the retained coefficients are stored
+   contiguously starting at offset 0, and nm is set to the retained count. */
+static void reduce_alm_info(sharp_alm_info *ainfo)
+  {
+  int kept=0;
+  ptrdiff_t offset=0;
+  for (int src=mytask; src<ainfo->nm; src+=ntasks)
+    {
+    const int m=ainfo->mval[src];
+    ainfo->mval[kept]=m;
+    ainfo->mvstart[kept]=offset-m;
+    offset+=ainfo->lmax-m+1;
+    ++kept;
+    }
+  ainfo->nm=kept;
+  }
+
+/* Restricts \a ginfo in place to the ring pairs owned by this task.
+   Pairs mytask, mytask+ntasks, mytask+2*ntasks, ... are moved to the front of
+   the pair array, their ring offsets are reassigned so that the retained
+   rings are stored contiguously starting at offset 0, and npairs is set to
+   the retained count. */
+static void reduce_geom_info(sharp_geom_info *ginfo)
+  {
+  int kept=0;
+  ptrdiff_t offset=0;
+  for (int src=mytask; src<ginfo->npairs; src+=ntasks)
+    {
+    sharp_ringpair *dst=&ginfo->pair[kept];
+    *dst=ginfo->pair[src];
+    dst->r1.ofs=offset;
+    offset+=dst->r1.nph;
+    /* r2.ofs is always written, but only a non-empty second ring
+       takes up space */
+    dst->r2.ofs=offset;
+    if (dst->r2.nph>0) offset+=dst->r2.nph;
+    ++kept;
+    }
+  ginfo->npairs=kept;
+  }
+#endif
+
+/* Returns the number of coefficients described by \a ainfo, i.e. the sum of
+   (lmax-m+1) over all m values it lists. */
+static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
+  {
+  ptrdiff_t count=0;
+  int k=0;
+  while (k<ainfo->nm)
+    {
+    count += ainfo->lmax-ainfo->mval[k]+1;
+    ++k;
+    }
+  return count;
+  }
+
+/* Returns the number of pixels described by \a ginfo: for every ring pair the
+   pixel count of the first ring, plus that of the second ring if it is
+   positive. */
+static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
+  {
+  ptrdiff_t total=0;
+  for (int k=0; k<ginfo->npairs; ++k)
+    {
+    const int nph1=ginfo->pair[k].r1.nph;
+    const int nph2=ginfo->pair[k].r2.nph;
+    total += nph1;
+    if (nph2>0) total += nph2;
+    }
+  return total;
+  }
+
+/* For each of the \a ncomp coefficient arrays in \a alm, computes the sum of
+   squared moduli of its first \a nalms entries and then negates every entry
+   in place.
+   \return a newly allocated (RALLOC) array of \a ncomp sums. */
+static double *get_sqsum_and_invert (dcmplx **alm, ptrdiff_t nalms, int ncomp)
+  {
+  double *sqsum=RALLOC(double,ncomp);
+  for (int c=0; c<ncomp; ++c)
+    {
+    dcmplx *coeff=alm[c];
+    double acc=0;
+    for (ptrdiff_t k=0; k<nalms; ++k)
+      {
+      const double re=creal(coeff[k]), im=cimag(coeff[k]);
+      acc+=re*re+im*im;
+      coeff[k]=-coeff[k];
+      }
+    sqsum[c]=acc;
+    }
+  return sqsum;
+  }
+
+/* Computes per-component error measures from the coefficients in \a alm.
+   For every component i this determines, combined over all MPI tasks when MPI
+   support is enabled:
+     - (*err_abs)[i]: the largest modulus of any entry of alm[i]
+     - (*err_rel)[i]: sqrt(sum |alm[i][j]|^2 / N) / sqrt(sqsum[i] / N),
+       where N is the total number of coefficients
+   \param alm \a ncomp arrays holding \a nalms coefficients each
+   \param nalms number of coefficients per component on this task
+   \param ncomp number of components
+   \param sqsum per-component sums of squares used as the reference
+   \param err_abs receives a newly allocated (RALLOC) array of \a ncomp values
+   \param err_rel receives a newly allocated (RALLOC) array of \a ncomp values */
+static void get_errors (dcmplx **alm, ptrdiff_t nalms, int ncomp, double *sqsum,
+  double **err_abs, double **err_rel)
+  {
+  /* MPI_LONG describes buffers of type long. Reduce a long copy instead of
+     passing the address of the ptrdiff_t parameter, whose size need not
+     match that of long. */
+  long nalms_loc=(long)nalms, nalms_tot=nalms_loc;
+#ifdef USE_MPI
+  MPI_Allreduce(&nalms_loc,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);
+#endif
+
+  *err_abs=RALLOC(double,ncomp);
+  *err_rel=RALLOC(double,ncomp);
+  for (int i=0; i<ncomp; ++i)
+    {
+    double sum=0, maxdiff=0, sumtot, sqsumtot, maxdifftot;
+    for (ptrdiff_t j=0; j<nalms; ++j)
+      {
+      double sqr=creal(alm[i][j])*creal(alm[i][j])
+                +cimag(alm[i][j])*cimag(alm[i][j]);
+      sum+=sqr;
+      if (sqr>maxdiff) maxdiff=sqr;
+      }
+    /* the maximum was tracked on squared moduli; convert it once here */
+    maxdiff=sqrt(maxdiff);
+
+#ifdef USE_MPI
+    MPI_Allreduce(&sum,&sumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+    MPI_Allreduce(&sqsum[i],&sqsumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+    MPI_Allreduce(&maxdiff,&maxdifftot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
+#else
+    sumtot=sum;
+    sqsumtot=sqsum[i];
+    maxdifftot=maxdiff;
+#endif
+    sumtot=sqrt(sumtot/nalms_tot);
+    sqsumtot=sqrt(sqsumtot/nalms_tot);
+    (*err_abs)[i]=maxdifftot;
+    (*err_rel)[i]=sumtot/sqsumtot;
+    }
+  }
+
+/* Returns the smallest integer >= \a n that has no prime factors other than
+   2, 3 and 5. Values of \a n up to 6 are returned unchanged. */
+static int good_fft_size(int n)
+  {
+  if (n<=6) return n;
+
+  /* 2*n is a safe starting bound: a power of 2 always lies in [n;2n[ */
+  int best=2*n;
+  int p2=1;
+  while (p2<best)
+    {
+    int p23=p2;
+    while (p23<best)
+      {
+      int p235=p23;
+      while (p235<best)
+        {
+        if (p235>=n) best=p235;
+        p235*=5;
+        }
+      p23*=3;
+      }
+    p2*=2;
+    }
+
+  return best;
+  }
+
+/* Creates the a_lm layout and the pixel geometry for one test run.
+   \param gname grid type; one of "healpix", "gauss", "fejer1", "fejer2",
+     "cc" or "smallgauss". Any other name aborts via UTIL_FAIL.
+   \param lmax maximum l; must not be negative
+   \param mmax in/out: maximum m; a negative value is replaced by \a lmax.
+     Must not exceed \a lmax.
+   \param gpar1 in/out: first grid parameter (nside for "healpix", the number
+     of latitudes otherwise); values <1 are replaced by a default that
+     depends on the grid type
+   \param gpar2 in/out: second grid parameter (number of longitudes, unused
+     for "healpix"); values <1 are replaced by 2*mmax+1
+   \param ginfo receives the newly created geometry
+   \param ainfo receives the newly created triangular a_lm layout
+   With MPI support enabled, both objects are subsequently reduced to the
+   share of the calling task. Task 0 prints the parameters actually used. */
+static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
+  int *gpar2, sharp_geom_info **ginfo, sharp_alm_info **ainfo)
+  {
+  UTIL_ASSERT(lmax>=0,"lmax must not be negative");
+  if (*mmax<0) *mmax=lmax;
+  UTIL_ASSERT(*mmax<=lmax,"mmax larger than lmax");
+
+  if (mytask==0) printf ("lmax: %d, mmax: %d\n",lmax,*mmax);
+
+  sharp_make_triangular_alm_info(lmax,*mmax,1,ainfo);
+#ifdef USE_MPI
+  reduce_alm_info(*ainfo);
+#endif
+
+  if (strcmp(gname,"healpix")==0)
+    {
+    /* default nside is lmax/2, but never less than 1 */
+    if (*gpar1<1) *gpar1=lmax/2;
+    if (*gpar1==0) ++(*gpar1);
+    sharp_make_healpix_geom_info (*gpar1, 1, ginfo);
+    if (mytask==0) printf ("HEALPix grid, nside=%d\n",*gpar1);
+    }
+  else if (strcmp(gname,"gauss")==0)
+    {
+    if (*gpar1<1) *gpar1=lmax+1;
+    if (*gpar2<1) *gpar2=2*(*mmax)+1;
+    sharp_make_gauss_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
+    if (mytask==0)
+      printf ("Gauss-Legendre grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    }
+  else if (strcmp(gname,"fejer1")==0)
+    {
+    if (*gpar1<1) *gpar1=2*lmax+1;
+    if (*gpar2<1) *gpar2=2*(*mmax)+1;
+    sharp_make_fejer1_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
+    if (mytask==0) printf ("Fejer1 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    }
+  else if (strcmp(gname,"fejer2")==0)
+    {
+    if (*gpar1<1) *gpar1=2*lmax+1;
+    if (*gpar2<1) *gpar2=2*(*mmax)+1;
+    sharp_make_fejer2_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
+    if (mytask==0) printf ("Fejer2 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    }
+  else if (strcmp(gname,"cc")==0)
+    {
+    if (*gpar1<1) *gpar1=2*lmax+1;
+    if (*gpar2<1) *gpar2=2*(*mmax)+1;
+    sharp_make_cc_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
+    if (mytask==0)
+      printf("Clenshaw-Curtis grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    }
+  else if (strcmp(gname,"smallgauss")==0)
+    {
+    /* Start from a regular Gauss grid, then shrink every ring to the pixel
+       count derived from sharp_get_mlim() for that ring. */
+    int nlat=*gpar1, nlon=*gpar2;
+    if (nlat<1) nlat=lmax+1;
+    if (nlon<1) nlon=2*(*mmax)+1;
+    *gpar1=nlat; *gpar2=nlon;
+    sharp_make_gauss_geom_info (nlat, nlon, 0., 1, nlon, ginfo);
+    ptrdiff_t npix_o=get_npix(*ginfo); /* pixel count before shrinking */
+    size_t ofs=0;
+    for (int i=0; i<(*ginfo)->npairs; ++i)
+      {
+      sharp_ringpair *pair=&((*ginfo)->pair[i]);
+      int pring=1+2*sharp_get_mlim(lmax,0,pair->r1.sth,pair->r1.cth);
+      if (pring>nlon) pring=nlon;
+      /* rounding up to an FFT-friendly length happens after the clamp */
+      pring=good_fft_size(pring);
+      pair->r1.nph=pring;
+      /* rescale the ring weight by the ratio of old to new pixel count */
+      pair->r1.weight*=nlon*1./pring;
+      pair->r1.ofs=ofs;
+      ofs+=pring;
+      if (pair->r2.nph>0)
+        {
+        pair->r2.nph=pring;
+        pair->r2.weight*=nlon*1./pring;
+        pair->r2.ofs=ofs;
+        ofs+=pring;
+        }
+      }
+    if (mytask==0)
+      {
+      ptrdiff_t npix=get_npix(*ginfo);
+      printf("Small Gauss grid, nlat=%d, npix=%ld, savings=%.2f%%\n",
+        nlat,(long)npix,(npix_o-npix)*100./npix_o);
+      }
+    }
+  else
+    UTIL_FAIL("unknown grid geometry");
+
+#ifdef USE_MPI
+  reduce_geom_info(*ginfo);
+#endif
+  }
+
+/* Regression check for the sign and scale conventions of the synthesis
+   transforms.
+   A Gauss grid with lmax+1 rings of 2*lmax+2 pixels (lmax=50) is created and
+   its colatitudes are mirrored. All a_lm are set to 1+i. For ntrans=1..9 the
+   function then runs SHARP_ALM2MAP with spin 0, 1 and 2 as well as
+   SHARP_ALM2MAP_DERIV1 with spin 1, and compares the first, the middle
+   (npix/2) and the last pixel of every resulting map against hard-coded
+   reference values using FAPPROX. Any mismatch triggers UTIL_ASSERT.
+   NOTE(review): the exact meaning of the FAPPROX tolerance (relative or
+   absolute) is defined outside this excerpt -- confirm. */
+static void check_sign_scale(void)
+  {
+  int lmax=50;
+  int mmax=lmax;
+  sharp_geom_info *tinfo;
+  int nrings=lmax+1;
+  int ppring=2*lmax+2;
+  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+  sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
+
+  /* flip theta to emulate the "old" Gaussian grid geometry */
+  for (int i=0; i<tinfo->npairs; ++i)
+    {
+    const double pi=3.141592653589793238462643383279502884197;
+    tinfo->pair[i].r1.cth=-tinfo->pair[i].r1.cth;
+    tinfo->pair[i].r2.cth=-tinfo->pair[i].r2.cth;
+    tinfo->pair[i].r1.theta=pi-tinfo->pair[i].r1.theta;
+    tinfo->pair[i].r2.theta=pi-tinfo->pair[i].r2.theta;
+    }
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+  /* number of (l,m) pairs with 0<=m<=mmax and m<=l<=lmax */
+  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
+
+  for (int ntrans=1; ntrans<10; ++ntrans)
+    {
+    /* 2*ntrans components are allocated; the spin-0 run below reads and
+       checks only the first ntrans of them */
+    double **map;
+    ALLOC2D(map,double,2*ntrans,npix);
+
+    dcmplx **alm;
+    ALLOC2D(alm,dcmplx,2*ntrans,nalms);
+    for (int i=0; i<2*ntrans; ++i)
+      for (int j=0; j<nalms; ++j)
+        alm[i][j]=1.+_Complex_I;
+
+    /* spin 0: one map per transform */
+    sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
+      NULL,NULL);
+    for (int it=0; it<ntrans; ++it)
+      {
+      UTIL_ASSERT(FAPPROX(map[it][0     ], 3.588246976618616912e+00,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
+        "error");
+      }
+    /* spin 1: two maps per transform, stored at 2*it and 2*it+1 */
+    sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
+      NULL,NULL);
+    for (int it=0; it<ntrans; ++it)
+      {
+      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ], 2.750897760535633285e+00,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2], 3.137704477368562905e+01,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-8.405730859837063917e+01,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-2.398026536095463346e+00,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
+        "error");
+      }
+
+    /* spin 2 */
+    sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
+      NULL,NULL);
+    for (int it=0; it<ntrans; ++it)
+      {
+      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-1.398186224727334448e+00,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.456676000884031197e+01,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.516249174408820863e+02,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-3.173406200299964119e+00,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
+        "error");
+      }
+
+    /* first derivatives; note the looser 1e-11 tolerance on the first check */
+    sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,ntrans,
+      SHARP_DP,NULL,NULL);
+    for (int it=0; it<ntrans; ++it)
+      {
+      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-6.859393905369091105e-01,1e-11),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.103947835973212364e+02,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.092463246472086439e+03,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-1.411433220713928165e+02,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
+        "error");
+      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
+        "error");
+      }
+
+    DEALLOC2D(map);
+    DEALLOC2D(alm);
+    }
+
+  sharp_destroy_alm_info(alms);
+  sharp_destroy_geom_info(tinfo);
+  }
+
+/* Performs one alm->map->alm round trip and collects error/timing data.
+ *
+ * ginfo, ainfo : geometry and a_lm layout used for both transforms
+ * spin, ntrans : forwarded to the transform calls; the number of components
+ *                is ntrans for spin 0 and 2*ntrans otherwise
+ * nv           : OR-ed into the flags of both transform calls
+ * err_abs/rel  : filled in by get_errors(); callers in this file release
+ *                the resulting arrays with DEALLOC
+ * t_*, op_*    : optional outputs, may be NULL; when present they are
+ *                post-processed with maxTime()/totalops()
+ */
+static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
+  int spin, int ntrans, int nv, double **err_abs, double **err_rel,
+  double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
+  unsigned long long *op_m2a)
+  {
+  ptrdiff_t n_alm = get_nalms(ainfo);
+  int n_comp = (spin==0) ? ntrans : 2*ntrans;
+  size_t n_pix = get_npix(ginfo);
+
+  /* Flags for the two directions; the analysis step always accumulates. */
+  const int flags_m2a = SHARP_DP|SHARP_ADD|nv;
+#ifdef USE_MPI
+  const int flags_a2m = SHARP_DP|SHARP_ADD|nv;
+#else
+  const int flags_a2m = SHARP_DP|nv;
+#endif
+
+  /* One zero-filled map per component. */
+  double **map;
+  ALLOC2D(map,double,n_comp,n_pix);
+  for (int c=0; c<n_comp; ++c)
+    SET_ARRAY(map[c],0,(int)n_pix,0);
+
+  /* One a_lm set per component, filled by random_alm(). */
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,n_comp,n_alm);
+  for (int c=0; c<n_comp; ++c)
+    random_alm(alm[c],ainfo,spin,c+1);
+
+  /* Synthesis: a_lm -> map. */
+#ifdef USE_MPI
+  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,alm,map,ginfo,ainfo,
+    ntrans,flags_a2m,t_a2m,op_a2m);
+#else
+  sharp_execute(SHARP_ALM2MAP,spin,alm,map,ginfo,ainfo,ntrans,flags_a2m,
+    t_a2m,op_a2m);
+#endif
+  if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
+  if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m);
+
+  /* Presumably records the squared norms and negates alm, so that the
+     accumulating analysis below leaves only the residual -- confirm against
+     get_sqsum_and_invert(). */
+  double *sqsum=get_sqsum_and_invert(alm,n_alm,n_comp);
+
+  /* Analysis: map -> a_lm, added onto the existing alm contents. */
+#ifdef USE_MPI
+  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,alm,map,ginfo,ainfo,
+    ntrans,flags_m2a,t_m2a,op_m2a);
+#else
+  sharp_execute(SHARP_MAP2ALM,spin,alm,map,ginfo,ainfo,ntrans,flags_m2a,
+    t_m2a,op_m2a);
+#endif
+  if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
+  if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a);
+
+  get_errors(alm, n_alm, n_comp, sqsum, err_abs, err_rel);
+
+  DEALLOC(sqsum);
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+  }
+
+/* Runs do_sht() without timing output and asserts that every component's
+   relative and absolute error reported by it is below 1e-10. */
+static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
+  int spin, int ntrans, int nv)
+  {
+  double *abs_err, *rel_err;
+  do_sht (ginfo, ainfo, spin, ntrans, nv, &abs_err, &rel_err, NULL, NULL,
+    NULL, NULL);
+
+  /* ntrans components for spin 0, twice as many otherwise */
+  const int n_comp = (spin==0) ? ntrans : 2*ntrans;
+  for (int c=0; c<n_comp; ++c)
+    {
+    const double rel = rel_err[c], abs = abs_err[c];
+    UTIL_ASSERT((rel<1e-10) && (abs<1e-10),"error");
+    }
+
+  DEALLOC(abs_err);
+  DEALLOC(rel_err);
+  }
+
+/* "acctest" mode: runs check_sign_scale(), then check_accuracy() on a
+   "gauss" grid (lmax=127) for nv=1..6, ntrans=1..6 and a fixed set of spins.
+   Progress messages are printed by task 0 only. */
+static void sharp_acctest(void)
+  {
+  /* spins exercised for every (nv, ntrans) combination, in this order */
+  static const int spins[] = { 0, 1, 2, 3, 30 };
+  const int nspins = (int)(sizeof(spins)/sizeof(spins[0]));
+
+  if (mytask==0)
+    {
+    sharp_module_startup("sharp_acctest",1,1,"",1);
+    printf("Checking signs and scales.\n");
+    }
+  check_sign_scale();
+  if (mytask==0)
+    {
+    printf("Passed.\n\n");
+    printf("Testing map analysis accuracy.\n");
+    }
+
+  int lmax=127, mmax=127, nlat=128, nlon=256;
+  sharp_geom_info *ginfo;
+  sharp_alm_info *ainfo;
+  get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
+
+  for (int nv=1; nv<=6; ++nv)
+    for (int ntrans=1; ntrans<=6; ++ntrans)
+      for (int is=0; is<nspins; ++is)
+        check_accuracy(ginfo,ainfo,spins[is],ntrans,nv);
+
+  sharp_destroy_alm_info(ainfo);
+  sharp_destroy_geom_info(ginfo);
+  if (mytask==0) printf("Passed.\n\n");
+  }
+
+/* "test" mode: repeated round-trip transforms with timing and error report.
+ *
+ * Expected arguments: argv[2]=grid, argv[3]=lmax, argv[4]=mmax,
+ * argv[5]/argv[6]=geometry parameters, argv[7]=spin, argv[8]=ntrans.
+ * do_sht() is repeated until the accumulated best times exceed 2 seconds;
+ * the minimum time per direction and the errors of the last run are printed
+ * by task 0, followed by memory statistics and a one-line summary.
+ */
+static void sharp_test (int argc, const char **argv)
+  {
+  if (mytask==0) sharp_announce("sharp_test");
+  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
+  int lmax=atoi(argv[3]);
+  int mmax=atoi(argv[4]);
+  int gpar1=atoi(argv[5]);
+  int gpar2=atoi(argv[6]);
+  int spin=atoi(argv[7]);
+  int ntrans=atoi(argv[8]);
+
+  if (mytask==0) printf("Testing map analysis accuracy.\n");
+  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
+
+  sharp_geom_info *ginfo;
+  sharp_alm_info *ainfo;
+  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
+
+  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  double t_a2m=1e30, t_m2a=1e30;
+  unsigned long long op_a2m, op_m2a;
+  double *err_abs,*err_rel;
+
+  double t_acc=0;
+  int nrpt=0;
+  while(1)
+    {
+    ++nrpt;
+    double ta2m2, tm2a2;
+    do_sht (ginfo, ainfo, spin, ntrans, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
+      &op_a2m, &op_m2a);
+    /* keep the best (smallest) time seen so far for each direction */
+    if (ta2m2<t_a2m) t_a2m=ta2m2;
+    if (tm2a2<t_m2a) t_m2a=tm2a2;
+    t_acc+=t_a2m+t_m2a;
+    if (t_acc>2.)
+      {
+      if (mytask==0) printf("Best of %d runs\n",nrpt);
+      /* leave without freeing: the error arrays of the final run are still
+         needed for the report below */
+      break;
+      }
+    DEALLOC(err_abs);
+    DEALLOC(err_rel);
+    }
+
+  if (mytask==0) printf("wall time for alm2map: %fs\n",t_a2m);
+  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*op_a2m/t_a2m);
+  if (mytask==0) printf("wall time for map2alm: %fs\n",t_m2a);
+  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*op_m2a/t_m2a);
+
+  if (mytask==0)
+    for (int i=0; i<ncomp; ++i)
+      printf("component %i: rms %e, maxerr %e\n",i,err_rel[i], err_abs[i]);
+
+  /* size of the input/output arrays: 16 bytes per a_lm, 8 bytes per pixel */
+  double iosize = ncomp*(16.*get_nalms(ainfo) + 8.*get_npix(ginfo));
+  iosize = allreduceSumDouble(iosize);
+
+  sharp_destroy_alm_info(ainfo);
+  sharp_destroy_geom_info(ginfo);
+
+  double tmem=totalMem();
+  if (mytask==0)
+    printf("\nMemory high water mark: %.2f MB\n",tmem/(1<<20));
+  if (mytask==0)
+    printf("Memory overhead: %.2f MB (%.2f%% of working set)\n",
+      (tmem-iosize)/(1<<20),100.*(1.-iosize/tmem));
+
+#ifdef _OPENMP
+  int nomp=omp_get_max_threads();
+#else
+  int nomp=1;
+#endif
+
+  double maxerel=0., maxeabs=0.;
+  for (int i=0; i<ncomp; ++i)
+    {
+    if (maxerel<err_rel[i]) maxerel=err_rel[i];
+    if (maxeabs<err_abs[i]) maxeabs=err_abs[i];
+    }
+
+  if (mytask==0)
+    {
+    /* getenv() yields NULL when HOST is not set; handing NULL to a %s
+       conversion is undefined behaviour, so fall back to a placeholder. */
+    const char *host=getenv("HOST");
+    if (host==NULL) host="unknown";
+    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
+           " %9.2f %6.2f %.2e %.2e\n",
+      host,argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2,
+      ntrans,t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
+      100.*(1.-iosize/tmem),maxerel,maxeabs);
+    }
+
+  DEALLOC(err_abs);
+  DEALLOC(err_rel);
+  }
+
+/* "bench" mode: times do_sht() for nv=0..6 and reports, per direction, the
+ * fastest nv in 1..6 together with its gain over the nv=0 run.
+ *
+ * Expected arguments: argv[2]=grid, argv[3]=lmax, argv[4]=mmax,
+ * argv[5]/argv[6]=geometry parameters, argv[7]=spin, argv[8]=ntrans.
+ * Each nv is repeated at least twice and until 3 seconds have accumulated.
+ */
+static void sharp_bench (int argc, const char **argv)
+  {
+  if (mytask==0) sharp_announce("sharp_bench");
+  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
+  int lmax=atoi(argv[3]);
+  int mmax=atoi(argv[4]);
+  int gpar1=atoi(argv[5]);
+  int gpar2=atoi(argv[6]);
+  int spin=atoi(argv[7]);
+  int ntrans=atoi(argv[8]);
+
+  if (mytask==0)
+    {
+    printf("Testing map analysis accuracy.\n");
+    printf("spin=%d, ntrans=%d\n", spin, ntrans);
+    }
+
+  sharp_geom_info *ginfo;
+  sharp_alm_info *ainfo;
+  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
+
+  /* best times for nv==0 ("auto") and for the explicit nv values */
+  double auto_a2m=1e30, auto_m2a=1e30;
+  double best_a2m=1e30, best_m2a=1e30;
+  unsigned long long bestop_a2m=0, bestop_m2a=0;
+  int bestnv_a2m=-1, bestnv_m2a=-1;
+
+  for (int nv=0; nv<=6; ++nv)
+    {
+    int runs=0;
+    double elapsed=0;
+    for (;;)
+      {
+      double cur_a2m, cur_m2a;
+      unsigned long long curop_a2m, curop_m2a;
+      double *eabs, *erel;
+      do_sht (ginfo, ainfo, spin, ntrans, nv, &eabs, &erel,
+        &cur_a2m, &cur_m2a, &curop_a2m, &curop_m2a);
+      /* only the timings matter here; drop the error arrays right away */
+      DEALLOC(eabs);
+      DEALLOC(erel);
+
+      elapsed+=cur_a2m+cur_m2a;
+      ++runs;
+
+      if (nv!=0)
+        {
+        if (cur_a2m<best_a2m)
+          { bestnv_a2m=nv; best_a2m=cur_a2m; bestop_a2m=curop_a2m; }
+        if (cur_m2a<best_m2a)
+          { bestnv_m2a=nv; best_m2a=cur_m2a; bestop_m2a=curop_m2a; }
+        }
+      else
+        {
+        if (cur_a2m<auto_a2m) auto_a2m=cur_a2m;
+        if (cur_m2a<auto_m2a) auto_m2a=cur_m2a;
+        }
+
+      /* stop once at least two runs are done and the time budget is used */
+      if ((runs>=2) && !(elapsed<3.)) break;
+      }
+    }
+
+  if (mytask==0)
+    {
+    printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
+      bestnv_a2m,best_a2m,100.*(auto_a2m-best_a2m)/auto_a2m,
+      1e-9*bestop_a2m/best_a2m);
+    printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
+      bestnv_m2a,best_m2a,100.*(auto_m2a-best_m2a)/auto_m2a,
+      1e-9*bestop_m2a/best_m2a);
+    }
+
+  sharp_destroy_alm_info(ainfo);
+  sharp_destroy_geom_info(ginfo);
+  }
+
+/* Entry point: sets up the task count/rank (via MPI when USE_MPI is
+   defined), then dispatches on argv[1] to the "acctest", "test" or "bench"
+   driver. Any other command is reported through UTIL_FAIL. */
+int main(int argc, const char **argv)
+  {
+#ifdef USE_MPI
+  MPI_Init(NULL,NULL);
+  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
+  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
+#else
+  ntasks=1;
+  mytask=0;
+#endif
+
+  UTIL_ASSERT(argc>=2,"need at least one command line argument");
+  const char *command=argv[1];
+
+  if (!strcmp(command,"acctest"))
+    {
+    sharp_acctest();
+    }
+  else if (!strcmp(command,"test"))
+    {
+    sharp_test(argc,argv);
+    }
+  else if (!strcmp(command,"bench"))
+    {
+    sharp_bench(argc,argv);
+    }
+  else
+    {
+    UTIL_FAIL("unknown command");
+    }
+
+#ifdef USE_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+  }
--- a/external/sharp/libsharp/sharp_vecsupport.h
+++ b/external/sharp/libsharp/sharp_vecsupport.h
@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
 *  Convenience functions for vector arithmetics
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012,2013 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -40,34 +40,46 @@ typedef double Ts;
 #if (VLEN==1)

 typedef double Tv;
+typedef float Tv_s;
+typedef int Tm;

 #define vadd(a,b) ((a)+(b))
+#define vadd_s(a,b) ((a)+(b))
 #define vaddeq(a,b) ((a)+=(b))
+/* The *_mask macros are wrapped in do/while(0) so that each expands to exactly one statement (callers supply the ';') and stays safe inside an unbraced if/else. */
+#define vaddeq_mask(mask,a,b) do { if (mask) (a)+=(b); } while(0)
 #define vsub(a,b) ((a)-(b))
+#define vsub_s(a,b) ((a)-(b))
 #define vsubeq(a,b) ((a)-=(b))
+#define vsubeq_mask(mask,a,b) do { if (mask) (a)-=(b); } while(0)
 #define vmul(a,b) ((a)*(b))
+#define vmul_s(a,b) ((a)*(b))
 #define vmuleq(a,b) ((a)*=(b))
+#define vmuleq_mask(mask,a,b) do { if (mask) (a)*=(b); } while(0)
 #define vfmaeq(a,b,c) ((a)+=(b)*(c))
+#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
 #define vfmseq(a,b,c) ((a)-=(b)*(c))
 #define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
 #define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
 #define vneg(a) (-(a))
 #define vload(a) (a)
+#define vload_s(a) (a)
+#define vloadu(p) (*(p))
+#define vloadu_s(p) (*(p))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
-#define vlt(a,b) (((a)<(b))?1.:0.)
-#define vgt(a,b) (((a)>(b))?1.:0.)
-#define vge(a,b) (((a)>=(b))?1.:0.)
-#define vne(a,b) (((a)!=(b))?1.:0.)
-#define vand(a,b) ((((a)*(b))!=0.)?1.:0.)
-#define vor(a,b) ((((a)+(b))!=0.)?1.:0.)
+#define vlt(a,b) ((a)<(b))
+#define vgt(a,b) ((a)>(b))
+#define vge(a,b) ((a)>=(b))
+#define vne(a,b) ((a)!=(b))
+#define vand_mask(a,b) ((a)&&(b))
+#define vstoreu(p, a) (*(p)=a)
+#define vstoreu_s(p, a) (*(p)=a)

 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }

-#define vanyTrue(a) ((a)!=0.)
-#define vallTrue(a) ((a)!=0.)
-#define vblend(m,a,b) (((m)!=0.) ? (a) : (b))
+#define vanyTrue(a) (a)
+#define vallTrue(a) (a)
 #define vzero 0.
 #define vone 1.

@ -85,14 +97,32 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #endif

 typedef __m128d Tv;
+typedef __m128 Tv_s;
+typedef __m128d Tm;
+
+#if defined(__SSE4_1__)
+#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
+#else
+static inline Tv vblend__(Tv m, Tv a, Tv b)
+  { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
+#endif
+#define vzero _mm_setzero_pd()
+#define vone _mm_set1_pd(1.)

 #define vadd(a,b) _mm_add_pd(a,b)
+#define vadd_s(a,b) _mm_add_ps(a,b)
 #define vaddeq(a,b) a=_mm_add_pd(a,b)
+#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
 #define vsub(a,b) _mm_sub_pd(a,b)
+#define vsub_s(a,b) _mm_sub_ps(a,b)
 #define vsubeq(a,b) a=_mm_sub_pd(a,b)
+#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
 #define vmul(a,b) _mm_mul_pd(a,b)
+#define vmul_s(a,b) _mm_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm_mul_pd(a,b)
+#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
 #define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
@ -100,51 +130,61 @@ typedef __m128d Tv;
  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
 #define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
+#define vload_s(a) _mm_set1_ps(a)
 #define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
 #define vgt(a,b) _mm_cmpgt_pd(a,b)
 #define vge(a,b) _mm_cmpge_pd(a,b)
 #define vne(a,b) _mm_cmpneq_pd(a,b)
-#define vand(a,b) _mm_and_pd(a,b)
-#define vor(a,b) _mm_or_pd(a,b)
+#define vand_mask(a,b) _mm_and_pd(a,b)
 #define vmin(a,b) _mm_min_pd(a,b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
-#if defined(__SSE4_1__)
-#define vblend(m,a,b) _mm_blendv_pd(b,a,m)
-#else
-static inline Tv vblend(Tv m, Tv a, Tv b)
-  { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
-#endif
-#define vzero _mm_setzero_pd()
-#define vone _mm_set1_pd(1.)
+#define vloadu(p) _mm_loadu_pd(p)
+#define vloadu_s(p) _mm_loadu_ps(p)
+#define vstoreu(p, v) _mm_storeu_pd(p, v)
+#define vstoreu_s(p, v) _mm_storeu_ps(p, v)

 #endif

 #if (VLEN==4)

 #include <immintrin.h>
-#ifdef __FMA4__
+#if (USE_FMA4)
 #include <x86intrin.h>
 #endif

 typedef __m256d Tv;
+typedef __m256 Tv_s;
+typedef __m256d Tm;
+
+#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
+#define vzero _mm256_setzero_pd()
+#define vone _mm256_set1_pd(1.)

 #define vadd(a,b) _mm256_add_pd(a,b)
+#define vadd_s(a,b) _mm256_add_ps(a,b)
 #define vaddeq(a,b) a=_mm256_add_pd(a,b)
+#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
 #define vsub(a,b) _mm256_sub_pd(a,b)
+#define vsub_s(a,b) _mm256_sub_ps(a,b)
 #define vsubeq(a,b) a=_mm256_sub_pd(a,b)
+#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
 #define vmul(a,b) _mm256_mul_pd(a,b)
+#define vmul_s(a,b) _mm256_mul_ps(a,b)
 #define vmuleq(a,b) a=_mm256_mul_pd(a,b)
-#ifdef __FMA4__
+#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
+#if (USE_FMA4)
 #define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
+#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
 #define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
+#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
 #define vfmaaeq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
@ -153,21 +193,62 @@ typedef __m256d Tv;
 #endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
+#define vload_s(a) _mm256_set1_ps(a)
 #define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
 #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
 #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
-#define vand(a,b) _mm256_and_pd(a,b)
-#define vor(a,b) _mm256_or_pd(a,b)
+#define vand_mask(a,b) _mm256_and_pd(a,b)
 #define vmin(a,b) _mm256_min_pd(a,b)
 #define vmax(a,b) _mm256_max_pd(a,b)
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)
-#define vblend(m,a,b) _mm256_blendv_pd(b,a,m)
-#define vzero _mm256_setzero_pd()
-#define vone _mm256_set1_pd(1.)
+
+#define vloadu(p) _mm256_loadu_pd(p)
+#define vloadu_s(p) _mm256_loadu_ps(p)
+#define vstoreu(p, v) _mm256_storeu_pd(p, v)
+#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
+
+#endif
+
+#if (VLEN==8)
+
+#include <immintrin.h>
+
+typedef __m512d Tv;
+typedef __mmask8 Tm;
+
+/* 512-bit variants. As in the other VLEN sections, the macros carry no trailing ';' (the caller supplies it) and arguments used next to operators or casts are parenthesized. */
+#define vadd(a,b) _mm512_add_pd(a,b)
+#define vaddeq(a,b) a=_mm512_add_pd(a,b)
+#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b)
+#define vsub(a,b) _mm512_sub_pd(a,b)
+#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
+#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b)
+#define vmul(a,b) _mm512_mul_pd(a,b)
+#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
+#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b)
+#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
+#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
+#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
+#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
+#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
+#define vload(a) _mm512_set1_pd(a)
+#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)(a))
+#define vsqrt(a) _mm512_sqrt_pd(a)
+#define vlt(a,b) _mm512_cmplt_pd_mask(a,b)
+#define vgt(a,b) _mm512_cmpnle_pd_mask(a,b)
+#define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
+#define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
+#define vand_mask(a,b) ((a)&(b))
+#define vmin(a,b) _mm512_min_pd(a,b)
+#define vmax(a,b) _mm512_max_pd(a,b)
+#define vanyTrue(a) ((a)!=0)
+#define vallTrue(a) ((a)==255)
+
+#define vzero _mm512_setzero_pd()
+#define vone _mm512_set1_pd(1.)

 #endif

--- a/external/sharp/libsharp/sharp_vecutil.h
+++ b/external/sharp/libsharp/sharp_vecutil.h
@ -25,14 +25,18 @@
 /*! \file sharp_vecutil.h
 *  Functionality related to vector instruction support
 *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012,2013 Max-Planck-Society
 *  \author Martin Reinecke
 */

 #ifndef SHARP_VECUTIL_H
 #define SHARP_VECUTIL_H

-#if (defined (__AVX__))
+#ifndef VLEN
+
+#if (defined (__MIC__))
+#define VLEN 8
+#elif (defined (__AVX__))
 #define VLEN 4
 #elif (defined (__SSE2__))
 #define VLEN 2
@ -41,3 +45,19 @@
 #endif

 #endif
+
+#if (VLEN==1)
+#define VLEN_s 1
+#else
+#define VLEN_s (2*VLEN)
+#endif
+
+#ifndef USE_FMA4
+#ifdef __FMA4__
+#define USE_FMA4 1
+#else
+#define USE_FMA4 0
+#endif
+#endif
+
+#endif
--- a/external/sharp/libsharp/sharp_ylmgen_c.c
+++ b/external/sharp/libsharp/sharp_ylmgen_c.c
@ -25,7 +25,7 @@
 /*
 *  Helper code for efficient calculation of Y_lm(theta,phi=0)
 *
- *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  Copyright (C) 2005-2014 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -47,7 +47,9 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)

  gen->lmax = l_max;
  gen->mmax = m_max;
-  UTIL_ASSERT(spin>=0,"incorrect spin");
+  UTIL_ASSERT(spin>=0,"incorrect spin: must be nonnegative");
+  UTIL_ASSERT(l_max>=spin,"incorrect l_max: must be >= spin");
+  UTIL_ASSERT(l_max>=m_max,"incorrect l_max: must be >= m_max");
  gen->s = spin;
  UTIL_ASSERT((sharp_minscale<=0)&&(sharp_maxscale>0),
    "bad value for min/maxscale");