Merge branch 'pol_ispack' into 'master'

Pol ispack See merge request mtr/libsharp!16
2019-01-15 11:16:06 +01:00 · 2019-01-15 11:16:06 +01:00 · ea75d4f65b
commit ea75d4f65b
parent d9c8021edf 7440aab6ec
85 changed files with 4561 additions and 10046 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,17 +1,35 @@
-*.o
-*.so
-#*
-*~
-*.pyc
-*.pyo
-
+**.o
+**.lo
+**.la
+**.so
+**/#*
+**~
+**.pyc
+**.pyo
+.libs
+**/.deps
+**/.dirstamp
+libsharp-uninstalled.pc
+libsharp-uninstalled.sh
+libsharp.pc
+libsharp.pc.in
+perf.data*
 /auto
 /autom4te.cache
-/config.log
-/config.status
-/config/config.auto
+/m4
+config.log
+config.guess
+config.status
+config.sub
+ltmain.sh
+compile
+libtool
+missing
+/comp
 /configure
-/sharp_oracle.inc
-
-/python/libsharp/libsharp.c
-/python/libsharp/libsharp_mpi.c
+/Makefile
+/Makefile.in
+/aclocal.m4
+/ar-lib
+/depcomp
+/install-sh
--- a/85
+++ b/85
@ -0,0 +1,85 @@
+Libsharp is configured, compiled and installed using GNU autotools.
+
+If you have cloned the libsharp repository, you have to run
+"autoreconf -i" before starting the configuration, which requires several
+GNU developer tools to be available on your system.
+
+When using a release tarball, configuration is done via
+
+[CC=...] [CFLAGS=...] ./configure
+
+The following sections briefly describe possible choices for compilers and
+flags.
+
+
+Fast math
+---------
+
+Specifying "-ffast-math" is important for all compilers, since it allows the
+compiler to fuse multiplications and additions into FMA instructions, which is
+forbidden by the C99 standard. Since FMAs are a central aspect of the algorithm,
+they are needed for optimum performance.
+
+If you are calling libsharp from other code which requires strict adherence
+to the C99 standard, you should still be able to compile libsharp with
+"-ffast-math" without any problems.
+
+
+Runtime CPU selection with gcc
+------------------------------
+
+When using a recent gcc (6.0 and newer) on an x86_64 platform, the build
+machinery will compile the time-critical functions for several different
+architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate
+implementation will be selected at runtime.
+This only happens if you do _not_ explicitly specify a target architecture via
+the compiler flags. I.e., please do _not_ specify "-march=native" or
+"-mavx" or similar if you want a portable binary that will run
+efficiently on different x86_64 CPUs.
+If you are compiling libsharp for a particular target CPU only, or if you are
+using a different compiler, however, "-march=native" should be used. The
+resulting binary will most likely not run on other computers, though.
+
+
+OpenMP
+------
+
+OpenMP should be switched on for maximum performance, and at runtime
+OMP_NUM_THREADS should be set to the number of hardware threads (not physical
+cores) of the system.
+(Usually this is already the default setting when OMP_NUM_THREADS is not
+specified.)
+
+
+MPI
+---
+
+MPI support is enabled by using the MPI compiler (typically "mpicc") _and_
+adding the flag "-DUSE_MPI".
+When using MPI and OpenMP simultaneously, the product of MPI tasks per node
+and OMP_NUM_THREADS should be equal to the number of hardware threads available
+on the node. One MPI task per node should result in the best performance.
+
+
+Example configure invocations
+=============================
+
+GCC, OpenMP, portable binary:
+CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure
+
+GCC, no OpenMP, portable binary:
+CFLAGS="-std=c99 -O3 -ffast-math" ./configure
+
+Clang, OpenMP, nonportable binary:
+CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
+
+Intel C compiler, OpenMP, nonportable binary:
+CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
+
+MPI support, nonportable binary:
+CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
+
+Additional GCC flags for pedantic warnings and debugging:
+
+-Wall -Wextra -Wshadow -Wmissing-prototypes -Wfatal-errors -pedantic -g
+-fsanitize=address
--- a/80
+++ b/80
@ -1,80 +0,0 @@
-SHARP_TARGET?=auto
-ifndef SHARP_TARGET
-  SHARP_TARGET:=$(error SHARP_TARGET undefined. Please see README.compilation for help)UNDEFINED
-endif
-
-default: compile_all
-SRCROOT:=$(shell pwd)
-include $(SRCROOT)/config/config.$(SHARP_TARGET)
-include $(SRCROOT)/config/rules.common
-
-all_hdr:=
-all_lib:=
-all_cbin:=
-
-FULL_INCLUDE:=
-
-include c_utils/planck.make
-include libfftpack/planck.make
-include libsharp/planck.make
-include docsrc/planck.make
-
-CYTHON_MODULES=python/libsharp/libsharp.so $(if $(MPI_CFLAGS), python/libsharp/libsharp_mpi.so)
-
-$(all_lib): %: | $(LIBDIR)_mkdir
-	@echo "#  creating library $*"
-	$(ARCREATE) $@ $^
-
-$(all_cbin): %: | $(BINDIR)_mkdir
-	@echo "#  linking C binary $*"
-	$(CL) -o $@ $^ $(CLFLAGS)
-
-compile_all: $(all_cbin) hdrcopy
-
-hdrclean:
-	@if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi
-
-hdrcopy: | $(INCDIR)_mkdir
-	@if [ "$(all_hdr)" ]; then cp -p $(all_hdr) $(INCDIR); fi
-
-$(notdir $(all_cbin)) : % : $(BINDIR)/%
-
-test: compile_all
-	$(BINDIR)/sharp_testsuite acctest && \
-	$(BINDIR)/sharp_testsuite test healpix 2048 -1 1024 -1 0 1 && \
-	$(BINDIR)/sharp_testsuite test fejer1 2047 -1 -1 4096 2 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 2
-
-perftest: compile_all
-	$(BINDIR)/sharp_testsuite test healpix 2048 -1 1024 -1 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 63 -1 -1 128 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 127 -1 -1 256 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 255 -1 -1 512 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 511 -1 -1 1024 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 1023 -1 -1 2048 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 2047 -1 -1 4096 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 4095 -1 -1 8192 0 1 && \
-	$(BINDIR)/sharp_testsuite test gauss 8191 -1 -1 16384 0 1
-
-%.c: %.c.in
-# Only do this if the md5sum changed, in order to avoid Python and Jinja
-# dependency when not modifying the c.in file
-	grep `md5sum $< | cut -d ' ' -f 1` $@ || ./runjinja.py < $< > $@
-
-genclean:
-	rm libsharp/sharp_legendre.c || exit 0
-
-$(CYTHON_MODULES): %.so: %.pyx
-ifndef PIC_CFLAGS
-	$(error Python extension must be built using the --enable-pic configure option.)
-endif
-	cython $<
-	$(CC) $(DEBUG_CFLAGS) $(OPENMP_CFLAGS) $(PIC_CFLAGS) `python-config --cflags` -I$(INCDIR) -o $(<:.pyx=.o) -c $(<:.pyx=.c)
-	$(CL) -shared $(<:.pyx=.o) $(OPENMP_CFLAGS) $(CYTHON_OBJ) -L$(LIBDIR) -lsharp -lfftpack -lc_utils -L`python-config --prefix`/lib `python-config --ldflags` -o $@
-
-python: $(all_lib) hdrcopy $(CYTHON_MODULES)
-
-# the following test files are automatic; the sht wrapper test
-# must be run manually and requires MPI at the moment..
-pytest: python
-	cd python && nosetests --nocapture libsharp/tests/test_legendre_table.py libsharp/tests/test_legendre.py
--- a/Makefile.am
+++ b/Makefile.am
@ -0,0 +1,49 @@
+# Additional macro directory passed to aclocal.
+ACLOCAL_AMFLAGS = -I m4
+
+# Header search paths for the in-tree sources, followed by whatever
+# configure substitutes for AM_CFLAGS.
+AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
+
+# The one libtool library that gets built and installed.
+lib_LTLIBRARIES = libsharp.la
+
+# Sources and private headers that make up libsharp.la.
+src_sharp = \
+  c_utils/c_utils.c \
+  c_utils/c_utils.h \
+  pocketfft/pocketfft.c \
+  pocketfft/pocketfft.h \
+  libsharp/sharp.c \
+  libsharp/sharp_almhelpers.c \
+  libsharp/sharp_core.c \
+  libsharp/sharp_core_avx.c \
+  libsharp/sharp_core_avx2.c \
+  libsharp/sharp_core_fma.c \
+  libsharp/sharp_core_fma4.c \
+  libsharp/sharp_core_avx512f.c \
+  libsharp/sharp_geomhelpers.c \
+  libsharp/sharp_legendre_roots.c \
+  libsharp/sharp_ylmgen_c.c \
+  libsharp/sharp_internal.h \
+  libsharp/sharp_legendre_roots.h \
+  libsharp/sharp_vecsupport.h \
+  libsharp/sharp_ylmgen_c.h
+
+libsharp_la_SOURCES = $(src_sharp)
+
+# Public headers installed into $(includedir).
+include_HEADERS = \
+  libsharp/sharp.h \
+  libsharp/sharp_geomhelpers.h \
+  libsharp/sharp_almhelpers.h \
+  libsharp/sharp_cxx.h
+
+# Files that are distributed but not listed in any *_SOURCES variable.
+EXTRA_DIST = \
+  libsharp/sharp_core_inc.c \
+  runtest.sh
+
+# Test driver: built for "make check" and exercised through runtest.sh.
+check_PROGRAMS = sharp_testsuite
+sharp_testsuite_SOURCES = \
+  libsharp/sharp_testsuite.c \
+  c_utils/memusage.c \
+  c_utils/memusage.h \
+  c_utils/walltime_c.c \
+  c_utils/walltime_c.h
+sharp_testsuite_LDADD = libsharp.la
+
+TESTS = runtest.sh
+
+# pkg-config metadata; not distributed (nodist_), installed under
+# $(libdir)/pkgconfig.
+pkgconfigdir = $(libdir)/pkgconfig
+nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
+
+# pkg-config related files removed by "make distclean".
+DISTCLEANFILES = \
+  @PACKAGE_NAME@.pc \
+  @PACKAGE_NAME@.pc.in \
+  @PACKAGE_NAME@-uninstalled.pc \
+  @PACKAGE_NAME@-uninstalled.sh
--- a/README.md
+++ b/README.md
@ -1,43 +1,35 @@
 # Libsharp

-*IMPORTANT NOTE*: It appears that the default branch upon cloning from
-github.com/dagss/libsharp was an outdated 'dagss' branch instead of
-the 'master' branch. To get the latest copy,
-please do `git checkout master; git pull`. New clones are no longer affected.
+Library for efficient spherical harmonic transforms at arbitrary spins,
+supporting CPU vectorization, OpenMP and MPI.

 ## Paper

 https://arxiv.org/abs/1303.4945

+## News
+
+### January 2019
+
+This update features significant speedups thanks to important algorithmic
+discoveries by Keiichi Ishioka
+(https://www.jstage.jst.go.jp/article/jmsj/96/2/96_2018-019/_article and
+personal communication).
+
+These improvements reduce the fraction of CPU time spent on evaluating the
+recurrences for Y_lm coefficients, which means that computing multiple
+simultaneous SHTs no longer has a big performance advantage compared to SHTs
+done one after the other.
+As a consequence, libsharp support for simultaneous SHTs was dropped, making
+its interface much simpler.
+
+With the proper compilers and flags (see the file COMPILE for details) libsharp
+is now built with support for SSE2, AVX, AVX2, FMA3, FMA4 and AVX512F and the
+appropriate implementation is selected dynamically at runtime. This should
+provide a very significant performance boost for everyone using pre-compiled
+portable binaries.
+
 ## Compilation

-GNU make is required for compilation.
-
-Libsharp compilation has been successfully tested with GNU and Intel compilers.
-When using gcc, version 4.x is required [1].
-Since libsharp was written in standard C99, other compilers should work fine,
-but SSE2/AVX support will most likely be deactivated.
-
-If you obtained libsharp directly from the git repository, you will also
-need a copy of the GNU autotools. In this case, run "autoconf" in libsharp's
-main directory before any other steps.
-For libsharp releases distributed as a .tar.gz file, this step is not necessary.
-
-Afterwards, simply run "./configure"; if this fails, please refer to the output
-of "./configure --help" for additional hints and, if necessary, provide
-additional flags to the configure script.
-Once the script finishes successfully, run "make"
-(or "gmake"). This should install the compilation products in the
-subdirectory "auto/".
-
-Documentation can be created by the command "(g)make doc".
-However this requires the doxygen application to be installed
-on your system.
-The documentation will be created in the subdirectory doc/.
-
-
-[1] Some versions of the gcc 4.4.x release series contain a bug which causes
-the compiler to crash during libsharp compilation. This appears to be fixed
-in the gcc 4.4.7 release. It is possible to work around this problem by adding
-the compiler flag "-fno-tree-fre" after the other optimization flags - the
-configure script should do this automatically.
+The library uses the standard `autotools` mechanism for configuration,
+compilation and installation. See the file `COMPILE` for configuration hints.
--- a/c_utils/c_utils.c
+++ b/c_utils/c_utils.c
@ -25,7 +25,7 @@
 /*
 *  Convenience functions
 *
- *  Copyright (C) 2008, 2009, 2010, 2011, 2012 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -44,7 +44,7 @@ void util_warn_ (const char *file, int line, const char *func, const char *msg)

 /* This function tries to avoid allocations with a total size close to a high
   power of two (called the "critical stride" here), by adding a few more bytes
-   if necssary. This lowers the probability that two arrays differ by a multiple
+   if necessary. This lowers the probability that two arrays differ by a multiple
   of the critical stride in their starting address, which in turn lowers the
   risk of cache line contention. */
 static size_t manipsize(size_t sz)
@ -61,7 +61,7 @@ void *util_malloc_ (size_t sz)
  {
  void *res;
  if (sz==0) return NULL;
-  res = _mm_malloc(manipsize(sz),16);
+  res = _mm_malloc(manipsize(sz),32);
  UTIL_ASSERT(res,"_mm_malloc() failed");
  return res;
  }
--- a/c_utils/c_utils.h
+++ b/c_utils/c_utils.h
@ -25,7 +25,7 @@
 /*! \file c_utils.h
 *  Convenience functions
 *
- *  Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
 *  \author Martin Reinecke
 *  \note This file should only be included from .c files, NOT from .h files.
 */
@ -144,4 +144,10 @@ void util_free_ (void *ptr);
 }
 #endif

+#ifdef __GNUC__ /* NOINLINE: function attribute that forbids inlining, where supported */
+#define NOINLINE __attribute__((noinline))
+#else /* __GNUC__ not defined: NOINLINE expands to nothing */
+#define NOINLINE
+#endif
+
 #endif
--- a/c_utils/planck.make
+++ b/c_utils/planck.make
@ -1,18 +0,0 @@
-PKG:=c_utils
-
-SD:=$(SRCROOT)/$(PKG)
-OD:=$(BLDROOT)/$(PKG)
-
-FULL_INCLUDE+= -I$(SD)
-
-HDR_$(PKG):=$(SD)/*.h
-LIB_$(PKG):=$(LIBDIR)/libc_utils.a
-
-OBJ:=c_utils.o walltime_c.o memusage.o
-OBJ:=$(OBJ:%=$(OD)/%)
-
-$(OBJ): $(HDR_$(PKG)) | $(OD)_mkdir
-$(LIB_$(PKG)): $(OBJ)
-
-all_hdr+=$(HDR_$(PKG))
-all_lib+=$(LIB_$(PKG))
--- a/c_utils/walltime_c.c
+++ b/c_utils/walltime_c.c
@ -25,7 +25,7 @@
 /*
 *  Functionality for reading wall clock time
 *
- *  Copyright (C) 2010, 2011 Max-Planck-Society
+ *  Copyright (C) 2010-2016 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -33,6 +33,8 @@
 #include <omp.h>
 #elif defined (USE_MPI)
 #include "mpi.h"
+#elif defined (_WIN32)
+#include <Windows.h>
 #else
 #include <sys/time.h>
 #include <stdlib.h>
@ -46,6 +48,17 @@ double wallTime(void)
  return omp_get_wtime();
 #elif defined (USE_MPI)
  return MPI_Wtime();
+#elif defined (_WIN32)
+  static double inv_freq = -1.; /* seconds per counter tick; negative means "not queried yet" */
+  if (inv_freq<0)
+    {
+    LARGE_INTEGER freq;
+    QueryPerformanceFrequency(&freq);
+    inv_freq = 1. / (double)freq.QuadPart; /* C-style cast: "double(x)" is C++-only and does not compile as C */
+    }
+  LARGE_INTEGER count;
+  QueryPerformanceCounter(&count);
+  return count.QuadPart*inv_freq; /* counter ticks converted to seconds */
 #else
  struct timeval t;
  gettimeofday(&t, NULL);
--- a/config/config.auto.in
+++ b/config/config.auto.in
@ -1,12 +0,0 @@
-@SILENT_RULE@
-
-CC=@CC@
-CL=@CC@
-CCFLAGS_NO_C=@CCFLAGS_NO_C@
-CCFLAGS=$(CCFLAGS_NO_C) -c
-CLFLAGS=-L. -L$(LIBDIR) @LDCCFLAGS@ -lm
-DEBUG_CFLAGS=@DEBUG_CFLAGS@
-MPI_CFLAGS=@MPI_CFLAGS@
-OPENMP_CFLAGS=@OPENMP_CFLAGS@
-PIC_CFLAGS=@PIC_CFLAGS@
-ARCREATE=@ARCREATE@
--- a/config/rules.common
+++ b/config/rules.common
@ -1,33 +0,0 @@
-BLDROOT   = $(SRCROOT)/build.$(SHARP_TARGET)
-PREFIX    = $(SRCROOT)/$(SHARP_TARGET)
-BINDIR    = $(PREFIX)/bin
-INCDIR    = $(PREFIX)/include
-LIBDIR    = $(PREFIX)/lib
-DOCDIR    = $(SRCROOT)/doc
-PYTHONDIR = $(SRCROOT)/python/libsharp
-
-# do not use any suffix rules
-.SUFFIXES:
-# do not use any default rules
-.DEFAULT:
-
-echo_config:
-	@echo using configuration \'$(SHARP_TARGET)\'
-
-$(BLDROOT)/%.o : $(SRCROOT)/%.c | echo_config
-	@echo "#  compiling $*.c"
-	cd $(@D) && $(CC) $(FULL_INCLUDE) -I$(BLDROOT) $(CCFLAGS) $<
-
-$(BLDROOT)/%.o : $(SRCROOT)/%.cc | echo_config
-	@echo "#  compiling $*.cc"
-	cd $(@D) && $(CXX) $(FULL_INCLUDE) -I$(BLDROOT) $(CXXCFLAGS) $<
-
-%_mkdir:
-	@if [ ! -d $* ]; then mkdir -p $* ; fi
-
-clean:
-	rm -rf $(BLDROOT) $(PREFIX) $(DOCDIR) autom4te.cache/ config.log config.status
-	rm -rf $(PYTHONDIR)/*.c $(PYTHONDIR)/*.o $(PYTHONDIR)/*.so
-
-distclean: clean
-	rm -f config/config.auto
--- a/configure.ac
+++ b/configure.ac
@ -1,113 +1,45 @@
-AC_INIT(config/config.auto.in)
+AC_INIT([libsharp], [1.0.0])
+AM_INIT_AUTOMAKE([foreign subdir-objects -Wall -Werror])
+AM_MAINTAINER_MODE([enable])

-AC_CHECK_PROG([uname_found],[uname],[1],[0])
-if test $uname_found -eq 0 ; then
-    echo "No uname found; setting system type to unknown."
-    system="unknown"
-else
-    system=`uname -s`-`uname -r`
-fi
-AC_LANG([C])

-AC_TRY_COMPILE([], [@%:@ifndef __INTEL_COMPILER
-choke me
-@%:@endif], [ICC=[yes]], [ICC=[no]])
+dnl
+dnl Needed for linking on Windows.
+dnl Protect with m4_ifdef because AM_PROG_AR is required in
+dnl automake >= 1.12 when using -Wall, but the macro is
+dnl absent in old versions of automake.
+dnl
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])

-if test $ICC = yes; then GCC=no; fi
-CCTYPE=unknown
-if test $GCC = yes; then CCTYPE=gcc; fi
-if test $ICC = yes; then CCTYPE=icc; fi
-AC_OPENMP
+LT_INIT
+AC_CONFIG_MACRO_DIR([m4])

-SILENT_RULE=".SILENT:"
-AC_ARG_ENABLE(noisy-make,
-  [  --enable-noisy-make     enable detailed make output],
-  [if test "$enableval" = yes; then
-     SILENT_RULE=""
-   fi])
+dnl
+dnl By default, install the headers into a subdirectory of
+dnl ${prefix}/include to avoid possible header filename collisions.
+dnl
+includedir="${includedir}/${PACKAGE_NAME}"

-ENABLE_MPI=no
-AC_ARG_ENABLE(mpi,
-  [  --enable-mpi            enable generation of MPI-parallel code],
-  [if test "$enableval" = yes; then
-     ENABLE_MPI=yes
-   fi])
+dnl
+dnl Enable silent build rules if this version of Automake supports them
+dnl
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

-ENABLE_DEBUG=no
-AC_ARG_ENABLE(debug,
-  [  --enable-debug          enable generation of debugging symbols],
-  [if test "$enableval" = yes; then
-     ENABLE_DEBUG=yes
-   fi])

-ENABLE_PIC=no
-AC_ARG_ENABLE(pic,
-  [  --enable-pic            enable generation of position independent code],
-  [if test "$enableval" = yes; then
-     ENABLE_PIC=yes
-   fi])
+AC_PROG_CC_C99

-case $CCTYPE in
-  gcc)
-    CCFLAGS="-O3 -fno-tree-vectorize -ffast-math -fomit-frame-pointer -std=c99 -pedantic -Wextra -Wall -Wno-unknown-pragmas -Wshadow -Wmissing-prototypes -Wfatal-errors"
-    GCCVERSION="`$CC -dumpversion 2>&1`"
-    echo "Using gcc version $GCCVERSION"
-    AC_SUBST(GCCVERSION)
-    changequote(,)
-    gcc43=`echo $GCCVERSION | grep -c '^4\.[3456789]'`
-    gcc44=`echo $GCCVERSION | grep -c '^4\.4'`
-    changequote([,])
-    if test $gcc43 -gt 0; then
-      CCFLAGS="$CCFLAGS -march=native"
-    fi
-    if test $gcc44 -gt 0; then
-      CCFLAGS="$CCFLAGS -fno-tree-fre"
-    fi
-    ;;
-  icc)
-    CCFLAGS="-O3 -xHOST -std=c99 -ip -Wbrief -Wall -vec-report0 -openmp-report0 -wd383,981,1419,1572"
-    ;;
-  *)
-    CCFLAGS="-O2"
-    # Don't do anything now
-    ;;
-esac
+# Link against the math library; append to LIBS so that libraries the user passed to configure are kept.
+LIBS="$LIBS -lm"

-case $system in
-  Darwin-*)
-    ARCREATE="libtool -static -o"
-    ;;
-  *)
-    ARCREATE="ar cr"
-    ;;
-esac
+AC_PROG_LIBTOOL

-if test $ENABLE_DEBUG = yes; then
-  DEBUG_CFLAGS="-g"
-fi
+dnl
+dnl Create pkgconfig .pc file.
+dnl
+AX_CREATE_PKGCONFIG_INFO(,,,,[])
+AC_SUBST([LIBS])
+AC_SUBST([AM_CFLAGS])
+AC_SUBST([AM_LDFLAGS])

-if test $ENABLE_PIC = yes; then
-  PIC_CFLAGS="-fPIC"
-fi
-
-if test $ENABLE_MPI = yes; then
-  MPI_CFLAGS="-DUSE_MPI"
-fi
-
-CCFLAGS="$CCFLAGS $DEBUG_CFLAGS $OPENMP_CFLAGS $PIC_CFLAGS $MPI_CFLAGS"
-
-CCFLAGS_NO_C="$CCFLAGS $CPPFLAGS"
-
-LDCCFLAGS="$LDFLAGS $CCFLAGS"
-
-AC_SUBST(SILENT_RULE)
-AC_SUBST(CC)
-AC_SUBST(CCFLAGS_NO_C)
-AC_SUBST(LDCCFLAGS)
-AC_SUBST(DEBUG_CFLAGS)
-AC_SUBST(MPI_CFLAGS)
-AC_SUBST(OPENMP_CFLAGS)
-AC_SUBST(PIC_CFLAGS)
-AC_SUBST(ARCREATE)
-
-AC_OUTPUT(config/config.auto)
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
--- a/docsrc/c_utils.dox
+++ b/docsrc/c_utils.dox
@ -1,290 +0,0 @@
-# Doxyfile 1.8.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "LevelS C support library"
-PROJECT_NUMBER         = 0.1
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = .
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = NO
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = NO
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = YES
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = YES
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = NO
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = YES
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-INPUT                  = ../c_utils
-INPUT_ENCODING         = UTF-8
-FILE_PATTERNS          = *.h \
-                         *.c \
-                         *.dox
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = YES
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = NO
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = htmldoc
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            = footer.html
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = NO
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = NO
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = YES
-PAPER_TYPE             = a4wide
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = NO
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-TAGFILES               =
-GENERATE_TAGFILE       = c_utils.tag
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = FreeSans
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-UML_LIMIT_NUM_FIELDS   = 10
-TEMPLATE_RELATIONS     = YES
-INCLUDE_GRAPH          = NO
-INCLUDED_BY_GRAPH      = NO
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = NO
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = NO
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
--- a/docsrc/footer.html
+++ b/docsrc/footer.html
@ -1,5 +0,0 @@
-<hr><address style="align: right;"><small>
-Generated on $datetime for $projectname
-</a> </small></address>
-</body>
-</html>
--- a/docsrc/index_code.html
+++ b/docsrc/index_code.html
@ -1,15 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
-<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
-<title>Libsharp source code documentation</title>
-</head><body>
-<H1>Libsharp source code documentation</H1>
-
-<H2>C interfaces</H2>
-
-<ul>
-<li><a href="c_utils/index.html">C support library</a>
-<li><a href="libfftpack/index.html">FFT interface</a>
-<li><a href="libsharp/index.html">Library for spherical harmonic transforms</a>
-</ul>
-</body>
-</html>
--- a/docsrc/libfftpack.dox
+++ b/docsrc/libfftpack.dox
@ -1,290 +0,0 @@
-# Doxyfile 1.8.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "LevelS FFT library"
-PROJECT_NUMBER         = 0.1
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = .
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = NO
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = NO
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = YES
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = YES
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = NO
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = YES
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-# The path is relative to the directory doxygen is started from;
-# docsrc/planck.make changes into docsrc before invoking doxygen.
-INPUT                  = ../libfftpack
-INPUT_ENCODING         = UTF-8
-# Only C headers, C sources and standalone .dox pages are scanned.
-FILE_PATTERNS          = *.h \
-                         *.c \
-                         *.dox
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = YES
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = NO
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = htmldoc
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            = footer.html
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = NO
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = NO
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = YES
-PAPER_TYPE             = a4wide
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = NO
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-# Format is <tagfile>=<location>: c_utils.tag is read from the current
-# directory and links to c_utils symbols are emitted relative to "../c_utils".
-# c_utils.tag must exist before this file is processed (presumably written by
-# c_utils.dox; docsrc/planck.make runs c_utils, libfftpack, libsharp in order).
-TAGFILES               = c_utils.tag=../c_utils
-# Tag file produced by this run; libsharp.dox lists it in its own TAGFILES.
-GENERATE_TAGFILE       = libfftpack.tag
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = FreeSans
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-UML_LIMIT_NUM_FIELDS   = 10
-TEMPLATE_RELATIONS     = YES
-INCLUDE_GRAPH          = NO
-INCLUDED_BY_GRAPH      = NO
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = NO
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = NO
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
--- a/docsrc/libsharp.dox
+++ b/docsrc/libsharp.dox
@ -1,291 +0,0 @@
-# Doxyfile 1.8.1
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "LevelS SHT library"
-PROJECT_NUMBER         = 0.1
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-OUTPUT_DIRECTORY       = .
-CREATE_SUBDIRS         = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = NO
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       =
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = NO
-STRIP_FROM_PATH        =
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 8
-ALIASES                =
-TCL_SUBST              =
-OPTIMIZE_OUTPUT_FOR_C  = YES
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-SYMBOL_CACHE_SIZE      = 0
-LOOKUP_CACHE_SIZE      = 0
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-HIDE_UNDOC_MEMBERS     = YES
-HIDE_UNDOC_CLASSES     = YES
-HIDE_FRIEND_COMPOUNDS  = YES
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = YES
-HIDE_SCOPE_NAMES       = NO
-SHOW_INCLUDE_FILES     = YES
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = NO
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-QUIET                  = YES
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_NO_PARAMDOC       = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LOGFILE           =
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-# The path is relative to the directory doxygen is started from;
-# docsrc/planck.make changes into docsrc before invoking doxygen.
-INPUT                  = ../libsharp
-INPUT_ENCODING         = UTF-8
-# Only C headers, C sources and standalone .dox pages are scanned.
-FILE_PATTERNS          = *.h \
-                         *.c \
-                         *.dox
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       =
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             =
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-SOURCE_BROWSER         = YES
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = NO
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
-IGNORE_PREFIX          =
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-GENERATE_HTML          = YES
-HTML_OUTPUT            = htmldoc
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            = footer.html
-HTML_STYLESHEET        =
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = NO
-ENUM_VALUES_PER_LINE   = 4
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-FORMULA_FONTSIZE       = 10
-FORMULA_TRANSPARENT    = YES
-USE_MATHJAX            = NO
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-MATHJAX_EXTENSIONS     =
-SEARCHENGINE           = NO
-SERVER_BASED_SEARCH    = NO
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-GENERATE_LATEX         = NO
-LATEX_OUTPUT           = latex
-LATEX_CMD_NAME         = latex
-MAKEINDEX_CMD_NAME     = makeindex
-COMPACT_LATEX          = YES
-PAPER_TYPE             = a4wide
-EXTRA_PACKAGES         =
-LATEX_HEADER           =
-LATEX_FOOTER           =
-PDF_HYPERLINKS         = YES
-USE_PDFLATEX           = YES
-LATEX_BATCHMODE        = NO
-LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
-LATEX_BIB_STYLE        = plain
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-GENERATE_RTF           = NO
-RTF_OUTPUT             = rtf
-COMPACT_RTF            = NO
-RTF_HYPERLINKS         = NO
-RTF_STYLESHEET_FILE    =
-RTF_EXTENSIONS_FILE    =
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-GENERATE_MAN           = NO
-MAN_OUTPUT             = man
-MAN_EXTENSION          = .3
-MAN_LINKS              = NO
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-GENERATE_XML           = NO
-XML_OUTPUT             = xml
-XML_SCHEMA             =
-XML_DTD                =
-XML_PROGRAMLISTING     = YES
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-GENERATE_AUTOGEN_DEF   = NO
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-GENERATE_PERLMOD       = NO
-PERLMOD_LATEX          = NO
-PERLMOD_PRETTY         = YES
-PERLMOD_MAKEVAR_PREFIX =
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-ENABLE_PREPROCESSING   = YES
-MACRO_EXPANSION        = NO
-EXPAND_ONLY_PREDEF     = NO
-SEARCH_INCLUDES        = YES
-INCLUDE_PATH           =
-INCLUDE_FILE_PATTERNS  =
-PREDEFINED             =
-EXPAND_AS_DEFINED      =
-SKIP_FUNCTION_MACROS   = YES
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-# Format is <tagfile>=<location>. Both tag files are read from the current
-# directory, so they must already have been generated: libfftpack.tag is
-# written by libfftpack.dox, c_utils.tag presumably by c_utils.dox.
-# docsrc/planck.make runs the three .dox files in the required order.
-TAGFILES               = libfftpack.tag=../libfftpack \
-                         c_utils.tag=../c_utils
-# Tag file produced by this run.
-GENERATE_TAGFILE       = libsharp.tag
-ALLEXTERNALS           = NO
-EXTERNAL_GROUPS        = YES
-PERL_PATH              = /usr/bin/perl
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-CLASS_DIAGRAMS         = YES
-MSCGEN_PATH            =
-HIDE_UNDOC_RELATIONS   = YES
-HAVE_DOT               = NO
-DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = FreeSans
-DOT_FONTSIZE           = 10
-DOT_FONTPATH           =
-CLASS_GRAPH            = YES
-COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
-UML_LOOK               = NO
-UML_LIMIT_NUM_FIELDS   = 10
-TEMPLATE_RELATIONS     = YES
-INCLUDE_GRAPH          = NO
-INCLUDED_BY_GRAPH      = NO
-CALL_GRAPH             = NO
-CALLER_GRAPH           = NO
-GRAPHICAL_HIERARCHY    = NO
-DIRECTORY_GRAPH        = YES
-DOT_IMAGE_FORMAT       = png
-INTERACTIVE_SVG        = NO
-DOT_PATH               =
-DOTFILE_DIRS           =
-MSCFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
-MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
-DOT_MULTI_TARGETS      = NO
-GENERATE_LEGEND        = YES
-DOT_CLEANUP            = YES
--- a/docsrc/planck.make
+++ b/docsrc/planck.make
@ -1,20 +0,0 @@
-PKG:=docsrc
-
-# None of these targets creates a file of the same name; without .PHONY an
-# existing file or directory called e.g. "doc" would make the target a no-op.
-.PHONY: docsrc_idx docsrc_code_doc docsrc_clean doc
-
-# Install the landing page that links to the per-package documentation.
-docsrc_idx: $(DOCDIR)_mkdir
-	cp $(SRCROOT)/docsrc/index_code.html $(DOCDIR)/index.html
-
-# Run doxygen for every package and move its HTML output below $(DOCDIR).
-# The package order matters: libfftpack.dox reads c_utils.tag and libsharp.dox
-# reads c_utils.tag and libfftpack.tag, so those must be generated first.
-# Everything is chained with && so that a failing cd, doxygen or mv aborts the
-# recipe instead of continuing in the wrong directory or with stale output.
-docsrc_code_doc: $(DOCDIR)_mkdir docsrc_idx
-	cd $(SRCROOT)/docsrc && \
-	for i in c_utils libfftpack libsharp; do \
-	  doxygen $${i}.dox && \
-	  rm -rf $(DOCDIR)/$${i} && \
-	  mv htmldoc $(DOCDIR)/$${i} || exit 1; \
-	done && \
-	rm -f *.tag
-
-# Remove doxygen by-products (tag files, HTML staging directory) from docsrc.
-# "cd ... &&" guarantees rm never runs in the caller's directory if cd fails.
-docsrc_clean:
-	cd $(SRCROOT)/docsrc && rm -f *.tag
-	cd $(SRCROOT)/docsrc && rm -rf htmldoc
-
-doc: docsrc_code_doc
--- a/fortran/sharp.f90
+++ b/fortran/sharp.f90
@ -1,286 +0,0 @@
-! Fortran wrapper around the libsharp C library, built on iso_c_binding.
-!
-! The module exposes
-!   * thin bind(c) interfaces to the C entry points (names prefixed c_sharp_
-!     where a Fortran-friendly wrapper of the plain name exists), and
-!   * wrapper procedures working on the derived types sharp_alm_info and
-!     sharp_geom_info, which pair the opaque C handle with its local size.
-!
-! NOTE(review): several bind(c) interfaces declare OPTIONAL dummy arguments;
-! this needs compiler support beyond plain Fortran 2003 -- confirm toolchain.
-module sharp
-  use iso_c_binding
-  implicit none
-  ! alm_info flags
-  integer, parameter :: SHARP_PACKED = 1
-
-  ! sharp job types
-  enum, bind(c)
-      enumerator :: SHARP_YtW = 0
-      enumerator :: SHARP_Y = 1
-      enumerator :: SHARP_Yt = 2
-      enumerator :: SHARP_WY = 3
-      enumerator :: SHARP_ALM2MAP_DERIV1 = 4
-  end enum
-
-  ! sharp job flags (single-bit masks, combined with ior)
-  ! NOTE(review): presumably these mirror the flag values of the C header -- confirm.
-  integer, parameter :: SHARP_DP             = ISHFT(1, 4)
-  integer, parameter :: SHARP_ADD            = ISHFT(1, 5)
-  integer, parameter :: SHARP_REAL_HARMONICS = ISHFT(1, 6)
-  integer, parameter :: SHARP_NO_FFT         = ISHFT(1, 7)
-
-  ! Opaque C geometry descriptor plus the value of sharp_map_size() for it.
-  type sharp_geom_info
-     type(c_ptr) :: handle
-     integer(c_intptr_t) :: n_local
-  end type sharp_geom_info
-
-  ! Opaque C a_lm descriptor plus the value of sharp_alm_count() for it.
-  type sharp_alm_info
-     type(c_ptr) :: handle
-     integer(c_intptr_t) :: n_local
-  end type sharp_alm_info
-
-  interface
-
-     ! alm_info
-     subroutine sharp_make_general_alm_info( &
-         lmax, nm, stride, mval, mvstart, flags, alm_info) bind(c)
-       use iso_c_binding
-       integer(c_int), value, intent(in)    :: lmax, nm, stride, flags
-       integer(c_int), intent(in)           :: mval(nm)
-       integer(c_intptr_t), intent(in)     :: mvstart(nm)
-       type(c_ptr), intent(out)             :: alm_info
-     end subroutine sharp_make_general_alm_info
-
-     subroutine c_sharp_make_mmajor_real_packed_alm_info( &
-         lmax, stride, nm, ms, alm_info) bind(c, name='sharp_make_mmajor_real_packed_alm_info')
-       use iso_c_binding
-       integer(c_int), value, intent(in)    :: lmax, nm, stride
-       integer(c_int), intent(in), optional :: ms(nm)
-       type(c_ptr), intent(out)             :: alm_info
-     end subroutine c_sharp_make_mmajor_real_packed_alm_info
-
-     function c_sharp_alm_count(alm_info) bind(c, name='sharp_alm_count')
-       use iso_c_binding
-       integer(c_intptr_t)           :: c_sharp_alm_count
-       type(c_ptr), value, intent(in) :: alm_info
-     end function c_sharp_alm_count
-
-     subroutine c_sharp_destroy_alm_info(alm_info) bind(c, name='sharp_destroy_alm_info')
-       use iso_c_binding
-       type(c_ptr), value                   :: alm_info
-     end subroutine c_sharp_destroy_alm_info
-
-     ! geom_info
-     subroutine sharp_make_subset_healpix_geom_info ( &
-          nside, stride, nrings, rings, weight, geom_info) bind(c)
-       use iso_c_binding
-       integer(c_int), value, intent(in)    :: nside, stride, nrings
-       integer(c_int), intent(in), optional :: rings(nrings)
-       real(c_double), intent(in), optional :: weight(2 * nside)
-       type(c_ptr), intent(out)             :: geom_info
-     end subroutine sharp_make_subset_healpix_geom_info
-
-     subroutine c_sharp_destroy_geom_info(geom_info) bind(c, name='sharp_destroy_geom_info')
-       use iso_c_binding
-       type(c_ptr), value                   :: geom_info
-     end subroutine c_sharp_destroy_geom_info
-
-     function c_sharp_map_size(info) bind(c, name='sharp_map_size')
-       use iso_c_binding
-       integer(c_intptr_t) :: c_sharp_map_size
-       type(c_ptr), value   :: info
-     end function c_sharp_map_size
-
-
-     ! execute
-     subroutine c_sharp_execute(type, spin, alm, map, geom_info, alm_info, ntrans, &
-                                flags, time, opcnt) bind(c, name='sharp_execute')
-       use iso_c_binding
-       integer(c_int), value                        :: type, spin, ntrans, flags
-       type(c_ptr), value                           :: alm_info, geom_info
-       real(c_double), intent(out), optional        :: time
-       integer(c_long_long), intent(out), optional  :: opcnt
-       type(c_ptr), intent(in)                      :: alm(*), map(*)
-     end subroutine c_sharp_execute
-
-     subroutine c_sharp_execute_mpi(comm, type, spin, alm, map, geom_info, alm_info, ntrans, &
-                                    flags, time, opcnt) bind(c, name='sharp_execute_mpi_fortran')
-       use iso_c_binding
-       integer(c_int), value                        :: comm, type, spin, ntrans, flags
-       type(c_ptr), value                           :: alm_info, geom_info
-       real(c_double), intent(out), optional        :: time
-       integer(c_long_long), intent(out), optional  :: opcnt
-       type(c_ptr), intent(in)                      :: alm(*), map(*)
-     end subroutine c_sharp_execute_mpi
-
-     ! Legendre transforms
-     subroutine c_sharp_legendre_transform(bl, recfac, lmax, x, out, nx) &
-          bind(c, name='sharp_legendre_transform')
-       use iso_c_binding
-       integer(c_intptr_t), value :: lmax, nx
-       real(c_double) :: bl(lmax + 1), x(nx), out(nx)
-       real(c_double), optional :: recfac(lmax + 1)
-     end subroutine c_sharp_legendre_transform
-
-     subroutine c_sharp_legendre_transform_s(bl, recfac, lmax, x, out, nx) &
-          bind(c, name='sharp_legendre_transform_s')
-       use iso_c_binding
-       integer(c_intptr_t), value :: lmax, nx
-       real(c_float) :: bl(lmax + 1), x(nx), out(nx)
-       real(c_float), optional :: recfac(lmax + 1)
-     end subroutine c_sharp_legendre_transform_s
-  end interface
-
-  interface sharp_execute
-     module procedure sharp_execute_d
-  end interface
-
-  interface sharp_legendre_transform
-     module procedure sharp_legendre_transform_d, sharp_legendre_transform_s
-  end interface sharp_legendre_transform
-
-contains
-  ! alm info
-
-  ! Create an m-major, real, packed alm_info with stride 1.
-  !   lmax     : maximum l
-  !   ms       : (optional) list of m values; if it is not passed, the C
-  !              routine is called with nm = lmax + 1 and no list, i.e. we
-  !              default to using m=0..lmax
-  !   alm_info : receives the C handle and the local coefficient count
-  subroutine sharp_make_mmajor_real_packed_alm_info(lmax, ms, alm_info)
-    use iso_c_binding
-    integer(c_int), value, intent(in)    :: lmax
-    integer(c_int), intent(in), optional :: ms(:)
-    type(sharp_alm_info), intent(out)    :: alm_info
-    !--
-    integer(c_int), allocatable          :: ms_copy(:)
-    integer(c_int)                       :: nm
-
-    if (present(ms)) then
-       ! ms is assumed-shape and may be a strided section; hand the C side a
-       ! contiguous copy.
-       nm = size(ms)
-       allocate(ms_copy(nm))
-       ms_copy = ms
-       call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, nm, ms_copy, alm_info=alm_info%handle)
-       deallocate(ms_copy)
-    else
-       call c_sharp_make_mmajor_real_packed_alm_info(lmax, 1, lmax + 1, alm_info=alm_info%handle)
-    end if
-    alm_info%n_local = c_sharp_alm_count(alm_info%handle)
-  end subroutine sharp_make_mmajor_real_packed_alm_info
-
-  ! Release the C object behind alm_info and reset the handle to null.
-  ! A handle that is already null is left alone, so calling this twice on the
-  ! same object does not pass a null pointer to the C destructor.
-  subroutine sharp_destroy_alm_info(alm_info)
-    use iso_c_binding
-    type(sharp_alm_info), intent(inout) :: alm_info
-    if (c_associated(alm_info%handle)) then
-       call c_sharp_destroy_alm_info(alm_info%handle)
-       alm_info%handle = c_null_ptr
-    end if
-  end subroutine sharp_destroy_alm_info
-
-
-  ! geom info
-
-  ! Create a HEALPix geom_info with stride 1.
-  !   nside     : HEALPix resolution parameter
-  !   rings     : (optional) subset of rings; if absent, nrings = 4*nside - 1
-  !               is passed without a ring list
-  !   weight    : (optional) 2*nside weights, forwarded unchanged
-  !   geom_info : receives the C handle and the local map size
-  subroutine sharp_make_healpix_geom_info(nside, rings, weight, geom_info)
-    integer(c_int), value                :: nside
-    integer(c_int), optional             :: rings(:)
-    real(c_double), intent(in), optional :: weight(2 * nside)
-    type(sharp_geom_info), intent(out)   :: geom_info
-    !--
-    integer(c_int) :: nrings
-    integer(c_int), allocatable :: rings_copy(:)
-
-    if (present(rings)) then
-       ! rings is assumed-shape and may be a strided section; hand the C side
-       ! a contiguous copy.
-       nrings = size(rings)
-       allocate(rings_copy(nrings))
-       rings_copy = rings
-       call sharp_make_subset_healpix_geom_info(nside, 1, nrings, rings_copy, &
-                                                weight, geom_info%handle)
-       deallocate(rings_copy)
-    else
-       call sharp_make_subset_healpix_geom_info(nside, 1, nrings=4 * nside - 1, &
-                                                weight=weight, geom_info=geom_info%handle)
-    end if
-    geom_info%n_local = c_sharp_map_size(geom_info%handle)
-  end subroutine sharp_make_healpix_geom_info
-
-  ! Release the C object behind geom_info and reset the handle to null.
-  ! A handle that is already null is left alone (see sharp_destroy_alm_info).
-  subroutine sharp_destroy_geom_info(geom_info)
-    use iso_c_binding
-    type(sharp_geom_info), intent(inout) :: geom_info
-    if (c_associated(geom_info%handle)) then
-       call c_sharp_destroy_geom_info(geom_info%handle)
-       geom_info%handle = c_null_ptr
-    end if
-  end subroutine sharp_destroy_geom_info
-
-
-  ! Run a double-precision transform (SHARP_DP is always set).
-  !
-  ! Currently the only mode supported is stacked (not interleaved) maps.
-  !
-  ! Note that passing the exact dimension of alm/map is necessary; it
-  ! prevents the caller from doing overly creative slicing before passing
-  ! the arrays in.
-  !
-  ! Usage:
-  !
-  ! The alm array must have shape exactly alm(alm_info%n_local, nmaps).
-  ! The map array must have shape exactly map(geom_info%n_local, nmaps).
-  !
-  !   type  : job type (one of the SHARP_* enumerators above)
-  !   spin  : spin of the transform; for spin /= 0 the maps are taken in
-  !           pairs, so ntrans = nmaps / 2 (integer division)
-  !   add   : (optional) if .true., SHARP_ADD is set so results are added to
-  !           the output instead of replacing it
-  !   time, opcnt : (optional) forwarded unchanged to the C routine
-  !   comm  : (optional) communicator; if present the MPI entry point is used
-  subroutine sharp_execute_d(type, spin, nmaps, alm, alm_info, map, geom_info, &
-                             add, time, opcnt, comm)
-    use iso_c_binding
-    use mpi
-    implicit none
-    integer(c_int), value                        :: type, spin, nmaps
-    integer(c_int), optional                     :: comm
-    logical, value, optional                     :: add  ! should add instead of replace out
-
-    type(sharp_alm_info)                         :: alm_info
-    type(sharp_geom_info)                        :: geom_info
-    real(c_double), intent(out), optional        :: time
-    integer(c_long_long), intent(out), optional  :: opcnt
-    real(c_double), target, intent(inout)        :: alm(0:alm_info%n_local - 1, 1:nmaps)
-    real(c_double), target, intent(inout)        :: map(0:geom_info%n_local - 1, 1:nmaps)
-    !--
-    integer(c_int)         :: mod_flags, ntrans, k
-    type(c_ptr), target    :: alm_ptr(nmaps)
-    type(c_ptr), target    :: map_ptr(nmaps)
-
-    mod_flags = SHARP_DP
-    ! An absent optional argument must not be referenced, and Fortran does not
-    ! guarantee short-circuit evaluation of .and. -- test presence separately.
-    if (present(add)) then
-       ! ior is the standard intrinsic (or() is a compiler extension).
-       if (add) mod_flags = ior(mod_flags, SHARP_ADD)
-    end if
-
-    if (spin == 0) then
-       ntrans = nmaps
-    else
-       ntrans = nmaps / 2
-    end if
-
-    ! Set up pointer table to access maps; entries stay null when the local
-    ! part is empty, so c_loc is never applied to a zero-sized array.
-    alm_ptr(:) = c_null_ptr
-    map_ptr(:) = c_null_ptr
-    do k = 1, nmaps
-       if (alm_info%n_local > 0) alm_ptr(k) = c_loc(alm(0, k))
-       if (geom_info%n_local > 0) map_ptr(k) = c_loc(map(0, k))
-    end do
-
-    if (present(comm)) then
-      call c_sharp_execute_mpi(comm, type, spin, alm_ptr, map_ptr, &
-          geom_info=geom_info%handle, &
-          alm_info=alm_info%handle, &
-          ntrans=ntrans, &
-          flags=mod_flags, &
-          time=time, &
-          opcnt=opcnt)
-    else
-      call c_sharp_execute(type, spin, alm_ptr, map_ptr, &
-          geom_info=geom_info%handle, &
-          alm_info=alm_info%handle, &
-          ntrans=ntrans, &
-          flags=mod_flags, &
-          time=time, &
-          opcnt=opcnt)
-    end if
-  end subroutine sharp_execute_d
-
-  ! Double-precision Legendre transform wrapper.
-  ! lmax is taken as size(bl) - 1 and nx as size(x); recfac is not passed.
-  subroutine sharp_legendre_transform_d(bl, x, out)
-    use iso_c_binding
-    real(c_double) :: bl(:)
-    real(c_double) :: x(:), out(size(x))
-    call c_sharp_legendre_transform(bl, lmax=int(size(bl) - 1, c_intptr_t), &
-                                    x=x, out=out, nx=int(size(x), c_intptr_t))
-  end subroutine sharp_legendre_transform_d
-
-  ! Single-precision Legendre transform wrapper.
-  ! lmax is taken as size(bl) - 1 and nx as size(x); recfac is not passed.
-  subroutine sharp_legendre_transform_s(bl, x, out)
-    use iso_c_binding
-    real(c_float) :: bl(:)
-    real(c_float) :: x(:), out(size(x))
-    call c_sharp_legendre_transform_s(bl, lmax=int(size(bl) - 1, c_intptr_t), &
-                                      x=x, out=out, nx=int(size(x), c_intptr_t))
-  end subroutine sharp_legendre_transform_s
-
-
-end module
--- a/fortran/test_sharp.f90
+++ b/fortran/test_sharp.f90
@ -1,84 +0,0 @@
-program test_sharp
-  use mpi
-  use sharp
-  use iso_c_binding, only : c_ptr, c_double
-  implicit none
-
-  integer, parameter :: lmax = 2, nside = 2
-  type(sharp_alm_info) :: alm_info
-  type(sharp_geom_info) :: geom_info
-
-  real(c_double), dimension(0:(lmax + 1)**2 - 1, 1:1) :: alm
-  real(c_double), dimension(0:12*nside**2 - 1, 1:1) :: map
-
-  integer(c_int), dimension(1:lmax + 1) :: ms
-  integer(c_int), dimension(1:4 * nside - 1) :: rings
-  integer(c_int) :: nm, m, nrings, iring
-  integer :: nodecount, rank, ierr
-
-  call MPI_Init(ierr)
-  call MPI_Comm_size(MPI_COMM_WORLD, nodecount, ierr)
-  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
-
-  nm = 0
-  do m = rank, lmax, nodecount
-     nm = nm + 1
-     ms(nm) = m
-  end do
-
-  nrings = 0
-  do iring = rank + 1, 4 * nside - 1, nodecount
-     nrings = nrings + 1
-     rings(nrings) = iring
-  end do
-
-  alm = 0
-  map = 0
-  if (rank == 0) then
-    alm(0, 1) = 1
-  end if
-
-  print *, ms(1:nm)
-  call sharp_make_mmajor_real_packed_alm_info(lmax, ms=ms(1:nm), alm_info=alm_info)
-  print *, 'alm_info%n_local', alm_info%n_local
-  call sharp_make_healpix_geom_info(nside, rings=rings(1:nrings), geom_info=geom_info)
-  print *, 'geom_info%n_local', geom_info%n_local
-  print *, 'execute'
-  call sharp_execute(SHARP_Y, 0, 1, alm, alm_info, map, geom_info, comm=MPI_COMM_WORLD)
-
-  print *, alm
-  print *, map
-
-  call sharp_destroy_alm_info(alm_info)
-  call sharp_destroy_geom_info(geom_info)
-  print *, 'DONE'
-  call MPI_Finalize(ierr)
-
-  print *, 'LEGENDRE TRANSFORMS'
-
-  call test_legendre_transforms()
-
-contains
-  subroutine test_legendre_transforms()
-    integer, parameter :: lmax = 20, nx=10
-    real(c_double) :: bl(0:lmax)
-    real(c_double) :: x(nx), out(nx)
-    real(c_float) :: out_s(nx)
-    !--
-    integer :: l, i
-
-    do l = 0, lmax
-       bl(l) = 1.0 / real(l + 1, c_double)
-    end do
-    do i = 1, nx
-       x(i) = 1 / real(i, c_double)
-    end do
-    out = 0
-    call sharp_legendre_transform(bl, x, out)
-    print *, out
-    call sharp_legendre_transform(real(bl, c_float), real(x, c_float), out_s)
-    print *, out_s
-  end subroutine test_legendre_transforms
-
-
-end program test_sharp
--- a/libfftpack/README
+++ b/libfftpack/README
@ -1,34 +0,0 @@
-ls_fft description:
-
-This package is intended to calculate one-dimensional real or complex FFTs
-with high accuracy and good efficiency even for lengths containing large
-prime factors.
-The code is written in C, but a Fortran wrapper exists as well.
-
-Before any FFT is executed, a plan must be generated for it. Plan creation
-is designed to be fast, so that there is no significant overhead if the
-plan is only used once or a few times.
-
-The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the
-double precision incarnation by Hugh C. Pumphrey
-(http://www.netlib.org/fftpack/dp.tgz).
-
-I replaced the iterative sine and cosine calculations in radfg() and radbg()
-by an exact calculation, which slightly improves the transform accuracy for
-real FFTs with lengths containing large prime factors.
-
-Since FFTPACK becomes quite slow for FFT lengths with large prime factors
-(in the worst case of prime lengths it reaches O(n*n) complexity), I
-implemented Bluestein's algorithm, which computes a FFT of length n by
-several FFTs of length n2>=2*n-1 and a convolution. Since n2 can be chosen
-to be highly composite, this algorithm is more efficient if n has large
-prime factors. The longer FFTs themselves are then computed using the FFTPACK
-routines.
-Bluestein's algorithm was implemented according to the description at
-http://en.wikipedia.org/wiki/Bluestein's_FFT_algorithm.
-
-Thread-safety:
-All routines can be called concurrently; all information needed by ls_fft
-is stored in the plan variable. However, using the same plan variable on
-multiple threads simultaneously is not supported and will lead to data
-corruption.
--- a/libfftpack/bluestein.c
+++ b/libfftpack/bluestein.c
@ -1,173 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Copyright (C) 2005, 2006, 2007, 2008 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include "fftpack.h"
-#include "bluestein.h"
-
-/* returns the sum of all prime factors of n */
-size_t prime_factor_sum (size_t n)
-  {
-  size_t result=0,x,limit,tmp;
-  while (((tmp=(n>>1))<<1)==n)
-    { result+=2; n=tmp; }
-
-  limit=(size_t)sqrt(n+0.01);
-  for (x=3; x<=limit; x+=2)
-  while ((tmp=(n/x))*x==n)
-    {
-    result+=x;
-    n=tmp;
-    limit=(size_t)sqrt(n+0.01);
-    }
-  if (n>1) result+=n;
-
-  return result;
-  }
-
-/* returns the smallest composite of 2, 3 and 5 which is >= n */
-static size_t good_size(size_t n)
-  {
-  size_t f2, f23, f235, bestfac=2*n;
-  if (n<=6) return n;
-
-  for (f2=1; f2<bestfac; f2*=2)
-    for (f23=f2; f23<bestfac; f23*=3)
-      for (f235=f23; f235<bestfac; f235*=5)
-        if (f235>=n) bestfac=f235;
-  return bestfac;
-  }
-
-void bluestein_i (size_t n, double **tstorage, size_t *worksize)
-  {
-  static const double pi=3.14159265358979323846;
-  size_t n2=good_size(n*2-1);
-  size_t m, coeff;
-  double angle, xn2;
-  double *bk, *bkf, *work;
-  double pibyn=pi/n;
-  *worksize=2+2*n+8*n2+16;
-  *tstorage = RALLOC(double,2+2*n+8*n2+16);
-  ((size_t *)(*tstorage))[0]=n2;
-  bk  = *tstorage+2;
-  bkf = *tstorage+2+2*n;
-  work= *tstorage+2+2*(n+n2);
-
-/* initialize b_k */
-  bk[0] = 1;
-  bk[1] = 0;
-
-  coeff=0;
-  for (m=1; m<n; ++m)
-    {
-    coeff+=2*m-1;
-    if (coeff>=2*n) coeff-=2*n;
-    angle = pibyn*coeff;
-    bk[2*m] = cos(angle);
-    bk[2*m+1] = sin(angle);
-    }
-
-/* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
-  xn2 = 1./n2;
-  bkf[0] = bk[0]*xn2;
-  bkf[1] = bk[1]*xn2;
-  for (m=2; m<2*n; m+=2)
-    {
-    bkf[m]   = bkf[2*n2-m]   = bk[m]   *xn2;
-    bkf[m+1] = bkf[2*n2-m+1] = bk[m+1] *xn2;
-    }
-  for (m=2*n;m<=(2*n2-2*n+1);++m)
-    bkf[m]=0.;
-  cffti (n2,work);
-  cfftf (n2,bkf,work);
-  }
-
-void bluestein (size_t n, double *data, double *tstorage, int isign)
-  {
-  size_t n2=*((size_t *)tstorage);
-  size_t m;
-  double *bk, *bkf, *akf, *work;
-  bk  = tstorage+2;
-  bkf = tstorage+2+2*n;
-  work= tstorage+2+2*(n+n2);
-  akf = tstorage+2+2*n+6*n2+16;
-
-/* initialize a_k and FFT it */
-  if (isign>0)
-    for (m=0; m<2*n; m+=2)
-      {
-      akf[m]   = data[m]*bk[m]   - data[m+1]*bk[m+1];
-      akf[m+1] = data[m]*bk[m+1] + data[m+1]*bk[m];
-      }
-  else
-    for (m=0; m<2*n; m+=2)
-      {
-      akf[m]   = data[m]*bk[m]   + data[m+1]*bk[m+1];
-      akf[m+1] =-data[m]*bk[m+1] + data[m+1]*bk[m];
-      }
-  for (m=2*n; m<2*n2; ++m)
-    akf[m]=0;
-
-  cfftf (n2,akf,work);
-
-/* do the convolution */
-  if (isign>0)
-    for (m=0; m<2*n2; m+=2)
-      {
-      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
-      akf[m+1]  = im;
-      }
-  else
-    for (m=0; m<2*n2; m+=2)
-      {
-      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
-      akf[m+1]  = im;
-      }
-
-
-/* inverse FFT */
-  cfftb (n2,akf,work);
-
-/* multiply by b_k* */
-  if (isign>0)
-    for (m=0; m<2*n; m+=2)
-      {
-      data[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
-      data[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
-      }
-  else
-    for (m=0; m<2*n; m+=2)
-      {
-      data[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
-      data[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
-      }
-  }
--- a/libfftpack/bluestein.h
+++ b/libfftpack/bluestein.h
@ -1,48 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Copyright (C) 2005 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_BLUESTEIN_H
-#define PLANCK_BLUESTEIN_H
-
-#include "c_utils.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-size_t prime_factor_sum (size_t n);
-
-void bluestein_i (size_t n, double **tstorage, size_t *worksize);
-void bluestein (size_t n, double *data, double *tstorage, int isign);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libfftpack/fftpack.c
+++ b/libfftpack/fftpack.c
@ -1,833 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
-  fftpack.c : A set of FFT routines in C.
-  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
-  (Version 4, 1985).
-
-  C port by Martin Reinecke (2010)
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include "fftpack.h"
-
-#define WA(x,i) wa[(i)+(x)*ido]
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define PM(a,b,c,d) { a=c+d; b=c-d; }
-#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
-#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
-#define SCALEC(a,b) { a.r*=b; a.i*=b; }
-#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
-/* (a+ib) = conj(c+id) * (e+if) */
-#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
-
-typedef struct {
-  double r,i;
-} cmplx;
-
-#define CONCAT(a,b) a ## b
-
-#define X(arg) CONCAT(passb,arg)
-#define BACKWARD
-#include "fftpack_inc.c"
-#undef BACKWARD
-#undef X
-
-#define X(arg) CONCAT(passf,arg)
-#include "fftpack_inc.c"
-#undef X
-
-#undef CC
-#undef CH
-#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
-
-static void radf2 (size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=2;
-  size_t i, k, ic;
-  double ti2, tr2;
-
-  for (k=0; k<l1; k++)
-    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      CH(    0,1,k) = -CC(ido-1,k,1);
-      CH(ido-1,0,k) =  CC(ido-1,k,0);
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
-      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
-      }
-  }
-
-static void radf3(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui=0.86602540378443864676;
-  size_t i, k, ic;
-  double ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
-
-  for (k=0; k<l1; k++)
-    {
-    cr2=CC(0,k,1)+CC(0,k,2);
-    CH(0,0,k) = CC(0,k,0)+cr2;
-    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
-    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
-    }
-  if (ido==1) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      cr2=dr2+dr3;
-      ci2=di2+di3;
-      CH(i-1,0,k) = CC(i-1,k,0)+cr2;
-      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
-      tr2 = CC(i-1,k,0)+taur*cr2;
-      ti2 = CC(i  ,k,0)+taur*ci2;
-      tr3 = taui*(di2-di3);
-      ti3 = taui*(dr3-dr2);
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2)
-      }
-  }
-
-static void radf4(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=4;
-  static const double hsqt2=0.70710678118654752440;
-  size_t i, k, ic;
-  double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-
-  for (k=0; k<l1; k++)
-    {
-    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
-    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
-    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
-    }
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
-      tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
-      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
-      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
-      PM(tr1,tr4,cr4,cr2)
-      PM(ti1,ti4,ci2,ci4)
-      PM(tr2,tr3,CC(i-1,k,0),cr3)
-      PM(ti2,ti3,CC(i  ,k,0),ci3)
-      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
-      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
-      }
-  }
-
-static void radf5(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
-                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
-  size_t i, k, ic;
-  double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
-         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-
-  for (k=0; k<l1; k++)
-    {
-    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
-    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
-    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
-    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
-    CH(0,2,k)=ti11*ci5+ti12*ci4;
-    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
-    CH(0,4,k)=ti12*ci5-ti11*ci4;
-    }
-  if (ido==1) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
-      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
-      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
-      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
-      PM(cr2,ci5,dr5,dr2)
-      PM(ci2,cr5,di2,di5)
-      PM(cr3,ci4,dr4,dr3)
-      PM(ci3,cr4,di3,di4)
-      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
-      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
-      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
-      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
-      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
-      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
-      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
-      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
-      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
-      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
-      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
-      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
-      }
-  }
-
-#undef CH
-#undef CC
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
-#define C2(a,b) cc[(a)+idl1*(b)]
-#define CH2(a,b) ch[(a)+idl1*(b)]
-static void radfg(size_t ido, size_t ip, size_t l1, size_t idl1,
-  double *cc, double *ch, const double *wa)
-  {
-  const size_t cdim=ip;
-  static const double twopi=6.28318530717958647692;
-  size_t idij, ipph, i, j, k, l, j2, ic, jc, lc, ik;
-  double ai1, ai2, ar1, ar2, arg;
-  double *csarr;
-  size_t aidx;
-
-  ipph=(ip+1)/ 2;
-  if(ido!=1)
-    {
-    memcpy(ch,cc,idl1*sizeof(double));
-
-    for(j=1; j<ip; j++)
-      for(k=0; k<l1; k++)
-        {
-        CH(0,k,j)=C1(0,k,j);
-        idij=(j-1)*ido+1;
-        for(i=2; i<ido; i+=2,idij+=2)
-          MULPM(CH(i-1,k,j),CH(i,k,j),wa[idij-1],wa[idij],C1(i-1,k,j),C1(i,k,j))
-        }
-
-    for(j=1,jc=ip-1; j<ipph; j++,jc--)
-      for(k=0; k<l1; k++)
-        for(i=2; i<ido; i+=2)
-          {
-          PM(C1(i-1,k,j),C1(i  ,k,jc),CH(i-1,k,jc),CH(i-1,k,j ))
-          PM(C1(i  ,k,j),C1(i-1,k,jc),CH(i  ,k,j ),CH(i  ,k,jc))
-          }
-    }
-  else
-    memcpy(cc,ch,idl1*sizeof(double));
-
-  for(j=1,jc=ip-1; j<ipph; j++,jc--)
-    for(k=0; k<l1; k++)
-      PM(C1(0,k,j),C1(0,k,jc),CH(0,k,jc),CH(0,k,j))
-
-  csarr=RALLOC(double,2*ip);
-  arg=twopi / ip;
-  csarr[0]=1.;
-  csarr[1]=0.;
-  csarr[2]=csarr[2*ip-2]=cos(arg);
-  csarr[3]=sin(arg); csarr[2*ip-1]=-csarr[3];
-  for (i=2; i<=ip/2; ++i)
-    {
-    csarr[2*i]=csarr[2*ip-2*i]=cos(i*arg);
-    csarr[2*i+1]=sin(i*arg);
-    csarr[2*ip-2*i+1]=-csarr[2*i+1];
-    }
-  for(l=1,lc=ip-1; l<ipph; l++,lc--)
-    {
-    ar1=csarr[2*l];
-    ai1=csarr[2*l+1];
-    for(ik=0; ik<idl1; ik++)
-      {
-      CH2(ik,l)=C2(ik,0)+ar1*C2(ik,1);
-      CH2(ik,lc)=ai1*C2(ik,ip-1);
-      }
-    aidx=2*l;
-    for(j=2,jc=ip-2; j<ipph; j++,jc--)
-      {
-      aidx+=2*l;
-      if (aidx>=2*ip) aidx-=2*ip;
-      ar2=csarr[aidx];
-      ai2=csarr[aidx+1];
-      for(ik=0; ik<idl1; ik++)
-        {
-        CH2(ik,l )+=ar2*C2(ik,j );
-        CH2(ik,lc)+=ai2*C2(ik,jc);
-        }
-      }
-    }
-  DEALLOC(csarr);
-
-  for(j=1; j<ipph; j++)
-    for(ik=0; ik<idl1; ik++)
-      CH2(ik,0)+=C2(ik,j);
-
-  for(k=0; k<l1; k++)
-    memcpy(&CC(0,0,k),&CH(0,k,0),ido*sizeof(double));
-  for(j=1; j<ipph; j++)
-    {
-    jc=ip-j;
-    j2=2*j;
-    for(k=0; k<l1; k++)
-      {
-      CC(ido-1,j2-1,k) = CH(0,k,j );
-      CC(0    ,j2  ,k) = CH(0,k,jc);
-      }
-    }
-  if(ido==1) return;
-
-  for(j=1; j<ipph; j++)
-    {
-    jc=ip-j;
-    j2=2*j;
-    for(k=0; k<l1; k++)
-      for(i=2; i<ido; i+=2)
-        {
-        ic=ido-i;
-        PM (CC(i-1,j2,k),CC(ic-1,j2-1,k),CH(i-1,k,j ),CH(i-1,k,jc))
-        PM (CC(i  ,j2,k),CC(ic  ,j2-1,k),CH(i  ,k,jc),CH(i  ,k,j ))
-        }
-    }
-  }
-
-#undef CC
-#undef CH
-#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
-#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
-
-static void radb2(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=2;
-  size_t i, k, ic;
-  double ti2, tr2;
-
-  for (k=0; k<l1; k++)
-    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      CH(ido-1,k,0) =  2*CC(ido-1,0,k);
-      CH(ido-1,k,1) = -2*CC(0    ,1,k);
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
-      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
-      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
-      }
-  }
-
-static void radb3(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui=0.86602540378443864676;
-  size_t i, k, ic;
-  double ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-
-  for (k=0; k<l1; k++)
-    {
-    tr2=2*CC(ido-1,1,k);
-    cr2=CC(0,0,k)+taur*tr2;
-    CH(0,k,0)=CC(0,0,k)+tr2;
-    ci3=2*taui*CC(0,2,k);
-    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
-    }
-  if (ido==1) return;
-  for (k=0; k<l1; k++)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      tr2=CC(i-1,2,k)+CC(ic-1,1,k);
-      ti2=CC(i  ,2,k)-CC(ic  ,1,k);
-      cr2=CC(i-1,0,k)+taur*tr2;
-      ci2=CC(i  ,0,k)+taur*ti2;
-      CH(i-1,k,0)=CC(i-1,0,k)+tr2;
-      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
-      cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));
-      ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
-      PM(dr3,dr2,cr2,ci3)
-      PM(di2,di3,ci2,cr3)
-      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
-      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
-      }
-  }
-
-static void radb4(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=4;
-  static const double sqrt2=1.41421356237309504880;
-  size_t i, k, ic;
-  double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-
-  for (k=0; k<l1; k++)
-    {
-    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
-    tr3=2*CC(ido-1,1,k);
-    tr4=2*CC(0,2,k);
-    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
-    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
-    }
-  if ((ido&1)==0)
-    for (k=0; k<l1; k++)
-      {
-      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
-      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
-      CH(ido-1,k,0)=tr2+tr2;
-      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
-      CH(ido-1,k,2)=ti2+ti2;
-      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
-      }
-  if (ido<=2) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
-      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
-      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
-      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
-      PM (CH(i-1,k,0),cr3,tr2,tr3)
-      PM (CH(i  ,k,0),ci3,ti2,ti3)
-      PM (cr4,cr2,tr1,tr4)
-      PM (ci2,ci4,ti1,ti4)
-      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
-      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
-      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
-      }
-  }
-
-static void radb5(size_t ido, size_t l1, const double *cc, double *ch,
-  const double *wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
-                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
-  size_t i, k, ic;
-  double ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4,
-         ti2, ti3, ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-
-  for (k=0; k<l1; k++)
-    {
-    ti5=2*CC(0,2,k);
-    ti4=2*CC(0,4,k);
-    tr2=2*CC(ido-1,1,k);
-    tr3=2*CC(ido-1,3,k);
-    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
-    cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
-    cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
-    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
-    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
-    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
-    }
-  if (ido==1) return;
-  for (k=0; k<l1;++k)
-    for (i=2; i<ido; i+=2)
-      {
-      ic=ido-i;
-      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
-      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
-      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
-      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
-      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
-      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
-      cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
-      ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
-      cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
-      ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
-      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
-      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
-      PM(dr4,dr3,cr3,ci4)
-      PM(di3,di4,ci3,cr4)
-      PM(dr5,dr2,cr2,ci5)
-      PM(di2,di5,ci2,cr5)
-      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
-      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
-      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
-      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
-      }
-  }
-
-static void radbg(size_t ido, size_t ip, size_t l1, size_t idl1,
-  double *cc, double *ch, const double *wa)
-  {
-  const size_t cdim=ip;
-  static const double twopi=6.28318530717958647692;
-  size_t idij, ipph, i, j, k, l, j2, ic, jc, lc, ik;
-  double ai1, ai2, ar1, ar2, arg;
-  double *csarr;
-  size_t aidx;
-
-  ipph=(ip+1)/ 2;
-  for(k=0; k<l1; k++)
-    memcpy(&CH(0,k,0),&CC(0,0,k),ido*sizeof(double));
-  for(j=1; j<ipph; j++)
-    {
-    jc=ip-j;
-    j2=2*j;
-    for(k=0; k<l1; k++)
-      {
-      CH(0,k,j )=2*CC(ido-1,j2-1,k);
-      CH(0,k,jc)=2*CC(0    ,j2  ,k);
-      }
-    }
-
-  if(ido!=1)
-    for(j=1,jc=ip-1; j<ipph; j++,jc--)
-      for(k=0; k<l1; k++)
-        for(i=2; i<ido; i+=2)
-          {
-          ic=ido-i;
-          PM (CH(i-1,k,j ),CH(i-1,k,jc),CC(i-1,2*j,k),CC(ic-1,2*j-1,k))
-          PM (CH(i  ,k,jc),CH(i  ,k,j ),CC(i  ,2*j,k),CC(ic  ,2*j-1,k))
-          }
-
-  csarr=RALLOC(double,2*ip);
-  arg=twopi/ip;
-  csarr[0]=1.;
-  csarr[1]=0.;
-  csarr[2]=csarr[2*ip-2]=cos(arg);
-  csarr[3]=sin(arg); csarr[2*ip-1]=-csarr[3];
-  for (i=2; i<=ip/2; ++i)
-    {
-    csarr[2*i]=csarr[2*ip-2*i]=cos(i*arg);
-    csarr[2*i+1]=sin(i*arg);
-    csarr[2*ip-2*i+1]=-csarr[2*i+1];
-    }
-  for(l=1; l<ipph; l++)
-    {
-    lc=ip-l;
-    ar1=csarr[2*l];
-    ai1=csarr[2*l+1];
-    for(ik=0; ik<idl1; ik++)
-      {
-      C2(ik,l)=CH2(ik,0)+ar1*CH2(ik,1);
-      C2(ik,lc)=ai1*CH2(ik,ip-1);
-      }
-    aidx=2*l;
-    for(j=2; j<ipph; j++)
-      {
-      jc=ip-j;
-      aidx+=2*l;
-      if (aidx>=2*ip) aidx-=2*ip;
-      ar2=csarr[aidx];
-      ai2=csarr[aidx+1];
-      for(ik=0; ik<idl1; ik++)
-        {
-        C2(ik,l )+=ar2*CH2(ik,j );
-        C2(ik,lc)+=ai2*CH2(ik,jc);
-        }
-      }
-    }
-  DEALLOC(csarr);
-
-  for(j=1; j<ipph; j++)
-    for(ik=0; ik<idl1; ik++)
-      CH2(ik,0)+=CH2(ik,j);
-
-  for(j=1,jc=ip-1; j<ipph; j++,jc--)
-    for(k=0; k<l1; k++)
-      PM (CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc))
-
-  if(ido==1)
-    return;
-  for(j=1,jc=ip-1; j<ipph; j++,jc--)
-    for(k=0; k<l1; k++)
-      for(i=2; i<ido; i+=2)
-        {
-        PM (CH(i-1,k,jc),CH(i-1,k,j ),C1(i-1,k,j),C1(i  ,k,jc))
-        PM (CH(i  ,k,j ),CH(i  ,k,jc),C1(i  ,k,j),C1(i-1,k,jc))
-        }
-  memcpy(cc,ch,idl1*sizeof(double));
-
-  for(j=1; j<ip; j++)
-    for(k=0; k<l1; k++)
-      {
-      C1(0,k,j)=CH(0,k,j);
-      idij=(j-1)*ido+1;
-      for(i=2; i<ido; i+=2,idij+=2)
-        MULPM (C1(i,k,j),C1(i-1,k,j),wa[idij-1],wa[idij],CH(i,k,j),CH(i-1,k,j))
-      }
-  }
-
-#undef CC
-#undef CH
-#undef PM
-#undef MULPM
-
-
-/*----------------------------------------------------------------------
-   cfftf1, cfftb1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
-  ----------------------------------------------------------------------*/
-
-static void cfft1(size_t n, cmplx c[], cmplx ch[], const cmplx wa[],
-  const size_t ifac[], int isign)
-  {
-  size_t k1, l1=1, nf=ifac[1], iw=0;
-  cmplx *p1=c, *p2=ch;
-
-  for(k1=0; k1<nf; k1++)
-    {
-    size_t ip=ifac[k1+2];
-    size_t l2=ip*l1;
-    size_t ido = n/l2;
-    if(ip==4)
-      (isign>0) ? passb4(ido, l1, p1, p2, wa+iw)
-                : passf4(ido, l1, p1, p2, wa+iw);
-    else if(ip==2)
-      (isign>0) ? passb2(ido, l1, p1, p2, wa+iw)
-                : passf2(ido, l1, p1, p2, wa+iw);
-    else if(ip==3)
-      (isign>0) ? passb3(ido, l1, p1, p2, wa+iw)
-                : passf3(ido, l1, p1, p2, wa+iw);
-    else if(ip==5)
-      (isign>0) ? passb5(ido, l1, p1, p2, wa+iw)
-                : passf5(ido, l1, p1, p2, wa+iw);
-    else if(ip==6)
-      (isign>0) ? passb6(ido, l1, p1, p2, wa+iw)
-                : passf6(ido, l1, p1, p2, wa+iw);
-    else
-      (isign>0) ? passbg(ido, ip, l1, p1, p2, wa+iw)
-                : passfg(ido, ip, l1, p1, p2, wa+iw);
-    SWAP(p1,p2,cmplx *);
-    l1=l2;
-    iw+=(ip-1)*ido;
-    }
-  if (p1!=c)
-    memcpy (c,p1,n*sizeof(cmplx));
-  }
-
-void cfftf(size_t n, double c[], double wsave[])
-  {
-  if (n!=1)
-    cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n),
-          (size_t*)(wsave+4*n),-1);
-  }
-
-void cfftb(size_t n, double c[], double wsave[])
-  {
-  if (n!=1)
-    cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n),
-          (size_t*)(wsave+4*n),+1);
-  }
-
-static void factorize (size_t n, const size_t *pf, size_t npf, size_t *ifac)
-  {
-  size_t nl=n, nf=0, ntry=0, j=0, i;
-
-startloop:
-  j++;
-  ntry = (j<=npf) ? pf[j-1] : ntry+2;
-  do
-    {
-    size_t nq=nl / ntry;
-    size_t nr=nl-ntry*nq;
-    if (nr!=0)
-      goto startloop;
-    nf++;
-    ifac[nf+1]=ntry;
-    nl=nq;
-    if ((ntry==2) && (nf!=1))
-      {
-      for (i=nf+1; i>2; --i)
-        ifac[i]=ifac[i-1];
-      ifac[2]=2;
-      }
-    }
-  while(nl!=1);
-  ifac[0]=n;
-  ifac[1]=nf;
-  }
-
-static void cffti1(size_t n, double wa[], size_t ifac[])
-  {
-  static const size_t ntryh[5]={4,6,3,2,5};
-  static const double twopi=6.28318530717958647692;
-  size_t j, k, fi;
-
-  double argh=twopi/n;
-  size_t i=0, l1=1;
-  factorize (n,ntryh,5,ifac);
-  for(k=1; k<=ifac[1]; k++)
-    {
-    size_t ip=ifac[k+1];
-    size_t ido=n/(l1*ip);
-    for(j=1; j<ip; j++)
-      {
-      size_t is = i;
-      double argld=j*l1*argh;
-      wa[i  ]=1;
-      wa[i+1]=0;
-      for(fi=1; fi<=ido; fi++)
-        {
-        double arg=fi*argld;
-        i+=2;
-        wa[i  ]=cos(arg);
-        wa[i+1]=sin(arg);
-        }
-      if(ip>6)
-        {
-        wa[is  ]=wa[i  ];
-        wa[is+1]=wa[i+1];
-        }
-      }
-    l1*=ip;
-    }
-  }
-
-void cffti(size_t n, double wsave[])
-  { if (n!=1) cffti1(n, wsave+2*n,(size_t*)(wsave+4*n)); }
-
-
-/*----------------------------------------------------------------------
-   rfftf1, rfftb1, rfftf, rfftb, rffti1, rffti. Real FFTs.
-  ----------------------------------------------------------------------*/
-
-static void rfftf1(size_t n, double c[], double ch[], const double wa[],
-  const size_t ifac[])
-  {
-  size_t k1, l1=n, nf=ifac[1], iw=n-1;
-  double *p1=ch, *p2=c;
-
-  for(k1=1; k1<=nf;++k1)
-    {
-    size_t ip=ifac[nf-k1+2];
-    size_t ido=n / l1;
-    l1 /= ip;
-    iw-=(ip-1)*ido;
-    SWAP (p1,p2,double *);
-    if(ip==4)
-      radf4(ido, l1, p1, p2, wa+iw);
-    else if(ip==2)
-      radf2(ido, l1, p1, p2, wa+iw);
-    else if(ip==3)
-      radf3(ido, l1, p1, p2, wa+iw);
-    else if(ip==5)
-      radf5(ido, l1, p1, p2, wa+iw);
-    else
-      {
-      if (ido==1)
-        SWAP (p1,p2,double *);
-      radfg(ido, ip, l1, ido*l1, p1, p2, wa+iw);
-      SWAP (p1,p2,double *);
-      }
-    }
-  if (p1==c)
-    memcpy (c,ch,n*sizeof(double));
-  }
-
-static void rfftb1(size_t n, double c[], double ch[], const double wa[],
-  const size_t ifac[])
-  {
-  size_t k1, l1=1, nf=ifac[1], iw=0;
-  double *p1=c, *p2=ch;
-
-  for(k1=1; k1<=nf; k1++)
-    {
-    size_t ip = ifac[k1+1],
-           ido= n/(ip*l1);
-    if(ip==4)
-      radb4(ido, l1, p1, p2, wa+iw);
-    else if(ip==2)
-      radb2(ido, l1, p1, p2, wa+iw);
-    else if(ip==3)
-      radb3(ido, l1, p1, p2, wa+iw);
-    else if(ip==5)
-      radb5(ido, l1, p1, p2, wa+iw);
-    else
-      {
-      radbg(ido, ip, l1, ido*l1, p1, p2, wa+iw);
-      if (ido!=1)
-        SWAP (p1,p2,double *);
-      }
-    SWAP (p1,p2,double *);
-    l1*=ip;
-    iw+=(ip-1)*ido;
-    }
-  if (p1!=c)
-    memcpy (c,ch,n*sizeof(double));
-  }
-
-void rfftf(size_t n, double r[], double wsave[])
-  { if(n!=1) rfftf1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); }
-
-void rfftb(size_t n, double r[], double wsave[])
-  { if(n!=1) rfftb1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); }
-
-static void rffti1(size_t n, double wa[], size_t ifac[])
-  {
-  static const size_t ntryh[4]={4,2,3,5};
-  static const double twopi=6.28318530717958647692;
-  size_t i, j, k, fi;
-
-  double argh=twopi/n;
-  size_t is=0, l1=1;
-  factorize (n,ntryh,4,ifac);
-  for (k=1; k<ifac[1]; k++)
-    {
-    size_t ip=ifac[k+1],
-           ido=n/(l1*ip);
-    for (j=1; j<ip; ++j)
-      {
-      double argld=j*l1*argh;
-      for(i=is,fi=1; i<=ido+is-3; i+=2,++fi)
-        {
-        double arg=fi*argld;
-        wa[i  ]=cos(arg);
-        wa[i+1]=sin(arg);
-        }
-      is+=ido;
-      }
-    l1*=ip;
-    }
-  }
-
-void rffti(size_t n, double wsave[])
-  { if (n!=1) rffti1(n, wsave+n,(size_t*)(wsave+2*n)); }
--- a/libfftpack/fftpack.h
+++ b/libfftpack/fftpack.h
@ -1,64 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
-  fftpack.h : function declarations for fftpack.c
-  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
-  (Version 4, 1985).
-
-  Pekka Janhunen 23.2.1995
-
-  (reformatted by joerg arndt)
-
-  reformatted and slightly enhanced by Martin Reinecke (2004)
- */
-
-#ifndef PLANCK_FFTPACK_H
-#define PLANCK_FFTPACK_H
-
-#include "c_utils.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! forward complex transform */
-void cfftf(size_t N, double complex_data[], double wrk[]);
-/*! backward complex transform */
-void cfftb(size_t N, double complex_data[], double wrk[]);
-/*! initializer for complex transforms */
-void cffti(size_t N, double wrk[]);
-
-/*! forward real transform */
-void rfftf(size_t N, double data[], double wrk[]);
-/*! backward real transform */
-void rfftb(size_t N, double data[], double wrk[]);
-/*! initializer for real transforms */
-void rffti(size_t N, double wrk[]);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libfftpack/fftpack_inc.c
+++ b/libfftpack/fftpack_inc.c
@ -1,306 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
-  fftpack.c : A set of FFT routines in C.
-  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
-  (Version 4, 1985).
-
-  C port by Martin Reinecke (2010)
- */
-
-#ifdef BACKWARD
-#define PSIGN +
-#define PMSIGNC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
-/* a = b*c */
-#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
-#else
-#define PSIGN -
-#define PMSIGNC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; }
-/* a = conj(b)*c */
-#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
-#endif
-
-static void X(2) (size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=2;
-  size_t k,i;
-  cmplx t;
-  if (ido==1)
-    for (k=0;k<l1;++k)
-      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
-  else
-    for (k=0;k<l1;++k)
-      for (i=0;i<ido;++i)
-        {
-        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
-        MULPMSIGNC (CH(i,k,1),WA(0,i),t)
-        }
-  }
-
-static void X(3)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=3;
-  static const double taur=-0.5, taui= PSIGN 0.86602540378443864676;
-  size_t i, k;
-  cmplx c2, c3, d2, d3, t2;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC (t2,c3,CC(0,1,k),CC(0,2,k))
-      ADDC (CH(0,k,0),t2,CC(0,0,k))
-      SCALEC(t2,taur)
-      ADDC(c2,CC(0,0,k),t2)
-      SCALEC(c3,taui)
-      CONJFLIPC(c3)
-      PMC(CH(0,k,1),CH(0,k,2),c2,c3)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC (t2,c3,CC(i,1,k),CC(i,2,k))
-        ADDC (CH(i,k,0),t2,CC(i,0,k))
-        SCALEC(t2,taur)
-        ADDC(c2,CC(i,0,k),t2)
-        SCALEC(c3,taui)
-        CONJFLIPC(c3)
-        PMC(d2,d3,c2,c3)
-        MULPMSIGNC(CH(i,k,1),WA(0,i),d2)
-        MULPMSIGNC(CH(i,k,2),WA(1,i),d3)
-        }
-  }
-
-static void X(4)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=4;
-  size_t i, k;
-  cmplx c2, c3, c4, t1, t2, t3, t4;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
-      CONJFLIPC(t4)
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
-      PMSIGNC (CH(0,k,1),CH(0,k,3),t1,t4)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC(t2,t1,CC(i,0,k),CC(i,2,k))
-        PMC(t3,t4,CC(i,1,k),CC(i,3,k))
-        CONJFLIPC(t4)
-        PMC(CH(i,k,0),c3,t2,t3)
-        PMSIGNC (c2,c4,t1,t4)
-        MULPMSIGNC (CH(i,k,1),WA(0,i),c2)
-        MULPMSIGNC (CH(i,k,2),WA(1,i),c3)
-        MULPMSIGNC (CH(i,k,3),WA(2,i),c4)
-        }
-  }
-
-static void X(5)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=5;
-  static const double tr11= 0.3090169943749474241,
-                      ti11= PSIGN 0.95105651629515357212,
-                      tr12=-0.8090169943749474241,
-                      ti12= PSIGN 0.58778525229247312917;
-  size_t i, k;
-  cmplx c2, c3, c4, c5, d2, d3, d4, d5, t2, t3, t4, t5;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC (t2,t5,CC(0,1,k),CC(0,4,k))
-      PMC (t3,t4,CC(0,2,k),CC(0,3,k))
-      CH(0,k,0).r=CC(0,0,k).r+t2.r+t3.r;
-      CH(0,k,0).i=CC(0,0,k).i+t2.i+t3.i;
-      c2.r=CC(0,0,k).r+tr11*t2.r+tr12*t3.r;
-      c2.i=CC(0,0,k).i+tr11*t2.i+tr12*t3.i;
-      c3.r=CC(0,0,k).r+tr12*t2.r+tr11*t3.r;
-      c3.i=CC(0,0,k).i+tr12*t2.i+tr11*t3.i;
-      c5.r=ti11*t5.r+ti12*t4.r;
-      c5.i=ti11*t5.i+ti12*t4.i;
-      c4.r=ti12*t5.r-ti11*t4.r;
-      c4.i=ti12*t5.i-ti11*t4.i;
-      CONJFLIPC(c5)
-      PMC(CH(0,k,1),CH(0,k,4),c2,c5)
-      CONJFLIPC(c4)
-      PMC(CH(0,k,2),CH(0,k,3),c3,c4)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC (t2,t5,CC(i,1,k),CC(i,4,k))
-        PMC (t3,t4,CC(i,2,k),CC(i,3,k))
-        CH(i,k,0).r=CC(i,0,k).r+t2.r+t3.r;
-        CH(i,k,0).i=CC(i,0,k).i+t2.i+t3.i;
-        c2.r=CC(i,0,k).r+tr11*t2.r+tr12*t3.r;
-        c2.i=CC(i,0,k).i+tr11*t2.i+tr12*t3.i;
-        c3.r=CC(i,0,k).r+tr12*t2.r+tr11*t3.r;
-        c3.i=CC(i,0,k).i+tr12*t2.i+tr11*t3.i;
-        c5.r=ti11*t5.r+ti12*t4.r;
-        c5.i=ti11*t5.i+ti12*t4.i;
-        c4.r=ti12*t5.r-ti11*t4.r;
-        c4.i=ti12*t5.i-ti11*t4.i;
-        CONJFLIPC(c5)
-        PMC(d2,d5,c2,c5)
-        CONJFLIPC(c4)
-        PMC(d3,d4,c3,c4)
-        MULPMSIGNC (CH(i,k,1),WA(0,i),d2)
-        MULPMSIGNC (CH(i,k,2),WA(1,i),d3)
-        MULPMSIGNC (CH(i,k,3),WA(2,i),d4)
-        MULPMSIGNC (CH(i,k,4),WA(3,i),d5)
-        }
-  }
-
-static void X(6)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=6;
-  static const double taui= PSIGN 0.86602540378443864676;
-  cmplx ta1,ta2,ta3,a0,a1,a2,tb1,tb2,tb3,b0,b1,b2,d1,d2,d3,d4,d5;
-  size_t i, k;
-
-  if (ido==1)
-    for (k=0; k<l1; ++k)
-      {
-      PMC(ta1,ta3,CC(0,2,k),CC(0,4,k))
-      ta2.r = CC(0,0,k).r - .5*ta1.r;
-      ta2.i = CC(0,0,k).i - .5*ta1.i;
-      SCALEC(ta3,taui)
-      ADDC(a0,CC(0,0,k),ta1)
-      CONJFLIPC(ta3)
-      PMC(a1,a2,ta2,ta3)
-      PMC(tb1,tb3,CC(0,5,k),CC(0,1,k))
-      tb2.r = CC(0,3,k).r - .5*tb1.r;
-      tb2.i = CC(0,3,k).i - .5*tb1.i;
-      SCALEC(tb3,taui)
-      ADDC(b0,CC(0,3,k),tb1)
-      CONJFLIPC(tb3)
-      PMC(b1,b2,tb2,tb3)
-      PMC(CH(0,k,0),CH(0,k,3),a0,b0)
-      PMC(CH(0,k,4),CH(0,k,1),a1,b1)
-      PMC(CH(0,k,2),CH(0,k,5),a2,b2)
-      }
-  else
-    for (k=0; k<l1; ++k)
-      for (i=0; i<ido; ++i)
-        {
-        PMC(ta1,ta3,CC(i,2,k),CC(i,4,k))
-        ta2.r = CC(i,0,k).r - .5*ta1.r;
-        ta2.i = CC(i,0,k).i - .5*ta1.i;
-        SCALEC(ta3,taui)
-        ADDC(a0,CC(i,0,k),ta1)
-        CONJFLIPC(ta3)
-        PMC(a1,a2,ta2,ta3)
-        PMC(tb1,tb3,CC(i,5,k),CC(i,1,k))
-        tb2.r = CC(i,3,k).r - .5*tb1.r;
-        tb2.i = CC(i,3,k).i - .5*tb1.i;
-        SCALEC(tb3,taui)
-        ADDC(b0,CC(i,3,k),tb1)
-        CONJFLIPC(tb3)
-        PMC(b1,b2,tb2,tb3)
-        PMC(CH(i,k,0),d3,a0,b0)
-        PMC(d4,d1,a1,b1)
-        PMC(d2,d5,a2,b2)
-        MULPMSIGNC (CH(i,k,1),WA(0,i),d1)
-        MULPMSIGNC (CH(i,k,2),WA(1,i),d2)
-        MULPMSIGNC (CH(i,k,3),WA(2,i),d3)
-        MULPMSIGNC (CH(i,k,4),WA(3,i),d4)
-        MULPMSIGNC (CH(i,k,5),WA(4,i),d5)
-        }
-  }
-
-static void X(g)(size_t ido, size_t ip, size_t l1, const cmplx *cc, cmplx *ch,
-  const cmplx *wa)
-  {
-  const size_t cdim=ip;
-  cmplx *tarr=RALLOC(cmplx,2*ip);
-  cmplx *ccl=tarr, *wal=tarr+ip;
-  size_t i,j,k,l,jc,lc;
-  size_t ipph = (ip+1)/2;
-
-  for (i=1; i<ip; ++i)
-    wal[i]=wa[ido*(i-1)];
-  for (k=0; k<l1; ++k)
-    for (i=0; i<ido; ++i)
-      {
-      cmplx s=CC(i,0,k);
-      ccl[0] = CC(i,0,k);
-      for(j=1,jc=ip-1; j<ipph; ++j,--jc)
-        {
-        PMC (ccl[j],ccl[jc],CC(i,j,k),CC(i,jc,k))
-        ADDC (s,s,ccl[j])
-        }
-      CH(i,k,0) = s;
-      for (j=1, jc=ip-1; j<=ipph; ++j,--jc)
-        {
-        cmplx abr=ccl[0], abi={0.,0.};
-        size_t iang=0;
-        for (l=1,lc=ip-1; l<ipph; ++l,--lc)
-          {
-          iang+=j;
-          if (iang>ip) iang-=ip;
-          abr.r += ccl[l ].r*wal[iang].r;
-          abr.i += ccl[l ].i*wal[iang].r;
-          abi.r += ccl[lc].r*wal[iang].i;
-          abi.i += ccl[lc].i*wal[iang].i;
-          }
-#ifndef BACKWARD
-          { abi.i=-abi.i; abi.r=-abi.r; }
-#endif
-        CONJFLIPC(abi)
-        PMC(CH(i,k,j),CH(i,k,jc),abr,abi)
-        }
-      }
-
-  DEALLOC(tarr);
-
-  if (ido==1) return;
-
-  for (j=1; j<ip; ++j)
-    for (k=0; k<l1; ++k)
-      {
-      size_t idij=(j-1)*ido+1;
-      for(i=1; i<ido; ++i, ++idij)
-        {
-        cmplx t=CH(i,k,j);
-        MULPMSIGNC (CH(i,k,j),wa[idij],t)
-        }
-      }
-  }
-
-#undef PSIGN
-#undef PMSIGNC
-#undef MULPMSIGNC
--- a/libfftpack/libfftpack.dox
+++ b/libfftpack/libfftpack.dox
@ -1,5 +0,0 @@
-/*! \mainpage Libfftpack documentation
-  <ul>
-  <li>\ref fftgroup "Programming interface"
-  </ul>
- */
--- a/libfftpack/ls_fft.c
+++ b/libfftpack/ls_fft.c
@ -1,291 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*
- *  Copyright (C) 2005 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include "bluestein.h"
-#include "fftpack.h"
-#include "ls_fft.h"
-
-complex_plan make_complex_plan (size_t length)
-  {
-  complex_plan plan = RALLOC(complex_plan_i,1);
-  size_t pfsum = prime_factor_sum(length);
-  double comp1 = (double)(length*pfsum);
-  double comp2 = 2*3*length*log(3.*length);
-  comp2*=3.; /* fudge factor that appears to give good overall performance */
-  plan->length=length;
-  plan->bluestein = (comp2<comp1);
-  if (plan->bluestein)
-    bluestein_i (length,&(plan->work),&(plan->worksize));
-  else
-    {
-    plan->worksize=4*length+15;
-    plan->work=RALLOC(double,4*length+15);
-    cffti(length, plan->work);
-    }
-  return plan;
-  }
-
-complex_plan copy_complex_plan (complex_plan plan)
-  {
-  if (!plan) return NULL;
-  {
-  complex_plan newplan = RALLOC(complex_plan_i,1);
-  *newplan = *plan;
-  newplan->work=RALLOC(double,newplan->worksize);
-  memcpy(newplan->work,plan->work,sizeof(double)*newplan->worksize);
-  return newplan;
-  }
-  }
-
-void kill_complex_plan (complex_plan plan)
-  {
-  DEALLOC(plan->work);
-  DEALLOC(plan);
-  }
-
-void complex_plan_forward (complex_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    bluestein (plan->length, data, plan->work, -1);
-  else
-    cfftf (plan->length, data, plan->work);
-  }
-
-void complex_plan_backward (complex_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    bluestein (plan->length, data, plan->work, 1);
-  else
-    cfftb (plan->length, data, plan->work);
-  }
-
-
-real_plan make_real_plan (size_t length)
-  {
-  real_plan plan = RALLOC(real_plan_i,1);
-  size_t pfsum = prime_factor_sum(length);
-  double comp1 = .5*length*pfsum;
-  double comp2 = 2*3*length*log(3.*length);
-  comp2*=3; /* fudge factor that appears to give good overall performance */
-  plan->length=length;
-  plan->bluestein = (comp2<comp1);
-  if (plan->bluestein)
-    bluestein_i (length,&(plan->work),&(plan->worksize));
-  else
-    {
-    plan->worksize=2*length+15;
-    plan->work=RALLOC(double,2*length+15);
-    rffti(length, plan->work);
-    }
-  return plan;
-  }
-
-real_plan copy_real_plan (real_plan plan)
-  {
-  if (!plan) return NULL;
-  {
-  real_plan newplan = RALLOC(real_plan_i,1);
-  *newplan = *plan;
-  newplan->work=RALLOC(double,newplan->worksize);
-  memcpy(newplan->work,plan->work,sizeof(double)*newplan->worksize);
-  return newplan;
-  }
-  }
-
-void kill_real_plan (real_plan plan)
-  {
-  DEALLOC(plan->work);
-  DEALLOC(plan);
-  }
-
-void real_plan_forward_fftpack (real_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    {
-    size_t m;
-    size_t n=plan->length;
-    double *tmp = RALLOC(double,2*n);
-    for (m=0; m<n; ++m)
-      {
-      tmp[2*m] = data[m];
-      tmp[2*m+1] = 0.;
-      }
-    bluestein(n,tmp,plan->work,-1);
-    data[0] = tmp[0];
-    memcpy (data+1, tmp+2, (n-1)*sizeof(double));
-    DEALLOC(tmp);
-    }
-  else
-    rfftf (plan->length, data, plan->work);
-  }
-
-static void fftpack2halfcomplex (double *data, size_t n)
-  {
-  size_t m;
-  double *tmp = RALLOC(double,n);
-  tmp[0]=data[0];
-  for (m=1; m<(n+1)/2; ++m)
-    {
-    tmp[m]=data[2*m-1];
-    tmp[n-m]=data[2*m];
-    }
-  if (!(n&1))
-    tmp[n/2]=data[n-1];
-  memcpy (data,tmp,n*sizeof(double));
-  DEALLOC(tmp);
-  }
-
-static void halfcomplex2fftpack (double *data, size_t n)
-  {
-  size_t m;
-  double *tmp = RALLOC(double,n);
-  tmp[0]=data[0];
-  for (m=1; m<(n+1)/2; ++m)
-    {
-    tmp[2*m-1]=data[m];
-    tmp[2*m]=data[n-m];
-    }
-  if (!(n&1))
-    tmp[n-1]=data[n/2];
-  memcpy (data,tmp,n*sizeof(double));
-  DEALLOC(tmp);
-  }
-
-void real_plan_forward_fftw (real_plan plan, double *data)
-  {
-  real_plan_forward_fftpack (plan, data);
-  fftpack2halfcomplex (data,plan->length);
-  }
-
-void real_plan_backward_fftpack (real_plan plan, double *data)
-  {
-  if (plan->bluestein)
-    {
-    size_t m;
-    size_t n=plan->length;
-    double *tmp = RALLOC(double,2*n);
-    tmp[0]=data[0];
-    tmp[1]=0.;
-    memcpy (tmp+2,data+1, (n-1)*sizeof(double));
-    if ((n&1)==0) tmp[n+1]=0.;
-    for (m=2; m<n; m+=2)
-      {
-      tmp[2*n-m]=tmp[m];
-      tmp[2*n-m+1]=-tmp[m+1];
-      }
-    bluestein (n, tmp, plan->work, 1);
-    for (m=0; m<n; ++m)
-      data[m] = tmp[2*m];
-    DEALLOC(tmp);
-    }
-  else
-    rfftb (plan->length, data, plan->work);
-  }
-
-void real_plan_backward_fftw (real_plan plan, double *data)
-  {
-  halfcomplex2fftpack (data,plan->length);
-  real_plan_backward_fftpack (plan, data);
-  }
-
-void real_plan_forward_c (real_plan plan, double *data)
-  {
-  size_t m;
-  size_t n=plan->length;
-
-  if (plan->bluestein)
-    {
-    for (m=1; m<2*n; m+=2)
-      data[m]=0;
-    bluestein (plan->length, data, plan->work, -1);
-    data[1]=0;
-    for (m=2; m<n; m+=2)
-      {
-      double avg;
-      avg = 0.5*(data[2*n-m]+data[m]);
-      data[2*n-m] = data[m] = avg;
-      avg = 0.5*(data[2*n-m+1]-data[m+1]);
-      data[2*n-m+1] = avg;
-      data[m+1] = -avg;
-      }
-    if ((n&1)==0) data[n+1] = 0.;
-    }
-  else
-    {
-/* using "m+m" instead of "2*m" to avoid a nasty bug in Intel's compiler */
-    for (m=0; m<n; ++m) data[m+1] = data[m+m];
-    rfftf (n, data+1, plan->work);
-    data[0] = data[1];
-    data[1] = 0;
-    for (m=2; m<n; m+=2)
-      {
-      data[2*n-m]   =  data[m];
-      data[2*n-m+1] = -data[m+1];
-      }
-    if ((n&1)==0) data[n+1] = 0.;
-    }
-  }
-
-void real_plan_backward_c (real_plan plan, double *data)
-  {
-  size_t n=plan->length;
-
-  if (plan->bluestein)
-    {
-    size_t m;
-    data[1]=0;
-    for (m=2; m<n; m+=2)
-      {
-      double avg;
-      avg = 0.5*(data[2*n-m]+data[m]);
-      data[2*n-m] = data[m] = avg;
-      avg = 0.5*(data[2*n-m+1]-data[m+1]);
-      data[2*n-m+1] = avg;
-      data[m+1] = -avg;
-      }
-    if ((n&1)==0) data[n+1] = 0.;
-    bluestein (plan->length, data, plan->work, 1);
-    for (m=1; m<2*n; m+=2)
-      data[m]=0;
-    }
-  else
-    {
-    ptrdiff_t m;
-    data[1] = data[0];
-    rfftb (n, data+1, plan->work);
-    for (m=n-1; m>=0; --m)
-      {
-      data[2*m]   = data[m+1];
-      data[2*m+1] = 0.;
-      }
-    }
-  }
--- a/libfftpack/ls_fft.h
+++ b/libfftpack/ls_fft.h
@ -1,161 +0,0 @@
-/*
- *  This file is part of libfftpack.
- *
- *  libfftpack is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libfftpack is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libfftpack; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file ls_fft.h
- *  Interface for the LevelS FFT package.
- *
- *  Copyright (C) 2004 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_LS_FFT_H
-#define PLANCK_LS_FFT_H
-
-#include "c_utils.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*!\defgroup fftgroup FFT interface
-This package is intended to calculate one-dimensional real or complex FFTs
-with high accuracy and good efficiency even for lengths containing large
-prime factors.
-The code is written in C, but a Fortran wrapper exists as well.
-
-Before any FFT is executed, a plan must be generated for it. Plan creation
-is designed to be fast, so that there is no significant overhead if the
-plan is only used once or a few times.
-
-The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the
-double precision incarnation by Hugh C. Pumphrey
-(http://www.netlib.org/fftpack/dp.tgz).
-
-I replaced the iterative sine and cosine calculations in radfg() and radbg()
-by an exact calculation, which slightly improves the transform accuracy for
-real FFTs with lengths containing large prime factors.
-
-Since FFTPACK becomes quite slow for FFT lengths with large prime factors
-(in the worst case of prime lengths it reaches \f$\mathcal{O}(n^2)\f$
-complexity), I implemented Bluestein's algorithm, which computes a FFT of length
-\f$n\f$ by several FFTs of length \f$n_2\ge 2n-1\f$ and a convolution. Since
-\f$n_2\f$ can be chosen to be highly composite, this algorithm is more efficient
-if \f$n\f$ has large prime factors. The longer FFTs themselves are then computed
-using the FFTPACK routines.
-Bluestein's algorithm was implemented according to the description on Wikipedia
-(<a href="http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm">
-http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm</a>).
-
-\b Thread-safety:
-All routines can be called concurrently; all information needed by
-<tt>ls_fft</tt> is stored in the plan variable. However, using the same plan
-variable on multiple threads simultaneously is not supported and will lead to
-data corruption.
-*/
-/*! \{ */
-
-typedef struct
-  {
-  double *work;
-  size_t length, worksize;
-  int bluestein;
-  } complex_plan_i;
-
-/*! The opaque handle type for complex-FFT plans. */
-typedef complex_plan_i * complex_plan;
-
-/*! Returns a plan for a complex FFT with \a length elements. */
-complex_plan make_complex_plan (size_t length);
-/*! Constructs a copy of \a plan. */
-complex_plan copy_complex_plan (complex_plan plan);
-/*! Destroys a plan for a complex FFT. */
-void kill_complex_plan (complex_plan plan);
-/*! Computes a complex forward FFT on \a data, using \a plan.
-    \a Data has the form <tt>r0, i0, r1, i1, ...,
-    r[length-1], i[length-1]</tt>. */
-void complex_plan_forward (complex_plan plan, double *data);
-/*! Computes a complex backward FFT on \a data, using \a plan.
-    \a Data has the form <tt>r0, i0, r1, i1, ...,
-    r[length-1], i[length-1]</tt>. */
-void complex_plan_backward (complex_plan plan, double *data);
-
-typedef struct
-  {
-  double *work;
-  size_t length, worksize;
-  int bluestein;
-  } real_plan_i;
-
-/*! The opaque handle type for real-FFT plans. */
-typedef real_plan_i * real_plan;
-
-/*! Returns a plan for a real FFT with \a length elements. */
-real_plan make_real_plan (size_t length);
-/*! Constructs a copy of \a plan. */
-real_plan copy_real_plan (real_plan plan);
-/*! Destroys a plan for a real FFT. */
-void kill_real_plan (real_plan plan);
-/*! Computes a real forward FFT on \a data, using \a plan
-    and assuming the FFTPACK storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, ..., r[length-1]</tt>;
-    - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt>
-      (a total of \a length values). */
-void real_plan_forward_fftpack (real_plan plan, double *data);
-/*! Computes a real backward FFT on \a data, using \a plan
-    and assuming the FFTPACK storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, i1, r2, i2, ...</tt>
-    (a total of \a length values);
-    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
-void real_plan_backward_fftpack (real_plan plan, double *data);
-/*! Computes a real forward FFT on \a data, using \a plan
-    and assuming the FFTW halfcomplex storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, ..., r[length-1]</tt>;
-    - on exit, it has the form <tt>r0, r1, r2, ..., i2, i1</tt>. */
-void real_plan_forward_fftw (real_plan plan, double *data);
-/*! Computes a real backward FFT on \a data, using \a plan
-    and assuming the FFTW halfcomplex storage scheme:
-    - on entry, \a data has the form <tt>r0, r1, r2, ..., i2, i1</tt>.
-    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
-void real_plan_backward_fftw (real_plan plan, double *data);
-/*! Computes a real forward FFT on \a data, using \a plan
-    and assuming a full-complex storage scheme:
-    - on entry, \a data has the form <tt>r0, [ignored], r1, [ignored], ...,
-      r[length-1], [ignored]</tt>;
-    - on exit, it has the form <tt>r0, i0, r1, i1, ...,
-      r[length-1], i[length-1]</tt>. */
-void real_plan_forward_c (real_plan plan, double *data);
-/*! Computes a real backward FFT on \a data, using \a plan
-    and assuming a full-complex storage scheme:
-    - on entry, \a data has the form <tt>r0, i0, r1, i1, ...,
-      r[length-1], i[length-1]</tt>;
-    - on exit, it has the form <tt>r0, 0, r1, 0, ..., r[length-1], 0</tt>. */
-void real_plan_backward_c (real_plan plan, double *data);
-
-/*! \} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libfftpack/planck.make
+++ b/libfftpack/planck.make
@ -1,21 +0,0 @@
-PKG:=libfftpack
-
-SD:=$(SRCROOT)/$(PKG)
-OD:=$(BLDROOT)/$(PKG)
-
-FULL_INCLUDE+= -I$(SD)
-
-HDR_$(PKG):=$(SD)/*.h
-LIB_$(PKG):=$(LIBDIR)/libfftpack.a
-OBJ:=fftpack.o bluestein.o ls_fft.o
-OBJ:=$(OBJ:%=$(OD)/%)
-
-ODEP:=$(HDR_$(PKG)) $(HDR_c_utils)
-
-$(OD)/fftpack.o: $(SD)/fftpack_inc.c
-
-$(OBJ): $(ODEP) | $(OD)_mkdir
-$(LIB_$(PKG)): $(OBJ)
-
-all_hdr+=$(HDR_$(PKG))
-all_lib+=$(LIB_$(PKG))
--- a/libsharp/libsharp.dox
+++ b/libsharp/libsharp.dox
@ -70,8 +70,8 @@
  libsharp supports shared-memory parallelisation via OpenMP; this feature will
  be automatically enabled if the compiler supports it.

-  Libsharp will also make use of SSE2 and AVX instructions when compiled for a
-  platform known to support them.
+  Libsharp will also make use of SSE2/AVX/AVX512 instructions when compiled
+  for a platform known to support them.

  Support for MPI-parallel transforms is also available; in this mode,
  every MPI task must provide a unique subset of the map and a_lm coefficients.
--- a/libsharp/planck.make
+++ b/libsharp/planck.make
@ -1,29 +0,0 @@
-PKG:=libsharp
-
-SD:=$(SRCROOT)/$(PKG)
-OD:=$(BLDROOT)/$(PKG)
-
-FULL_INCLUDE+= -I$(SD)
-
-HDR_$(PKG):=$(SD)/*.h
-LIB_$(PKG):=$(LIBDIR)/libsharp.a
-BIN:=sharp_testsuite
-LIBOBJ:=sharp_ylmgen_c.o sharp.o sharp_announce.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o sharp_legendre.o sharp_legendre_roots.o sharp_legendre_table.o
-ALLOBJ:=$(LIBOBJ) sharp_testsuite.o
-LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
-ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
-
-ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
-$(OD)/sharp_core.o: $(SD)/sharp_core_inchelper.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c
-$(OD)/sharp.o: $(SD)/sharp_mpi.c
-BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
-
-$(LIB_$(PKG)): $(LIBOBJ)
-
-$(ALLOBJ): $(ODEP) | $(OD)_mkdir
-BIN:=$(BIN:%=$(BINDIR)/%)
-$(BIN): $(BINDIR)/% : $(OD)/%.o $(BDEP)
-
-all_hdr+=$(HDR_$(PKG))
-all_lib+=$(LIB_$(PKG))
-all_cbin+=$(BIN)
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@ -25,17 +25,16 @@
 /*! \file sharp.c
 *  Spherical transform library
 *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2016 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

 #include <math.h>
-#include "ls_fft.h"
+#include <string.h>
+#include "pocketfft/pocketfft.h"
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
 #include "c_utils.h"
-#include "sharp_core.h"
-#include "sharp_vecutil.h"
 #include "walltime_c.h"
 #include "sharp_almhelpers.h"
 #include "sharp_geomhelpers.h"
@ -63,7 +62,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
  *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
  }

-int sharp_get_mlim (int lmax, int spin, double sth, double cth)
+NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
  {
  double ofs=lmax*0.01;
  if (ofs<100.) ofs=100.;
@ -82,24 +81,25 @@ typedef struct
  double phi0_;
  dcmplx *shiftarr;
  int s_shift;
-  real_plan plan;
+  rfft_plan plan;
+  int length;
  int norot;
  } ringhelper;

 static void ringhelper_init (ringhelper *self)
  {
-  static ringhelper rh_null = { 0, NULL, 0, NULL, 0 };
+  static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
  *self = rh_null;
  }

 static void ringhelper_destroy (ringhelper *self)
  {
-  if (self->plan) kill_real_plan(self->plan);
+  if (self->plan) destroy_rfft_plan(self->plan);
  DEALLOC(self->shiftarr);
  ringhelper_init(self);
  }

-static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
+NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
  {
  self->norot = (fabs(phi0)<1e-14);
  if (!(self->norot))
@ -108,14 +108,18 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
      RESIZE (self->shiftarr,dcmplx,mmax+1);
      self->s_shift = mmax+1;
      self->phi0_ = phi0;
+// FIXME: improve this by using sincos2pibyn(nph) etc.
      for (int m=0; m<=mmax; ++m)
        self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
+//      double *tmp=(double *) self->shiftarr;
+//      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
      }
-  if (!self->plan) self->plan=make_real_plan(nph);
-  if (nph!=(int)self->plan->length)
+//  if (!self->plan) self->plan=make_rfft_plan(nph);
+  if (nph!=(int)self->length)
    {
-    kill_real_plan(self->plan);
-    self->plan=make_real_plan(nph);
+    if (self->plan) destroy_rfft_plan(self->plan);
+    self->plan=make_rfft_plan(nph);
+    self->length=nph;
    }
  }

@ -127,6 +131,7 @@ static int ringinfo_compare (const void *xa, const void *xb)
 static int ringpair_compare (const void *xa, const void *xb)
  {
  const sharp_ringpair *a=xa, *b=xb;
+//  return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
  if (a->r1.nph==b->r1.nph)
    return (a->r1.phi0 < b->r1.phi0) ? -1 :
      ((a->r1.phi0 > b->r1.phi0) ? 1 :
@ -261,6 +266,7 @@ void sharp_destroy_geom_info (sharp_geom_info *geom_info)
   distribution are permissible. */
 static int sharp_get_mmax (int *mval, int nm)
  {
+  //FIXME: if gaps are allowed, we have to search the maximum m in the array
  int *mcheck=RALLOC(int,nm);
  SET_ARRAY(mcheck,0,nm,0);
  for (int i=0; i<nm; ++i)
@ -274,7 +280,7 @@ static int sharp_get_mmax (int *mval, int nm)
  return nm-1;
  }

-static void ringhelper_phase2ring (ringhelper *self,
+NOINLINE static void ringhelper_phase2ring (ringhelper *self,
  const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
  int pstride, int flags)
  {
@ -288,13 +294,19 @@ static void ringhelper_phase2ring (ringhelper *self,

  if (nph>=2*mmax+1)
    {
-    for (int m=0; m<=mmax; ++m)
-      {
-      dcmplx tmp = phase[m*pstride]*wgt;
-      if(!self->norot) tmp*=self->shiftarr[m];
-      data[2*m]=creal(tmp);
-      data[2*m+1]=cimag(tmp);
-      }
+    if (self->norot)
+      for (int m=0; m<=mmax; ++m)
+        {
+        data[2*m]=creal(phase[m*pstride])*wgt;
+        data[2*m+1]=cimag(phase[m*pstride])*wgt;
+        }
+    else
+      for (int m=0; m<=mmax; ++m)
+        {
+        dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
+        data[2*m]=creal(tmp)*wgt;
+        data[2*m+1]=cimag(tmp)*wgt;
+        }
    for (int m=2*(mmax+1); m<nph+2; ++m)
      data[m]=0.;
    }
@ -323,10 +335,10 @@ static void ringhelper_phase2ring (ringhelper *self,
      }
    }
  data[1]=data[0];
-  real_plan_backward_fftpack (self->plan, &(data[1]));
+  rfft_backward (self->plan, &(data[1]), 1.);
  }

-static void ringhelper_ring2phase (ringhelper *self,
+NOINLINE static void ringhelper_ring2phase (ringhelper *self,
  const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
  int pstride, int flags)
  {
@ -342,7 +354,7 @@ static void ringhelper_ring2phase (ringhelper *self,
  if (flags&SHARP_REAL_HARMONICS)
    wgt *= sqrt_two;

-  real_plan_forward_fftpack (self->plan, &(data[1]));
+  rfft_forward (self->plan, &(data[1]), 1.);
  data[0]=data[1];
  data[1]=data[nph+1]=0.;

@ -376,7 +388,7 @@ static void ringhelper_ring2phase (ringhelper *self,
    phase[m*pstride]=0.;
  }

-static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
+NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
  int flags)
  {
  if (flags & SHARP_NO_FFT)
@ -386,50 +398,55 @@ static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
      if (flags&SHARP_DP)
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =value;
+          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =value;
+          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      else
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =(float)value;
+          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =(float)value;
+          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      }
    }
  else
    {
-    for (int j=0;j<ginfo->npairs;++j)
+    if (flags&SHARP_DP)
      {
-      if (flags&SHARP_DP)
+      for (int j=0;j<ginfo->npairs;++j)
        {
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =value;
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =value;
+        double *dmap=(double *)map;
+        if (ginfo->pair[j].r1.stride==1)
+          memset(&dmap[ginfo->pair[j].r1.ofs],0,
+            ginfo->pair[j].r1.nph*sizeof(double));
+        else
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
+            dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
+        if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
+          memset(&dmap[ginfo->pair[j].r2.ofs],0,
+            ginfo->pair[j].r2.nph*sizeof(double));
+        else
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
+            dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
-      else
+      }
+    else
+      {
+      for (int j=0;j<ginfo->npairs;++j)
        {
        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =(float)value;
+          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =(float)value;
+          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
        }
      }
    }
  }

-static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
+NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
+  int flags)
  {
 #define CLEARLOOP(real_t,body)             \
      {                                    \
@ -465,59 +482,67 @@ static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
    }
  }

-static void init_output (sharp_job *job)
+NOINLINE static void init_output (sharp_job *job)
  {
  if (job->flags&SHARP_ADD) return;
  if (job->type == SHARP_MAP2ALM)
-    for (int i=0; i<job->ntrans*job->nalm; ++i)
+    for (int i=0; i<job->nalm; ++i)
      clear_alm (job->ainfo,job->alm[i],job->flags);
  else
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
-      fill_map (job->ginfo,job->map[i],0.,job->flags);
+    for (int i=0; i<job->nmaps; ++i)
+      clear_map (job->ginfo,job->map[i],job->flags);
  }

-static void alloc_phase (sharp_job *job, int nm, int ntheta)
+NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
  {
  if (job->type==SHARP_MAP2ALM)
    {
-    if ((nm&1023)==0) nm+=3; // hack to avoid critical strides
-    job->s_m=2*job->ntrans*job->nmaps;
+    job->s_m=2*job->nmaps;
+    if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
    job->s_th=job->s_m*nm;
    }
  else
    {
-    if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides
-    job->s_th=2*job->ntrans*job->nmaps;
+    job->s_th=2*job->nmaps;
+    if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
    job->s_m=job->s_th*ntheta;
    }
-  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
+  job->phase=RALLOC(dcmplx,2*job->nmaps*nm*ntheta);
  }

 static void dealloc_phase (sharp_job *job)
  { DEALLOC(job->phase); }

 static void alloc_almtmp (sharp_job *job, int lmax)
-  { job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); }
+  { job->almtmp=RALLOC(dcmplx,job->nalm*(lmax+2)); }

 static void dealloc_almtmp (sharp_job *job)
  { DEALLOC(job->almtmp); }

-static void alm2almtmp (sharp_job *job, int lmax, int mi)
+NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
  {

-#define COPY_LOOP(real_t, source_t, expr_of_x)                      \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)            \
-    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
+#define COPY_LOOP(real_t, source_t, expr_of_x)              \
+  {                                                         \
+  for (int l=m; l<lmin; ++l)                                \
+    for (int i=0; i<job->nalm; ++i)             \
+      job->almtmp[job->nalm*l+i] = 0;           \
+  for (int l=lmin; l<=lmax; ++l)                            \
+    for (int i=0; i<job->nalm; ++i)             \
      {                                                     \
-        source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
-        job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
-      }
+      source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
+      job->almtmp[job->nalm*l+i] = expr_of_x;   \
+      }                                                     \
+  for (int i=0; i<job->nalm; ++i)             \
+    job->almtmp[job->nalm*(lmax+1)+i] = 0;           \
+  }

  if (job->type!=SHARP_MAP2ALM)
    {
    ptrdiff_t ofs=job->ainfo->mvstart[mi];
    int stride=job->ainfo->stride;
    int m=job->ainfo->mval[mi];
+    int lmin=(m<job->spin) ? job->spin : m;
    /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
       coefficients by sqrt_one_half; here we must compensate to avoid scaling
       m=0 */
@ -562,20 +587,20 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
      }
    }
  else
-    SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi],
-              job->ntrans*job->nalm*(lmax+1),0.);
+    memset (job->almtmp+job->nalm*job->ainfo->mval[mi], 0,
+      job->nalm*(lmax+2-job->ainfo->mval[mi])*sizeof(dcmplx));

 #undef COPY_LOOP
  }

-static void almtmp2alm (sharp_job *job, int lmax, int mi)
+NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
  {

 #define COPY_LOOP(real_t, target_t, expr_of_x)               \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)             \
-    for (int i=0; i<job->ntrans*job->nalm; ++i)              \
+  for (int l=lmin; l<=lmax; ++l)                             \
+    for (int i=0; i<job->nalm; ++i)              \
      {                                                      \
-        dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i];   \
+        dcmplx x = job->almtmp[job->nalm*l+i];   \
        *(target_t *)(((real_t *)job->alm[i])+ofs+l*stride) += expr_of_x; \
      }

@ -583,6 +608,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
  ptrdiff_t ofs=job->ainfo->mvstart[mi];
  int stride=job->ainfo->stride;
  int m=job->ainfo->mval[mi];
+  int lmin=(m<job->spin) ? job->spin : m;
  /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
     coefficients by sqrt_two; here we must compensate to avoid scaling
     m=0 */
@ -629,27 +655,56 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
 #undef COPY_LOOP
  }

-static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
-  int rstride)
+NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
+  const double *ringtmp, int rstride)
  {
-  double **dmap = (double **)job->map;
-  float  **fmap = (float  **)job->map;
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
-    for (int m=0; m<ri->nph; ++m)
-      if (job->flags & SHARP_DP)
-        dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1];
+  if (job->flags & SHARP_DP)
+    {
+    double **dmap = (double **)job->map;
+    for (int i=0; i<job->nmaps; ++i)
+      {
+      double *restrict p1=&dmap[i][ri->ofs];
+      const double *restrict p2=&ringtmp[i*rstride+1];
+      if (ri->stride==1)
+        {
+        if (job->flags&SHARP_ADD)
+          for (int m=0; m<ri->nph; ++m)
+            p1[m] += p2[m];
+        else
+          memcpy(p1,p2,ri->nph*sizeof(double));
+        }
      else
+        for (int m=0; m<ri->nph; ++m)
+          p1[m*ri->stride] += p2[m];
+      }
+    }
+  else
+    {
+    float  **fmap = (float  **)job->map;
+    for (int i=0; i<job->nmaps; ++i)
+      for (int m=0; m<ri->nph; ++m)
        fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
+    }
  }

-static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
-  int rstride)
+NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
+  double *ringtmp, int rstride)
  {
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
-    for (int m=0; m<ri->nph; ++m)
-      ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ?
-        ((double *)(job->map[i]))[ri->ofs+m*ri->stride] :
-        ((float  *)(job->map[i]))[ri->ofs+m*ri->stride];
+  if (job->flags & SHARP_DP)
+    for (int i=0; i<job->nmaps; ++i)
+      {
+      double *restrict p1=&ringtmp[i*rstride+1],
+             *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
+      if (ri->stride==1)
+        memcpy(p1,p2,ri->nph*sizeof(double));
+      else
+        for (int m=0; m<ri->nph; ++m)
+          p1[m] = p2[m*ri->stride];
+      }
+  else
+    for (int i=0; i<job->nmaps; ++i)
+      for (int m=0; m<ri->nph; ++m)
+        ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
  }

 static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@ -657,7 +712,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
  {
  if (ri->nph<0)
    {
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
      for (int m=0; m<=mmax; ++m)
        phase[2*i+job->s_m*m]=0.;
    }
@ -667,7 +722,7 @@ static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
    double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
    if (job->flags&SHARP_REAL_HARMONICS)
      wgt *= sqrt_two;
-    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+    for (int i=0; i<job->nmaps; ++i)
      for (int m=0; m<=mmax; ++m)
        phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
          ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
@ -684,7 +739,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
  double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
  if (job->flags&SHARP_REAL_HARMONICS)
    wgt *= sqrt_one_half;
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
+  for (int i=0; i<job->nmaps; ++i)
    for (int m=0; m<=mmax; ++m)
      if (job->flags & SHARP_DP)
        dmap[i][ri->ofs+m*ri->stride] += wgt*phase[2*i+job->s_m*m];
@ -693,7 +748,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
  }

 //FIXME: set phase to zero if not SHARP_MAP2ALM?
-static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
  {
  if (job->type != SHARP_MAP2ALM) return;
  int pstride = job->s_m;
@ -715,19 +770,19 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
    ringhelper helper;
    ringhelper_init(&helper);
    int rstride=job->ginfo->nphmax+2;
-    double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
+    double *ringtmp=RALLOC(double,job->nmaps*rstride);
 #pragma omp for schedule(dynamic,1)
    for (int ith=llim; ith<ulim; ++ith)
      {
      int dim2 = job->s_th*(ith-llim);
      ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
-      for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int i=0; i<job->nmaps; ++i)
        ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
          &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
      if (job->ginfo->pair[ith].r2.nph>0)
        {
        ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
-        for (int i=0; i<job->ntrans*job->nmaps; ++i)
+        for (int i=0; i<job->nmaps; ++i)
          ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
           &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
        }
@ -738,7 +793,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
    }
  }

-static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
  {
  if (job->type == SHARP_MAP2ALM) return;
  int pstride = job->s_m;
@ -760,18 +815,18 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
    ringhelper helper;
    ringhelper_init(&helper);
    int rstride=job->ginfo->nphmax+2;
-    double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
+    double *ringtmp=RALLOC(double,job->nmaps*rstride);
 #pragma omp for schedule(dynamic,1)
    for (int ith=llim; ith<ulim; ++ith)
      {
      int dim2 = job->s_th*(ith-llim);
-      for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int i=0; i<job->nmaps; ++i)
        ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
          &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
      ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
      if (job->ginfo->pair[ith].r2.nph>0)
        {
-        for (int i=0; i<job->ntrans*job->nmaps; ++i)
+        for (int i=0; i<job->nmaps; ++i)
          ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
            &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
        ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
@ -783,7 +838,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
    }
  }

-static void sharp_execute_job (sharp_job *job)
+NOINLINE static void sharp_execute_job (sharp_job *job)
  {
  double timer=wallTime();
  job->opcnt=0;
@ -798,8 +853,9 @@ static void sharp_execute_job (sharp_job *job)
  init_output (job);

  int nchunks, chunksize;
-  get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
-    &chunksize);
+  get_chunk_info(job->ginfo->npairs,sharp_veclen()*sharp_max_nvec(job->spin),
+                 &nchunks,&chunksize);
+//FIXME: needs to be changed to "nm"
  alloc_phase (job,mmax+1,chunksize);

 /* chunk loop */
@ -863,10 +919,8 @@ static void sharp_execute_job (sharp_job *job)

 static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
  int spin, void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags)
+  const sharp_alm_info *alm_info, int flags)
  {
-  UTIL_ASSERT((ntrans>0)&&(ntrans<=SHARP_MAXTRANS),
-    "bad number of simultaneous transforms");
  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
  if (type==SHARP_MAP2ALM) flags|=SHARP_USE_WEIGHTS;
  if (type==SHARP_Yt) type=SHARP_MAP2ALM;
@ -881,24 +935,21 @@ static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
  job->ginfo = geom_info;
  job->ainfo = alm_info;
  job->flags = flags;
-  if ((job->flags&SHARP_NVMAX)==0)
-    job->flags|=sharp_nv_oracle (type, spin, ntrans);
  if (alm_info->flags&SHARP_REAL_HARMONICS)
    job->flags|=SHARP_REAL_HARMONICS;
  job->time = 0.;
  job->opcnt = 0;
-  job->ntrans = ntrans;
  job->alm=alm;
  job->map=map;
  }

 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
-  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
  int flags, double *time, unsigned long long *opcnt)
  {
  sharp_job job;
  sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
-    ntrans, flags);
+    flags);

  sharp_execute_job (&job);
  if (time!=NULL) *time = job.time;
@ -910,96 +961,16 @@ void sharp_set_chunksize_min(int new_chunksize_min)
 void sharp_set_nchunks_max(int new_nchunks_max)
  { nchunks_max=new_nchunks_max; }

-int sharp_get_nv_max (void)
-{ return 6; }
-
-static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
-  {
-  int lmax=511;
-  int mmax=(lmax+1)/2;
-  int nrings=(lmax+1)/4;
-  int ppring=1;
-
-  spin = (spin!=0) ? 2 : 0;
-
-  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
-  sharp_geom_info *tinfo;
-  sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
-
-  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
-
-  double **map;
-  ALLOC2D(map,double,ncomp,npix);
-  SET_ARRAY(map[0],0,npix*ncomp,0.);
-
-  sharp_alm_info *alms;
-  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
-
-  dcmplx **alm;
-  ALLOC2D(alm,dcmplx,ncomp,nalms);
-  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
-
-  double time=1e30;
-  int nvbest=-1;
-
-  for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
-    {
-    double time_acc=0.;
-    double jtime;
-    int ntries=0;
-    do
-      {
-      sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,ntrans,
-        nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
-
-      if (jtime<time) { time=jtime; nvbest=nv; }
-      time_acc+=jtime;
-      ++ntries;
-      }
-    while ((time_acc<0.02)&&(ntries<2));
-    }
-
-  DEALLOC2D(map);
-  DEALLOC2D(alm);
-
-  sharp_destroy_alm_info(alms);
-  sharp_destroy_geom_info(tinfo);
-  return nvbest;
-  }
-
-int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
-  {
-  static const int maxtr = 6;
-  static int nv_opt[6][2][5] = {
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}},
-    {{0,0,0,0,0},{0,0,0,0,0}} };
-
-  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
-  UTIL_ASSERT(type<5,"bad type");
-  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
-  UTIL_ASSERT(spin>=0, "bad spin");
-  ntrans=IMIN(ntrans,maxtr);
-
-  if (nv_opt[ntrans-1][spin!=0][type]==0)
-    nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
-  return nv_opt[ntrans-1][spin!=0][type];
-  }
-
 #ifdef USE_MPI
 #include "sharp_mpi.c"

 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt)
  {
  MPI_Comm comm = *(MPI_Comm*)pcomm;
-  sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info, ntrans,
+  sharp_execute_mpi((MPI_Comm)comm, type, spin, alm, map, geom_info, alm_info,
    flags, time, opcnt);
  return 0;
  }
@ -1008,12 +979,12 @@ int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,

 int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt)
  {
  /* Suppress unused warning: */
  (void)pcomm; (void)type; (void)spin; (void)alm; (void)map; (void)geom_info;
-  (void)alm_info; (void)ntrans; (void)flags; (void)time; (void)opcnt;
+  (void)alm_info; (void)flags; (void)time; (void)opcnt;
  return SHARP_ERROR_NO_MPI;
  }

--- a/libsharp/sharp.h
+++ b/libsharp/sharp.h
@ -23,24 +23,243 @@
 */

 /*! \file sharp.h
- *  Interface for the spherical transform library.
+ *  Portable interface for the spherical transform library.
 *
- *  Copyright (C) 2006-2012 Max-Planck-Society
- *  \author Martin Reinecke
+ *  Copyright (C) 2012-2019 Max-Planck-Society
+ *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

 #ifndef PLANCK_SHARP_H
 #define PLANCK_SHARP_H

+#include <stddef.h>
+
 #ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
+extern "C" {
 #endif

-#include <complex.h>
+/*! \internal
+    Helper type containing information about a single ring. */
+typedef struct
+  {
+  double theta, phi0, weight, cth, sth;
+  ptrdiff_t ofs;
+  int nph, stride;
+  } sharp_ringinfo;

-#include "sharp_lowlevel.h"
-#include "sharp_legendre.h"
-#include "sharp_legendre_roots.h"
-#include "sharp_legendre_table.h"
+/*! \internal
+    Helper type containing information about a pair of rings with colatitudes
+    symmetric around the equator. */
+typedef struct
+  {
+  sharp_ringinfo r1,r2;
+  } sharp_ringpair;
+
+/*! \internal
+    Type holding all required information about a map geometry. */
+typedef struct
+  {
+  sharp_ringpair *pair;
+  int npairs, nphmax;
+  } sharp_geom_info;
+
+/*! \defgroup almgroup Helpers for dealing with a_lm */
+/*! \{ */
+
+/*! \internal
+    Helper type for index calculation in a_lm arrays. */
+typedef struct
+  {
+  /*! Maximum \a l index of the array */
+  int lmax;
+  /*! Number of different \a m values in this object */
+  int nm;
+  /*! Array with \a nm entries containing the individual m values */
+  int *mval;
+  /*! Combination of flags from sharp_almflags */
+  int flags;
+  /*! Array with \a nm entries containing the (hypothetical) indices of
+      the coefficients with quantum numbers 0,\a mval[i] */
+  ptrdiff_t *mvstart;
+  /*! Stride between a_lm and a_(l+1),m */
+  ptrdiff_t stride;
+  } sharp_alm_info;
+
+/*! alm_info flags */
+typedef enum { SHARP_PACKED = 1,
+               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
+                    not present. mvstart is in units of *real* float/double for all
+                    m; stride is in units of reals for m=0 and complex for m!=0 */
+               SHARP_REAL_HARMONICS  = 1<<6
+               /*!< Use the real spherical harmonic convention. For
+                    m==0, the alm are treated exactly the same as in
+                    the complex case.  For m!=0, alm[i] represent a
+                    pair (+abs(m), -abs(m)) instead of (real, imag),
+                    and the coefficients are scaled by a factor of
+                    sqrt(2) relative to the complex case.  In other
+                    words, (sqrt(.5) * alm[i]) recovers the
+                    corresponding complex coefficient (when accessed
+                    as complex).
+                */
+             } sharp_almflags;
+
+
+
+/*! Creates an a_lm data structure from the following parameters:
+    \param lmax maximum \a l quantum number (>=0)
+    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
+    \param mstart the index of the (hypothetical) coefficient with the
+      quantum numbers 0,\a m. Must have \a mmax+1 entries.
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_alm_info (int lmax, int mmax, int stride,
+  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
+/*! Creates an a_lm data structure from the following parameters:
+    \param lmax maximum \a l quantum number (\a >=0)
+    \param nm number of different \a m (\a 0<=nm<=lmax+1)
+    \param stride the stride between entries with identical \a m, and \a l
+      differing by 1.
+    \param mval array with \a nm entries containing the individual m values
+    \param mvstart array with \a nm entries containing the (hypothetical)
+      indices of the coefficients with the quantum numbers 0,\a mval[i]
+    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
+  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
+/*! Returns the index of the coefficient with quantum numbers \a l,
+    \a mval[mi].
+    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
+    the index for the coefficient with the quantum numbers \a l, \a mi. */
+ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
+/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
+    flag is set, this is the number of "real" coefficients (for m < 0 and m >= 0),
+    otherwise it is the number of complex coefficients (with m>=0). */
+ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
+/*! Deallocates the a_lm info object. */
+void sharp_destroy_alm_info (sharp_alm_info *info);
+
+/*! \} */
+
+/*! \defgroup geominfogroup Functions for dealing with geometry information */
+/*! \{ */
+
+/*! Creates a geometry information object from a set of ring descriptions.
+    All arrays passed to this function must have \a nrings elements.
+    \param nrings the number of rings in the map
+    \param nph the number of pixels in each ring
+    \param ofs the index of the first pixel in each ring in the map array
+    \param stride the stride between consecutive pixels
+    \param phi0 the azimuth (in radians) of the first pixel in each ring
+    \param theta the colatitude (in radians) of each ring
+    \param wgt the pixel weight to be used for the ring in map2alm
+      and adjoint map2alm transforms.
+      Pass NULL to use 1.0 as weight for all rings.
+    \param geom_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
+  const int *stride, const double *phi0, const double *theta,
+  const double *wgt, sharp_geom_info **geom_info);
+
+/*! Counts the number of grid points needed for (the local part of) a map described
+    by \a info.
+ */
+ptrdiff_t sharp_map_size(const sharp_geom_info *info);
+
+/*! Deallocates the geometry information in \a info. */
+void sharp_destroy_geom_info (sharp_geom_info *info);
+
+/*! \} */
+
+/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
+/*! \{ */
+
+/*! Enumeration of SHARP job types. */
+typedef enum { SHARP_YtW=0,               /*!< analysis */
+               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
+               SHARP_Y=1,                 /*!< synthesis */
+               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
+               SHARP_Yt=2,                /*!< adjoint synthesis */
+               SHARP_WY=3,                /*!< adjoint analysis */
+               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
+             } sharp_jobtype;
+
+/*! Job flags */
+typedef enum { SHARP_DP              = 1<<4,
+               /*!< map and a_lm are in double precision */
+               SHARP_ADD             = 1<<5,
+               /*!< results are added to the output arrays, instead of
+                    overwriting them */
+
+               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
+                  but its use here is deprecated in favor of having it in the sharp_alm_info */
+
+               SHARP_NO_FFT          = 1<<7,
+
+               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
+               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
+             } sharp_jobflags;
+
+/*! Performs a libsharp SHT job. The interface deliberately does not use
+  the C99 "complex" data type, in order to be callable from C89 and C++.
+  \param type the type of SHT
+  \param spin the spin of the quantities to be transformed
+  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
+    alm[0] points to the a_lm of the SHT. If \a spin>0, alm[0] and alm[1]
+    point to the two a_lm sets of the SHT. The exact data type of \a alm
+    depends on whether the SHARP_DP flag is set.
+  \param map contains pointers to the maps. If \a spin==0,
+    map[0] points to the map of the SHT. If \a spin>0, or \a type is
+    SHARP_ALM2MAP_DERIV1, map[0] and map[1] point to the two maps of the SHT.
+    The exact data type of \a map depends on whether the SHARP_DP flag is set.
+  \param geom_info A \c sharp_geom_info object compatible with the provided
+    \a map arrays.
+  \param alm_info A \c sharp_alm_info object compatible with the provided
+    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
+    exactly once.
+  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
+    \a alm is expected to have the type "complex double **" and \a map is
+    expected to have the type "double **"; otherwise, the expected
+    types are "complex float **" and "float **", respectively.
+  \param time If not NULL, the wall clock time required for this SHT
+    (in seconds) will be written here.
+  \param opcnt If not NULL, a conservative estimate of the total floating point
+    operation count for this SHT will be written here. */
+void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info,
+  int flags, double *time, unsigned long long *opcnt);
+
+void sharp_set_chunksize_min(int new_chunksize_min);
+void sharp_set_nchunks_max(int new_nchunks_max);
+
+
+typedef enum { SHARP_ERROR_NO_MPI = 1,
+               /*!< libsharp not compiled with MPI support */
+              } sharp_errors;
+
+/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
+    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
+
+    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
+    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
+    is *(MPI_Comm*)pcomm.
+
+    Other parameters are the same as sharp_execute_mpi.
+
+    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
+    (in which case nothing is done).
+ */
+int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
+  void *alm, void *map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int flags, double *time,
+  unsigned long long *opcnt);
+
+/*! \} */
+
+#ifdef __cplusplus
+}
+#endif

 #endif
--- a/libsharp/sharp_almhelpers.c
+++ b/libsharp/sharp_almhelpers.c
@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.c
 *  Spherical transform library
 *
- *  Copyright (C) 2008-2013 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
 *  \author Martin Reinecke
 */

--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@ -25,14 +25,14 @@
 /*! \file sharp_almhelpers.h
 *  SHARP helper function for the creation of a_lm data structures
 *
- *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  Copyright (C) 2008-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

 #ifndef PLANCK_SHARP_ALMHELPERS_H
 #define PLANCK_SHARP_ALMHELPERS_H

-#include "sharp_lowlevel.h"
+#include "sharp.h"

 #ifdef __cplusplus
 extern "C" {
--- a/libsharp/sharp_announce.c
+++ b/libsharp/sharp_announce.c
@ -1,98 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_announce.c
- *  Banner for module startup
- *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#ifdef USE_MPI
-#include <mpi.h>
-#endif
-
-#include "sharp_announce.h"
-#include "sharp_vecutil.h"
-
-static void OpenMP_status(void)
-  {
-#ifndef _OPENMP
-  printf("OpenMP: not supported by this binary\n");
-#else
-  int threads = omp_get_max_threads();
-  if (threads>1)
-    printf("OpenMP active: max. %d threads.\n",threads);
-  else
-    printf("OpenMP active, but running with 1 thread only.\n");
-#endif
-  }
-
-static void MPI_status(void)
-  {
-#ifndef USE_MPI
-  printf("MPI: not supported by this binary\n");
-#else
-  int tasks;
-  MPI_Comm_size(MPI_COMM_WORLD,&tasks);
-  if (tasks>1)
-    printf("MPI active with %d tasks.\n",tasks);
-  else
-    printf("MPI active, but running with 1 task only.\n");
-#endif
-  }
-
-static void vecmath_status(void)
-  { printf("Supported vector length: %d\n",VLEN); }
-
-void sharp_announce (const char *name)
-  {
-  size_t m, nlen=strlen(name);
-  printf("\n+-");
-  for (m=0; m<nlen; ++m) printf("-");
-  printf("-+\n");
-  printf("| %s |\n", name);
-  printf("+-");
-  for (m=0; m<nlen; ++m) printf("-");
-  printf("-+\n\n");
-  vecmath_status();
-  OpenMP_status();
-  MPI_status();
-  printf("\n");
-  }
-
-void sharp_module_startup (const char *name, int argc, int argc_expected,
-  const char *argv_expected, int verbose)
-  {
-  if (verbose) sharp_announce (name);
-  if (argc==argc_expected) return;
-  if (verbose) fprintf(stderr, "Usage: %s %s\n", name, argv_expected);
-  exit(1);
-  }
--- a/libsharp/sharp_announce.h
+++ b/libsharp/sharp_announce.h
@ -1,39 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_announce.h
- *  Banner for module startup
- *
- *  Copyright (C) 2012 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef SHARP_ANNOUNCE_H
-#define SHARP_ANNOUNCE_H
-
-void sharp_announce (const char *name);
-void sharp_module_startup (const char *name, int argc, int argc_expected,
-  const char *argv_expected, int verbose);
-
-#endif
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@ -1,149 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*  \file sharp_complex_hacks.h
- *  support for converting vector types and complex numbers
- *
- *  Copyright (C) 2012,2013 Max-Planck-Society
- *  Author: Martin Reinecke
- */
-
-#ifndef SHARP_COMPLEX_HACKS_H
-#define SHARP_COMPLEX_HACKS_H
-
-#ifdef __cplusplus
-#error This header file cannot be included from C++, only from C
-#endif
-
-#include <math.h>
-#include <complex.h>
-#include "sharp_vecsupport.h"
-
-#define UNSAFE_CODE
-
-#if (VLEN==1)
-
-static inline complex double vhsum_cmplx(Tv a, Tv b)
-  { return a+_Complex_I*b; }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
-  { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
-
-#endif
-
-#if (VLEN==2)
-
-static inline complex double vhsum_cmplx (Tv a, Tv b)
-  {
-#if defined(__SSE3__)
-  Tv tmp = _mm_hadd_pd(a,b);
-#else
-  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
-#endif
-  union {Tv v; complex double c; } u;
-  u.v=tmp; return u.c;
-  }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
-  Tv d, complex double * restrict c1, complex double * restrict c2)
-  {
-#ifdef UNSAFE_CODE
-#if defined(__SSE3__)
-  vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
-  vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
-#else
-  vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-                               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
-  vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
-                               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
-#endif
-#else
-  union {Tv v; complex double c; } u1, u2;
-#if defined(__SSE3__)
-  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
-#else
-  u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
-              _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
-  u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
-              _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
-#endif
-  *c1+=u1.c; *c2+=u2.c;
-#endif
-  }
-
-#endif
-
-#if (VLEN==4)
-
-static inline complex double vhsum_cmplx (Tv a, Tv b)
-  {
-  Tv tmp=_mm256_hadd_pd(a,b);
-  Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
-  tmp=_mm256_add_pd(tmp,tmp2);
-#ifdef UNSAFE_CODE
-  complex double ret;
-  *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
-  return ret;
-#else
-  union {Tv v; complex double c[2]; } u;
-  u.v=tmp; return u.c[0];
-#endif
-  }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
-  {
-  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
-  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
-     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
-  tmp1=vadd(tmp3,tmp4);
-#ifdef UNSAFE_CODE
-  *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
-  *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
-#else
-  union {Tv v; complex double c[2]; } u;
-  u.v=tmp1;
-  *c1+=u.c[0]; *c2+=u.c[1];
-#endif
-  }
-
-#endif
-
-#if (VLEN==8)
-
-static inline complex double vhsum_cmplx(Tv a, Tv b)
-  { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }
-
-static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
-  complex double * restrict c1, complex double * restrict c2)
-  {
-  *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
-  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
-  }
-
-#endif
-
-#endif
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@ -1,240 +1,116 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
+#define XCONCATX(a,b) a##_##b /* token-paste: a and b joined by an underscore */
+#define XCONCATX2(a,b) XCONCATX(a,b) /* extra macro level so that arguments are expanded before being pasted */
+#define XARCH(a) XCONCATX2(a,ARCH) /* a_<ARCH>, using the value ARCH has at the point of use */

-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
+#define ARCH default /* while this is in effect, XARCH(x) expands to x_default */
+#include "sharp_core_inc.c" /* NOTE(review): presumably defines inner_loop_default and the sharp_*_default functions referenced in assign_funcs() - contents not visible here, confirm */
+#undef ARCH

-/*! \file sharp_core.c
- *  Computational core
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke
- */
+typedef void (*t_inner_loop) (sharp_job *job, const int *ispair, /* function pointer type with the same parameter list as inner_loop() */
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim);
+typedef int (*t_veclen) (void); /* function pointer type matching sharp_veclen() */
+typedef int (*t_max_nvec) (int spin); /* function pointer type matching sharp_max_nvec() */
+typedef const char *(*t_architecture) (void); /* function pointer type matching sharp_architecture() */

-#include <complex.h>
-#include <math.h>
-#include <string.h>
-#include "sharp_vecsupport.h"
-#include "sharp_complex_hacks.h"
-#include "sharp_ylmgen_c.h"
-#include "sharp.h"
-#include "sharp_core.h"
-#include "c_utils.h"
+static t_inner_loop inner_loop_ = NULL; /* implementation called by inner_loop(); NULL until assign_funcs() has run */
+static t_veclen veclen_ = NULL; /* implementation called by sharp_veclen(); NULL until assign_funcs() has run */
+static t_max_nvec max_nvec_ = NULL; /* implementation called by sharp_max_nvec(); NULL until assign_funcs() has run */
+static t_architecture architecture_ = NULL; /* implementation called by sharp_architecture(); NULL until assign_funcs() has run */

-typedef complex double dcmplx;
+#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)

-// must be in the range [0;6]
-#define MAXJOB_SPECIAL 2
+#define DECL(arch) /* per CPU feature name: a cached run-time support check plus prototypes of the feature-specific functions */ \
+static int XCONCATX2(have,arch)(void) /* have_<arch>(): nonzero if the running CPU reports support for <arch> */ \
+  { \
+  static int res=-1; /* -1 marks "not queried yet"; the result is kept across calls */ \
+  if (res<0) \
+    { \
+    __builtin_cpu_init(); /* sets up the CPU feature data queried on the next line */ \
+    res = __builtin_cpu_supports(#arch); /* #arch turns the macro argument into the feature-name string */ \
+    } \
+  return res; \
+  } \
+\
+void XCONCATX2(inner_loop,arch) (sharp_job *job, const int *ispair, /* prototypes only; NOTE(review): definitions presumably come from sharp_core_<arch>.c - confirm */ \
+  const double *cth_, const double *sth_, int llim, int ulim, \
+  sharp_Ylmgen_C *gen, int mi, const int *mlim); \
+int XCONCATX2(sharp_veclen,arch) (void); \
+int XCONCATX2(sharp_max_nvec,arch) (int spin); \
+const char *XCONCATX2(sharp_architecture,arch) (void);

-#define XCONCAT2(a,b) a##_##b
-#define CONCAT2(a,b) XCONCAT2(a,b)
-#define XCONCAT3(a,b,c) a##_##b##_##c
-#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
+#if (!defined(__AVX512F__)) /* each feature gets a run-time check only when its __FEATURE__ macro is not already defined for this translation unit */
+DECL(avx512f)
+#endif
+#if (!defined(__FMA4__))
+DECL(fma4)
+#endif
+#if (!defined(__FMA__))
+DECL(fma)
+#endif
+#if (!defined(__AVX2__))
+DECL(avx2)
+#endif
+#if (!defined(__AVX__))
+DECL(avx)
+#endif

-#define nvec 1
-#include "sharp_core_inchelper.c"
-#undef nvec
+#endif

-#define nvec 2
-#include "sharp_core_inchelper.c"
-#undef nvec
+static void assign_funcs(void) /* binds inner_loop_, veclen_, max_nvec_ and architecture_ to one set of implementations */
+  {
+#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* same condition that guards the DECL() declarations, so have_<arch>() exists here */
+#define DECL2(arch) /* if have_<arch>() is nonzero: select the <arch> functions and leave assign_funcs() */ \
+  if (XCONCATX2(have,arch)()) \
+    { \
+    inner_loop_ = XCONCATX2(inner_loop,arch); \
+    veclen_ = XCONCATX2(sharp_veclen,arch); \
+    max_nvec_ = XCONCATX2(sharp_max_nvec,arch); \
+    architecture_ = XCONCATX2(sharp_architecture,arch); \
+    return; /* the first supported feature wins; later checks are not reached */ \
+    }
+#if (!defined(__AVX512F__)) /* checks are tried in the order written: avx512f, fma4, fma, avx2, avx */
+DECL2(avx512f)
+#endif
+#if (!defined(__FMA4__))
+DECL2(fma4)
+#endif
+#if (!defined(__FMA__))
+DECL2(fma)
+#endif
+#if (!defined(__AVX2__))
+DECL2(avx2)
+#endif
+#if (!defined(__AVX__))
+DECL2(avx)
+#endif
+#endif
+  inner_loop_ = inner_loop_default; /* reached when no DECL2 check returned (or none was compiled in): use the _default functions */
+  veclen_ = sharp_veclen_default;
+  max_nvec_ = sharp_max_nvec_default;
+  architecture_ = sharp_architecture_default;
+  }

-#define nvec 3
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 4
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 5
-#include "sharp_core_inchelper.c"
-#undef nvec
-
-#define nvec 6
-#include "sharp_core_inchelper.c"
-#undef nvec

 void inner_loop (sharp_job *job, const int *ispair,const double *cth,
  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
  const int *mlim)
  {
-  int njobs=job->ntrans, nv=job->flags&SHARP_NVMAX;
-  if (njobs<=MAXJOB_SPECIAL)
-    {
-    switch (njobs*16+nv)
-      {
-#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
-      case 0x11:
-        CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x12:
-        CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x13:
-        CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x14:
-        CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x15:
-        CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x16:
-        CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
-      case 0x21:
-        CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x22:
-        CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x23:
-        CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x24:
-        CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x25:
-        CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x26:
-        CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
-      case 0x31:
-        CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x32:
-        CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x33:
-        CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x34:
-        CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x35:
-        CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x36:
-        CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
-      case 0x41:
-        CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x42:
-        CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x43:
-        CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x44:
-        CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x45:
-        CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x46:
-        CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
-      case 0x51:
-        CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x52:
-        CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x53:
-        CONCAT3(inner_loop,3,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x54:
-        CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x55:
-        CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x56:
-        CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
-      case 0x61:
-        CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x62:
-        CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x63:
-        CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x64:
-        CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x65:
-        CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-      case 0x66:
-        CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,mlim);
-        return;
-#endif
-      }
-    }
-#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
-  else
-    {
-    switch (nv)
-      {
-      case 1:
-        CONCAT2(inner_loop,1)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 2:
-        CONCAT2(inner_loop,2)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 3:
-        CONCAT2(inner_loop,3)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 4:
-        CONCAT2(inner_loop,4)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 5:
-        CONCAT2(inner_loop,5)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      case 6:
-        CONCAT2(inner_loop,6)
-          (job, ispair,cth,sth,llim,ulim,gen,mi,mlim,job->ntrans);
-        return;
-      }
-    }
-#endif
-  UTIL_FAIL("Incorrect vector parameters");
+  if (!inner_loop_) assign_funcs();
+  inner_loop_(job, ispair, cth, sth, llim, ulim, gen, mi, mlim);
+  }
+int sharp_veclen(void) /* forwards to the implementation stored in veclen_ and returns its result */
+  {
+  if (!veclen_) assign_funcs(); /* pointer still NULL: select the implementations first */
+  return veclen_();
+  }
+int sharp_max_nvec(int spin) /* forwards spin to the implementation stored in max_nvec_ and returns its result */
+  {
+  if (!max_nvec_) assign_funcs(); /* pointer still NULL: select the implementations first */
+  return max_nvec_(spin);
+  }
+const char *sharp_architecture(void) /* forwards to the implementation stored in architecture_ and returns its string */
+  {
+  if (!architecture_) assign_funcs(); /* pointer still NULL: select the implementations first */
+  return architecture_();
  }
--- a/libsharp/sharp_core.h
+++ b/libsharp/sharp_core.h
@ -1,50 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core.h
- *  Interface for the computational core
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef PLANCK_SHARP_CORE_H
-#define PLANCK_SHARP_CORE_H
-
-#include "sharp_internal.h"
-#include "sharp_ylmgen_c.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inner_loop (sharp_job *job, const int *ispair,const double *cth,
-  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
-  const int *mlim);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libsharp/sharp_core_avx.c
+++ b/libsharp/sharp_core_avx.c
@ -0,0 +1,11 @@
+#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* file is empty unless __GNUC__>=6 on x86_64 and __AVX__ is not already defined */
+
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH) /* with ARCH set below, XARCH(x) expands to x_avx */
+
+#define ARCH avx
+#pragma GCC target("avx") /* functions defined after this line are compiled with the "avx" target option */
+#include "sharp_core_inc.c" /* NOTE(review): presumably emits the *_avx functions declared by DECL(avx) in sharp_core.c - contents not visible here, confirm */
+
+#endif
--- a/libsharp/sharp_core_avx2.c
+++ b/libsharp/sharp_core_avx2.c
@ -0,0 +1,11 @@
+#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* file is empty unless __GNUC__>=6 on x86_64 and __AVX2__ is not already defined */
+
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH) /* with ARCH set below, XARCH(x) expands to x_avx2 */
+
+#define ARCH avx2
+#pragma GCC target("avx2") /* functions defined after this line are compiled with the "avx2" target option */
+#include "sharp_core_inc.c" /* NOTE(review): presumably emits the *_avx2 functions declared by DECL(avx2) in sharp_core.c - contents not visible here, confirm */
+
+#endif
--- a/libsharp/sharp_core_avx512f.c
+++ b/libsharp/sharp_core_avx512f.c
@ -0,0 +1,11 @@
+#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* file is empty unless __GNUC__>=6 on x86_64 and __AVX512F__ is not already defined */
+
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH) /* with ARCH set below, XARCH(x) expands to x_avx512f */
+
+#define ARCH avx512f
+#pragma GCC target("avx512f") /* functions defined after this line are compiled with the "avx512f" target option */
+#include "sharp_core_inc.c" /* NOTE(review): presumably emits the *_avx512f functions declared by DECL(avx512f) in sharp_core.c - contents not visible here, confirm */
+
+#endif
--- a/libsharp/sharp_core_fma.c
+++ b/libsharp/sharp_core_fma.c
@ -0,0 +1,11 @@
+#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* file is empty unless __GNUC__>=6 on x86_64 and __FMA__ is not already defined */
+
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH) /* with ARCH set below, XARCH(x) expands to x_fma */
+
+#define ARCH fma
+#pragma GCC target("fma") /* functions defined after this line are compiled with the "fma" target option */
+#include "sharp_core_inc.c" /* NOTE(review): presumably emits the *_fma functions declared by DECL(fma) in sharp_core.c - contents not visible here, confirm */
+
+#endif
--- a/libsharp/sharp_core_fma4.c
+++ b/libsharp/sharp_core_fma4.c
@ -0,0 +1,11 @@
+#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) /* file is empty unless __GNUC__>=6 on x86_64 and __FMA4__ is not already defined */
+
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH) /* with ARCH set below, XARCH(x) expands to x_fma4 */
+
+#define ARCH fma4
+#pragma GCC target("fma4") /* functions defined after this line are compiled with the "fma4" target option */
+#include "sharp_core_inc.c" /* NOTE(review): presumably emits the *_fma4 functions declared by DECL(fma4) in sharp_core.c - contents not visible here, confirm */
+
+#endif
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@ -1,803 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_core_inc2.c
- *  Type-dependent code for the computational core
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
-  int l, int lmax NJ1)
-  {
-if (njobs>1)
-  {
-  while (l<lmax-2)
-    {
-    Tb lam_3, lam_4;
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
-    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar2=vload(creal(alm[njobs*l+j])),
-         ai2=vload(cimag(alm[njobs*l+j])),
-         ar4=vload(creal(alm[njobs*(l+2)+j])),
-         ai4=vload(cimag(alm[njobs*(l+2)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
-        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
-        }
-      Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
-         ai3=vload(cimag(alm[njobs*(l+1)+j])),
-         ar1=vload(creal(alm[njobs*(l+3)+j])),
-         ai1=vload(cimag(alm[njobs*(l+3)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
-        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
-        }
-      }
-    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
-    l+=4;
-    }
-  }
-  while (l<lmax)
-    {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),
-         ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
-        }
-      ar=vload(creal(alm[njobs*(l+1)+j]));
-      ai=vload(cimag(alm[njobs*(l+1)+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
-        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
-        }
-      }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    l+=2;
-    }
-  if (l==lmax)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
-        }
-      }
-    }
-  }
-
-static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
-  NJ1)
-  {
-  while (l<lmax)
-    {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
-        }
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
-        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
-        }
-      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
-      }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    l+=2;
-    }
-  if (l==lmax)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
-        }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
-    }
-  }
-
-static void Z(calc_alm2map) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
-  Y(Tbri) * restrict p2 NJ1)
-  {
-  int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
-
-  Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(p1[j].r.v[i],tmp,ar);
-        vfmaeq(p1[j].i.v[i],tmp,ai);
-        }
-      }
-    if (++l>lmax) break;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(p2[j].r.v[i],tmp,ar);
-        vfmaeq(p2[j].i.v[i],tmp,ai);
-        }
-      }
-    if (++l>lmax) break;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
-      }
-    }
-  if (l>lmax) return;
-
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
-  }
-
-static void Z(calc_map2alm) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2 NJ1)
-  {
-  int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
-  int l=gen->m;
-  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
-  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
-  Tb corfac;
-  Y(getCorfac)(scale,&corfac,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scale,sharp_minscale);
-  while (!full_ieee)
-    {
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p1[j].r.v[i]);
-        vfmaeq(tim,tmp,p1[j].i.v[i]);
-        }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
-    if (++l>lmax) return;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
-      for (int i=0; i<nvec; ++i)
-        {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p2[j].r.v[i]);
-        vfmaeq(tim,tmp,p2[j].i.v[i]);
-        }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
-    if (++l>lmax) return;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
-    if (Y(rescale)(&lam_1,&lam_2,&scale))
-      {
-      Y(getCorfac)(scale,&corfac,gen->cf);
-      full_ieee = Y(TballGe)(scale,sharp_minscale);
-      }
-    }
-
-  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
-  }
-
-static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
-       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],agr,lw);
-      vfmaeq(px[j].qi.v[i],agi,lw);
-      vfmaeq(px[j].ur.v[i],acr,lw);
-      vfmaeq(px[j].ui.v[i],aci,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmseq(py[j].qr.v[i],aci,lx);
-      vfmaeq(py[j].qi.v[i],acr,lx);
-      vfmaeq(py[j].ur.v[i],agi,lx);
-      vfmseq(py[j].ui.v[i],agr,lx);
-      }
-    }
-  }
-
-static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
-  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
-  const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
-       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
-    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
-       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
-      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
-      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
-      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
-      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
-      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
-      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
-      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
-      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
-      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
-      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
-      }
-    }
-  }
-
-static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
-  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
-  const Tb * restrict rxm, dcmplx * restrict alm NJ1)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp->v[i],rxm->v[i]);
-      vfmaeq(agr,px[j].qr.v[i],lw);
-      vfmaeq(agi,px[j].qi.v[i],lw);
-      vfmaeq(acr,px[j].ur.v[i],lw);
-      vfmaeq(aci,px[j].ui.v[i],lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm->v[i],rxp->v[i]);
-      vfmseq(agr,py[j].ui.v[i],lx);
-      vfmaeq(agi,py[j].ur.v[i],lx);
-      vfmaeq(acr,py[j].qi.v[i],lx);
-      vfmseq(aci,py[j].qr.v[i],lx);
-      }
-    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
-    }
-  }
-
-static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax NJ1)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
-      &alm[2*njobs*(l+1)] NJ2);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
-  }
-
-static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
-  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
-  NJ1)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
-    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
-  }
-
-static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2 NJ1)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[2*njobs*l] NJ2);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[2*njobs*l] NJ2);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax) return;
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax NJ2);
-  }
-
-static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
-  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
-  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
-    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
-    if (++l>lmax) return;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
-    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
-    if (++l>lmax) return;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
-  }
-
-static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
-  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
-  {
-  for (int j=0; j<njobs; ++j)
-    {
-    Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lw=vadd(rxp.v[i],rxm.v[i]);
-      vfmaeq(px[j].qr.v[i],ar,lw);
-      vfmaeq(px[j].qi.v[i],ai,lw);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      Tv lx=vsub(rxm.v[i],rxp.v[i]);
-      vfmaeq(py[j].ur.v[i],ai,lx);
-      vfmseq(py[j].ui.v[i],ar,lx);
-      }
-    }
-  }
-
-static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
-  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
-  int lmax NJ1)
-  {
-  while (l<lmax)
-    {
-    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
-       fx2=vload(fx[l+1].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
-      }
-    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
-    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
-    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
-    fx2=vload(fx[l+2].f[2]);
-    for (int i=0; i<nvec; ++i)
-      {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
-      }
-    l+=2;
-    }
-  if (l==lmax)
-    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
-  }
-
-static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
-  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
-  Y(Tbqu) * restrict p2 NJ1)
-  {
-  int l, lmax=gen->lmax;
-  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
-  Y(iter_to_ieee_spin)
-    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
-  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
-  if (l>lmax) return;
-  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
-
-  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
-  Tb corfacp,corfacm;
-  Y(getCorfac)(scalep,&corfacp,gen->cf);
-  Y(getCorfac)(scalem,&corfacm,gen->cf);
-  const dcmplx * restrict alm=job->almtmp;
-  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-  while (!full_ieee)
-    {
-    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
-      &alm[njobs*l] NJ2);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
-    Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
-      &alm[njobs*l] NJ2);
-    if (++l>lmax) break;
-    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
-    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
-      {
-      Y(getCorfac)(scalep,&corfacp,gen->cf);
-      Y(getCorfac)(scalem,&corfacm,gen->cf);
-      full_ieee = Y(TballGe)(scalep,sharp_minscale)
-               && Y(TballGe)(scalem,sharp_minscale);
-      }
-    }
-
-  if (l>lmax) return;
-
-  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
-  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
-  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
-    lmax NJ2);
-  }
-
-
-#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
-
-static void Z(inner_loop) (sharp_job *job, const int *ispair,
-  const double *cth_, const double *sth_, int llim, int ulim,
-  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
-  {
-  const int nval=nvec*VLEN;
-  const int m = job->ainfo->mval[mi];
-  sharp_Ylmgen_prepare (gen, m);
-
-  switch (job->type)
-    {
-    case SHARP_ALM2MAP:
-    case SHARP_ALM2MAP_DERIV1:
-      {
-      if (job->spin==0)
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-
-          int skip=1;
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
-                complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
-                               r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
-                job->phase[phas_idx] = r1+r2;
-                if (ispair[itot])
-                  job->phase[phas_idx+1] = r1-r2;
-                }
-              }
-            }
-          }
-        }
-      else
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            }
-          if (!skip)
-            (job->type==SHARP_ALM2MAP) ?
-              Z(calc_alm2map_spin  )
-                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
-              Z(calc_alm2map_deriv1)
-                (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot<ulim-llim)
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
-                complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
-                               q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
-                               u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
-                               u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
-                job->phase[phas_idx] = q1+q2;
-                job->phase[phas_idx+2] = u1+u2;
-                if (ispair[itot])
-                  {
-                  dcmplx *phQ = &(job->phase[phas_idx+1]),
-                         *phU = &(job->phase[phas_idx+3]);
-                  *phQ = q1-q2;
-                  *phU = u1-u2;
-                  if ((gen->mhi-gen->m+gen->s)&1)
-                    { *phQ=-(*phQ); *phU=-(*phU); }
-                  }
-                }
-              }
-            }
-          }
-        }
-      break;
-      }
-    case SHARP_MAP2ALM:
-      {
-      if (job->spin==0)
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
-                dcmplx ph1=job->phase[phas_idx];
-                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
-                p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
-                p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
-                }
-              }
-            }
-          if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
-          }
-        }
-      else
-        {
-        for (int ith=0; ith<ulim-llim; ith+=nval)
-          {
-          Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
-          Y(Tbu) cth, sth;
-          int skip=1;
-
-          for (int i=0; i<nval; ++i)
-            {
-            int itot=i+ith;
-            if (itot>=ulim-llim) itot=ulim-llim-1;
-            if (mlim[itot]>=m) skip=0;
-            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
-            if (i+ith<ulim-llim)
-              {
-              for (int j=0; j<njobs; ++j)
-                {
-                int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
-                dcmplx p1Q=job->phase[phas_idx],
-                       p1U=job->phase[phas_idx+2],
-                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
-                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
-                if ((gen->mhi-gen->m+gen->s)&1)
-                  { p2Q=-p2Q; p2U=-p2U; }
-                p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
-                p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
-                p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
-                p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
-                }
-              }
-            }
-          if (!skip)
-            Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
-          }
-        }
-      break;
-      }
-    default:
-      {
-      UTIL_FAIL("must not happen");
-      break;
-      }
-    }
-  }
-
-#undef VZERO
--- a/libsharp/sharp_core_inchelper.c
+++ b/libsharp/sharp_core_inchelper.c
@ -1,70 +0,0 @@
-#define Tb CONCAT2(Tb,nvec)
-#define Y(arg) CONCAT2(arg,nvec)
-#include "sharp_core_inc.c"
-
-#if (SHARP_MAXTRANS>MAXJOB_SPECIAL)
-#define NJ1 , int njobs
-#define NJ2 , njobs
-#define Z(arg) CONCAT2(arg,nvec)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef NJ1
-#undef NJ2
-#endif
-
-#define NJ1
-#define NJ2
-
-#if ((MAXJOB_SPECIAL>=1)&&(SHARP_MAXTRANS>=1))
-#define njobs 1
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=2)&&(SHARP_MAXTRANS>=2))
-#define njobs 2
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=3)&&(SHARP_MAXTRANS>=3))
-#define njobs 3
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=4)&&(SHARP_MAXTRANS>=4))
-#define njobs 4
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=5)&&(SHARP_MAXTRANS>=5))
-#define njobs 5
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#if ((MAXJOB_SPECIAL>=6)&&(SHARP_MAXTRANS>=6))
-#define njobs 6
-#define Z(arg) CONCAT3(arg,nvec,njobs)
-#include "sharp_core_inc2.c"
-#undef Z
-#undef njobs
-#endif
-
-#undef NJ1
-#undef NJ2
-
-#undef Y
-#undef Tb
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@ -25,14 +25,15 @@
 /*! \file sharp_cxx.h
 *  Spherical transform library
 *
- *  Copyright (C) 2012-2015 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

 #ifndef PLANCK_SHARP_CXX_H
 #define PLANCK_SHARP_CXX_H

-#include "sharp_lowlevel.h"
+#include <complex>
+#include "sharp.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"

@ -47,8 +48,8 @@ class sharp_base
      : ainfo(0), ginfo(0) {}
    ~sharp_base()
      {
-      sharp_destroy_geom_info(ginfo);
-      sharp_destroy_alm_info(ainfo);
+      if (ginfo) sharp_destroy_geom_info(ginfo);
+      if (ainfo) sharp_destroy_alm_info(ainfo);
      }

    void set_general_geometry (int nrings, const int *nph, const ptrdiff_t *ofs,
@ -107,47 +108,115 @@ template<typename T> class sharp_cxxjob: public sharp_base
  private:
    static void *conv (T *ptr)
      { return reinterpret_cast<void *>(ptr); }
+    static void *conv (std::complex<T> *ptr)
+      { return reinterpret_cast<void *>(ptr); }
    static void *conv (const T *ptr)
      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
+    static void *conv (const std::complex<T> *ptr)
+      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }

  public:
-    void alm2map (const T *alm, T *map, bool add)
+    void alm2map (const T *alm, T *map, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
-        flags,0,0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, flags, 0, 0);
      }
-    void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2,
-      int spin, bool add)
+    void alm2map (const std::complex<T> *alm, T *map, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, flags, 0, 0);
+      }
+    void alm2map_spin (const T *alm1, const T *alm2,
+      T *map1, T *map2, int spin, bool add) const
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,flags, 0, 0);
      }
-    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add)
+    void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2,
+      T *map1, T *map2, int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP, spin, aptr, mptr, ginfo, ainfo, flags,0,0);
+      }
+    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const
      {
      void *aptr=conv(alm), *mptr[2];
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,flags,0,0);
      }
-    void map2alm (const T *map, T *alm, bool add)
+    void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add)
+      const
+      {
+      void *aptr=conv(alm), *mptr[2];
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,flags,0,0);
+      }
+    void alm2map_adjoint (const T *map, T *alm, bool add) const
      {
      void *aptr=conv(alm), *mptr=conv(map);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
      }
-    void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
-      int spin, bool add)
+    void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
+      }
+    void alm2map_spin_adjoint (const T *map1, const T *map2, T *alm1, T *alm2,
+      int spin, bool add) const
      {
      void *aptr[2], *mptr[2];
      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
      mptr[0]=conv(map1); mptr[1]=conv(map2);
      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
-      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      sharp_execute (SHARP_Yt,spin,aptr,mptr,ginfo,ainfo,flags,0,0);
+      }
+    void alm2map_spin_adjoint (const T *map1, const T *map2,
+      std::complex<T> *alm1, std::complex<T> *alm2, int spin, bool add) const
+      {
+      alm2map_spin_adjoint (map1, map2, reinterpret_cast<T *>(alm1),
+        reinterpret_cast<T *>(alm2), spin, add);
+      }
+    void map2alm (const T *map, T *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
+      }
+    void map2alm (const T *map, std::complex<T> *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,flags,0,0);
+      }
+    void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
+      int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,flags,0,0);
+      }
+    void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1,
+      std::complex<T> *alm2, int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,flags,0,0);
      }
  };

--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@ -25,17 +25,15 @@
 /*! \file sharp_geomhelpers.c
 *  Spherical transform library
 *
- *  Copyright (C) 2006-2012 Max-Planck-Society<br>
- *  Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl)
- *  \author Martin Reinecke \author Pavel Holoborodko
+ *  Copyright (C) 2006-2018 Max-Planck-Society
+ *  \author Martin Reinecke
 */

 #include <math.h>
 #include "sharp_geomhelpers.h"
 #include "sharp_legendre_roots.h"
 #include "c_utils.h"
-#include "ls_fft.h"
-#include <stdio.h>
+#include "pocketfft/pocketfft.h"

 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
  const int *rings, const double *weight, sharp_geom_info **geom_info)
@ -161,9 +159,9 @@ void sharp_make_fejer1_geom_info (int nrings, int ppring, double phi0,
    weight[2*k  ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
    }
  if ((nrings&1)==0) weight[nrings-1]=0.;
-  real_plan plan = make_real_plan(nrings);
-  real_plan_backward_fftpack(plan,weight);
-  kill_real_plan(plan);
+  rfft_plan plan = make_rfft_plan(nrings);
+  rfft_backward(plan,weight,1.);
+  destroy_rfft_plan(plan);

  for (int m=0; m<(nrings+1)/2; ++m)
    {
@ -208,9 +206,9 @@ void sharp_make_cc_geom_info (int nrings, int ppring, double phi0,
  for (int k=1; k<=(n/2-1); ++k)
    weight[2*k-1]=2./(1.-4.*k*k) + dw;
  weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1);
-  real_plan plan = make_real_plan(n);
-  real_plan_backward_fftpack(plan,weight);
-  kill_real_plan(plan);
+  rfft_plan plan = make_rfft_plan(n);
+  rfft_backward(plan,weight,1.);
+  destroy_rfft_plan(plan);
  weight[n]=weight[0];

  for (int m=0; m<(nrings+1)/2; ++m)
@ -256,9 +254,9 @@ void sharp_make_fejer2_geom_info (int nrings, int ppring, double phi0,
  for (int k=1; k<=(n/2-1); ++k)
    weight[2*k-1]=2./(1.-4.*k*k);
  weight[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1.;
-  real_plan plan = make_real_plan(n);
-  real_plan_backward_fftpack(plan,weight);
-  kill_real_plan(plan);
+  rfft_plan plan = make_rfft_plan(n);
+  rfft_backward(plan,weight,1.);
+  destroy_rfft_plan(plan);
  for (int m=0; m<nrings; ++m)
    weight[m]=weight[m+1];

--- a/libsharp/sharp_geomhelpers.h
+++ b/libsharp/sharp_geomhelpers.h
@ -25,14 +25,14 @@
 /*! \file sharp_geomhelpers.h
 *  SHARP helper function for the creation of grid geometries
 *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

 #ifndef PLANCK_SHARP_GEOMHELPERS_H
 #define PLANCK_SHARP_GEOMHELPERS_H

-#include "sharp_lowlevel.h"
+#include "sharp.h"

 #ifdef __cplusplus
 extern "C" {
--- a/libsharp/sharp_internal.h
+++ b/libsharp/sharp_internal.h
@ -25,7 +25,7 @@
 /*! \file sharp_internal.h
 *  Internally used functionality for the spherical transform library.
 *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

@ -36,7 +36,9 @@
 #error This header file cannot be included from C++, only from C
 #endif

+#include <complex.h>
 #include "sharp.h"
+#include "sharp_ylmgen_c.h"

 #define SHARP_MAXTRANS 100

@ -55,12 +57,17 @@ typedef struct
  const sharp_geom_info *ginfo;
  const sharp_alm_info *ainfo;
  double time;
-  int ntrans;
  unsigned long long opcnt;
  } sharp_job;

-int sharp_get_nv_max (void);
-int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
 int sharp_get_mlim (int lmax, int spin, double sth, double cth);

+void inner_loop (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, sharp_Ylmgen_C *gen, int mi,
+  const int *mlim);
+
+int sharp_veclen(void);
+int sharp_max_nvec(int spin);
+const char *sharp_architecture(void);
+
 #endif
--- a/libsharp/sharp_legendre.c
+++ b/libsharp/sharp_legendre.c
--- a/libsharp/sharp_legendre.c.in
+++ b/libsharp/sharp_legendre.c.in
@ -1,176 +0,0 @@
-/*
-
-    NOTE NOTE NOTE
-
-    This file is edited in sharp_legendre.c.in which is then preprocessed.
-    Do not make manual  modifications to sharp_legendre.c.
-
-    NOTE NOTE NOTE
-
-*/
-
-
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre.c.in
- *
- *  Copyright (C) 2015 University of Oslo
- *  \author Dag Sverre Seljebotn
- */
-
-#ifndef NO_LEGENDRE
-#if (VLEN==8)
-#error This code is not tested with MIC; please compile with -DNO_LEGENDRE
-/* ...or test it (it probably works) and remove this check */
-#endif
-
-#ifndef SHARP_LEGENDRE_CS
-#define SHARP_LEGENDRE_CS 4
-#endif
-
-#define MAX_CS 6
-#if (SHARP_LEGENDRE_CS > MAX_CS)
-#error (SHARP_LEGENDRE_CS > MAX_CS)
-#endif
-
-#include "sharp_legendre.h"
-#include "sharp_vecsupport.h"
-
-#include <stdlib.h>
-
-/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
-/*{ for cs in range(1, 7) }*/
-static void legendre_transform_vec{{cs}}{{T}}({{scalar}} *recfacs, {{scalar}} *bl, ptrdiff_t lmax,
-                                              {{scalar}} xarr[({{cs}}) * VLEN{{T}}],
-                                              {{scalar}} out[({{cs}}) * VLEN{{T}}]) {
-    /*{ for i in range(cs) }*/
-    Tv{{T}} P_{{i}}, Pm1_{{i}}, Pm2_{{i}}, x{{i}}, y{{i}};
-    /*{ endfor }*/
-    Tv{{T}} W1, W2, b, R;
-    ptrdiff_t l;
-
-    /*{ for i in range(cs) }*/
-    x{{i}} = vloadu{{T}}(xarr + {{i}} * VLEN{{T}});
-    Pm1_{{i}} = vload{{T}}(1.0);
-    P_{{i}} = x{{i}};
-    b = vload{{T}}(*bl);
-    y{{i}} = vmul{{T}}(Pm1_{{i}}, b);
-    /*{ endfor }*/
-    
-    b = vload{{T}}(*(bl + 1));
-    /*{ for i in range(cs) }*/
-    vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
-    /*{ endfor }*/
-
-    for (l = 2; l <= lmax; ++l) {
-        b = vload{{T}}(*(bl + l));
-        R = vload{{T}}(*(recfacs + l));
-        
-        /* 
-           P = x * Pm1 + recfacs[l] * (x * Pm1 - Pm2)
-        */
-        /*{ for i in range(cs) }*/
-        Pm2_{{i}} = Pm1_{{i}}; Pm1_{{i}} = P_{{i}};
-        W1 = vmul{{T}}(x{{i}}, Pm1_{{i}});
-        W2 = W1;
-        W2 = vsub{{T}}(W2, Pm2_{{i}});
-        P_{{i}} = W1;
-        vfmaeq{{T}}(P_{{i}}, W2, R);
-        vfmaeq{{T}}(y{{i}}, P_{{i}}, b);
-        /*{ endfor }*/
-
-    }
-    /*{ for i in range(cs) }*/
-    vstoreu{{T}}(out + {{i}} * VLEN{{T}}, y{{i}});
-    /*{ endfor }*/
-}
-/*{ endfor }*/
-/*{ endfor }*/
-
-
-/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
-void sharp_legendre_transform_recfac{{T}}({{scalar}} *r, ptrdiff_t lmax) {
-    /* (l - 1) / l, for l >= 2 */
-    ptrdiff_t l;
-    r[0] = 0;
-    r[1] = 1;
-    for (l = 2; l <= lmax; ++l) {
-        r[l] = ({{scalar}})(l - 1) / ({{scalar}})l;
-    }
-}
-/*{ endfor }*/
-
-/*
-  Compute sum_l b_l P_l(x_i) for all i. 
- */
-
-#define LEN (SHARP_LEGENDRE_CS * VLEN)
-#define LEN_s (SHARP_LEGENDRE_CS * VLEN_s)
-
-/*{ for scalar, T in [("double", ""), ("float", "_s")] }*/
-void sharp_legendre_transform{{T}}({{scalar}} *bl,
-                                   {{scalar}} *recfac,
-                                   ptrdiff_t lmax,
-                                   {{scalar}} *x, {{scalar}} *out, ptrdiff_t nx) {
-    {{scalar}} xchunk[MAX_CS * VLEN{{T}}], outchunk[MAX_CS * LEN{{T}}];
-    int compute_recfac;
-    ptrdiff_t i, j, len;
-
-    compute_recfac = (recfac == NULL);
-    if (compute_recfac) {
-        recfac = malloc(sizeof({{scalar}}) * (lmax + 1));
-        sharp_legendre_transform_recfac{{T}}(recfac, lmax);
-    }
-
-    for (j = 0; j != LEN{{T}}; ++j) xchunk[j] = 0;
-
-    for (i = 0; i < nx; i += LEN{{T}}) {
-        len = (i + (LEN{{T}}) <= nx) ? (LEN{{T}}) : (nx - i);
-        for (j = 0; j != len; ++j) xchunk[j] = x[i + j];
-        switch ((len + VLEN{{T}} - 1) / VLEN{{T}}) {
-          case 6: legendre_transform_vec6{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 5: legendre_transform_vec5{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 4: legendre_transform_vec4{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 3: legendre_transform_vec3{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 2: legendre_transform_vec2{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-          case 1:
-          case 0:
-              legendre_transform_vec1{{T}}(recfac, bl, lmax, xchunk, outchunk); break;
-        }
-        for (j = 0; j != len; ++j) out[i + j] = outchunk[j];
-    }
-    if (compute_recfac) {
-        free(recfac);
-    }
-}
-/*{ endfor }*/
-
-#endif
--- a/libsharp/sharp_legendre.h
+++ b/libsharp/sharp_legendre.h
@ -1,62 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre.h
- *  Interface for the Legendre transform parts of the spherical transform library.
- *
- *  Copyright (C) 2015 University of Oslo
- *  \author Dag Sverre Seljebotn
- */
-
-#ifndef SHARP_LEGENDRE_H
-#define SHARP_LEGENDRE_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef NO_LEGENDRE
-
-void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
-                              double *out, ptrdiff_t nx);
-void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
-                                float *out, ptrdiff_t nx);
-void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax);
-void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libsharp/sharp_legendre_roots.h
+++ b/libsharp/sharp_legendre_roots.h
@ -24,7 +24,7 @@

 /*! \file sharp_legendre_roots.h
 *
- *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  Copyright (C) 2006-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

--- a/libsharp/sharp_legendre_table.c
+++ b/libsharp/sharp_legendre_table.c
--- a/libsharp/sharp_legendre_table.h
+++ b/libsharp/sharp_legendre_table.h
@ -1,97 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * 
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file sharp_legendre_table.h
- *  Interface for computing tables of the normalized associated Legendre transform
- *
- *  Copyright (C) 2017 Dag Sverre Seljebotn
- *  \author Dag Sverre Seljebotn
- *
- *  Note: This code was mainly copied from libpsht; only a small high-level wrapper added
- */
-
-#ifndef SHARP_LEGENDRE_TABLE_H
-#define SHARP_LEGENDRE_TABLE_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef NO_LEGENDRE_TABLE
-
-
-/*! Returns a table of the normalized associated Legendre polynomials. m is a single
-    fixed argument and a table for multiple l and cos(theta) is provided.
-    (Internally, sin(theta) is also used for part of the computation, making theta
-    the most convenient argument.)
-
-    NOTE: Support for spin-weighted Legendre functions is on the TODO-list. Only spin=0
-    is supported now.
-
-    \param m The m-value to compute a table for; must be >= 0
-    \param spin The spin parameter; pass 0 for the regular associated Legendre functions.
-                NOTE: This is present for future compatibility, currently only 0 is supported.
-    \param lmax A table will be provided for l = m .. lmax
-    \param ntheta How many theta values to evaluate for
-    \param theta Contiguous 1D array of theta values
-    \param theta_stride See below
-    \param l_stride See below
-    \param spin_stride See below. "ispin" will always be 0 if spin==0, or 0 for positive spin
-                       and 1 for the corresponding negative spin otherwise.
-    \param out Contiguous 3D array that will receive the output. Each output entry
-               is assigned to out[itheta * theta_stride + (l - m) * l_stride + ispin * spin_stride].
- */
-void sharp_normalized_associated_legendre_table(
-  ptrdiff_t m,
-  int spin,
-  ptrdiff_t lmax,
-  ptrdiff_t ntheta,
-  /* contiguous 1D array of theta values to compute for,
-     contains ntheta values */
-  double *theta,
-  /* contiguous 2D array, in "theta-major ordering". Has `ntheta`
-     rows and `ncols` columns. Indexed as out[itheta * ncols + (l - m)].
-     If `ncols > lmax - m` then those entries are not accessed.
-  */
-  ptrdiff_t theta_stride,
-  ptrdiff_t l_stride,
-  ptrdiff_t spin_stride,
-  double *out
-);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libsharp/sharp_lowlevel.h
+++ b/libsharp/sharp_lowlevel.h
@ -1,272 +0,0 @@
-/*
- *  This file is part of libsharp.
- *
- *  libsharp is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libsharp is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libsharp; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_lowlevel.h
- *  Low-level, portable interface for the spherical transform library.
- *
- *  Copyright (C) 2012-2013 Max-Planck-Society
- *  \author Martin Reinecke \author Dag Sverre Seljebotn
- */
-
-#ifndef PLANCK_SHARP_LOWLEVEL_H
-#define PLANCK_SHARP_LOWLEVEL_H
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! \internal
-    Helper type containing information about a single ring. */
-typedef struct
-  {
-  double theta, phi0, weight, cth, sth;
-  ptrdiff_t ofs;
-  int nph, stride;
-  } sharp_ringinfo;
-
-/*! \internal
-    Helper type containing information about a pair of rings with colatitudes
-    symmetric around the equator. */
-typedef struct
-  {
-  sharp_ringinfo r1,r2;
-  } sharp_ringpair;
-
-/*! \internal
-    Type holding all required information about a map geometry. */
-typedef struct
-  {
-  sharp_ringpair *pair;
-  int npairs, nphmax;
-  } sharp_geom_info;
-
-/*! \defgroup almgroup Helpers for dealing with a_lm */
-/*! \{ */
-
-/*! \internal
-    Helper type for index calculation in a_lm arrays. */
-typedef struct
-  {
-  /*! Maximum \a l index of the array */
-  int lmax;
-  /*! Number of different \a m values in this object */
-  int nm;
-  /*! Array with \a nm entries containing the individual m values */
-  int *mval;
-  /*! Combination of flags from sharp_almflags */
-  int flags;
-  /*! Array with \a nm entries containing the (hypothetical) indices of
-      the coefficients with quantum numbers 0,\a mval[i] */
-  ptrdiff_t *mvstart;
-  /*! Stride between a_lm and a_(l+1),m */
-  ptrdiff_t stride;
-  } sharp_alm_info;
-
-/*! alm_info flags */
-typedef enum { SHARP_PACKED = 1,
-               /*!< m=0-coefficients are packed so that the (zero) imaginary part is
-                    not present. mvstart is in units of *real* float/double for all
-                    m; stride is in units of reals for m=0 and complex for m!=0 */
-               SHARP_REAL_HARMONICS  = 1<<6
-               /*!< Use the real spherical harmonic convention. For
-                    m==0, the alm are treated exactly the same as in
-                    the complex case.  For m!=0, alm[i] represent a
-                    pair (+abs(m), -abs(m)) instead of (real, imag),
-                    and the coefficients are scaled by a factor of
-                    sqrt(2) relative to the complex case.  In other
-                    words, (sqrt(.5) * alm[i]) recovers the
-                    corresponding complex coefficient (when accessed
-                    as complex).
-                */
-             } sharp_almflags;
-
-
-
-/*! Creates an a_lm data structure from the following parameters:
-    \param lmax maximum \a l quantum number (>=0)
-    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
-    \param stride the stride between entries with identical \a m, and \a l
-      differing by 1.
-    \param mstart the index of the (hypothetical) coefficient with the
-      quantum numbers 0,\a m. Must have \a mmax+1 entries.
-    \param alm_info will hold a pointer to the newly created data structure
- */
-void sharp_make_alm_info (int lmax, int mmax, int stride,
-  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
-/*! Creates an a_lm data structure from the following parameters:
-    \param lmax maximum \a l quantum number (\a >=0)
-    \param nm number of different \a m (\a 0<=nm<=lmax+1)
-    \param stride the stride between entries with identical \a m, and \a l
-      differing by 1.
-    \param mval array with \a nm entries containing the individual m values
-    \param mvstart array with \a nm entries containing the (hypothetical)
-      indices of the coefficients with the quantum numbers 0,\a mval[i]
-    \param flags a combination of sharp_almflags (pass 0 unless you know you need this)
-    \param alm_info will hold a pointer to the newly created data structure
- */
-void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
-  const ptrdiff_t *mvstart, int flags, sharp_alm_info **alm_info);
-/*! Returns the index of the coefficient with quantum numbers \a l,
-    \a mval[mi].
-    \note for a \a sharp_alm_info generated by sharp_make_alm_info() this is
-    the index for the coefficient with the quantum numbers \a l, \a mi. */
-ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
-/*! Returns the number of alm coefficients described by \a self. If the SHARP_PACKED
-    flag is set, this is the number of "real" coefficients (for m < 0 and m >= 0),
-    otherwise it is the number of complex coefficients (with m>=0). */
-ptrdiff_t sharp_alm_count(const sharp_alm_info *self);
-/*! Deallocates the a_lm info object. */
-void sharp_destroy_alm_info (sharp_alm_info *info);
-
-/*! \} */
-
-/*! \defgroup geominfogroup Functions for dealing with geometry information */
-/*! \{ */
-
-/*! Creates a geometry information object from a set of ring descriptions.
-    All arrays passed to this function must have \a nrings elements.
-    \param nrings the number of rings in the map
-    \param nph the number of pixels in each ring
-    \param ofs the index of the first pixel in each ring in the map array
-    \param stride the stride between consecutive pixels
-    \param phi0 the azimuth (in radians) of the first pixel in each ring
-    \param theta the colatitude (in radians) of each ring
-    \param wgt the pixel weight to be used for the ring in map2alm
-      and adjoint map2alm transforms.
-      Pass NULL to use 1.0 as weight for all rings.
-    \param geom_info will hold a pointer to the newly created data structure
- */
-void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
-  const int *stride, const double *phi0, const double *theta,
-  const double *wgt, sharp_geom_info **geom_info);
-
-/*! Counts the number of grid points needed for (the local part of) a map described
-    by \a info.
- */
-ptrdiff_t sharp_map_size(const sharp_geom_info *info);
-
-/*! Deallocates the geometry information in \a info. */
-void sharp_destroy_geom_info (sharp_geom_info *info);
-
-/*! \} */
-
-/*! \defgroup lowlevelgroup Low-level libsharp SHT interface */
-/*! \{ */
-
-/*! Enumeration of SHARP job types. */
-typedef enum { SHARP_YtW=0,               /*!< analysis */
-               SHARP_MAP2ALM=SHARP_YtW,   /*!< analysis */
-               SHARP_Y=1,                 /*!< synthesis */
-               SHARP_ALM2MAP=SHARP_Y,     /*!< synthesis */
-               SHARP_Yt=2,                /*!< adjoint synthesis */
-               SHARP_WY=3,                /*!< adjoint analysis */
-               SHARP_ALM2MAP_DERIV1=4     /*!< synthesis of first derivatives */
-             } sharp_jobtype;
-
-/*! Job flags */
-typedef enum { SHARP_DP              = 1<<4,
-               /*!< map and a_lm are in double precision */
-               SHARP_ADD             = 1<<5,
-               /*!< results are added to the output arrays, instead of
-                    overwriting them */
-
-               /* NOTE: SHARP_REAL_HARMONICS, 1<<6, is also available in sharp_jobflags,
-                  but its use here is deprecated in favor of having it in the sharp_alm_info */
-
-               SHARP_NO_FFT          = 1<<7,
-
-               SHARP_USE_WEIGHTS     = 1<<20,    /* internal use only */
-               SHARP_NO_OPENMP       = 1<<21,    /* internal use only */
-               SHARP_NVMAX           = (1<<4)-1 /* internal use only */
-             } sharp_jobflags;
-
-/*! Performs a libsharp SHT job. The interface deliberately does not use
-  the C99 "complex" data type, in order to be callable from C89 and C++.
-  \param type the type of SHT
-  \param spin the spin of the quantities to be transformed
-  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
-    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
-    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
-    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
-    depends on whether the SHARP_DP flag is set.
-  \param map contains pointers to the maps. If \a spin==0,
-    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
-    point to the maps of the first SHT, map[2] and map[3] to those of the
-    second, etc. The exact data type of \a map depends on whether the SHARP_DP
-    flag is set.
-  \param geom_info A \c sharp_geom_info object compatible with the provided
-    \a map arrays.
-  \param alm_info A \c sharp_alm_info object compatible with the provided
-    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
-    exactly once.
-  \param ntrans the number of simultaneous SHTs
-  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
-    \a alm is expected to have the type "complex double **" and \a map is
-    expected to have the type "double **"; otherwise, the expected
-    types are "complex float **" and "float **", respectively.
-  \param time If not NULL, the wall clock time required for this SHT
-    (in seconds) will be written here.
-  \param opcnt If not NULL, a conservative estimate of the total floating point
-    operation count for this SHT will be written here. */
-void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
-  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
-  int flags, double *time, unsigned long long *opcnt);
-
-void sharp_set_chunksize_min(int new_chunksize_min);
-void sharp_set_nchunks_max(int new_nchunks_max);
-
-
-typedef enum { SHARP_ERROR_NO_MPI = 1,
-               /*!< libsharp not compiled with MPI support */
-              } sharp_errors;
-
-/*! Works like sharp_execute_mpi, but is always present whether or not libsharp
-    is compiled with USE_MPI. This is primarily useful for wrapper code etc.
-
-    Note that \a pcomm has the type MPI_Comm*, except we declare void* to avoid
-    pulling in MPI headers. I.e., the comm argument of sharp_execute_mpi
-    is *(MPI_Comm*)pcomm.
-
-    Other parameters are the same as sharp_execute_mpi.
-
-    Returns 0 if successful, or SHARP_ERROR_NO_MPI if MPI is not available
-    (in which case nothing is done).
- */
-int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
-  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
-  unsigned long long *opcnt);
-
-
-
-/*! \} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/libsharp/sharp_mpi.c
+++ b/libsharp/sharp_mpi.c
@ -25,7 +25,7 @@
 /*! \file sharp_mpi.c
 *  Functionality only needed for MPI-parallel transforms
 *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

@ -101,7 +101,7 @@ static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
  DEALLOC(theta_tmp);
  DEALLOC(ispair_tmp);

-  minfo->nph=2*job->nmaps*job->ntrans;
+  minfo->nph=2*job->nmaps;

  minfo->almcount=RALLOC(int,minfo->ntasks);
  minfo->almdisp=RALLOC(int,minfo->ntasks+1);
@ -184,8 +184,8 @@ static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
  {
  ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
    (ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
-  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size);
-  job->s_m=2*job->ntrans*job->nmaps;
+  job->phase=RALLOC(dcmplx,2*job->nmaps*phase_size);
+  job->s_m=2*job->nmaps;
  job->s_th = job->s_m * ((job->type==SHARP_MAP2ALM) ? nmfull : nm);
  }

@ -315,12 +315,12 @@ static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)

 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt)
  {
  sharp_job job;
  sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
-    ntrans, flags);
+    flags);

  sharp_execute_job_mpi (&job, comm);
  if (time!=NULL) *time = job.time;
@ -331,15 +331,15 @@ void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
   without declaring it in C header as it should not be available to C code */
 void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt); /* prototype kept in this .c file because the function is deliberately absent from the C header */
 void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt)
  {
  sharp_execute_mpi(MPI_Comm_f2c(comm), type, spin, alm, map, geom_info, /* MPI_Comm_f2c turns the Fortran communicator handle into a C MPI_Comm; all other arguments are forwarded unchanged */
-                    alm_info, ntrans, flags, time, opcnt);
+                    alm_info, flags, time, opcnt);
  }

 #endif
--- a/libsharp/sharp_mpi.h
+++ b/libsharp/sharp_mpi.h
@ -25,7 +25,7 @@
 /*! \file sharp_mpi.h
 *  Interface for the spherical transform library with MPI support.
 *
- *  Copyright (C) 2011,2012 Max-Planck-Society
+ *  Copyright (C) 2011-2019 Max-Planck-Society
 *  \author Martin Reinecke \author Dag Sverre Seljebotn
 */

@ -33,28 +33,25 @@
 #define PLANCK_SHARP_MPI_H

 #include <mpi.h>
-#include "sharp_lowlevel.h"
+#include "sharp.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*! Performs an MPI parallel libsharp SHT job. The interface deliberately does
-  not use the C99 "complex" data type, in order to be callable from C.
+  not use the C99 "complex" data type, in order to be callable from C89 and C++.
  \param comm the MPI communicator to be used for this SHT
  \param type the type of SHT
  \param spin the spin of the quantities to be transformed
  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
-    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
-    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
-    alm[2] and alm[3] to those of the second, etc. The exact data type of \a alm
+    alm[0] points to the a_lm of the SHT. If \a spin>0, alm[0] and alm[1]
+    point to the two a_lm sets of the SHT. The exact data type of \a alm
    depends on whether the SHARP_DP flag is set.
  \param map contains pointers to the maps. If \a spin==0,
-    map[0] points to the map of the first SHT, map[1] to that of the second
-    etc. If \a spin>0, or \a type is SHARP_ALM2MAP_DERIV1, map[0] and map[1]
-    point to the maps of the first SHT, map[2] and map[3] to those of the
-    second, etc. The exact data type of \a map depends on whether the SHARP_DP
-    flag is set.
+    map[0] points to the map of the SHT. If \a spin>0, or \a type is
+    SHARP_ALM2MAP_DERIV1, map[0] and map[1] point to the two maps of the SHT.
+    The exact data type of \a map depends on whether the SHARP_DP flag is set.
  \param geom_info A \c sharp_geom_info object compatible with the provided
    \a map arrays. The total map geometry is the union of all \a geom_info
    objects over the participating MPI tasks.
@ -62,7 +59,6 @@ extern "C" {
    \a alm arrays. All \c m values from 0 to some \c mmax<=lmax must be present
    exactly once in the union of all \a alm_info objects over the participating
    MPI tasks.
-  \param ntrans the number of simultaneous SHTs
  \param flags See sharp_jobflags. In particular, if SHARP_DP is set, then
    \a alm is expected to have the type "complex double **" and \a map is
    expected to have the type "double **"; otherwise, the expected
@ -73,7 +69,7 @@ extern "C" {
    operation count for this SHT will be written here. */
 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
  void *alm, void *map, const sharp_geom_info *geom_info,
-  const sharp_alm_info *alm_info, int ntrans, int flags, double *time,
+  const sharp_alm_info *alm_info, int flags, double *time,
  unsigned long long *opcnt);

 #ifdef __cplusplus
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@ -23,8 +23,8 @@
 */

 /*  \file sharp_testsuite.c
- * 
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *
+ *  Copyright (C) 2012-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -42,17 +42,68 @@
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
 #include "c_utils.h"
-#include "sharp_announce.h"
 #include "memusage.h"
-#include "sharp_vecsupport.h"
+
+static void OpenMP_status(void) /* Report on stdout whether OpenMP support is compiled in and, if so, the maximum thread count. */
+  {
+#ifndef _OPENMP
+  printf("OpenMP: not supported by this binary\n");
+#else
+  int threads = omp_get_max_threads(); /* maximum thread count as reported by the OpenMP runtime */
+  if (threads>1)
+    printf("OpenMP active: max. %d threads.\n",threads);
+  else
+    printf("OpenMP active, but running with 1 thread only.\n");
+#endif
+  }
+
+static void MPI_status(void) /* Report on stdout whether MPI support is compiled in (USE_MPI) and, if so, the number of tasks. */
+  {
+#ifndef USE_MPI
+  printf("MPI: not supported by this binary\n");
+#else
+  int tasks;
+  MPI_Comm_size(MPI_COMM_WORLD,&tasks); /* size of the world communicator; NOTE(review): presumably only called after MPI_Init -- confirm in the caller */
+  if (tasks>1)
+    printf("MPI active with %d tasks.\n",tasks);
+  else
+    printf("MPI active, but running with 1 task only.\n");
+#endif
+  }
+
+static void sharp_announce (const char *name) /* Print a framed banner containing name, followed by architecture, vector length, OpenMP and MPI status. */
+  {
+  size_t m, nlen=strlen(name);
+  printf("\n+-"); /* top border: one dash per character of name, so the frame is as wide as the name line */
+  for (m=0; m<nlen; ++m) printf("-");
+  printf("-+\n");
+  printf("| %s |\n", name);
+  printf("+-"); /* bottom border, same width as the top one */
+  for (m=0; m<nlen; ++m) printf("-");
+  printf("-+\n\n");
+  printf("Detected hardware architecture: %s\n", sharp_architecture());
+  printf("Supported vector length: %d\n", sharp_veclen());
+  OpenMP_status();
+  MPI_status();
+  printf("\n");
+  }
+
+static void sharp_module_startup (const char *name, int argc, int argc_expected,
+  const char *argv_expected, int verbose) /* Print the banner if verbose, then terminate with exit status 1 unless argc equals argc_expected. */
+  {
+  if (verbose) sharp_announce (name);
+  if (argc==argc_expected) return; /* argument count is as expected: nothing more to do */
+  if (verbose) fprintf(stderr, "Usage: %s %s\n", name, argv_expected); /* the usage line goes to stderr and is suppressed when not verbose */
+  exit(1);
+  }

 typedef complex double dcmplx;

 int ntasks, mytask;

-static double drand (double min, double max, int *state)
+static double drand (double min, double max, unsigned *state) /* Advance *state by one linear congruential step and return it scaled into the interval from min (inclusive) towards max. */
  {
-  *state = (((*state) * 1103515245) + 12345) & 0x7fffffff;
+  *state = (((*state) * 1103515245u) + 12345u) & 0x7fffffffu; /* unsigned arithmetic wraps around instead of overflowing a signed int; the mask keeps the low 31 bits */
  return min + (max-min)*(*state)/(0x7fffffff+1.0); /* the divisor is 2^31, so the scaled state is a fraction in [0,1) */
  }

@ -65,7 +116,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
  for (mi=0;mi<helper->nm; ++mi)
    {
    int m=helper->mval[mi];
-    int state=1234567*cnt+8912*m; // random seed
+    unsigned state=1234567u*(unsigned)cnt+8912u*(unsigned)m; // random seed
    for (int l=m;l<=helper->lmax; ++l)
      {
      if ((l<spin)&&(m<spin))
@ -242,13 +293,14 @@ static int good_fft_size(int n)
  }

 static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
-  int *gpar2, sharp_geom_info **ginfo, sharp_alm_info **ainfo)
+  int *gpar2, sharp_geom_info **ginfo, sharp_alm_info **ainfo, int verbose)
  {
  UTIL_ASSERT(lmax>=0,"lmax must not be negative");
  if (*mmax<0) *mmax=lmax;
  UTIL_ASSERT(*mmax<=lmax,"mmax larger than lmax");

-  if (mytask==0) printf ("lmax: %d, mmax: %d\n",lmax,*mmax);
+  verbose &= (mytask==0);
+  if (verbose) printf ("lmax: %d, mmax: %d\n",lmax,*mmax);

  sharp_make_triangular_alm_info(lmax,*mmax,1,ainfo);
 #ifdef USE_MPI
@ -260,14 +312,14 @@ static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
    if (*gpar1<1) *gpar1=lmax/2;
    if (*gpar1==0) ++(*gpar1);
    sharp_make_healpix_geom_info (*gpar1, 1, ginfo);
-    if (mytask==0) printf ("HEALPix grid, nside=%d\n",*gpar1);
+    if (verbose) printf ("HEALPix grid, nside=%d\n",*gpar1);
    }
  else if (strcmp(gname,"gauss")==0)
    {
    if (*gpar1<1) *gpar1=lmax+1;
    if (*gpar2<1) *gpar2=2*(*mmax)+1;
    sharp_make_gauss_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0)
+    if (verbose)
      printf ("Gauss-Legendre grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
    }
  else if (strcmp(gname,"fejer1")==0)
@ -275,21 +327,21 @@ static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
    if (*gpar1<1) *gpar1=2*lmax+1;
    if (*gpar2<1) *gpar2=2*(*mmax)+1;
    sharp_make_fejer1_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0) printf ("Fejer1 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    if (verbose) printf ("Fejer1 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
    }
  else if (strcmp(gname,"fejer2")==0)
    {
    if (*gpar1<1) *gpar1=2*lmax+1;
    if (*gpar2<1) *gpar2=2*(*mmax)+1;
    sharp_make_fejer2_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0) printf ("Fejer2 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
+    if (verbose) printf ("Fejer2 grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
    }
  else if (strcmp(gname,"cc")==0)
    {
    if (*gpar1<1) *gpar1=2*lmax+1;
    if (*gpar2<1) *gpar2=2*(*mmax)+1;
    sharp_make_cc_geom_info (*gpar1, *gpar2, 0., 1, *gpar2, ginfo);
-    if (mytask==0)
+    if (verbose)
      printf("Clenshaw-Curtis grid, nlat=%d, nlon=%d\n",*gpar1,*gpar2);
    }
  else if (strcmp(gname,"smallgauss")==0)
@ -319,7 +371,7 @@ static void get_infos (const char *gname, int lmax, int *mmax, int *gpar1,
        ofs+=pring;
        }
      }
-    if (mytask==0)
+    if (verbose)
      {
      ptrdiff_t npix=get_npix(*ginfo);
      printf("Small Gauss grid, nlat=%d, npix=%ld, savings=%.2f%%\n",
@ -358,97 +410,83 @@ static void check_sign_scale(void)
  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);

-  for (int ntrans=1; ntrans<10; ++ntrans)
-    {
-    double **map;
-    ALLOC2D(map,double,2*ntrans,npix);
+  double **map;
+  ALLOC2D(map,double,2,npix);

-    dcmplx **alm;
-    ALLOC2D(alm,dcmplx,2*ntrans,nalms);
-    for (int i=0; i<2*ntrans; ++i)
-      for (int j=0; j<nalms; ++j)
-        alm[i][j]=1.+_Complex_I;
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,2,nalms);
+  for (int i=0; i<2; ++i)
+    for (int j=0; j<nalms; ++j)
+      alm[i][j]=1.+_Complex_I;

-    sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
-      NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[it][0     ], 3.588246976618616912e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[it][npix/2], 4.042209792157496651e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[it][npix-1],-1.234675107554816442e+01,1e-12),
-        "error");
-      }
-    sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
-      NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ], 2.750897760535633285e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2], 3.137704477368562905e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-8.405730859837063917e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-2.398026536095463346e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-4.961140548331700728e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.412765834230440021e+01,1e-12),
-        "error");
-      }
+  sharp_execute(SHARP_ALM2MAP,0,&alm[0],&map[0],tinfo,alms,SHARP_DP,
+    NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ], 3.588246976618616912e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2], 4.042209792157496651e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),
+    "error");

-    sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,ntrans,SHARP_DP,
-      NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-1.398186224727334448e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.456676000884031197e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.516249174408820863e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-3.173406200299964119e+00,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-5.831327404513146462e+01,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1],-1.863257892248353897e+01,1e-12),
-        "error");
-      }
+  sharp_execute(SHARP_ALM2MAP,1,&alm[0],&map[0],tinfo,alms,SHARP_DP,
+    NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2], 3.137704477368562905e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-8.405730859837063917e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-2.398026536095463346e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-4.961140548331700728e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.412765834230440021e+01,1e-12),
+    "error");

-    sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,ntrans,
-      SHARP_DP,NULL,NULL);
-    for (int it=0; it<ntrans; ++it)
-      {
-      UTIL_ASSERT(FAPPROX(map[2*it  ][0     ],-6.859393905369091105e-01,1e-11),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix/2],-2.103947835973212364e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it  ][npix-1],-1.092463246472086439e+03,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][0     ],-1.411433220713928165e+02,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix/2],-1.146122859381925082e+03,1e-12),
-        "error");
-      UTIL_ASSERT(FAPPROX(map[2*it+1][npix-1], 7.821618677689795049e+02,1e-12),
-        "error");
-      }
+  sharp_execute(SHARP_ALM2MAP,2,&alm[0],&map[0],tinfo,alms,SHARP_DP,
+    NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ],-1.398186224727334448e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.456676000884031197e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.516249174408820863e+02,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-3.173406200299964119e+00,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-5.831327404513146462e+01,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),
+    "error");

-    DEALLOC2D(map);
-    DEALLOC2D(alm);
-    }
+  sharp_execute(SHARP_ALM2MAP_DERIV1,1,&alm[0],&map[0],tinfo,alms,
+    SHARP_DP,NULL,NULL);
+  UTIL_ASSERT(FAPPROX(map[0][0     ],-6.859393905369091105e-01,1e-11),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.103947835973212364e+02,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.092463246472086439e+03,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-1.411433220713928165e+02,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-1.146122859381925082e+03,1e-12),
+    "error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1], 7.821618677689795049e+02,1e-12),
+    "error");
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);

  sharp_destroy_alm_info(alms);
  sharp_destroy_geom_info(tinfo);
  }

 static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int ntrans, int nv, double **err_abs, double **err_rel,
+  int spin, double **err_abs, double **err_rel,
  double *t_a2m, double *t_m2a, unsigned long long *op_a2m,
  unsigned long long *op_m2a)
  {
  ptrdiff_t nalms = get_nalms(ainfo);
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;

  size_t npix = get_npix(ginfo);
  double **map;
@ -463,20 +501,20 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,

 #ifdef USE_MPI
  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,
-    ainfo,ntrans, SHARP_DP|SHARP_ADD|nv,t_a2m,op_a2m);
+    ainfo, SHARP_DP|SHARP_ADD,t_a2m,op_a2m);
 #else
-  sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,ntrans,
-    SHARP_DP|nv,t_a2m,op_a2m);
+  sharp_execute(SHARP_ALM2MAP,spin,&alm[0],&map[0],ginfo,ainfo,
+    SHARP_DP,t_a2m,op_a2m);
 #endif
  if (t_a2m!=NULL) *t_a2m=maxTime(*t_a2m);
  if (op_a2m!=NULL) *op_a2m=totalops(*op_a2m);
  double *sqsum=get_sqsum_and_invert(alm,nalms,ncomp);
 #ifdef USE_MPI
  sharp_execute_mpi(MPI_COMM_WORLD,SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,
-    ainfo,ntrans,SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+    ainfo,SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
 #else
-  sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,ntrans,
-    SHARP_DP|SHARP_ADD|nv,t_m2a,op_m2a);
+  sharp_execute(SHARP_MAP2ALM,spin,&alm[0],&map[0],ginfo,ainfo,
+    SHARP_DP|SHARP_ADD,t_m2a,op_m2a);
 #endif
  if (t_m2a!=NULL) *t_m2a=maxTime(*t_m2a);
  if (op_m2a!=NULL) *op_m2a=totalops(*op_m2a);
@ -488,11 +526,11 @@ static void do_sht (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
  }

 static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
-  int spin, int ntrans, int nv)
+  int spin)
  {
-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;
  double *err_abs, *err_rel;
-  do_sht (ginfo, ainfo, spin, ntrans, nv, &err_abs, &err_rel, NULL, NULL,
+  do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, NULL, NULL,
    NULL, NULL);
  for (int i=0; i<ncomp; ++i)
    UTIL_ASSERT((err_rel[i]<1e-10) && (err_abs[i]<1e-10),"error");
@ -500,6 +538,16 @@ static void check_accuracy (sharp_geom_info *ginfo, sharp_alm_info *ainfo,
  DEALLOC(err_abs);
  }

+/* Run one accuracy check on a "gauss" grid: build the geometry and a_lm
+ * descriptors for the given band limits and grid size (non-verbose), call
+ * check_accuracy() for the requested spin, then release both descriptors. */
+static void run(int lmax, int mmax, int nlat, int nlon, int spin)
+  {
+  sharp_geom_info *geom;
+  sharp_alm_info *alms;
+
+  /* mmax, nlat and nlon are passed by address; get_infos may adjust them */
+  get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &geom, &alms, 0);
+
+  check_accuracy(geom,alms,spin);
+
+  sharp_destroy_alm_info(alms);
+  sharp_destroy_geom_info(geom);
+  }
+
 static void sharp_acctest(void)
  {
  if (mytask==0) sharp_module_startup("sharp_acctest",1,1,"",1);
@ -510,43 +558,36 @@ static void sharp_acctest(void)

  if (mytask==0) printf("Testing map analysis accuracy.\n");

-  sharp_geom_info *ginfo;
-  sharp_alm_info *ainfo;
-  int lmax=127, mmax=127, nlat=128, nlon=256;
-  get_infos ("gauss", lmax, &mmax, &nlat, &nlon, &ginfo, &ainfo);
-  for (int nv=1; nv<=6; ++nv)
-    for (int ntrans=1; ntrans<=6; ++ntrans)
-      {
-      check_accuracy(ginfo,ainfo,0,ntrans,nv);
-      check_accuracy(ginfo,ainfo,1,ntrans,nv);
-      check_accuracy(ginfo,ainfo,2,ntrans,nv);
-      check_accuracy(ginfo,ainfo,3,ntrans,nv);
-      check_accuracy(ginfo,ainfo,30,ntrans,nv);
-      }
-  sharp_destroy_alm_info(ainfo);
-  sharp_destroy_geom_info(ginfo);
+  run(127, 127, 128, 256, 0);
+  run(127, 127, 128, 256, 1);
+  run(127, 127, 128, 256, 2);
+  run(127, 127, 128, 256, 3);
+  run(127, 127, 128, 256, 30);
+  run(5, 0, 6, 1, 0);
+  run(5, 0, 7, 2, 0);
+  run(8, 8, 9, 17, 0);
+  run(8, 8, 9, 17, 2);
  if (mytask==0) printf("Passed.\n\n");
  }

 static void sharp_test (int argc, const char **argv)
  {
  if (mytask==0) sharp_announce("sharp_test");
-  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
+  UTIL_ASSERT(argc>=8,"usage: grid lmax mmax geom1 geom2 spin");
  int lmax=atoi(argv[3]);
  int mmax=atoi(argv[4]);
  int gpar1=atoi(argv[5]);
  int gpar2=atoi(argv[6]);
  int spin=atoi(argv[7]);
-  int ntrans=atoi(argv[8]);

  if (mytask==0) printf("Testing map analysis accuracy.\n");
-  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
+  if (mytask==0) printf("spin=%d\n", spin);

  sharp_geom_info *ginfo;
  sharp_alm_info *ainfo;
-  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
+  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo, 1);

-  int ncomp = ntrans*((spin==0) ? 1 : 2);
+  int ncomp = (spin==0) ? 1 : 2;
  double t_a2m=1e30, t_m2a=1e30;
  unsigned long long op_a2m, op_m2a;
  double *err_abs,*err_rel;
@ -557,7 +598,7 @@ static void sharp_test (int argc, const char **argv)
    {
    ++nrpt;
    double ta2m2, tm2a2;
-    do_sht (ginfo, ainfo, spin, ntrans, 0, &err_abs, &err_rel, &ta2m2, &tm2a2,
+    do_sht (ginfo, ainfo, spin, &err_abs, &err_rel, &ta2m2, &tm2a2,
      &op_a2m, &op_m2a);
    if (ta2m2<t_a2m) t_a2m=ta2m2;
    if (tm2a2<t_m2a) t_m2a=tm2a2;
@ -607,79 +648,16 @@ static void sharp_test (int argc, const char **argv)
    }

  if (mytask==0)
-    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %2d %.2e %7.2f %.2e %7.2f"
+    printf("%-12s %-10s %2d %d %2d %3d %6d %6d %6d %6d %.2e %7.2f %.2e %7.2f"
           " %9.2f %6.2f %.2e %.2e\n",
-      getenv("HOST"),argv[2],spin,VLEN,nomp,ntasks,lmax,mmax,gpar1,gpar2,
-      ntrans,t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
+      getenv("HOST"),argv[2],spin,sharp_veclen(),nomp,ntasks,lmax,mmax,gpar1,gpar2,
+      t_a2m,1e-9*op_a2m/t_a2m,t_m2a,1e-9*op_m2a/t_m2a,tmem/(1<<20),
      100.*(1.-iosize/tmem),maxerel,maxeabs);

  DEALLOC(err_abs);
  DEALLOC(err_rel);
  }

-static void sharp_bench (int argc, const char **argv)
-  {
-  if (mytask==0) sharp_announce("sharp_bench");
-  UTIL_ASSERT(argc>=9,"usage: grid lmax mmax geom1 geom2 spin ntrans");
-  int lmax=atoi(argv[3]);
-  int mmax=atoi(argv[4]);
-  int gpar1=atoi(argv[5]);
-  int gpar2=atoi(argv[6]);
-  int spin=atoi(argv[7]);
-  int ntrans=atoi(argv[8]);
-
-  if (mytask==0) printf("Testing map analysis accuracy.\n");
-  if (mytask==0) printf("spin=%d, ntrans=%d\n", spin, ntrans);
-
-  sharp_geom_info *ginfo;
-  sharp_alm_info *ainfo;
-  get_infos (argv[2], lmax, &mmax, &gpar1, &gpar2, &ginfo, &ainfo);
-
-  double ta2m_auto=1e30, tm2a_auto=1e30, ta2m_min=1e30, tm2a_min=1e30;
-  unsigned long long opa2m_min=0, opm2a_min=0;
-  int nvmin_a2m=-1, nvmin_m2a=-1;
-  for (int nv=0; nv<=6; ++nv)
-    {
-    int ntries=0;
-    double tacc=0;
-    do
-      {
-      double t_a2m, t_m2a;
-      unsigned long long op_a2m, op_m2a;
-      double *err_abs,*err_rel;
-      do_sht (ginfo, ainfo, spin, ntrans, nv, &err_abs, &err_rel,
-        &t_a2m, &t_m2a, &op_a2m, &op_m2a);
-
-      DEALLOC(err_abs);
-      DEALLOC(err_rel);
-      tacc+=t_a2m+t_m2a;
-      ++ntries;
-      if (nv==0)
-        {
-        if (t_a2m<ta2m_auto) ta2m_auto=t_a2m;
-        if (t_m2a<tm2a_auto) tm2a_auto=t_m2a;
-        }
-      else
-        {
-        if (t_a2m<ta2m_min) { nvmin_a2m=nv; ta2m_min=t_a2m; opa2m_min=op_a2m; }
-        if (t_m2a<tm2a_min) { nvmin_m2a=nv; tm2a_min=t_m2a; opm2a_min=op_m2a; }
-        }
-      } while((ntries<2)||(tacc<3.));
-    }
-  if (mytask==0)
-    {
-    printf("a2m: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
-      nvmin_a2m,ta2m_min,100.*(ta2m_auto-ta2m_min)/ta2m_auto,
-      1e-9*opa2m_min/ta2m_min);
-    printf("m2a: nvmin=%d tmin=%fs speedup=%.2f%% perf=%.2fGFlops/s\n",
-      nvmin_m2a,tm2a_min,100.*(tm2a_auto-tm2a_min)/tm2a_auto,
-      1e-9*opm2a_min/tm2a_min);
-    }
-
-  sharp_destroy_alm_info(ainfo);
-  sharp_destroy_geom_info(ginfo);
-  }
-
 int main(int argc, const char **argv)
  {
 #ifdef USE_MPI
@ -696,8 +674,6 @@ int main(int argc, const char **argv)
    sharp_acctest();
  else if (strcmp(argv[1],"test")==0)
    sharp_test(argc,argv);
-  else if (strcmp(argv[1],"bench")==0)
-    sharp_bench(argc,argv);
  else
    UTIL_FAIL("unknown command");

--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
 *  Convenience functions for vector arithmetics
 *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -33,38 +33,36 @@
 #define SHARP_VECSUPPORT_H

 #include <math.h>
-#include "sharp_vecutil.h"
+
+/* Choose the vector length VLEN from the instruction-set macros predefined
+ * by the compiler, unless VLEN has already been defined by the includer or
+ * the build. The widest available extension wins; 1 selects plain scalar
+ * code. */
+#ifndef VLEN
+
+#if (defined(__AVX512F__))
+#define VLEN 8
+#elif (defined (__AVX__))
+#define VLEN 4
+#elif (defined (__SSE2__))
+#define VLEN 2
+#else
+#define VLEN 1
+#endif
+
+#endif

 typedef double Ts;

 #if (VLEN==1)

 typedef double Tv;
-typedef float Tv_s;
 typedef int Tm;

-#define vadd(a,b) ((a)+(b))
-#define vadd_s(a,b) ((a)+(b))
-#define vaddeq(a,b) ((a)+=(b))
-#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
-#define vsub(a,b) ((a)-(b))
-#define vsub_s(a,b) ((a)-(b))
-#define vsubeq(a,b) ((a)-=(b))
-#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
-#define vmul(a,b) ((a)*(b))
-#define vmul_s(a,b) ((a)*(b))
-#define vmuleq(a,b) ((a)*=(b))
-#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
-#define vfmaeq(a,b,c) ((a)+=(b)*(c))
-#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
-#define vfmseq(a,b,c) ((a)-=(b)*(c))
-#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
-#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
-#define vneg(a) (-(a))
 #define vload(a) (a)
-#define vload_s(a) (a)
-#define vloadu(p) (*(p))
-#define vloadu_s(p) (*(p))
+#define vzero 0.
+#define vone 1.
+
+#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
+#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
+#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
+#define vneg(a) (-(a))
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@ -72,16 +70,16 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
-#define vstoreu(p, a) (*(p)=a)
-#define vstoreu_s(p, a) (*(p)=a)
-
+#define vor_mask(a,b) ((a)||(b))
 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
-
 #define vanyTrue(a) (a)
 #define vallTrue(a) (a)
-#define vzero 0.
-#define vone 1.
+
+/* Scalar (VLEN==1) variant: there is nothing to sum horizontally, so a+i*b
+ * is added to cc[0] and c+i*d to cc[1]. */
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  {
+  cc[0] += a+_Complex_I*b;
+  cc[1] += c+_Complex_I*d;
+  }
+

 #endif

@ -97,7 +95,6 @@ static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
 #endif

 typedef __m128d Tv;
-typedef __m128 Tv_s;
 typedef __m128d Tm;

 #if defined(__SSE4_1__)
@ -106,110 +103,84 @@ typedef __m128d Tm;
 static inline Tv vblend__(Tv m, Tv a, Tv b)
  { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
 #endif
-#define vzero _mm_setzero_pd()
-#define vone _mm_set1_pd(1.)
-
-#define vadd(a,b) _mm_add_pd(a,b)
-#define vadd_s(a,b) _mm_add_ps(a,b)
-#define vaddeq(a,b) a=_mm_add_pd(a,b)
-#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
-#define vsub(a,b) _mm_sub_pd(a,b)
-#define vsub_s(a,b) _mm_sub_ps(a,b)
-#define vsubeq(a,b) a=_mm_sub_pd(a,b)
-#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
-#define vmul(a,b) _mm_mul_pd(a,b)
-#define vmul_s(a,b) _mm_mul_ps(a,b)
-#define vmuleq(a,b) a=_mm_mul_pd(a,b)
-#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
-#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
-#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
-#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
-#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
 #define vload(a) _mm_set1_pd(a)
-#define vload_s(a) _mm_set1_ps(a)
-#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
+#define vzero _mm_setzero_pd()
+#define vone vload(1.)
+
+#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
+#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
+#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
+#define vneg(a) _mm_xor_pd(vload(-0.),a)
+#define vabs(a) _mm_andnot_pd(vload(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
 #define vgt(a,b) _mm_cmpgt_pd(a,b)
 #define vge(a,b) _mm_cmpge_pd(a,b)
 #define vne(a,b) _mm_cmpneq_pd(a,b)
 #define vand_mask(a,b) _mm_and_pd(a,b)
+#define vor_mask(a,b) _mm_or_pd(a,b)
 #define vmin(a,b) _mm_min_pd(a,b)
 #define vmax(a,b) _mm_max_pd(a,b);
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm_movemask_pd(a)==3)
-#define vloadu(p) _mm_loadu_pd(p)
-#define vloadu_s(p) _mm_loadu_ps(p)
-#define vstoreu(p, v) _mm_storeu_pd(p, v)
-#define vstoreu_s(p, v) _mm_storeu_ps(p, v)
+
+/* Two-lane (__m128d) variant: adds sum(a)+i*sum(b) to cc[0] and
+ * sum(c)+i*sum(d) to cc[1], where sum() adds up both lanes of a vector. */
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c,
+  Tv d, _Complex double * restrict cc)
+  {
+  /* the union lets a {re,im} vector be read back as one complex number */
+  union {Tv v; _Complex double c; } u1, u2;
+#if defined(__SSE3__)
+  /* horizontal add delivers {a0+a1, b0+b1} directly */
+  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
+#else
+  /* SSE2 fallback: the shuffles produce {a1,b0} and {a0,b1}, whose sum is
+   * again {a0+a1, b0+b1} */
+  u1.v = _mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0));
+  u2.v = _mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)) +
+         _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0));
+#endif
+  cc[0]+=u1.c; cc[1]+=u2.c;
+  }

 #endif

 #if (VLEN==4)

 #include <immintrin.h>
-#if (USE_FMA4)
-#include <x86intrin.h>
-#endif

 typedef __m256d Tv;
-typedef __m256 Tv_s;
 typedef __m256d Tm;

 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
-#define vzero _mm256_setzero_pd()
-#define vone _mm256_set1_pd(1.)
-
-#define vadd(a,b) _mm256_add_pd(a,b)
-#define vadd_s(a,b) _mm256_add_ps(a,b)
-#define vaddeq(a,b) a=_mm256_add_pd(a,b)
-#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
-#define vsub(a,b) _mm256_sub_pd(a,b)
-#define vsub_s(a,b) _mm256_sub_ps(a,b)
-#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
-#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
-#define vmul(a,b) _mm256_mul_pd(a,b)
-#define vmul_s(a,b) _mm256_mul_ps(a,b)
-#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
-#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#if (USE_FMA4)
-#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
-#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
-#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
-#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
-#else
-#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
-#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
-#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
-#define vfmaaeq(a,b,c,d,e) \
-  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
-#define vfmaseq(a,b,c,d,e) \
-  a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
-#endif
-#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
-#define vload_s(a) _mm256_set1_ps(a)
-#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
+#define vzero _mm256_setzero_pd()
+#define vone vload(1.)
+
+#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
+#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
+#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
+#define vneg(a) _mm256_xor_pd(vload(-0.),a)
+#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
 #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
 #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
 #define vand_mask(a,b) _mm256_and_pd(a,b)
+#define vor_mask(a,b) _mm256_or_pd(a,b)
 #define vmin(a,b) _mm256_min_pd(a,b)
 #define vmax(a,b) _mm256_max_pd(a,b)
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
 #define vallTrue(a) (_mm256_movemask_pd(a)==15)

-#define vloadu(p) _mm256_loadu_pd(p)
-#define vloadu_s(p) _mm256_loadu_ps(p)
-#define vstoreu(p, v) _mm256_storeu_pd(p, v)
-#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)
+/* Four-lane (__m256d) variant: adds sum(a)+i*sum(b) to cc[0] and
+ * sum(c)+i*sum(d) to cc[1], where sum() adds up all lanes of a vector. */
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  {
+  /* pairwise sums within each 128-bit half:
+   * tmp1={a0+a1,b0+b1,a2+a3,b2+b3}, tmp2 likewise for c and d */
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
+  /* selector 49 (0x31) gathers the upper halves of tmp1 and tmp2,
+   * selector 32 (0x20) gathers their lower halves */
+  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
+     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
+  /* tmp1 becomes {sum(a),sum(b),sum(c),sum(d)} */
+  tmp1=tmp3+tmp4;
+  /* read the four doubles back as two {re,im} complex numbers */
+  union {Tv v; _Complex double c[2]; } u;
+  u.v=tmp1;
+  cc[0]+=u.c[0]; cc[1]+=u.c[1];
+  }

 #endif

@ -220,35 +191,33 @@ typedef __m256d Tm;
 typedef __m512d Tv;
 typedef __mmask8 Tm;

-#define vadd(a,b) _mm512_add_pd(a,b)
-#define vaddeq(a,b) a=_mm512_add_pd(a,b)
-#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
-#define vsub(a,b) _mm512_sub_pd(a,b)
-#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
-#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
-#define vmul(a,b) _mm512_mul_pd(a,b)
-#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
-#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
-#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
-#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
-#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
-#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
-#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
 #define vload(a) _mm512_set1_pd(a)
-#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
+#define vzero _mm512_setzero_pd()
+#define vone vload(1.)
+
+#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
+#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
+#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
+#define vneg(a) _mm512_mul_pd(a,vload(-1.))
+#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
 #define vsqrt(a) _mm512_sqrt_pd(a)
-#define vlt(a,b) _mm512_cmplt_pd_mask(a,b)
-#define vgt(a,b) _mm512_cmpnle_pd_mask(a,b)
-#define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
-#define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
+#define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
+#define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
+#define vge(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GE_OQ)
+#define vne(a,b) _mm512_cmp_pd_mask(a,b,_CMP_NEQ_OQ)
 #define vand_mask(a,b) ((a)&(b))
+#define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)
 #define vmax(a,b) _mm512_max_pd(a,b)
 #define vanyTrue(a) (a!=0)
 #define vallTrue(a) (a==255)

-#define vzero _mm512_setzero_pd()
-#define vone _mm512_set1_pd(1.)
+/* Eight-lane (__m512d) variant: each vector is reduced to a scalar with
+ * _mm512_reduce_add_pd; sum(a)+i*sum(b) is added to cc[0] and
+ * sum(c)+i*sum(d) to cc[1]. */
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  _Complex double * restrict cc)
+  {
+  cc[0] += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
+  cc[1] += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
+  }

 #endif

--- a/libsharp/sharp_vecutil.h
+++ b/libsharp/sharp_vecutil.h
@ -1,63 +0,0 @@
-/*
- *  This file is part of libc_utils.
- *
- *  libc_utils is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  libc_utils is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with libc_utils; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
- *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
- *  (DLR).
- */
-
-/*! \file sharp_vecutil.h
- *  Functionality related to vector instruction support
- *
- *  Copyright (C) 2012,2013 Max-Planck-Society
- *  \author Martin Reinecke
- */
-
-#ifndef SHARP_VECUTIL_H
-#define SHARP_VECUTIL_H
-
-#ifndef VLEN
-
-#if (defined (__MIC__))
-#define VLEN 8
-#elif (defined (__AVX__))
-#define VLEN 4
-#elif (defined (__SSE2__))
-#define VLEN 2
-#else
-#define VLEN 1
-#endif
-
-#endif
-
-#if (VLEN==1)
-#define VLEN_s 1
-#else
-#define VLEN_s (2*VLEN)
-#endif
-
-#ifndef USE_FMA4
-#ifdef __FMA4__
-#define USE_FMA4 1
-#else
-#define USE_FMA4 0
-#endif
-#endif
-
-#endif
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@ -25,7 +25,7 @@
 /*
 *  Helper code for efficient calculation of Y_lm(theta,phi=0)
 *
- *  Copyright (C) 2005-2014 Max-Planck-Society
+ *  Copyright (C) 2005-2019 Max-Planck-Society
 *  Author: Martin Reinecke
 */

@ -59,35 +59,44 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
    gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
  for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
    gen->cf[m]=gen->cf[m-1]*sharp_fbig;
+  gen->powlimit=RALLOC(double,m_max+spin+1);
+  gen->powlimit[0]=0.;
+  const double ln2 = 0.6931471805599453094172321214581766;
+  const double expo=-400*ln2;
+  for (int m=1; m<=m_max+spin; ++m)
+    gen->powlimit[m]=exp(expo/m);

  gen->m = -1;
  if (spin==0)
    {
-    gen->rf = RALLOC(sharp_ylmgen_dbl2,gen->lmax+1);
    gen->mfac = RALLOC(double,gen->mmax+1);
    gen->mfac[0] = inv_sqrt4pi;
    for (int m=1; m<=gen->mmax; ++m)
      gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
-    gen->root = RALLOC(double,2*gen->lmax+5);
-    gen->iroot = RALLOC(double,2*gen->lmax+5);
-    for (int m=0; m<2*gen->lmax+5; ++m)
+    gen->root = RALLOC(double,2*gen->lmax+8);
+    gen->iroot = RALLOC(double,2*gen->lmax+8);
+    for (int m=0; m<2*gen->lmax+8; ++m)
      {
      gen->root[m] = sqrt(m);
      gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m];
      }
+    gen->eps=RALLOC(double, gen->lmax+4);
+    gen->alpha=RALLOC(double, gen->lmax/2+2);
+    gen->coef=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+2);
    }
  else
    {
    gen->m=gen->mlo=gen->mhi=-1234567890;
-    ALLOC(gen->fx,sharp_ylmgen_dbl3,gen->lmax+2);
-    for (int m=0; m<gen->lmax+2; ++m)
-      gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
-    ALLOC(gen->inv,double,gen->lmax+1);
+    ALLOC(gen->coef,sharp_ylmgen_dbl2,gen->lmax+3);
+    for (int m=0; m<gen->lmax+3; ++m)
+      gen->coef[m].a=gen->coef[m].b=0.;
+    ALLOC(gen->alpha,double,gen->lmax+3);
+    ALLOC(gen->inv,double,gen->lmax+2);
    gen->inv[0]=0;
-    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
-    ALLOC(gen->flm1,double,2*gen->lmax+1);
-    ALLOC(gen->flm2,double,2*gen->lmax+1);
-    for (int m=0; m<2*gen->lmax+1; ++m)
+    for (int m=1; m<gen->lmax+2; ++m) gen->inv[m]=1./m;
+    ALLOC(gen->flm1,double,2*gen->lmax+3);
+    ALLOC(gen->flm2,double,2*gen->lmax+3);
+    for (int m=0; m<2*gen->lmax+3; ++m)
      {
      gen->flm1[m] = sqrt(1./(m+1.));
      gen->flm2[m] = sqrt(m/(m+1.));
@ -124,16 +133,18 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
 void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
  {
  DEALLOC(gen->cf);
+  DEALLOC(gen->powlimit);
+  DEALLOC(gen->alpha);
+  DEALLOC(gen->coef);
  if (gen->s==0)
    {
-    DEALLOC(gen->rf);
    DEALLOC(gen->mfac);
    DEALLOC(gen->root);
    DEALLOC(gen->iroot);
+    DEALLOC(gen->eps);
    }
  else
    {
-    DEALLOC(gen->fx);
    DEALLOC(gen->prefac);
    DEALLOC(gen->fscale);
    DEALLOC(gen->flm1);
@ -150,13 +161,20 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)

  if (gen->s==0)
    {
-    gen->rf[m].f[0] = gen->root[2*m+3];
-    gen->rf[m].f[1] = 0.;
-    for (int l=m+1; l<=gen->lmax; ++l)
+    gen->eps[m] = 0.;
+    for (int l=m+1; l<gen->lmax+4; ++l)
+      gen->eps[l] = gen->root[l+m]*gen->root[l-m]
+                   *gen->iroot[2*l+1]*gen->iroot[2*l-1];
+    gen->alpha[0] = 1./gen->eps[m+1];
+    gen->alpha[1] = gen->eps[m+1]/(gen->eps[m+2]*gen->eps[m+3]);
+    for (int il=1, l=m+2; l<gen->lmax+1; ++il, l+=2)
+      gen->alpha[il+1]= ((il&1) ? -1 : 1)
+                       /(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]);
+    for (int il=0, l=m; l<gen->lmax+2; ++il, l+=2)
      {
-      double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m];
-      gen->rf[l].f[0] = tmp*gen->root[2*l+1];
-      gen->rf[l].f[1] = tmp*gen->root[l+m]*gen->root[l-m]*gen->iroot[2*l-1];
+      gen->coef[il].a = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il];
+      double t1 = gen->eps[l+2], t2 = gen->eps[l+1];
+      gen->coef[il].b = -gen->coef[il].a*(t1*t1+t2*t2);
      }
    }
  else
@ -169,17 +187,25 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m)

    if (!ms_similar)
      {
-      for (int l=gen->mhi; l<gen->lmax; ++l)
+      gen->alpha[gen->mhi] = 1.;
+      gen->coef[gen->mhi].a = gen->coef[gen->mhi].b = 0.;
+      for (int l=gen->mhi; l<gen->lmax+1; ++l)
        {
        double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m]
                  *gen->flm1[l+gen->s]*gen->flm1[l-gen->s];
        double lt = 2*l+1;
        double l1 = l+1;
-        gen->fx[l+1].f[0]=l1*lt*t;
-        gen->fx[l+1].f[1]=gen->m*gen->s*gen->inv[l]*gen->inv[l+1];
+        double flp10=l1*lt*t;
+        double flp11=gen->m*gen->s*gen->inv[l]*gen->inv[l+1];
        t = gen->flm2[l+gen->m]*gen->flm2[l-gen->m]
           *gen->flm2[l+gen->s]*gen->flm2[l-gen->s];
-        gen->fx[l+1].f[2]=t*l1*gen->inv[l];
+        double flp12=t*l1*gen->inv[l];
+        if (l>gen->mhi)
+          gen->alpha[l+1] = gen->alpha[l-1]*flp12;
+        else
+          gen->alpha[l+1] = 1.;
+        gen->coef[l+1].a = flp10*gen->alpha[l]/gen->alpha[l+1];
+        gen->coef[l+1].b = flp11*gen->coef[l+1].a;
        }
      }

--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@ -25,7 +25,7 @@
 /*! \file sharp_ylmgen_c.h
 *  Code for efficient calculation of Y_lm(phi=0,theta)
 *
- *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  Copyright (C) 2005-2019 Max-Planck-Society
 *  \author Martin Reinecke
 */

@ -41,27 +41,28 @@ static const double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800;
 static const double sharp_ftol=0x1p-60;
 static const double sharp_fbighalf=0x1p+400;

-typedef struct { double f[2]; } sharp_ylmgen_dbl2;
-typedef struct { double f[3]; } sharp_ylmgen_dbl3;
+typedef struct { double a, b; } sharp_ylmgen_dbl2;

 typedef struct
  {
 /* for public use; immutable during lifetime */
  int lmax, mmax, s;
  double *cf;
+  double *powlimit;

 /* for public use; will typically change after call to Ylmgen_prepare() */
  int m;

+  double *alpha;
+  sharp_ylmgen_dbl2 *coef;
+
 /* used if s==0 */
-  double *mfac;
-  sharp_ylmgen_dbl2 *rf;
+  double *mfac, *eps;

 /* used if s!=0 */
  int sinPow, cosPow, preMinus_p, preMinus_m;
  double *prefac;
  int *fscale;
-  sharp_ylmgen_dbl3 *fx;

 /* internal usage only */
 /* used if s==0 */
--- a/pocketfft/pocketfft.c
+++ b/pocketfft/pocketfft.c
--- a/pocketfft/pocketfft.h
+++ b/pocketfft/pocketfft.h
@ -0,0 +1,34 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*! \file pocketfft.h
+ *  Public interface of the pocketfft library
+ *
+ *  Copyright (C) 2008-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef POCKETFFT_H
+#define POCKETFFT_H
+
+#include <stdlib.h>
+
+struct cfft_plan_i;
+typedef struct cfft_plan_i * cfft_plan;
+cfft_plan make_cfft_plan (size_t length);
+void destroy_cfft_plan (cfft_plan plan);
+int cfft_backward(cfft_plan plan, double c[], double fct);
+int cfft_forward(cfft_plan plan, double c[], double fct);
+size_t cfft_length(cfft_plan plan);
+
+struct rfft_plan_i;
+typedef struct rfft_plan_i * rfft_plan;
+rfft_plan make_rfft_plan (size_t length);
+void destroy_rfft_plan (rfft_plan plan);
+int rfft_backward(rfft_plan plan, double c[], double fct);
+int rfft_forward(rfft_plan plan, double c[], double fct);
+size_t rfft_length(rfft_plan plan);
+
+#endif
--- a/python/fake_pyrex/Pyrex/Distutils/init.py
+++ b/python/fake_pyrex/Pyrex/Distutils/init.py
@ -1 +0,0 @@
-# work around broken setuptools monkey patching
--- a/python/fake_pyrex/Pyrex/Distutils/build_ext.py
+++ b/python/fake_pyrex/Pyrex/Distutils/build_ext.py
@ -1 +0,0 @@
-build_ext = "yes, it's there!"
--- a/python/fake_pyrex/Pyrex/init.py
+++ b/python/fake_pyrex/Pyrex/init.py
@ -1 +0,0 @@
-# work around broken setuptools monkey patching
--- a/python/fake_pyrex/README
+++ b/python/fake_pyrex/README
@ -1,2 +0,0 @@
-This directory is here to fool setuptools into building .pyx files
-even if Pyrex is not installed. See ../setup.py.
--- a/python/libsharp/init.py
+++ b/python/libsharp/init.py
@ -1 +0,0 @@
-from .libsharp import *
--- a/python/libsharp/libsharp.pxd
+++ b/python/libsharp/libsharp.pxd
@ -1,92 +0,0 @@
-cdef extern from "sharp.h":
-
-    void sharp_legendre_transform_s(float *bl, float *recfac, ptrdiff_t lmax, float *x,
-                                    float *out, ptrdiff_t nx)
-    void sharp_legendre_transform(double *bl, double *recfac, ptrdiff_t lmax, double *x,
-                                  double *out, ptrdiff_t nx)
-    void sharp_legendre_transform_recfac(double *r, ptrdiff_t lmax)
-    void sharp_legendre_transform_recfac_s(float *r, ptrdiff_t lmax)
-    void sharp_legendre_roots(int n, double *x, double *w)
-
-    # sharp_lowlevel.h
-    ctypedef struct sharp_alm_info:
-      # Maximum \a l index of the array
-      int lmax
-      # Number of different \a m values in this object
-      int nm
-      # Array with \a nm entries containing the individual m values
-      int *mval
-      # Combination of flags from sharp_almflags
-      int flags
-      # Array with \a nm entries containing the (hypothetical) indices of
-      #   the coefficients with quantum numbers 0,\a mval[i]
-      long *mvstart
-      # Stride between a_lm and a_(l+1),m
-      long stride
-
-    ctypedef struct sharp_geom_info:
-        pass
-
-    void sharp_make_alm_info (int lmax, int mmax, int stride,
-                             ptrdiff_t *mvstart, sharp_alm_info **alm_info)
-
-    void sharp_make_geom_info (int nrings, int *nph, ptrdiff_t *ofs,
-                               int *stride, double *phi0, double *theta,
-                               double *wgt, sharp_geom_info **geom_info)
-
-    void sharp_destroy_alm_info(sharp_alm_info *info)
-    void sharp_destroy_geom_info(sharp_geom_info *info)
-
-    ptrdiff_t sharp_map_size(sharp_geom_info *info)
-    ptrdiff_t sharp_alm_count(sharp_alm_info *self)
-
-
-    ctypedef enum sharp_jobtype:
-        SHARP_YtW
-        SHARP_Yt
-        SHARP_WY
-        SHARP_Y
-
-    ctypedef enum:
-        SHARP_DP
-        SHARP_ADD
-
-    void sharp_execute(sharp_jobtype type_,
-                       int spin,
-                       void *alm,
-                       void *map,
-                       sharp_geom_info *geom_info,
-                       sharp_alm_info *alm_info,
-                       int ntrans,
-                       int flags,
-                       double *time,
-                       unsigned long long *opcnt) nogil
-
-    ctypedef enum:
-        SHARP_ERROR_NO_MPI
-
-    int sharp_execute_mpi_maybe (void *pcomm, sharp_jobtype type, int spin,
-        void *alm, void *map, sharp_geom_info *geom_info,
-        sharp_alm_info *alm_info, int ntrans, int flags, double *time,
-        unsigned long long *opcnt) nogil
-
-    void sharp_normalized_associated_legendre_table(int m, int spin, int lmax, int ntheta,
-        double *theta, int theta_stride, int l_stride, int spin_stride, double *out) nogil
-
-
-cdef extern from "sharp_geomhelpers.h":
-    void sharp_make_subset_healpix_geom_info(
-        int nside, int stride, int nrings,
-        int *rings, double *weight, sharp_geom_info **geom_info)
-    void sharp_make_gauss_geom_info(
-        int nrings, int nphi, double phi0,
-        int stride_lon, int stride_lat, sharp_geom_info **geom_info)
-
-cdef extern from "sharp_almhelpers.h":
-    void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
-        sharp_alm_info **alm_info)
-    void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
-        sharp_alm_info **alm_info)
-    void sharp_make_mmajor_real_packed_alm_info (int lmax, int stride,
-        int nm, const int *ms, sharp_alm_info **alm_info)
-
--- a/python/libsharp/libsharp.pyx
+++ b/python/libsharp/libsharp.pyx
@ -1,324 +0,0 @@
-import numpy as np
-cimport numpy as np
-cimport cython
-
-__all__ = ['legendre_transform', 'legendre_roots', 'sht', 'synthesis', 'adjoint_synthesis',
-           'analysis', 'adjoint_analysis', 'healpix_grid', 'triangular_order', 'rectangular_order',
-           'packed_real_order', 'normalized_associated_legendre_table']
-
-
-def legendre_transform(x, bl, out=None):
-    if out is None:
-        out = np.empty_like(x)
-    if out.shape[0] == 0:
-        return out
-    elif x.dtype == np.float64:
-        if bl.dtype != np.float64:
-            bl = bl.astype(np.float64)
-        return _legendre_transform(x, bl, out=out)
-    elif x.dtype == np.float32:
-        if bl.dtype != np.float32:
-            bl = bl.astype(np.float32)
-        return _legendre_transform_s(x, bl, out=out)
-    else:
-        raise ValueError("unsupported dtype")
-
-
-def _legendre_transform(double[::1] x, double[::1] bl, double[::1] out):
-    if out.shape[0] != x.shape[0]:
-        raise ValueError('x and out must have same shape')
-    sharp_legendre_transform(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
-    return np.asarray(out)
-
-
-def _legendre_transform_s(float[::1] x, float[::1] bl, float[::1] out):
-    if out.shape[0] != x.shape[0]:
-        raise ValueError('x and out must have same shape')
-    sharp_legendre_transform_s(&bl[0], NULL, bl.shape[0] - 1, &x[0], &out[0], x.shape[0])
-    return np.asarray(out)
-
-
-def legendre_roots(n):
-    x = np.empty(n, np.double)
-    w = np.empty(n, np.double)
-    cdef double[::1] x_buf = x, w_buf = w
-    if not (x_buf.shape[0] == w_buf.shape[0] == n):
-        raise AssertionError()
-    if n > 0:
-        sharp_legendre_roots(n, &x_buf[0], &w_buf[0])
-    return x, w
-
-
-JOBTYPE_TO_CONST = {
-    'Y': SHARP_Y,
-    'Yt': SHARP_Yt,
-    'WY': SHARP_WY,
-    'YtW': SHARP_YtW
-}
-
-def sht(jobtype, geom_info ginfo, alm_info ainfo, double[:, :, ::1] input,
-        int spin=0, comm=None, add=False):
-    cdef void *comm_ptr
-    cdef int flags = SHARP_DP | (SHARP_ADD if add else 0)
-    cdef int r
-    cdef sharp_jobtype jobtype_i
-    cdef double[:, :, ::1] output_buf
-    cdef int ntrans = input.shape[0]
-    cdef int ntotcomp = ntrans * input.shape[1]
-    cdef int i, j
-
-    if spin == 0 and input.shape[1] != 1:
-        raise ValueError('For spin == 0, we need input.shape[1] == 1')
-    elif spin != 0 and input.shape[1] != 2:
-        raise ValueError('For spin != 0, we need input.shape[1] == 2')
-
-
-    cdef size_t[::1] ptrbuf = np.empty(2 * ntotcomp, dtype=np.uintp)
-    cdef double **alm_ptrs = <double**>&ptrbuf[0]
-    cdef double **map_ptrs = <double**>&ptrbuf[ntotcomp]
-
-    try:
-        jobtype_i = JOBTYPE_TO_CONST[jobtype]
-    except KeyError:
-        raise ValueError('jobtype must be one of: %s' % ', '.join(sorted(JOBTYPE_TO_CONST.keys())))
-
-    if jobtype_i == SHARP_Y or jobtype_i == SHARP_WY:
-        output = np.empty((input.shape[0], input.shape[1], ginfo.local_size()), dtype=np.float64)
-        output_buf = output
-        for i in range(input.shape[0]):
-            for j in range(input.shape[1]):
-                alm_ptrs[i * input.shape[1] + j] = &input[i, j, 0]
-                map_ptrs[i * input.shape[1] + j] = &output_buf[i, j, 0]
-    else:
-        output = np.empty((input.shape[0], input.shape[1], ainfo.local_size()), dtype=np.float64)
-        output_buf = output
-        for i in range(input.shape[0]):
-            for j in range(input.shape[1]):
-                alm_ptrs[i * input.shape[1] + j] = &output_buf[i, j, 0]
-                map_ptrs[i * input.shape[1] + j] = &input[i, j, 0]
-
-    if comm is None:
-        with nogil:
-            sharp_execute (
-                jobtype_i,
-                geom_info=ginfo.ginfo, alm_info=ainfo.ainfo,
-                spin=spin, alm=alm_ptrs, map=map_ptrs,
-                ntrans=ntrans, flags=flags, time=NULL, opcnt=NULL)
-    else:
-        from mpi4py import MPI
-        if not isinstance(comm, MPI.Comm):
-            raise TypeError('comm must be an mpi4py communicator')
-        from .libsharp_mpi import _addressof
-        comm_ptr = <void*><size_t>_addressof(comm)
-        with nogil:
-            r = sharp_execute_mpi_maybe (
-                comm_ptr, jobtype_i,
-                geom_info=ginfo.ginfo, alm_info=ainfo.ainfo,
-                spin=spin, alm=alm_ptrs, map=map_ptrs,
-                ntrans=ntrans, flags=flags, time=NULL, opcnt=NULL)
-        if r == SHARP_ERROR_NO_MPI:
-            raise Exception('MPI requested, but not available')
-
-    return output
-
-
-def synthesis(*args, **kw):
-    return sht('Y', *args, **kw)
-
-def adjoint_synthesis(*args, **kw):
-    return sht('Yt', *args, **kw)
-
-def analysis(*args, **kw):
-    return sht('YtW', *args, **kw)
-
-def adjoint_analysis(*args, **kw):
-    return sht('WY', *args, **kw)
-
-
-#
-# geom_info
-#
-class NotInitializedError(Exception):
-    pass
-
-
-cdef class geom_info:
-    cdef sharp_geom_info *ginfo
-
-    def __cinit__(self, *args, **kw):
-        self.ginfo = NULL
-
-    def local_size(self):
-        if self.ginfo == NULL:
-            raise NotInitializedError()
-        return sharp_map_size(self.ginfo)
-
-    def __dealloc__(self):
-        if self.ginfo != NULL:
-            sharp_destroy_geom_info(self.ginfo)
-        self.ginfo = NULL
-
-
-cdef class healpix_grid(geom_info):
-
-    _weight_cache = {}  # { (nside, 'T'/'Q'/'U') -> numpy array of ring weights cached from file }
-
-    def __init__(self, int nside, stride=1, int[::1] rings=None, double[::1] weights=None):
-        if weights is not None and weights.shape[0] != 2 * nside:
-            raise ValueError('weights must have length 2 * nside')
-        sharp_make_subset_healpix_geom_info(nside, stride,
-                                            nrings=4 * nside - 1 if rings is None else rings.shape[0],
-                                            rings=NULL if rings is None else &rings[0],
-                                            weight=NULL if weights is None else &weights[0],
-                                            geom_info=&self.ginfo)
-
-    @classmethod
-    def load_ring_weights(cls, nside, fields):
-        """
-        Loads HEALPix ring weights from file. The environment variable
-        HEALPIX should be set, and this routine will look in the `data`
-        subdirectory.
-
-        Parameters
-        ----------
-
-        nside: int
-            HEALPix nside parameter
-
-        fields: tuple of str
-            Which weights to extract; pass ('T',) to only get scalar
-            weights back, or ('T', 'Q', 'U') to get all the weights
-
-        Returns
-        -------
-
-        List of NumPy arrays, according to fields parameter.
-
-        """
-        import os
-        from astropy.io import fits
-        data_path = os.path.join(os.environ['HEALPIX'], 'data')
-        fits_field_names = {
-            'T': 'TEMPERATURE WEIGHTS',
-            'Q': 'Q-POLARISATION WEIGHTS',
-            'U': 'U-POLARISATION WEIGHTS'}
-
-        must_load = [field for field in fields if (nside, field) not in cls._weight_cache]
-
-        if must_load:
-            hdulist = fits.open(os.path.join(data_path, 'weight_ring_n%05d.fits' % nside))
-            try:
-                for field in must_load:
-                    w = hdulist[1].data.field(fits_field_names[field]).ravel().astype(np.double)
-                    w += 1
-                    cls._weight_cache[nside, field] = w
-            finally:
-                hdulist.close()
-        return [cls._weight_cache[(nside, field)].copy() for field in fields]
-
-#
-# alm_info
-#
-
-
-cdef class alm_info:
-    cdef sharp_alm_info *ainfo
-
-    def __cinit__(self, *args, **kw):
-        self.ainfo = NULL
-
-    def local_size(self):
-        if self.ainfo == NULL:
-            raise NotInitializedError()
-        return sharp_alm_count(self.ainfo)
-
-    def mval(self):
-        if self.ainfo == NULL:
-            raise NotInitializedError()
-        return np.asarray(<int[:self.ainfo.nm]> self.ainfo.mval)
-
-    def mvstart(self):
-        if self.ainfo == NULL:
-            raise NotInitializedError()
-        return np.asarray(<long[:self.ainfo.nm]> self.ainfo.mvstart)
-
-    def __dealloc__(self):
-        if self.ainfo != NULL:
-            sharp_destroy_alm_info(self.ainfo)
-        self.ainfo = NULL
-
-    @cython.boundscheck(False)
-    def almxfl(self, np.ndarray[double, ndim=3, mode='c'] alm, np.ndarray[double, ndim=2, mode='c'] fl):
-        """Multiply Alm by a Ell based array
-
-
-        Parameters
-        ----------
-        alm : np.ndarray
-            input alm, 3 dimensions = (different signal x polarizations x lm-ordering)
-        fl : np.ndarray
-            either 1 dimension, e.g. gaussian beam, or 2 dimensions e.g. a polarized beam
-
-        Returns
-        -------
-        None, it modifies alms in-place
-
-        """
-        cdef int mvstart = 0
-        cdef bint has_multiple_beams = alm.shape[2] > 1 and fl.shape[1] > 1
-        cdef int f, i_m, m, num_ells, i_l, i_signal, i_pol, i_mv
-
-        for i_m in range(self.ainfo.nm):
-            m = self.ainfo.mval[i_m]
-            f = 1 if (m==0) else 2
-            num_ells = self.ainfo.lmax + 1 - m
-
-            if not has_multiple_beams:
-                for i_signal in range(alm.shape[0]):
-                    for i_pol in range(alm.shape[1]):
-                        for i_l in range(num_ells):
-                            l = m + i_l
-                            for i_mv in range(mvstart + f*i_l, mvstart + f*i_l +f):
-                                alm[i_signal, i_pol, i_mv] *= fl[l, 0]
-            else:
-                for i_signal in range(alm.shape[0]):
-                    for i_pol in range(alm.shape[1]):
-                        for i_l in range(num_ells):
-                            l = m + i_l
-                            for i_mv in range(mvstart + f*i_l, mvstart + f*i_l +f):
-                                alm[i_signal, i_pol, i_mv] *= fl[l, i_pol]
-            mvstart += f * num_ells
-
-cdef class triangular_order(alm_info):
-    def __init__(self, int lmax, mmax=None, stride=1):
-        mmax = mmax if mmax is not None else lmax
-        sharp_make_triangular_alm_info(lmax, mmax, stride, &self.ainfo)
-
-
-cdef class rectangular_order(alm_info):
-    def __init__(self, int lmax, mmax=None, stride=1):
-        mmax = mmax if mmax is not None else lmax
-        sharp_make_rectangular_alm_info(lmax, mmax, stride, &self.ainfo)
-
-
-cdef class packed_real_order(alm_info):
-    def __init__(self, int lmax, stride=1, int[::1] ms=None):
-        sharp_make_mmajor_real_packed_alm_info(lmax=lmax, stride=stride,
-                                               nm=lmax + 1 if ms is None else ms.shape[0],
-                                               ms=NULL if ms is None else &ms[0],
-                                               alm_info=&self.ainfo)
-
-#
-# 
-#
-
-@cython.boundscheck(False)
-def normalized_associated_legendre_table(int lmax, int m, theta):
-    cdef double[::1] theta_ = np.ascontiguousarray(theta, dtype=np.double)
-    out = np.zeros((theta_.shape[0], lmax - m + 1), np.double)
-    cdef double[:, ::1] out_ = out
-    if lmax < m:
-        raise ValueError("lmax < m")
-    with nogil:
-        sharp_normalized_associated_legendre_table(m, 0, lmax, theta_.shape[0], &theta_[0], lmax - m + 1, 1, 1, &out_[0,0])
-    return out
--- a/python/libsharp/libsharp_mpi.pyx
+++ b/python/libsharp/libsharp_mpi.pyx
@ -1,17 +0,0 @@
-cdef extern from "mpi.h":
-    ctypedef void *MPI_Comm
-
-cdef extern from "Python.h":
-    object PyLong_FromVoidPtr(void*)
-
-cdef extern:
-    ctypedef class mpi4py.MPI.Comm [object PyMPICommObject]:
-        cdef MPI_Comm ob_mpi
-        cdef unsigned flags
-
-# For compatibility with mpi4py <= 1.3.1
-# Newer versions could use the MPI._addressof function
-def _addressof(Comm comm):
-    cdef void *ptr = NULL
-    ptr = <void*>&comm.ob_mpi
-    return PyLong_FromVoidPtr(ptr)
--- a/python/libsharp/tests/init.py
+++ b/python/libsharp/tests/init.py
@ -1 +0,0 @@
-# empty
--- a/python/libsharp/tests/test_legendre.py
+++ b/python/libsharp/tests/test_legendre.py
@ -1,58 +0,0 @@
-import numpy as np
-from scipy.special import legendre
-from scipy.special import p_roots
-import libsharp
-
-from numpy.testing import assert_allclose
-
-
-def check_legendre_transform(lmax, ntheta):
-    l = np.arange(lmax + 1)
-    if lmax >= 1:
-        sigma = -np.log(1e-3) / lmax / (lmax + 1)
-        bl = np.exp(-sigma*l*(l+1))
-        bl *= (2 * l + 1)
-    else:
-        bl = np.asarray([1], dtype=np.double)
-
-    theta = np.linspace(0, np.pi, ntheta, endpoint=True)
-    x = np.cos(theta)
-
-    # Compute truth using scipy.special.legendre
-    P = np.zeros((ntheta, lmax + 1))
-    for l in range(lmax + 1):
-        P[:, l] = legendre(l)(x)
-    y0 = np.dot(P, bl)
-
-
-    # double-precision
-    y = libsharp.legendre_transform(x, bl)
-
-    assert_allclose(y, y0, rtol=1e-12, atol=1e-12)
-
-    # single-precision
-    y32 = libsharp.legendre_transform(x.astype(np.float32), bl)
-    assert_allclose(y, y0, rtol=1e-5, atol=1e-5)
-
-
-def test_legendre_transform():
-    nthetas_to_try = [0, 9, 17, 19] + list(np.random.randint(500, size=20))
-    for ntheta in nthetas_to_try:
-        for lmax in [0, 1, 2, 3, 20] + list(np.random.randint(50, size=4)):
-            yield check_legendre_transform, lmax, ntheta
-
-def check_legendre_roots(n):
-    xs, ws = ([], []) if n == 0 else p_roots(n) # from SciPy
-    xl, wl = libsharp.legendre_roots(n)
-    assert_allclose(xs, xl, rtol=1e-14, atol=1e-14)
-    assert_allclose(ws, wl, rtol=1e-14, atol=1e-14)
-
-def test_legendre_roots():
-    """
-    Test the Legendre root-finding algorithm from libsharp by comparing it with
-    the SciPy version.
-    """
-    yield check_legendre_roots, 0
-    yield check_legendre_roots, 1
-    yield check_legendre_roots, 32
-    yield check_legendre_roots, 33
--- a/python/libsharp/tests/test_legendre_table.py
+++ b/python/libsharp/tests/test_legendre_table.py
@ -1,36 +0,0 @@
-from __future__ import print_function
-import numpy as np
-
-from numpy.testing import assert_almost_equal
-from nose.tools import eq_, ok_
-
-from libsharp import normalized_associated_legendre_table
-from scipy.special import sph_harm, p_roots
-
-def test_compare_legendre_table_with_scipy():
-    def test(theta, m, lmax):
-        Plm = normalized_associated_legendre_table(lmax, m, theta)
-
-        Plm_p = sph_harm(m, np.arange(m, lmax + 1), 0, theta)[None, :]
-        if not np.allclose(Plm_p, Plm):
-            print(Plm_p)
-            print(Plm)
-        return ok_, np.allclose(Plm_p, Plm)
-
-    yield test(np.pi/2, 0, 10)
-    yield test(np.pi/4, 0, 10)
-    yield test(3 * np.pi/4, 0, 10)
-    yield test(np.pi/4, 1, 4)
-    yield test(np.pi/4, 2, 4)
-    yield test(np.pi/4, 50, 50)
-    yield test(np.pi/2, 49, 50)
-
-
-def test_legendre_table_wrapper_logic():
-    # tests the SSE 2 logic in the high-level wrapper by using an odd number of thetas
-    theta = np.asarray([np.pi/2, np.pi/4, 3 * np.pi / 4])
-    m = 3
-    lmax = 10
-    Plm = normalized_associated_legendre_table(lmax, m, theta)
-    assert np.allclose(Plm[1, :], normalized_associated_legendre_table(lmax, m, np.pi/4)[0, :])
-    assert np.allclose(Plm[2, :], normalized_associated_legendre_table(lmax, m, 3 * np.pi/4)[0, :])
--- a/python/libsharp/tests/test_sht.py
+++ b/python/libsharp/tests/test_sht.py
@ -1,32 +0,0 @@
-import numpy as np
-from numpy.testing import assert_allclose
-import libsharp
-
-from mpi4py import MPI
-
-
-def test_basic():
-    lmax = 10
-    nside = 8
-    rank = MPI.COMM_WORLD.Get_rank()
-    ms = np.arange(rank, lmax + 1, MPI.COMM_WORLD.Get_size(), dtype=np.int32)
-    
-    order = libsharp.packed_real_order(lmax, ms=ms)
-    grid = libsharp.healpix_grid(nside)
-
-    
-    alm = np.zeros(order.local_size())
-    if rank == 0:
-        alm[0] = 1
-    elif rank == 1:
-        alm[0] = 1
-
-
-    map = libsharp.synthesis(grid, order, np.repeat(alm[None, None, :], 3, 0), comm=MPI.COMM_WORLD)
-    assert np.all(map[2, :] == map[1, :]) and np.all(map[1, :] == map[0, :])
-    map = map[0, 0, :]
-    print(rank, "shape", map.shape)
-    print(rank, "mean", map.mean())
-
-if __name__=="__main__":
-    test_basic()
--- a/python/libsharp/tests/test_smoothing_noise_pol_mpi.py
+++ b/python/libsharp/tests/test_smoothing_noise_pol_mpi.py
@ -1,137 +0,0 @@
-# This test needs to be run with:
-
-# mpirun -np X python test_smoothing_noise_pol_mpi.py
-
-from mpi4py import MPI
-
-import numpy as np
-
-import healpy as hp
-
-import libsharp
-
-mpi = True
-rank = MPI.COMM_WORLD.Get_rank()
-
-nside = 256
-npix = hp.nside2npix(nside)
-
-np.random.seed(100)
-input_map = np.random.normal(size=(3, npix))
-fwhm_deg = 10
-lmax = 512
-
-nrings = 4 * nside - 1  # four missing pixels
-
-if rank == 0:
-    print("total rings", nrings)
-
-n_mpi_processes = MPI.COMM_WORLD.Get_size()
-rings_per_process = nrings // n_mpi_processes + 1
-# ring indices are 1-based
-
-ring_indices_emisphere = np.arange(2*nside, dtype=np.int32) + 1
-local_ring_indices = ring_indices_emisphere[rank::n_mpi_processes]
-
-# to improve performance, simmetric rings north/south need to be in the same rank
-# therefore we use symmetry to create the full ring indexing
-
-if local_ring_indices[-1] == 2 * nside:
-    # has equator ring
-    local_ring_indices = np.concatenate(
-      [local_ring_indices[:-1],
-       nrings - local_ring_indices[::-1] + 1]
-    )
-else:
-    # does not have equator ring
-    local_ring_indices = np.concatenate(
-      [local_ring_indices,
-       nrings - local_ring_indices[::-1] + 1]
-    )
-
-print("rank", rank, "n_rings", len(local_ring_indices))
-
-if not mpi:
-    local_ring_indices = None
-grid = libsharp.healpix_grid(nside, rings=local_ring_indices)
-
-# returns start index of the ring and number of pixels
-startpix, ringpix, _, _, _ = hp.ringinfo(nside, local_ring_indices.astype(np.int64))
-
-local_npix = grid.local_size()
-
-def expand_pix(startpix, ringpix, local_npix):
-    """Turn first pixel index and number of pixel in full array of pixels
-
-    to be optimized with cython or numba
-    """
-    local_pix = np.empty(local_npix, dtype=np.int64)
-    i = 0
-    for start, num in zip(startpix, ringpix):
-        local_pix[i:i+num] = np.arange(start, start+num)
-        i += num
-    return local_pix
-
-local_pix = expand_pix(startpix, ringpix, local_npix)
-
-local_map = input_map[:, local_pix]
-
-local_hitmap = np.zeros(npix)
-local_hitmap[local_pix] = 1
-hp.write_map("hitmap_{}.fits".format(rank), local_hitmap, overwrite=True)
-
-print("rank", rank, "npix", npix, "local_npix", local_npix, "local_map len", len(local_map), "unique pix", len(np.unique(local_pix)))
-
-local_m_indices = np.arange(rank, lmax + 1, MPI.COMM_WORLD.Get_size(), dtype=np.int32)
-if not mpi:
-    local_m_indices = None
-
-order = libsharp.packed_real_order(lmax, ms=local_m_indices) 
-local_nl = order.local_size()
-print("rank", rank, "local_nl", local_nl, "mval", order.mval())
-
-mpi_comm = MPI.COMM_WORLD if mpi else None
-
-# map2alm
-# maps in libsharp are 3D, 2nd dimension is IQU, 3rd is pixel
-
-alm_sharp_I = libsharp.analysis(grid, order,
-                                np.ascontiguousarray(local_map[0].reshape((1, 1, -1))),
-                                spin=0, comm=mpi_comm)
-alm_sharp_P = libsharp.analysis(grid, order,
-                                np.ascontiguousarray(local_map[1:].reshape((1, 2, -1))),
-                                spin=2, comm=mpi_comm)
-
-beam = hp.gauss_beam(fwhm=np.radians(fwhm_deg), lmax=lmax, pol=True)
-
-print("Smooth")
-# smooth in place (zonca implemented this function)
-order.almxfl(alm_sharp_I, np.ascontiguousarray(beam[:, 0:1]))
-order.almxfl(alm_sharp_P, np.ascontiguousarray(beam[:, (1, 2)]))
-
-# alm2map
-
-new_local_map_I = libsharp.synthesis(grid, order, alm_sharp_I, spin=0, comm=mpi_comm)
-new_local_map_P = libsharp.synthesis(grid, order, alm_sharp_P, spin=2, comm=mpi_comm)
-
-# Transfer map to first process for writing
-
-local_full_map = np.zeros(input_map.shape, dtype=np.float64)
-local_full_map[0, local_pix] = new_local_map_I
-local_full_map[1:, local_pix] = new_local_map_P
-
-output_map = np.zeros(input_map.shape, dtype=np.float64) if rank == 0 else None
-mpi_comm.Reduce(local_full_map, output_map, root=0, op=MPI.SUM)
-
-if rank == 0:
-    # hp.write_map("sharp_smoothed_map.fits", output_map, overwrite=True)
-    # hp_smoothed = hp.alm2map(hp.map2alm(input_map, lmax=lmax), nside=nside) # transform only
-    hp_smoothed = hp.smoothing(input_map, fwhm=np.radians(fwhm_deg), lmax=lmax)
-    std_diff = (hp_smoothed-output_map).std()
-    print("Std of difference between libsharp and healpy", std_diff)
-    # hp.write_map(
-    #     "healpy_smoothed_map.fits",
-    #     hp_smoothed,
-    #     overwrite=True
-    # )
-    assert std_diff < 1e-5
--- a/python/setup.py
+++ b/python/setup.py
@ -1,83 +0,0 @@
-#! /usr/bin/env python
-
-descr   = """Spherical Harmionic transforms package
-
-Python API for the libsharp spherical harmonic transforms library
-"""
-
-import os
-import sys
-
-DISTNAME            = 'libsharp'
-DESCRIPTION         = 'libsharp library for fast Spherical Harmonic Transforms'
-LONG_DESCRIPTION    = descr
-MAINTAINER          = 'Dag Sverre Seljebotn',
-MAINTAINER_EMAIL    = 'd.s.seljebotn@astro.uio.no',
-URL                 = 'http://sourceforge.net/projects/libsharp/'
-LICENSE             = 'GPL'
-DOWNLOAD_URL        = "http://sourceforge.net/projects/libsharp/"
-VERSION             = '0.1'
-
-# Add our fake Pyrex at the end of the Python search path
-# in order to fool setuptools into allowing compilation of
-# pyx files to C files. Importing Cython.Distutils then
-# makes Cython the tool of choice for this rather than
-# (the possibly nonexisting) Pyrex.
-project_path = os.path.split(__file__)[0]
-sys.path.append(os.path.join(project_path, 'fake_pyrex'))
-
-from setuptools import setup, find_packages, Extension
-from Cython.Build import cythonize
-import numpy as np
-
-libsharp = os.environ.get('LIBSHARP', None)
-libsharp_include = os.environ.get('LIBSHARP_INCLUDE', libsharp and os.path.join(libsharp, 'include'))
-libsharp_lib = os.environ.get('LIBSHARP_LIB', libsharp and os.path.join(libsharp, 'lib'))
-
-if libsharp_include is None or libsharp_lib is None:
-    sys.stderr.write('Please set LIBSHARP environment variable to the install directly of libsharp, '
-                     'this script will refer to the lib and include sub-directories. Alternatively '
-                     'set LIBSHARP_INCLUDE and LIBSHARP_LIB\n')
-    sys.exit(1)
-
-if __name__ == "__main__":
-    setup(install_requires = ['numpy'],
-          packages = find_packages(),
-          test_suite="nose.collector",
-          # Well, technically zipping the package will work, but since it's
-          # all compiled code it'll just get unzipped again at runtime, which
-          # is pointless:
-          zip_safe = False,
-          name = DISTNAME,
-          version = VERSION,
-          maintainer = MAINTAINER,
-          maintainer_email = MAINTAINER_EMAIL,
-          description = DESCRIPTION,
-          license = LICENSE,
-          url = URL,
-          download_url = DOWNLOAD_URL,
-          long_description = LONG_DESCRIPTION,
-          classifiers =
-            [ 'Development Status :: 3 - Alpha',
-              'Environment :: Console',
-              'Intended Audience :: Developers',
-              'Intended Audience :: Science/Research',
-              'License :: OSI Approved :: GNU General Public License (GPL)',
-              'Topic :: Scientific/Engineering'],
-          ext_modules = cythonize([
-              Extension("libsharp.libsharp",
-                        ["libsharp/libsharp.pyx"],
-                        libraries=["sharp", "fftpack", "c_utils"],
-                        include_dirs=[libsharp_include, np.get_include()],
-                        library_dirs=[libsharp_lib],
-                        extra_link_args=["-fopenmp"],
-              ),
-              Extension("libsharp.libsharp_mpi",
-                        ["libsharp/libsharp_mpi.pyx"],
-                        libraries=["sharp", "fftpack", "c_utils"],
-                        include_dirs=[libsharp_include, np.get_include()],
-                        library_dirs=[libsharp_lib],
-                        extra_link_args=["-fopenmp"],
-              ),
-              ]),
-          )
--- a/runjinja.py
+++ b/runjinja.py
@ -1,19 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Preprocesses foo.c.in to foo.c. Reads STDIN and writes STDOUT.
-"""
-
-import sys
-import hashlib
-from jinja2 import Template, Environment
-
-env = Environment(block_start_string='/*{',
-                  block_end_string='}*/',
-                  variable_start_string='{{',
-                  variable_end_string='}}')
-
-extra_vars = dict(len=len)
-input = sys.stdin.read()
-sys.stdout.write('/* DO NOT EDIT. md5sum of source: %s */' % hashlib.md5(input.encode()).hexdigest())
-sys.stdout.write(env.from_string(input).render(**extra_vars))
--- a/runtest.sh
+++ b/runtest.sh
@ -0,0 +1,4 @@
+#!/bin/sh
+
+./sharp_testsuite acctest
+
				`@ -1 +0,0 @@`
				`# work around broken setuptools monkey patching`